Welcome to scrapex’s documentation!

Scrapex is a simple web scraping framework. Built on top of requests and lxml, it supports both Python 2 and Python 3.

>>> from scrapex import Scraper
>>> s = Scraper(use_cache=True)
>>> doc = s.load('https://github.com/search?q=scraping')
>>>
>>> print(doc.extract("//h3[contains(text(),'results')]").strip())
59,256 repository results
>>>
>>> listings = doc.query("//ul[@class='repo-list']/li")
>>> print('number of listings on first page:', len(listings))
number of listings on first page: 10
>>>
>>> for listing in listings[0:3]:
...     print('repo name: ',listing.extract(".//div[contains(@class,'text-normal')]/a"))
...
repo name:  scrapinghub/portia
repo name:  scrapy/scrapy
repo name:  REMitchell/python-scraping
>>>

Key Features

  • Easy to parse data points from HTML documents using XPath, regular expressions, or string subtraction
  • Easy to parse a street address into components (street, city, state, zip code)
  • Easy to parse a person name into components (prefix, first name, middle name, last name, suffix); see the sketch after this list
  • Easy to save results to CSV, Excel, and JSON files
  • HTML cache:
    • quickly recover from parsing mistakes/changes without having to send HTTP requests again.
    • easily resume a broken scrape without re-downloading the HTML files already fetched.
  • Random proxy rotation
  • Random user agent rotation
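
A minimal sketch of the name parser mentioned above, modeled on the common.parse_address call shown later in this documentation. The function name common.parse_name and the exact output keys are assumptions based on the feature list, not something these docs confirm:

from scrapex import common

# hypothetical call, mirroring common.parse_address from the parsing section below;
# the real function name and returned keys may differ in your scrapex version
parsed_name = common.parse_name('Dr. John A. Smith Jr.')

# expected shape: prefix, first name, middle name, last name, suffix
print(parsed_name)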

Installation

To install the stable version

pip install scrapex

To install the development version

pip install https://github.com/cungnv/scrapex/archive/master.zip
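
To check that the package imports correctly (a quick sanity check, not part of scrapex itself):

python -c "from scrapex import Scraper; print('scrapex imported ok')"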

Example scraping script

A complete scraping script that scrapes GitHub search result pages. View the source file.

#encoding: utf8

import sys
import logging
from collections import OrderedDict

from scrapex import Scraper
from scrapex import common


logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='%(levelname)s %(message)s')

logger = logging.getLogger(__name__)


#create the scraper object
#enable the cache system so that we can re-run the scrape without re-downloading the html files

s = Scraper(use_cache=True, greeting=True)


def scrape():
    """
    - scrape repos from this search (first 5 result pages):
    https://github.com/search?q=scraping%20framework

    - results are saved into an Excel file: results.xlsx

    """

    url = 'https://github.com/search?q=scraping%20framework'

    page = 0

    while True:
        page += 1

        doc = s.load(url)

        if page == 1:
            logger.info(doc.extract("//h3[contains(text(),'results')]").strip())


        listings = doc.query("//ul[@class='repo-list']/li")
        logger.info('page# %s >> %s listings', page, len(listings))

        for listing in listings:
            item = OrderedDict()

            item['name'] = listing.extract(".//div[contains(@class,'text-normal')]/a")

            item['description'] = listing.extract(".//div[@class='mt-n1']/p").rr(r'\s+', ' ').strip()

            item['tags'] = listing.query(".//a[contains(@class,'topic-tag')]").join(', ')

            #save this item to excel file

            s.save(item,'results.xlsx')



        #find the url of the next result page

        url = doc.extract("//a[.='Next']/@href")

        if url:
            logger.info('next page url: %s', url)
        else:
            logger.info('last page reached')
            break

        if page == 5:
            break

if __name__ == '__main__':
    scrape()

Output screen:

CungMac:demo cung$ python example-script.py
scrape started
INFO 443 repository results
INFO page# 1 >> 10 listings
INFO next page url: https://github.com/search?p=2&q=scraping+framework&type=Repositories
INFO page# 2 >> 10 listings
INFO next page url: https://github.com/search?p=3&q=scraping+framework&type=Repositories
INFO page# 3 >> 10 listings
INFO next page url: https://github.com/search?p=4&q=scraping+framework&type=Repositories
INFO page# 4 >> 10 listings
INFO next page url: https://github.com/search?p=5&q=scraping+framework&type=Repositories
INFO page# 5 >> 10 listings
INFO next page url: https://github.com/search?p=6&q=scraping+framework&type=Repositories
scrape finished

The results file screenshot:

https://scrapex-images.s3-us-west-2.amazonaws.com/example-results.png

Make HTTP requests

Send GET requests

>>> from scrapex import Scraper
>>> s = Scraper()
>>> doc = s.load('https://github.com/search?q=scraping')
>>> print(doc.response.status_code)
200
>>> print(doc.response.request.headers)
{'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36', 'Accept-Language': 'en-us,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'keep-alive'}
>>>

Send POST requests

>>> doc = s.load('http://httpbin.org/post', data={'name1':'value1', 'name2':'value2'})
>>> doc.response.json()
{'args': {}, 'data': '', 'files': {}, 'form': {'name1': 'value1', 'name2': 'value2'}, 'headers': {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-us,en;q=0.5', 'Content-Length': '25', 'Content-Type': 'application/x-www-form-urlencoded', 'Host': 'httpbin.org', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36', 'X-Amzn-Trace-Id': 'Root=1-5e42296f-72e27fb04c47860421601594'}, 'json': None, 'origin': '42.114.13.13', 'url': 'http://httpbin.org/post'}
>>>
>>> doc = s.load('http://httpbin.org/post', data='name1=value1&name2=value2')
>>> doc.response.json()
{'args': {}, 'data': 'name1=value1&name2=value2', 'files': {}, 'form': {}, 'headers': {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-us,en;q=0.5', 'Content-Length': '25', 'Host': 'httpbin.org', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Safari/605.1.15', 'X-Amzn-Trace-Id': 'Root=1-5e4229f8-ce30254a99ad99202e50b612'}, 'json': None, 'origin': '42.114.13.13', 'url': 'http://httpbin.org/post'}
>>>

Send requests with custom headers

>>> headers = {'Referer': 'https://github.com/?q=scraping',
...     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko)',
...     }
>>> doc = s.load('https://github.com/cungnv/scrapex', headers=headers)
>>>
>>> doc.response.request.headers
{'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko)', 'Accept-Language': 'en-us,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'keep-alive', 'Referer': 'https://github.com/?q=scraping', 'Cookie': '_octo=GH1.1.1947465757.1581393340; logged_in=no; _gh_sess=WjhnT0tOV2Nqa2xZRHVqY1VwWkgwckRMT3FJck03UWtlSThMYkdUVFNTYzBMNW5jcTYvd3NKbzhGR0Y3bUJhOFlUZkt1VWJNQ2UyWjJzL0FhMFprOTBVdTNEQzgraCtidE9IVnJhZ25sWXRKdHJhNUZZeEVNNWdDM3NZekg1YTRrQmlIakZEU21qWVVHc2N2OVRnM3ZBPT0tLVZVeWt6UTY4OEVzYm03S3pqY3dqaUE9PQ%3D%3D--85276db3b6db4db17aa5766f84e5251d8911f146'}
>>>

Failed requests

>>> try:
...     doc = s.load('http://httpbin.org/status/404')
... except Exception as e:
...     print(type(e))
...     print('status code:', e.response.status_code)
...
<class 'requests.exceptions.HTTPError'>
status code: 404
>>>
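
Building on the behaviour shown above (s.load raises requests.exceptions.HTTPError and the exception carries the response), a small retry helper can be sketched as follows. The helper name and retry policy are illustrative, not part of scrapex:

import time
from requests.exceptions import HTTPError

def load_with_retry(scraper, url, attempts=3, delay=5, **kwargs):
    """Retry a failed request a few times before giving up (illustrative helper)."""
    for attempt in range(1, attempts + 1):
        try:
            return scraper.load(url, **kwargs)
        except HTTPError as e:
            # the exception carries the underlying response, as shown above
            print('attempt %s failed, status: %s' % (attempt, e.response.status_code))
            if attempt == attempts:
                raise
            time.sleep(delay)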

How to parse data in scrapex

It’s very easy to parse an HTML document into data points with scrapex.

Parse data points from the document using XPath

>>> doc = s.load('https://github.com/search?q=scraping+framework')
>>> headline = doc.extract("//h3[contains(text(),'results')]").strip()
>>> print(headline)
445 repository results
>>>
>>> number_of_results = headline.subreg(r'([\d,]+)').replace(',','')
>>> print(number_of_results)
445
>>>
>>> next_page_url = doc.extract("//a[.='Next']/@href")
>>> print(next_page_url)
https://github.com/search?p=2&q=scraping+framework&type=Repositories
>>>

Parse data from a child node

>>> nodes = doc.query("//ul[@class='repo-list']/li")
>>> print('num of nodes:', len(nodes))
num of nodes: 10
>>>
>>> node = nodes[0] #play with first result
>>> repo_name = node.extract(".//div[contains(@class,'text-normal')]/a")
>>> print(repo_name)
scrapy/scrapy
>>>
>>> description = node.extract(".//div[@class='mt-n1']/p").strip()
>>> print(description)
Scrapy, a fast high-level web crawling & scraping framework for Python.
>>>
>>> tags = node.query(".//a[contains(@class,'topic-tag')]").join(', ')
>>> print(tags)
python, crawler, framework, scraping, crawling
>>>

Parse street address components

>>> from scrapex import common
>>>
>>> doc = s.load('https://www.yellowpages.com/search?search_terms=restaurant&geo_location_terms=New+York%2C+NY')
>>> first_result = doc.node("//div[@class='result']")
>>>
>>> name = first_result.extract(".//a[@class='business-name']").strip()
>>> print(name)
Mr. K's
>>>
>>> full_address = first_result.query(".//p[@class='adr']/following-sibling::div").join(', ').replace(',,',',')
>>> print(full_address)
570 Lexington Ave, New York, NY 10022
>>>
>>> parsed_address = common.parse_address(full_address)
>>> print(parsed_address)
{'address': 570 Lexington Ave, 'city': New York, 'state': NY, 'zipcode': 10022}
>>>

Save results to CSV/Excel files

>>> from scrapex import Scraper
>>> from collections import OrderedDict
>>>
>>> s = Scraper(use_cache=True)
>>> doc = s.load('https://github.com/search?q=scraping+framework')
>>> nodes = doc.query("//ul[@class='repo-list']/li")
>>> print('num of nodes:', len(nodes))
num of nodes: 10
>>>
>>> for node in nodes:
...     item = OrderedDict()
...     item['name'] = node.extract(".//div[contains(@class,'text-normal')]/a")
...     item['description'] = node.extract(".//div[@class='mt-n1']/p").strip()
...     item['tags'] = node.query(".//a[contains(@class,'topic-tag')]").join(', ')
...
...     s.save(item, 'results.csv')
...     s.save(item, 'other_file.xlsx') #save this same item to another file
...
>>>

Work with proxies

Enable proxies at the scraper level; a random one is chosen for each request

>>> from scrapex import Scraper
>>> s = Scraper(proxy_file="/path/to/a_proxy_file.txt")
>>>
>>> doc = s.load('https://httpbin.org/anything')
>>> proxy_used = doc.response.json()['origin']
>>> print(proxy_used)
193.31.72.120
>>>

Disable proxy use at the request level

>>> doc = s.load('https://httpbin.org/anything', use_proxy=False)
>>> client_real_ip = doc.response.json()['origin']
>>> print(client_real_ip)
42.114.13.13
>>>

The format of a proxy file without authentication

ip:port
ip:port
ip:port
.......

The format of a proxy file with authentication

user:password@ip:port
user:password@ip:port
user:password@ip:port
.......
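
scrapex reads this file itself when you pass proxy_file, so no extra code is needed. If you want to inspect or reuse the proxy list yourself, the sketch below shows how each line maps onto the proxies mapping used by requests (the library scrapex is built on); this is illustrative only:

def proxy_line_to_requests_dict(line):
    """Map 'ip:port' or 'user:password@ip:port' to a requests-style proxies dict."""
    line = line.strip()
    return {
        'http': 'http://%s' % line,
        'https': 'http://%s' % line,
    }

with open('/path/to/a_proxy_file.txt') as f:
    proxies = [proxy_line_to_requests_dict(l) for l in f if l.strip()]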

Custom HTTP headers

By default, for each request, the scraper chooses a random User-Agent from a list of popular ones. It also sends the set of HTTP headers that a normal browser would.

You can easily customize the headers per request.

>>> from scrapex import Scraper
>>> s = Scraper()
>>> doc = s.load('https://httpbin.org/headers')
>>> print(doc.response.text)
{
  "headers": {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-us,en;q=0.5",
    "Host": "httpbin.org",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0",
    "X-Amzn-Trace-Id": "Root=1-5e42957e-d5aa6cbae8ef6e0c2a2946c4"
  }
}
>>>
>>> headers={'User-Agent':'my own user-agent', 'Referer': 'some referer url'}
>>> doc = s.load('https://httpbin.org/headers', headers=headers)
>>> print(doc.response.text)
{
  "headers": {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-us,en;q=0.5",
    "Host": "httpbin.org",
    "Referer": "some referer url",
    "User-Agent": "my own user-agent",
    "X-Amzn-Trace-Id": "Root=1-5e4295fe-d8454ea67c11ecf84e6c51be"
  }
}
>>>

Work with session/cookies

By default, the session is turned off, since avoiding cookies is good practice in web scraping to reduce tracking.

For scraping tasks that require a session to manage cookies, such as scraping behind a login wall, you can easily turn it on.

>>> from scrapex import Scraper
>>> s = Scraper(use_session=True)
>>>
>>> doc = s.load('http://httpbin.org/anything')
>>> print('cookies sent:',doc.response.request.headers.get('Cookie'))
cookies sent: None
>>>
>>> doc = s.load('http://httpbin.org/cookies/set?name1=value1&name2=value2')
>>> doc = s.load('http://httpbin.org/anything')
>>> print('cookies sent:',doc.response.request.headers.get('Cookie'))
cookies sent: name1=value1; name2=value2
>>>

Cache html content

By default, the cache is turned off. In many scraping jobs, we need to tweak the parsing code and re-scrape the site. In that situation, caching the HTML content from the first scrape is very helpful, especially for large scrapes.

Enable cache

>>> import os
>>> from scrapex import Scraper
>>> s = Scraper(use_cache=True)
>>> doc = s.load('http://httpbin.org/anything')
>>>
>>> print(os.listdir(s.cache.location))
['47a7ec08a34ed1fb8c78c931818dd082.htm']
>>>
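
To force a completely fresh run, you can wipe the cache directory between scrapes. A minimal sketch using the s.cache.location attribute shown above (deleting individual cached files by hand works just as well):

import os
import shutil

from scrapex import Scraper

s = Scraper(use_cache=True)

# remove every cached page, then recreate the (now empty) cache directory
if os.path.isdir(s.cache.location):
    shutil.rmtree(s.cache.location)
    os.makedirs(s.cache.location)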

Disable cache at request level

>>> doc = s.load('http://httpbin.org/anything', use_cache=False)

Disable cache at scraper level

>>> import os
>>> from scrapex import Scraper
>>> s = Scraper(use_cache=False)
>>> doc = s.load('http://httpbin.org/anything')
>>>
>>> print(os.listdir(s.cache.location))
[]
>>>

Multithreading

Sometimes multithreading is required to speed up your scrape.

Example code

from scrapex import Scraper
from scrapex import common

s = Scraper()

def scrape_github_by_keyword(keyword):
    doc = s.load(url='https://github.com/search', params={'q': keyword})
    print(doc.extract("//h3[contains(text(),'results')]").strip())
    #....#

def scrape():
    keywords = ['scraping tool','python','nodejs','image processing','many more']
    print('start 3 threads')
    common.start_threads(keywords, scrape_github_by_keyword, cc=3)

if __name__ == '__main__':
    scrape()
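
If you prefer the standard library, roughly the same behaviour can be achieved with concurrent.futures (Python 3 only). This is an alternative to common.start_threads, not part of scrapex, and it assumes the shared Scraper object is safe to use from several threads, as the example above implies:

from concurrent.futures import ThreadPoolExecutor

from scrapex import Scraper

s = Scraper()

def scrape_github_by_keyword(keyword):
    doc = s.load(url='https://github.com/search', params={'q': keyword})
    print(doc.extract("//h3[contains(text(),'results')]").strip())

def scrape():
    keywords = ['scraping tool', 'python', 'nodejs', 'image processing', 'many more']
    # run at most 3 downloads at a time, mirroring cc=3 above
    with ThreadPoolExecutor(max_workers=3) as pool:
        list(pool.map(scrape_github_by_keyword, keywords))

if __name__ == '__main__':
    scrape()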
