Skip to content

Builtin selectors

Use these selectors for common scraping tasks, for instance:

# Reference the selector functions without calling them (no parentheses);
# the `dom` argument is injected automatically by the Page object.
selectors = {
    'title': scrape.title,
    'text': scrape.text,
}

Note: do not call the selector functions yourself; pass the function objects as shown above. Every Page object automatically injects the dom argument, which is a parsel.Selector representing the whole HTML document.

microwler.scrape

canonicals(dom)

Extract <link rel='canonical'> tags

Source code in microwler/scrape.py
def canonicals(dom: parsel.Selector):
    """
    Collect the `href` of every `<link rel="canonical">` element.

    :param dom: selector for the whole HTML document
    :return: list with one `href` value per matching `<link>` tag
    """
    canonical_hrefs = dom.xpath('//link[@rel="canonical"]/@href')
    return canonical_hrefs.getall()

emails(dom)

Extract email addresses from <a> tags

Source code in microwler/scrape.py
def emails(dom: parsel.Selector):
    """
    Extract email addresses from `<a>` tags

    Every `<a>` whose `href` starts with `mailto` is considered; the
    leading `mailto:` scheme is removed from each matching `href`.

    :param dom: selector for the whole HTML document
    :return: list of strings, one per matching `<a>` tag
    """
    scheme = 'mailto:'
    hrefs = dom.xpath('//a[starts-with(@href, "mailto")]/@href').getall()
    # `str.strip('mailto:')` would remove any of the characters
    # m, a, i, l, t, o and ':' from BOTH ends of the string, mangling
    # addresses such as `tom@mail.com`. Only the exact prefix is dropped.
    return [
        href[len(scheme):] if href.startswith(scheme) else href
        for href in hrefs
    ]

headings(dom)

Extract first 3 levels of heading tags: <h1>, <h2>, <h3>

Source code in microwler/scrape.py
def headings(dom: parsel.Selector):
    """
    Extract first 3 levels of heading tags: `<h1>`, `<h2>`, `<h3>`

    :param dom: selector for the whole HTML document
    :return: dict with the keys `h1`, `h2` and `h3`; each value is the
        result of `remove_multi_whitespace` applied to the extracted list

    NOTE(review): XPath `string()` converts only the first node of a
    node-set, so each query yields a single string per heading level.
    Confirm whether all headings of a level were intended.
    """
    queries = (
        ('h1', 'string(//h1[1])'),
        ('h2', 'string(//h2[1])'),
        ('h3', 'string(//h3[1])'),
    )
    found = {}
    for level, query in queries:
        matches = dom.xpath(query).getall()
        found[level] = remove_multi_whitespace(matches)
    return found

images(dom)

Extract URLs from <img> tags

Source code in microwler/scrape.py
def images(dom: parsel.Selector):
    """
    Collect the `src` attribute of every `<img>` element.

    :param dom: selector for the whole HTML document
    :return: list of `src` values, one per `<img>` that has the attribute
    """
    sources = dom.xpath('//img/@src')
    return sources.getall()

meta(dom)

Extract <meta> tags

Source code in microwler/scrape.py
def meta(dom: parsel.Selector):
    """
    Extract `<meta>` tags

    Builds a mapping from each tag's `name` attribute to its `content`
    attribute. Tags that lack either attribute (for example
    `<meta charset="utf-8">`) are skipped. If several tags share the same
    `name`, the last one wins.

    :param dom: selector for the whole HTML document
    :return: dict of `name` -> `content`
    """
    result = {}
    for tag in dom.xpath('//meta'):
        # `Selector.get()` serializes the node and accepts no arguments;
        # attributes must be read through the `attrib` mapping instead.
        attributes = tag.attrib
        name = attributes.get('name')
        content = attributes.get('content')
        if name is None or content is None:
            continue
        result[name] = content
    return result

paragraphs(dom)

Extract <p> tags

Source code in microwler/scrape.py
def paragraphs(dom: parsel.Selector):
    """
    Extract `<p>` tags

    Returns the text content of every `<p>` element in document order.

    :param dom: selector for the whole HTML document
    :return: list of strings, one per `<p>` element
    """
    # `string(//p[1])` stringifies only the first node of the node-set,
    # so it could never return more than one paragraph. Evaluating
    # `string(.)` relative to each `<p>` yields one entry per paragraph.
    return dom.xpath('//p').xpath('string(.)').getall()

schemas(dom)

Extract itemtype schemas

Source code in microwler/scrape.py
def schemas(dom: parsel.Selector):
    """
    Extract itemtype schemas

    Reads every `itemtype` attribute in the document and keeps only the
    part after the last `/` of each value.

    :param dom: selector for the whole HTML document
    :return: list of strings, one per `itemtype` attribute
    """
    names = []
    for itemtype in dom.xpath('//*[@itemtype]/@itemtype').getall():
        # Keep only the trailing path segment of the itemtype value.
        names.append(itemtype.rsplit('/', 1)[-1])
    return names

text(dom)

Extract and clean text content

Source code in microwler/scrape.py
def text(dom: parsel.Selector):
    """
    Extract and clean text content

    Serializes the first `<body>` match and hands it to `extract_text`.

    :param dom: selector for the whole HTML document
    :return: whatever `extract_text` returns for the serialized body
    """
    body_markup = dom.xpath('//body').get()
    return extract_text(body_markup)

title(dom)

Extract <title> tag

Source code in microwler/scrape.py
def title(dom: parsel.Selector):
    """
    Extract `<title>` tag

    :param dom: selector for the whole HTML document
    :return: first result of the XPath query `string(//title[1])`
    """
    title_text = dom.xpath('string(//title[1])')
    return title_text.get()