Builtin selectors
Use these selectors for common scraping tasks, for instance:
selectors = {
    'title': scrape.title,
    'text': scrape.text,
}
Note: you must not call the selector functions! The
`dom` argument will be auto-injected by every `Page` object and represents a `parsel.Selector` of the whole HTML document.
        microwler.scrape
      
canonicals(dom)
      
Extract <link rel='canonical'> tags 
Source code in microwler/scrape.py
          def canonicals(dom: parsel.Selector):
              """
              Collect the `href` of every `<link rel='canonical'>` tag.

              :param dom: selector for the whole HTML document
              :return: list with one `href` value per matching attribute
              """
              canonical_hrefs = dom.xpath('//link[@rel="canonical"]/@href')
              return canonical_hrefs.getall()
emails(dom)
      
Extract email addresses from <a> tags 
Source code in microwler/scrape.py
          def emails(dom: parsel.Selector):
              """
              Extract email addresses from `<a>` tags

              Only anchors whose `href` starts with `mailto:` are considered. The
              scheme prefix and any trailing `?subject=...` style query are removed.

              :param dom: selector for the whole HTML document
              :return: list of address strings; links with an empty address are skipped
              """
              prefix = 'mailto:'
              hrefs = dom.xpath('//a[starts-with(@href, "mailto")]/@href').getall()
              addresses = []
              for href in hrefs:
                  href = href.strip()
                  if not href.startswith(prefix):
                      continue
                  # Slice the prefix off instead of using str.strip('mailto:'): strip()
                  # treats its argument as a set of characters and would also eat any
                  # leading/trailing m, a, i, l, t, o of the address itself
                  # (e.g. 'mailto:tom@mail.com' -> '@mail.c').
                  address = href[len(prefix):].split('?', 1)[0].strip()
                  if address:
                      addresses.append(address)
              return addresses
headings(dom)
      
Extract first 3 levels of heading tags: <h1>, <h2>, <h3> 
Source code in microwler/scrape.py
          def headings(dom: parsel.Selector):
              """
              Extract first 3 levels of heading tags: `<h1>`, `<h2>`, `<h3>`

              :param dom: selector for the whole HTML document
              :return: dict with the keys 'h1', 'h2' and 'h3'; each value is the result of
                  `remove_multi_whitespace` applied to the list of heading texts of that level
              """
              def heading_texts(tag):
                  # An expression like `string(//h1[1])` collapses to the text of a single
                  # heading, so each heading node is converted on its own to capture all
                  # headings of the level.
                  return [node.xpath('string(.)').get() for node in dom.xpath('//' + tag)]

              return {
                  tag: remove_multi_whitespace(heading_texts(tag))
                  for tag in ('h1', 'h2', 'h3')
              }
images(dom)
      
Extract URLs from <img> tags 
Source code in microwler/scrape.py
          def images(dom: parsel.Selector):
              """
              Collect the `src` attribute of every `<img>` tag.

              :param dom: selector for the whole HTML document
              :return: list with one `src` value per matching attribute
              """
              image_sources = dom.xpath('//img/@src')
              return image_sources.getall()
meta(dom)
      
Extract <meta> tags 
Source code in microwler/scrape.py
          def meta(dom: parsel.Selector):
              """
              Extract `<meta>` tags

              Tags are keyed by their `name` attribute, falling back to `property`
              (as used by Open Graph tags). Tags without such a key or without a
              `content` attribute (e.g. `<meta charset="utf-8">`) are skipped.

              :param dom: selector for the whole HTML document
              :return: dict mapping the meta key to its `content` value; when a key
                  occurs more than once the later tag wins
              """
              result = {}
              for tag in dom.xpath('//meta'):
                  # Attributes live on `Selector.attrib`; `Selector.get()` takes no
                  # arguments and serializes the node instead of reading an attribute.
                  attributes = tag.attrib
                  key = attributes.get('name') or attributes.get('property')
                  content = attributes.get('content')
                  if key and content is not None:
                      result[key] = content
              return result
paragraphs(dom)
      
Extract <p> tags 
Source code in microwler/scrape.py
          def paragraphs(dom: parsel.Selector):
              """
              Extract `<p>` tags

              :param dom: selector for the whole HTML document
              :return: list with the text content of each `<p>` element; empty when the
                  document has no paragraphs
              """
              # `string(//p[1])` evaluates to the text of a single paragraph only, so
              # each paragraph node is converted to its string value individually.
              return [node.xpath('string(.)').get() for node in dom.xpath('//p')]
schemas(dom)
      
Extract itemtype schemas
Source code in microwler/scrape.py
          def schemas(dom: parsel.Selector):
              """
              List the schema names referenced by `itemtype` attributes.

              :param dom: selector for the whole HTML document
              :return: list holding, for each `itemtype` value, the part after its last '/'
              """
              names = []
              for itemtype in dom.xpath('//*[@itemtype]/@itemtype').getall():
                  # Keep only what follows the final slash (the whole value if there is none).
                  names.append(itemtype.rpartition('/')[2])
              return names
text(dom)
      
Extract and clean text content
Source code in microwler/scrape.py
          def text(dom: parsel.Selector):
              """
              Run `extract_text` on the serialized `<body>` of the document.

              :param dom: selector for the whole HTML document
              :return: whatever `extract_text` returns for the body markup
              """
              body_markup = dom.xpath('//body').get()
              return extract_text(body_markup)
title(dom)
      
Extract <title> tag 
Source code in microwler/scrape.py
          def title(dom: parsel.Selector):
              """
              Read the string value of the document's `<title>` tag.

              :param dom: selector for the whole HTML document
              :return: first result of the `string(//title[1])` query
              """
              title_query = dom.xpath('string(//title[1])')
              return title_query.get()