Builtin selectors
Use these selectors for common scraping tasks, for instance:
# Map each result field to a builtin selector; pass the function itself, do not call it.
selectors = {
    'title': scrape.title,
    'text': scrape.text,
}
Note: you must not call the selector functions! The `dom` argument will be auto-injected by every `Page` object and represents a `parsel.Selector` of the whole HTML document.
microwler.scrape
canonicals(dom)
Extract `<link rel='canonical'>` tags
Source code in microwler/scrape.py
def canonicals(dom: parsel.Selector):
    """
    Collect the canonical URLs declared by the document.

    :param dom: selector the XPath query is run against
    :return: the `href` value of every `<link rel='canonical'>` tag
    """
    canonical_hrefs = dom.xpath('//link[@rel="canonical"]/@href')
    return canonical_hrefs.getall()
emails(dom)
Extract email addresses from `<a>` tags
Source code in microwler/scrape.py
def emails(dom: parsel.Selector):
    """
    Extract email addresses from `<a>` tags

    Every `href` that starts with `mailto:` is returned with that scheme
    prefix removed. Hrefs that match the XPath filter but lack the colon
    (e.g. `mailtopage.html`) are not mail links and are skipped.

    :param dom: selector the XPath query is run against
    :return: list of the `mailto:` targets, in document order
    """
    prefix = 'mailto:'
    hrefs = dom.xpath('//a[starts-with(@href, "mailto")]/@href').getall()
    # Slice the prefix off instead of using `str.strip('mailto:')`: strip()
    # treats its argument as a *set of characters* and removes them from both
    # ends, which turns 'mailto:tim@example.com' into '@example.c'.
    return [href[len(prefix):] for href in hrefs if href.startswith(prefix)]
headings(dom)
Extract first 3 levels of heading tags: `<h1>`, `<h2>`, `<h3>`
Source code in microwler/scrape.py
def headings(dom: parsel.Selector):
    """
    Extract first 3 levels of heading tags: `<h1>`, `<h2>`, `<h3>`

    For each level the XPath `string(//hN[1])` is evaluated and its results
    are passed through `remove_multi_whitespace`.

    :param dom: selector the XPath queries are run against
    :return: dict with the keys `h1`, `h2` and `h3`
    """
    def cleaned(query):
        # presumably normalises runs of whitespace - confirm against the helper
        return remove_multi_whitespace(dom.xpath(query).getall())

    level_one = cleaned('string(//h1[1])')
    level_two = cleaned('string(//h2[1])')
    level_three = cleaned('string(//h3[1])')
    return {'h1': level_one, 'h2': level_two, 'h3': level_three}
images(dom)
Extract URLs from `<img>` tags
Source code in microwler/scrape.py
def images(dom: parsel.Selector):
    """
    Collect the image sources referenced by the document.

    :param dom: selector the XPath query is run against
    :return: the `src` value of every `<img>` tag
    """
    sources = dom.xpath('//img/@src')
    return sources.getall()
meta(dom)
Extract `<meta>` tags
Source code in microwler/scrape.py
def meta(dom: parsel.Selector):
    """
    Extract `<meta>` tags

    Builds a mapping from each tag's `name` attribute to its `content`
    attribute. Tags lacking either attribute (for example
    `<meta charset="utf-8">`) are skipped instead of raising.

    :param dom: selector the XPath query is run against
    :return: dict of `name` -> `content`; a later tag with the same name
        overwrites an earlier one
    """
    result = {}
    for tag in dom.xpath('//meta'):
        # Read attributes through `attrib`: `Selector.get()` accepts no
        # arguments, so `tag.get('name')` fails with a TypeError.
        attributes = tag.attrib
        name = attributes.get('name')
        content = attributes.get('content')
        if name is None or content is None:
            continue
        result[name] = content
    return result
paragraphs(dom)
Extract `<p>` tags
Source code in microwler/scrape.py
def paragraphs(dom: parsel.Selector):
    """
    Extract `<p>` tags

    Evaluates the XPath `string(//p[1])` and returns all of its results.

    NOTE(review): `string()` looks like it yields the text of a single node,
    so this may return only the first paragraph - confirm the intent.

    :param dom: selector the XPath query is run against
    :return: the results of the query
    """
    paragraph_text = dom.xpath('string(//p[1])')
    return paragraph_text.getall()
schemas(dom)
Extract itemtype schemas
Source code in microwler/scrape.py
def schemas(dom: parsel.Selector):
    """
    Extract itemtype schemas

    Reads every `itemtype` attribute in the document and keeps only the part
    after the last `/`, e.g. `Person` for `https://schema.org/Person`.

    :param dom: selector the XPath query is run against
    :return: list with one shortened schema name per `itemtype` attribute
    """
    itemtypes = dom.xpath('//*[@itemtype]/@itemtype').getall()
    # rpartition leaves the whole value in the last slot when there is no '/'
    return [itemtype.rpartition('/')[2] for itemtype in itemtypes]
text(dom)
Extract and clean text content
Source code in microwler/scrape.py
def text(dom: parsel.Selector):
    """
    Extract and clean text content

    Serialises the first `//body` match and hands it to `extract_text`.

    :param dom: selector the XPath query is run against
    :return: whatever `extract_text` produces for the body markup
    """
    body_markup = dom.xpath('//body').get()
    return extract_text(body_markup)
title(dom)
Extract `<title>` tag
Source code in microwler/scrape.py
def title(dom: parsel.Selector):
    """
    Extract `<title>` tag

    :param dom: selector the XPath query is run against
    :return: the first result of the XPath `string(//title[1])`
    """
    title_text = dom.xpath('string(//title[1])')
    return title_text.get()