from scrapy.loader import ItemLoader from scrapy.loader.processors import TakeFirst, MapCompose, Join class DemoLoader(ItemLoader): default_output_processor = TakeFirst() title_in = MapCompose(unicode.title) title_out = Join() size_in = MapCompose(unicode.strip) # you can continue scraping here
from scrapy.loader import ItemLoader from demoproject.items import Demo def parse(self, response): l = ItemLoader(item = Product(), response = response) l.add_xpath("title", "//div[@class = 'product_title']") l.add_xpath("title", "//div[@class = 'product_name']") l.add_xpath("desc", "//div[@class = 'desc']") l.add_css("size", "div#size]") l.add_value("last_updated", "yesterday") return l.load_item()
1. //div[@class = "product_title"]
2. //div[@class = "product_name"]
l = ItemLoader(Product(), some_selector) l.add_xpath("title", xpath1) # [1] l.add_xpath("title", xpath2) # [2] l.add_css("title", css) # [3] l.add_value("title", "demo") # [4] return l.load_item() # [5]
import scrapy from scrapy.loader.processors import Join, MapCompose, TakeFirst from w3lib.html import remove_tags def filter_size(value): if value.isdigit(): return value class Item(scrapy.Item): name = scrapy.Field( input_processor = MapCompose(remove_tags), output_processor = Join(), ) size = scrapy.Field( input_processor = MapCompose(remove_tags, filter_price), output_processor = TakeFirst(), ) >>> from scrapy.loader import ItemLoader >>> il = ItemLoader(item = Product()) >>> il.add_value('title', [u'Hello', u'<strong>world</strong>']) >>> il.add_value('size', [u'<span>100 kg</span>']) >>> il.load_item()
{'title': u'Hello world', 'size': u'100 kg'}
def parse_length(text, loader_context): unit = loader_context.get('unit', 'cm') # You can write parsing code of length here return parsed_length
loader = ItemLoader (product) loader.context ["unit"] = "mm"
loader = ItemLoader(product, unit = "mm")
class ProductLoader(ItemLoader): length_out = MapCompose(parse_length, unit = "mm")
class scrapy.loader.ItemLoader([item, selector, response, ]**kwargs)
参数和说明 |
item
它是通过调用 add_xpath()、add_css() 或 add_value() 来填充的项目。
|
selector
用于从网站中提取数据。
|
response
用于使用default_selector_class构造选择器。
|
方法和说明 | 示例 |
get_value(value, *processors, **kwargs)
根据给定的处理器和关键字参数,值由 get_value() 方法处理。
|
>>> from scrapy.loader.processors import TakeFirst >>> loader.get_value(u'title: demoweb', TakeFirst(), unicode.upper, re = 'title: (.+)') 'DEMOWEB` |
add_value(field_name, value, *processors, **kwargs)
它处理值并添加到它首先通过 get_value 传递的字段中在通过字段输入处理器之前给出处理器和关键字参数。
|
loader.add_value('title', u'DVD') loader.add_value('colors', [u'black', u'white']) loader.add_value('length', u'80') loader.add_value('price', u'2500') |
replace_value(field_name, value, *processors, **kwargs)
它将收集的数据替换为新值。
|
loader.replace_value('title', u'DVD') loader.replace_value('colors', [u'black', u'white']) loader.replace_value('length', u'80') loader.replace_value('price', u'2500') |
get_xpath(xpath, *processors, **kwargs)
它用于通过接收
XPath给处理器和关键字参数来提取unicode字符串.
|
# HTML code: <div class = "item-name">DVD</div> loader.get_xpath("//div[@class = 'item-name']") # HTML code: <div id = "length">the length is 45cm</div> loader.get_xpath("//div[@id = 'length']", TakeFirst(), re = "the length is (.*)") |
add_xpath(field_name, xpath, *processors, **kwargs)
它接收
XPath到提取unicode字符串的字段.
|
# HTML code: <div class = "item-name">DVD</div> loader.add_xpath('name', '//div [@class = "item-name"]') # HTML code: <div id = "length">the length is 45cm</div> loader.add_xpath('length', '//div[@id = "length"]', re = 'the length is (.*)') |
replace_xpath(field_name, xpath, *processors, **kwargs)
它使用
XPath 从站点替换收集的数据。
|
# HTML code: <div class = "item-name">DVD</div> loader.replace_xpath('name', ' //div[@class = "item-name"]') # HTML code: <div id = "length">the length is 45cm</div> loader.replace_xpath('length', ' //div[@id = "length"]', re = 'the length is (.*)') |
get_css(css, *processors, **kwargs)
它接收用于提取unicode字符串的CSS选择器。
|
loader.get_css("div.item-name") loader.get_css("div#length", TakeFirst(), re = "the length is (.*)") |
add_css(field_name, css, *processors, **kwargs)
它与 add_value() 方法类似,但有一个区别是它添加了 CSS 选择器领域。
|
loader.add_css('name', 'div.item-name') loader.add_css('length', 'div#length', re = 'the length is (.*)') |
replace_css(field_name, css, *processors, **kwargs)
它使用CSS选择器替换提取的数据。
|
loader.replace_css('name', 'div.item-name') loader.replace_css('length', 'div#length', re = 'the length is (.*)') |
load_item()
当收集到数据时,此方法用收集到的数据填充项目并返回。
|
def parse(self, response): l = ItemLoader(item = Product(), response = response) l.add_xpath('title', '// div[@class = "product_title"]') loader.load_item() |
nested_xpath(xpath)
它用于创建带有 XPath 选择器的嵌套加载器。
|
loader = ItemLoader(item = Item()) loader.add_xpath('social', ' a[@class = "social"]/@href') loader.add_xpath('email', ' a[@class = "email"]/@href') |
nested_css(css)
它用于创建带有 CSS 选择器的嵌套加载器。
|
loader = ItemLoader(item = Item()) loader.add_css('social', 'a[@class = "social"]/@href') loader.add_css('email', 'a[@class = "email"]/@href') |
属性和描述 |
item
Item Loader 解析的对象。
|
context
它是当前处于活动状态的项目加载器的上下文。
|
default_item_class
如果没有在构造函数中给出,则用于表示项目。
|
default_input_processor
未指定输入处理器的字段是唯一使用 default_input_processor 的字段。
|
default_output_processor
未指定输出处理器的字段是唯一使用 default_output_processor 的字段。
|
default_selector_class
如果在构造器中没有给出,则是用来构造选择器的类。
|
selector
它是一个可以用来从站点中提取数据的对象。
|
<header> <a class = "social" href = "http://facebook.com/whatever">facebook</a> <a class = "social" href = "http://twitter.com/whatever">twitter</a> <a class = "email" href = "mailto:someone@example.com">send mail</a> </header>
loader = ItemLoader(item = Item()) header_loader = loader.nested_xpath('//header') header_loader.add_xpath('social', 'a[@class = "social"]/@href') header_loader.add_xpath('email', 'a[@class = "email"]/@href') loader.load_item()
from scrapy.loader.processors import MapCompose from demoproject.ItemLoaders import DemoLoader def strip_dashes(x): return x.strip('-') class SiteSpecificLoader(DemoLoader): title_in = MapCompose(strip_dashes, DemoLoader.title_in)
>>> from scrapy.loader.processors import Identity >>> proc = Identity() >>> proc(['a', 'b', 'c']) ['a', 'b', 'c']
>>> from scrapy.loader.processors import TakeFirst >>> proc = TakeFirst() >>> proc(['', 'a', 'b', 'c']) 'a'
>>> from scrapy.loader.processors import Join >>> proc = Join() >>> proc(['a', 'b', 'c']) u'a b c' >>> proc = Join('<br>') >>> proc(['a', 'b', 'c']) u'a<br>b<br>c'
>>> from scrapy.loader.processors import Compose >>> proc = Compose(lambda v: v[0], str.upper) >>> proc(['python', 'scrapy']) 'PYTHON'
>>> def filter_scrapy(x): return None if x == 'scrapy' else x >>> from scrapy.loader.processors import MapCompose >>> proc = MapCompose(filter_scrapy, unicode.upper) >>> proc([u'hi', u'everyone', u'im', u'pythonscrapy']) [u'HI, u'IM', u'PYTHONSCRAPY']
>>> from scrapy.loader.processors import SelectJmes, Compose, MapCompose >>> proc = SelectJmes("hello") >>> proc({'hello': 'scrapy'}) 'scrapy' >>> proc({'hello': {'scrapy': 'world'}}) {'scrapy': 'world'}
>>> import json >>> proc_single_json_str = Compose(json.loads, SelectJmes("hello")) >>> proc_single_json_str('{"hello": "scrapy"}') u'scrapy' >>> proc_json_list = Compose(json.loads, MapCompose(SelectJmes('hello'))) >>> proc_json_list('[{"hello":"scrapy"}, {"world":"env"}]') [u'scrapy']