Witam,
Mam do zescrapowania ponad 100 portali informacyjnych. Napisałem szablon i chciałbym rotować w nim argumentami, jednak wskazówki z dokumentacji nie pomagają mi w rozwiązaniu problemu. Python nie widzi argumentów w obiekcie Rule. Poniżej jest przykład jakby to miało wyglądać dla jednego z portali. Czy ktoś miałby jakieś sugestie?
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from w3lib.html import remove_tags
class UniwersalscraperSpider(CrawlSpider):
name = 'Uniwersalscraper'
def __init__(self,url="",domain="", xpath_text='',xpath_title = '',xpath_date ='', xpath_summarize='',xpath_pagination='',xpath_all_articles_links_on_page='', *args, **kwargs):
super(UniwersalscraperSpider, self).__init__(*args, **kwargs)
self.start_urls = [url]
self.allowed_domains = [domain]
self.xpath_text= xpath_text
self.xpath_title = xpath_title
self.xpath_date = xpath_date
self.xpath_summarize= xpath_summarize
self.xpath_pagination = xpath_pagination
self.xpath_all_articles_links_on_page=xpath_all_articles_links_on_page
rules = (Rule(LinkExtractor(restrict_xpaths=xpath_all_articles_links_on_page), callback='parse', follow=True),
Rule(LinkExtractor(restrict_xpaths=xpath_pagination)),
)
def parse(self, response):
Ugly_text = response.xpath(self.xpath_text).getall()
Good_text = [remove_tags(text) for text in Ugly_text]
yield {
"Title" : response.xpath(self.xpath_title).get(),
"Date" : response.xpath(self.xpath_date).get(),
"Summarize" : response.xpath(self.xpath_summarize).get(),
"Text" : Good_text,
"Url" : response.url
}
process = CrawlerProcess(get_project_settings())
process.crawl('Uniwersalscraper',
url = 'https://krosno24.pl/archiwum',
domain = 'krosno24.pl',
xpath_all_articles_links_on_page = "//ul[@class='archive-list']/li/a",
xpath_pagination = "//div[@class='pagination-container']/a[position () = last()-1]",
xpath_title = "normalize-space((//h1/text())[1])",
xpath_date = "(//div[@class='article-stats']/div/text())[1]",
xpath_summarize = "//div[@class='article-excerpt']/text()",
xpath_text = 'normalize-space(//div[@class="col-lg article"])',
)
process.start()