Scrapy - rotating user-supplied arguments in a Crawler


Hi,

I have over 100 news portals to scrape. I wrote a template spider and would like to rotate arguments through it, but the hints from the documentation are not helping me solve the problem. Python does not see the arguments inside the Rule objects. Below is an example of how it would look for one of the portals. Does anyone have any suggestions?

import scrapy
from scrapy.linkextractors import LinkExtractor 
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings 
from w3lib.html import remove_tags



class UniwersalscraperSpider(CrawlSpider):

    name = 'Uniwersalscraper'

    def __init__(self, url="", domain="", xpath_text='', xpath_title='',
                 xpath_date='', xpath_summarize='', xpath_pagination='',
                 xpath_all_articles_links_on_page='', *args, **kwargs):
        super(UniwersalscraperSpider, self).__init__(*args, **kwargs)
        self.start_urls = [url]
        self.allowed_domains = [domain]
        self.xpath_text = xpath_text
        self.xpath_title = xpath_title
        self.xpath_date = xpath_date
        self.xpath_summarize = xpath_summarize
        self.xpath_pagination = xpath_pagination
        self.xpath_all_articles_links_on_page = xpath_all_articles_links_on_page

        rules = (
            Rule(LinkExtractor(restrict_xpaths=xpath_all_articles_links_on_page), callback='parse', follow=True),
            Rule(LinkExtractor(restrict_xpaths=xpath_pagination)),
        )

    def parse(self, response):
        Ugly_text = response.xpath(self.xpath_text).getall()
        Good_text = [remove_tags(text) for text in Ugly_text]
        yield {
            "Title": response.xpath(self.xpath_title).get(),
            "Date": response.xpath(self.xpath_date).get(),
            "Summarize": response.xpath(self.xpath_summarize).get(),
            "Text": Good_text,
            "Url": response.url,
        }

process = CrawlerProcess(get_project_settings())
process.crawl('Uniwersalscraper',
              url='https://krosno24.pl/archiwum',
              domain='krosno24.pl',
              xpath_all_articles_links_on_page="//ul[@class='archive-list']/li/a",
              xpath_pagination="//div[@class='pagination-container']/a[position() = last()-1]",
              xpath_title="normalize-space((//h1/text())[1])",
              xpath_date="(//div[@class='article-stats']/div/text())[1]",
              xpath_summarize="//div[@class='article-excerpt']/text()",
              xpath_text='normalize-space(//div[@class="col-lg article"])',
              )
process.start()

What error are you getting?


@ledi12: Here is a fragment of the terminal output:

2021-12-14 1227 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://krosno24.pl/robots.txt> (referer: None)
2021-12-14 1227 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://krosno24.pl/archiwum> (referer: None)
2021-12-14 1227 [scrapy.core.engine] INFO: Closing spider (finished)
...
...
...

File "C:\Users\Admin\Desktop\Repozytorium\uniwersalscraper\uniwersalscraper\spiders\UniwersalScraper.py", line 15, in init
super(UniwersalscraperSpider, self).init(*args, **kwargs)
File "C:\Users\Admin\anaconda3\lib\site-packages\scrapy\spiders\crawl.py", line 68, in init
self._compile_rules()
File "C:\Users\Admin\anaconda3\lib\site-packages\scrapy\spiders\crawl.py", line 131, in _compile_rules
for rule in self.rules:
TypeError: 'method' object is not iterable

[THIS ONE WAS FIXED BY ADDING self. TO THE VARIABLES xpath_all_articles_links_on_page AND xpath_pagination]
...
...

[THIS ONE IS STILL LEFT]
File "c:/Users/Admin/Desktop/Repozytorium/uniwersalscraper/uniwersalscraper/spiders/UniwersalScraper.py", line 49, in <module>
process.start()
...
raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable


UPDATE:

in the end it was enough to add:

process.start(stop_after_crawl=False)
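Side note, since the goal is rotating through 100+ portals: instead of keeping the reactor alive with stop_after_crawl=False, you can also schedule every crawl up front and call start() just once; the reactor then stops cleanly after the whole batch, so ReactorNotRestartable never comes up. A minimal sketch, with a made-up portals list purely for illustration:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Hypothetical per-portal configuration; in practice this could be loaded
# from a JSON/CSV file, one dict of spider kwargs per portal.
portals = [
    {"url": "https://krosno24.pl/archiwum", "domain": "krosno24.pl"},
    # ... remaining portals, each with its xpath_* arguments ...
]

process = CrawlerProcess(get_project_settings())
for portal in portals:
    # crawl() only schedules a spider run; nothing starts yet.
    process.crawl('Uniwersalscraper', **portal)

# A single start() runs all scheduled spiders and stops the reactor once
# every crawl has finished.
process.start()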

Unfortunately, the crawler shuts down right after entering the site (I get two (200) responses, from robots.txt and the start URL). My linter greys out the rules tuple. And it may be right, because in a regular CrawlSpider rules sits outside the methods. But if I move it outside the methods, I won't be able to swap the arguments inside Rule, because then I have to remove self.:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from w3lib.html import remove_tags

class UniwersalscraperSpider(CrawlSpider):

    name = 'Uniwersalscraper'

    def __init__(self, url="", domain="", xpath_text="", xpath_title="",
                 xpath_date="", xpath_summarize="", *args, **kwargs):
        super(UniwersalscraperSpider, self).__init__(*args, **kwargs)
        self.start_urls = [url]
        self.allowed_domains = [domain]
        self.xpath_text = xpath_text
        self.xpath_title = xpath_title
        self.xpath_date = xpath_date
        self.xpath_summarize = xpath_summarize

        rules = (
            Rule(LinkExtractor(restrict_xpaths=self.xpath_all_articles_links_on_page), callback='parse', follow=True),
            Rule(LinkExtractor(restrict_xpaths=self.xpath_pagination)),
        )

    def parse(self, response):
        Ugly_text = response.xpath(self.xpath_text).getall()
        Good_text = [remove_tags(text) for text in Ugly_text]
        yield {
            "Title": response.xpath(self.xpath_title).get(),
            "Date": response.xpath(self.xpath_date).get(),
            "Summarize": response.xpath(self.xpath_summarize).get(),
            "Text": Good_text,
            "Url": response.url,
        }

process = CrawlerProcess(get_project_settings())
process.crawl('Uniwersalscraper',
            url = 'https://krosno24.pl/archiwum',
            domain = 'krosno24.pl',
            xpath_all_articles_links_on_page = '//ul[@class="archive-list"]/li/a', 
xpath_pagination = '//div[@class="pagination-container"]/a[position() = last()-1]',
            xpath_title = 'normalize-space((//h1/text())[1])',
            xpath_date = '(//div[@class="article-stats"]/div/text())[1]',
            xpath_summarize = '//div[@class="article-excerpt"]/text()',
            xpath_text = 'normalize-space(//div[@class="col-lg article"])'
            )
process.start(stop_after_crawl=False)
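For completeness: the linter is right, rules here is still just a local variable, so CrawlSpider never sees it. The traceback above already shows why order matters: CrawlSpider.__init__ calls _compile_rules(), so self.rules has to be assigned before super().__init__() runs. On top of that, the Scrapy docs warn against using parse as a rule callback, because CrawlSpider implements parse itself to apply the rules, which would explain the spider closing right after the start URL. A sketch of one possible fix (untested against the site; parse_item is just my name choice):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from w3lib.html import remove_tags


class UniwersalscraperSpider(CrawlSpider):

    name = 'Uniwersalscraper'

    def __init__(self, url="", domain="", xpath_pagination="",
                 xpath_all_articles_links_on_page="", *args, **kwargs):
        self.start_urls = [url]
        self.allowed_domains = [domain]
        # self.rules must be assigned BEFORE calling super().__init__(),
        # because CrawlSpider.__init__ calls _compile_rules() (the call
        # visible in the traceback above).
        self.rules = (
            Rule(LinkExtractor(restrict_xpaths=xpath_all_articles_links_on_page),
                 callback='parse_item', follow=True),
            Rule(LinkExtractor(restrict_xpaths=xpath_pagination)),
        )
        # The remaining xpath_* keyword arguments land in **kwargs and are
        # copied onto the spider as attributes by Spider.__init__.
        super(UniwersalscraperSpider, self).__init__(*args, **kwargs)

    # Renamed from parse: CrawlSpider uses parse() internally to apply the
    # rules, so overriding it disables the rule-following logic.
    def parse_item(self, response):
        ugly_text = response.xpath(self.xpath_text).getall()
        yield {
            "Title": response.xpath(self.xpath_title).get(),
            "Date": response.xpath(self.xpath_date).get(),
            "Summarize": response.xpath(self.xpath_summarize).get(),
            "Text": [remove_tags(text) for text in ugly_text],
            "Url": response.url,
        }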
