Consistently getting timeout errors in scrapy-playwright

I've been trying to implement a Yelp scraper to collect restaurant reviews, but I am constantly getting download and timeout errors. I feed in the restaurant's base URL, read the total number of reviews, and calculate how many requests I need to make, since Yelp paginates reviews ten per page. I run scrapy crawl yelp -o reviews.csv to get a CSV output of the reviews. I have tried decreasing CONCURRENT_REQUESTS in settings.py, and I even tried serializing the requests rather than running them concurrently, but had no luck. I also tried changing the timeout delays and it still failed. Any suggestions as to how I could fix this?
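For reference, this is roughly what the relevant part of my settings.py looks like. The two handler lines and the reactor are the standard scrapy-playwright setup; the timeout values are just examples of what I tried, not recommendations:

DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

CONCURRENT_REQUESTS = 1  # serialized the requests, as described above
DOWNLOAD_TIMEOUT = 180  # Scrapy-level timeout, in seconds
PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 60 * 1000  # Playwright navigation timeout, in ms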

Here is my spider implementation:

import math

import scrapy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError, TCPTimedOutError


class YelpSpider(scrapy.Spider):
    name = 'yelp'
    HEADERS = {
        # Single-line UA string; a triple-quoted version would embed
        # newlines and indentation in the header value.
        "user-agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66"
        ),
        "referer": None,
    }
    page_number = 0

    def start_requests(self):
        urls = [
            "https://www.yelp.com/biz/tulsi-indian-eatery-westwood-los-angeles?sort_by=date_desc",
        ]
        for url in urls:
            yield scrapy.Request(
                url=url,
                callback=self.page_breakdown,
                meta={'playwright': True, 'playwright_include_page': True},
                errback=self.errback_httpbin,
                dont_filter=True,
            )

    def page_breakdown(self, response):
        # Hard-coded for now; the commented XPath is meant to extract the count:
        # response.xpath("//div[@class='arrange-unit__09f24__rqHTg css-73a431']"
        #                "/div/span/div[2]/p/text()").get().split(" ")[0]
        number_of_reviews = 158
        # ceil, not round: round(152 / 10) == 15 would drop the last page.
        number_of_pages = math.ceil(int(number_of_reviews) / 10)
        for i in range(number_of_pages):
            yield scrapy.Request(
                url=response.url + f'&start={i * 10}',
                callback=self.parse_reviews,
                meta={'playwright': True, 'playwright_include_page': True},
                errback=self.errback_httpbin,
                dont_filter=True,
            )

    def errback_httpbin(self, failure):
        self.logger.error(repr(failure))
        if failure.check(HttpError):
            response = failure.value.response
            self.logger.error("HttpError on %s", response.url)
        elif failure.check(DNSLookupError):
            request = failure.request
            self.logger.error("DNSLookupError on %s", request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error("TimeoutError on %s", request.url)
            # Manual retry; note this bypasses RetryMiddleware's retry limit.
            yield scrapy.Request(
                url=request.url,
                callback=self.parse_reviews,
                meta={'playwright': True, 'playwright_include_page': True},
                errback=self.errback_httpbin,
                dont_filter=True,
            )

    def parse_reviews(self, response):
        all_reviews = response.xpath("//*[@id='reviews']/section/div[2]/ul/li")
        for review in all_reviews:
            yield {
                "reviewer": review.xpath(
                    ".//*[starts-with(@class, 'user-passport-info')]/span/a/text()"
                ).get(),
                "descr": review.css("span.raw__09f24__T4Ezm::text").get(),
                "rating": review.xpath(
                    ".//div[@class='css-14g69b3']/@aria-label"
                ).get().split(" ")[0],
                "date": review.css("span.css-chan6m::text").get(),
            }
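One thing I am unsure about: I set playwright_include_page=True on every request, but I never close the page, and the scrapy-playwright docs say the callback is responsible for closing it. A sketch of what an async callback that closes the page might look like (whether this is related to the timeouts is an assumption on my part; the item fields are trimmed for brevity):

    async def parse_reviews(self, response):
        # With playwright_include_page=True, scrapy-playwright puts the Page
        # object in response.meta and leaves closing it to the callback; an
        # unclosed page keeps its browser tab alive for the rest of the crawl.
        page = response.meta["playwright_page"]
        await page.close()
        for review in response.xpath("//*[@id='reviews']/section/div[2]/ul/li"):
            yield {
                "descr": review.css("span.raw__09f24__T4Ezm::text").get(),
            }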

Here is the link to the log file: https://drive.google.com/file/d/1PHhTxiG-bCzgMjo5iMy9I-oOqdWmRVZ3/view?usp=sharing

