I have been trying to implement a Yelp scraper to collect restaurant reviews, but I keep getting download and timeout errors. I feed in the base URL of the restaurant page and read the number of reviews so I can calculate how many requests I need to make, since Yelp paginates its reviews (10 per page). I run the spider with scrapy crawl yelp -o reviews.csv to get a CSV output of the reviews. I have tried decreasing CONCURRENT_REQUESTS in settings.py, serializing the requests instead of running them concurrently, and changing the timeout delays, but none of it has helped. Any suggestions as to how I could fix this?
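For context, this is roughly the kind of tuning I have been doing in settings.py; the exact values below are only illustrative, not the only combinations I tried:

# settings.py -- illustrative values, I have tried several combinations
CONCURRENT_REQUESTS = 1        # lowered from the default of 16
DOWNLOAD_DELAY = 5             # pause between requests
DOWNLOAD_TIMEOUT = 60          # give slow pages more time before timing out
RETRY_ENABLED = True
RETRY_TIMES = 3                # let Scrapy's RetryMiddleware retry failed downloads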
Here is my spider implementation:
import scrapy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError


class YelpSpider(scrapy.Spider):
    name = 'yelp'
    HEADERS = {
        "user-agent": """ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66,""",
        "referer": None,
    }
    page_number = 0

    def start_requests(self):
        urls = [
            """ https://www.yelp.com/biz/tulsi-indian-eatery-westwood-los-angeles?sort_by=date_desc""",
        ]
        for url in urls:
            yield scrapy.Request(
                url=url,
                callback=self.page_breakdown,
                meta={'playwright': True, 'playwright_include_page': True},
                errback=self.errback_httpbin,
                dont_filter=True,
            )

    def page_breakdown(self, response):
        number_of_reviews = 158  # response.xpath("//div[@class='arrange-unit__09f24__rqHTg css-73a431']/div/span/div[2]/p/text()").get().split(" ")[0]
        number_of_pages = round(int(number_of_reviews) / 10)
        for i in range(0, number_of_pages):
            yield scrapy.Request(
                url=response.url + f'&start={i*10}',
                callback=self.parse_reviews,
                meta={'playwright': True, 'playwright_include_page': True},
                errback=self.errback_httpbin,
                dont_filter=True,
            )

    def errback_httpbin(self, failure):
        self.logger.error(repr(failure))
        print("Reached Error function: ", repr(failure))
        if failure.check(HttpError):
            response = failure.value.response
            print("a_debug: HttpError on %s", response.url)
            self.logger.error("HttpError on %s", response.url)
        elif failure.check(DNSLookupError):
            request = failure.request
            print("a_debug: DNSLookupError on %s", request.url)
            self.logger.error("DNSLookupError on %s", request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            yield scrapy.Request(
                url=request.url,
                callback=self.parse_reviews,
                meta={'playwright': True, 'playwright_include_page': True},
                errback=self.errback_httpbin,
                dont_filter=True,
            )
            print("a_debug: TimeoutError on %s", request.url)
            self.logger.error("a_debug: TimeoutError on %s", request.url)

    def parse_reviews(self, response):
        print("Reached Parsed Reviews")
        all_reviews = response.xpath("//*[@id='reviews']/section/div[2]/ul/li")
        print("a_debug: url: ", response.url)
        for review in all_reviews:
            yield {
                "reviewer": review.xpath(".//*[starts-with(@class, 'user-passport-info')]/span/a/text()").get(),
                "descr": review.css("span.raw__09f24__T4Ezm::text").extract_first(),
                "rating": review.xpath(".//div[@class='css-14g69b3']/@aria-label").get().split(" ")[0],
                "date": review.css("span.css-chan6m::text").extract_first(),
            }
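Since the requests use meta={'playwright': True}, scrapy-playwright is also enabled in settings.py. A minimal sketch of that part of the configuration (following the library's standard setup; the timeout value is just what I currently have) looks like this:

# settings.py -- scrapy-playwright download handler (standard setup)
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

# navigation timeout is given in milliseconds
PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 60 * 1000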
Here is the link to the log file: https://drive.google.com/file/d/1PHhTxiG-bCzgMjo5iMy9I-oOqdWmRVZ3/view?usp=sharing