from __future__ import print_function import os import sys import multiprocessing from multiprocessing.queues import Queue import lxml.etree import lxml.html from scrapy import project, signals from scrapy.spider import BaseSpider from scrapy.item import Item, Field from scrapy.crawler import CrawlerProcess from scrapy.xlib.pydispatch import dispatcher from scrapy.utils.project import get_project_settings from scrapy.http import Request from scrapy.selector import XPathSelector, XmlXPathSelector, HtmlXPathSelector TMP_DIR = './tmp' class ResponseItem(Item): response = Field() class ResponseSpider(BaseSpider): name = 'response_spider' def __init__(self, url): self.url = url super(ResponseSpider, self).__init__() def start_requests(self): return [Request(self.url, self.parse, dont_filter=True)] def parse(self, response): # request with callback fails to serialize - why? req = response.request.replace(callback=None) return ResponseItem( response=response.replace(request=req), ) class CrawlerWorker(multiprocessing.Process): def __init__(self, result_queue, spider, settings=None): multiprocessing.Process.__init__(self) self.settings = settings or get_project_settings() self.result_queue = result_queue self.spider = spider self.items = [] dispatcher.connect(self._item_passed, signals.item_passed) def _item_passed(self, item): self.items.append(item) def run(self): self.crawler = CrawlerProcess(self.settings) self.crawler.install() self.crawler.configure() self.crawler.crawl(self.spider) self.crawler.start() self.crawler.stop() self.result_queue.put(self.items) def _download(url): result_queue = Queue() spider = ResponseSpider(url) crawler = CrawlerWorker(result_queue, spider) crawler.start() item = result_queue.get()[0] result_queue.cancel_join_thread() crawler.join() return item['response'] def set_base(body, base): if '', '' % base) return body def download(url): """ Download 'url' using Scrapy. Return Response. """ response = _download(url) return response.replace(body=set_base(response.body, url)) from IPython import display def _show_in_iframe(local_url): fname = os.path.join(TMP_DIR, 'output.html') html = """

""" % local_url display.display(display.HTML(html)) def show_in_iframe(html): fname = os.path.join(TMP_DIR, 'output.html') with open(fname, 'wb') as f: f.write(html) _show_in_iframe('http://127.0.0.1:8000/output.html') def _highlight(hxs): el = hxs._root el.attrib['style'] = 'background-color: yellow;' + el.get('style', '') def show_hxs_select(hxs, xpath): for link in hxs.select(xpath): _highlight(link) body = lxml.html.tostring(hxs._root.getroottree()) show_in_iframe(body) def show_xpath(url, xpath): response = download(url) hxs = HtmlXPathSelector(response) show_hxs_select(hxs, xpath) show_xpath('http://crawlera.com', '//a[contains(text(), "i")]')