!scrapy startproject tutorial
!ls -R tutorial

%%writefile tutorial/tutorial/items.py
from scrapy.item import Item, Field


class DmozItem(Item):
    title = Field()
    link = Field()
    desc = Field()

%%writefile tutorial/tutorial/spiders/dmoz_spider.py
from scrapy.spider import BaseSpider


class DmozSpider(BaseSpider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        # Save each page's raw HTML to a local file named after the
        # last path component of its URL.
        filename = response.url.split("/")[-2]
        with open(filename, 'wb') as f:
            f.write(response.body)

!cd tutorial/; scrapy crawl dmoz

%%writefile tutorial/tutorial/spiders/dmoz_spider.py
from scrapy.spider import BaseSpider
from scrapy.selector import Selector


class DmozSpider(BaseSpider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        # Pull the title, link, and description out of each <li>
        # in the directory listing.
        sel = Selector(response)
        sites = sel.xpath('//ul/li')
        for site in sites:
            title = site.xpath('a/text()').extract()
            link = site.xpath('a/@href').extract()
            desc = site.xpath('text()').extract()
            print title, link, desc

!cd tutorial/; scrapy crawl dmoz

%%writefile tutorial/tutorial/spiders/dmoz_spider.py
from scrapy.spider import BaseSpider
from scrapy.selector import Selector

from tutorial.items import DmozItem


class DmozSpider(BaseSpider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        # Same extraction as before, but store the fields in DmozItem
        # instances so Scrapy's feed exports can serialize them.
        sel = Selector(response)
        sites = sel.xpath('//ul/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.xpath('a/text()').extract()
            item['link'] = site.xpath('a/@href').extract()
            item['desc'] = [x.strip() for x in site.xpath('text()').extract()]
            items.append(item)
        return items

!cd tutorial/; scrapy crawl dmoz
!cd tutorial/; scrapy crawl dmoz -o items.json -t json
!head tutorial/items.json

%%writefile tutorial/tutorial/spiders/dmoz_spider.py
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.conf import settings

from tutorial.items import DmozItem

# Be polite: wait a second between requests.
settings.overrides['DOWNLOAD_DELAY'] = 1


class DmozSpider(CrawlSpider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = ['http://www.dmoz.org/Computers/Programming/Languages']

    # CrawlSpider applies only the first rule whose extractor matches a
    # link, so the Python-specific rule (which has the callback) must
    # come before the catch-all rule that merely follows links.
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'www\.dmoz\.org/Computers/Programming/Languages/Python',),
                               unique=True),
             callback='parse_python_page', follow=True),
        # Follow everything else, skipping URLs with a query string.
        Rule(SgmlLinkExtractor(deny=(r'\?',)), follow=True),
    )

    def parse_python_page(self, response):
        sel = Selector(response)
        sites = sel.xpath('//ul/li')
        print response.url
        for site in sites:
            try:
                url = site.xpath('a/@href').extract()[0]
            except IndexError:  # an <li> without a link
                continue
            # Resolve site-relative links against the dmoz.org root.
            if url.startswith('/'):
                url = 'http://www.dmoz.org' + url
            yield Request(url=url)

!cd tutorial/; scrapy crawl dmoz
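
As a closing sanity check, we can load the feed exported earlier and summarize it. This is a minimal sketch, assuming the `-o items.json -t json` run above succeeded and wrote `tutorial/items.json` as a single JSON array (the layout Scrapy's JSON feed exporter produces); the five-item preview and the field names come from the `DmozItem` definition above.

import json

# Load the exported feed. Assumes tutorial/items.json exists and holds
# one JSON array of scraped items.
with open('tutorial/items.json') as f:
    items = json.load(f)

print len(items), 'items scraped'
for item in items[:5]:
    # Each field is a list, because XPath extract() returns a list of
    # matches rather than a single string.
    print item.get('title'), item.get('link')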