-
Notifications
You must be signed in to change notification settings - Fork 1
/
distros.py
30 lines (25 loc) · 1.06 KB
/
distros.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# -*- coding: utf-8 -*-
import scrapy
class DistrosItem(scrapy.Item):
    """Container for the data scraped from a single distribution page."""

    # Identifier of the distribution the item was scraped for.
    title = scrapy.Field()

    # List of per-release records built by the spider.
    data = scrapy.Field()
class DistrosSpider(scrapy.Spider):
    """Spider that pairs distribution releases with their Python versions.

    Starts at the distrowatch.com table page, follows one request per
    ``<option>`` value found in its ``<select>`` elements, and emits one
    ``DistrosItem`` per distribution page.
    """

    name = "distros"
    allowed_domains = ["distrowatch.com"]
    start_urls = (
        'http://distrowatch.com/table.php',
    )

    def parse(self, response):
        """Yield one request per distribution listed on the start page.

        Every non-empty ``value`` attribute of a ``select/option`` element is
        treated as a distribution identifier and passed on to
        ``parse_distro_page`` through ``meta['name']``.
        """
        for distro in response.xpath("//select/option/@value").extract():
            if not distro:
                # A blank option value would only request the table page
                # again with an empty ``distribution`` parameter.
                continue
            url = 'http://distrowatch.com/table.php?distribution={}'.format(distro)
            yield scrapy.Request(
                url,
                callback=self.parse_distro_page,
                meta={'name': distro},
            )

    def parse_distro_page(self, response):
        """Build a ``DistrosItem`` from a single distribution page.

        ``item['title']`` is the identifier carried in ``meta['name']``.
        ``item['data']`` is a list of ``{'Distro_Version', 'Python_version'}``
        dicts, pairing the text of the ``TablesInvert`` cells with the text of
        the cells found next to the "Python" header link. If the page has no
        such link, ``data`` is an empty list instead of raising ``IndexError``.
        """
        item = DistrosItem()
        item['title'] = response.meta['name']

        python_links = response.xpath("//th/a[text()='Python']")
        if not python_links:
            # Not every page is guaranteed to carry a "Python" row; indexing
            # an empty selector list would abort the callback.
            self.logger.warning("No Python row found on %s", response.url)
            item['data'] = []
            return item

        # The link's grandparent is presumably the table row; its <td> cells
        # hold one Python version per release column.
        pyversions = python_links[0].xpath("../../td/text()").extract()
        versions = response.xpath("//td[@class='TablesInvert']/text()").extract()

        # zip() stops at the shorter list, so unmatched trailing cells are
        # silently dropped.
        item['data'] = [
            {'Distro_Version': dv, 'Python_version': pv}
            for dv, pv in zip(versions, pyversions)
        ]
        return item