#
# Gianluca Demartini, 2015
# http://gianlucademartini.net
# Processing of WikiReverse (WikiR) data https://wikireverse.org/data (version 2014-23-data) to see which top-level domains (e.g., .com, .net) link to each Wikipedia edition.
# The final data goes to a tab-separated file which is loaded into Tableau Public to generate this visualisation: http:
#
import pandas as pd
file0='./Desktop/2014-23-data/part-0'
file1='./Desktop/2014-23-data/part-1'
file2='./Desktop/2014-23-data/part-2'
file3='./Desktop/2014-23-data/part-3'
header_row=['lan','wiki','url','WebTitle', 'time']
#load data from files
try:
wikiR0 = pd.read_csv(file0, error_bad_lines=False, header=None, names=header_row, sep='\t')
wikiR1 = pd.read_csv(file1, error_bad_lines=False, header=None, names=header_row, sep='\t')
wikiR2 = pd.read_csv(file2, error_bad_lines=False, header=None, names=header_row, sep='\t')
wikiR3 = pd.read_csv(file3, error_bad_lines=False, header=None, names=header_row, sep='\t')
except pd.parser.CParserError as e:
print e
#put all in one dataframe
wikiR = wikiR0.append(wikiR1, ignore_index=True)
wikiR = wikiR.append(wikiR2, ignore_index=True)
wikiR = wikiR.append(wikiR3, ignore_index=True)
wikiR.head()
lan | wiki | url | WebTitle | time | |
---|---|---|---|---|---|
0 | ab | ???????? | http://mail-archives.apache.org/mod_mbox/openo... | NaN | 2014-07-13 08:10:00 |
1 | ab | 1939 | http://rarplayer.appspot.com/wiki/1939 | 1939 – Wikipedia, wolna encyklopedia | 2014-08-02 06:30:00 |
2 | ab | 1994 | http://rarplayer.appspot.com/wiki/1994 | 1994 – Wikipedia, wolna encyklopedia | 2014-08-01 00:02:00 |
3 | ab | 2004 | http://instapedia.com/m/2004 | 2004 - iPhone/Mobile Wikipedia | 2014-07-13 16:18:00 |
4 | ab | 2007 | http://rarplayer.appspot.com/wiki/2007 | 2007 – Wikipedia, wolna encyklopedia | 2014-07-31 09:37:00 |
# Number of rows (inlink records) loaded into wikiR.
wikiR.shape[0]
36301008
from __future__ import with_statement
from urlparse import urlparse
import urllib2
# Load the public-suffix rules, ignoring comment lines (starting with "/")
# and empty lines.
tld_list_url = "https://publicsuffix.org/list/effective_tld_names.dat"
filehandle = urllib2.urlopen(tld_list_url)
try:
    # Keep the rules in a set: get_TLdomain performs several membership tests
    # per URL, and a set answers each one in constant time instead of scanning
    # a list of every rule.
    tlds = set()
    for line in filehandle:
        rule = line.strip()
        if rule and not rule.startswith("/"):
            tlds.add(rule)
finally:
    # Always release the HTTP connection, even if reading fails part-way.
    filehandle.close()
# Function to get the top-level domain (public suffix) from a url, given the
# collection of public-suffix rules.
try:
    from urlparse import urlparse  # Python 2
except ImportError:
    from urllib.parse import urlparse  # Python 3

# Text types accepted as a url: str and unicode on Python 2, str on Python 3.
_TEXT_TYPES = (str, type(u""))


def get_TLdomain(url, tlds):
    """Return the public suffix ("TLD") of *url* according to the rules in *tlds*.

    Parameters:
        url:  the URL to inspect. Anything that is not a text string (for
              example a NaN from a malformed row) is treated as having no host.
        tlds: container of public-suffix rules supporting ``in``: plain rules
              ("com", "co.uk"), wildcard rules ("*.ck") and exception rules
              ("!www.ck"). A set makes each lookup constant-time.

    Returns:
        The longest matching suffix of the host name, e.g. "co.uk" for
        "http://abcde.co.uk/". For an exception rule the suffix is the rule
        without its leftmost label ("!www.ck" -> "ck"). If nothing matches, a
        message is printed and the string "null" is returned.

    The host is taken without user info or port, lower-cased, and stripped of
    a trailing root dot, so "http://www.example.com:8080/" and
    "http://www.example.com./" both yield "com". Punycode labels ("xn--...")
    are compared as-is and therefore only match rules stored in that form.
    """
    host = None
    if isinstance(url, _TEXT_TYPES):
        try:
            # .hostname drops "user:pw@" and ":port" and lower-cases the name.
            host = urlparse(url).hostname
        except ValueError:
            # urlparse rejects some malformed netlocs; treat as "no host".
            host = None
    host = (host or "").rstrip(".")
    url_elements = host.split(".") if host else []
    # url_elements = ["abcde", "co", "uk"]

    # Try the longest suffix first so the most specific rule wins.
    for i in range(-len(url_elements), 0):
        last_i_elements = url_elements[i:]
        # i=-3: ["abcde", "co", "uk"]
        # i=-2: ["co", "uk"]
        # i=-1: ["uk"]
        candidate = ".".join(last_i_elements)  # abcde.co.uk, co.uk, uk
        wildcard_candidate = ".".join(["*"] + last_i_elements[1:])  # *.co.uk, *.uk, *
        exception_candidate = "!" + candidate

        if exception_candidate in tlds:
            # An exception rule marks `candidate` itself as registrable, so
            # the public suffix is everything after its leftmost label.
            return ".".join(last_i_elements[1:])
        if candidate in tlds or wildcard_candidate in tlds:
            return candidate

    print("Domain not in global list of TLDs %s" % (url,))
    return "null"
# Quick sanity check: one hand-written URL, then one URL taken from the data.
demo_tld = get_TLdomain("http://io.abcde.com", tlds)
print(demo_tld)
sample_url = wikiR.url[3]
print(sample_url)
sample_tld = get_TLdomain(sample_url, tlds)
print(sample_tld)
com http://instapedia.com/m/2004 com
# Extract the top level domain from the url for all entries of wikiR. This takes time.
# Only the 'url' column is needed, so map over that column directly instead of
# applying row-wise over the whole frame (axis=1 builds a Series object for
# every row just to read one field from it).
wikiR.loc[:,'tld'] = wikiR['url'].map(lambda u: get_TLdomain(u, tlds))
wikiR.head()
Domain not in global list of TLDs http://62.143.88.190/devalco/ Domain not in global list of TLDs 2014-07-28 10:19:00 Domain not in global list of TLDs 2014-07-31 17:36:00 Domain not in global list of TLDs 2014-08-01 01:43:00 Domain not in global list of TLDs http://62.143.88.190/devalco/ Domain not in global list of TLDs http://62.143.88.190/devalco/ Domain not in global list of TLDs 2014-07-28 22:32:00 Domain not in global list of TLDs 2014-07-31 17:32:00 Domain not in global list of TLDs http://www.visualnews.com./ Domain not in global list of TLDs 2014-07-10 10:18:00 Domain not in global list of TLDs 2014-07-31 09:47:00 Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips Domain not in global list of TLDs http://www.geni.com./people/Anna-Paquin/6000000005350378009 Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php Domain not in global list of TLDs http://www.ativanonline.net./ Domain not in global list of TLDs http://195.84.101.101/~goranl/shack/ant_deltaloop40/index.html Domain not in global list of TLDs http://cd8308.myfoscam.org:8080/evanvaughan_com/default.aspx Domain not in global list of TLDs http://cd8308.myfoscam.org:8080/evanvaughan_com/default.aspx?pageid=FD30F09484F44DE4 Domain not in global list of TLDs http://cd8308.myfoscam.org:8080/evanvaughan_com/default.aspx?postid=F6B7D3412CD04432 Domain not in global list of TLDs http://multistage.ssel.caltech.edu:8000/multistage/wiki/CamelCase Domain not in global list of TLDs http://rubygarage.org./ Domain not in global list of TLDs http://gps.nju.edu.cn:88/mediawiki/index.php/Compiler_Crafting Domain not in global list of TLDs http://173.8.135.113/ Domain not in global list of TLDs http://50.87.144.65/~rt/w/index.php?title=Dark_Frame Domain not in global list of TLDs 
http://blog.bluehost.com./blog/bluehost/clean-up-online-business-listings-with-yext-1664/ Domain not in global list of TLDs http://50.87.144.65/~rt/w/index.php?title=The_Floating_Point_Engine Domain not in global list of TLDs http://50.87.144.65/~rt/w/index.php?title=Dark_Frame Domain not in global list of TLDs http://216.119.148.216/update/deleting-online-predators-act/ Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips Domain not in global list of TLDs http://198.102.103.39/community/ Domain not in global list of TLDs http://198.102.103.39/community/index.php Domain not in global list of TLDs http://62.143.88.190/devalco/ Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php Domain not in global list of TLDs http://viper.infotech.monash.edu:4277/ Domain not in global list of TLDs http://www.visualnews.com./ Domain not in global list of TLDs http://74.220.215.218/~recollec/bleed/0518.htm Domain not in global list of TLDs http://173.8.135.113/ Domain not in global list of TLDs http://www.visualnews.com./ Domain not in global list of TLDs http://www.visualnews.com./ Domain not in global list of TLDs http://50.87.144.65/~rt/w/index.php?title=The_Floating_Point_Engine Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips Domain not in global list of TLDs http://www.visualnews.com./ Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips Domain not in global list of TLDs http://rubygarage.org./ Domain not in global list of TLDs 2014-07-30 21:17:00 Domain not in global list of TLDs http://173.8.135.113/ Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php Domain not in global list of TLDs http://multistage.ssel.caltech.edu:8000/multistage/wiki/InterMapTxt Domain not in global list of TLDs http://multistage.ssel.caltech.edu:8000/multistage/wiki/InterMapTxt Domain not in global 
list of TLDs http://rubygarage.org./ Domain not in global list of TLDs http://www.klonopinonline.org./ Domain not in global list of TLDs http://204.14.213.185/Laptops-Notebooks/SubCategory/ID-32?Category=223 Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php Domain not in global list of TLDs http://www.visualnews.com./ Domain not in global list of TLDs http://74.220.215.218/~recollec/bleed/0514.htm Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php Domain not in global list of TLDs 2014-07-30 21:16:00 Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips Domain not in global list of TLDs http://gps.nju.edu.cn:88/mediawiki/index.php/Compiler_Crafting Domain not in global list of TLDs 2014-07-29 16:49:00 Domain not in global list of TLDs http://www.scotxblog.com./ Domain not in global list of TLDs http://62.143.88.190/devalco/ Domain not in global list of TLDs http://62.143.88.190/devalco/ Domain not in global list of TLDs http://62.143.88.190/devalco/ Domain not in global list of TLDs http://144.76.109.211/redirect.php?dst= Domain not in global list of TLDs http://www.thegatewaypundit.com./ Domain not in global list of TLDs http://multistage.ssel.caltech.edu:8000/multistage/wiki/InterMapTxt Domain not in global list of TLDs http://www.visualnews.com./ Domain not in global list of TLDs http://look.gvsu.edu:8000/opc Domain not in global list of TLDs http://198.100.46.202/2009/09/ Domain not in global list of TLDs http://74.220.215.218/~recollec/bleed/0514.htm Domain not in global list of TLDs http://72.52.86.84/www.gentlespirit.net/index.php Domain not in global list of TLDs http://198.100.46.202/2009/09/ Domain not in global list of TLDs http://173.8.135.113/ Domain not in global list of TLDs http://www.russianseason.net./ Domain not in global list of TLDs http://74.220.215.218/~recollec/bleed/0531.htm Domain not in global list of TLDs 
http://62.143.88.190/devalco/ Domain not in global list of TLDs http://www.silverseek.com:8080/article/silver-sales-supply-slips Domain not in global list of TLDs http://www.buy-online-viagra.us./ Domain not in global list of TLDs http://www.visualnews.com./ Domain not in global list of TLDs http://viper.infotech.monash.edu:4277/ Domain not in global list of TLDs http://173.8.135.113/ Domain not in global list of TLDs http://195.84.101.101/~goranl/shack/ant_deltaloop40/index.html Domain not in global list of TLDs 2014-07-29 06:44:00 Domain not in global list of TLDs http://190.19.84.90:8080/pergamo/opac/cgi-bin/pgopac.cgi Domain not in global list of TLDs http://190.19.84.90:8080/pergamo/opac/cgi-bin/pgopac.cgi Domain not in global list of TLDs http://190.19.84.90:8080/pergamo/opac/cgi-bin/pgopac.cgi Domain not in global list of TLDs 2014-08-01 15:54:00 Domain not in global list of TLDs http://test2.mafiachat.net:8800/ Domain not in global list of TLDs http://www.xn--e1adkpj5f.xn--p1ai/%D0%B8%D0%B2%D0%B0%D0%BD-%D0%BF%D0%BE%D0%BB%D0%BE%D0%BD%D1%81%D0%BA%D0%B8%D0%B9-%D1%80%D0%B0%D0%B7%D0%B2%D0%BE%D0%B4-%D0%B4%D0%BB%D1%8F-%D0%BD%D0%BE%D0%B2%D0%B8%D1%87%D0%BA%D0%BE%D0%B2/ Domain not in global list of TLDs http://www.xn--80afgqph1c.xn--p1ai/games-for-xbox/1838-skachat-besplatno-igru-betmen-arkhem-origins-na-iksboks-360-batman-arkham-origins-xbox-360-2013-god-russkaya-licenzionnaya-versiya.html Domain not in global list of TLDs http://www.xn--80afgqph1c.xn--p1ai/computer-games/1713-skachat-besplatno-igru-pro-zombi-na-kompyuter-dead-island-2-dead-island-riptide-russkaya-versiya-repak.html Domain not in global list of TLDs http://xn--e1abgfitfjgcl0c.xn--p1ai/?page_id=44 Domain not in global list of TLDs http://xn--e1abgfitfjgcl0c.xn--p1ai/?page_id=44 Domain not in global list of TLDs http://xn--e1abgfitfjgcl0c.xn--p1ai/?page_id=44 Domain not in global list of TLDs http://xn--p1af1b.xn--p1ai/%D0%91%D0%B0%D1%88%D0%B0%D1%80_%D0%90%D1%81%D0%B0%D0%B4 Domain not in global list 
of TLDs http://www.xn----dtbikagememahdgab5aia4a3b3k.xn--p1ai/ Domain not in global list of TLDs http://www.xn--80afgqph1c.xn--p1ai/computer-games/1713-skachat-besplatno-igru-pro-zombi-na-kompyuter-dead-island-2-dead-island-riptide-russkaya-versiya-repak.html Domain not in global list of TLDs http://www.xn--80afgqph1c.xn--p1ai/computer-games/1804-skachat-besplatno-igru-ekspediciya-konkistadorov-na-kompyuter-expeditions-conquistador-2013-god-russkaya-versiya-repak.html Domain not in global list of TLDs http://xn----7sbiew6aadnema7p.xn--p1ai/sity_id.php?id=3 Domain not in global list of TLDs http://xn----7sbb5ahj4aiadq2m.xn--p1ai/guide/army/ta/t10.shtml Domain not in global list of TLDs http://xn----7sbiew6aadnema7p.xn--p1ai/sity_id.php?id=25
lan | wiki | url | WebTitle | time | tld | |
---|---|---|---|---|---|---|
0 | ab | ???????? | http://mail-archives.apache.org/mod_mbox/openo... | NaN | 2014-07-13 08:10:00 | org |
1 | ab | 1939 | http://rarplayer.appspot.com/wiki/1939 | 1939 – Wikipedia, wolna encyklopedia | 2014-08-02 06:30:00 | appspot.com |
2 | ab | 1994 | http://rarplayer.appspot.com/wiki/1994 | 1994 – Wikipedia, wolna encyklopedia | 2014-08-01 00:02:00 | appspot.com |
3 | ab | 2004 | http://instapedia.com/m/2004 | 2004 - iPhone/Mobile Wikipedia | 2014-07-13 16:18:00 | com |
4 | ab | 2007 | http://rarplayer.appspot.com/wiki/2007 | 2007 – Wikipedia, wolna encyklopedia | 2014-07-31 09:37:00 | appspot.com |
# Serialize the dataframe now, so the slow TLD extraction need not be repeated.
wikiR.to_pickle('/Users/gianlucademartini/wikiRall.pickle')
# Group by and count inlinks for each tld and wikipedia edition.
# count() yields one non-null count per remaining column for each (lan, tld).
tldCount=wikiR.groupby(['lan','tld']).count()
# Save as tab separated. reset_index() returns a new frame, so its result must
# be assigned; 'lan' and 'tld' then become ordinary columns, and index=False
# keeps the leftover integer index out of the file.
tldCount = tldCount.reset_index()
tldCount.to_csv('tldCount.tsv', sep='\t', index=False)