In [3]:
# For parsing the csv
import csv
import urllib2
import StringIO
import math

# For actual computations
import random as rand 
import pandas as pd

In 2010 Google released a page listing the top 1000 websites on the internet: https://web.archive.org/web/20130102235318/http://www.google.com/adplanner/static/top1000

The page has since been taken down, but mirrored copies of the data are still available; the cell below loads one from GitHub.

Let's take a look at it!

In [7]:
# Location of a mirrored CSV copy of the top-1000 list
# (per the file name, the April 2010 edition).
websites_url = 'https://raw.githubusercontent.com/ledeprogram/courses/master/platforms/anonymization/googletop1000april2010.csv'

# Fetch the CSV over HTTP and parse it into a DataFrame. The response is
# closed explicitly once parsing finishes (or fails) so the connection is not
# leaked; try/finally is used because Python 2 urllib2 responses do not
# support the `with` statement.
websites_response = urllib2.urlopen(websites_url)
try:
    websites = pd.read_csv(websites_response)
finally:
    websites_response.close()
In [8]:
# Display the loaded table; the notebook renders a truncated preview of it.
websites
Out[8]:
Rank Unique Visitors (users) Page Views Reach Site Category Has Advertising
0 1 540000000 570000000000 35.2% facebook.com Social Networks Yes
1 2 490000000 70000000000 31.8% yahoo.com Web Portals Yes
2 3 370000000 39000000000 24.1% live.com Search Engines Yes
3 4 310000000 7900000000 20% wikipedia.org Dictionaries & Encyclopedias No
4 5 280000000 11000000000 18.1% msn.com Web Portals Yes
5 6 230000000 3300000000 14.8% microsoft.com Software Yes
6 7 230000000 4400000000 14.7% blogspot.com Blogging Resources & Services Yes
7 8 230000000 27000000000 15% baidu.com Web Portals Yes
8 9 170000000 25000000000 11.1% qq.com Email & Messaging Yes
9 10 140000000 2100000000 9.2% mozilla.com Internet Clients & Browsers No
10 11 130000000 3600000000 8.4% sina.com.cn Web Portals Yes
11 12 120000000 1200000000 7.7% wordpress.com Blogging Resources & Services Yes
12 13 110000000 2700000000 7.% bing.com Search Engines Yes
13 14 110000000 1000000000 6.9% adobe.com Programming Yes
14 15 98000000 2700000000 6.3% 163.com Web Portals Yes
15 16 98000000 10000000000 6.3% taobao.com Shopping No
16 17 97000000 1400000000 6.3% soso.com Entertainment No
17 18 96000000 5400000000 6.2% twitter.com Email & Messaging No
18 19 89000000 1700000000 5.8% youku.com Video Clips & Movie Downloads Yes
19 20 88000000 1700000000 5.7% ask.com Search Engines Yes
20 21 82000000 1900000000 5.3% sohu.com Web Portals Yes
21 22 74000000 3300000000 4.8% amazon.com Shopping Yes
22 23 74000000 490000000 4.8% windows.com Windows No
23 24 74000000 9400000000 4.8% ebay.com Auctions Yes
24 25 72000000 27000000000 4.7% yahoo.co.jp Web Portals Yes
25 26 72000000 27000000000 4.7% myspace.com Social Networks Yes
26 27 72000000 960000000 4.7% apple.com Mac Yes
27 28 66000000 1100000000 4.3% tudou.com Photo & Video Sharing No
28 29 60000000 2000000000 3.9% conduit.com Advertising & Marketing No
29 30 60000000 1100000000 3.9% hotmail.com Email & Messaging Yes
30 31 55000000 1800000000 3.6% flickr.com Photo & Video Sharing Yes
31 32 55000000 1100000000 3.6% photobucket.com Photo & Video Sharing Yes
32 33 55000000 590000000 3.6% tianya.cn Online Communities Yes
33 34 55000000 710000000 3.6% about.com How-To & Expert Content Yes
34 35 55000000 490000000 3.6% cnet.com Technology News Yes
35 36 50000000 1400000000 3.3% hao123.com Online Directories No
36 37 50000000 270000000 3.2% iefxz.com NaN No
37 38 50000000 870000000 3.2% xunlei.com TV Programs No
38 39 49000000 1900000000 3.2% paypal.com Merchant Services & Payment Systems Yes
39 40 46000000 800000000 3% rapidshare.com File Sharing & Hosting No
40 41 46000000 3000000000 3% go.com Web Portals Yes
41 42 45000000 2400000000 2.9% fc2.com Blogging Resources & Services Yes
42 43 45000000 2500000000 2.9% bbc.co.uk News & Current Events Yes
43 44 45000000 1400000000 2.9% imdb.com Movies Yes
44 45 45000000 5300000000 2.9% orkut.com Social Networks Yes
45 46 45000000 540000000 2.9% sogou.com Web Portals No
46 47 42000000 450000000 2.7% 56.com Multimedia Content No
47 48 42000000 4400000000 2.7% aol.com Web Portals Yes
48 49 42000000 14000000000 2.7% craigslist.org Classifieds No
49 50 41000000 4000000000 2.6% rakuten.co.jp Shopping Portals & Search Engines Yes
50 51 41000000 310000000 2.7% imageshack.us File Sharing & Hosting Yes
51 52 41000000 410000000 2.7% ku6.com Multimedia Content Yes
52 53 41000000 1700000000 2.7% blogger.com Blogging Resources & Services Yes
53 54 41000000 810000000 2.6% goo.ne.jp Web Services Yes
54 55 41000000 860000000 2.7% ifeng.com News & Current Events Yes
55 56 38000000 1700000000 2.5% linkedin.com Social Networks Yes
56 57 38000000 7000000000 2.4% yandex.ru Search Engines Yes
57 58 37000000 10000000000 2.4% mail.ru Email & Messaging Yes
58 59 35000000 280000000 2.2% partypoker.com Cards & Casino Games No
59 60 34000000 880000000 2.2% megaupload.com File Sharing & Hosting No
... ... ... ... ... ... ...

1001 rows × 7 columns