I have s3cmd
and boto
set up in PiCloud's /rdhyee/Working_with_Open_Data
environment.
sudo pip install cloud
picloud setup
I used https://github.com/s3tools/s3cmd
how I installed on my notebook:
cd ~/C/src/
git clone git://github.com/s3tools/s3cmd.git
cd s3cmd/
python setup.py install
s3cmd --configure
# import our library
import cloud
# this key, secret access to aws-publicdatasets only -- created for WwOD 13 student usage
KEY = 'AKIAJH2FD7572FCTVSSQ'
SECRET = '8dVCRIWhboKMiJxgs1exIh6eMCG13B+gp/bf5bsl'
# s3cmd installed in custom PiCloud environment -- and maybe in your local environment too
!s3cmd ls s3://aws-publicdatasets/common-crawl/parse-output/valid_segments.txt
2012-11-09 11:28 2478 s3://aws-publicdatasets/common-crawl/parse-output/valid_segments.txt
# http://boto.s3.amazonaws.com/s3_tut.html
import boto
from boto.s3.connection import S3Connection
from itertools import islice
conn = S3Connection(KEY,SECRET)
bucket = conn.get_bucket('aws-publicdatasets')
for key in islice(bucket.list(prefix="common-crawl/parse-output/", delimiter="/"),None):
print key.name.encode('utf-8')
common-crawl/parse-output/checkpoint_staging_$folder$ common-crawl/parse-output/checkpoints_$folder$ common-crawl/parse-output/segment_$folder$ common-crawl/parse-output/valid_segments.txt common-crawl/parse-output/valid_segments2_$folder$ common-crawl/parse-output/valid_segments_$folder$ common-crawl/parse-output/checkpoint_staging/ common-crawl/parse-output/checkpoints/ common-crawl/parse-output/segment/ common-crawl/parse-output/valid_segments2/
# get valid_segments
# https://commoncrawl.atlassian.net/wiki/display/CRWL/About+the+Data+Set
import boto
from boto.s3.connection import S3Connection
conn = S3Connection(KEY,SECRET)
bucket = conn.get_bucket('aws-publicdatasets')
k = bucket.get_key("common-crawl/parse-output/valid_segments.txt")
s = k.get_contents_as_string()
# how many valid segments in current crawl
len(s.split("\n"))
178
valid_segments = s.split("\n")
# get sample valid segment
valid_segments[0]
'1346823845675'
# what to do with a valid segment instance?
# https://groups.google.com/forum/#!msg/common-crawl/QYTmnttZZyo/NPiXvK8ZeiMJ
# "s3n://aws-publicdatasets/common-crawl/parse-output/segment/"+segmentId+"/*.arc.gz";
!s3cmd ls s3://aws-publicdatasets/common-crawl/parse-output/segment/1346823845675
DIR s3://aws-publicdatasets/common-crawl/parse-output/segment/1346823845675/ 2012-09-05 19:18 0 s3://aws-publicdatasets/common-crawl/parse-output/segment/1346823845675_$folder$
from itertools import islice
conn = S3Connection(KEY,SECRET)
bucket = conn.get_bucket('aws-publicdatasets')
for key in islice(bucket.list(prefix="common-crawl/parse-output/segment/1346823845675/", delimiter="/"),10):
print key.name.encode('utf-8')
common-crawl/parse-output/segment/1346823845675/1346864466526_10.arc.gz common-crawl/parse-output/segment/1346823845675/1346864469604_0.arc.gz common-crawl/parse-output/segment/1346823845675/1346864469638_1.arc.gz common-crawl/parse-output/segment/1346823845675/1346864471290_4.arc.gz common-crawl/parse-output/segment/1346823845675/1346864477152_29.arc.gz common-crawl/parse-output/segment/1346823845675/1346864479613_6.arc.gz common-crawl/parse-output/segment/1346823845675/1346864480261_2.arc.gz common-crawl/parse-output/segment/1346823845675/1346864480936_5.arc.gz common-crawl/parse-output/segment/1346823845675/1346864484063_39.arc.gz common-crawl/parse-output/segment/1346823845675/1346864484163_3.arc.gz
# WARNING -- this might take a bit of time to run
%time all_files = list(islice(bucket.list(prefix="common-crawl/parse-output/segment/1346823845675/", delimiter="/"),None))
CPU times: user 3.77 s, sys: 0.24 s, total: 4.01 s Wall time: 30.45 s
len(all_files), all_files[0]
(20659, <Key: aws-publicdatasets,common-crawl/parse-output/segment/1346823845675/1346864466526_10.arc.gz>)
!s3cmd ls s3://aws-publicdatasets/common-crawl/parse-output/segment/1346823845675/1346864466526_10.arc.gz
2012-09-05 17:17 100011998 s3://aws-publicdatasets/common-crawl/parse-output/segment/1346823845675/1346864466526_10.arc.gz
file0 = all_files[0]
# http://boto.readthedocs.org/en/latest/ref/s3.html#module-boto.s3.key
type(file0), file0.size, file0.content_type
(boto.s3.key.Key, 100011998, 'application/octet-stream')
sum([f.size for f in all_files])
289226018
# estimate of size
len(valid_segments)*__builtin__.sum([f.size for f in all_files])
228638231658916L
all_files[-10:]
[<Key: aws-publicdatasets,common-crawl/parse-output/segment/1346823845675/textData-04371>, <Key: aws-publicdatasets,common-crawl/parse-output/segment/1346823845675/textData-04372>, <Key: aws-publicdatasets,common-crawl/parse-output/segment/1346823845675/textData-04373>, <Key: aws-publicdatasets,common-crawl/parse-output/segment/1346823845675/textData-04374>, <Key: aws-publicdatasets,common-crawl/parse-output/segment/1346823845675/textData-04375>, <Key: aws-publicdatasets,common-crawl/parse-output/segment/1346823845675/textData-04376>, <Key: aws-publicdatasets,common-crawl/parse-output/segment/1346823845675/textData-04377>, <Key: aws-publicdatasets,common-crawl/parse-output/segment/1346823845675/textData-04378>, <Key: aws-publicdatasets,common-crawl/parse-output/segment/1346823845675/textData-04379>, <Key: aws-publicdatasets,common-crawl/parse-output/segment/1346823845675/textData-04380>]
types of files
Does every file belong to one of these classes?
from collections import Counter
def cc_file_type(path):
    """Classify a Common Crawl segment file by its file name.

    path -- an S3 key name such as
        'common-crawl/parse-output/segment/1346823845675/1346864466526_10.arc.gz'

    Returns one of 'arc.gz', 'textData', 'metadata', 'success', or 'other'.
    Unrecognized paths are printed so new file types get noticed.
    """
    fname = path.split("/")[-1]
    # endswith/startswith instead of manual slice comparisons: clearer and
    # immune to off-by-one errors in the slice lengths.
    if fname.endswith('.arc.gz'):
        return 'arc.gz'
    elif fname.startswith('textData-'):
        return 'textData'
    elif fname.startswith('metadata-'):
        return 'metadata'
    elif fname == '_SUCCESS':
        return 'success'
    else:
        # flag anything outside the four known classes for inspection
        print(path)
        return 'other'
Counter([cc_file_type(f.name) for f in all_files])
Counter({'arc.gz': 11904, 'textData': 4377, 'metadata': 4377, 'success': 1})
all_files[0]
<Key: aws-publicdatasets,common-crawl/parse-output/segment/1346823845675/1346864466526_10.arc.gz>
import requests
import json
s = requests.get("http://urlsearch.commoncrawl.org/download?q=edu.berkeley.ischool")
data = [json.loads(row) for row in s.content.split("\n") if row]
u = data[0]
# http://urlsearch.commoncrawl.org/page/1346876860493/1346901517112/422/320051/596
u
{u'arcFileDate': 1346901517112L, u'arcFileOffset': 320051, u'arcFileParition': 422, u'arcSourceSegmentId': 1346876860493L, u'compressedSize': 596, u'url': u'http://people.ischool.berkeley.edu/~rosario/papers.html'}
urlsearch_url = "http://urlsearch.commoncrawl.org/page/{arcSourceSegmentId}/{arcFileDate}/{arcFileParition}/{arcFileOffset}/{compressedSize}".format(**u)
urlsearch_url
'http://urlsearch.commoncrawl.org/page/1346876860493/1346901517112/422/320051/596'
!s3cmd ls s3://aws-publicdatasets/common-crawl/parse-output/segment/1346876860493/1346901517112_422.arc.gz
2012-09-06 04:03 100067216 s3://aws-publicdatasets/common-crawl/parse-output/segment/1346876860493/1346901517112_422.arc.gz
# how to grab 320051/596 out of that file?
# hints at https://github.com/trivio/common_crawl_index