from settings import (HACKFSM_ID, HACKFSM_KEY, HACKFSM_BASEURL)
from itertools import islice
import logging
import requests
import json
import urllib
import urlparse
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
# Log paging anomalies from the API crawl to a file; WARNING level keeps it small.
logging.basicConfig(filename='Experiment_20140325_HackFSM.log',level=logging.WARNING)
# Root logger; used by FSM._call_api to record short-page warnings.
logger=logging.getLogger()
def query(q, fl="id"):
    """Run a single HackFSM Solr query and return the parsed JSON response.

    Parameters
    ----------
    q : str
        Solr query string, e.g. "fsmTitle:Savio".
    fl : str
        Comma-separated list of fields to return for each document.

    Returns
    -------
    dict
        The full JSON response (responseHeader + response).

    Raises
    ------
    requests.HTTPError
        If the API returns an HTTP error status.
    """
    # Let requests encode the query string instead of hand-rolling it with
    # urllib.urlencode (equivalent URL, and portable to Python 3).
    r = requests.get(HACKFSM_BASEURL,
                     params={'q': q,
                             'fl': fl,
                             'wt': 'json',
                             'app_id': HACKFSM_ID,
                             'app_key': HACKFSM_KEY})
    # Fail loudly on HTTP errors rather than trying to JSON-decode an error page.
    r.raise_for_status()
    return r.json()
# Smoke-test the API: ids of documents whose title mentions Savio.
result = query(q="fsmTitle:Savio")['response']
result
# try again
# http://stackoverflow.com/a/5724453/7782
# http://excess.org/article/2013/02/itergen1/
class my_g(object):
    """Toy iterator yielding 0..max_count-1 that also knows its length.

    Demonstrates an iterable with __len__ (a plain generator has no length).
    Uses an integer cursor instead of popping from a materialized list:
    pop(0) on a list is O(n) per item, and range() is not a list on
    Python 3, so the original list-popping approach would break there.
    """

    def __init__(self, max_count):
        self._len = max_count     # total number of values ever yielded
        self._cursor = 0          # next value to yield

    def __iter__(self):
        return self

    def __len__(self):
        # Reports the full count even after (partial) consumption,
        # matching the original behavior.
        return self._len

    def next(self):
        if self._cursor >= self._len:
            raise StopIteration
        value = self._cursor
        self._cursor += 1
        return value

    __next__ = next  # Python 3 iterator protocol
# Demo: the object reports its total length even before iteration.
g=my_g(10)
print len(g)
list(g)
class FSM(object):
    """Iterator over every document matching a HackFSM Solr query.

    Pages through the API `rows` documents at a time while presenting the
    whole result set as one iterable. len() reports the total number of
    matching documents (Solr numFound), not the page size.
    """

    def __init__(self, q, fl="id", start=0, rows=30,
                 base_url=HACKFSM_BASEURL, app_id=HACKFSM_ID, app_key=HACKFSM_KEY):
        self.q = q
        self.fl = fl
        self.start = start
        self.rows = rows
        self.base_url = base_url
        self.app_id = app_id
        self.app_key = app_key
        # numfound is unknown until the first page arrives; initialize it so
        # _call_api's end-of-results check can't hit an AttributeError when
        # the very first page comes back short.
        self.numfound = None
        self.cursor = start
        # fetch the first page eagerly to learn the total result count
        result = self._get_page(q, fl, self.cursor, self.rows)
        self.numfound = result['response']['numFound']

    def _check_status(self, result):
        """Throw FSMException if the API reports a non-zero status."""
        if result['responseHeader']['status'] != 0:
            raise FSMException("status: " + str(result['responseHeader']['status']))

    def _get_page(self, q, fl, start, rows):
        """Fetch one page and cache its docs in self.page / self.page_len."""
        result = self._call_api(q, fl, start, rows)
        # update current page
        self.page = result['response']['docs']
        self.page_len = len(self.page)
        return result

    def _call_api(self, q, fl, start, rows):
        """Issue one API request; warn if a short page isn't the final page."""
        url = "{base_url}?".format(base_url=self.base_url) + \
            urllib.urlencode({'q': q,
                              'fl': fl,
                              'wt': 'json',
                              'start': start,
                              # BUG FIX: Solr's paging parameter is 'rows', not
                              # 'row' — with the misspelling the server ignored
                              # it and used its default page size.
                              'rows': rows,
                              'app_id': self.app_id,
                              'app_key': self.app_key})
        result = requests.get(url).json()
        self._check_status(result)
        # A page with fewer records than requested should only be the last one.
        if len(result['response']['docs']) < rows:
            at_end = (self.numfound is not None and
                      start + len(result['response']['docs']) == self.numfound)
            if not at_end:
                logger.warning("url:{url}, numfound:{numfound}, start+len{start_plus_len}".format(
                    url=url,
                    numfound=self.numfound,
                    start_plus_len=start + len(result['response']['docs'])))
        return result

    def __iter__(self):
        return self

    def __len__(self):
        # total matching documents, not the number fetched so far
        return self.numfound

    def next(self):
        if not self.page:
            # current page exhausted: advance past it and fetch the next page
            self.cursor += self.page_len
            self._get_page(self.q, self.fl, self.cursor, self.rows)
            if self.page_len == 0:
                raise StopIteration
        return self.page.pop(0)

    __next__ = next  # Python 3 iterator protocol
# Documents WITHOUT a TEI URL (the negated range query selects docs missing
# the field) — i.e. the image records — with a few display fields.
fsm = FSM("-fsmTeiUrl:[* TO *]", fl="id,fsmTitle,fsmImageUrl,fsmDateCreated")
len(fsm)
# islice(fsm, None) drains the iterator completely.
results = list(islice(fsm,None))
results[:10]
df = DataFrame(results)
len(df)
df.fsmImageUrl
from IPython.display import HTML
from jinja2 import Template
# Inline CSS for the thumbnail gallery (currently empty).
CSS = """
"""
# Jinja2 template for rendering result items as HTML (loop body not yet filled in).
IMAGES_TEMPLATE = CSS + """
{% for item in items %}
{% endfor %}
"""
template = Template(IMAGES_TEMPLATE)
# Render the first ten results inline in the notebook.
HTML(template.render(items=results[:10]))
# Count each record type via negated range queries (field missing => selected).
# TEI-encoded docs
len(FSM("-fsmImageUrl:[* TO *]"))
# images
len(FSM("-fsmTeiUrl:[* TO *]", fl="id,fsmImageUrl"))
from lxml.html import parse, fromstring
from collections import OrderedDict
# Scrape the HackFSM API documentation page to recover the complete field list.
api_docs_url = "http://digitalhumanities.berkeley.edu/hackfsm/api/detail"
doc = fromstring(requests.get(api_docs_url).content)
# First table on the page lists the API fields, one per row.
rows = doc.xpath('//div[@id="content"]/article/div/div/div/table[1]//tr')
headers = [col.text_content().strip() for col in rows[0].findall('td')]
headers
# Remaining rows: (field name, description) pairs, in page order.
fields = [[col.text_content().strip() for col in row.findall('td')]
          for row in rows[1:]]
fsmfields = OrderedDict(fields)
fsmfields.keys()
# Re-crawl everything, requesting every field scraped from the docs page.
fsm = FSM(q="*",fl=",".join(fsmfields.keys()))
len(fsm)
df = DataFrame(list(fsm))
len(df)
df.head()
# TEI URIs
# Each non-null fsmTeiUrl cell holds a list; take its first element.
len(list(df[~df.fsmTeiUrl.isnull()].fsmTeiUrl.apply(lambda a: a[0])))
# null dates
len(df[df.fsmDateCreated.isnull()])
# non-null image URLs
len(df[~df.fsmImageUrl.isnull()])
df[~df.fsmImageUrl.isnull()].id
# distribution of number of image URLs
df[~df.fsmImageUrl.isnull()].fsmImageUrl.apply(len).value_counts()
# let's crawl for images
results_images = list(FSM("-fsmTeiUrl:[* TO *]", fl=",".join(fsmfields.keys())))
len(results_images)
df_images=DataFrame(results_images)
# Image records that nevertheless lack an image URL, if any.
df_images[df_images.fsmImageUrl.isnull()]
# would be interesting to see sizes of images and whether we can get at thumbnails
df_images.fsmImageUrl
# Sanity check: netloc extracts the hostname from a URL.
urlparse.urlparse("http://digitalassets.lib.berkeley.edu/fsm/ucb/images/brk00040569b_c.jpg").netloc
df_images.fsmImageUrl
# calculate hostnames for all image urls
# might be possible to do this all with pandas
netlocs = list(df_images.fsmImageUrl.dropna().apply(lambda urls: set([urlparse.urlparse(url).netloc for url in urls])))
# Union of all per-record hostname sets (reduce is a builtin in Python 2).
reduce(lambda x,y: x | y, netlocs, set())
def len2(x):
    """Return len(x), or NaN when x has no length.

    Lets len() be mapped over a Series whose missing cells are NaN floats
    (floats have no len(), so those rows become NaN instead of raising).
    """
    try:
        return len(x)
    except TypeError:  # narrowed from a bare except: only "no len()" is expected
        return np.nan
# Boolean mask: records carrying exactly 3 image URLs.
df_images.fsmImageUrl.apply(len2) == 3
df_images[df_images.fsmImageUrl.apply(len2) == 3].head()
# .ix (deprecated in later pandas) — URLs of row 100 among 4-image records.
df_images[df_images.fsmImageUrl.apply(len2) == 4].ix[100].fsmImageUrl
IMAGES_TEMPLATE = """
{% for item in items %}
{% endfor %}
"""
template = Template(IMAGES_TEMPLATE)
HTML(template.render(items=df_images[df_images.fsmImageUrl.apply(len2) == 4].ix[100].fsmImageUrl ))
# Number of records with at least one creation date.
len(df[~df.fsmDateCreated.isnull()])
# Mask of records whose date list has exactly two entries
# (presumably a start/end range — TODO confirm against the API docs).
s = df[~df.fsmDateCreated.isnull()].fsmDateCreated.apply(len)==2 #.astype('datetime64[ns]')
def first(x):
    """Return x[0], or NaN when x is empty or not indexable (e.g. a NaN cell)."""
    try:
        return x[0]
    except (TypeError, IndexError, KeyError):  # narrowed from a bare except
        return np.nan
# Parse the first date of each record; coerce=True turns unparseable strings
# into NaT (this keyword became errors='coerce' in later pandas — TODO confirm
# against the installed version).
df['calc_date'] = pd.to_datetime(df.fsmDateCreated.apply(first), coerce=True)
# sort_index(by=...) is the old spelling of sort_values(by=...).
df[~df.calc_date.isnull()].sort_index(by='calc_date').calc_date
pd.to_datetime(df.fsmDateCreated.dropna().apply(lambda s:s[0]).astype('str'), coerce=True).dropna()
# http://stackoverflow.com/questions/17690738/in-pandas-how-do-i-convert-a-string-of-date-strings-to-datetime-objects-and-put
# Sanity check of coerce behavior on a mix of good and junk date strings.
date_stngs = ('2008-12-20','2008-12-21','2008-12-22','2008-12-23','Nov. 9, 1964', 'junk')
pd.to_datetime(pd.Series(date_stngs),coerce=True)
def f(x):
    """Return set(x), or an empty set when x is not iterable (e.g. a NaN cell)."""
    try:
        return set(x)
    except TypeError:  # narrowed from a bare except: only non-iterables expected
        return set()
# Union of all resource types across records (reduce is a builtin in Python 2).
reduce(lambda x,y: x | y, df.fsmTypeOfResource.apply(f), set())
#related id
len(df.fsmRelatedIdentifier.dropna())
df.fsmTeiUrl.dropna()