from settings import (HACKFSM_ID, HACKFSM_KEY, HACKFSM_BASEURL) from itertools import islice import logging import requests import json import urllib import urlparse from pandas import DataFrame, Series import pandas as pd import numpy as np logging.basicConfig(filename='Experiment_20140325_HackFSM.log',level=logging.WARNING) logger=logging.getLogger() def query(q, fl="id"): url = "{base_url}?".format(base_url=HACKFSM_BASEURL) + \ urllib.urlencode({'q':q, 'fl':fl, 'wt':'json', 'app_id':HACKFSM_ID, 'app_key':HACKFSM_KEY}) r = requests.get(url) return r.json() result = query(q="fsmTitle:Savio")['response'] result # try again # http://stackoverflow.com/a/5724453/7782 # http://excess.org/article/2013/02/itergen1/ class my_g(object): def __init__(self,max_count): self._remaining = range(max_count) self._len = max_count def __iter__(self): return self def __len__(self): return self._len def next(self): if not self._remaining: raise StopIteration return self._remaining.pop(0) g=my_g(10) print len(g) list(g) class FSM(object): def __init__(self, q, fl="id", start=0, rows=30, base_url=HACKFSM_BASEURL, app_id=HACKFSM_ID, app_key=HACKFSM_KEY): self.q = q self.fl = fl self.start = start self.rows = rows self.base_url = base_url self.app_id = app_id self.app_key = app_key # get first page and numfound self.cursor = start # get the first page result = self._get_page(q, fl, self.cursor, self.rows) self.numfound = result['response']['numFound'] def _check_status(self,result): """throw exception if non-zero status""" if result['responseHeader']['status'] != 0: raise FSMException("status: " + str(result['responseHeader']['status'])) def _get_page(self, q, fl, start, rows): result = self._call_api(q, fl, start, rows) # update current page self.page = result['response']['docs'] self.page_len = len(self.page) return result def _call_api(self, q, fl, start, rows): url = "{base_url}?".format(base_url=self.base_url) + \ urllib.urlencode({'q':q, 'fl':fl, 'wt':'json', 'start':start, 
'row':rows, 'app_id':self.app_id, 'app_key':self.app_key}) result = requests.get(url).json() self._check_status(result) # check whether we're getting fewer records than expected if len(result['response']['docs']) < rows: # are we at the end of the results if start + len(result['response']['docs']) != self.numfound: logger.warning("url:{url}, numfound:{numfound}, start+len{start_plus_len}".format(url=url, numfound=self.numfound, start_plus_len=start + len(result['response']['docs']))) return result def __iter__(self): return self def __len__(self): return self.numfound def next(self): if not self.page: # retrieve next page and check whether there's anything left self.cursor += self.page_len result = self._get_page(self.q, self.fl, self.cursor, self.rows) if self.page_len == 0: raise StopIteration return self.page.pop(0) fsm = FSM("-fsmTeiUrl:[* TO *]", fl="id,fsmTitle,fsmImageUrl,fsmDateCreated") len(fsm) results = list(islice(fsm,None)) results[:10] df = DataFrame(results) len(df) df.fsmImageUrl from IPython.display import HTML from jinja2 import Template CSS = """ """ IMAGES_TEMPLATE = CSS + """
{% for item in items %}{% endfor %}
""" template = Template(IMAGES_TEMPLATE) HTML(template.render(items=results[:10])) # TEI-encoded docs len(FSM("-fsmImageUrl:[* TO *]")) # images len(FSM("-fsmTeiUrl:[* TO *]", fl="id,fsmImageUrl")) from lxml.html import parse, fromstring from collections import OrderedDict api_docs_url = "http://digitalhumanities.berkeley.edu/hackfsm/api/detail" r = requests.get(api_docs_url).content doc = fromstring(r) rows = doc.xpath('//div[@id="content"]/article/div/div/div/table[1]//tr') headers = [col.text_content().strip() for col in rows[0].findall('td')] headers fields = [] for row in rows[1:]: field = [col.text_content().strip() for col in row.findall('td')] fields.append(field) fsmfields = OrderedDict(fields) fsmfields.keys() fsm = FSM(q="*",fl=",".join(fsmfields.keys())) len(fsm) df = DataFrame(list(fsm)) len(df) df.head() # TEI URIs len(list(df[~df.fsmTeiUrl.isnull()].fsmTeiUrl.apply(lambda a: a[0]))) # null dates len(df[df.fsmDateCreated.isnull()]) # non-null image URLs len(df[~df.fsmImageUrl.isnull()]) df[~df.fsmImageUrl.isnull()].id # distribution of number of image URLs df[~df.fsmImageUrl.isnull()].fsmImageUrl.apply(len).value_counts() # let's crawl for images results_images = list(FSM("-fsmTeiUrl:[* TO *]", fl=",".join(fsmfields.keys()))) len(results_images) df_images=DataFrame(results_images) df_images[df_images.fsmImageUrl.isnull()] # would be interesting to see sizes of images and whether we can get at thumbnails df_images.fsmImageUrl urlparse.urlparse("http://digitalassets.lib.berkeley.edu/fsm/ucb/images/brk00040569b_c.jpg").netloc df_images.fsmImageUrl # calculate hostnames for all image urls # might be possible to do this all with pandas netlocs = list(df_images.fsmImageUrl.dropna().apply(lambda urls: set([urlparse.urlparse(url).netloc for url in urls]))) reduce(lambda x,y: x | y, netlocs, set()) def len2(x): try: return len(x) except: return np.nan df_images.fsmImageUrl.apply(len2) == 3 df_images[df_images.fsmImageUrl.apply(len2) == 3].head() 
# Inspect one record that carries exactly 4 image URLs.
# .ix was deprecated in pandas 0.20 and removed in 1.0; with the default
# integer index, .ix[100] was label-based, so .loc[100] is the equivalent.
df_images[df_images.fsmImageUrl.apply(len2) == 4].loc[100].fsmImageUrl

IMAGES_TEMPLATE = """
{% for item in items %}{% endfor %}
"""

template = Template(IMAGES_TEMPLATE)
HTML(template.render(
    items=df_images[df_images.fsmImageUrl.apply(len2) == 4].loc[100].fsmImageUrl))

len(df[~df.fsmDateCreated.isnull()])
s = df[~df.fsmDateCreated.isnull()].fsmDateCreated.apply(len) == 2  # .astype('datetime64[ns]')


def first(x):
    """First element of x, or NaN when x is empty or not indexable (NaN cells)."""
    try:
        return x[0]
    except (TypeError, IndexError, KeyError):
        # narrowed from a bare except:, which also swallowed real bugs
        return np.nan


# coerce=True was removed from pd.to_datetime; errors='coerce' is the
# supported spelling (unparseable values become NaT instead of raising).
df['calc_date'] = pd.to_datetime(df.fsmDateCreated.apply(first), errors='coerce')

# sort_index(by=...) was removed; sort_values is the replacement.
df[~df.calc_date.isnull()].sort_values('calc_date').calc_date

pd.to_datetime(df.fsmDateCreated.dropna().apply(lambda s: s[0]).astype('str'),
               errors='coerce').dropna()

# http://stackoverflow.com/questions/17690738/in-pandas-how-do-i-convert-a-string-of-date-strings-to-datetime-objects-and-put
date_stngs = ('2008-12-20', '2008-12-21', '2008-12-22', '2008-12-23',
              'Nov. 9, 1964', 'junk')
pd.to_datetime(pd.Series(date_stngs), errors='coerce')


def f(x):
    """set(x), or the empty set for non-iterable cells (NaN placeholders)."""
    try:
        return set(x)
    except TypeError:
        return set()


# union of all resource types present in the collection
reduce(lambda x, y: x | y, df.fsmTypeOfResource.apply(f), set())

# related id
len(df.fsmRelatedIdentifier.dropna())
df.fsmTeiUrl.dropna()