HackFSM
Open question: how does the HackFSM API relate to other public APIs built on Solr?
Documentation:
from settings import (HACKFSM_ID, HACKFSM_KEY, HACKFSM_BASEURL)
from itertools import islice
import logging
import requests
import json
import urllib
import urlparse
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
# Route WARNING-and-above records to the experiment's log file and keep a
# handle on the root logger for the code below.
logging.basicConfig(
    level=logging.WARNING,
    filename='Experiment_20140325_HackFSM.log',
)
logger = logging.getLogger()
def query(q, fl="id"):
    """Run one Solr query against the HackFSM API and return the decoded JSON.

    No paging parameters are sent, so only the server's first page of
    matching documents is returned.

    Args:
        q: Solr query string, e.g. ``"fsmTitle:Savio"``.
        fl: Comma-separated list of the fields to return per document.

    Returns:
        The JSON response decoded into Python objects.
    """
    params = {
        'q': q,
        'fl': fl,
        'wt': 'json',
        'app_id': HACKFSM_ID,
        'app_key': HACKFSM_KEY,
    }
    # Let requests build and percent-encode the query string instead of
    # concatenating urllib.urlencode() output by hand; this behaves the same
    # on Python 2 and Python 3.
    r = requests.get(HACKFSM_BASEURL, params=params)
    return r.json()
result = query(q="fsmTitle:Savio")['response']
result
{u'docs': [{u'id': u'ark:/13030/ft2f59n853'}, {u'id': u'access143'}, {u'id': u'ark:/13030/tf2q2n99d3'}, {u'id': u'ark:/13030/tf3p3003k7'}, {u'id': u'ark:/13030/tf5m3nb15b'}, {u'id': u'ark:/13030/tf267n996q'}, {u'id': u'access326'}, {u'id': u'access327'}, {u'id': u'access328'}, {u'id': u'access329'}, {u'id': u'access330'}, {u'id': u'access331'}, {u'id': u'access332'}, {u'id': u'access333'}, {u'id': u'access334'}, {u'id': u'access335'}, {u'id': u'access339'}, {u'id': u'access340'}, {u'id': u'access341'}, {u'id': u'access343'}, {u'id': u'access344'}, {u'id': u'access345'}, {u'id': u'access346'}, {u'id': u'access347'}, {u'id': u'access348'}, {u'id': u'access365'}, {u'id': u'access366'}, {u'id': u'access367'}, {u'id': u'access369'}, {u'id': u'access370'}], u'numFound': 124, u'start': 0}
# try again
# http://stackoverflow.com/a/5724453/7782
# http://excess.org/article/2013/02/itergen1/
class my_g(object):
    """Toy iterator that yields 0 .. max_count-1 and also supports ``len()``.

    ``len()`` always reports ``max_count``, even after items have been
    consumed.
    """

    def __init__(self, max_count):
        # Next value to hand out; a counter avoids materialising a list and
        # the O(n) cost of popping from its front.
        self._next_value = 0
        self._len = max_count

    def __iter__(self):
        return self

    def __len__(self):
        return self._len

    def next(self):
        """Return the next integer, raising StopIteration when exhausted."""
        if self._next_value >= self._len:
            raise StopIteration
        value = self._next_value
        self._next_value += 1
        return value

    # Python 3 spelling of the iterator protocol; ``next`` is the Python 2 one.
    __next__ = next
g = my_g(10)
# Parenthesised so the line parses under both Python 2 and Python 3.
print(len(g))
list(g)
10
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
class FSMException(Exception):
    """Raised when the HackFSM API reports a non-zero Solr status code."""


class FSM(object):
    """Iterator over the documents matching a HackFSM (Solr) query.

    The first page is fetched when the object is created; further pages are
    fetched lazily as the iterator is consumed.  ``len()`` returns the
    ``numFound`` value reported by the server for the first request.

    Args:
        q: Solr query string.
        fl: Comma-separated list of fields to return per document.
        start: Offset of the first document to retrieve.
        rows: Number of documents to request per page.
        base_url: URL of the Solr select endpoint.
        app_id: API application id sent with every request.
        app_key: API application key sent with every request.
    """

    def __init__(self, q, fl="id", start=0, rows=30,
                 base_url=HACKFSM_BASEURL, app_id=HACKFSM_ID, app_key=HACKFSM_KEY):
        self.q = q
        self.fl = fl
        self.start = start
        self.rows = rows
        self.base_url = base_url
        self.app_id = app_id
        self.app_key = app_key

        # Offset of the page currently held in self.page.
        self.cursor = start

        # Fetch the first page up front so that numfound is known immediately.
        result = self._get_page(q, fl, self.cursor, self.rows)
        self.numfound = result['response']['numFound']

    def _check_status(self, result):
        """Raise FSMException if the response header carries a non-zero status."""
        status = result['responseHeader']['status']
        if status != 0:
            raise FSMException("status: " + str(status))

    def _get_page(self, q, fl, start, rows):
        """Fetch one page and make it the current page; return the raw response."""
        result = self._call_api(q, fl, start, rows)
        # self.page holds the documents not yet handed out by next();
        # self.page_len remembers how many the page originally contained.
        self.page = result['response']['docs']
        self.page_len = len(self.page)
        return result

    def _call_api(self, q, fl, start, rows):
        """Issue one request and return the decoded JSON response.

        Raises:
            FSMException: if the response header reports a non-zero status.
        """
        params = {
            'q': q,
            'fl': fl,
            'wt': 'json',
            'start': start,
            # Solr's page-size parameter is "rows" (the original sent "row",
            # which the server ignores).
            'rows': rows,
            'app_id': self.app_id,
            'app_key': self.app_key,
        }
        response = requests.get(self.base_url, params=params)
        result = response.json()
        self._check_status(result)

        docs = result['response']['docs']
        # Compare against the count in this response: self.numfound is not
        # set yet while __init__ is fetching the first page.
        numfound = result['response']['numFound']

        # A short page is only expected at the very end of the result set;
        # anything else is worth a warning.
        # NOTE: the logged URL contains app_id and app_key.
        if len(docs) < rows and start + len(docs) != numfound:
            logger.warning("url:{url}, numfound:{numfound}, start+len{start_plus_len}".format(
                url=response.url,
                numfound=numfound,
                start_plus_len=start + len(docs)))

        return result

    def __iter__(self):
        return self

    def __len__(self):
        return self.numfound

    def next(self):
        """Return the next document, fetching the following page when needed."""
        if not self.page:
            # Current page is used up: advance past it and fetch the next one.
            self.cursor += self.page_len
            self._get_page(self.q, self.fl, self.cursor, self.rows)
            if self.page_len == 0:
                raise StopIteration
        return self.page.pop(0)

    # Python 3 spelling of the iterator protocol; ``next`` is the Python 2 one.
    __next__ = next
fsm = FSM("-fsmTeiUrl:[* TO *]", fl="id,fsmTitle,fsmImageUrl,fsmDateCreated")
len(fsm)
685
results = list(islice(fsm,None))
results[:10]
[{u'fsmDateCreated': [u'Nov. 9, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9r90', u'http://nma.berkeley.edu/ark:/28722/bk0005j9s0j'], u'fsmTitle': [u'Professor John Searle speaking to crowd.'], u'id': u'ark:/13030/ft6k40080h'}, {u'fsmDateCreated': [u'Dec. 2, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k2842', u'http://nma.berkeley.edu/ark:/28722/bk0005k285m'], u'fsmTitle': [u'Mario Savio speaking with reporters.'], u'id': u'ark:/13030/tf009n97vn'}, {u'fsmDateCreated': [u'Dec. 2, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k2c2h', u'http://nma.berkeley.edu/ark:/28722/bk0005k2c32'], u'fsmTitle': [u'Joan Baez singing in front of Sproul Hall.'], u'id': u'ark:/13030/tf5j49n838'}, {u'fsmDateCreated': [u'Dec. 3, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9z5w', u'http://nma.berkeley.edu/ark:/28722/bk0005j9z6f'], u'fsmTitle': [u'Girl student being booked on campus before being taken to jail.'], u'id': u'ark:/13030/ft700007tc'}, {u'fsmDateCreated': [u'Oct. 5, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9n7b', u'http://nma.berkeley.edu/ark:/28722/bk0005j9n8w'], u'fsmTitle': [u'Bryan Turner speaking.'], u'id': u'ark:/13030/ft7n39p1mr'}, {u'fsmDateCreated': [u'Nov. 9, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k1b6q', u'http://nma.berkeley.edu/ark:/28722/bk0005k1b78'], u'fsmTitle': [u'Steve Weissman speaking to crowd.'], u'id': u'ark:/13030/tf8w1006vp'}, {u'fsmDateCreated': [u'Nov. 24, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9v37', u'http://nma.berkeley.edu/ark:/28722/bk0005j9v4s'], u'fsmTitle': [u'Professor Morris Hirsch speaking from Sproul steps.'], u'id': u'ark:/13030/ft9f59p3bw'}, {u'fsmDateCreated': [u'Oct. 
1, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k0v2s', u'http://nma.berkeley.edu/ark:/28722/bk0005k0v3b'], u'fsmTitle': [u'Crowd in Sproul Plaza.'], u'id': u'ark:/13030/tf0870010x'}, {u'fsmDateCreated': [u'Dec. 3, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9z1p', u'http://nma.berkeley.edu/ark:/28722/bk0005j9z27'], u'fsmTitle': [u'Crowds in Sproul Plaza'], u'id': u'ark:/13030/ft8199p26d'}, {u'fsmDateCreated': [u'Dec. 2, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9x7g', u'http://nma.berkeley.edu/ark:/28722/bk0005j9x81'], u'fsmTitle': [u'Professor David Hackett talking to his class.'], u'id': u'ark:/13030/ft9000102p'}]
df = DataFrame(results)
len(df)
685
df.fsmImageUrl
0 [http://nma.berkeley.edu/ark:/28722/bk0005j9r9... 1 [http://nma.berkeley.edu/ark:/28722/bk0005k284... 2 [http://nma.berkeley.edu/ark:/28722/bk0005k2c2... 3 [http://nma.berkeley.edu/ark:/28722/bk0005j9z5... 4 [http://nma.berkeley.edu/ark:/28722/bk0005j9n7... 5 [http://nma.berkeley.edu/ark:/28722/bk0005k1b6... 6 [http://nma.berkeley.edu/ark:/28722/bk0005j9v3... 7 [http://nma.berkeley.edu/ark:/28722/bk0005k0v2... 8 [http://nma.berkeley.edu/ark:/28722/bk0005j9z1... 9 [http://nma.berkeley.edu/ark:/28722/bk0005j9x7... 10 [http://nma.berkeley.edu/ark:/28722/bk0005k232... 11 [http://nma.berkeley.edu/ark:/28722/bk0005k047... 12 [http://nma.berkeley.edu/ark:/28722/bk0005k1c8... 13 [http://nma.berkeley.edu/ark:/28722/bk0005k110... 14 [http://nma.berkeley.edu/ark:/28722/bk0005k276... ... 670 NaN 671 NaN 672 NaN 673 NaN 674 NaN 675 NaN 676 NaN 677 NaN 678 NaN 679 NaN 680 NaN 681 NaN 682 NaN 683 NaN 684 NaN Name: fsmImageUrl, Length: 685, dtype: object
from IPython.display import HTML
from jinja2 import Template
CSS = """
<style>
.wrap img {
margin-left: 0px;
margin-right: 0px;
display: inline-block;
width: 150px;
}
.wrap {
/* Prevent vertical gaps */
line-height: 0;
-webkit-column-count: 5;
-webkit-column-gap: 0px;
-moz-column-count: 5;
-moz-column-gap: 0px;
column-count: 5;
column-gap: 0px;
}
.wrap img {
/* Just in case there are inline attributes */
width: 100% !important;
height: auto !important;
}
</style>
"""
# Gallery markup: one <img> per record, using the record's first image URL as
# the source and its first title as the tooltip.
IMAGES_TEMPLATE = CSS + """
<div class="wrap">
{% for item in items %}<img title="{{item.fsmTitle.0}}" src="{{item.fsmImageUrl.0}}"/>{% endfor %}
</div>
"""
# Titles and URLs come straight from the API and are placed inside HTML
# attributes, so have Jinja2 escape them (quotes, <, >, &) on output.
template = Template(IMAGES_TEMPLATE, autoescape=True)
HTML(template.render(items=results[:10]))
To programmatically differentiate records that describe images from records that describe TEI-encoded XML documents, the API permits queries that exclude records having a value in the "unwanted" Url field, i.e. queries that keep only the records in which that field is NULL.
That is, to retrieve TEI documents only, one would query for null values in the fsmImageUrl field. To retrieve images only, one would query for null values in the fsmTeiUrl field.
NOTE: Please observe the hyphen prepended to the field names in the examples below. The hyphen (minus sign) functions here as a NOT operator.
Example that selects TEI-encoded XML documents by excluding records that have a value for fsmImageUrl:
https://<BASE URL>/solr/fsm/select?q=-fsmImageUrl:[* TO *]&wt=json&indent=true&app_id=abcdefgh&app_key=12345678901234567890123456789012
Example that selects images by excluding records that have a value for fsmTeiUrl:
https://<BASE URL>/solr/fsm/select?q=-fsmTeiUrl:[* TO *]&wt=json&indent=true&app_id=abcdefgh&app_key=12345678901234567890123456789012
# TEI-encoded docs
len(FSM("-fsmImageUrl:[* TO *]"))
577
# images
len(FSM("-fsmTeiUrl:[* TO *]", fl="id,fsmImageUrl"))
685
from lxml.html import parse, fromstring
from collections import OrderedDict
# Download the HackFSM API documentation page and parse it as HTML.
api_docs_url = "http://digitalhumanities.berkeley.edu/hackfsm/api/detail"
r = requests.get(api_docs_url).content
doc = fromstring(r)
# All rows of the first table inside the article content.
rows = doc.xpath('//div[@id="content"]/article/div/div/div/table[1]//tr')
# The first row is treated as the header row; keep its cell texts.
headers = [col.text_content().strip() for col in rows[0].findall('td')]
headers
['Field Name', 'Definitions']
# Turn every row after the header row into a list of its stripped cell texts,
# then build an ordered mapping from those rows, preserving page order.
fields = [
    [col.text_content().strip() for col in row.findall('td')]
    for row in rows[1:]
]
fsmfields = OrderedDict(fields)
fsmfields.keys()
['id', 'fsmTitle', 'fsmCreator', 'fsmTypeOfResource', 'fsmDateCreated', 'fsmNote', 'fsmRelatedTitle', 'fsmIdentifier', 'fsmRelatedIdentifier', 'fsmPhysicalLocation', 'fsmImageUrl', 'fsmTeiUrl']
fsm = FSM(q="*",fl=",".join(fsmfields.keys()))
len(fsm)
879
df = DataFrame(list(fsm))
len(df)
879
df.head()
fsmCreator | fsmDateCreated | fsmIdentifier | fsmImageUrl | fsmNote | fsmPhysicalLocation | fsmRelatedIdentifier | fsmRelatedTitle | fsmTeiUrl | fsmTitle | fsmTypeOfResource | id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | [Warren (Photographer)] | [Nov. 9, 1964] | [BANC PIC 1959.010 -- NEG pt.3 11-09-64.4] | [http://nma.berkeley.edu/ark:/28722/bk0005j9r9... | [Photographer] | [The Bancroft Library;;, University of Califor... | [http://bancroft.berkeley.edu/FSM/, BANC PIC 1... | [The Free Speech Movement Digital Archive, San... | NaN | [Professor John Searle speaking to crowd.] | [still image] | ark:/13030/ft6k40080h |
1 | [Steven Marcus] | [Dec. 2, 1964] | [BANC PIC 2000.002--NEG Strip 117:36] | [http://nma.berkeley.edu/ark:/28722/bk0005k284... | [Photographer] | [The Bancroft Library;;, University of Califor... | [http://bancroft.berkeley.edu/FSM/, BANC PIC 2... | [The Free Speech Movement Digital Archive, Ste... | NaN | [Mario Savio speaking with reporters.] | [still image] | ark:/13030/tf009n97vn |
2 | [Steven Marcus] | [Dec. 2, 1964] | [BANC PIC 2000.002--NEG Strip 122:42] | [http://nma.berkeley.edu/ark:/28722/bk0005k2c2... | [Photographer] | [The Bancroft Library;;, University of Califor... | [http://bancroft.berkeley.edu/FSM/, BANC PIC 2... | [The Free Speech Movement Digital Archive, Ste... | NaN | [Joan Baez singing in front of Sproul Hall.] | [still image] | ark:/13030/tf5j49n838 |
3 | [Jones (Photographer)] | [Dec. 3, 1964] | [BANC PIC 1959.010 -- NEG pt.3 12-03-64.2] | [http://nma.berkeley.edu/ark:/28722/bk0005j9z5... | [Photographer] | [The Bancroft Library;;, University of Califor... | [http://bancroft.berkeley.edu/FSM/, BANC PIC 1... | [The Free Speech Movement Digital Archive, San... | NaN | [Girl student being booked on campus before be... | [still image] | ark:/13030/ft700007tc |
4 | [Ingman (Photographer)] | [Oct. 5, 1964] | [BANC PIC 1959.010 -- NEG pt.3 10-05-64.4] | [http://nma.berkeley.edu/ark:/28722/bk0005j9n7... | [Photographer] | [The Bancroft Library;;, University of Califor... | [http://bancroft.berkeley.edu/FSM/, BANC PIC 1... | [The Free Speech Movement Digital Archive, San... | NaN | [Bryan Turner speaking.] | [still image] | ark:/13030/ft7n39p1mr |
5 rows × 12 columns
# TEI URIs
len(list(df[~df.fsmTeiUrl.isnull()].fsmTeiUrl.apply(lambda a: a[0])))
194
# null dates
len(df[df.fsmDateCreated.isnull()])
393
# non-null image URLs
len(df[~df.fsmImageUrl.isnull()])
302
df[~df.fsmImageUrl.isnull()].id
0 ark:/13030/ft6k40080h 1 ark:/13030/tf009n97vn 2 ark:/13030/tf5j49n838 3 ark:/13030/ft700007tc 4 ark:/13030/ft7n39p1mr 5 ark:/13030/tf8w1006vp 6 ark:/13030/ft9f59p3bw 7 ark:/13030/tf0870010x 8 ark:/13030/ft8199p26d 9 ark:/13030/ft9000102p 10 ark:/13030/tf7n39n9qb 11 ark:/13030/ft3c6004k4 12 ark:/13030/tf8n39p05g 13 ark:/13030/tf20000235 14 ark:/13030/tf0d5n97ws ... 287 UARC PIC 24B:1:29a 288 UARC PIC 24B:1:23 289 UARC PIC 24B:1:29c 290 UARC PIC 24B:1:18 291 UARC PIC 24B:2:28 292 UARC PIC 24B:2:13 293 UARC PIC 24B:1:6 294 UARC PIC 24B:1:3 295 UARC PIC 24B:2:11 296 UARC PIC 24B:2:10 297 UARC PIC 24B:2:23 298 UARC PIC 24B:2:16 299 UARC PIC 24B:1:29b 300 UARC PIC 24B:1:8 301 UARC PIC 24B:1:16 Name: id, Length: 302, dtype: object
# distribution of number of image URLs
df[~df.fsmImageUrl.isnull()].fsmImageUrl.apply(len).value_counts()
2 245 3 56 4 1 dtype: int64
# let's crawl for images
results_images = list(FSM("-fsmTeiUrl:[* TO *]", fl=",".join(fsmfields.keys())))
len(results_images)
685
df_images=DataFrame(results_images)
df_images[df_images.fsmImageUrl.isnull()]
fsmCreator | fsmDateCreated | fsmIdentifier | fsmImageUrl | fsmNote | fsmPhysicalLocation | fsmRelatedIdentifier | fsmRelatedTitle | fsmTitle | fsmTypeOfResource | id | |
---|---|---|---|---|---|---|---|---|---|---|---|
302 | [Towle, Katherina Amelia, 1898-] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Towle, Katherine Amelia, 1898-, Letter to Mar... | [folder, Box 1:46] | access1 |
303 | [Anastasi, Ron] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Anastasi, Ron, Letter to Ronald J. Anastasi] | [folder, Box 2:8] | access2 |
304 | [Towle, Katherina Amelia, 1898-] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Towle, Katherine Amelia, 1898-, Letter to Mar... | [folder, Box 1:46] | access3 |
305 | [Towle, Katherina Amelia, 1898-] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Towle, Katherine Amelia, 1898-, Letter to Mar... | [folder, Box 1:46] | access4 |
306 | [Anastasi, Ron] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Anastasi, Ron, Note to Ron Anastasi] | [folder, Box 2:8] | access5 |
307 | [Congress of Racial Equality, Berkeley Campus ... | NaN | [BANC MSS 86/157 c] | NaN | NaN | NaN | NaN | [Social Protest Collection, 1960-1982] | [Campus Core-Lator, Vol. I, no.2, Campus Core-... | [fdr, Carton 3:37] | access6 |
308 | [Williams, Arleigh Taber, 1912-] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Memos, Re: Sandor Carl Fuchs] | [folder, Carton 3:58] | access7 |
309 | [Garson, Barbara] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Letter to Marvin (Garson) from Barbara Garson... | [item, Box 2:11:4] | access8 |
310 | [Williams, Arleigh Taber, 1912-] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Memos, Re: Mark Bravo] | [folder, Carton 3:58] | access9 |
311 | [Garson, Barbara] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Letter to Marvin (Garson) from Barbara Garson... | [item, Box 2:11:4] | access10 |
312 | [Garson, Barbara] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Garson, Barbara, Letter to Marvin (Garson) fr... | [folder, Box 2:11] | access11 |
313 | [Garson, Barbara] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Letter to Marvin (Garson) from Barbara Garson... | [item, Box 2:11:4] | access12 |
314 | [Williams, Arleigh Taber, 1912-] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Memos, Re: Donald G. Hatch] | [folder, Carton 3:58] | access13 |
315 | [Garson, Barbara] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Garson, Barbara, Unaddressed letter from Barb... | [folder, Box 2:11] | access14 |
316 | [Williams, Arleigh Taber, 1912-] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Memos, Re: David L. Goines] | [folder, Carton 3:58] | access15 |
317 | [Garson, Barbara] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Unaddressed letter from Barbara Garson, Unadd... | [item, Box 2:11:3] | access16 |
318 | [Williams, Arleigh Taber, 1912-] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Memos, Re: Arthur Lee Goldberg] | [folder, Carton 3:58] | access17 |
319 | [Williams, Arleigh Taber, 1912-] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Memos, Re: Elizabeth C. Gardner] | [folder, Carton 3:58] | access18 |
320 | [Williams, Arleigh Taber, 1912-] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Memos, Re: Mario Robert Savio] | [folder, Carton 3:58] | access19 |
321 | [Garson, Barbara] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Letter to Rabbit (Marvin Garson) from Barbara... | [item, Box 2:11:6] | access20 |
322 | [Garson, Barbara] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Letter to Rabbit (Marvin Garson) from Barbara... | [item, Box 2:11:6] | access21 |
323 | [Garson, Barbara] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Garson, Barbara, Letter to Rabbit (Marvin Gar... | [folder, Box 2:11] | access22 |
324 | [Garson, Barbara] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Garson, Barbara, Letter to Rabbit (Marvin Gar... | [folder, Box 2:11] | access23 |
325 | [Ray, Bill] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Note to Ron Anastasi, Note to Ron Anastasi fr... | [item, Box 2:8:3] | access24 |
326 | [Anastasi, Ron] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Anastasi, Ron, Letter to Ronald J. Anastasi] | [folder, Box 2:8] | access25 |
327 | [Pitelka, Frank A., Museum of Vertebrate Zoology] | NaN | [CU-149] | NaN | NaN | NaN | NaN | [Records of the Office of the Chancellor, Univ... | [Letter to Chancellor Roger W. Heyns, Letter] | [item, Box 72:14:11] | access26 |
328 | [Brace, Richard] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Letter to Ronald J. Anastasi, Letter to Ronal... | [item, Box 2:8:4] | access27 |
329 | [Brace, Richard] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Letter to Ronald J. Anastasi, Letter to Ronal... | [item, Box 2:8:4] | access28 |
330 | [Anastasi, Ron] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Anastasi, Ron, Note to Ron Anastasi] | [folder, Box 2:8] | access29 |
331 | [Garson, Barbara] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Letter to Rabbit (Marvin Garson), Letter to R... | [item, Box 2:11:2] | access30 |
332 | [University of California, Berkeley, Admission... | NaN | [CU-149] | NaN | NaN | NaN | NaN | [Records of the Office of the Chancellor, Univ... | [Admissions of Mario Savio (305-20), Letter to... | [folder, Box 72:14] | access31 |
333 | [University of California, Berkeley, Admission... | NaN | [CU-149] | NaN | NaN | NaN | NaN | [Records of the Office of the Chancellor, Univ... | [Admissions of Mario Savio (305-20), Letter to... | [folder, Box 72:14] | access32 |
334 | [Garson, Barbara] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Garson, Barbara, Letter to Rabbit (Marvin Gar... | [folder, Box 2:11] | access33 |
335 | [Burnstein, Malcolm] | NaN | [CU-149] | NaN | NaN | NaN | NaN | [Records of the Office of the Chancellor, Univ... | [Letter to Dr. William B. Boyd, Vice Chancello... | [item, Box 72:14:19] | access34 |
336 | [Garson, Barbara] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Letter to Rabbit (Marvin Garson), Letter to R... | [item, Box 2:11:1] | access35 |
337 | [Garson, Barbara] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Garson, Barbara, Letter to Rabbit (Marvin Gar... | [folder, Box 2:11] | access36 |
338 | [Levy, Leonard L., Vice-President, Amalgamated... | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Letter to Governor Edmund G. Brown, Letter to... | [item, Box 2:49:1] | access37 |
339 | [Levy, Leonard L., Vice-President, Amalgamated... | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Letter to Governor Edmund G. Brown, Letter to... | [item, Box 2:49:1] | access38 |
340 | [Kerr, Clark Kerr, President of the University] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Letter to Mrs. William Mann, Letter to Mrs. W... | [item, Box 2:47:1] | access39 |
341 | [Kerr, Clark Kerr] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Outgoing Letters, Letter to Mrs. William Mann] | [folder, Box 2:47] | access40 |
342 | [Miller, Dustin Mark (Dusty)] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Miller, Dustin (Dusty), Letter to Professor J... | [folder, Box 2:22] | access41 |
343 | [Levy, Leonard L., Vice-President, Amalgamated... | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Letter to Governor Edmund G. Brown, Letter to... | [item, Box 2:49:1] | access42 |
344 | [Levy, Leonard L.] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Levy, Leonard, Letter to Governor Edmund G. B... | [folder, Box 2:49] | access43 |
345 | [Kerr, Clark Kerr] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Outgoing Letters, Letter to Mr. and Mrs. Max ... | [folder, Box 2:47] | access44 |
346 | [Miller, Dustin Mark (Dusty)] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Letter to Gretchen Kittridge, Letter to Gretc... | [item, Box 2:22:1] | access45 |
347 | [Miller, Dustin Mark (Dusty)] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Miller, Dustin (Dusty), Letter to Gretchen Ki... | [folder, Box 2:22] | access46 |
348 | [Heins, Sulamith Hannah] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Heins, Sulamith Hannah, Letter to Sulamith Ha... | [folder, Box 2:18] | access47 |
349 | [Towle, Katherina Amelia, 1898-] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Letter to Sulamith Hannah Heins from Katherin... | [item, Box 2:18:1] | access48 |
350 | [Towle, Katherina Amelia, 1898-] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Letter to Sulamith Hannah Heins from Katherin... | [item, Box 2:18:1] | access49 |
351 | [Kerr, Clark, 1911-] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Memorandums to Faculty, Memo from Clark Kerr ... | [item, Box 2:47:3] | access50 |
352 | [Kerr, Clark, 1911-] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Memorandums to Faculty, Enclosed memo from Re... | [item, Box 2:47:3] | access51 |
353 | [Williams, Arleigh Taber, 1912-] | NaN | [CU-309] | NaN | NaN | NaN | NaN | [Free Speech Movement Records, 1936-1969] | [Re: Arthur Lee Goldberg, Page 2] | [item, Carton 3:58:4] | access52 |
354 | [Towle, Katherina Amelia, 1898-] | NaN | [CU-149] | NaN | NaN | NaN | NaN | [Records of the Office of the Chancellor, Univ... | [Letter to Acting Chancellor Martin Meyerson f... | [item, Box 72:23:1] | access53 |
355 | [McLaughlin, Donald H. (Donald Hamilton), 1891... | NaN | [Banc Mss 86/60 c] | NaN | NaN | NaN | NaN | [Donald H. McLaughlin Papers, 1930-1984] | [Meyer Committee (Regent's Special Committee t... | [folder, Carton 21:16] | access54 |
356 | [Towle, Katherina Amelia, 1898-] | NaN | [CU-149] | NaN | NaN | NaN | NaN | [Records of the Office of the Chancellor, Univ... | [Letter to Acting Chancellor Martin Meyerson f... | [item, Box 72:23:1] | access55 |
357 | [University of California, Berkeley, Office of... | NaN | [CU-149] | NaN | NaN | NaN | NaN | [Records of the Office of the Chancellor, Univ... | [Dean of Students Office (355), Letter to Acti... | [folder, Box 72:23] | access56 |
358 | [McLaughlin, Donald H. (Donald Hamilton), 1891... | NaN | [Banc Mss 86/60 c] | NaN | NaN | NaN | NaN | [Donald H. McLaughlin Papers, 1930-1984] | [Meyer Committee (Regent's Special Committee t... | [folder, Carton 21:16] | access57 |
359 | [Jones, Hardin B.] | NaN | [Banc Mss 86/60 c] | NaN | NaN | NaN | NaN | [Donald H. McLaughlin Papers, 1930-1984] | [Letter to Arthur Ross, Page 3] | [item, Carton 21:14:7] | access58 |
360 | [Kerr, Clark, 1911-] | NaN | [Banc Mss 86/60 c] | NaN | NaN | NaN | NaN | [Donald H. McLaughlin Papers, 1930-1984] | [Telegram from Clark Kerr to Donald McLaughlin... | [item, Carton 21:2:1] | access59 |
361 | [Jones, Hardin B.] | NaN | [Banc Mss 86/60 c] | NaN | NaN | NaN | NaN | [Donald H. McLaughlin Papers, 1930-1984] | [Letter to Arthur Ross, Page 1] | [item, Carton 21:14:7] | access60 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
383 rows × 11 columns
# would be interesting to see sizes of images and whether we can get at thumbnails
df_images.fsmImageUrl
0 [http://nma.berkeley.edu/ark:/28722/bk0005j9r9... 1 [http://nma.berkeley.edu/ark:/28722/bk0005k284... 2 [http://nma.berkeley.edu/ark:/28722/bk0005k2c2... 3 [http://nma.berkeley.edu/ark:/28722/bk0005j9z5... 4 [http://nma.berkeley.edu/ark:/28722/bk0005j9n7... 5 [http://nma.berkeley.edu/ark:/28722/bk0005k1b6... 6 [http://nma.berkeley.edu/ark:/28722/bk0005j9v3... 7 [http://nma.berkeley.edu/ark:/28722/bk0005k0v2... 8 [http://nma.berkeley.edu/ark:/28722/bk0005j9z1... 9 [http://nma.berkeley.edu/ark:/28722/bk0005j9x7... 10 [http://nma.berkeley.edu/ark:/28722/bk0005k232... 11 [http://nma.berkeley.edu/ark:/28722/bk0005k047... 12 [http://nma.berkeley.edu/ark:/28722/bk0005k1c8... 13 [http://nma.berkeley.edu/ark:/28722/bk0005k110... 14 [http://nma.berkeley.edu/ark:/28722/bk0005k276... ... 670 NaN 671 NaN 672 NaN 673 NaN 674 NaN 675 NaN 676 NaN 677 NaN 678 NaN 679 NaN 680 NaN 681 NaN 682 NaN 683 NaN 684 NaN Name: fsmImageUrl, Length: 685, dtype: object
urlparse.urlparse("http://digitalassets.lib.berkeley.edu/fsm/ucb/images/brk00040569b_c.jpg").netloc
'digitalassets.lib.berkeley.edu'
df_images.fsmImageUrl
0 [http://nma.berkeley.edu/ark:/28722/bk0005j9r9... 1 [http://nma.berkeley.edu/ark:/28722/bk0005k284... 2 [http://nma.berkeley.edu/ark:/28722/bk0005k2c2... 3 [http://nma.berkeley.edu/ark:/28722/bk0005j9z5... 4 [http://nma.berkeley.edu/ark:/28722/bk0005j9n7... 5 [http://nma.berkeley.edu/ark:/28722/bk0005k1b6... 6 [http://nma.berkeley.edu/ark:/28722/bk0005j9v3... 7 [http://nma.berkeley.edu/ark:/28722/bk0005k0v2... 8 [http://nma.berkeley.edu/ark:/28722/bk0005j9z1... 9 [http://nma.berkeley.edu/ark:/28722/bk0005j9x7... 10 [http://nma.berkeley.edu/ark:/28722/bk0005k232... 11 [http://nma.berkeley.edu/ark:/28722/bk0005k047... 12 [http://nma.berkeley.edu/ark:/28722/bk0005k1c8... 13 [http://nma.berkeley.edu/ark:/28722/bk0005k110... 14 [http://nma.berkeley.edu/ark:/28722/bk0005k276... ... 670 NaN 671 NaN 672 NaN 673 NaN 674 NaN 675 NaN 676 NaN 677 NaN 678 NaN 679 NaN 680 NaN 681 NaN 682 NaN 683 NaN 684 NaN Name: fsmImageUrl, Length: 685, dtype: object
# calculate hostnames for all image urls
# might be possible to do this all with pandas
# One set of hostnames per record that has image URLs ...
netlocs = [
    {urlparse.urlparse(url).netloc for url in urls}
    for urls in df_images.fsmImageUrl.dropna()
]
# ... then the union of all of them.
set().union(*netlocs)
{u'nma.berkeley.edu', u'sunsite.berkeley.edu'}
def len2(x):
    """Return ``len(x)``, or NaN when ``x`` has no length (e.g. a NaN cell)."""
    try:
        return len(x)
    except TypeError:
        # Objects without a length, such as float NaN or None, land here.
        return np.nan
df_images.fsmImageUrl.apply(len2) == 3
0 False 1 False 2 False 3 False 4 False 5 False 6 False 7 False 8 False 9 False 10 False 11 False 12 False 13 False 14 False ... 670 False 671 False 672 False 673 False 674 False 675 False 676 False 677 False 678 False 679 False 680 False 681 False 682 False 683 False 684 False Name: fsmImageUrl, Length: 685, dtype: bool
df_images[df_images.fsmImageUrl.apply(len2) == 3].head()
fsmCreator | fsmDateCreated | fsmIdentifier | fsmImageUrl | fsmNote | fsmPhysicalLocation | fsmRelatedIdentifier | fsmRelatedTitle | fsmTitle | fsmTypeOfResource | id | |
---|---|---|---|---|---|---|---|---|---|---|---|
246 | [Hecker, Ron] | [Dec. 7, 1964] | NaN | [http://sunsite.berkeley.edu/FindingAids/dynaw... | NaN | NaN | NaN | [Free Speech Movement Photographs Collection, ] | [Crowd in Sproul Plaza from Student Union balc... | NaN | UARC PIC 24B:2:22 |
247 | [Hecker, Ron] | [Dec. 7, 1964] | NaN | [http://sunsite.berkeley.edu/FindingAids/dynaw... | NaN | NaN | NaN | [Free Speech Movement Photographs Collection, ] | [Crowd at Greek Theater] | NaN | UARC PIC 24B:2:17 |
248 | NaN | [Dec. 7, 1964] | NaN | [http://sunsite.berkeley.edu/FindingAids/dynaw... | NaN | NaN | NaN | [Free Speech Movement Photographs Collection, ] | [View from inside Sproul Hall lobby looking th... | NaN | UARC PIC 24B:1:26 |
249 | [Hecker, Ron] | [Dec. 1964] | NaN | [http://sunsite.berkeley.edu/FindingAids/dynaw... | NaN | NaN | NaN | [Free Speech Movement Photographs Collection, ] | [Student Strike] | NaN | UARC PIC 24B:2:6 |
250 | [Hecker, Ron] | [Dec. 7, 1964] | NaN | [http://sunsite.berkeley.edu/FindingAids/dynaw... | NaN | NaN | NaN | [Free Speech Movement Photographs Collection, ] | [Crowd at Greek Theater] | NaN | UARC PIC 24B:2:21 |
5 rows × 11 columns
df_images[df_images.fsmImageUrl.apply(len2) == 4].ix[100].fsmImageUrl
[u'http://nma.berkeley.edu/ark:/28722/bk001532c4q', u'http://nma.berkeley.edu/ark:/28722/bk001532c7c', u'http://nma.berkeley.edu/ark:/28722/bk001532c58', u'http://nma.berkeley.edu/ark:/28722/bk001532c8x']
# Simple strip of images: each item is an image URL, used both as the source
# and as the tooltip.
IMAGES_TEMPLATE = """
<div class="nowrap">
{% for item in items %}<img title="{{item}}" src="{{item}}"/>{% endfor %}
</div>
"""
# The URLs come from the API and are placed inside HTML attributes, so have
# Jinja2 escape them on output.
template = Template(IMAGES_TEMPLATE, autoescape=True)
# Show all four images of the record labelled 100 among the four-image records.
HTML(template.render(items=df_images[df_images.fsmImageUrl.apply(len2) == 4].ix[100].fsmImageUrl))
len(df[~df.fsmDateCreated.isnull()])
486
s = df[~df.fsmDateCreated.isnull()].fsmDateCreated.apply(len)==2 #.astype('datetime64[ns]')
def first(x):
    """Return ``x[0]``, or NaN when ``x`` has no first element."""
    try:
        return x[0]
    except (TypeError, IndexError, KeyError):
        # TypeError: not subscriptable (float NaN, None);
        # IndexError: empty sequence; KeyError: mapping without key 0.
        return np.nan
df['calc_date'] = pd.to_datetime(df.fsmDateCreated.apply(first), coerce=True)
df[~df.calc_date.isnull()].sort_index(by='calc_date').calc_date
156 1964-01-01 167 1964-01-01 14 1964-01-01 92 1964-01-01 220 1964-01-01 871 1964-01-01 90 1964-01-01 146 1964-01-01 731 1964-01-01 74 1964-01-01 300 1964-01-01 203 1964-01-05 245 1964-10-01 170 1964-10-01 164 1964-10-01 ... 109 1970-05-27 117 1970-05-27 197 1970-05-27 210 1970-05-27 58 1970-05-27 869 1973-01-01 159 1984-10-02 180 1984-10-02 129 1984-10-02 299 1984-10-02 287 1984-10-02 289 1984-10-02 868 1986-01-01 801 1990-01-01 867 1993-06-27 Name: calc_date, Length: 434, dtype: datetime64[ns]
pd.to_datetime(df.fsmDateCreated.dropna().apply(lambda s:s[0]).astype('str'), coerce=True).dropna()
0 1964-11-09 1 1964-12-02 2 1964-12-02 3 1964-12-03 4 1964-10-05 5 1964-11-09 6 1964-11-24 7 1964-10-01 8 1964-12-03 9 1964-12-02 10 1964-12-03 11 1964-12-07 12 1964-11-09 13 1964-10-01 14 1964-01-01 ... 863 1965-07-26 864 1965-10-13 865 1965-03-05 867 1993-06-27 868 1986-01-01 869 1973-01-01 870 1965-01-27 871 1964-01-01 872 1964-11-30 873 1964-12-04 874 1964-12-22 875 1965-01-07 876 1964-12-21 877 1965-01-09 878 1965-01-02 Name: fsmDateCreated, Length: 434, dtype: datetime64[ns]
# http://stackoverflow.com/questions/17690738/in-pandas-how-do-i-convert-a-string-of-date-strings-to-datetime-objects-and-put
date_stngs = ('2008-12-20','2008-12-21','2008-12-22','2008-12-23','Nov. 9, 1964', 'junk')
pd.to_datetime(pd.Series(date_stngs),coerce=True)
0 2008-12-20 1 2008-12-21 2 2008-12-22 3 2008-12-23 4 1964-11-09 5 NaT dtype: datetime64[ns]
def f(x):
    """Return the elements of ``x`` as a set, or an empty set.

    Used with ``Series.apply`` on a column whose cells may be lists or
    missing values.

    :param x: any object; typically an iterable of hashable items.
    :returns: ``set(x)`` when ``x`` can be converted, otherwise ``set()``.
    """
    try:
        return set(x)
    except TypeError:
        # set() raises TypeError for non-iterables (e.g. a float NaN or None)
        # and for unhashable elements; only that case maps to "no values".
        # Other exceptions (KeyboardInterrupt, ...) propagate.
        return set()
# Union of every value that appears in fsmTypeOfResource across all records
# (each cell is first turned into a set by f).
set().union(*df.fsmTypeOfResource.apply(f))
{u'Box 1:1', u'Box 1:11', u'Box 1:11:4', u'Box 1:13', u'Box 1:13:1', u'Box 1:13:4', u'Box 1:14', u'Box 1:15', u'Box 1:16', u'Box 1:17', u'Box 1:2', u'Box 1:25', u'Box 1:25:1', u'Box 1:25:4', u'Box 1:28', u'Box 1:29', u'Box 1:2:3', u'Box 1:30', u'Box 1:30:2', u'Box 1:32', u'Box 1:34', u'Box 1:34:1', u'Box 1:38', u'Box 1:39', u'Box 1:4', u'Box 1:41', u'Box 1:43', u'Box 1:44', u'Box 1:45', u'Box 1:46', u'Box 1:5', u'Box 1:6', u'Box 1:7', u'Box 1:8', u'Box 2:11', u'Box 2:11:1', u'Box 2:11:2', u'Box 2:11:3', u'Box 2:11:4', u'Box 2:11:6', u'Box 2:18', u'Box 2:18:1', u'Box 2:22', u'Box 2:22:1', u'Box 2:47', u'Box 2:47:1', u'Box 2:47:3', u'Box 2:49', u'Box 2:49:1', u'Box 2:49:2', u'Box 2:55', u'Box 2:59', u'Box 2:8', u'Box 2:8:3', u'Box 2:8:4', u'Box 3:1', u'Box 3:11', u'Box 3:14', u'Box 3:14:1', u'Box 3:15', u'Box 3:17', u'Box 3:17:2', u'Box 3:2', u'Box 3:21', u'Box 3:22', u'Box 3:23', u'Box 3:26', u'Box 3:29', u'Box 3:29:1', u'Box 3:29:2', u'Box 3:3', u'Box 3:31', u'Box 3:33', u'Box 3:34', u'Box 3:34:1', u'Box 3:36', u'Box 3:38', u'Box 3:39', u'Box 3:39:4', u'Box 3:39:5', u'Box 3:39:8', u'Box 3:40', u'Box 3:41', u'Box 3:5', u'Box 4:10', u'Box 4:5', u'Box 4:5:13', u'Box 4:5:2', u'Box 4:5:6', u'Box 4:8', u'Box 4:8:5', u'Box 4:9', u'Box 4:9:3', u'Box 70:33', u'Box 70:33:2', u'Box 70:33:4', u'Box 70:34', u'Box 70:34:1', u'Box 70:34:3', u'Box 70:34:7c', u'Box 70:34:8', u'Box 72:14', u'Box 72:14:1', u'Box 72:14:11', u'Box 72:14:19', u'Box 72:23', u'Box 72:23:1', u'Carton 1:12', u'Carton 1:12:2', u'Carton 1:12:3', u'Carton 1:12:4', u'Carton 1:12:5', u'Carton 1:12:6', u'Carton 1:12:7', u'Carton 1:12:8', u'Carton 1:14', u'Carton 1:15', u'Carton 1:9', u'Carton 21:14:1', u'Carton 21:14:7', u'Carton 21:16', u'Carton 21:2:1', u'Carton 2:20', u'Carton 2:32', u'Carton 3:16', u'Carton 3:37', u'Carton 3:58', u'Carton 3:58:4', u'Carton 3:58:7', u'Carton 4:32', u'Carton 4:78', u'Carton 4:80', u'agendas', u'articles', u'briefs (legal documents)', u'detail', u'fdr', u'fliers (printed 
matter)', u'folder', u'form letters', u'group statements', u'item', u'leaflets', u'letters (correspondence)', u'magazines (periodicals)', u'memorandums', u'minutes', u'miscellaneous', u'news bulletins', u'newsletters', u'newspapers', u'oral histories', u'pamphlets', u'papers (document genres)', u'personal statement', u'personal statements', u'progress reports', u'reports', u'still image', u'tables of content', u'text', u'title pages', u'transcripts'}
# Related identifiers: number of records with a non-null fsmRelatedIdentifier.
len(df['fsmRelatedIdentifier'].dropna())
236
# Non-null fsmTeiUrl values, indexed by record.
df['fsmTeiUrl'].dropna()
685 [http://content.cdlib.org/xml/ark:/13030/kt5m3... 686 [http://content.cdlib.org/xml/ark:/13030/kt5s2... 687 [http://content.cdlib.org/xml/ark:/13030/kt6k4... 688 [http://content.cdlib.org/xml/ark:/13030/kt4s2... 689 [http://content.cdlib.org/xml/ark:/13030/kt1h4... 690 [http://content.cdlib.org/xml/ark:/13030/kt2w1... 691 [http://content.cdlib.org/xml/ark:/13030/kt609... 692 [http://content.cdlib.org/xml/ark:/13030/kt638... 693 [http://content.cdlib.org/xml/ark:/13030/kt777... 694 [http://content.cdlib.org/xml/ark:/13030/kt0k4... 695 [http://content.cdlib.org/xml/ark:/13030/kt6m3... 696 [http://content.cdlib.org/xml/ark:/13030/kt287... 697 [http://content.cdlib.org/xml/ark:/13030/kt3p3... 698 [http://content.cdlib.org/xml/ark:/13030/kt177... 699 [http://content.cdlib.org/xml/ark:/13030/kt1g5... ... 864 [http://content.cdlib.org/xml/ark:/13030/kt3z0... 865 [http://content.cdlib.org/xml/ark:/13030/kt5h4... 866 [http://content.cdlib.org/xml/ark:/13030/kt1v1... 867 [http://content.cdlib.org/xml/ark:/13030/kt7d5... 868 [http://content.cdlib.org/xml/ark:/13030/kt7h4... 869 [http://content.cdlib.org/xml/ark:/13030/kt919... 870 [http://content.cdlib.org/xml/ark:/13030/kt409... 871 [http://content.cdlib.org/xml/ark:/13030/kt4c6... 872 [http://content.cdlib.org/xml/ark:/13030/kt387... 873 [http://content.cdlib.org/xml/ark:/13030/kt3q2... 874 [http://content.cdlib.org/xml/ark:/13030/kt7v1... 875 [http://content.cdlib.org/xml/ark:/13030/kt038... 876 [http://content.cdlib.org/xml/ark:/13030/kt7z0... 877 [http://content.cdlib.org/xml/ark:/13030/kt500... 878 [http://content.cdlib.org/xml/ark:/13030/kt9b6... Name: fsmTeiUrl, Length: 194, dtype: object