HackFSM
Relationship to other public APIs based on Solr?
Documentation:
from settings import (HACKFSM_ID, HACKFSM_KEY, HACKFSM_BASEURL)
from itertools import islice
import logging
import requests
import json
import urllib
import urlparse
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
# Send log records of level WARNING and above to a log file for this experiment.
logging.basicConfig(filename='Experiment_20140325_HackFSM.log',level=logging.WARNING)
# getLogger() without a name returns the root logger configured above.
logger=logging.getLogger()
def query(q, fl="id"):
    """Run a single query against the HackFSM endpoint and return the decoded JSON.

    q  -- query string sent as the ``q`` parameter
    fl -- comma-separated list of fields to return (``id`` by default)

    The application id and key from the settings module are sent with the
    request, and the response is requested in JSON format.
    """
    params = {'q':q,
              'fl':fl,
              'wt':'json',
              'app_id':HACKFSM_ID,
              'app_key':HACKFSM_KEY}
    endpoint = "{base_url}?".format(base_url=HACKFSM_BASEURL)
    response = requests.get(endpoint + urllib.urlencode(params))
    return response.json()
# Sample call: records matching fsmTitle:Savio, returning only the default "id" field.
result = query(q="fsmTitle:Savio")['response']
result
{u'docs': [{u'id': u'ark:/13030/ft2f59n853'}, {u'id': u'access143'}, {u'id': u'ark:/13030/tf2q2n99d3'}, {u'id': u'ark:/13030/tf3p3003k7'}, {u'id': u'ark:/13030/tf5m3nb15b'}, {u'id': u'ark:/13030/tf267n996q'}, {u'id': u'access326'}, {u'id': u'access327'}, {u'id': u'access328'}, {u'id': u'access329'}, {u'id': u'access330'}, {u'id': u'access331'}, {u'id': u'access332'}, {u'id': u'access333'}, {u'id': u'access334'}, {u'id': u'access335'}, {u'id': u'access339'}, {u'id': u'access340'}, {u'id': u'access341'}, {u'id': u'access343'}, {u'id': u'access344'}, {u'id': u'access345'}, {u'id': u'access346'}, {u'id': u'access347'}, {u'id': u'access348'}, {u'id': u'access365'}, {u'id': u'access366'}, {u'id': u'access367'}, {u'id': u'access369'}, {u'id': u'access370'}], u'numFound': 124, u'start': 0}
# try again
# http://stackoverflow.com/a/5724453/7782
# http://excess.org/article/2013/02/itergen1/
class my_g(object):
    """Toy iterator that yields 0, 1, ..., max_count - 1 and also supports len().

    len() always reports ``max_count``, regardless of how many values have
    already been consumed.
    """

    def __init__(self, max_count):
        """Prepare to count from 0 up to (but not including) ``max_count``."""
        # A counter replaces a materialised list, so each step is O(1)
        # instead of the O(n) cost of list.pop(0).
        self._next_value = 0
        self._len = max_count

    def __iter__(self):
        """Return self: the object is its own iterator."""
        return self

    def __len__(self):
        """Return the total number of values this iterator was created with."""
        return self._len

    def next(self):
        """Return the next integer, or raise StopIteration when exhausted."""
        if self._next_value >= self._len:
            raise StopIteration
        value = self._next_value
        self._next_value += 1
        return value

    # Python 3 spelling of the iterator protocol; harmless under Python 2.
    __next__ = next
# Exercise the toy iterator: len() is available up front, list() drains it.
g=my_g(10)
print len(g)
list(g)
10
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
class FSMException(Exception):
    """Error raised when the HackFSM API reports a non-zero response status."""
    # NOTE(review): FSM._check_status raises this name, but no definition of it
    # is visible in this part of the file -- drop this class if one exists elsewhere.


class FSM(object):
    """Iterator over every document matching a query against the HackFSM (Solr) API.

    The constructor fetches the first page immediately, so ``len()`` works
    right away; later pages are fetched lazily while the iterator is consumed.
    ``len()`` reports the total number of matches (``numFound``), not the
    number of documents still to be yielded.
    """

    def __init__(self, q, fl="id", start=0, rows=30,
                 base_url=HACKFSM_BASEURL, app_id=HACKFSM_ID, app_key=HACKFSM_KEY):
        """Store the query settings and load the first page of results.

        q        -- query string sent as the ``q`` parameter
        fl       -- comma-separated list of fields to return
        start    -- offset of the first document to retrieve
        rows     -- number of documents requested per page
        base_url -- URL of the select endpoint
        app_id   -- application id sent with every request
        app_key  -- application key sent with every request

        Raises FSMException if the API reports a non-zero status.
        """
        self.q = q
        self.fl = fl
        self.start = start
        self.rows = rows
        self.base_url = base_url
        self.app_id = app_id
        self.app_key = app_key

        # offset of the first document of the page currently held in self.page
        self.cursor = start

        # get the first page and the total number of matches
        result = self._get_page(q, fl, self.cursor, self.rows)
        self.numfound = result['response']['numFound']

    def _check_status(self, result):
        """Raise FSMException if the response header carries a non-zero status."""
        status = result['responseHeader']['status']
        if status != 0:
            raise FSMException("status: " + str(status))

    def _get_page(self, q, fl, start, rows):
        """Fetch one page, remember its documents in self.page, and return the response."""
        result = self._call_api(q, fl, start, rows)
        # update current page
        self.page = result['response']['docs']
        self.page_len = len(self.page)
        return result

    def _call_api(self, q, fl, start, rows):
        """Request one page from the API and return the decoded JSON response.

        Logs a warning when fewer documents than requested come back anywhere
        other than at the end of the result set.
        Raises FSMException if the API reports a non-zero status.
        """
        url = "{base_url}?".format(base_url=self.base_url) + \
            urllib.urlencode({'q': q,
                              'fl': fl,
                              'wt': 'json',
                              'start': start,
                              # Solr's page-size parameter is spelled "rows"
                              'rows': rows,
                              'app_id': self.app_id,
                              'app_key': self.app_key})
        result = requests.get(url).json()
        self._check_status(result)

        response = result['response']
        received = len(response['docs'])
        # Take the total from this very response: self.numfound is not assigned
        # yet while __init__ is still fetching the first page.
        numfound = response['numFound']
        # a short page is only expected at the very end of the results
        if received < rows and start + received != numfound:
            logger.warning("url:{url}, numfound:{numfound}, start+len{start_plus_len}".format(
                url=url,
                numfound=numfound,
                start_plus_len=start + received))
        return result

    def __iter__(self):
        """Return self: the object is its own iterator."""
        return self

    def __len__(self):
        """Return the total number of documents matching the query."""
        return self.numfound

    def next(self):
        """Return the next document, fetching the following page when needed.

        Raises StopIteration once a fetched page comes back empty.
        """
        if not self.page:
            # current page is used up: advance past it and load the next one
            self.cursor += self.page_len
            self._get_page(self.q, self.fl, self.cursor, self.rows)
            if self.page_len == 0:
                raise StopIteration
        return self.page.pop(0)

    # Python 3 spelling of the iterator protocol; harmless under Python 2.
    __next__ = next
# The leading "-" negates the range query, so this selects records that have
# no fsmTeiUrl value.
fsm = FSM("-fsmTeiUrl:[* TO *]", fl="id,fsmTitle,fsmImageUrl,fsmDateCreated")
len(fsm)
685
# Drain the iterator: islice(..., None) imposes no limit, so every record is fetched.
results = list(islice(fsm,None))
results[:10]
[{u'fsmDateCreated': [u'Nov. 9, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9r90', u'http://nma.berkeley.edu/ark:/28722/bk0005j9s0j'], u'fsmTitle': [u'Professor John Searle speaking to crowd.'], u'id': u'ark:/13030/ft6k40080h'}, {u'fsmDateCreated': [u'Dec. 2, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k2842', u'http://nma.berkeley.edu/ark:/28722/bk0005k285m'], u'fsmTitle': [u'Mario Savio speaking with reporters.'], u'id': u'ark:/13030/tf009n97vn'}, {u'fsmDateCreated': [u'Dec. 2, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k2c2h', u'http://nma.berkeley.edu/ark:/28722/bk0005k2c32'], u'fsmTitle': [u'Joan Baez singing in front of Sproul Hall.'], u'id': u'ark:/13030/tf5j49n838'}, {u'fsmDateCreated': [u'Dec. 3, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9z5w', u'http://nma.berkeley.edu/ark:/28722/bk0005j9z6f'], u'fsmTitle': [u'Girl student being booked on campus before being taken to jail.'], u'id': u'ark:/13030/ft700007tc'}, {u'fsmDateCreated': [u'Oct. 5, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9n7b', u'http://nma.berkeley.edu/ark:/28722/bk0005j9n8w'], u'fsmTitle': [u'Bryan Turner speaking.'], u'id': u'ark:/13030/ft7n39p1mr'}, {u'fsmDateCreated': [u'Nov. 9, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k1b6q', u'http://nma.berkeley.edu/ark:/28722/bk0005k1b78'], u'fsmTitle': [u'Steve Weissman speaking to crowd.'], u'id': u'ark:/13030/tf8w1006vp'}, {u'fsmDateCreated': [u'Nov. 24, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9v37', u'http://nma.berkeley.edu/ark:/28722/bk0005j9v4s'], u'fsmTitle': [u'Professor Morris Hirsch speaking from Sproul steps.'], u'id': u'ark:/13030/ft9f59p3bw'}, {u'fsmDateCreated': [u'Oct. 
1, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k0v2s', u'http://nma.berkeley.edu/ark:/28722/bk0005k0v3b'], u'fsmTitle': [u'Crowd in Sproul Plaza.'], u'id': u'ark:/13030/tf0870010x'}, {u'fsmDateCreated': [u'Dec. 3, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9z1p', u'http://nma.berkeley.edu/ark:/28722/bk0005j9z27'], u'fsmTitle': [u'Crowds in Sproul Plaza'], u'id': u'ark:/13030/ft8199p26d'}, {u'fsmDateCreated': [u'Dec. 2, 1964'], u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9x7g', u'http://nma.berkeley.edu/ark:/28722/bk0005j9x81'], u'fsmTitle': [u'Professor David Hackett talking to his class.'], u'id': u'ark:/13030/ft9000102p'}]
df = DataFrame(results)
len(df)
685
df.fsmImageUrl
0 [http://nma.berkeley.edu/ark:/28722/bk0005j9r9... 1 [http://nma.berkeley.edu/ark:/28722/bk0005k284... 2 [http://nma.berkeley.edu/ark:/28722/bk0005k2c2... 3 [http://nma.berkeley.edu/ark:/28722/bk0005j9z5... 4 [http://nma.berkeley.edu/ark:/28722/bk0005j9n7... 5 [http://nma.berkeley.edu/ark:/28722/bk0005k1b6... 6 [http://nma.berkeley.edu/ark:/28722/bk0005j9v3... 7 [http://nma.berkeley.edu/ark:/28722/bk0005k0v2... 8 [http://nma.berkeley.edu/ark:/28722/bk0005j9z1... 9 [http://nma.berkeley.edu/ark:/28722/bk0005j9x7... 10 [http://nma.berkeley.edu/ark:/28722/bk0005k232... 11 [http://nma.berkeley.edu/ark:/28722/bk0005k047... 12 [http://nma.berkeley.edu/ark:/28722/bk0005k1c8... 13 [http://nma.berkeley.edu/ark:/28722/bk0005k110... 14 [http://nma.berkeley.edu/ark:/28722/bk0005k276... ... 670 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 671 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 672 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 673 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 674 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 675 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 676 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 677 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 678 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 679 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 680 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 681 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 682 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 683 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 684 [http://digitalassets.lib.berkeley.edu/fsm/ucb... Name: fsmImageUrl, Length: 685, dtype: object
from IPython.display import HTML
from jinja2 import Template
# Stylesheet for a five-column image wall.
CSS = """
<style>
.wrap img {
margin-left: 0px;
margin-right: 0px;
display: inline-block;
width: 150px;
}
.wrap {
/* Prevent vertical gaps */
line-height: 0;
-webkit-column-count: 5;
-webkit-column-gap: 0px;
-moz-column-count: 5;
-moz-column-gap: 0px;
column-count: 5;
column-gap: 0px;
}
.wrap img {
/* Just in case there are inline attributes */
width: 100% !important;
height: auto !important;
}
</style>
"""
# One <img> per record, using the first title and the first image URL of each.
IMAGES_TEMPLATE = CSS + """
<div class="wrap">
{% for item in items %}<img title="{{item.fsmTitle.0}}" src="{{item.fsmImageUrl.0}}"/>{% endfor %}
</div>
"""
# Titles and URLs come from the API response, so escape them when they are
# interpolated into HTML attributes (a quote or "<" in a title would otherwise
# break or inject markup). The literal CSS/markup above is not affected.
template = Template(IMAGES_TEMPLATE, autoescape=True)
HTML(template.render(items=results[:10]))
To programmatically differentiate records that describe images from records that describe TEI-encoded XML documents, the API permits queries that keep only the records with NULL values in the "unwanted" URL field (that is, queries that exclude every record in which that field is populated).
That is, to retrieve TEI documents only, one would query for null values in the fsmImageUrl field. To retrieve images only, one would query for null values in the fsmTeiUrl field.
NOTE: Please observe the hyphen prepended to the field names in the examples below. The hyphen (minus sign) functions here as a NOT operator.
Example that selects TEI-encoded XML documents by excluding records that have a value in fsmImageUrl:
https://<BASE URL>/solr/fsm/select?q=-fsmImageUrl:[* TO *]&wt=json&indent=true&app_id=abcdefgh&app_key=12345678901234567890123456789012
Example that selects images by excluding records that have a value in fsmTeiUrl:
https://<BASE URL>/solr/fsm/select?q=-fsmTeiUrl:[* TO *]&wt=json&indent=true&app_id=abcdefgh&app_key=12345678901234567890123456789012
# TEI-encoded docs
len(FSM("-fsmImageUrl:[* TO *]"))
194
# images
len(FSM("-fsmTeiUrl:[* TO *]", fl="id,fsmImageUrl"))
685
from lxml.html import parse, fromstring
from collections import OrderedDict
# Scrape the table of field definitions from the HackFSM API documentation page.
api_docs_url = "http://digitalhumanities.berkeley.edu/hackfsm/api/detail"
r = requests.get(api_docs_url).content
doc = fromstring(r)
# all rows of the first table found under the page's content article
rows = doc.xpath('//div[@id="content"]/article/div/div/div/table[1]//tr')
# the first row holds the column headings
headers = [col.text_content().strip() for col in rows[0].findall('td')]
headers
['Field Name', 'Definitions']
# Every row after the heading row becomes a list of its stripped cell texts.
fields = [[col.text_content().strip() for col in row.findall('td')]
          for row in rows[1:]]
# An OrderedDict keeps the entries in the order the rows appear in the table.
fsmfields = OrderedDict(fields)
fsmfields.keys()
['id', 'fsmTitle', 'fsmCreator', 'fsmTypeOfResource', 'fsmDateCreated', 'fsmNote', 'fsmRelatedTitle', 'fsmIdentifier', 'fsmRelatedIdentifier', 'fsmPhysicalLocation', 'fsmImageUrl', 'fsmTeiUrl']
fsm = FSM(q="*",fl=",".join(fsmfields.keys()))
len(fsm)
879
df = DataFrame(list(fsm))
len(df)
879
df.head()
fsmCreator | fsmDateCreated | fsmIdentifier | fsmImageUrl | fsmNote | fsmPhysicalLocation | fsmRelatedIdentifier | fsmRelatedTitle | fsmTeiUrl | fsmTitle | fsmTypeOfResource | id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | [Warren (Photographer)] | [Nov. 9, 1964] | [BANC PIC 1959.010 -- NEG pt.3 11-09-64.4] | [http://nma.berkeley.edu/ark:/28722/bk0005j9r9... | [Photographer] | [The Bancroft Library;;, University of Califor... | [http://bancroft.berkeley.edu/FSM/, BANC PIC 1... | [The Free Speech Movement Digital Archive, San... | NaN | [Professor John Searle speaking to crowd.] | [still image] | ark:/13030/ft6k40080h |
1 | [Steven Marcus] | [Dec. 2, 1964] | [BANC PIC 2000.002--NEG Strip 117:36] | [http://nma.berkeley.edu/ark:/28722/bk0005k284... | [Photographer] | [The Bancroft Library;;, University of Califor... | [http://bancroft.berkeley.edu/FSM/, BANC PIC 2... | [The Free Speech Movement Digital Archive, Ste... | NaN | [Mario Savio speaking with reporters.] | [still image] | ark:/13030/tf009n97vn |
2 | [Steven Marcus] | [Dec. 2, 1964] | [BANC PIC 2000.002--NEG Strip 122:42] | [http://nma.berkeley.edu/ark:/28722/bk0005k2c2... | [Photographer] | [The Bancroft Library;;, University of Califor... | [http://bancroft.berkeley.edu/FSM/, BANC PIC 2... | [The Free Speech Movement Digital Archive, Ste... | NaN | [Joan Baez singing in front of Sproul Hall.] | [still image] | ark:/13030/tf5j49n838 |
3 | [Jones (Photographer)] | [Dec. 3, 1964] | [BANC PIC 1959.010 -- NEG pt.3 12-03-64.2] | [http://nma.berkeley.edu/ark:/28722/bk0005j9z5... | [Photographer] | [The Bancroft Library;;, University of Califor... | [http://bancroft.berkeley.edu/FSM/, BANC PIC 1... | [The Free Speech Movement Digital Archive, San... | NaN | [Girl student being booked on campus before be... | [still image] | ark:/13030/ft700007tc |
4 | [Ingman (Photographer)] | [Oct. 5, 1964] | [BANC PIC 1959.010 -- NEG pt.3 10-05-64.4] | [http://nma.berkeley.edu/ark:/28722/bk0005j9n7... | [Photographer] | [The Bancroft Library;;, University of Califor... | [http://bancroft.berkeley.edu/FSM/, BANC PIC 1... | [The Free Speech Movement Digital Archive, San... | NaN | [Bryan Turner speaking.] | [still image] | ark:/13030/ft7n39p1mr |
5 rows × 12 columns
# TEI URIs
len(list(df[~df.fsmTeiUrl.isnull()].fsmTeiUrl.apply(lambda a: a[0])))
194
# null dates
len(df[df.fsmDateCreated.isnull()])
393
# non-null image URLs
len(df[~df.fsmImageUrl.isnull()])
685
df[~df.fsmImageUrl.isnull()].id
0 ark:/13030/ft6k40080h 1 ark:/13030/tf009n97vn 2 ark:/13030/tf5j49n838 3 ark:/13030/ft700007tc 4 ark:/13030/ft7n39p1mr 5 ark:/13030/tf8w1006vp 6 ark:/13030/ft9f59p3bw 7 ark:/13030/tf0870010x 8 ark:/13030/ft8199p26d 9 ark:/13030/ft9000102p 10 ark:/13030/tf7n39n9qb 11 ark:/13030/ft3c6004k4 12 ark:/13030/tf8n39p05g 13 ark:/13030/tf20000235 14 ark:/13030/tf0d5n97ws ... 670 access369 671 access370 672 access371 673 access372 674 access373 675 access374 676 access375 677 access376 678 access377 679 access378 680 access379 681 access380 682 access381 683 access382 684 access383 Name: id, Length: 685, dtype: object
# distribution of number of image URLs
df[~df.fsmImageUrl.isnull()].fsmImageUrl.apply(len).value_counts()
2 628 3 56 4 1 dtype: int64
# let's crawl for images
results_images = list(FSM("-fsmTeiUrl:[* TO *]", fl=",".join(fsmfields.keys())))
len(results_images)
685
df_images=DataFrame(results_images)
df_images[df_images.fsmImageUrl.isnull()]
Int64Index([], dtype='int64') | Empty DataFrame |
0 rows × 11 columns
# would be interesting to see sizes of images and whether we can get at thumbnails
df_images.fsmImageUrl
0 [http://nma.berkeley.edu/ark:/28722/bk0005j9r9... 1 [http://nma.berkeley.edu/ark:/28722/bk0005k284... 2 [http://nma.berkeley.edu/ark:/28722/bk0005k2c2... 3 [http://nma.berkeley.edu/ark:/28722/bk0005j9z5... 4 [http://nma.berkeley.edu/ark:/28722/bk0005j9n7... 5 [http://nma.berkeley.edu/ark:/28722/bk0005k1b6... 6 [http://nma.berkeley.edu/ark:/28722/bk0005j9v3... 7 [http://nma.berkeley.edu/ark:/28722/bk0005k0v2... 8 [http://nma.berkeley.edu/ark:/28722/bk0005j9z1... 9 [http://nma.berkeley.edu/ark:/28722/bk0005j9x7... 10 [http://nma.berkeley.edu/ark:/28722/bk0005k232... 11 [http://nma.berkeley.edu/ark:/28722/bk0005k047... 12 [http://nma.berkeley.edu/ark:/28722/bk0005k1c8... 13 [http://nma.berkeley.edu/ark:/28722/bk0005k110... 14 [http://nma.berkeley.edu/ark:/28722/bk0005k276... ... 670 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 671 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 672 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 673 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 674 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 675 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 676 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 677 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 678 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 679 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 680 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 681 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 682 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 683 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 684 [http://digitalassets.lib.berkeley.edu/fsm/ucb... Name: fsmImageUrl, Length: 685, dtype: object
urlparse.urlparse("http://digitalassets.lib.berkeley.edu/fsm/ucb/images/brk00040569b_c.jpg").netloc
'digitalassets.lib.berkeley.edu'
df_images.fsmImageUrl
0 [http://nma.berkeley.edu/ark:/28722/bk0005j9r9... 1 [http://nma.berkeley.edu/ark:/28722/bk0005k284... 2 [http://nma.berkeley.edu/ark:/28722/bk0005k2c2... 3 [http://nma.berkeley.edu/ark:/28722/bk0005j9z5... 4 [http://nma.berkeley.edu/ark:/28722/bk0005j9n7... 5 [http://nma.berkeley.edu/ark:/28722/bk0005k1b6... 6 [http://nma.berkeley.edu/ark:/28722/bk0005j9v3... 7 [http://nma.berkeley.edu/ark:/28722/bk0005k0v2... 8 [http://nma.berkeley.edu/ark:/28722/bk0005j9z1... 9 [http://nma.berkeley.edu/ark:/28722/bk0005j9x7... 10 [http://nma.berkeley.edu/ark:/28722/bk0005k232... 11 [http://nma.berkeley.edu/ark:/28722/bk0005k047... 12 [http://nma.berkeley.edu/ark:/28722/bk0005k1c8... 13 [http://nma.berkeley.edu/ark:/28722/bk0005k110... 14 [http://nma.berkeley.edu/ark:/28722/bk0005k276... ... 670 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 671 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 672 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 673 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 674 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 675 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 676 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 677 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 678 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 679 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 680 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 681 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 682 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 683 [http://digitalassets.lib.berkeley.edu/fsm/ucb... 684 [http://digitalassets.lib.berkeley.edu/fsm/ucb... Name: fsmImageUrl, Length: 685, dtype: object
# calculate hostnames for all image urls
# might be possible to do this all with pandas
# each record contributes the set of hosts that appear in its list of image URLs
netlocs = list(df_images.fsmImageUrl.dropna().apply(lambda urls: set([urlparse.urlparse(url).netloc for url in urls])))
# union the per-record sets into a single set of distinct hosts
reduce(lambda x,y: x | y, netlocs, set())
{u'digitalassets.lib.berkeley.edu', u'nma.berkeley.edu', u'sunsite.berkeley.edu'}
def len2(x):
    """Return len(x), or NaN when x has no length (e.g. a missing value)."""
    try:
        return len(x)
    except TypeError:
        # only "object has no len()" is expected here; anything else should surface
        return np.nan
df_images.fsmImageUrl.apply(len2) == 3
0 False 1 False 2 False 3 False 4 False 5 False 6 False 7 False 8 False 9 False 10 False 11 False 12 False 13 False 14 False ... 670 False 671 False 672 False 673 False 674 False 675 False 676 False 677 False 678 False 679 False 680 False 681 False 682 False 683 False 684 False Name: fsmImageUrl, Length: 685, dtype: bool
df_images[df_images.fsmImageUrl.apply(len2) == 3].head()
fsmCreator | fsmDateCreated | fsmIdentifier | fsmImageUrl | fsmNote | fsmPhysicalLocation | fsmRelatedIdentifier | fsmRelatedTitle | fsmTitle | fsmTypeOfResource | id | |
---|---|---|---|---|---|---|---|---|---|---|---|
246 | [Hecker, Ron] | [Dec. 7, 1964] | NaN | [http://sunsite.berkeley.edu/FindingAids/dynaw... | NaN | NaN | NaN | [Free Speech Movement Photographs Collection, ] | [Crowd in Sproul Plaza from Student Union balc... | NaN | UARC PIC 24B:2:22 |
247 | [Hecker, Ron] | [Dec. 7, 1964] | NaN | [http://sunsite.berkeley.edu/FindingAids/dynaw... | NaN | NaN | NaN | [Free Speech Movement Photographs Collection, ] | [Crowd at Greek Theater] | NaN | UARC PIC 24B:2:17 |
248 | NaN | [Dec. 7, 1964] | NaN | [http://sunsite.berkeley.edu/FindingAids/dynaw... | NaN | NaN | NaN | [Free Speech Movement Photographs Collection, ] | [View from inside Sproul Hall lobby looking th... | NaN | UARC PIC 24B:1:26 |
249 | [Hecker, Ron] | [Dec. 1964] | NaN | [http://sunsite.berkeley.edu/FindingAids/dynaw... | NaN | NaN | NaN | [Free Speech Movement Photographs Collection, ] | [Student Strike] | NaN | UARC PIC 24B:2:6 |
250 | [Hecker, Ron] | [Dec. 7, 1964] | NaN | [http://sunsite.berkeley.edu/FindingAids/dynaw... | NaN | NaN | NaN | [Free Speech Movement Photographs Collection, ] | [Crowd at Greek Theater] | NaN | UARC PIC 24B:2:21 |
5 rows × 11 columns
df_images[df_images.fsmImageUrl.apply(len2) == 4].ix[100].fsmImageUrl
[u'http://nma.berkeley.edu/ark:/28722/bk001532c4q', u'http://nma.berkeley.edu/ark:/28722/bk001532c7c', u'http://nma.berkeley.edu/ark:/28722/bk001532c58', u'http://nma.berkeley.edu/ark:/28722/bk001532c8x']
# One <img> per URL; the URL doubles as the tooltip text.
IMAGES_TEMPLATE = """
<div class="nowrap">
{% for item in items %}<img title="{{item}}" src="{{item}}"/>{% endfor %}
</div>
"""
# The URLs come from the API response, so escape them when they are
# interpolated into HTML attributes.
template = Template(IMAGES_TEMPLATE, autoescape=True)
HTML(template.render(items=df_images[df_images.fsmImageUrl.apply(len2) == 4].ix[100].fsmImageUrl ))
len(df[~df.fsmDateCreated.isnull()])
486
s = df[~df.fsmDateCreated.isnull()].fsmDateCreated.apply(len)==2 #.astype('datetime64[ns]')
def first(x):
    """Return x[0], or NaN when x is empty, not indexable, or has no entry 0."""
    try:
        return x[0]
    except (TypeError, IndexError, KeyError):
        # TypeError: not subscriptable (e.g. a NaN float); IndexError: empty
        # sequence; KeyError: mapping without a 0 key
        return np.nan
# Parse the first fsmDateCreated entry of each record; with coerce=True values
# that cannot be parsed become NaT instead of raising.
df['calc_date'] = pd.to_datetime(df.fsmDateCreated.apply(first), coerce=True)
# records that have a parsed date, listed in chronological order
df[~df.calc_date.isnull()].sort_index(by='calc_date').calc_date
156 1964-01-01 90 1964-01-01 74 1964-01-01 14 1964-01-01 146 1964-01-01 731 1964-01-01 92 1964-01-01 167 1964-01-01 300 1964-01-01 220 1964-01-01 871 1964-01-01 203 1964-01-05 261 1964-10-01 245 1964-10-01 243 1964-10-01 ... 197 1970-05-03 210 1970-05-03 23 1970-05-05 50 1970-05-05 179 1970-05-05 869 1973-01-01 129 1984-10-02 180 1984-10-02 159 1984-10-02 287 1984-10-02 289 1984-10-02 299 1984-10-02 868 1986-01-01 801 1990-01-01 867 1993-06-03 Name: calc_date, Length: 434, dtype: datetime64[ns]
pd.to_datetime(df.fsmDateCreated.dropna().apply(lambda s:s[0]).astype('str'), coerce=True).dropna()
0 1964-11-09 1 1964-12-02 2 1964-12-02 3 1964-12-03 4 1964-10-05 5 1964-11-09 6 1964-11-24 7 1964-10-01 8 1964-12-03 9 1964-12-02 10 1964-12-03 11 1964-12-07 12 1964-11-09 13 1964-10-01 14 1964-01-01 ... 863 1965-07-26 864 1965-10-13 865 1965-03-05 867 1993-06-03 868 1986-01-01 869 1973-01-01 870 1965-01-03 871 1964-01-01 872 1964-11-30 873 1964-12-04 874 1964-12-22 875 1965-01-07 876 1964-12-21 877 1965-01-09 878 1965-01-02 Name: fsmDateCreated, Length: 434, dtype: datetime64[ns]
# http://stackoverflow.com/questions/17690738/in-pandas-how-do-i-convert-a-string-of-date-strings-to-datetime-objects-and-put
date_stngs = ('2008-12-20','2008-12-21','2008-12-22','2008-12-23','Nov. 9, 1964', 'junk')
pd.to_datetime(pd.Series(date_stngs),coerce=True)
0 2008-12-20 1 2008-12-21 2 2008-12-22 3 2008-12-23 4 1964-11-09 5 NaT dtype: datetime64[ns]
def f(x):
    """Return set(x), or an empty set when x cannot be turned into a set."""
    try:
        return set(x)
    except TypeError:
        # raised for non-iterables (e.g. a NaN float) and for unhashable elements
        return set()
reduce(lambda x,y: x | y, df.fsmTypeOfResource.apply(f), set())
{u'Box 1:1', u'Box 1:11', u'Box 1:11:4', u'Box 1:13', u'Box 1:13:1', u'Box 1:13:4', u'Box 1:14', u'Box 1:15', u'Box 1:16', u'Box 1:17', u'Box 1:2', u'Box 1:25', u'Box 1:25:1', u'Box 1:25:4', u'Box 1:28', u'Box 1:29', u'Box 1:2:3', u'Box 1:30', u'Box 1:30:2', u'Box 1:32', u'Box 1:34', u'Box 1:34:1', u'Box 1:38', u'Box 1:39', u'Box 1:4', u'Box 1:41', u'Box 1:43', u'Box 1:44', u'Box 1:45', u'Box 1:46', u'Box 1:5', u'Box 1:6', u'Box 1:7', u'Box 1:8', u'Box 2:11', u'Box 2:11:1', u'Box 2:11:2', u'Box 2:11:3', u'Box 2:11:4', u'Box 2:11:6', u'Box 2:18', u'Box 2:18:1', u'Box 2:22', u'Box 2:22:1', u'Box 2:47', u'Box 2:47:1', u'Box 2:47:3', u'Box 2:49', u'Box 2:49:1', u'Box 2:49:2', u'Box 2:55', u'Box 2:59', u'Box 2:8', u'Box 2:8:3', u'Box 2:8:4', u'Box 3:1', u'Box 3:11', u'Box 3:14', u'Box 3:14:1', u'Box 3:15', u'Box 3:17', u'Box 3:17:2', u'Box 3:2', u'Box 3:21', u'Box 3:22', u'Box 3:23', u'Box 3:26', u'Box 3:29', u'Box 3:29:1', u'Box 3:29:2', u'Box 3:3', u'Box 3:31', u'Box 3:33', u'Box 3:34', u'Box 3:34:1', u'Box 3:36', u'Box 3:38', u'Box 3:39', u'Box 3:39:4', u'Box 3:39:5', u'Box 3:39:8', u'Box 3:40', u'Box 3:41', u'Box 3:5', u'Box 4:10', u'Box 4:5', u'Box 4:5:13', u'Box 4:5:2', u'Box 4:5:6', u'Box 4:8', u'Box 4:8:5', u'Box 4:9', u'Box 4:9:3', u'Box 70:33', u'Box 70:33:2', u'Box 70:33:4', u'Box 70:34', u'Box 70:34:1', u'Box 70:34:3', u'Box 70:34:7c', u'Box 70:34:8', u'Box 72:14', u'Box 72:14:1', u'Box 72:14:11', u'Box 72:14:19', u'Box 72:23', u'Box 72:23:1', u'Carton 1:12', u'Carton 1:12:2', u'Carton 1:12:3', u'Carton 1:12:4', u'Carton 1:12:5', u'Carton 1:12:6', u'Carton 1:12:7', u'Carton 1:12:8', u'Carton 1:14', u'Carton 1:15', u'Carton 1:9', u'Carton 21:14:1', u'Carton 21:14:7', u'Carton 21:16', u'Carton 21:2:1', u'Carton 2:20', u'Carton 2:32', u'Carton 3:16', u'Carton 3:37', u'Carton 3:58', u'Carton 3:58:4', u'Carton 3:58:7', u'Carton 4:32', u'Carton 4:78', u'Carton 4:80', u'agendas', u'articles', u'briefs (legal documents)', u'detail', u'fdr', u'fliers (printed 
matter)', u'folder', u'form letters', u'group statements', u'item', u'leaflets', u'letters (correspondence)', u'magazines (periodicals)', u'memorandums', u'minutes', u'miscellaneous', u'news bulletins', u'newsletters', u'newspapers', u'oral histories', u'pamphlets', u'papers (document genres)', u'personal statement', u'personal statements', u'progress reports', u'reports', u'still image', u'tables of content', u'text', u'title pages', u'transcripts'}
#related id
len(df.fsmRelatedIdentifier.dropna())
236
df.fsmTeiUrl.dropna()
685 [http://content.cdlib.org/xml/ark:/13030/kt5m3... 686 [http://content.cdlib.org/xml/ark:/13030/kt5s2... 687 [http://content.cdlib.org/xml/ark:/13030/kt6k4... 688 [http://content.cdlib.org/xml/ark:/13030/kt4s2... 689 [http://content.cdlib.org/xml/ark:/13030/kt1h4... 690 [http://content.cdlib.org/xml/ark:/13030/kt2w1... 691 [http://content.cdlib.org/xml/ark:/13030/kt609... 692 [http://content.cdlib.org/xml/ark:/13030/kt638... 693 [http://content.cdlib.org/xml/ark:/13030/kt777... 694 [http://content.cdlib.org/xml/ark:/13030/kt0k4... 695 [http://content.cdlib.org/xml/ark:/13030/kt6m3... 696 [http://content.cdlib.org/xml/ark:/13030/kt287... 697 [http://content.cdlib.org/xml/ark:/13030/kt3p3... 698 [http://content.cdlib.org/xml/ark:/13030/kt177... 699 [http://content.cdlib.org/xml/ark:/13030/kt1g5... ... 864 [http://content.cdlib.org/xml/ark:/13030/kt3z0... 865 [http://content.cdlib.org/xml/ark:/13030/kt5h4... 866 [http://content.cdlib.org/xml/ark:/13030/kt1v1... 867 [http://content.cdlib.org/xml/ark:/13030/kt7d5... 868 [http://content.cdlib.org/xml/ark:/13030/kt7h4... 869 [http://content.cdlib.org/xml/ark:/13030/kt919... 870 [http://content.cdlib.org/xml/ark:/13030/kt409... 871 [http://content.cdlib.org/xml/ark:/13030/kt4c6... 872 [http://content.cdlib.org/xml/ark:/13030/kt387... 873 [http://content.cdlib.org/xml/ark:/13030/kt3q2... 874 [http://content.cdlib.org/xml/ark:/13030/kt7v1... 875 [http://content.cdlib.org/xml/ark:/13030/kt038... 876 [http://content.cdlib.org/xml/ark:/13030/kt7z0... 877 [http://content.cdlib.org/xml/ark:/13030/kt500... 878 [http://content.cdlib.org/xml/ark:/13030/kt9b6... Name: fsmTeiUrl, Length: 194, dtype: object