#!/usr/bin/env python
# coding: utf-8

# # `BibTeX` Record Generator for archive.org
#
# Simple script to generate a BibTeX record from an `archive.org` identifier.

# ## Proof of Concept
#
# Given an `archive.org` identifier, we can look up its metadata as:
#
# `https://archive.org/metadata/IDENTIFIER/metadata`

# In[1]:


url_ = 'https://archive.org/metadata/{uid}/metadata'


# The `archive.org` metadata schema is given [here](https://archive.org/services/docs/api/metadata-schema/index.html).
#
# Relevant fields include:
#
# - `title`
# - `creator`
# - `publisher`
# - `date` (replaces the deprecated `year`)
# - `volume`
# - `description`
# - `issn` / `isbn`
# - `subject` (subject / topic tags)
# - `rights` / `possible-copyright-status`
#
# We can then attempt to map fields onto appropriate fields in a [`BibTeX` `book` entry record](https://www.bibtex.com/e/book-entry/). Appropriate fields might include:
#
# - `author`;
# - `editor`;
# - `title`;
# - `publisher`
# - `address`
# - `year`
# - `volume` / `number`
# - `note`
# - `issn` / `isbn`
#
# We can map many of the items directly.
#
# *The volume we may want to try to parse into a volume and part (which is to say, `volume` and `number`). For now, just map the `volume` literally.*

# In[2]:


example_id = 'dli.granth.84831'


# Get the `archive.org` metadata:

# In[3]:


import requests

# The metadata endpoint wraps the record in a top-level "result" key.
r = requests.get(url_.format(uid=example_id))
result = r.json()['result']
result


# In[4]:


# archive.org field name -> BibTeX field name, for the fields that differ.
bib_map = {"date": "year", "description": "note", "creator": "author"}

# Copy across only the fields present in the record, renaming where needed.
bib_fields = ['title', 'publisher', 'description', 'volume', 'issn', 'isbn', "date", "creator"]
bib_data = {
    bib_map.get(field, field): result[field]
    for field in bib_fields
    if field in result
}

bib_data


# For now, we're naively mapping the creator on to the author, although we might later want to try to improve *author* vs. *editor* resolution.
#
# Note that the `creator` metadata may be presented as a list of creators, often with birth/death dates, so we need to potentially tidy that up.

# In[6]:


import re

_example = ['Gregory, Lady, 1852-1932', 'Finn, MacCumaill, 3rd cent', 'Yeats, W. B. (William Butler), 1865-1939']

for c in _example:
    # Raw string: `\(` is a regex escape, not a Python string escape.
    # Removing the lifespan leaves a dangling ", " which we trim as well.
    print(re.sub(r' \(?[0-9]+-[0-9]+\)?', '', c).rstrip(', '))


# Create an identifier for the book (we may need to elaborate this to make sure it generates a unique identifier):

# In[10]:


# `creator` may be a single string or a list of strings; flatten it so the
# regex below always receives a string.
_creator = result['creator']
if isinstance(_creator, list):
    _creator = ", ".join(_creator)

# Keep only ASCII letters and digits, lower-case, first seven characters,
# then append the date.
_id_stem = re.sub(r'[^0-9a-zA-Z]', '', _creator).lower()[:7]
bib_id = f"{_id_stem}{result['date']}"
bib_data["bib_id"] = bib_id
bib_id


# Use a heuristic to generate the publisher `address`...

# In[11]:


import parse

# The publisher field is only copied into `bib_data` when the record has one.
if 'publisher' in bib_data:
    addr = parse.parse('{publisher} ({address})', bib_data['publisher'])
    if addr:
        bib_data['publisher'] = addr['publisher']
        bib_data['address'] = addr['address']

bib_data


# We now need to render the data via an appropriate BibTeX template:

# In[12]:


from jinja2 import Template

tm = Template("""@book{ {{bib_id}},
  title = "{{title}}",
  author = "{{author}}",
  year = "{{year}}",
  {% if volume %}volume = "{{volume}}",{% endif %}
  {% if publisher %}publisher = "{{publisher}}",{% endif %}
  {% if address %}address = "{{address}}",{% endif %}
  {% if isbn %}isbn = "{{isbn}}",{% endif %}
  {% if issn %}issn = "{{issn}}",{% endif %}
}
""")

print(tm.render(**bib_data))


# Check that the record parses correctly, and then export it in a well-formatted way:

# In[13]:


#%pip install bibtexparser
import bibtexparser

tex_ = bibtexparser.loads(tm.render(**bib_data))
print(bibtexparser.dumps(tex_))


# We can extract `archive.org` identifiers from a file with the following simple pattern matcher:

# In[26]:


# Explicit encoding so the result does not depend on the platform default.
with open("irish-legends-finn-oisin.md", encoding="utf-8") as f:
    # Capture everything after /details/ up to the next whitespace or the end
    # of the file. NOTE(review): punctuation that directly follows a URL
    # (e.g. the ")" closing a markdown link) is captured as well.
    urls = re.findall(r'https?://archive\.org/details/(\S+)', f.read())

# Find the unique archive.org identifiers (first path segment), keeping the
# order in which they first appear so that runs are reproducible.
ids_ = list(dict.fromkeys(u.split("/")[0] for u in urls))
ids_[:3]


# ## Generate a BibTeX Record Collection
#
# Let's now put the pieces together to extract a list of `archive.org` identifiers from a text file, look up the metadata associated with each one, and then generate a full list of BibTeX records for them.
#
# The following function is derived from the sketches shown above, repackaged as a function:

# In[33]:


# Cache requests
import requests_cache

requests_cache.install_cache('.archive_org_metadata')


def get_metadata(uid):
    """Get metadata given an archive.org identifier.

    Args:
        uid: The archive.org item identifier.

    Returns:
        The value stored under the ``result`` key of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the JSON response has no ``result`` key.
    """
    r = requests.get(url_.format(uid=uid))
    # Fail with a clear HTTP error rather than a confusing JSON decoding error.
    r.raise_for_status()
    return r.json()['result']


def _clean_creators(creators):
    """Return a list of creator names with lifespan dates removed.

    Accepts either a single creator string or a list of them. A suffix such
    as `` 1852-1932`` (optionally parenthesised) is dropped, along with the
    comma that would otherwise be left dangling at the end of the name.
    """
    if not isinstance(creators, list):
        creators = [creators]
    return [
        re.sub(r' \(?[0-9]+-[0-9]+\)?', '', creator).rstrip(', ')
        for creator in creators
    ]


def generate_bib_record(uid):
    """Generate a bibliographic data record from archive.org metadata.

    Args:
        uid: The archive.org item identifier.

    Returns:
        A BibTeX ``@book`` record as a string, rendered through the ``tm``
        template and normalised by a ``bibtexparser`` load/dump round trip.
    """
    metadata = get_metadata(uid)

    # archive.org field name -> BibTeX field name, for the fields that differ.
    bib_map = {"date": "year", "creator": "author"}

    # Handle a list of creators
    creators = _clean_creators(metadata['creator']) if 'creator' in metadata else []

    # Create id: first seven ASCII alphanumerics of the creator names plus the
    # date. Fall back to the archive.org identifier when either part is
    # missing, or when the names contain no ASCII alphanumerics at all (which
    # would otherwise give a date-only key that collides across records).
    record_id = re.sub(r'[^0-9a-zA-Z]', '', "".join(creators)).lower()[:7]
    if record_id and 'date' in metadata:
        bib_id = f"{record_id}{metadata['date']}"
    else:
        bib_id = uid

    bib_data = {}
    for k in ['creator', 'title', 'publisher', 'volume', 'issn', 'isbn', "date"]:
        if k in metadata:
            bib_data[bib_map.get(k, k)] = metadata[k]

    if 'creator' in metadata:
        # BibTeX separates multiple authors with " and ", not with commas.
        bib_data['author'] = " and ".join(creators)

    bib_data["bib_id"] = bib_id

    # Try to find publisher address using simple heuristics
    publisher = bib_data.get('publisher')
    if isinstance(publisher, str):  # the heuristics only apply to plain strings
        addr = parse.parse('{publisher} ({address})', publisher)
        if not addr:
            addr = parse.parse('{address} : {publisher}', publisher)
        if addr:
            bib_data['publisher'] = addr['publisher']
            bib_data['address'] = addr['address']

    # Round-trip through bibtexparser to get a consistently formatted record.
    _bibtex = bibtexparser.loads(tm.render(**bib_data))
    return bibtexparser.dumps(_bibtex)


# We can now iterate through the identifiers and generate our list of BibTeX records.
#
# We can also add a progress bar to help keep track of how far along we are (making the `archive.org` requests might take some time...).
# In[34]:


from tqdm.notebook import tqdm

# Build one BibTeX record per identifier, with a progress bar over the ids.
# For tqdm, ensure to update jupyterlab_widgets
records = [generate_bib_record(uid) for uid in tqdm(ids_)]

records[:10]


# In[ ]:





# In[ ]:





# In[ ]: