#!/usr/bin/env python
# coding: utf-8

# # `BibTeX` Record Generator for archive.org
# 
# Simple script to generate a `BibTeX` record from an `archive.org` identifier.

# ## Proof of Concept
# 
# Given an `archive.org` identifier, we can look up its metadata as:
# 
# `https://archive.org/metadata/IDENTIFIER/metadata`

# In[1]:


# Metadata endpoint template; `{uid}` is filled in with an archive.org identifier.
url_ = "https://archive.org/metadata/{uid}/metadata"


# The `archive.org` metadata schema is given [here](https://archive.org/services/docs/api/metadata-schema/index.html).
# 
# Relevant fields include:
# 
# - `title`
# - `creator`
# - `publisher`
# - `date` (replaces the deprecated `year`)
# - `volume`
# - `description`
# - `issn` / `isbn`
# - `subject` (subject / topic tags)
# - `rights` / `possible-copyright-status`
# 
# We can then attempt to map fields onto appropriate fields in a [`BibTeX` `book` entry record](https://www.bibtex.com/e/book-entry/). Appropriate fields might include:
# 
# - `author`;
# - `editor`;
# - `title`;
# - `publisher`
# - `address`
# - `year`
# - `volume` / `number`
# - `note`
# - `issn` / `isbn`
# 
# We can map many of the items directly.
# 
# *We may later want to parse the volume into a volume and a part (which is to say, `volume` and `number`). For now, just map the `volume` literally.*

# In[2]:


# Identifier of the archive.org item used throughout the proof of concept.
example_id = "dli.granth.84831"


# Get the `archive.org` metadata:

# In[3]:


import requests

# Fetch the metadata for the example item. The timeout stops the notebook
# from hanging on a stalled connection, and raise_for_status() surfaces HTTP
# errors instead of attempting to decode an error page as JSON.
r = requests.get(url_.format(uid=example_id), timeout=30)
r.raise_for_status()
result = r.json()['result']
result


# In[4]:


# archive.org field name -> BibTeX field name, for the fields that are renamed;
# any field not listed here keeps its archive.org name.
bib_map = {
    "date": "year",
    "description": "note",
    "creator": "author",
}

# Copy across only the fields that are actually present in the metadata.
bib_data = {}
for k in ('title', 'publisher', 'description',
          'volume', 'issn', 'isbn', "date", "creator"):
    if k not in result:
        continue
    k_ = bib_map.get(k, k)
    bib_data[k_] = result[k]

bib_data


# For now, we're naively mapping the creator onto the author, although we might later want to try to improve *author* vs. *editor* resolution.
# 
# Note that the `creator` metadata may be presented as a list of creators, often with birth/death dates, so we need to potentially tidy that up.

# In[6]:


import re

_example = ['Gregory, Lady, 1852-1932',
            'Finn, MacCumaill, 3rd cent',
            'Yeats, W. B. (William Butler), 1865-1939']

# Matches a year range such as " 1852-1932" or " (1852-1932)", together with
# a comma directly in front of it, so that removing the dates does not leave
# a dangling comma ("Gregory, Lady," would confuse BibTeX name parsing).
# A raw string is used so that `\(` reaches the regex engine unchanged.
creator_dates_re = re.compile(r',? \(?[0-9]+-[0-9]+\)?')

for c in _example:
    print(creator_dates_re.sub('', c))


# Create an identifier for the book (we may need to elaborate this to make sure it generates a unique identifier):

# In[10]:


# Build a citation key from the first seven alphanumeric characters of the
# creator (lower-cased) followed by the date. The character class has to be
# the range `0-9`: written as `09` it only keeps the digits 0 and 9 and
# silently strips 1-8.
# NOTE: this assumes result['creator'] is a single string; re.sub() raises
# TypeError if the metadata supplies a list of creators.
bib_id = f"{re.sub('[^0-9a-zA-Z]', '', result['creator']).lower()[:7]}{result['date']}"
bib_data["bib_id"] = bib_id

bib_id


# Use a heuristic to generate the publisher `address`...

# In[11]:


import parse

# Heuristic: a publisher written as "Name (Place)" is split into separate
# publisher and address fields. bib_data only holds keys that were present
# in the metadata, so the lookup is guarded to avoid a KeyError for items
# that have no publisher.
addr = (parse.parse('{publisher} ({address})', bib_data['publisher'])
        if 'publisher' in bib_data else None)
if addr:
    bib_data['publisher'] = addr['publisher']
    bib_data['address'] = addr['address']

bib_data


# We now need to render the data via an appropriate BibTeX template:

# In[12]:


from jinja2 import Template

# Jinja2 template for a BibTeX @book entry. The title, author and year lines
# are always emitted; volume, publisher, address, isbn and issn are emitted
# only when the corresponding value is truthy.
tm = Template(source="""@book{ {{bib_id}},
  title     = "{{title}}",
  author    = "{{author}}",
  year      = "{{year}}",
  {% if volume %}volume = "{{volume}}",{% endif %}
  {% if publisher %}publisher = "{{publisher}}",{% endif %}
  {% if address %}address = "{{address}}",{% endif %}
  {% if isbn %}isbn = "{{isbn}}",{% endif %}
  {% if issn %}issn = "{{issn}}",{% endif %}
}
""")

# render() takes the same arguments as dict(), so the mapping can be passed
# directly.
print(tm.render(bib_data))


# Check that the record parses correctly, and then export it in a well-formatted way:

# In[13]:


#%pip install bibtexparser
import bibtexparser

# Round-trip the rendered entry: parse it with bibtexparser, then print the
# parsed database serialised back to BibTeX text.
tex_ = bibtexparser.loads(tm.render(bib_data))
print(bibtexparser.dumps(tex_))


# We can extract `archive.org` identifiers from a file with the following simple pattern matcher:

# In[26]:


# archive.org identifiers are made up of letters, digits, '.', '_' and '-'.
# Matching exactly that set stops the capture at the first '/', ')', '>',
# ',' or whitespace, so markdown link punctuation and trailing path segments
# (e.g. /page/n5) are not swallowed into the identifier. Nothing is required
# after the identifier, so a URL at the very end of the file is found too.
# The pattern is a raw string so the backslash escapes reach the regex engine.
with open("irish-legends-finn-oisin.md", encoding="utf-8") as f:
    urls = re.findall(r'https?://archive\.org/details/([A-Za-z0-9._-]+)',
                      f.read())

# Find the unique archive.org identifiers, keeping first-seen order so that
# repeated runs process them in the same sequence
ids_ = list(dict.fromkeys(urls))
ids_[:3]


# ## Generate a BibTeX Record Collection
# 
# Let's now put the pieces together to extract a list of `archive.org` identifiers from a text file, look up the metadata associated with each one, and then generate a full list of BibTeX records for them.
# 
# The following function is derived from the sketches shown above, repackaged as a function:

# In[33]:


# Cache requests so that re-running the notebook does not have to re-fetch
# metadata that was already retrieved.
# NOTE(review): install_cache() presumably patches `requests` globally and
# persists responses under the name '.archive_org_metadata' — confirm against
# the requests_cache documentation for the installed version.
import requests_cache
requests_cache.install_cache('.archive_org_metadata')

def get_metadata(uid, timeout=30):
    """Get metadata given an archive.org identifier.

    Args:
        uid: archive.org item identifier, substituted into ``url_``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang a long batch run.

    Returns:
        The value stored under the ``result`` key of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the JSON response has no ``result`` key.
    """
    r = requests.get(url_.format(uid=uid), timeout=timeout)
    # Fail loudly on HTTP errors rather than decoding an error page as JSON.
    r.raise_for_status()
    return r.json()['result']

def _clean_creators(creator):
    """Return the creator(s) as a list of names with year ranges removed."""
    names = creator if isinstance(creator, list) else [creator]
    # The comma directly before the dates is removed along with them, so
    # "Gregory, Lady, 1852-1932" becomes "Gregory, Lady" rather than
    # "Gregory, Lady," (a dangling comma confuses BibTeX name parsing).
    return [re.sub(r',? \(?[0-9]+-[0-9]+\)?', '', name) for name in names]


def _make_bib_id(creators, date, fallback):
    """Build a citation key from the creator names and date, or use fallback."""
    # `0-9` must be a range: written as `09` the class would only keep the
    # digits 0 and 9 and strip 1-8.
    slug = re.sub(r'[^0-9a-zA-Z]', '', "".join(creators)).lower()[:7]
    # Names with no ASCII letters or digits would otherwise give a key made
    # of the date alone, which collides easily; use the fallback instead.
    return f"{slug}{date}" if slug else fallback


def _split_publisher(publisher):
    """Try to split a publisher string into publisher and address parts.

    Recognises "Name (Place)" first, then "Place : Name". Returns the
    parse result, or None if neither pattern matches.
    """
    return (parse.parse('{publisher} ({address})', publisher)
            or parse.parse('{address} : {publisher}', publisher))


def generate_bib_record(uid):
    """Generate a BibTeX record from archive.org metadata.

    The metadata for ``uid`` is fetched with ``get_metadata``, mapped onto
    BibTeX field names, rendered through the module-level template ``tm``
    and round-tripped through ``bibtexparser`` for consistent formatting.

    Args:
        uid: archive.org item identifier.

    Returns:
        The record as a BibTeX-formatted string.
    """
    metadata = get_metadata(uid)

    # archive.org field name -> BibTeX field name (others keep their name)
    bib_map = {"date": "year",
               "creator": "author"}

    bib_data = {}
    for k in ['creator', 'title', 'publisher',
              'volume', 'issn', 'isbn', "date"]:
        if k in metadata:
            bib_data[bib_map.get(k, k)] = metadata[k]

    # Handle a single creator or a list of creators
    creators = []
    if 'creator' in metadata:
        creators = _clean_creators(metadata['creator'])
        # BibTeX separates multiple authors with " and "; a comma-separated
        # list of "Last, First" names would be read as one malformed name.
        bib_data['author'] = " and ".join(creators)

    # Create id
    if 'creator' in metadata and 'date' in metadata:
        bib_id = _make_bib_id(creators, metadata['date'], uid)
    else:
        bib_id = uid
    bib_data["bib_id"] = bib_id

    # Try to find publisher address using simple heuristics
    if 'publisher' in bib_data:
        addr = _split_publisher(bib_data['publisher'])
        if addr:
            bib_data['publisher'] = addr['publisher']
            bib_data['address'] = addr['address']

    _bibtex = bibtexparser.loads(tm.render(**bib_data))
    return bibtexparser.dumps(_bibtex)


# We can now iterate through the identifiers and generate our list of BibTeX records.
# 
# We can also add a progress bar to help keep track of how far along we are (making the `archive.org` requests might take some time...).

# In[34]:


from tqdm.notebook import tqdm

# Collect one BibTeX record string per identifier, with a progress bar.
# For tqdm, ensure to update jupyterlab_widgets
records = []
records.extend(generate_bib_record(uid) for uid in tqdm(ids_))

records[:10]


# In[ ]:


# In[ ]:


# In[ ]: