# Data file locations keyed by a short data-set name; file_path() looks
# entries up here by key.
DATA_FILES = dict(
    nondigitized="Project/Data/non_google_pd_pdus.xml",
    fullhathifiles="Project/Data/hathi_full_20130301.txt",
)
import os
import sys
import pandas as pd
import numpy as np
def file_path(key):
    """Return the path of the data file registered under *key*.

    The DATA_FILES entry for *key* is joined onto ``os.pardir``, so the
    returned path is relative and starts with the parent-directory marker.
    An unknown *key* raises KeyError from the DATA_FILES lookup.
    """
    relative_location = DATA_FILES[key]
    return os.path.join(os.pardir, relative_location)


# Notebook-style check of the resolved path for the non-digitized data set.
file_path("nondigitized")
'../Project/Data/non_google_pd_pdus.xml'
# In the original file, the xmlns:dc= URL was replaced with xmlns:dc="#"; this works. Loading the XML file data into a MySQL database still needs to be worked on.
from lxml import etree
from collections import defaultdict
import pandas as pd
import sqlite3
from pandas.io import sql
# Parse the non-digitized records XML and flatten every record into one dict
# (record attributes plus the text of its child elements), then build a
# DataFrame with one row per record.
tree = etree.parse(file_path("nondigitized"))
root = tree.getroot()

# Child tags that are deliberately left out of the per-record dict.
skipped_tags = ('identifier', 'description')

finaldictlist = list()
# Iterate the elements directly: Element.getchildren() is deprecated.
for record in root:
    tmpdict = defaultdict(list)
    tmpdict.update(record.attrib)
    for ch in record:
        # Strip the "{#}" namespace prefix once instead of on every comparison.
        field = ch.tag.replace("{#}", "")
        if field in skipped_tags:
            continue
        # Plain assignment: when a tag repeats within a record, the last
        # occurrence wins.
        tmpdict[field] = ch.text
    finaldictlist.append(tmpdict)
finaldictlist
dfnondigitized = pd.DataFrame(finaldictlist)
# Find the most prolific authors: creators with more than 50 publications.
creatercount = dfnondigitized['creator'].value_counts()
is_prolific = creatercount > 50
maxpublications = creatercount[is_prolific]
maxpublications
Oliphant, Mrs. (Margaret), 1828-1897. 197 Abdülhamid II, Sultan of the Turks, 1842-1918, former owner. 192 Valpy, Abraham John, 1787-1854. 170 Cairns Collection of American Women Writers. 156 United States. Office of the Federal Register. 146 Freeman, A. C. (Abraham Clark), 1843-1911. 146 Lytton, Edward Bulwer Lytton, Baron, 1803-1873. 145 Scott, Walter, Sir, 1771-1832. 136 Dickens, Charles, 1812-1870. 116 Saintsbury, George, 1845-1933. 114 Lawrence J. Gutter Collection of Chicagoana (University of Illinois at Chicago) ICIU 111 Defoe, Daniel, 1661?-1731. 110 James, G. P. R. (George Payne Rainsford), 1801?-1860. 108 Balzac, Honoré de, 1799-1850. 108 Canada. Parliament. 107 Lever, Charles James, 1806-1872. 103 Black, William, 1841-1898. 101 Chalmers, Alexander, 1759-1834. 101 Whyte-Melville, G. J. (George John), 1821-1878. 100 Eliot, George, 1819-1880. 98 Thackeray, William Makepeace, 1811-1863. 96 Wood, Henry, Mrs., 1814-1887. 95 Leigh, Oliver Herbrand Gordon. 89 Baring-Gould, S. (Sabine), 1834-1924. 88 Swift, Jonathan, 1667-1745. 84 Russell, William Clark, 1844-1911. 81 Carlyle, Thomas, 1795-1881. 79 Kipling, Rudyard, 1865-1936. 77 Braddon, M. E. (Mary Elizabeth), 1837-1915. 77 Disraeli, Benjamin, Earl of Beaconsfield, 1804-1881. 76 Lang, Andrew, 1844-1912. 75 Ouida, 1839-1908. 75 Longfellow, Henry Wadsworth, 1807-1882. 75 Besant, Walter, Sir, 1836-1901. 74 Gore, Mrs. (Catherine Grace Frances), 1799-1861. 74 Cooper, James Fenimore, 1789-1851. 73 Kingsley, Charles, 1819-1875. 72 Dumas, Alexandre, 1802-1870. 71 Reid, Mayne, 1818-1883. 71 Swedenborg, Emanuel, 1688-1772. 71 Stevenson, Robert Louis, 1850-1894. 71 Johnson, Samuel, 1709-1784. 69 Ruskin, John, 1819-1900. 69 Yonge, Charlotte Mary, 1823-1901. 69 Harte, Bret, 1836-1902. 66 New York (State). Court of Appeals 66 Hewlett, Maurice, 1861-1923. 66 Smart, Hawley, 1833-1893. 65 Moore, Thomas, 1779-1852. 64 MacDonald, George, 1824-1905. 64 Karl Baedeker (Firm) 63 Braddon, M. E. (Mary Elizabeth), 1835-1915. 
63 Phillpotts, Eden, 1862-1960. 63 Meyerhof, Max, 1874-1945, former owner. 63 Irving, Washington, 1783-1859. 61 Burke, Edmund, 1729-1797. 61 Ridpath, John Clark, 1840-1900. 61 Great Britain. Laws, statutes, etc. 61 Hunt, Leigh, 1784-1859. 60 Collins, Wilkie, 1824-1889. 60 Payn, James, 1830-1898. 60 Burton, Richard Francis, Sir, 1821-1890. 59 Ohio. 59 Schiller, Friedrich, 1759-1805. 59 Riley, James Whitcomb, 1849-1916. 59 Meredith, George, 1828-1909. 58 Mühlbach, L. (Luise), 1814-1873. 58 Fitzgerald, Percy Hetherington, 1834-1925. 58 Shakespeare, William, 1564-1616. 58 Grant, James, 1822-1887. 57 Arthur, T. S. (Timothy Shay), 1809-1885. 56 Henty, G. A. (George Alfred), 1832-1902. 56 Traill, H. D. (Henry Duff), 1842-1900. 55 Abbott, Jacob, 1803-1879. 54 Bennett, Arnold, 1867-1931. 54 Tabor, Eliza. 54 Dobson, Austin, 1840-1921. 54 Byron, George Gordon Byron, Baron, 1788-1824. 53 Königlich Bayerische Akademie der Wissenschaften. Historische Kommission. 53 Lecky, William Edward Hartpole, 1838-1903. 53 Walls, Thomas H., tr. 52 Rouillu, Charles August, 1883- ed. 52 Crockett, S. R. (Samuel Rutherford), 1860-1914. 52 Gladstone, W. E. (William Ewart), 1809-1898. 52 Length: 84
# Plot a horizontal bar chart of the top 10 authors with their number of
# publications.
import matplotlib
# Slice exactly 10 entries; the earlier [:11] slice plotted 11 authors,
# contradicting the stated "top 10".
maxpublications[:10].plot(kind='barh', rot=0)
<matplotlib.axes.AxesSubplot at 0x10f1f1c50>
# Select the data frame rows whose creator is the first entry of
# maxpublications (the most published author).
tmp = maxpublications.index
top_creator_mask = dfnondigitized['creator'] == tmp[0]
topauthor = dfnondigitized[top_creator_mask]
# For the top author, plot how many books were published in each year.
import re
import matplotlib.pyplot as plt

fig = plt.figure()
fig.set_size_inches(5, 5)

# Keep the first run of digits found in each date string. Missing dates are
# dropped and dates without any digits are skipped instead of raising.
first_number = re.compile(r"(\d+)")
publication_years = []
for date in topauthor['date'].dropna():
    found = first_number.search(date)
    if found:
        publication_years.append(found.group(1))
topauthorpub = pd.Series(publication_years).value_counts()

# The '19' bucket is excluded from the plot (presumably an incomplete year —
# confirm against the data); errors='ignore' avoids a KeyError when no such
# bucket exists.
topauthorpub.drop('19', errors='ignore').sort_index().plot()
# To include the '19' bucket, plot topauthorpub.sort_index() instead.
<matplotlib.axes.AxesSubplot at 0x11cd39050>
# Plot the share of books per access-rights value as a pie chart.
import matplotlib.pyplot as plt

rights = dfnondigitized['rights'].value_counts()
# Call through pyplot explicitly: the bare names figure()/pie() only exist
# when the session has star-imported pylab.
plt.figure(1, figsize=(6, 6))
# NOTE(review): labels and explode are hard-coded for exactly two categories
# and are matched to `rights` by position — confirm the order against
# rights.index.
labels = 'Public Domain', 'Public Domain US'
explode = (0, 0.05)
plt.pie(rights, labels=labels, explode=explode, autopct='%1.1f%%')
([<matplotlib.patches.Wedge at 0x112ec8650>, <matplotlib.patches.Wedge at 0x112ec8c10>], [<matplotlib.text.Text at 0x112ec8a10>, <matplotlib.text.Text at 0x112ecc090>], [<matplotlib.text.Text at 0x112ec8ad0>, <matplotlib.text.Text at 0x112ec8b90>])
# Top five languages in which the books are available.
language_column = dfnondigitized['language']
lang = language_column.value_counts()
dflang = pd.DataFrame(lang)
lang.head(5)
eng 107510 fre 2947 ger 2703 spa 652 ita 584
# Count how many records fall under each value of the 'type' column.
type_column = dfnondigitized['type']
typedata = type_column.value_counts()
typedata
Text 113133 Collection 3024 Authors' presentation copies (Provenance) rbprov IU-R 141 Three deckers. rbgenr 60 Publishers' cloth bindings (Binding) New York (State) New York 20th century. rbbin 49 Publishers' advertisements. rbgenr 33 Historical fiction. gsafd 22 Mystery and detective fiction Great Britain. rbgenr 19 Periodicals. rbgenr 18 Publishers' cloth bindings (Binding) Illinois Chicago 19th century. rbbin 17 Publishers' cloth bindings (Binding) New York (State) New York 19th century. rbbin 17 Juvenile literature. rbgenr 15 Bookplates (Provenance). rbprov 15 Author's presentation copies (Provenance). rbprov IU-R 13 California, Southern Imprints Specimens. local 13 ... Gilt edges. rbbin 1 Juvenile literature Vermont Woodstock 1823. rbgenr 1 Short stories. gsafd 1 Unopened books (Binding). rbbin 1 Presentation inscriptions. rbprov 1 Annotations (Provenance) 20th century. rbprov CLUW 1 Guidebooks Washington (State) Seattle 20th century. rbgenr 1 Travel literature Canada 19th century. rbgenr 1 Headpieces (layout features) 16th century. aat. 1 Publishers' copies (Provenance) rbprov. 1 Publishers' paper bindings (Binding) Indiana Indianapolis 20th century. rbbin 1 Ink stamps (Provenance). rbprov 1 Political fiction. lcsh 1 Devotional literature. rbgenr 1 Religious fiction. lcsh 1 Length: 218