#Get the PDF and parse it into an xml file #!wget http://www.ecb.europa.eu/pub/pdf/other/ssm-listofsupervisedentities1409en.pdf?59d76de0c5663687f594250ebf228c6b -P data #!pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes data/ssm-listofsupervisedentities1409en.pdf\?59d76de0c5663687f594250ebf228c6b pdftest2.xml #Open the XML file f=open('pdftest2.xml','rb').read() #Set up the parse tree from lxml import etree root = etree.fromstring(f) pages = list(root) #Import utility libraries import pandas as pd #Utility function to flatten cells (eg remove bold font tags etc) def flatten(el): if el != None: result = [ (el.text or "") ] for sel in el: result.append(flatten(sel)) result.append(sel.tail or "") return "".join(result).strip() return "" #The .attrib dict contains positional and font informaation about a piece of text #{'top': '555', 'width': '72', 'font': '5', 'left': '102', 'height': '15'} #Look to see where column boundaries are in the first scraper def dfColBoundaries(pages): df=pd.DataFrame() for page in pages: for el in page: if el.tag == "text": tmp={} for i in ['top', 'width', 'font', 'left', 'height']: tmp[i]=el.attrib[i] df=pd.concat([df,pd.DataFrame([tmp])]) return df df=dfColBoundaries(pages[3:33]) df['left']=df['left'].astype(int) df['left'].hist() df['left'].value_counts() #A helper function to show what text appears at a particular column boundary def probeColBoundary(pages,bval,boundary='left'): tmp=[] for page in pages: for el in page: if el.tag == "text" and el.attrib[boundary]in bval: tmp.append(flatten(el)) return tmp probeColBoundary(pages[3:33],'102')[:10] #The logic of this scraper evoleved through several iterations! #This is the dataframe we'll build up the table of results in df2=pd.DataFrame() skip=False dfrBase={} #The columns in the output table for i in ['group','member','country','estcountry','reason']:dfrBase[i]='' dfr=dfrBase.copy() #Some cells have mutliple rows of text in them - this configures cell actions #'CELL_TYPE':(colNum,colDesignator,[resetCols]) multiConfig={'3':(2,'reason',[]),'1a':(0,'group',['member']),'1b':(0,'member',[])} multiblock={} for i in multiConfig: multiblock[i]=[] currCol=-1 prevCol=-1 colSets=[['102'],['508','509'],['655']] for page in pages[3:33]: for el in page: if el.tag == "text": if skip: skip = False else: #Identify the current col from the column boundary sets #Would probably be better to do this as upper/lower bounds? prevCol=currCol for col in range(0,len(colSets)): if el.attrib['left'] in colSets[col]: currCol=col #if multiblock['3']!=[] and currCol!=2: # print(' '.join(multiblock['3'])) # dfr['reason']=' '.join(multiblock['3']) # multiblock['3']=[] #if multiblock['1b']!=[] and currCol!=0: # print(' '.join(multiblock['1b'])) # dfr['member']=' '.join(multiblock['1b']) # multiblock['1b']=[] #Update the status of each cell in case it's a multirow cell for mblock in multiConfig: mcol,mvar,mig = multiConfig[mblock] if multiblock[mblock]!=[] and currCol!=mcol: #print(' '.join(multiblock[mblock])) dfr[mvar]=' '.join(multiblock[mblock]) multiblock[mblock]=[] #The logic that decides when we have a new row - which is time to write the old one if currCol 21): dft['corp']=' '.join(mblock['corp']) mblock['corp']=[] if dft['corp']!='': dfc=pd.concat([dfc,pd.DataFrame([dft])]) if el.attrib['font']=='4': dft['country']=flatten(el) #preview the first and last few rows pd.concat([dfc[:5],dfc[-5:]]) #Write the data to a CSV file dfc.to_csv('data/euMinorGroup.csv',index=False,encoding='UTF-8') dfc.groupby('country').size().order(ascending=False)