#Get the PDF and parse it into an xml file
#!wget http://www.ecb.europa.eu/pub/pdf/other/ssm-listofsupervisedentities1409en.pdf?59d76de0c5663687f594250ebf228c6b -P data
#!pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes data/ssm-listofsupervisedentities1409en.pdf\?59d76de0c5663687f594250ebf228c6b pdftest2.xml

#Open the XML file
#Read the whole file as bytes; the with block closes the file handle again
# as soon as the content has been read
with open('pdftest2.xml','rb') as xmlfile:
    f=xmlfile.read()

#Set up the parse tree
from lxml import etree
root = etree.fromstring(f)
#The children of the root element, in document order
pages = list(root)

#Import utility libraries
import pandas as pd

#Utility function to flatten cells (eg remove bold font tags etc)
def flatten(el):
    """Return the text of an element with any nested markup removed.

    The element's own text, the text of every descendant (eg the content of
    bold font tags) and the tail text following each descendant are joined
    in document order. Leading and trailing whitespace is removed from the
    final string only, so spaces that separate nested pieces of text are kept.

    el: an element tree node, or None.
    Returns the flattened text; the empty string when el is None.
    """
    def collect(node):
        #Gather the raw text fragments of node in document order
        parts = [node.text or ""]
        for child in node:
            parts.extend(collect(child))
            parts.append(child.tail or "")
        return parts

    #None is a singleton, so test for it by identity rather than equality
    if el is None:
        return ""
    return "".join(collect(el)).strip()

#The .attrib dict contains positional and font information about a piece of text
#{'top': '555', 'width': '72', 'font': '5', 'left': '102', 'height': '15'}

#Look to see where column boundaries are in the first scraper
def dfColBoundaries(pages):
    """Collect the position and font attributes of every text element.

    pages: an iterable of page elements. Every child whose tag is "text" must
        carry the attributes 'top', 'width', 'font', 'left' and 'height'.
    Returns a dataframe with one row per text element, numbered consecutively,
        and one column per attribute. The values are stored as found in the
        attributes (no type conversion). The columns are present even when
        no text element was found.
    Raises KeyError if a text element lacks one of the attributes.
    """
    cols=['top', 'width', 'font', 'left', 'height']
    #Gather the rows first and build the dataframe once - growing a dataframe
    # with concat copies all the existing rows every time one is added
    rows=[]
    for page in pages:
        for el in page:
            if el.tag == "text":
                rows.append({c:el.attrib[c] for c in cols})
    return pd.DataFrame(rows,columns=cols)
   

#Attribute table for the pages scanned by the first scraper, with the left
# position converted to a number
df=dfColBoundaries(pages[3:33]).astype({'left':int})
leftEdges=df['left']
#Plot how the left positions are distributed
leftEdges.hist()

#Count how many text elements start at each left position
leftEdges.value_counts()

#A helper function to show what text appears at a particular column boundary
def probeColBoundary(pages,bval,boundary='left'):
    """List the text found at one or more positions.

    pages: an iterable of page elements.
    bval: the attribute value(s) to look for - either a single string such
        as '102' or a collection of strings such as ['508','509'].
    boundary: name of the positional attribute to compare (default 'left').
    Returns the flattened text of every matching text element, in document order.
    Raises KeyError if a text element lacks the boundary attribute.
    """
    #A bare string is a single value: testing membership in it directly would
    # be a substring test, so a position of '10' or '2' would match '102'
    if isinstance(bval,str):
        bval=[bval]
    tmp=[]
    for page in pages:
        for el in page:
            if el.tag == "text" and el.attrib[boundary] in bval:
                tmp.append(flatten(el))

    return tmp

#Show the first few pieces of text that start at left position 102
#The position is passed as a list so that it is compared as a whole value
# rather than being treated as a string to search inside
probeColBoundary(pages[3:33],['102'])[:10]

#The logic of this scraper evolved through several iterations!

#This is the dataframe we'll build up the table of results in
#The rows of the results table are collected in this list and turned into the
# df2 dataframe in one go once every page has been read (adding rows to a
# dataframe one at a time copies the whole table on every addition)
df2rows=[]

#Set when the next text element has to be ignored
skip=False

#The columns in the output table
dfrBase={}
for i in ['group','member','country','estcountry','reason']:dfrBase[i]=''
#The row currently being assembled
dfr=dfrBase.copy()

#Some cells have multiple rows of text in them - this configures cell actions
#'CELL_TYPE':(colNum,colDesignator,[resetCols])
multiConfig={'3':(2,'reason',[]),'1a':(0,'group',['member']),'1b':(0,'member',[])}

#The lines of text collected so far for each multirow cell type
multiblock={}
for i in multiConfig:  multiblock[i]=[]

currCol=-1
prevCol=-1

#The left positions that identify each of the three table columns
colSets=[['102'],['508','509'],['655']]
for page in pages[3:33]:
    for el in page:
        if el.tag != "text":
            continue
        if skip:
            #This is the text element straight after a country name - ignore it
            skip = False
            continue

        #Identify the current col from the column boundary sets
        #Would probably be better to do this as upper/lower bounds?
        #Text that starts at none of the listed positions leaves currCol unchanged
        prevCol=currCol
        for col in range(0,len(colSets)):
            if el.attrib['left'] in colSets[col]:
                currCol=col

        #Update the status of each cell in case it's a multirow cell: once we
        # are in a different column, the lines collected for a cell are joined
        # and stored in the row
        for mblock in multiConfig:
            mcol,mvar,mig = multiConfig[mblock]
            if multiblock[mblock]!=[] and currCol!=mcol:
                dfr[mvar]=' '.join(multiblock[mblock])
                multiblock[mblock]=[]

        #The logic that decides when we have a new row - which is time to write the old one
        #Moving back to an earlier column means a new row has started
        if currCol<prevCol and dfr['group']!='':
            #Ignore blank rows
            if (dfr['estcountry']!='' ) or (dfr['member']!='' ) or (dfr['reason']!='' ):
                #dfr keeps being modified afterwards, so store a copy of it
                df2rows.append(dfr.copy())

        #Logic to work out what part of the doc we are in  - and take appropriate action
        #The font 4 items identify country names preceding the country table
        if el.attrib['font']=='4':
            skip=True
            dfr=dfrBase.copy()
            dfr['country']=flatten(el)

        #Now we parse the table columns
        #NOTE(review): estcountry is only cleared when a new country starts, so
        # its value carries over from one row to the next - confirm this is intended
        if currCol==0:
            if el.attrib['font']=='5':
                #Font 5 in the first column is a line of a group name
                multiblock['1a'].append(flatten(el))
                #Clear the cells that the config lists for reset on a new group
                for clear in multiConfig['1a'][2]: dfr[clear]=''
            elif el.attrib['font']=='3':
                #Font 3 in the first column is a line of a member name
                multiblock['1b'].append(flatten(el))
        elif currCol==1:
            dfr['estcountry']=flatten(el)
        elif currCol==2:
            multiblock['3'].append(flatten(el))

#Don't forget the last line!
#This has to happen once, after the final page: writing the row at the end of
# every page would add an extra copy of a row that is written again when the
# next row starts. Cells that are still being collected are stored first so
# that the last row gets its own text.
for mblock in multiConfig:
    mcol,mvar,mig = multiConfig[mblock]
    if multiblock[mblock]!=[]:
        dfr[mvar]=' '.join(multiblock[mblock])
        multiblock[mblock]=[]
if dfr['group']!='' and ((dfr['estcountry']!='' ) or (dfr['member']!='' ) or (dfr['reason']!='' )):
    df2rows.append(dfr.copy())

#This is the dataframe holding the table of results
#Naming the columns keeps them available even if no row was found
df2=pd.DataFrame(df2rows,columns=list(dfrBase))

#Preview the first and last few rows
pd.concat([df2.head(6),df2.tail(6)])

#Split out the data into a couple of tables: rows without a country of
# establishment go to the main group table, all other rows to the subsidiaries
noEstCountry=df2['estcountry']==''
df_group=df2[noEstCountry]
df_member=df2[~noEstCountry]

#Write each table to its own CSV file, leaving out the index column
for frame,path in [(df_group,'data/euMajorGroup.csv'),(df_member,'data/euMajorGroupMembers.csv')]:
    frame.to_csv(path,index=False,encoding='UTF-8')

#Preview the first rows of each table
df_group.head(5)

df_member.head(5)

#Series.order() is no longer available in pandas; sort_values() is the
# replacement and takes the same ascending argument

#The 20 most frequent (country, country of establishment) pairs among the members
df_member[['country','estcountry']].groupby(['country','estcountry']).size().sort_values(ascending=False)[:20]

#Number of group rows per country, largest first
df_group.groupby(['country']).size().sort_values(ascending=False)

#Group rows whose country starts with France
df_group[df_group['country'].str.startswith('France')]

#Group rows whose group name starts with BNP
df_group[df_group['group'].str.startswith('BNP')]

#Group rows whose group name contains BNP anywhere
df_group[df_group['group'].str.contains('BNP')]

#Member rows whose member name contains BNP
df_member[df_member['member'].str.contains('BNP')]

#Member rows whose group name contains BNP
df_member[df_member['group'].str.contains('BNP')]

#Number of BNP Paribas member rows per country of establishment, largest first
df_member[df_member['group']=='BNP Paribas'].groupby('estcountry').size().sort_values(ascending=False)

#Check the column boundaries
#Attribute table for the remaining pages, with the left position as a number
df=dfColBoundaries(pages[33:]).astype({'left':int})
#Count how many text elements start at each left position
df['left'].value_counts()

#Here's the scraper...

#skip, currCol and prevCol are reset here but nothing in this scraper reads them
skip=False
currCol=-1
prevCol=-1
prevLeft=-1
currLeft=-1
currTop=-1
prevTop=-1
topDelta=-1

#The lines of text collected so far for the name currently being read
mblock={'corp':[]}
#The rows found are collected in this list and turned into the dfc dataframe
# in one go once every page has been read
dfcrows=[]

#The left positions at which the text of a name starts
corpLefts=['111','113']

dft={'country':'','corp':''}
for page in pages[33:]: #33
    for el in page:
        if el.tag != "text":
            continue

        prevLeft=currLeft
        currLeft=el.attrib['left']
        prevTop=currTop
        currTop=int(el.attrib['top'])
        #Vertical distance from the previous text element
        topDelta=currTop-prevTop

        #Need some logic to cope with cells that have names split over multiple rows
        #The PDF is variable in this respect - some pages have a whitespace element between separate table rows,
        # but other don't. In the latter case, multiline cells have text closer than text in separate rows.
        #The finished name is written BEFORE the current text is collected, so
        # that the first line of a new row is not joined on to the previous name
        if prevLeft in corpLefts and (currLeft not in corpLefts or topDelta > 21):
            dft['corp']=' '.join(mblock['corp'])
            mblock['corp']=[]
            if dft['corp']!='':
                #dft keeps being modified afterwards, so store a copy of it
                dfcrows.append(dft.copy())
        if currLeft in corpLefts:
            mblock['corp'].append(flatten(el))
        #The font 4 items set the country used for the rows that follow
        if el.attrib['font']=='4':
            dft['country']=flatten(el)

#Write the name that was still being collected when the last page ended
if mblock['corp']!=[]:
    dft['corp']=' '.join(mblock['corp'])
    mblock['corp']=[]
    if dft['corp']!='':
        dfcrows.append(dft.copy())

#Naming the columns keeps them available even if no row was found
dfc=pd.DataFrame(dfcrows,columns=['country','corp'])
                    

#preview the first and last few rows
pd.concat([dfc[:5],dfc[-5:]])

#Write the data to a CSV file
dfc.to_csv('data/euMinorGroup.csv',index=False,encoding='UTF-8')

#Number of rows per country, largest first
#Series.order() is no longer available in pandas; sort_values() replaces it
dfc.groupby('country').size().sort_values(ascending=False)