Scrape and quick view of the European Central Bank "list of significant supervised entities and the list of less significant institutions".
The report is split into three parts:
#Get the PDF and parse it into an xml file
#!wget http://www.ecb.europa.eu/pub/pdf/other/ssm-listofsupervisedentities1409en.pdf?59d76de0c5663687f594250ebf228c6b -P data
#!pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes data/ssm-listofsupervisedentities1409en.pdf\?59d76de0c5663687f594250ebf228c6b pdftest2.xml
#Open the XML file
f=open('pdftest2.xml','rb').read()
#Set up the parse tree
from lxml import etree
root = etree.fromstring(f)
pages = list(root)
#Import utility libraries
import pandas as pd
#Utility function to flatten cells (eg remove bold font tags etc)
def flatten(el):
if el != None:
result = [ (el.text or "") ]
for sel in el:
result.append(flatten(sel))
result.append(sel.tail or "")
return "".join(result).strip()
return ""
I'm going to hand craft a scraper, because I fancy a recreational data scrpaing activity!
#The .attrib dict contains positional and font informaation about a piece of text
#{'top': '555', 'width': '72', 'font': '5', 'left': '102', 'height': '15'}
#Look to see where column boundaries are in the first scraper
def dfColBoundaries(pages):
df=pd.DataFrame()
for page in pages:
for el in page:
if el.tag == "text":
tmp={}
for i in ['top', 'width', 'font', 'left', 'height']:
tmp[i]=el.attrib[i]
df=pd.concat([df,pd.DataFrame([tmp])])
return df
Let's start with the more complex, multi-column table...
df=dfColBoundaries(pages[3:33])
df['left']=df['left'].astype(int)
df['left'].hist()
<matplotlib.axes.AxesSubplot at 0x7f674812c710>
df['left'].value_counts()
655 1408 102 1348 509 1269 106 60 411 3 446 2 410 2 404 2 413 2 417 2 419 1 425 1 629 1 302 1 405 1 401 1 253 1 508 1 406 1 384 1 364 1 dtype: int64
#A helper function to show what text appears at a particular column boundary
def probeColBoundary(pages,bval,boundary='left'):
tmp=[]
for page in pages:
for el in page:
if el.tag == "text" and el.attrib[boundary]in bval:
tmp.append(flatten(el))
return tmp
probeColBoundary(pages[3:33],'102')[:10]
['', 'Investar', 'Argenta Bank- en Verzekeringsgroep', 'Argenta Spaarbank NV', 'Argentabank Luxembourg S.A.', 'Axa Bank Europe SA', 'AxaBank Europe SCF', 'Banque Degroof S.A.', 'Banque Degroof France S.A.', 'Banque Degroof Luxembourg S.A.']
#The logic of this scraper evoleved through several iterations!
#This is the dataframe we'll build up the table of results in
df2=pd.DataFrame()
skip=False
dfrBase={}
#The columns in the output table
for i in ['group','member','country','estcountry','reason']:dfrBase[i]=''
dfr=dfrBase.copy()
#Some cells have mutliple rows of text in them - this configures cell actions
#'CELL_TYPE':(colNum,colDesignator,[resetCols])
multiConfig={'3':(2,'reason',[]),'1a':(0,'group',['member']),'1b':(0,'member',[])}
multiblock={}
for i in multiConfig: multiblock[i]=[]
currCol=-1
prevCol=-1
colSets=[['102'],['508','509'],['655']]
for page in pages[3:33]:
for el in page:
if el.tag == "text":
if skip:
skip = False
else:
#Identify the current col from the column boundary sets
#Would probably be better to do this as upper/lower bounds?
prevCol=currCol
for col in range(0,len(colSets)):
if el.attrib['left'] in colSets[col]:
currCol=col
#if multiblock['3']!=[] and currCol!=2:
# print(' '.join(multiblock['3']))
# dfr['reason']=' '.join(multiblock['3'])
# multiblock['3']=[]
#if multiblock['1b']!=[] and currCol!=0:
# print(' '.join(multiblock['1b']))
# dfr['member']=' '.join(multiblock['1b'])
# multiblock['1b']=[]
#Update the status of each cell in case it's a multirow cell
for mblock in multiConfig:
mcol,mvar,mig = multiConfig[mblock]
if multiblock[mblock]!=[] and currCol!=mcol:
#print(' '.join(multiblock[mblock]))
dfr[mvar]=' '.join(multiblock[mblock])
multiblock[mblock]=[]
#The logic that decides when we have a new row - which is time to write the old one
if currCol<prevCol and dfr['group']!='':
#Ignore blank rows
if (dfr['estcountry']!='' ) or (dfr['member']!='' ) or (dfr['reason']!='' ):
df2=pd.concat([df2,pd.DataFrame([dfr])])
#print(flatten(el),el.attrib)
#Logic to work out what part of the doc we are in - and take appropriate action
#The font 4 items identify country names preceding the country table
if el.attrib['font']=='4':
skip=True
dfr=dfrBase.copy()
dfr['country']=flatten(el)
#print(dfr['country'])
#Now we parse the table columns
if currCol==0:
if el.attrib['font']=='5':
#dfr['group']=flatten(el)
multiblock['1a'].append(flatten(el))
#We should be able to reset this from the multiConfig?
#dfr['member']=''
for clear in multiConfig['1a'][2]: dfr[clear]=''
#print('New group:',dfr['group'],end=', ')
elif el.attrib['font']=='3':
multiblock['1b'].append(flatten(el))
#print('Group',dfr['group'],'member:',dfr['member'],end=', ')
elif currCol==1:
dfr['estcountry']=flatten(el)
#print(dfr['estcountry'])
elif currCol==2:
multiblock['3'].append(flatten(el))
#Don't forget the last line!
df2=pd.concat([df2,pd.DataFrame([dfr])])
#Preview the first and last few rows
pd.concat([df2[:6],df2[-6:]])
country | estcountry | group | member | reason | |
---|---|---|---|---|---|
0 | Belgium | Investar | Size (total assets EUR 30 - 50 bn) | ||
0 | Belgium | Belgium | Investar | Argenta Bank- en Verzekeringsgroep | |
0 | Belgium | Belgium | Investar | Argenta Spaarbank NV | |
0 | Belgium | Luxembourg | Investar | Argentabank Luxembourg S.A. | |
0 | Belgium | Axa Bank Europe SA | Size (total assets EUR 30-50 bn) | ||
0 | Belgium | France | Axa Bank Europe SA | AxaBank Europe SCF | |
0 | Finland | Finland | OP-Pohjola osk | Ylitornion Osuuspankki (cooperative member bank) | |
0 | Finland | Finland | OP-Pohjola osk | Ylivieskan Osuuspankki (cooperative member bank) | |
0 | Finland | Finland | OP-Pohjola osk | Yläneen Osuuspankki (cooperative member bank) | |
0 | Finland | Finland | OP-Pohjola osk | Ypäjän Osuuspankki (cooperative member bank) | |
0 | Finland | Finland | OP-Pohjola osk | Östnylands Andelsbank (cooperative member bank) | |
0 | Finland | Finland | OP-Pohjola osk | Östra Korsholms Andelsbank (cooperative member... |
#Split out the data into a couple of tables - one for the main group, the other for the subsidiaries
df_group=df2[df2['estcountry']=='']
df_member=df2[df2['estcountry']!='']
#Write the data to a CSV file
df_group.to_csv('data/euMajorGroup.csv',index=False,encoding='UTF-8')
df_member.to_csv('data/euMajorGroupMembers.csv',index=False,encoding='UTF-8')
df_group[:5]
country | estcountry | group | member | reason | |
---|---|---|---|---|---|
0 | Belgium | Investar | Size (total assets EUR 30 - 50 bn) | ||
0 | Belgium | Axa Bank Europe SA | Size (total assets EUR 30-50 bn) | ||
0 | Belgium | Banque Degroof S.A. | Significant cross-border assets | ||
0 | Belgium | Belfius Banque S.A. | Size (total assets EUR 150-300 bn) | ||
0 | Belgium | Dexia NV | Size (total assets EUR 150-300 bn) |
df_member[:5]
country | estcountry | group | member | reason | |
---|---|---|---|---|---|
0 | Belgium | Belgium | Investar | Argenta Bank- en Verzekeringsgroep | |
0 | Belgium | Belgium | Investar | Argenta Spaarbank NV | |
0 | Belgium | Luxembourg | Investar | Argentabank Luxembourg S.A. | |
0 | Belgium | France | Axa Bank Europe SA | AxaBank Europe SCF | |
0 | Belgium | France | Banque Degroof S.A. | Banque Degroof France S.A. |
I'm not sure what's useful to look at?
How about the number of member institutions by parent country and member country?
df_member[['country','estcountry']].groupby(['country','estcountry']).size().order(ascending=False)[:20]
country estcountry France France 289 Finland Finland 191 The Netherlands The Netherlands 146 Austria Austria 131 Italy Italy 61 Spain Spain 56 Germany Germany 32 Cyprus Cyprus 18 Germany Luxembourg 14 France Luxembourg 12 Italy 9 Germany 9 Italy Luxembourg 8 France Belgium 8 Italy Austria 8 Belgium France 7 Portugal Portugal 7 Ireland Ireland 6 Greece Cyprus 5 Belgium Belgium 5 dtype: int64
So for example, there are 289 institutions in France with a French parent, and 9 Italian and German institutions with a French parent.
How about the number of parent companies by country?
df_group.groupby(['country']).size().order(ascending=False)
country Germany 23 Italy 15 Spain 15 France 10 Austria 9 The Netherlands 8 Belgium 7 Luxembourg 4 Portugal 4 Greece 4 Latvia 3 Slovakia 3 Finland 3 Cyprus 3 Ireland 3 Malta 2 Slovenia 2 Estonia 1 dtype: int64
We can search on major groups by country. For example:
df_group[df_group['country'].str.startswith('France')]
country | estcountry | group | member | reason | |
---|---|---|---|---|---|
0 | France | BNP Paribas | Size (total assets above EUR 1,000 bn) | ||
0 | France | BPCE | Size (total assets above EUR 1,000 bn) | ||
0 | France | Bpifrance (Banque Publique d’Investissement) | Size (total assets EUR 50-75 bn) | ||
0 | France | Confédération Nationale du Crédit Mutuel | Size (total assets EUR 500-1,000 bn) | ||
0 | France | C.R.H. – Caisse de Refinancement de l’Habitat | Size (total assets EUR 50-75 bn) | ||
0 | France | Crédit Agricole S.A. | Size (total assets above EUR 1,000 bn) | ||
0 | France | HSBC France | Size (total assets EUR 150-300 bn) | ||
0 | France | La Banque Postale | Size (total assets EUR 150-300 bn) | ||
0 | France | Société de Financement Local | Size (total assets EUR 75-100 bn) | ||
0 | France | Société Générale S.A. | Size (total assets above EUR 1,000 bn) |
Or search major groups by (partial) name:
df_group[df_group['group'].str.startswith('BNP')]
country | estcountry | group | member | reason | |
---|---|---|---|---|---|
0 | France | BNP Paribas | Size (total assets above EUR 1,000 bn) |
df_group[df_group['group'].str.contains('BNP')]
country | estcountry | group | member | reason | |
---|---|---|---|---|---|
0 | France | BNP Paribas | Size (total assets above EUR 1,000 bn) |
How about members that contain BNP?
df_member[df_member['member'].str.contains('BNP')]
country | estcountry | group | member | reason | |
---|---|---|---|---|---|
0 | France | France | BNP Paribas | BNP-Paribas Guyane | |
0 | France | France | BNP Paribas | BNP-Paribas Guadeloupe | |
0 | France | France | BNP Paribas | BNP-Paribas Martinique | |
0 | France | France | BNP Paribas | BNP-Paribas home loan SFH | |
0 | France | France | BNP Paribas | BNP-Paribas Public Sector SCF | |
0 | France | France | BNP Paribas | BNP-Paribas factor | |
0 | France | France | BNP Paribas | BNP-Paribas Personal Finance | |
0 | France | France | BNP Paribas | BNP-Paribas Lease Group – Leasegroup | |
0 | France | France | BNP Paribas | BNP-Paribas wealth management | |
0 | France | France | BNP Paribas | BNP-Paribas Securities Services | |
0 | France | France | BNP Paribas | BNP-Paribas Réunion | |
0 | France | France | BNP Paribas | BNP-Paribas Dealing Services | |
0 | France | France | BNP Paribas | BNP Nouvelle-Calédonie | |
0 | France | Belgium | BNP Paribas | BNP-Paribas Fortis SA | |
0 | France | Luxembourg | BNP Paribas | BGL BNP-Paribas | |
0 | France | Portugal | BNP Paribas | Banco BNP-Paribas Personal Finance, S.A. | |
0 | France | Spain | BNP Paribas | BNP-Paribas España, S.A. |
Or subsidiaries of groups containing BNP in the name?
df_member[df_member['group'].str.contains('BNP')]
country | estcountry | group | member | reason | |
---|---|---|---|---|---|
0 | France | France | BNP Paribas | Laser cofinoga | |
0 | France | France | BNP Paribas | BNP-Paribas Guyane | |
0 | France | France | BNP Paribas | JCB Finance | |
0 | France | France | BNP Paribas | BNP-Paribas Guadeloupe | |
0 | France | France | BNP Paribas | BNP-Paribas Martinique | |
0 | France | France | BNP Paribas | Banque nationale de Paris Intercontinentale | |
0 | France | France | BNP Paribas | BNP-Paribas home loan SFH | |
0 | France | France | BNP Paribas | BNP-Paribas Public Sector SCF | |
0 | France | France | BNP Paribas | Natiocrédibail | |
0 | France | France | BNP Paribas | BNP-Paribas factor | |
0 | France | France | BNP Paribas | BNP-Paribas Personal Finance | |
0 | France | France | BNP Paribas | Cortal Consors | |
0 | France | France | BNP Paribas | Fortis lease | |
0 | France | France | BNP Paribas | Parilease | |
0 | France | France | BNP Paribas | BNP-Paribas Lease Group – Leasegroup | |
0 | France | France | BNP Paribas | BNP-Paribas wealth management | |
0 | France | France | BNP Paribas | BNP-Paribas Securities Services | |
0 | France | France | BNP Paribas | BNP-Paribas Réunion | |
0 | France | France | BNP Paribas | Société alsacienne de développement et d’expan... | |
0 | France | France | BNP Paribas | Cmv mediforce | |
0 | France | France | BNP Paribas | Fidem | |
0 | France | France | BNP Paribas | Banque Solfea | |
0 | France | France | BNP Paribas | CNH Industrial Capital Europe | |
0 | France | France | BNP Paribas | Loisirs finance | |
0 | France | France | BNP Paribas | Same Deutz-Fahr Finance | |
0 | France | France | BNP Paribas | Claas financial services | |
0 | France | France | BNP Paribas | Facet | |
0 | France | France | BNP Paribas | Domofinance | |
0 | France | France | BNP Paribas | MFF | |
0 | France | France | BNP Paribas | BNP-Paribas Dealing Services | |
0 | France | France | BNP Paribas | Natiocredimurs, Société en Nom Collectif | |
0 | France | France | BNP Paribas | Compagnie de gestion et de prêts | |
0 | France | France | BNP Paribas | Sygma banque | |
0 | France | France | BNP Paribas | BNP Nouvelle-Calédonie | |
0 | France | Belgium | BNP Paribas | BNP-Paribas Fortis SA | |
0 | France | Belgium | BNP Paribas | bpost banque SA | |
0 | France | Germany | BNP Paribas | Commerz Finanz GmbH | |
0 | France | Germany | BNP Paribas | Von Essen GmbH & Co. KG Bankgesellschaft | |
0 | France | Italy | BNP Paribas | Banca Nazionale del Lavoro S.p.A. | |
0 | France | Italy | BNP Paribas | Findomestic Banca S.p.A. | |
0 | France | Italy | BNP Paribas | Artigiancassa S.p.A. | |
0 | France | Italy | BNP Paribas | Artigiancassa S.p.A. | |
0 | France | Luxembourg | BNP Paribas | BGL BNP-Paribas | |
0 | France | The Netherlands | BNP Paribas | Bank Insinger de Beaufort N.V. | |
0 | France | The Netherlands | BNP Paribas | The Economy Bank N.V. | |
0 | France | Portugal | BNP Paribas | Banco BNP-Paribas Personal Finance, S.A. | |
0 | France | Spain | BNP Paribas | BNP-Paribas España, S.A. | |
0 | France | Spain | BNP Paribas | Banco Cetelem, S.A. |
How about the representation of particular group by country?
df_member[df_member['group']=='BNP Paribas'].groupby('estcountry').size().order(ascending=False)
estcountry France 34 Italy 4 The Netherlands 2 Spain 2 Germany 2 Belgium 2 Portugal 1 Luxembourg 1 dtype: int64
Now let's look at the simpler, single column table of less significant entities.
#Check the column boundaries
df=dfColBoundaries(pages[33:])
df['left']=df['left'].astype(int)
df['left'].value_counts()
113 3578 766 2541 763 705 106 160 760 32 111 30 168 2 396 2 398 2 386 2 388 2 353 1 269 1 265 1 387 1 404 1 400 1 380 1 389 1 376 1 368 1 348 1 344 1 276 1 256 1 369 1 287 1 393 1 366 1 231 1 375 1 498 1 466 1 410 1 394 1 374 1 362 1 275 1 350 1 314 1 274 1 190 1 178 1 395 1 477 1 401 1 dtype: int64
#Here's the scraper...
skip=False
currCol=-1
prevCol=-1
prevLeft=-1
currLeft=-1
currTop=-1
prevTop=-1
topDelta=-1
mblock={'corp':[]}
dfc=pd.DataFrame()
dft={'country':'','corp':''}
for page in pages[33:]: #33
for el in page:
if el.tag == "text":
if skip:
skip = False
else:
#print('..',flatten(el),'...',el.attrib)
prevLeft=currLeft
currLeft=el.attrib['left']
prevTop=currTop
currTop=int(el.attrib['top'])
topDelta=currTop-prevTop
if currLeft in ['111','113']:
mblock['corp'].append(flatten(el))
#Need some logic to cope with cells that have names split over multiple rows
#The PDF is variable in this respect - some pages have a whitespace element between separate table rows,
# but other don't. In the latter case, multiline cells have text closer than text in separate rows.
if prevLeft in ['111','113'] and (currLeft not in ['111','113'] or topDelta > 21):
dft['corp']=' '.join(mblock['corp'])
mblock['corp']=[]
if dft['corp']!='':
dfc=pd.concat([dfc,pd.DataFrame([dft])])
if el.attrib['font']=='4':
dft['country']=flatten(el)
#preview the first and last few rows
pd.concat([dfc[:5],dfc[-5:]])
corp | country | |
---|---|---|
0 | ABK Bank CVBA | Belgium |
0 | Bank J. Van Breda en C° NV | Belgium |
0 | Banque CPH | Belgium |
0 | Banque Delen & de Schaetzen | Belgium |
0 | Banque Eni SA | Belgium |
0 | The Royal Bank of Scotland public limited comp... | Finland |
0 | Tiistenjoen Osuuspankki | Finland |
0 | Tuusniemen Osuuspankki | Finland |
0 | Ylihärmän Säästöpankki | Finland |
0 | Yttermark Sparbank | Finland |
#Write the data to a CSV file
dfc.to_csv('data/euMinorGroup.csv',index=False,encoding='UTF-8')
Count the less significant institutions by country.
dfc.groupby('country').size().order(ascending=False)
country Germany 1699 Austria 562 Italy 531 France 156 Portugal 118 Finland 93 Spain 81 Luxembourg 69 The Netherlands 50 Belgium 30 Ireland 30 Greece 19 Malta 16 Latvia 13 Estonia 11 Slovakia 9 Slovenia 9 Cyprus 6 dtype: int64