import requests, pandas as pd, numpy as np
from requests import session
from bs4 import BeautifulSoup
dfsi=pd.read_csv('dfsi.csv',sep=';')
dfsi2=pd.read_csv('dfsi2.csv',sep=';')
Normalize
dfsi.columns=['Unnamed: 0', '0', 'tavaly', 'Cégnév',
'Alkalmazottak száma 2018', 'Alkalmazottak száma 2017',
'Alkalmazottak száma 2016','Alkalmazottak száma 2015', 'region', 'nr', 'coords', 'kws', 'cms']
dfsi2.columns=['Unnamed: 0', '0', 'tavaly', 'Cégnév',
'Árbevétel 2018 (RON)', 'Árbevétel 2017 (RON)', 'Árbevétel 2016 (RON)','Árbevétel 2015 (RON)',
'region', 'nr', 'coords', 'kws', 'cms']
dfsi['nr_alkalmazottak']=dfsi['nr']
dfsi2['nr_arbevetel']=dfsi2['nr']
dfsi.head(2)
dfsi2.head(2)
data=dfsi.set_index('Cégnév').join(dfsi2.set_index('Cégnév'),how='outer',lsuffix='_left', rsuffix='_right')
data['Cím']=data[['cms_right','cms_left']].T.ffill().bfill().T['cms_right']
data['Koord']=data[['coords_right','coords_left']].T.ffill().bfill().T['coords_right']
data['Kw']=data[['kws_right','kws_left']].T.ffill().bfill().T['kws_right']
data['Régió']=data[['region_right','region_left']].T.ffill().bfill().T['region_right']
data=data.drop(['0_left', 'region_left','nr_left', 'coords_left', 'kws_left', 'cms_left',
'0_right', 'region_right', 'nr_right', 'coords_right', 'kws_right', 'cms_right',
'Unnamed: 0_left','Unnamed: 0_right'],axis=1).reset_index()
data.head()
lat=[]
lon=[]
for i in range(len(data.index)):
k=data.loc[data.index[i]]['Koord'].replace("'",'').replace('[','')\
.replace(']','').replace(' ','').split(',')
lon.append(k[0])
lat.append(k[1])
data['Latitude']=lat
data['Longitude']=lon
data=data.drop('Koord',axis=1)
data
manual=data[data['Longitude']=='']
data=data[data['Longitude']!='']
data.to_excel('data.xlsx')
manual.to_excel('manual.xlsx')
manual.to_excel('manual_manual.xlsx')
Fix sector, address data and coordinates manually
manual=pd.read_excel('manual_manual.xlsx')
data=pd.concat([data,manual]).set_index('Cégnév')
sectors=pd.read_excel('sectors.xlsx')
sectormap={}
repl={'á':'a','é':'e','í':'i','ó':'o','ú':'u','ü':'u','ű':'u','ő':'o','ö':'o'}
for s in sectors.columns:
for k in sectors[s].values:
for j in str(k).replace(' ','').split(','):
sectormap[j]=s
for c in repl:
j=j.replace(c,repl[c])
sectormap[j]=s
valid=[]
kws=[]
ki=-1
for i in range(len(data.index)):
ks=data.loc[data.index[i]]['Kw'].replace("'",'').replace('[','')\
.replace(']','').replace(' ','').lower().split(',')[:-2]
for k in ks:
if k in sectormap:
kws.append(sectormap[k])
break
k=k.replace('-','')
if k in sectormap:
kws.append(sectormap[k])
break
if len(kws)<i-1:
print('ERROR',ks,i)
break
else:
#print('SUCCES',kws[-1],ks)
valid.append([data.index[i],kws[-1],ks])
pd.DataFrame(valid).to_excel('valid.xlsx')
pd.DataFrame(valid).to_excel('valid_manual.xlsx')
Manual overwrite