#!/usr/bin/env python # coding: utf-8 # In[73]: import requests, pandas as pd, numpy as np from requests import session from bs4 import BeautifulSoup # In[229]: dfsi=pd.read_csv('dfsi.csv',sep=';') # In[230]: dfsi2=pd.read_csv('dfsi2.csv',sep=';') # Normalize # In[231]: dfsi.columns=['Unnamed: 0', '0', 'tavaly', 'Cégnév', 'Alkalmazottak száma 2018', 'Alkalmazottak száma 2017', 'Alkalmazottak száma 2016','Alkalmazottak száma 2015', 'region', 'nr', 'coords', 'kws', 'cms'] dfsi2.columns=['Unnamed: 0', '0', 'tavaly', 'Cégnév', 'Árbevétel 2018 (RON)', 'Árbevétel 2017 (RON)', 'Árbevétel 2016 (RON)','Árbevétel 2015 (RON)', 'region', 'nr', 'coords', 'kws', 'cms'] # In[232]: dfsi['nr_alkalmazottak']=dfsi['nr'] dfsi2['nr_arbevetel']=dfsi2['nr'] # In[233]: dfsi.head(2) # In[234]: dfsi2.head(2) # In[235]: data=dfsi.set_index('Cégnév').join(dfsi2.set_index('Cégnév'),how='outer',lsuffix='_left', rsuffix='_right') # In[236]: data['Cím']=data[['cms_right','cms_left']].T.ffill().bfill().T['cms_right'] data['Koord']=data[['coords_right','coords_left']].T.ffill().bfill().T['coords_right'] data['Kw']=data[['kws_right','kws_left']].T.ffill().bfill().T['kws_right'] data['Régió']=data[['region_right','region_left']].T.ffill().bfill().T['region_right'] # In[237]: data=data.drop(['0_left', 'region_left','nr_left', 'coords_left', 'kws_left', 'cms_left', '0_right', 'region_right', 'nr_right', 'coords_right', 'kws_right', 'cms_right', 'Unnamed: 0_left','Unnamed: 0_right'],axis=1).reset_index() # In[238]: data.head() # In[239]: lat=[] lon=[] for i in range(len(data.index)): k=data.loc[data.index[i]]['Koord'].replace("'",'').replace('[','')\ .replace(']','').replace(' ','').split(',') lon.append(k[0]) lat.append(k[1]) # In[240]: data['Latitude']=lat data['Longitude']=lon # In[241]: data=data.drop('Koord',axis=1) # In[242]: data # In[243]: manual=data[data['Longitude']==''] data=data[data['Longitude']!=''] # In[222]: data.to_excel('data.xlsx') manual.to_excel('manual.xlsx') manual.to_excel('manual_manual.xlsx') # Fix sector, address data and coordinates manually # In[244]: manual=pd.read_excel('manual_manual.xlsx') # In[245]: data=pd.concat([data,manual]).set_index('Cégnév') # In[246]: sectors=pd.read_excel('sectors.xlsx') sectormap={} repl={'á':'a','é':'e','í':'i','ó':'o','ú':'u','ü':'u','ű':'u','ő':'o','ö':'o'} for s in sectors.columns: for k in sectors[s].values: for j in str(k).replace(' ','').split(','): sectormap[j]=s for c in repl: j=j.replace(c,repl[c]) sectormap[j]=s # In[247]: valid=[] kws=[] ki=-1 for i in range(len(data.index)): ks=data.loc[data.index[i]]['Kw'].replace("'",'').replace('[','')\ .replace(']','').replace(' ','').lower().split(',')[:-2] for k in ks: if k in sectormap: kws.append(sectormap[k]) break k=k.replace('-','') if k in sectormap: kws.append(sectormap[k]) break if len(kws)