import requests, pandas as pd, numpy as np
from requests import session
from bs4 import BeautifulSoup
dfsi=pd.read_csv('dfsi.csv',sep=';')
dfsi2=pd.read_csv('dfsi2.csv',sep=';')
Normalize
dfsi.columns=['Unnamed: 0', '0', 'tavaly', 'Cégnév',
'Alkalmazottak száma 2018', 'Alkalmazottak száma 2017',
'Alkalmazottak száma 2016','Alkalmazottak száma 2015', 'region', 'nr', 'coords', 'kws', 'cms']
dfsi2.columns=['Unnamed: 0', '0', 'tavaly', 'Cégnév',
'Árbevétel 2018 (RON)', 'Árbevétel 2017 (RON)', 'Árbevétel 2016 (RON)','Árbevétel 2015 (RON)',
'region', 'nr', 'coords', 'kws', 'cms']
dfsi['nr_alkalmazottak']=dfsi['nr']
dfsi2['nr_arbevetel']=dfsi2['nr']
dfsi.head(2)
Unnamed: 0 | 0 | tavaly | Cégnév | Alkalmazottak száma 2018 | Alkalmazottak száma 2017 | Alkalmazottak száma 2016 | Alkalmazottak száma 2015 | region | nr | coords | kws | cms | nr_alkalmazottak | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 2 | READY GARMENT TECHNOLOGY ROMANIA SRL | 990 | 730 | 1 005 | 1 199 | also-haromszek | 1 | ['45.8772830', ' 25.7994510'] | ['mosas-tisztitas', '', '"'] | 520036, Sepsiszentgy\xf6rgy, Cs\xedki u., 149/A\t | 1 |
1 | 1 | 2 | 1 | VALKES SRL | 621 | 788 | 886 | 1 007 | also-haromszek | 2 | ['45.855129 ', ' 25.806651 '] | ['kabelek', '', '"'] | 520077, Sepsiszentgy\xf6rgy, Păiş Da... | 2 |
dfsi2.head(2)
Unnamed: 0 | 0 | tavaly | Cégnév | Árbevétel 2018 (RON) | Árbevétel 2017 (RON) | Árbevétel 2016 (RON) | Árbevétel 2015 (RON) | region | nr | coords | kws | cms | nr_arbevetel | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 1 | FABRICA DE LAPTE BRASOV SA | 468 016 207 | 431 950 759 | 370 418 528 | 338 283 110 | also-haromszek | 1 | ['46.0750630', ' 25.6087250'] | ['tejfeldolgozas', '', '"'] | 525100, Bar\xf3t, V\xedz u., 109\t | 1 |
1 | 1 | 2 | 2 | COVALACT SA | 252 640 519 | 235 415 329 | 230 073 646 | 199 493 229 | also-haromszek | 2 | ['45.869573 ', ' 25.800705 '] | ['tejfeldolgozas', '', '"'] | 520036, Sepsiszentgy\xf6rgy, Oltmez\xf5 u., 1\t | 2 |
data=dfsi.set_index('Cégnév').join(dfsi2.set_index('Cégnév'),how='outer',lsuffix='_left', rsuffix='_right')
data['Cím']=data[['cms_right','cms_left']].T.ffill().bfill().T['cms_right']
data['Koord']=data[['coords_right','coords_left']].T.ffill().bfill().T['coords_right']
data['Kw']=data[['kws_right','kws_left']].T.ffill().bfill().T['kws_right']
data['Régió']=data[['region_right','region_left']].T.ffill().bfill().T['region_right']
data=data.drop(['0_left', 'region_left','nr_left', 'coords_left', 'kws_left', 'cms_left',
'0_right', 'region_right', 'nr_right', 'coords_right', 'kws_right', 'cms_right',
'Unnamed: 0_left','Unnamed: 0_right'],axis=1).reset_index()
data.head()
Cégnév | tavaly_left | Alkalmazottak száma 2018 | Alkalmazottak száma 2017 | Alkalmazottak száma 2016 | Alkalmazottak száma 2015 | nr_alkalmazottak | tavaly_right | Árbevétel 2018 (RON) | Árbevétel 2017 (RON) | Árbevétel 2016 (RON) | Árbevétel 2015 (RON) | nr_arbevetel | Cím | Koord | Kw | Régió | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A M C SRL | NaN | NaN | NaN | NaN | NaN | NaN | 20 | 28 716 061 | 27 881 270 | 24 390 005 | 22 173 039 | 22.0 | 527100, H\xeddv\xe9g, Rom\xe1n u., 203\t | ['45.8391350', ' 25.5892160'] | ['nagykereskedelem', '', '"'] | also-haromszek |
1 | ABC IMPEX SRL | 20 | 139 | 145 | 132 | 132 | 21.0 | 20 | 37 875 372 | 36 539 530 | 33 071 733 | 31 956 498 | 24.0 | 535600, Sz\xe9kelyudvarhely, R\xe1k\xf3czi Fer... | ['46.289768 ', ' 25.290034 '] | ['csomagoloanyagok', 'nyomdak', '', '"'] | udvarhelyszek |
2 | ABRAZIV SRL | NaN | NaN | NaN | NaN | NaN | NaN | 49 | 16 316 705 | 5 483 275 | 0 | 0 | 17.0 | 535500, Gyergy\xf3szentmikl\xf3s, \xc1llom\xe1... | ['46.7174250', ' 25.5751650'] | ['szerszamgepek', '', '"'] | gyergyoszek |
3 | ADILEX FUNGO SRL | NaN | NaN | NaN | NaN | NaN | NaN | >50 | 6 834 996 | 925 866 | - | - | 50.0 | 537355, Vasl\xe1b, , 37\t | ['46.64607', ' 25.62371'] | ['zoldseg-gyumolcs-csomagolas', '', '"'] | gyergyoszek |
4 | ADIMAG COM IMPEX SRL | NaN | NaN | NaN | NaN | NaN | NaN | 36 | 65 548 663 | 57 106 224 | 48 827 471 | 47 437 389 | 40.0 | 540190, Marosv\xe1s\xe1rhely, Szabads\xe1g u.,... | ['46.537905', ' 24.548819'] | ['belsoepiteszeti-anyagok', 'epitoanyagok', 'f... | marosszek |
lat=[]
lon=[]
for i in range(len(data.index)):
k=data.loc[data.index[i]]['Koord'].replace("'",'').replace('[','')\
.replace(']','').replace(' ','').split(',')
lon.append(k[0])
lat.append(k[1])
data['Latitude']=lat
data['Longitude']=lon
data=data.drop('Koord',axis=1)
data
Cégnév | tavaly_left | Alkalmazottak száma 2018 | Alkalmazottak száma 2017 | Alkalmazottak száma 2016 | Alkalmazottak száma 2015 | nr_alkalmazottak | tavaly_right | Árbevétel 2018 (RON) | Árbevétel 2017 (RON) | Árbevétel 2016 (RON) | Árbevétel 2015 (RON) | nr_arbevetel | Cím | Kw | Régió | Latitude | Longitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A M C SRL | NaN | NaN | NaN | NaN | NaN | NaN | 20 | 28 716 061 | 27 881 270 | 24 390 005 | 22 173 039 | 22.0 | 527100, H\xeddv\xe9g, Rom\xe1n u., 203\t | ['nagykereskedelem', '', '"'] | also-haromszek | 25.5892160 | 45.8391350 |
1 | ABC IMPEX SRL | 20 | 139 | 145 | 132 | 132 | 21.0 | 20 | 37 875 372 | 36 539 530 | 33 071 733 | 31 956 498 | 24.0 | 535600, Sz\xe9kelyudvarhely, R\xe1k\xf3czi Fer... | ['csomagoloanyagok', 'nyomdak', '', '"'] | udvarhelyszek | 25.290034 | 46.289768 |
2 | ABRAZIV SRL | NaN | NaN | NaN | NaN | NaN | NaN | 49 | 16 316 705 | 5 483 275 | 0 | 0 | 17.0 | 535500, Gyergy\xf3szentmikl\xf3s, \xc1llom\xe1... | ['szerszamgepek', '', '"'] | gyergyoszek | 25.5751650 | 46.7174250 |
3 | ADILEX FUNGO SRL | NaN | NaN | NaN | NaN | NaN | NaN | >50 | 6 834 996 | 925 866 | - | - | 50.0 | 537355, Vasl\xe1b, , 37\t | ['zoldseg-gyumolcs-csomagolas', '', '"'] | gyergyoszek | 25.62371 | 46.64607 |
4 | ADIMAG COM IMPEX SRL | NaN | NaN | NaN | NaN | NaN | NaN | 36 | 65 548 663 | 57 106 224 | 48 827 471 | 47 437 389 | 40.0 | 540190, Marosv\xe1s\xe1rhely, Szabads\xe1g u.,... | ['belsoepiteszeti-anyagok', 'epitoanyagok', 'f... | marosszek | 24.548819 | 46.537905 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
419 | WONDERLAND SRL | 44 | 28 | 29 | 28 | 29 | 46.0 | NaN | NaN | NaN | NaN | NaN | NaN | 527160, Torja, F\xf5 u., 225\t | ['epitkezes', '', '"'] | felso-haromszek | 26.063746 | 46.040239 |
420 | ZABOLA ESTATE SRL | 39 | 45 | 31 | 25 | 23 | 28.0 | NaN | NaN | NaN | NaN | NaN | NaN | 527190, Zabola, , 437\t | ['hotelek', 'kastelyszallo', 'vendeglatas', ''... | felso-haromszek | 26.1980287 | 45.8915563 |
421 | ZAMBELLI METAL SRL | 15 | 205 | 179 | 162 | 163 | 14.0 | 15 | 52 694 046 | 42 016 920 | 34 591 562 | 27 891 895 | 15.0 | 520077, Sepsiszentgy\xf6rgy, \xc9p\xedt\xf5k u... | ['badogosmunkak', '', '"'] | also-haromszek | 25.818515 | 45.861235 |
422 | ZARAH MODEN SRL | 2 | 764 | 785 | 778 | 839 | 2.0 | 2 | 154 604 899 | 148 531 005 | 141 121 629 | 139 752 712 | 2.0 | 525400, K\xe9zdiv\xe1s\xe1rhely, B\xe9ke u., 27\t | ['nadraggyartas', 'textilipar', '', 'INDUSTRIA... | felso-haromszek | 26.1359670 | 45.9969390 |
423 | ZENCO TRANS SRL | 40 | 36 | 37 | 33 | 33 | 41.0 | NaN | NaN | NaN | NaN | NaN | NaN | 535700, Marosh\xe9v\xedz, , 2\t | ['aruszallitas', '', '"'] | gyergyoszek | 25.3534040 | 46.9260300 |
424 rows × 18 columns
manual=data[data['Longitude']=='']
data=data[data['Longitude']!='']
data.to_excel('data.xlsx')
manual.to_excel('manual.xlsx')
manual.to_excel('manual_manual.xlsx')
Fix sector, address data and coordinates manually
manual=pd.read_excel('manual_manual.xlsx')
data=pd.concat([data,manual]).set_index('Cégnév')
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass 'sort=False'. To retain the current behavior and silence the warning, pass 'sort=True'. """Entry point for launching an IPython kernel.
sectors=pd.read_excel('sectors.xlsx')
sectormap={}
repl={'á':'a','é':'e','í':'i','ó':'o','ú':'u','ü':'u','ű':'u','ő':'o','ö':'o'}
for s in sectors.columns:
for k in sectors[s].values:
for j in str(k).replace(' ','').split(','):
sectormap[j]=s
for c in repl:
j=j.replace(c,repl[c])
sectormap[j]=s
valid=[]
kws=[]
ki=-1
for i in range(len(data.index)):
ks=data.loc[data.index[i]]['Kw'].replace("'",'').replace('[','')\
.replace(']','').replace(' ','').lower().split(',')[:-2]
for k in ks:
if k in sectormap:
kws.append(sectormap[k])
break
k=k.replace('-','')
if k in sectormap:
kws.append(sectormap[k])
break
if len(kws)<i-1:
print('ERROR',ks,i)
break
else:
#print('SUCCES',kws[-1],ks)
valid.append([data.index[i],kws[-1],ks])
pd.DataFrame(valid).to_excel('valid.xlsx')
pd.DataFrame(valid).to_excel('valid_manual.xlsx')
Manual overwrite