#!/usr/bin/env python
# coding: utf-8
# In[36]:
import requests, pandas as pd, numpy as np
from requests import session
from bs4 import BeautifulSoup
# In[2]:
# Probe request: fetch one listing page and print every cookie the site sets.
url='http://www.omnibus.ro/index.php/hu/szekelyfoldi-top-cegek/arbevetel/also-haromszek-2014-2017'
# Bound the wait so an unresponsive server cannot block the script forever
# (requests applies no timeout unless one is given explicitly).
r = requests.get(url, timeout=30)
for cookie in r.cookies:
    print(cookie.name, cookie.value)
# In[ ]:
# In[129]:
# Starts empty; presumably collects one table per region further down
# (the code that fills it is not visible here -- confirm).
dfs = list()
# URL slugs of the regions to scrape; the loop below splices each one into
# the listing-page address.
regions = [
    'also-haromszek',
    'felso-haromszek',
    'csikszek',
    'udvarhelyszek',
    'marosszek',
    'gyergyoszek',
]
for region in regions:
url='http://www.omnibus.ro/index.php/hu/szekelyfoldi-top-cegek/alkalmazott/'+\
region+'-2014-2017/'
with session() as c:
response = c.get(url)
#print(response.headers)
#print(response.text)
df=pd.read_html(response.text)[1]
df.columns=[0]+list(df.loc[0])[:-1]
df=df.loc[2:].set_index(0)
df=df[df.columns[1:-1]]
df=df.loc[list(df.index)[:-1]]
df['region']=region
df['nr']=df.index
soup = BeautifulSoup(response.content)
links=soup.findAll('table')[3].findAll('a')
coords=[]
kws=[]
cms=[]
for i in range(len(links)):
print(i,)
r=requests.get(links[i]['href'])
g=repr(r.content)
coord_start=g.find('GLatLng')
coord_end=coord_start+g[coord_start:].find(')')
coord=g[coord_start+len('GLatLng')+1:coord_end].split(',')
kw_start=g.find('')
kw=g[kw_start+len('