#!/usr/bin/env python
# coding: utf-8

# # Pandas
#
# [Pandas](http://pandas.pydata.org/) is a data-analysis library. Its use is
# illustrated here with a simple analysis of data taken from the IMDB database.
#
# The [requests](https://github.com/kennethreitz/requests) library (for
# fetching web pages) and
# [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) (for parsing
# HTML) are used as well.

# In[4]:


import requests
from bs4 import BeautifulSoup as bs


# Fetch IMDB data on films whose country of origin is Croatia, made between
# 1945 and 2017.

# In[117]:


import time

url = 'http://www.imdb.com/search/title'
params = dict(sort='num_votes,desc', start=1, title_type='feature',
              year='1945,2017', countries='hr', languages='hr')

# One response object per result page, in request order.
r = []
for i in range(1, 240, 50):  # there are currently 201 such films
    params['start'] = i
    response = requests.get(url, params=params, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page as if it
    # were an (empty) result list.
    response.raise_for_status()
    r.append(response)
    time.sleep(10)  # pause between consecutive requests


# Parse the downloaded pages and store the result in the file `filmovi2.txt`
# (one film per line, tab-separated: title, year, runtime, rating, director,
# genres).

# In[167]:


import re

# Matches any markup tag; used to reduce the rating element to its text.
TAG_RE = re.compile(r'<[^>]+>')
DIRECTOR_RE = re.compile('Director')

# Explicit UTF-8 so the file matches what pandas.read_csv expects by default,
# independent of the platform's locale encoding.
with open('filmovi2.txt', 'w', encoding='utf-8') as f:
    for response in r:
        soup = bs(response.text, 'html.parser')
        for film in soup.find_all('div', class_="lister-item"):
            # Title: text of the link whose href contains both markers.
            title = None
            for a in film.find_all('a', href=True):
                href = a['href']
                if '/title/tt' in href and 'adv_li_tt' in href:
                    title = str(a.contents[0])
            if title is None:
                # No title link in this item; skip it rather than reuse the
                # title of the previous film.
                continue

            rt = film.find_all('span', class_="runtime")
            runtime = str(rt[0].contents[0]) if rt else '0 mins'

            y = film.find_all('span', class_="lister-item-year")
            if y:
                year = str(y[0].contents[0])
                year = year.replace('(I)', '').replace('(III)', '').strip()
            else:
                year = '???'

            # Rating: taken from the second sibling following the star icon.
            # NOTE(review): presumably the number is wrapped in an inline tag
            # there - confirm against the page markup. Any tags are stripped
            # so that only the text is written.
            rating = '0'
            rat = film.find_all(
                'span', class_='global-sprite rating-star imdb-rating')
            if rat:
                siblings = list(rat[0].next_siblings)
                if len(siblings) > 1:
                    rating = TAG_RE.sub('', str(siblings[1])).strip() or '0'

            g = film.find_all('span', class_="genre")
            if g:
                genres = ' '.join(g[0].contents).replace('\n', '').strip()
            else:
                genres = ''

            # Director: first element following the first text node that
            # contains "Director"; left empty when there is no such node.
            director = ''
            director_labels = film.find_all(string=DIRECTOR_RE)
            if director_labels:
                director_contents = getattr(
                    director_labels[0].next_element, 'contents', None)
                if director_contents:
                    director = str(director_contents[0])

            f.write('\t'.join(
                (title, year, runtime, rating, director, genres)) + '\n')


# In[119]:


get_ipython().system('head filmovi2.txt')


# ### Data analysis with the Pandas library

# In[174]:


import pandas as pd

names = ['title', 'year', 'runtime', 'rating', 'director', 'genres']
data = pd.read_csv('filmovi2.txt', delimiter='\t', names=names)
print("Number of rows: {:d}".format(data.shape[0]))
data.head()


# In[175]:


# Runtime: keep the leading integer of strings such as '0 mins'.
data['runtime'] = data['runtime'].str.split(' ').str[0].astype(int)
# Year: take the first four-digit number in the field. Fields without one
# (e.g. the '???' placeholder) become 0, which is turned into NaN further
# down, the same way as the 0 placeholders for runtime and rating.
data['year'] = (data['year'].astype(str)
                .str.extract(r'(\d{4})', expand=False)
                .fillna('0')
                .astype(int))
data['rating'] = data['rating'].astype(float)
data.head()


# In[176]:


# Label-based row lookup (`.ix` is no longer available in current pandas).
data.loc[118]


# In[177]:


data[['year', 'runtime', 'rating']].describe()


# In[217]:


import numpy as np

# 0 is the placeholder for a missing value in the numeric columns; mark it as
# NaN so it does not distort the statistics below.
numeric_columns = ['year', 'runtime', 'rating']
data[numeric_columns] = data[numeric_columns].replace(0, np.nan)


# In[218]:


data[['runtime', 'rating']].describe()


# In[219]:


import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# Bin edges run from 1945 to 2018 so that films from 2017 fall inside the
# last bin.
plt.hist(data.year.dropna(), bins=np.arange(1945, 2019))
plt.xlabel("Godina produkcije")


# In[220]:


plt.hist(data.rating.dropna(), bins=20)
plt.xlabel("IMDB ocjena")


# In[222]:


plt.scatter(data.year, data.rating, lw=0, color='k')
plt.xlabel("Godina")
plt.ylabel("IMDB ocjena")


# In[223]:


data[data.rating == data.rating.min()][
    ['title', 'year', 'rating', 'director', 'genres']]


# In[224]:


data[data.rating == data.rating.max()][
    ['title', 'year', 'rating', 'director', 'genres']]


# In[225]:


# Split each film's comma-separated genre field once. Names are stripped so
# that 'Drama' and ' Drama' count as the same genre; empty entries (films
# without a genre field) are dropped.
genre_lists = [
    [name.strip() for name in field.split(',') if name.strip()]
    for field in data.genres.fillna('')
]
genres = sorted({name for names_of_film in genre_lists
                 for name in names_of_film})
# One boolean column per genre.
for genre in genres:
    data[genre] = [genre in names_of_film for names_of_film in genre_lists]
data.head()


# In[226]:


genre_count = data[genres].sum()
pd.DataFrame({'Genre Count': genre_count})


# In[227]:


# Start year of the five-year period each film belongs to.
petoljetka = (data.year // 5) * 5
# Work on an explicit copy so that adding a column does not touch `data`.
tyd = data[['title', 'year']].copy()
tyd['petoljetka'] = petoljetka
tyd.head()


# In[228]:


pet_mean = data.groupby(petoljetka).rating.mean()
pet_mean.name = 'Petoljetka mean'
print(pet_mean)
plt.plot(pet_mean.index, pet_mean.values, 'o-',
         color='r', lw=3, label='Petoljetka prosjek')
plt.scatter(data.year, data.rating, alpha=.04, lw=0, color='k')
plt.xlabel("Godina")
plt.ylabel("Ocjena")
plt.legend(frameon=False)


# In[229]:


# For every year, print the title(s) with the highest rating of that year.
for year, subset in data.groupby('year'):
    print(year, subset[subset.rating == subset.rating.max()].title.values)


# In[230]:


from verzije import print_sysinfo, info_packages
from IPython.display import HTML
HTML(print_sysinfo() + info_packages('pandas, numpy,requests, beautifulsoup4'))