%pylab inline
import pandas as pd
import geojson as gj
from collections import defaultdict, Counter
from scipy import stats
Populating the interactive namespace from numpy and matplotlib
# Load the Milan TripAdvisor points of interest and preview the five
# most-reviewed ones.  Columns (per the table below): name, reviews, lat, lon.
# NOTE(review): DataFrame.sort() is the legacy pre-0.17 pandas API;
# sort_values() is the modern equivalent — confirm the pinned pandas version.
poi = pd.read_csv('data/poi/pois_milano_tripadvisor.csv')
poi.sort('reviews', ascending=False).head()
name | reviews | lat | lon | |
---|---|---|---|---|
1 | Duomo di Milano | 9561 | 45.46467 | 9.190500 |
13 | Galleria Vittorio Emanuele II | 3488 | 45.46560 | 9.190000 |
3 | L'Ultima Cena (Cenacolo Vinciano) | 3099 | 45.46596 | 9.170649 |
18 | Piazza del Duomo | 2600 | 45.46468 | 9.190770 |
17 | Castello Sforzesco | 2398 | 45.47045 | 9.180639 |
# Load the Milan grid GeoJSON and index the first vertex of each cell
# polygon by cellId.
# Fix: GeoJSON positions are [longitude, latitude] (RFC 7946), and the
# sample values confirm it (first coordinate ~9.1 = Milan longitude,
# second ~45.4 = latitude).  The original labelled the columns
# ['lat', 'lon'], mislabelling both coordinates for all downstream use.
with open('data/poi/milano-grid.geojson') as gf:
    grid = gj.load(gf)
cell_position = pd.DataFrame([([cell["properties"]["cellId"]] + cell["geometry"]["coordinates"][0][0])
                              for cell in grid['features']],
                              columns=['cellId', 'lon', 'lat']).set_index('cellId')
cell_position.head()
lat | lon | |
---|---|---|
cellId | ||
1 | 9.011491 | 45.358801 |
2 | 9.014491 | 45.358801 |
3 | 9.017492 | 45.358801 |
4 | 9.020492 | 45.358800 |
5 | 9.023493 | 45.358799 |
def poi_in_cell(poi_coords, coords):
    """Return True when *poi_coords* lies inside the cell's bounding box.

    ``coords`` is the ring of cell corner vertices; ``coords[1]`` is used
    as the upper-bound corner and ``coords[3]`` as the lower-bound corner.
    Boundary points count as inside.
    """
    x, y = poi_coords[0], poi_coords[1]
    upper = coords[1]
    lower = coords[3]
    # Negated per-axis checks (De Morgan of the original disjunction),
    # keeping the exact comparison operators of the original.
    return ((not x > upper[0]) and (not y > upper[1]) and
            (not x < lower[0]) and (not y < lower[1]))
# Assign every POI to the grid cell that contains it and accumulate the
# per-cell review totals.  Positional row access (per the CSV preview
# above): p[0]=name, p[1]=reviews, p[2]=lat, p[3]=lon, so the pair passed
# to poi_in_cell is [lon, lat].
poi_to_cell_dict={}
cell_reviews=defaultdict(int)
for index, p in poi.iterrows():
    if index%50==0:
        # Progress marker every 50 POIs (Python 2 print statement).
        print index, p[1]
    for cell in grid['features']:
        if poi_in_cell([p[3],p[2]], cell['geometry']['coordinates'][0]):
            # NOTE(review): cell['id'] is the feature id, while the grid
            # table above is keyed by properties.cellId — confirm they match.
            poi_to_cell_dict[p[0]] = cell['id']
            cell_reviews[cell['id']]+=p[1]
cell_reviews = pd.Series(cell_reviews)
0 784 50 30 100 92 150 16 200 3 250 14 300 10 350 2
# Per-cell POI counts: how many POIs landed in each grid cell.
poi_counts = Counter(poi_to_cell_dict.values())
npoi = pd.Series(poi_counts)
# Ids of the cells that contain at least one POI (Series.keys() is an
# alias for .index).
cells = npoi.index
# Read the precomputed 1-day entropy table and average it per grid cell.
# Fix: open the HDF store as a context manager so the file handle is
# released even when the read raises (the original only closed it on the
# success path).
with pd.HDFStore('./stores/aggregated_dataset.h5') as store:
    fh = store['entropy_1D'].reset_index()
# Reduce each timestamp to its calendar date.
fh['day'] = fh['time'].map(lambda x: x.date())
fh.drop(['entropy_n', 'n', 'time'], inplace=True, axis=1)
# Mean daily entropy per cell, indexed by Square_id.
entropy_avg = fh.groupby('Square_id')['entropy'].mean()
entropy_avg.head()
Square_id 1 2.444591 2 2.430424 3 2.410835 4 2.477077 5 2.502639 Name: entropy, dtype: float64
# Scatter of POI count vs. average daily entropy, restricted to cells
# that contain at least one POI, plus the Spearman rank correlation.
x = npoi[cells].values
y = entropy_avg[cells].values
plt.figure(figsize=(8,7))
plt.xlabel('Number of POIs', fontsize=22)
plt.ylabel('Average daily entropy', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.axis([0,12,1.5,4.0])
plt.scatter(x, y, s=40, c='r', alpha=0.8)
# Python 2 print; spearmanr returns (rho, p-value).
print "Spearman rank coefficient :", stats.spearmanr(x, y)
Spearman rank coefficient : (0.35557979145601343, 6.3085603657327917e-08)
# Compare the entropy distribution of POI cells against the remaining
# cells with a two-sample Kolmogorov-Smirnov test.
y = entropy_avg[cells].values
# NOTE(review): Index.delete() removes by integer *position*, while
# ``cells`` holds cell-id *labels*; if Square_id is 1-based these differ
# by one.  entropy_avg.drop(cells) would select by label — confirm intent.
yy = entropy_avg[entropy_avg.index.delete(cells)].values
stats.ks_2samp(yy, y)
(0.56183991047782045, 3.4853019940608143e-60)
# Empirical CDFs (cumulative, normalised histograms) of average daily
# entropy for cells without POIs (yy) vs. cells with POIs (y).
plt.figure(figsize=(8,7))
plt.xlabel('Average Daily Entropy', fontsize=22)
plt.ylabel('P(Avg Daily Entropy)', fontsize=22)
# NOTE(review): ``normed`` is the legacy matplotlib argument (replaced by
# ``density`` in modern releases) — consistent with this notebook's era.
plt.hist(yy, 100, cumulative=1, normed=1, label='no-POIs cells')
plt.hist(y, 50, cumulative=1, normed=1, label='POIs cells', color='r')
plt.tick_params(axis='both', which='major', labelsize=15)
plt.axis([0.0, 3.5, 0, 1.0])
plt.legend(loc=2)
<matplotlib.legend.Legend at 0x7f5c3e78dbd0>
# Read the 1-day call-intensity table and compute each cell's average
# daily call activity.
# Fix: open the HDF store as a context manager so the file handle is
# released even when the read raises (the original only closed it on the
# success path).
with pd.HDFStore('./stores/aggregated_dataset.h5') as store:
    fh = store['intensity_ni_1D'].reset_index()
# Reduce each timestamp to its calendar date.
fh['day'] = fh['time'].map(lambda x: x.date())
fh.drop(['time'], inplace=True, axis=1)
# Total calls per (cell, day), then the mean of those daily totals per cell.
activity_avg = fh.groupby(['Square_id', 'day'], as_index=False).sum().groupby('Square_id')['Call'].mean()
activity_avg.head()
Square_id 1 1.132792 2 1.156711 3 1.182170 4 1.063513 5 1.002914 Name: Call, dtype: float32
# Scatter of POI count vs. average daily call activity for POI cells,
# plus the Spearman rank correlation.
x2 = npoi[cells].values
y2 = activity_avg[cells].values
plt.figure(figsize=(7,7))
plt.xlabel('Number of POIs', fontsize=22)
plt.ylabel('average daily activity',fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.scatter(x2,y2,s=40, c='r', alpha=0.8)
r=stats.spearmanr(x2, y2)
# Python 2 print; r is (rho, p-value).
print "Spearman rank coefficient :",r
Spearman rank coefficient : (0.48611959477097355, 2.1709159849930512e-14)
# Entropy vs. activity scatter with POI cells highlighted in red;
# activity axis on a log scale.
plt.figure(figsize=(7,7))
# NOTE(review): Index.delete() removes by integer position, while
# ``cells`` holds cell-id labels — confirm they coincide.
xz = entropy_avg[entropy_avg.index.delete(cells)].values
yz = activity_avg[activity_avg.index.delete(cells)].values
plt.scatter(xz, yz, c='b', alpha=0.8, label='no-POIs cells')
xz = entropy_avg[cells].values
yz = activity_avg[cells].values
plt.scatter(xz, yz, c='r', alpha=0.8, label='POIs cells')
plt.xlabel('Average daily entropy', fontsize=22)
plt.ylabel('Average daily activity', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.legend(loc=4)
plt.yscale('log')
from sklearn import svm, grid_search, metrics
from sklearn import cross_validation
# Assemble the per-cell feature table: average entropy, average activity,
# total review count, POI count, a boolean POI flag (the classification
# target) and the cell coordinates.
df = pd.DataFrame(entropy_avg)
df['activity'] = activity_avg
# Cells without POIs get 0 reviews / 0 POIs; only POI cells are overwritten.
df['reviews'] = 0
df.loc[cells, 'reviews'] = cell_reviews[cells]
df['npoi'] = 0
df.loc[cells, 'npoi'] = npoi[cells]
df['poi'] = df.npoi > 0
df['lat'] = cell_position.lat
df['lon'] = cell_position.lon
df[df.poi].head(10)
entropy | activity | reviews | npoi | poi | lat | lon | |
---|---|---|---|---|---|---|---|
Square_id | |||||||
2767 | 2.740439 | 7.469481 | 5 | 1 | True | 9.209731 | 45.415723 |
2773 | 2.785204 | 2.840190 | 212 | 1 | True | 9.227751 | 45.415688 |
3454 | 2.768579 | 34.803547 | 8 | 1 | True | 9.170730 | 45.430595 |
3655 | 2.673355 | 61.745651 | 7 | 2 | True | 9.173748 | 45.434821 |
3670 | 2.566126 | 39.636150 | 2 | 1 | True | 9.218814 | 45.434743 |
3753 | 2.801604 | 31.504326 | 7 | 1 | True | 9.167745 | 45.436945 |
4343 | 2.658501 | 48.257473 | 3 | 1 | True | 9.137730 | 45.449677 |
4347 | 3.044334 | 58.211586 | 88 | 1 | True | 9.149751 | 45.449662 |
4355 | 3.046771 | 90.429901 | 2 | 1 | True | 9.173793 | 45.449628 |
4357 | 3.110772 | 82.910629 | 16 | 1 | True | 9.179804 | 45.449618 |
# NOTE(review): grid_search was already imported above; redundant re-import.
from sklearn import grid_search
# RBF SVM with the positive (POI) class up-weighted 4.5x to counter the
# class imbalance (per the classification report below: 219 vs 9781).
model = svm.SVC(class_weight={0:1, 1:4.5})
# Feature matrix: one row per cell, columns = [entropy, log(activity)].
X = array([df.entropy.values, np.log(df.activity.values)]).T
# Shuffled 3-fold CV with a fixed seed for reproducibility.
cv = cross_validation.KFold(X.shape[0], n_folds=3, shuffle=True, random_state=3)
# Grid over gamma and C, scored on precision of the positive class.
gs = grid_search.GridSearchCV(model, {'gamma': np.logspace(-3, 0, 4), 'C': np.logspace(-1, 1, 3)},
                              cv=cv, scoring='precision', n_jobs=4)
gs.fit(X, df.poi.astype(int))
print gs.best_params_
print gs.best_score_
{'C': 1.0, 'gamma': 0.01} 0.444223620224
/usr/local/projects/unveiling-patterns/code_repository/virtualenv/local/lib/python2.7/site-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. 'precision', 'predicted', average, warn_for) /usr/local/projects/unveiling-patterns/code_repository/virtualenv/local/lib/python2.7/site-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. 'precision', 'predicted', average, warn_for) /usr/local/projects/unveiling-patterns/code_repository/virtualenv/local/lib/python2.7/site-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. 'precision', 'predicted', average, warn_for) /usr/local/projects/unveiling-patterns/code_repository/virtualenv/local/lib/python2.7/site-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. 'precision', 'predicted', average, warn_for)
# In-sample predictions and the per-class precision/recall/F1 report.
ypred = gs.predict(X)
print metrics.classification_report(df.poi.astype(int).values, ypred)
precision recall f1-score support 0 0.99 0.99 0.99 9781 1 0.42 0.43 0.43 219 avg / total 0.97 0.97 0.97 10000
# ROC analysis on the SVM decision-function scores (in-sample).
y_pred = gs.decision_function(X)
roc_auc = metrics.roc_auc_score(df.poi.astype(int).values, y_pred)
print roc_auc
fpr, tpr, _ = metrics.roc_curve(df.poi.astype(int).values, y_pred)
plt.plot(fpr, tpr, 'b.', label='ROC curve (area = %0.2f)' % roc_auc)
plt.legend();
0.915791215753
# Evaluate the fitted decision function on a mesh covering the feature
# space and draw the decision surface with both cell populations on top.
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
# Mesh steps: 0.1 along entropy, 1 along log-activity.
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 1))
Z = gs.decision_function(np.c_[xx.ravel(), yy.ravel()])
plt.figure(figsize=(7,7))
Z = Z.reshape(xx.shape)
ax = plt.subplot(111)
# Symmetric colour scale so the decision boundary (Z = 0) maps to the
# middle of the diverging colormap.
norm = plt.cm.colors.Normalize(vmax=abs(Z).max(), vmin=-abs(Z).max())
ax.contourf(xx, yy, Z, 200, cmap=cm.RdBu_r, alpha=.8, norm=norm)
# NOTE(review): Index.delete() removes by integer position, while
# ``cells`` holds cell-id labels — confirm they coincide.
xz = df.entropy[df.entropy.index.delete(cells)].values
# ``log`` and ``array`` come from the %pylab namespace (numpy).
yz = log(df.activity[df.activity.index.delete(cells)].values)
plt.scatter(xz, yz, c='b', alpha=0.8, label='no-POIs cells')
xz = df.entropy[cells].values
yz = log(df.activity[cells].values)
plt.scatter(xz, yz, c='r', alpha=0.8, label='POIs cells')
plt.xlabel('Average daily entropy', fontsize=22)
plt.ylabel('Log (average daily activity)', fontsize=22)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.xlim(0, 4)
plt.ylim(y_min, 7.4)
plt.legend(loc=4, scatterpoints=1, markerscale=3, fontsize='x-large')