import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%pylab inline
Populating the interactive namespace from numpy and matplotlib
The data consists of the title, student name, organisation, and mentor names of each accepted GSoC 2014 project, downloaded as a CSV file from Melange.
# Load the list of accepted projects (the counting below treats each row as
# one project) directly from the gist.
source = 'https://gist.githubusercontent.com/chengsoonong/dede21b2eefa43b30d14/raw/add9312ea30c0ccc770151cb6188f964d2baa047/gsoc2014_accepted.csv'
data = pd.read_csv(source)
print(data.columns.values)

# Sorted unique organisation names together with the number of rows
# (projects) belonging to each, computed in a single pass.
orgs, proj_per_org = np.unique(data['Organization'], return_counts=True)
# Negate the counts so that argsort orders organisations from most to
# fewest projects.
sort_idx = np.argsort(-proj_per_org)

# Bar chart of the organisations with the most projects.
fig = plt.figure(figsize=(15, 5))
ax = fig.add_subplot(111)
# Never request more bars than there are organisations.
top_orgs = min(20, len(orgs))
idx = np.arange(top_orgs)
ax.bar(idx, proj_per_org[sort_idx][:top_orgs])
ax.set_xticks(idx)
# Pass exactly one label per tick: handing over every organisation name while
# only `top_orgs` ticks exist is rejected by current matplotlib releases.
# The result is assigned only to keep the notebook from echoing it.
dummy = ax.set_xticklabels(orgs[sort_idx][:top_orgs], rotation=80, ha='center')
['Key' 'Title' 'Student' 'Organization' 'Mentors']
Use some heuristics for identifying machine learning projects.
# Heuristic: a project is treated as machine learning when its organisation
# name contains "learning", or its title contains "learning" or
# "scikit-learn" (all comparisons are case-insensitive).
titles = data['Title']
lowered_titles = titles.str.lower()
lowered_orgs = data['Organization'].str.lower()

# Plain substring matching (regex=False); na=False makes rows with a missing
# title or organisation simply not match instead of raising.
ml_mask = (
    lowered_orgs.str.contains('learning', regex=False, na=False)
    | lowered_titles.str.contains('learning', regex=False, na=False)
    | lowered_titles.str.contains('scikit-learn', regex=False, na=False)
)

# Row labels of the selected projects. Selecting through the mask keeps the
# actual matching rows; looking rows up again by title text would always
# resolve to the first project carrying that title, which is wrong whenever
# two projects share a title.
ml_titles = data.index[ml_mask].tolist()

# Every organisation that has at least one selected project.
ml_orgs = np.unique(np.array(data.loc[ml_mask, 'Organization'].tolist()))
print(ml_orgs)
['Centre for Computational Medicine, SickKids Research Institute' 'Mozilla' 'Open Source Computer Vision Library (OpenCV)' 'Python Software Foundation' 'Shogun Machine Learning Toolbox' 'Sugar Labs' 'Swathanthra Malayalam Computing' 'The OpenCog Foundation' 'Xapian Search Engine Library' 'lmonade: scientific software distribution' 'mlpack: scalable C++ machine learning library']
# Reduce the collected row labels to a sorted array of distinct values, then
# display the descriptive columns of those projects.
ml_titles = np.unique(ml_titles)
columns_to_show = ['Title', 'Organization', 'Student', 'Mentors']
print(data.loc[ml_titles, columns_to_show])
Title \ 150 Development of machine learning methods for mo... 573 Image features for machine learning in VIGRA 599 Implementation of Multi-Class Adaboost algorit... 600 Improvement of automatic benchmarking system 601 Collaborative Filtering Package Improvements 602 Optimization of tree-traversal in mlpack 649 Add learning capability in the Gaia Keyboard p... 704 Learning based trackers 871 Scikit-learn: Improved Linear Models 901 scikit-learn: Locality sensitive Hashing for a... 904 Scikit-learn - Add Sparse Input Support for En... 998 Fundamental Machine Learning Algorithms 999 application to the idea of Variational Learni... 1000 Shogun Missionary & Shogun in Education 1001 Large-Scale Multi-Label Classification 1002 OpenCV Integration and Computer Vision Applica... 1003 large-scale structured prediction with approxi... 1004 Testing and Measuring Variable Interactions Wi... 1005 Essential Deep Learning Modules 1026 Sugar Listens - Speech Recognition within the ... 1039 Improving learning in varnam 1146 AGI Language Learning of Tagalog Morphology\t 1287 Learning to Rank 1292 GSoC 2014 Proposal for Xapian‘s Learning to Ra... Organization Student \ 150 Centre for Computational Medicine, SickKids Re... 
Randy Wei 573 lmonade: scientific software distribution Esteban 599 mlpack: scalable C++ machine learning library Udit Saxena 600 mlpack: scalable C++ machine learning library Anand Soni 601 mlpack: scalable C++ machine learning library Sumedh Kedar Ghaisas 602 mlpack: scalable C++ machine learning library Andrew 649 Mozilla Sukant Garg 704 Open Source Computer Vision Library (OpenCV) Alex Leontiev 871 Python Software Foundation Manoj Kumar 901 Python Software Foundation maheshakya 904 Python Software Foundation Hamzeh Alsalhi 998 Shogun Machine Learning Toolbox Parijat 999 Shogun Machine Learning Toolbox yorkerlin 1000 Shogun Machine Learning Toolbox Saurabh Mahindre 1001 Shogun Machine Learning Toolbox Abinash Panda 1002 Shogun Machine Learning Toolbox Abhijeet Kislay 1003 Shogun Machine Learning Toolbox Jiaolong 1004 Shogun Machine Learning Toolbox Soumyajit De 1005 Shogun Machine Learning Toolbox Khaled Nasr 1026 Sugar Labs Rodrigo Parra 1039 Swathanthra Malayalam Computing Kevin Martin Jose 1146 The OpenCog Foundation Lareina Milambiling 1287 Xapian Search Engine Library Jiarong Wei 1292 Xapian Search Engine Library Hanxiao Sun Mentors 150 Quaid Morris, Shankar Vembu 573 Ullrich Koethe, Burcin Erocal 599 Ryan Curtin 600 Marcus Edel 601 Ryan Curtin 602 Ryan Curtin 649 Jan Jongboom 704 Gary Bradski 871 Jaidev Deshpande, Alex Gramfort 901 Daniel Vainsencher, Robert Layton 904 Arnaud Joly, Vlad NIculae 998 Fernando Iglesias 999 Emtiyaz Khan, Heiko 1000 Heiko 1001 Thoralf 1002 pickle27 1003 hushell 1004 Dino Sejdinovic, Heiko 1005 Sergey Lisitsyn, Theofanis Karaletsos 1026 tchx84 1039 Navaneeth, Hrishikesh K B 1146 Matt Chapman, Linas Vepstas 1287 Richard Boulton 1292 James Aylett [24 rows x 4 columns]
# Number of selected machine learning projects per organisation, in the same
# order as `ml_orgs`. The selected rows are looked up once rather than once
# per organisation; reindex aligns the tallies with `ml_orgs` and reports 0
# for any organisation without a selected project.
ml_orgs = np.array(ml_orgs)
ml_proj_count = (
    data.loc[ml_titles, 'Organization']
    .value_counts()
    .reindex(ml_orgs, fill_value=0)
    .values
)
# Negate the counts so that argsort orders organisations from most to
# fewest projects.
sort_idx = np.argsort(-ml_proj_count)

# Bar chart with one bar per machine learning organisation.
fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(111)
idx = np.arange(len(ml_orgs))
ax.bar(idx, ml_proj_count[sort_idx])
ax.set_xticks(idx)
# The result is assigned only to keep the notebook from echoing it.
dummy = ax.set_xticklabels(ml_orgs[sort_idx], rotation=80, ha='center')