Google Summer of Code Projects related to machine learning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%pylab inline
Populating the interactive namespace from numpy and matplotlib

The data consists of the title, student name, organisation, and mentor names of each accepted GSoC 2014 project, downloaded as a CSV file from Melange.

In [2]:
# CSV of accepted GSoC 2014 projects (per the file name in the URL).
source = 'https://gist.githubusercontent.com/chengsoonong/dede21b2eefa43b30d14/raw/add9312ea30c0ccc770151cb6188f964d2baa047/gsoc2014_accepted.csv'
data = pd.read_csv(source)
print(data.columns.values)

# Sorted distinct organisations together with the number of projects each one
# has, computed in one call rather than one scan of the column per organisation.
orgs, proj_per_org = np.unique(data['Organization'], return_counts=True)
# Negating the counts makes argsort order them from largest to smallest.
sort_idx = np.argsort(-proj_per_org)

fig = plt.figure(figsize=(15,5))
ax = fig.add_subplot(111)
# Never request more bars than there are organisations.
top_orgs = min(20, len(orgs))
idx = np.arange(top_orgs)
ax.bar(idx, proj_per_org[sort_idx][:top_orgs])
ax.set_xticks(idx)
# Pass exactly one label per tick: handing over every organisation name while
# only `top_orgs` ticks exist is rejected by current matplotlib releases.
dummy = ax.set_xticklabels(orgs[sort_idx][:top_orgs], rotation=80, ha='center')
ax.set_ylabel('Number of projects')
dummy = ax.set_title('Organisations with the most projects')
['Key' 'Title' 'Student' 'Organization' 'Mentors']

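Use some heuristics for identifying machine learning projects: a project is selected if its organisation's name contains "learning", or if its title contains "learning" or "scikit-learn".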

In [3]:
# Heuristic 1: organisations whose name mentions "learning".
ml_orgs = [org for org in orgs if 'learning' in org.lower()]

# Every project of those organisations is selected. The row positions are
# taken directly from the organisation mask; looking a row up again by its
# title would return the first row with that title, which merges projects
# that happen to share a title (possibly across organisations).
# NOTE: positions equal index labels only for a default RangeIndex, which the
# later `data.loc[ml_titles]` lookups rely on.
ml_titles = []
for org in ml_orgs:
    ml_titles.extend(np.flatnonzero(data['Organization'] == org))

# Heuristic 2: titles that mention "learning" or "scikit-learn", whatever the
# organisation. The organisation of each hit is recorded as well.
titles = data['Title']
for idx, title in enumerate(titles):
    # str() keeps a missing (NaN) title from raising on .lower().
    title_lower = str(title).lower()
    if ('learning' in title_lower) or ('scikit-learn' in title_lower):
        # `idx` already is this row's position, so no title lookup is needed.
        ml_titles.append(idx)
        ml_orgs.append(data['Organization'].iloc[idx])
In [4]:
# Collapse the collected organisation names into a sorted array without
# repeats (an organisation can be collected by both heuristics).
ml_orgs = np.unique(np.asarray(ml_orgs))
print(ml_orgs)
['Centre for Computational Medicine, SickKids Research Institute' 'Mozilla'
 'Open Source Computer Vision Library (OpenCV)'
 'Python Software Foundation' 'Shogun Machine Learning Toolbox'
 'Sugar Labs' 'Swathanthra Malayalam Computing' 'The OpenCog Foundation'
 'Xapian Search Engine Library' 'lmonade: scientific software distribution'
 'mlpack: scalable C++ machine learning library']
In [5]:
# Deduplicate the collected row indices; np.unique also returns them sorted.
ml_titles = np.unique(np.asarray(ml_titles))
# Show the selected projects, restricted to the descriptive columns.
shown_columns = ['Title', 'Organization', 'Student', 'Mentors']
print(data.loc[ml_titles, shown_columns])
                                                  Title  \
150   Development of machine learning methods for mo...   
573        Image features for machine learning in VIGRA   
599   Implementation of Multi-Class Adaboost algorit...   
600        Improvement of automatic benchmarking system   
601        Collaborative Filtering Package Improvements   
602            Optimization of tree-traversal in mlpack   
649   Add learning capability in the Gaia Keyboard p...   
704                             Learning based trackers   
871                Scikit-learn: Improved Linear Models   
901   scikit-learn: Locality sensitive Hashing for a...   
904   Scikit-learn - Add Sparse Input Support for En...   
998             Fundamental Machine Learning Algorithms   
999   application to the idea of  Variational Learni...   
1000            Shogun Missionary & Shogun in Education   
1001             Large-Scale Multi-Label Classification   
1002  OpenCV Integration and Computer Vision Applica...   
1003  large-scale structured prediction with approxi...   
1004  Testing and Measuring Variable Interactions Wi...   
1005                    Essential Deep Learning Modules   
1026  Sugar Listens - Speech Recognition within the ...   
1039                       Improving learning in varnam   
1146      AGI Language Learning of Tagalog Morphology\t   
1287                                   Learning to Rank   
1292  GSoC 2014 Proposal for Xapian‘s Learning to Ra...   

                                           Organization               Student  \
150   Centre for Computational Medicine, SickKids Re...             Randy Wei   
573           lmonade: scientific software distribution               Esteban   
599       mlpack: scalable C++ machine learning library           Udit Saxena   
600       mlpack: scalable C++ machine learning library            Anand Soni   
601       mlpack: scalable C++ machine learning library  Sumedh Kedar Ghaisas   
602       mlpack: scalable C++ machine learning library                Andrew   
649                                             Mozilla           Sukant Garg   
704        Open Source Computer Vision Library (OpenCV)         Alex Leontiev   
871                          Python Software Foundation           Manoj Kumar   
901                          Python Software Foundation            maheshakya   
904                          Python Software Foundation        Hamzeh Alsalhi   
998                     Shogun Machine Learning Toolbox               Parijat   
999                     Shogun Machine Learning Toolbox             yorkerlin   
1000                    Shogun Machine Learning Toolbox      Saurabh Mahindre   
1001                    Shogun Machine Learning Toolbox         Abinash Panda   
1002                    Shogun Machine Learning Toolbox       Abhijeet Kislay   
1003                    Shogun Machine Learning Toolbox              Jiaolong   
1004                    Shogun Machine Learning Toolbox          Soumyajit De   
1005                    Shogun Machine Learning Toolbox           Khaled Nasr   
1026                                         Sugar Labs         Rodrigo Parra   
1039                    Swathanthra Malayalam Computing     Kevin Martin Jose   
1146                             The OpenCog Foundation   Lareina Milambiling   
1287                       Xapian Search Engine Library           Jiarong Wei   
1292                       Xapian Search Engine Library           Hanxiao Sun   

                                    Mentors  
150             Quaid Morris, Shankar Vembu  
573           Ullrich Koethe, Burcin Erocal  
599                             Ryan Curtin  
600                             Marcus Edel  
601                             Ryan Curtin  
602                             Ryan Curtin  
649                            Jan Jongboom  
704                            Gary Bradski  
871         Jaidev Deshpande, Alex Gramfort  
901       Daniel Vainsencher, Robert Layton  
904               Arnaud Joly, Vlad NIculae  
998                       Fernando Iglesias  
999                     Emtiyaz Khan, Heiko  
1000                                  Heiko  
1001                                Thoralf  
1002                               pickle27  
1003                                hushell  
1004                 Dino Sejdinovic, Heiko  
1005  Sergey Lisitsyn, Theofanis Karaletsos  
1026                                 tchx84  
1039              Navaneeth, Hrishikesh K B  
1146            Matt Chapman, Linas Vepstas  
1287                        Richard Boulton  
1292                           James Aylett  

[24 rows x 4 columns]
In [6]:
# Organisation of every selected project, looked up once and reused for all
# the per-organisation counts below.
selected_orgs = data.loc[ml_titles]['Organization']
ml_proj_count = np.array(
    [int((selected_orgs == org).sum()) for org in ml_orgs]
)
ml_orgs = np.array(ml_orgs)
# Negated counts: argsort then orders organisations from most to fewest projects.
sort_idx = np.argsort(-ml_proj_count)

# One bar per organisation, in descending order of selected projects.
fig, ax = plt.subplots(figsize=(10, 5))
idx = np.arange(len(ml_orgs))
ax.bar(idx, ml_proj_count[sort_idx])
ax.set_xticks(idx)
dummy = ax.set_xticklabels(ml_orgs[sort_idx], rotation=80)
In [6]:
 
In [6]: