#!/usr/bin/env python
# -*- coding: utf-8 -*-
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import scipy
import seaborn as sns

from sklearn import ensemble
from sklearn import feature_extraction
from sklearn import linear_model
from sklearn import pipeline
from sklearn import model_selection  # formerly sklearn.cross_validation
from sklearn import metrics

# Load module that will load the instances
import load

X, y, label_names = load.get_instances_from_directory('data/text')
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=0)

# Peek at a few raw sentences from different parts of the corpus
print(X[0])
print(X[2000])
print(X[1000])
print(X[3000])
print(X[4000])
print(X[6000])
print(X[8000])
print(X[-10])

# Character n-grams (1 to 6 characters) weighted by TF-IDF
vectorizer = feature_extraction.text.TfidfVectorizer(
    ngram_range=(1, 6),
    analyzer='char',
)  # use_idf=False)

pipe = pipeline.Pipeline([
    ('vectorizer', vectorizer),
    ('clf', linear_model.LogisticRegression()),
])

pipe.fit(X_train, y_train)

y_predicted = pipe.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_predicted)

# Predict the result on some short new sentences:
sentences = [
    u'Je ne dis pas ce que je faisais',
    u'Ich habe nicht erzählt, was ich gemacht habe',
    u'Ne yaptığımı söylemedim',
    u'Yo no dije lo que hice',
]

# We can pass the raw sentences rather than precomputed features, pretty neat!
predicted_languages = pipe.predict(sentences)
for sentence, lang in zip(sentences, predicted_languages):
    print(u'{} ----> {}'.format(sentence, label_names[lang]))

plt.figure(figsize=(16, 16))
sns.heatmap(cm, annot=True, fmt='', xticklabels=label_names, yticklabels=label_names);

print(metrics.classification_report(y_test, y_predicted, target_names=label_names))
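
# --- Hypothetical follow-up (not part of the original notebook) ---------------
# A minimal sketch of how one might inspect which character n-grams the fitted
# LogisticRegression relies on for each language. It assumes the pipeline `pipe`
# above has already been fitted, that there are more than two languages (so
# `coef_` has one row per class), and a recent scikit-learn (>= 1.0) for
# `get_feature_names_out()`.
feature_names = np.array(
    pipe.named_steps['vectorizer'].get_feature_names_out())
coefficients = pipe.named_steps['clf'].coef_  # shape: (n_classes, n_features)

for row, class_label in zip(coefficients, pipe.named_steps['clf'].classes_):
    # Indices of the ten n-grams with the largest positive weight for this class
    top = np.argsort(row)[-10:][::-1]
    print(u'{}: {}'.format(label_names[class_label],
                           u', '.join(repr(f) for f in feature_names[top])))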