"""Simulate a two-feature binary dataset, fit a logistic regression, plot its ROC.

The two features ('panda' and 'elephant') are independent standard-normal
draws.  The boolean target is sampled with probability
``sigmoid(-1/3 * (panda + elephant))``, so the model being fitted matches the
process that generated the data.
"""

import numpy as np
import pandas as pd

# Number of simulated rows used when the script is run directly.
N_SAMPLES = 100000


def target_probability(panda, elephant):
    """Return P(target is True) for the given feature values.

    Accepts scalars, NumPy arrays or pandas Series (anything supporting
    arithmetic and ``np.exp``) and returns the same kind of object.
    """
    # Float literals on purpose: with integer division (Python 2) the
    # expression ``- 1/3`` evaluates to -1, not -0.333...
    x = -1.0 / 3.0 * (panda + elephant)
    return 1 / (1 + np.exp(-1 * x))


def make_dataset(n_samples=N_SAMPLES, seed=None):
    """Build the simulated DataFrame with columns panda, elephant, target.

    Args:
        n_samples: number of rows to generate.
        seed: optional seed for the random generator; ``None`` gives a
            different dataset on every call.

    Returns:
        A DataFrame with two float feature columns and a boolean 'target'.
    """
    rng = np.random.default_rng(seed)
    dataset = pd.DataFrame({
        'panda': rng.normal(0, 1, n_samples),
        'elephant': rng.normal(0, 1, n_samples),
    })
    transformed = target_probability(dataset['panda'], dataset['elephant'])
    # A uniform draw below p happens with probability p (Bernoulli sampling).
    dataset['target'] = rng.uniform(0, 1, n_samples) < transformed
    return dataset


def fit_and_evaluate(dataset, seed=None):
    """Fit a logistic regression on a train split and score the test split.

    Args:
        dataset: DataFrame as produced by ``make_dataset``.
        seed: optional ``random_state`` for the train/test split.

    Returns:
        ``(cls, target_test, predictions, scores)`` where ``scores`` are the
        predicted probabilities of the positive class on the test split.
    """
    # Imported here so the data-generation helpers stay usable without
    # scikit-learn installed.
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    features = dataset[['elephant', 'panda']]
    target = dataset['target']
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, random_state=seed)

    cls = LogisticRegression()
    cls.fit(features_train, target_train)
    predictions = cls.predict(features_test)
    # Column 1 of predict_proba holds the probability of the second class
    # (True for a boolean target).
    scores = cls.predict_proba(features_test)[:, 1]
    return cls, target_test, predictions, scores


def plot_roc(target_test, scores):
    """Draw the ROC curve for the given true labels and predicted scores."""
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve

    # Replaces the removed pandas option 'display.mpl_style'; 'ggplot' is the
    # matplotlib style sheet suggested as its substitute.
    plt.style.use('ggplot')
    fpr, tpr, thresholds = roc_curve(target_test, scores)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.plot(fpr, tpr)
    plt.show()


def main(n_samples=N_SAMPLES, seed=None):
    """Run the whole experiment: simulate, fit, print metrics, plot ROC."""
    from sklearn.metrics import confusion_matrix

    dataset = make_dataset(n_samples, seed)
    print(dataset.target.value_counts())

    cls, target_test, predictions, scores = fit_and_evaluate(dataset, seed)
    # confusion_matrix expects (y_true, y_pred): rows are the true classes,
    # columns the predicted ones.
    print(confusion_matrix(target_test, predictions))
    plot_roc(target_test, scores)


if __name__ == "__main__":
    main()