scikit-learn
¶pandas
import numpy as np
import pandas as pd
import matplotlib
train_pd = pd.read_csv("data/train.csv")
test_pd = pd.read_csv("data/test.csv")
train_pd.shape
(61878, 95)
test_pd.shape
(144368, 94)
train_pd.head()
id | feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | ... | feat_85 | feat_86 | feat_87 | feat_88 | feat_89 | feat_90 | feat_91 | feat_92 | feat_93 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
3 | 4 | 1 | 0 | 0 | 1 | 6 | 1 | 5 | 0 | 0 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
4 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | Class_1 |
5 rows × 95 columns
train_pd.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 61878 entries, 0 to 61877 Data columns (total 95 columns): id 61878 non-null int64 feat_1 61878 non-null int64 feat_2 61878 non-null int64 feat_3 61878 non-null int64 feat_4 61878 non-null int64 feat_5 61878 non-null int64 feat_6 61878 non-null int64 feat_7 61878 non-null int64 feat_8 61878 non-null int64 feat_9 61878 non-null int64 feat_10 61878 non-null int64 feat_11 61878 non-null int64 feat_12 61878 non-null int64 feat_13 61878 non-null int64 feat_14 61878 non-null int64 feat_15 61878 non-null int64 feat_16 61878 non-null int64 feat_17 61878 non-null int64 feat_18 61878 non-null int64 feat_19 61878 non-null int64 feat_20 61878 non-null int64 feat_21 61878 non-null int64 feat_22 61878 non-null int64 feat_23 61878 non-null int64 feat_24 61878 non-null int64 feat_25 61878 non-null int64 feat_26 61878 non-null int64 feat_27 61878 non-null int64 feat_28 61878 non-null int64 feat_29 61878 non-null int64 feat_30 61878 non-null int64 feat_31 61878 non-null int64 feat_32 61878 non-null int64 feat_33 61878 non-null int64 feat_34 61878 non-null int64 feat_35 61878 non-null int64 feat_36 61878 non-null int64 feat_37 61878 non-null int64 feat_38 61878 non-null int64 feat_39 61878 non-null int64 feat_40 61878 non-null int64 feat_41 61878 non-null int64 feat_42 61878 non-null int64 feat_43 61878 non-null int64 feat_44 61878 non-null int64 feat_45 61878 non-null int64 feat_46 61878 non-null int64 feat_47 61878 non-null int64 feat_48 61878 non-null int64 feat_49 61878 non-null int64 feat_50 61878 non-null int64 feat_51 61878 non-null int64 feat_52 61878 non-null int64 feat_53 61878 non-null int64 feat_54 61878 non-null int64 feat_55 61878 non-null int64 feat_56 61878 non-null int64 feat_57 61878 non-null int64 feat_58 61878 non-null int64 feat_59 61878 non-null int64 feat_60 61878 non-null int64 feat_61 61878 non-null int64 feat_62 61878 non-null int64 feat_63 61878 non-null int64 feat_64 61878 non-null int64 feat_65 61878 non-null int64 feat_66 61878 non-null int64 feat_67 61878 non-null int64 feat_68 61878 non-null int64 feat_69 61878 non-null int64 feat_70 61878 non-null int64 feat_71 61878 non-null int64 feat_72 61878 non-null int64 feat_73 61878 non-null int64 feat_74 61878 non-null int64 feat_75 61878 non-null int64 feat_76 61878 non-null int64 feat_77 61878 non-null int64 feat_78 61878 non-null int64 feat_79 61878 non-null int64 feat_80 61878 non-null int64 feat_81 61878 non-null int64 feat_82 61878 non-null int64 feat_83 61878 non-null int64 feat_84 61878 non-null int64 feat_85 61878 non-null int64 feat_86 61878 non-null int64 feat_87 61878 non-null int64 feat_88 61878 non-null int64 feat_89 61878 non-null int64 feat_90 61878 non-null int64 feat_91 61878 non-null int64 feat_92 61878 non-null int64 feat_93 61878 non-null int64 target 61878 non-null object dtypes: int64(94), object(1) memory usage: 45.3+ MB
train_pd.describe()
id | feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | ... | feat_84 | feat_85 | feat_86 | feat_87 | feat_88 | feat_89 | feat_90 | feat_91 | feat_92 | feat_93 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 61878.000000 | 61878.00000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | ... | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 |
mean | 30939.500000 | 0.38668 | 0.263066 | 0.901467 | 0.779081 | 0.071043 | 0.025696 | 0.193704 | 0.662433 | 1.011296 | ... | 0.070752 | 0.532306 | 1.128576 | 0.393549 | 0.874915 | 0.457772 | 0.812421 | 0.264941 | 0.380119 | 0.126135 |
std | 17862.784315 | 1.52533 | 1.252073 | 2.934818 | 2.788005 | 0.438902 | 0.215333 | 1.030102 | 2.255770 | 3.474822 | ... | 1.151460 | 1.900438 | 2.681554 | 1.575455 | 2.115466 | 1.527385 | 4.597804 | 2.045646 | 0.982385 | 1.201720 |
min | 1.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 15470.250000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 30939.500000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 46408.750000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
max | 61878.000000 | 61.00000 | 51.000000 | 64.000000 | 70.000000 | 19.000000 | 10.000000 | 38.000000 | 76.000000 | 43.000000 | ... | 76.000000 | 55.000000 | 65.000000 | 67.000000 | 30.000000 | 61.000000 | 130.000000 | 52.000000 | 19.000000 | 87.000000 |
8 rows × 94 columns
train_pd.target.unique()
array(['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6', 'Class_7', 'Class_8', 'Class_9'], dtype=object)
train_pd.target.value_counts()
Class_2 16122 Class_6 14135 Class_8 8464 Class_3 8004 Class_9 4955 Class_7 2839 Class_5 2739 Class_4 2691 Class_1 1929 dtype: int64
from sklearn import preprocessing
labels = train_pd.target.values
enc = preprocessing.LabelBinarizer()
binarized_labels = enc.fit_transform(labels)
binarized_labels[0:10]
array([[1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0]])
train_pd[train_pd.target == "Class_2"]
id | feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | ... | feat_85 | feat_86 | feat_87 | feat_88 | feat_89 | feat_90 | feat_91 | feat_92 | feat_93 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1929 | 1930 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 29 | ... | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1930 | 1931 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1931 | 1932 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | Class_2 |
1932 | 1933 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 12 | ... | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1933 | 1934 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 18 | ... | 0 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1934 | 1935 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1935 | 1936 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 3 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | Class_2 |
1936 | 1937 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | Class_2 |
1937 | 1938 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13 | ... | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Class_2 |
1938 | 1939 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 4 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1939 | 1940 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1940 | 1941 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1941 | 1942 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1942 | 1943 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1943 | 1944 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | Class_2 |
1944 | 1945 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | ... | 4 | 4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1945 | 1946 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1946 | 1947 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 2 | 0 | 1 | 0 | 0 | Class_2 |
1947 | 1948 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | Class_2 |
1948 | 1949 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 3 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1949 | 1950 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1950 | 1951 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1951 | 1952 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | Class_2 |
1952 | 1953 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | Class_2 |
1953 | 1954 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1954 | 1955 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1955 | 1956 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 2 | 0 | Class_2 |
1956 | 1957 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | ... | 0 | 3 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | Class_2 |
1957 | 1958 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 4 | 0 | 0 | 2 | 0 | 0 | 0 | 1 | 0 | Class_2 |
1958 | 1959 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
18021 | 18022 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18022 | 18023 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 2 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18023 | 18024 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | Class_2 |
18024 | 18025 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18025 | 18026 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | Class_2 |
18026 | 18027 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | Class_2 |
18027 | 18028 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 2 | ... | 1 | 8 | 1 | 9 | 0 | 0 | 0 | 1 | 0 | Class_2 |
18028 | 18029 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | Class_2 |
18029 | 18030 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 4 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18030 | 18031 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18031 | 18032 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2 | 1 | 1 | 0 | 2 | 0 | 0 | 0 | 0 | Class_2 |
18032 | 18033 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18033 | 18034 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | Class_2 |
18034 | 18035 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18035 | 18036 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 7 | 2 | 0 | 0 | 2 | 0 | Class_2 |
18036 | 18037 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | ... | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | Class_2 |
18037 | 18038 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 2 | ... | 0 | 0 | 1 | 4 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18038 | 18039 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 14 | ... | 0 | 1 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18039 | 18040 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18040 | 18041 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18041 | 18042 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18042 | 18043 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18043 | 18044 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | Class_2 |
18044 | 18045 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18045 | 18046 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | Class_2 |
18046 | 18047 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18047 | 18048 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18048 | 18049 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18049 | 18050 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_2 |
18050 | 18051 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | Class_2 |
16122 rows × 95 columns
binarized_labels[1920:1940]
array([[1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0]])
target = binarized_labels[:,1]
train_pd = train_pd.drop("id", axis=1)
train_pd = train_pd.drop("target", axis=1)
from sklearn import linear_model
ols = linear_model.LinearRegression(normalize=True, fit_intercept=True)
%timeit -n 1 ols.fit(train_pd, target, n_jobs=-1)
1 loops, best of 3: 926 ms per loop
ols.coef_
array([ -3.14982300e-03, 3.47221514e-03, -1.42229541e-03, -4.14763764e-04, 3.42997381e-03, 6.25192239e-03, 3.43866768e-03, -4.29012175e-03, 1.12415154e-02, -3.26559447e-03, -1.69684512e-02, 5.55673596e-03, -3.15315231e-03, 1.71719689e-02, 1.03620445e-02, 1.53954874e-03, -6.24911626e-03, 7.22623491e-04, -1.77014337e-03, -7.30082475e-03, 6.00340306e-04, -4.60302936e-03, -9.30699965e-03, 1.42825335e-03, 1.99779617e-02, -1.10315018e-02, -9.65080310e-04, -3.27765172e-03, -7.58392354e-03, -7.05775582e-04, 5.84541856e-04, -6.44978225e-03, 2.80796165e-02, -1.24145829e-02, -1.30099716e-03, -1.04845101e-02, -1.18546643e-02, -9.15970913e-04, -5.20614655e-03, 1.07416153e-02, -1.52159677e-02, -1.28705193e-02, -4.76401848e-03, -2.54190978e-04, -8.70311908e-06, -9.93784950e-04, -3.81085921e-03, 1.85208275e-02, 1.42715351e-04, -5.60039913e-03, -2.92715309e-03, -6.53707392e-04, -1.05888156e-02, 1.91483612e-04, -2.93958998e-03, -9.73075960e-03, -5.11414048e-03, 2.18044328e-04, -7.21058124e-03, -1.07665493e-02, -2.31556376e-03, -5.68270603e-03, 6.66723411e-03, 4.30045356e-03, 1.86049799e-04, -3.31222071e-03, -2.28423505e-03, -1.38809907e-02, -4.38277506e-03, -3.50334420e-03, -1.76637776e-02, 1.67441916e-02, -7.52699632e-04, -3.81691733e-04, -9.81741192e-03, -7.72718001e-04, 1.45962327e-02, -8.16732889e-04, -1.13025561e-02, -1.33054605e-03, -1.87685148e-03, -2.13146999e-03, -3.64391781e-03, 1.59689682e-03, -2.83951023e-03, -8.29384211e-04, -1.98064820e-03, -6.76555868e-05, -2.32347000e-03, 3.05865542e-05, -4.90843920e-03, -3.47720141e-03, -1.18598834e-04])
ols_predict = ols.predict(train_pd)
ols_predict
array([ 0.01636644, 0.24374971, 0.27022419, ..., 0.22284388, 0.08496076, -0.01905893])
ols_predict_raw = ols_predict.copy()
ols_predict[ols_predict > 0.5] = 1
ols_predict[ols_predict <= 0.5] = 0
ols_predict
array([ 0., 0., 0., ..., 0., 0., 0.])
ols_model_analysis = pd.concat([pd.Series(target), pd.Series(ols_predict)], axis=1)
ols_model_analysis.columns = ['actual', 'prediction']
ols_model_analysis[ols_model_analysis.actual == 1]
actual | prediction | |
---|---|---|
1929 | 1 | 1 |
1930 | 1 | 0 |
1931 | 1 | 0 |
1932 | 1 | 0 |
1933 | 1 | 1 |
1934 | 1 | 0 |
1935 | 1 | 0 |
1936 | 1 | 0 |
1937 | 1 | 0 |
1938 | 1 | 0 |
1939 | 1 | 1 |
1940 | 1 | 0 |
1941 | 1 | 0 |
1942 | 1 | 1 |
1943 | 1 | 1 |
1944 | 1 | 0 |
1945 | 1 | 1 |
1946 | 1 | 1 |
1947 | 1 | 0 |
1948 | 1 | 0 |
1949 | 1 | 0 |
1950 | 1 | 0 |
1951 | 1 | 0 |
1952 | 1 | 1 |
1953 | 1 | 0 |
1954 | 1 | 0 |
1955 | 1 | 1 |
1956 | 1 | 1 |
1957 | 1 | 1 |
1958 | 1 | 0 |
... | ... | ... |
18021 | 1 | 0 |
18022 | 1 | 1 |
18023 | 1 | 0 |
18024 | 1 | 1 |
18025 | 1 | 0 |
18026 | 1 | 0 |
18027 | 1 | 1 |
18028 | 1 | 0 |
18029 | 1 | 0 |
18030 | 1 | 0 |
18031 | 1 | 1 |
18032 | 1 | 0 |
18033 | 1 | 1 |
18034 | 1 | 0 |
18035 | 1 | 1 |
18036 | 1 | 0 |
18037 | 1 | 1 |
18038 | 1 | 0 |
18039 | 1 | 0 |
18040 | 1 | 1 |
18041 | 1 | 1 |
18042 | 1 | 0 |
18043 | 1 | 1 |
18044 | 1 | 0 |
18045 | 1 | 0 |
18046 | 1 | 1 |
18047 | 1 | 0 |
18048 | 1 | 0 |
18049 | 1 | 0 |
18050 | 1 | 1 |
16122 rows × 2 columns
true_positives = ols_model_analysis[(ols_model_analysis.actual == 1) & (ols_model_analysis.prediction == 1)]\
.sum()[0]
true_positives
7366.0
true_negatives = ols_model_analysis[(ols_model_analysis.actual == 0) & (ols_model_analysis.prediction == 0)]\
.sum()[0]
true_negatives
0.0
false_positives = ols_model_analysis[(ols_model_analysis.actual == 0) & (ols_model_analysis.prediction == 1)]\
.sum()[0]
false_positives
0.0
false_negatives = ols_model_analysis[(ols_model_analysis.actual == 1) & (ols_model_analysis.prediction == 0)]\
.sum()[0]
false_negatives
8756.0
precision = true_positives / (true_positives + false_positives)
precision
1.0
recall = true_positives / (true_positives + false_negatives)
recall
0.45689120456519045
from sklearn import metrics
%pylab inline
Populating the interactive namespace from numpy and matplotlib
ols_auc = metrics.roc_auc_score(ols_model_analysis.actual, ols_predict_raw)
fpr, tpr, thresholds = metrics.roc_curve(ols_model_analysis.actual, ols_predict_raw)
pyplot.plot(fpr, tpr)
pyplot.plot([0,1],[0,1])
[<matplotlib.lines.Line2D at 0x116352350>]
fpr, tpr, thresholds = metrics.roc_curve(ols_model_analysis.actual, ols_model_analysis.prediction)
pyplot.plot(fpr, tpr)
pyplot.plot([0,1],[0,1])
[<matplotlib.lines.Line2D at 0x11636e250>]
ols_auc
0.89794512670396931
ols_f1 = 2*true_positives / (2*true_positives + false_positives + false_negatives)
ols_f1
0.62721389645776571
from sklearn.cross_validation import train_test_split
sample_train, sample_validate, sample_train_target, sample_validate_target = train_test_split(train_pd, target, test_size = 0.2, random_state = 123)
sample_train.shape
(49502, 93)
sample_validate.shape
(12376, 93)
sample_train_target.shape
(49502,)
sample_validate_target.shape
(12376,)
ols = linear_model.LinearRegression(normalize=True)
%timeit -n 1 ols.fit(sample_train, sample_train_target, n_jobs=-1)
ols_sample_predict = ols.predict(sample_validate)
ols_predict_raw = ols_sample_predict.copy()
ols_sample_predict[ols_sample_predict > 0.5] = 1
ols_sample_predict[ols_sample_predict <= 0.5] = 0
ols_model_analysis = pd.concat([pd.Series(sample_validate_target), pd.Series(ols_sample_predict)], axis=1)
ols_model_analysis.columns = ['actual', 'prediction']
true_positives = ols_model_analysis[(ols_model_analysis.actual == 1) & (ols_model_analysis.prediction == 1)]\
.sum()[0]
true_negatives = ols_model_analysis[(ols_model_analysis.actual == 0) & (ols_model_analysis.prediction == 0)]\
.sum()[0]
false_positives = ols_model_analysis[(ols_model_analysis.actual == 0) & (ols_model_analysis.prediction == 1)]\
.sum()[0]
false_negatives = ols_model_analysis[(ols_model_analysis.actual == 1) & (ols_model_analysis.prediction == 0)]\
.sum()[0]
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
ols_auc = metrics.roc_auc_score(ols_model_analysis.actual, ols_model_analysis.prediction)
ols_f1 = 2*true_positives / (2*true_positives + false_positives + false_negatives)
1 loops, best of 3: 705 ms per loop
print precision
print recall
print ols_auc
print ols_f1
1.0 0.45322180917 0.686438189237 0.623747601791
fpr, tpr, thresholds = metrics.roc_curve(ols_model_analysis.actual, ols_predict_raw)
pyplot.plot(fpr, tpr)
pyplot.plot([0,1],[0,1])
fpr, tpr, thresholds = metrics.roc_curve(ols_model_analysis.actual, ols_model_analysis.prediction)
pyplot.plot(fpr, tpr)
pyplot.plot([0,1],[0,1])
[<matplotlib.lines.Line2D at 0x1129ec610>]
sklearn
has a cross_validation
module to create the k-fold datasets¶from sklearn.cross_validation import KFold
ols_kf = KFold(n=train_pd.shape[0], n_folds=5, shuffle=True)
ols_kf
sklearn.cross_validation.KFold(n=61878, n_folds=5, shuffle=True, random_state=None)
ols_cv_metrics = []
for train_index, validate_index in ols_kf:
sample_train, sample_validate = train_pd.loc[train_index], train_pd.loc[validate_index]
sample_train_target, sample_validate_target = target[train_index], target[validate_index]
ols = linear_model.LinearRegression(normalize=True)
%timeit -n 1 ols.fit(sample_train, sample_train_target, n_jobs=-1)
ols_sample_predict = ols.predict(sample_validate)
ols_predict_raw = ols_sample_predict.copy()
ols_sample_predict[ols_sample_predict > 0.5] = 1
ols_sample_predict[ols_sample_predict <= 0.5] = 0
ols_model_analysis = pd.concat([pd.Series(sample_validate_target), pd.Series(ols_sample_predict)], axis=1)
ols_model_analysis.columns = ['actual', 'prediction']
true_positives = ols_model_analysis[(ols_model_analysis.actual == 1) & (ols_model_analysis.prediction == 1)]\
.sum()[0]
true_negatives = ols_model_analysis[(ols_model_analysis.actual == 0) & (ols_model_analysis.prediction == 0)]\
.sum()[0]
false_positives = ols_model_analysis[(ols_model_analysis.actual == 0) & (ols_model_analysis.prediction == 1)]\
.sum()[0]
false_negatives = ols_model_analysis[(ols_model_analysis.actual == 1) & (ols_model_analysis.prediction == 0)]\
.sum()[0]
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
ols_auc = metrics.roc_auc_score(ols_model_analysis.actual, ols_model_analysis.prediction)
ols_f1 = 2*true_positives / (2*true_positives + false_positives + false_negatives)
ols_cv_metrics.append((precision, recall, ols_auc, ols_f1))
1 loops, best of 3: 696 ms per loop 1 loops, best of 3: 712 ms per loop 1 loops, best of 3: 681 ms per loop 1 loops, best of 3: 657 ms per loop 1 loops, best of 3: 658 ms per loop
ols_metric_pd = pd.DataFrame(ols_cv_metrics).mean()
ols_metrics_pd = pd.DataFrame(pd.DataFrame(ols_metric_pd).T)
ols_metrics_pd.columns=['precision', 'recall', 'auc', 'f1']
print ols_metrics_pd
precision recall auc f1 0 1 0.455974 0.689491 0.626312
fpr, tpr, thresholds = metrics.roc_curve(ols_model_analysis.actual, ols_predict_raw)
pyplot.plot(fpr, tpr)
pyplot.plot([0,1],[0,1])
[<matplotlib.lines.Line2D at 0x112d08ed0>]
from sklearn.linear_model import Ridge
ridge = Ridge(normalize=True)
%timeit -n 1 ridge.fit(sample_train, sample_train_target)
1 loops, best of 3: 102 ms per loop
ridge_sample_predict = ridge.predict(sample_validate)
ridge_sample_predict
array([ 0.23847159, 0.09869083, 0.35982685, ..., 0.11722965, 0.1865065 , 0.07968807])
ridge_sample_predict[ridge_sample_predict > 0.5] = 1
ridge_sample_predict[ridge_sample_predict <= 0.5] = 0
ridge_sample_predict
array([ 0., 0., 0., ..., 0., 0., 0.])
ridge_model_analysis = pd.concat([pd.Series(sample_validate_target), pd.Series(ridge_sample_predict)], axis=1)
ridge_model_analysis.columns = ['actual', 'prediction']
ridge_model_analysis.head()
actual | prediction | |
---|---|---|
0 | 0 | 0 |
1 | 0 | 0 |
2 | 0 | 0 |
3 | 0 | 0 |
4 | 0 | 0 |
ridge_model_analysis[ridge_model_analysis.prediction == 1]
actual | prediction | |
---|---|---|
383 | 1 | 1 |
384 | 1 | 1 |
398 | 1 | 1 |
400 | 1 | 1 |
401 | 1 | 1 |
406 | 1 | 1 |
416 | 1 | 1 |
423 | 1 | 1 |
424 | 1 | 1 |
425 | 1 | 1 |
427 | 1 | 1 |
436 | 1 | 1 |
439 | 1 | 1 |
440 | 1 | 1 |
441 | 1 | 1 |
442 | 1 | 1 |
445 | 1 | 1 |
448 | 1 | 1 |
453 | 1 | 1 |
454 | 1 | 1 |
455 | 1 | 1 |
459 | 1 | 1 |
460 | 1 | 1 |
463 | 1 | 1 |
469 | 1 | 1 |
477 | 1 | 1 |
478 | 1 | 1 |
479 | 1 | 1 |
498 | 1 | 1 |
499 | 1 | 1 |
... | ... | ... |
5693 | 0 | 1 |
5698 | 0 | 1 |
5701 | 0 | 1 |
5702 | 0 | 1 |
5705 | 0 | 1 |
5713 | 0 | 1 |
5717 | 0 | 1 |
5719 | 0 | 1 |
5721 | 0 | 1 |
5727 | 0 | 1 |
5730 | 0 | 1 |
5731 | 0 | 1 |
5742 | 0 | 1 |
5757 | 0 | 1 |
5768 | 0 | 1 |
5771 | 0 | 1 |
5776 | 0 | 1 |
5780 | 0 | 1 |
5782 | 0 | 1 |
5785 | 0 | 1 |
5901 | 0 | 1 |
6027 | 0 | 1 |
6073 | 0 | 1 |
9186 | 0 | 1 |
9324 | 0 | 1 |
9329 | 0 | 1 |
9378 | 0 | 1 |
9636 | 0 | 1 |
10420 | 0 | 1 |
10977 | 0 | 1 |
1363 rows × 2 columns
true_positives = ridge_model_analysis[(ridge_model_analysis.actual == 1) & (ridge_model_analysis.prediction == 1)]\
.sum()[0]
true_negatives = ridge_model_analysis[(ridge_model_analysis.actual == 0) & (ridge_model_analysis.prediction == 0)]\
.sum()[0]
false_positives = ridge_model_analysis[(ridge_model_analysis.actual == 0) & (ridge_model_analysis.prediction == 1)]\
.sum()[0]
false_negatives = ridge_model_analysis[(ridge_model_analysis.actual == 1) & (ridge_model_analysis.prediction == 0)]\
.sum()[0]
precision = true_positives / (true_positives + false_negatives)
recall = true_positives / (true_positives + false_positives)
ridge_auc = metrics.roc_auc_score(ridge_model_analysis.actual, ridge_model_analysis.prediction)
ridge_f1 = 2*(true_positives) / (2*true_positives + false_positives + false_negatives)
print precision
print recall
print ridge_auc
print ridge_f1
0.278325123153 1.0 0.614017387916 0.435452793834
#Creating a function for the cross-validation process.
def cross_validate(clf, train_pd, target):
predict_raw = None
clf_cv_metrics = []
train, validate, train_target, validate_target = train_test_split(train_pd, target, test_size = 0.2)
kf = KFold(n=train_pd.shape[0], n_folds=10, shuffle=True)
for train_index, validate_index in kf:
sample_train, sample_validate = train_pd.loc[train_index], train_pd.loc[validate_index]
sample_train_target, sample_validate_target = target[train_index], target[validate_index]
#ols = linear_model.LinearRegression(normalize=True)
#print clf
clf.fit(train, train_target)
predict = clf.predict(validate)
predict_raw
predict_raw = predict.copy()
predict[predict > 0.5] = 1
predict[predict <= 0.5] = 0
clf_model_analysis = pd.concat([pd.Series(validate_target), pd.Series(predict)], axis=1)
clf_model_analysis.columns = ['actual', 'prediction']
true_positives = clf_model_analysis[(clf_model_analysis.actual == 1) & (clf_model_analysis.prediction == 1)]\
.sum()[0]
#print "tp ", true_positives
true_negatives = clf_model_analysis[(clf_model_analysis.actual == 0) & (clf_model_analysis.prediction == 0)]\
.sum()[0]
#print "tn ", true_negatives
false_positives = clf_model_analysis[(clf_model_analysis.actual == 0) & (clf_model_analysis.prediction == 1)]\
.sum()[0]
#print "fp ", false_positives
false_negatives = clf_model_analysis[(clf_model_analysis.actual == 1) & (clf_model_analysis.prediction == 0)]\
.sum()[0]
#print "fn ", false_negatives
precision = float(true_positives) / float(true_positives + false_positives)
recall = float(true_positives) / float(true_positives + false_negatives)
clf_auc = metrics.roc_auc_score(clf_model_analysis.actual, clf_model_analysis.prediction)
clf_f1 = float(2*true_positives) / float(2*true_positives + false_positives + false_negatives)
clf_cv_metrics.append((precision, recall, clf_auc, clf_f1))
fpr, tpr, thresholds = metrics.roc_curve(clf_model_analysis.actual, predict_raw)
pyplot.plot(fpr, tpr)
pyplot.plot([0,1],[0,1])
return clf_cv_metrics
ridge = Ridge(normalize=True)
ridge_cv_metrics = cross_validate(ridge, train_pd, target)
ridge_metrics = pd.DataFrame(ridge_cv_metrics).mean()
ridge_metrics_pd = pd.DataFrame(pd.DataFrame(ridge_metrics).T)
ridge_metrics_pd.columns=['precision', 'recall', 'auc', 'f1']
print ridge_metrics_pd
precision recall auc f1 0 1 0.270928 0.613244 0.426347
ols_metrics_pd
precision | recall | auc | f1 | |
---|---|---|---|---|
0 | 1 | 0.455974 | 0.689491 | 0.626312 |
lasso = linear_model.Lasso(alpha=0.1, selection='random')
lasso_cv_metrics = cross_validate(lasso,train_pd, target)
lasso_cv_metrics = pd.DataFrame(lasso_cv_metrics).mean()
lasso_cv_metrics_pd = pd.DataFrame(pd.DataFrame(lasso_cv_metrics).T)
lasso_cv_metrics_pd.columns=['precision', 'recall', 'auc', 'f1']
print lasso_cv_metrics_pd
precision recall auc f1 0 1 0.237833 0.593963 0.384273
log_reg_l2 = linear_model.LogisticRegression(C=1, penalty='l2')
log_reg_l2_cv = cross_validate(log_reg_l2, train_pd, target)
log_reg_l2_cv_metrics = pd.DataFrame(log_reg_l2_cv).mean()
log_reg_l2_metrics_pd = pd.DataFrame(pd.DataFrame(log_reg_l2_cv_metrics).T)
log_reg_l2_metrics_pd.columns=['precision', 'recall', 'auc', 'f1']
print log_reg_l2_metrics_pd
precision recall auc f1 0 1 0.714417 0.799443 0.833423
log_reg_l1 = linear_model.LogisticRegression(C=0.1, penalty='l1')
log_reg_l1_cv_metrics = cross_validate(log_reg_l1, train_pd, target)
log_reg_l1_metrics = pd.DataFrame(log_reg_l1_cv_metrics).mean()
log_reg_l1_metrics_pd = pd.DataFrame(pd.DataFrame(log_reg_l1_metrics).T)
log_reg_l1_metrics_pd.columns=['precision', 'recall', 'auc', 'f1']
print log_reg_l1_metrics_pd
precision recall auc f1 0 1 0.718546 0.801629 0.836226
from sklearn.tree import DecisionTreeClassifier, export_graphviz
dtc = DecisionTreeClassifier(max_depth=8)
dtc.fit(train_pd, target)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8, max_features=None, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, random_state=None, splitter='best')
dtc_cv_metric = cross_validate(dtc, train_pd, target)
dtc_metrics = pd.DataFrame(dtc_cv_metric).mean()
dtc_metrics_pd = pd.DataFrame(pd.DataFrame(dtc_metrics).T)
dtc_metrics_pd.columns=['precision', 'recall', 'auc', 'f1']
print dtc_metrics_pd
precision recall auc f1 0 1 0.708548 0.79364 0.829416
export_graphviz(dtc, feature_names=train_pd.columns)
!dot -Tpng tree.dot -o tree.png
from IPython.display import Image
Image('tree.png', unconfined=True)
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_jobs=-1)
rf_clf_cv_metric = cross_validate(rf_clf, train_pd, target)
rf_clf_cv_metrics = pd.DataFrame(rf_clf_cv_metric).mean()
rf_clf_cv_metrics_pd = pd.DataFrame(pd.DataFrame(rf_clf_cv_metrics).T)
rf_clf_cv_metrics_pd.columns=['precision', 'recall', 'auc', 'f1']
print rf_clf_cv_metrics_pd
precision recall auc f1 0 1 0.685422 0.810167 0.813335
frames = [ols_metrics_pd, ridge_metrics_pd, lasso_cv_metrics_pd, log_reg_l1_metrics_pd, log_reg_l2_metrics_pd, dtc_metrics_pd, rf_clf_cv_metrics_pd]
model_analysis = pd.concat(frames)
model_analysis['model_name'] = ['Linear Regression', 'L2 Linear Reg', 'L1 Linear Reg', 'L1 Logistic Reg', 'L2 Logistic Reg', 'Decision Tree', 'Random Forest']
model_analysis
precision | recall | auc | f1 | model_name | |
---|---|---|---|---|---|
0 | 1 | 0.455974 | 0.689491 | 0.626312 | Linear Regression |
0 | 1 | 0.270928 | 0.613244 | 0.426347 | L2 Linear Reg |
0 | 1 | 0.237833 | 0.593963 | 0.384273 | L1 Linear Reg |
0 | 1 | 0.718546 | 0.801629 | 0.836226 | L1 Logistic Reg |
0 | 1 | 0.714417 | 0.799443 | 0.833423 | L2 Logistic Reg |
0 | 1 | 0.708548 | 0.793640 | 0.829416 | Decision Tree |
0 | 1 | 0.685422 | 0.810167 | 0.813335 | Random Forest |