#!/usr/bin/env python
# coding: utf-8
"""Notebook export: introductory examples for Patsy, statsmodels and scikit-learn.

The ``# In[n]:`` markers are the original notebook cell boundaries.  Bare
expressions (for example a lone ``data``) are notebook display statements;
they are evaluated but show nothing when the file is run as a script.
"""

# In[1]:


import numpy as np
import pandas as pd
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
# Remember the current row limit so it can be restored at the end of the file.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
np.set_printoptions(precision=4, suppress=True)


# In[2]:


data = pd.DataFrame({
    'x0': [1, 2, 3, 4, 5],
    'x1': [0.01, -0.01, 0.25, -4.1, 0.],
    'y': [-1.5, 0., 3.6, 1.3, -2.]})

data
data.columns
data.to_numpy()


# In[3]:


# Round-trip: rebuild a DataFrame from the array, supplying new column names.
df2 = pd.DataFrame(data.to_numpy(), columns=['one', 'two', 'three'])
df2


# In[4]:


# Work on a copy so the added string column does not touch ``data``.
df3 = data.copy()
df3['strings'] = ['a', 'b', 'c', 'd', 'e']
df3
df3.to_numpy()


# In[5]:


# Select only the model columns before converting to an array.
model_cols = ['x0', 'x1']
data.loc[:, model_cols].to_numpy()


# In[6]:


data['category'] = pd.Categorical(['a', 'b', 'a', 'a', 'b'],
                                  categories=['a', 'b'])
data


# In[7]:


# Swap the categorical column for one float indicator column per category.
dummies = pd.get_dummies(data.category, prefix='category',
                         dtype=float)
data_with_dummies = data.drop('category', axis=1).join(dummies)
data_with_dummies


# In[8]:


# Start again from a fresh frame without the 'category' column.
data = pd.DataFrame({
    'x0': [1, 2, 3, 4, 5],
    'x1': [0.01, -0.01, 0.25, -4.1, 0.],
    'y': [-1.5, 0., 3.6, 1.3, -2.]})
data
import patsy
y, X = patsy.dmatrices('y ~ x0 + x1', data)


# In[9]:


y
X


# In[10]:


np.asarray(y)
np.asarray(X)


# In[11]:


# Same formula with '+ 0' appended; only the right-hand-side matrix is shown.
patsy.dmatrices('y ~ x0 + x1 + 0', data)[1]


# In[12]:


coef, resid, _, _ = np.linalg.lstsq(X, y, rcond=None)


# In[13]:


coef
# Label each coefficient with the matching design-matrix column name.
coef = pd.Series(coef.squeeze(), index=X.design_info.column_names)
coef


# In[14]:


y, X = patsy.dmatrices('y ~ x0 + np.log(np.abs(x1) + 1)', data)
X


# In[15]:


y, X = patsy.dmatrices('y ~ standardize(x0) + center(x1)', data)
X


# In[16]:


# Build a design matrix for new rows from the design_info of the X above.
new_data = pd.DataFrame({
    'x0': [6, 7, 8, 9],
    'x1': [3.1, -0.5, 0, 2.3],
    'y': [1, 2, 3, 4]})
new_X = patsy.build_design_matrices([X.design_info], new_data)
new_X


# In[17]:


y, X = patsy.dmatrices('y ~ I(x0 + x1)', data)
X


# In[18]:


data = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a', 'b', 'a', 'b'],
    'key2': [0, 1, 0, 1, 0, 1, 0, 0],
    'v1': [1, 2, 3, 4, 5, 6, 7, 8],
    'v2': [-1, 0, 2.5, -0.5, 4.0, -1.2, 0.2, -1.7]
})
y, X = patsy.dmatrices('v2 ~ key1', data)
X


# In[19]:


y, X = patsy.dmatrices('v2 ~ key1 + 0', data)
X


# In[20]:


y, X = patsy.dmatrices('v2 ~ C(key2)', data)
X


# In[21]:


# Replace the numeric key2 codes with string labels before refitting.
data['key2'] = data['key2'].map({0: 'zero', 1: 'one'})
data
y, X = patsy.dmatrices('v2 ~ key1 + key2', data)
X
y, X = patsy.dmatrices('v2 ~ key1 + key2 + key1:key2', data)
X


# In[22]:


import statsmodels.api as sm
import statsmodels.formula.api as smf


# In[23]:


# To make the example reproducible
rng = np.random.default_rng(seed=12345)


def dnorm(mean, variance, size=1):
    """Draw normally distributed samples using the module-level ``rng``.

    Parameters
    ----------
    mean : float
        Value added to every scaled draw.
    variance : float
        Variance of the distribution; draws are scaled by ``sqrt(variance)``.
    size : int or tuple of ints, default 1
        Output shape, forwarded unchanged to ``rng.standard_normal``.

    Returns
    -------
    numpy.ndarray
        Samples with shape ``size``.
    """
    # Pass ``size`` as one argument.  Unpacking it (``*size``) would push the
    # second element of a multi-dimensional shape into the ``dtype`` slot.
    return mean + np.sqrt(variance) * rng.standard_normal(size)


N = 100
X = np.c_[dnorm(0, 0.4, size=N),
          dnorm(0, 0.6, size=N),
          dnorm(0, 0.2, size=N)]
eps = dnorm(0, 0.1, size=N)
beta = [0.1, 0.3, 0.5]

# Note: the response is generated without an intercept term.
y = np.dot(X, beta) + eps


# In[24]:


X[:5]
y[:5]


# In[25]:


X_model = sm.add_constant(X)
X_model[:5]


# In[26]:


# NOTE(review): the model is fit on X, not on X_model (the matrix with the
# added constant, which is only displayed above).  y was generated without an
# intercept, so this looks deliberate -- confirm.
model = sm.OLS(y, X)


# In[27]:


results = model.fit()
results.params


# In[28]:


print(results.summary())


# In[29]:


data = pd.DataFrame(X, columns=['col0', 'col1', 'col2'])
data['y'] = y
data[:5]


# In[30]:


results = smf.ols('y ~ col0 + col1 + col2', data=data).fit()
results.params
results.tvalues


# In[31]:


results.predict(data[:5])


# In[32]:


init_x = 4

# Seed the series with two starting values; each new value combines the two
# most recent ones (weights b0 and b1) plus a noise draw.
values = [init_x, init_x]
N = 1000

b0 = 0.8
b1 = -0.4
noise = dnorm(0, 0.1, N)
for i in range(N):
    new_x = values[-1] * b0 + values[-2] * b1 + noise[i]
    values.append(new_x)


# In[33]:


from statsmodels.tsa.ar_model import AutoReg
MAXLAGS = 5
model = AutoReg(values, MAXLAGS)
results = model.fit()


# In[34]:


results.params


# In[35]:


train = pd.read_csv('datasets/titanic/train.csv')
test = pd.read_csv('datasets/titanic/test.csv')
train.head(4)


# In[36]:


train.isna().sum()
test.isna().sum()


# In[37]:


# Fill missing ages in BOTH sets with the median computed on the training set.
impute_value = train['Age'].median()
train['Age'] = train['Age'].fillna(impute_value)
test['Age'] = test['Age'].fillna(impute_value)


# In[38]:


# Encode sex as a 0/1 integer indicator column.
train['IsFemale'] = (train['Sex'] == 'female').astype(int)
# Continuation of cell In[38]: give the test set the same 0/1 integer
# indicator column derived from 'Sex'.
test['IsFemale'] = test['Sex'].eq('female').astype(int)


# In[39]:


# Feature matrices (same predictor columns for both sets) and the training
# target, all as NumPy arrays.
predictors = ['Pclass', 'IsFemale', 'Age']
X_train, X_test = (frame[predictors].to_numpy() for frame in (train, test))
y_train = train['Survived'].to_numpy()
# Bare expressions below are notebook display statements.
X_train[:5]
y_train[:5]


# In[40]:


from sklearn.linear_model import LogisticRegression
model = LogisticRegression()


# In[41]:


model.fit(X=X_train, y=y_train)


# In[42]:


y_predict = model.predict(X_test)
y_predict[:10]


# In[43]:


from sklearn.linear_model import LogisticRegressionCV
# ``fit`` returns the estimator itself, so construction and fitting chain.
model_cv = LogisticRegressionCV(Cs=10).fit(X_train, y_train)


# In[44]:


from sklearn.model_selection import cross_val_score
# Rebind ``model`` to a fresh estimator with C=10 and score it with cv=4.
model = LogisticRegression(C=10)
scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=4)
scores


# In[45]:


# In[46]:


# Put back the row-display limit that was saved at the top of the file.
pd.options.display.max_rows = PREVIOUS_MAX_ROWS