In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('paper', font_scale=1.6)
import keras.backend as K
from natsort import natsorted

SEED = 0
SAVE_PLOTS = False
Using TensorFlow backend.
In [2]:
import glob
import os

def parse_logs(log_files):
    max_epoch = 250
    logs = [pd.read_csv(f, index_col='epoch', parse_dates=[1]) for f in log_files]
    for log, f in zip(logs, log_files):
        run = f.split('/')[-2]
        log.drop(log.index[log.index > max_epoch], axis=0, inplace=True)
        if 'time' not in log:
            raise ValueError("Missing times from {}".format(f))
        log.columns = ['time', run + ' Train', run + ' Valid']
        log['time'] = (log['time'] - log['time'].min()) / np.timedelta64(1, 's')
    step_logs = pd.concat([l.drop('time', axis=1, inplace=False) for l in logs], axis=1)
    time_logs = pd.concat([l.set_index('time') for l in logs], axis=1)
    
    return step_logs, time_logs
In [3]:
LABEL_MAP = {'epoch': 'Epoch', 'time': 'Time (s)'}

def run_to_args(run):
    values = run.split(' ')[0].split('_')
    args = {'model_type': values[0].upper(), 'size': int(values[1]), 
            'num_layers': int(values[2].strip('x')),
            'drop_frac': float(values[4].strip('drop')),
            'bidirectional': 'bidir' in run}
    if 'emb' in run:
        args['embedding'] = int([v for v in values if 'emb' in v][0][3:])
    return args
    

def training_plot(logs, loss_type='Valid', ylim=(1e-2, 1e0)):
    fig = plt.figure()
    colors = sns.color_palette(n_colors=int(len(logs.columns) / 2))
    for i, c in enumerate(step_logs.columns):
        if loss_type is not None and loss_type not in c:
            continue
        to_plot = logs[c]
        to_plot.dropna(inplace=True)
        args = run_to_args(c)
        to_plot.name = "{model_type}({size}) x {num_layers}".format(**args)
        if 'embedding' in args:
            to_plot.name += ", Embedding={}".format(args['embedding'])
        to_plot.plot(color=colors[int(i / 2)], legend=True, logy=True)
    plt.xlabel(LABEL_MAP[logs.index.name])
    plt.ylabel('Loss')
    plt.ylim(ylim)
    return fig
In [4]:
import itertools
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y, y_pred, classnames, filename=None):
    classnames = sorted(classnames)
    sns.set_style("whitegrid", {'axes.grid' : False})
    cm = confusion_matrix(y, y_pred, classnames)
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues, vmin=0,
               vmax=np.unique(y, return_counts=True)[1].max())
    #plt.title(title)
    #plt.colorbar()
    tick_marks = np.arange(len(classnames))
    plt.xticks(tick_marks, classnames, rotation=45)
    plt.yticks(tick_marks, classnames)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label');
    if filename:
        plt.savefig(filename)
In [5]:
from copy import deepcopy
from sklearn.model_selection import GridSearchCV, ParameterGrid

RF_PARAM_GRID = {'n_estimators': [50, 100, 250], 'criterion': ['gini', 'entropy'],
                 'max_features': [0.05, 0.1, 0.2, 0.3], 'min_samples_leaf': [1, 2, 3]}

Period estimator training plots

In [ ]:
log_files = glob.glob('keras_logs/period/uneven/noise0.5/gru_*x2*_bidir/training.csv')

step_logs, time_logs = parse_logs(log_files)
step_plot = training_plot(step_logs)
time_plot = training_plot(time_logs)
if SAVE_PLOTS:
    step_plot.savefig('figures/gru_period_step.pdf')
    time_plot.savefig('figures/gru_period_time.pdf')
In [ ]:
log_files = (glob.glob('keras_logs/period/uneven/noise0.5/gru*64*bidir/training.csv')
             + glob.glob('keras_logs/period/uneven/noise0.5/lstm*64*bidir/training.csv'))

step_logs, time_logs = parse_logs(log_files)
step_plot = training_plot(step_logs)
time_plot = training_plot(time_logs)
if SAVE_PLOTS:
    step_plot.savefig('figures/gru_lstm_period_step.pdf')
    time_plot.savefig('figures/gru_lstm_period_time.pdf')
In [ ]:
K.set_learning_phase(0)
%run period.py --size 96 --num_layers 2 --drop_frac 0.25 --no_train --model_type gru --sigma 0.5 --lr 5e-4 --sim_type period/uneven/noise0.5 --bidirectional
scaler.inverse_transform(Y, copy=False);
In [ ]:
%%time
train = np.arange(args.N_train)
test = args.N_train + np.arange(args.N_test)
pred_gru = model.predict(X)
scaler.inverse_transform(pred_gru)
In [ ]:
%%time
import gatspy
from distutils.version import LooseVersion
assert LooseVersion(gatspy.__version__) < '0.3.0'

pred_gat = np.zeros(pred_gru.shape)
for i in range(pred_gat.shape[0]):
    t = np.cumsum(X[i, :, 0])
    x = X[i, :, 1]
    opt_args = {'period_range': (1.0, 10.0), 'quiet': True}
#    opt_args = {'period_range': (0.05, 0.95 * (t.max() - t.min())), 'quiet': True}
    model_gat = gatspy.periodic.LombScargleFast(fit_period=True, optimizer_kwds=opt_args)#, silence_warnings=True)
    model_gat.fit(t, x)
    omega = 2 * np.pi / model_gat.best_period
    off, A2, A1 = model_gat._best_params(omega)
#    pred_gat[i] = np.array([model_gat.best_period, np.sqrt(A1 ** 2 + A2 ** 2), np.arctan2(A2, A1), off + model_gat.ymean_])
    pred_gat[i] = np.array([model_gat.best_period, A1, A2, off + model_gat.ymean_])
In [ ]:
inds = test
fig = sns.jointplot(pred_gru[inds, 0] - Y[inds, 0], pred_gat[inds, 0] - Y[inds, 0], xlim=(-1., 1.), ylim=(-1., 1.), stat_func=None)
fig.ax_joint.annotate("GRU MSE: {:1.4f}\nLomb-Scargle MSE: {:1.4f}".format(np.mean((pred_gru[inds, 0] - Y[inds, 0]) ** 2), np.mean((pred_gat[inds, 0] - Y[inds, 0]) ** 2)), (-0.95, 0.78))
fig.ax_joint.set_xlabel('GRU error')
fig.ax_joint.set_ylabel('Lomb-Scargle error');
#plt.title("GRU MAE: {}\nGatspy MAE: {}".format(np.median(np.abs(pred_gru[:, 0] - Y[inds, 0])), np.median(np.abs(pred_gat[:, 0] - Y[inds, 0]))))
#sns.jointplot(pred_gru[inds, 0], Y[inds, 0])
#sns.jointplot(pred_gat[inds, 0], Y[inds, 0])
if SAVE_PLOTS:
    plt.savefig('figures/ls_period.pdf')

Autoencoder training plots

In [ ]:
log_files = natsorted(glob.glob('keras_logs/autoencoder/uneven/noise0.5/gru_064_x2*/training.csv'))
log_files = [l for l in log_files if 'emb64' not in l]

step_logs, time_logs = parse_logs(log_files)
step_plot = training_plot(step_logs, ylim=(0.01, 2.0))
time_plot = training_plot(time_logs, ylim=(0.01, 2.0))
if SAVE_PLOTS:
    step_plot.savefig('figures/gru_autoencoder_step.pdf')
    time_plot.savefig('figures/gru_autoencoder_time.pdf')
In [ ]:
%%time
K.set_learning_phase(0)
%run autoencoder.py --size 96 --num_layers 2 --drop_frac 0.25 --no_train --model_type gru --sigma 0.5 --lr 5e-4 --sim_type autoencoder/uneven/noise0.5 --embedding 32 --bidirectional
In [ ]:
def sinusoid(p, A1, A2, b):
    return lambda t: A1 * np.cos(2 * np.pi / p * t) + A2 * np.sin(2 * np.pi / p * t) + b

N = 3
inds = [16, 10, 25]
colors = sns.color_palette(n_colors=len(inds))
for j, i in enumerate(inds):
    t = X_raw[i, :, 0]
    m = X[i, :, 1]
    T = np.linspace(0, t.max(), 501)
    plt.figure()
    plt.plot(T, sinusoid(*sample_data.phase_to_sin_cos(Y)[i])(T), color=colors[j])
    plt.plot(t, m, 'o', color=colors[j])
    plt.xlabel('t')
    plt.ylabel('f(t)')
    plt.title(f'ω={Y[i, 0]:1.3}, A={Y[i, 1]:1.3}, ϕ={Y[i, 2]:1.3}, b={Y[i, 3]:1.3}')
    if SAVE_PLOTS:
        plt.savefig(f'figures/sinusoid_{j}.pdf')
In [ ]:
%%time
train = np.arange(args.N_train)
test = np.arange(args.N_train, args.N_train + args.N_test)

# TODO re-run without this; why so slow...?
#train = train[:5000]
#X = np.r_[X[train], X[test]]; Y = np.r_[Y[train], Y[test]]; test = np.arange(len(train), len(train) + args.N_test)

encode_model = Model(input=model.input, output=model.get_layer('encoding').output)
encoding = encode_model.predict({'main_input': X, 'aux_input': np.delete(X, 1, axis=2)})
In [ ]:
%%time
df = pd.DataFrame(np.c_[Y[train], encoding[train]], 
                  columns=[f'Y_{i}' for i in range(4)] + [f'Embedding {i}' for i in range(encoding.shape[1])])
sns.pairplot(df, kind='reg', x_vars=df.columns[:4], y_vars=df.columns[4:])

df['Y_0'] **= -1
df.rename(columns={'Y_0': 'Period', 'Y_2': 'Phase (radian)'}, inplace=True)
R = df.corr(method='spearman')

emb_period = np.argmax(np.abs(R.iloc[Y.shape[1]:, 0]))
sns.jointplot(df['Period'], df[emb_period], kind='hex', stat_func=None)
if SAVE_PLOTS:
    plt.savefig('figures/period_hex.pdf')

emb_phase  = np.argmax(np.abs(R.iloc[Y.shape[1]:, 2]))
sns.jointplot(df['Phase (radian)'], df[emb_phase], kind='hex', stat_func=None)
if SAVE_PLOTS:
    plt.savefig('figures/phase_hex.pdf')
In [ ]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=1000)
model.fit(encoding[train], 1. / Y[train, 0])
sns.regplot(1. / Y[test, 0], model.predict(encoding[test]), line_kws={'color': 'black', 'linestyle': 'dotted'})
plt.annotate(f"Mean squared error: {np.mean((1. / Y[test, 0] - model.predict(encoding[test])) ** 2):1.4f}", (0.8, 9.8))
plt.xlabel('True period')
plt.ylabel('Estimated period')
if SAVE_PLOTS:
    plt.savefig('figures/period_rf.pdf')

ASAS reconstructions

In [ ]:
log_files = natsorted(glob.glob('keras_logs/asas_full/n200_ls0.2/gru_064_x2_5m04_drop25_emb*_bidir/training.csv'))

step_logs, time_logs = parse_logs(log_files)
step_plot = training_plot(step_logs, ylim=(3e-2, 1e-1))
time_plot = training_plot(time_logs, ylim=(3e-2, 1e-1))
if SAVE_PLOTS:
    step_plot.savefig('figures/asas_autoencoder_step.pdf')
    time_plot.savefig('figures/asas_autoencoder_time.pdf')
In [7]:
%%time
import os
import joblib
from keras_util import get_run_id
from survey_autoencoder import main as survey_autoencoder

arg_dict = {'size': 64, 'embedding': 64, 'num_layers': 1, 'model_type': 'gru',
            'sim_type': 'asas_full/n200_ss0.7', 'n_min': 200, 'n_max': 200,
            'lr': 1e-3, 'bidirectional': True, 'ss_resid': 0.7,
            'drop_frac': 0.25, 'survey_files': ['data/asas/full.pkl'],
            'period_fold': False, 'no_train': True}
run = get_run_id(**arg_dict)
log_dir = os.path.join(os.getcwd(), 'keras_logs', arg_dict['sim_type'], run)
weights_path = os.path.join(log_dir, 'weights.h5')
if not os.path.exists(weights_path):
    raise FileNotFoundError(weights_path)
X, X_raw, model, means, scales, wrong_units, args = survey_autoencoder(arg_dict)
full = joblib.load('data/asas/full.pkl')
split = [el for lc in full for el in lc.split(args.n_min, args.n_max)]
if args.ss_resid:
    split = [el for el in split if el.ss_resid <= args.ss_resid]
[]
Loading /Users/brettnaul/Dropbox/Documents/timeflow/keras_logs/asas_full/n200_ss0.7/gru_064_x1_1m03_drop25_emb64_bidir/weights.h5...
CPU times: user 30.3 s, sys: 2.71 s, total: 33.1 s
Wall time: 33.4 s
In [8]:
(len(X), 0.8 * len(X), 0.2 * len(X))
Out[8]:
(33103, 26482.4, 6620.6)
In [10]:
%%time
pred = model.predict({'main_input': X[:], 'aux_input': X[:, :, [0,]]})
ords = np.argsort(np.sum((pred.squeeze() - X[:, :, 1]) ** 2, axis=1))
inds, labels = zip(*[(8320, '25'), (23994, '75')])
for ord_i, label in zip(inds, labels):
    i = ords[ord_i]
    print(i)
    t = X_raw[~wrong_units][i, :, 0]
    m = X_raw[~wrong_units][i, :, 1]
    e = X_raw[~wrong_units][i, :, 2]
    m_est = pred[i] * scales[i, 0] + means[i]
    plt.figure()
    plt.errorbar(t, m, e, None, 'o');
#    plt.errorbar(t, m_est, None, None, 'o');
    plt.xlabel('Time [day]')
    plt.ylabel('Magnitude')
    plt.legend(['Original', 'Reconstructed'])
    plt.title(f'{split[i].survey} {split[i].name} ({split[i].label})')
    plt.gca().invert_yaxis()
    if SAVE_PLOTS:
        plt.savefig(f'paper/figures/asas_reconstruct_{label}.pdf')
27594
7797
CPU times: user 642 ms, sys: 426 ms, total: 1.07 s
Wall time: 1.41 s
In [15]:
%%time
import os
from keras_util import get_run_id
from survey_autoencoder import main as survey_autoencoder

arg_dict = {'size': 64, 'embedding': 64, 'num_layers': 1, 'model_type': 'gru',
            'sim_type': 'asas_fold/n200', 'n_min': 200, 'n_max': 200,
            'lr': 1e-3, 'bidirectional': True, 'ss_resid': 0.7,
            'drop_frac': 0.25, 'survey_files': ['data/asas/full.pkl'],
            'period_fold': True, 'no_train': True}
run = get_run_id(**arg_dict)
log_dir = os.path.join(os.getcwd(), 'keras_logs', arg_dict['sim_type'], run)
weights_path = os.path.join(log_dir, 'weights.h5')
if not os.path.exists(weights_path):
    raise FileNotFoundError(weights_path)
X_fold, X_raw_fold, model, means, scales, wrong_units, args = survey_autoencoder(arg_dict)
[]
Loading /Users/brettnaul/Dropbox/Documents/timeflow/keras_logs/asas_fold/n200/gru_064_x1_1m03_drop25_emb64_bidir/weights.h5...
CPU times: user 18.6 s, sys: 3.01 s, total: 21.6 s
Wall time: 23.1 s
In [16]:
pred_fold = model.predict({'main_input': X_fold[:], 'aux_input': X_fold[:, :, [0,]]})
ords_fold = np.argsort(np.sum((pred_fold.squeeze() - X_fold[:, :, 1]) ** 2, axis=1))
for ord_i, label in zip(inds, labels):
    i = ords[ord_i]
    t = X_raw_fold[~wrong_units][i, :, 0]
    m = X_raw_fold[~wrong_units][i, :, 1]
    e = X_raw_fold[~wrong_units][i, :, 2]
    m_est = (pred[i] * scales[i, 0] + means[i])[np.argsort(split[i].times % split[i].p)]
    m_fold = pred_fold[i] * scales[i, 0] + means[i]
    plt.figure()
    plt.errorbar(t, m, e, None, 'o');
    plt.errorbar(t, m_est, None, None, 'o', alpha=0.6);
    plt.errorbar(t, m_fold, None, None, 'o');
    plt.xlabel('Phase [day]')
    plt.ylabel('Magnitude')
    plt.title(f'{split[i].survey} {split[i].name} ({split[i].label})')
    plt.legend(['Original', 'Reconstructed (non-folded)', 'Reconstructed (folded)'])
    plt.gca().invert_yaxis()
    if SAVE_PLOTS:
        plt.savefig(f'figures/asas_folded_reconstruct_{label}.pdf')

ASAS autoencoder random forests

In [ ]:
%%time
import os
import joblib
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from light_curve import LightCurve
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras_util import get_run_id
from survey_autoencoder import main as survey_autoencoder, preprocess

arg_dict = {'size': 64, 'embedding': 64, 'num_layers': 1, 'model_type': 'gru',
            'sim_type': 'asas_fold/n200_noise', 'n_min': 200, 'n_max': 200,
            'lr': 1e-3, 'bidirectional': True, 'ss_resid': 0.7,
            'drop_frac': 0.25, 'survey_files': ['data/asas/full.pkl'],
            'period_fold': True, 'no_train': True}
run = get_run_id(**arg_dict)
log_dir = os.path.join(os.getcwd(), 'keras_logs', arg_dict['sim_type'], run)
weights_path = os.path.join(log_dir, 'weights.h5')
if not os.path.exists(weights_path):
    raise FileNotFoundError(weights_path)
X, X_raw, model, means, scales, wrong_units, args = survey_autoencoder(arg_dict)

top_classes = ['RR_Lyrae_FM', 'W_Ursae_Maj', 'Classical_Cepheid', 'Beta_Persei', 'Semireg_PV']
full = joblib.load('data/asas/full.pkl')
full = [lc for lc in full if lc.label in top_classes]
split = [el for lc in full for el in lc.split(args.n_min, args.n_max)]
#if args.ss_resid:
#    split = [el for el in split if el.ss_resid <= args.ss_resid]
if args.period_fold:
    for lc in split:
        lc.period_fold()
X_list = [np.c_[lc.times, lc.measurements, lc.errors] for lc in split]
classnames, indices = np.unique([lc.label for lc in split], return_inverse=True)
periods = [lc.p for lc in split]
y = classnames[indices]
train, valid = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED).split(X_list, y))[0]

X_raw = pad_sequences(X_list, value=0., dtype='float', padding='post')
X, means, scales, wrong_units = preprocess(X_raw, args.m_max)
encode_model = Model(input=model.input, output=model.get_layer('encoding').output)
encoding = encode_model.predict({'main_input': X, 'aux_input': np.delete(X, 1, axis=2)})
encoding = np.c_[encoding, means, scales, periods]
In [ ]:
%%time
asas_model = GridSearchCV(RandomForestClassifier(random_state=0), RF_PARAM_GRID)
asas_model.fit(encoding[train], y[train])
print("Training score: {}%".format(100 * asas_model.score(encoding[train], y[train])))
print("Validation score: {}%".format(100 * asas_model.score(encoding[valid], y[valid])))
plot_confusion_matrix(y[valid], asas_model.predict(encoding[valid]), classnames,
                      'figures/asas_confusion.pdf' if SAVE_PLOTS else None)

ASAS Richards random forests

In [ ]:
%%time
from sklearn.model_selection import StratifiedKFold
from light_curve import LightCurve
import joblib
from cesium.features import LOMB_SCARGLE_FEATS, CADENCE_FEATS, GENERAL_FEATS
from cesium.featurize import featurize_time_series

from argparse import Namespace; args = Namespace(n_min=200, n_max=200)

top_classes = ['RR_Lyrae_FM', 'W_Ursae_Maj', 'Classical_Cepheid', 'Beta_Persei', 'Semireg_PV']
full = joblib.load('data/asas/full.pkl')
full = [lc for lc in full if lc.label in top_classes]
split = [el for lc in full for el in lc.split(args.n_min, args.n_max)]
X_list = [np.c_[lc.times, lc.measurements, lc.errors] for lc in split]
classnames, indices = np.unique([lc.label for lc in split], return_inverse=True)
y = classnames[indices]
train, valid = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED).split(X_list, y))[0]

times, measurements, errors = zip(*[x.T for x in X_list])

features_to_use = LOMB_SCARGLE_FEATS + GENERAL_FEATS# + CADENCE_FEATS

fset = featurize_time_series(times, measurements, errors, features_to_use)
#joblib.dump(fset, 'asas_richards.pkl', compress=True)
#fset = joblib.load('asas_richards.pkl')
In [ ]:
%%time
from sklearn.ensemble import RandomForestClassifier

cs_model = GridSearchCV(RandomForestClassifier(random_state=0), RF_PARAM_GRID)
cs_model.fit(fset.iloc[train], y[train])
In [ ]:
cs_pred = cs_model.predict(fset)
print(f"Training score: {100 * np.mean(cs_pred[train] == y[train])}%")
print(f"Validation score: {100 * np.mean(cs_pred[valid] == y[valid])}%")
plot_confusion_matrix(y[valid], cs_model.predict(fset.iloc[valid]), classnames,
                      'figures/asas_richards_confusion.pdf' if SAVE_PLOTS else None)

LINEAR reconstructions

In [17]:
import os
import joblib
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from light_curve import LightCurve
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras_util import parse_model_args, get_run_id
from survey_autoencoder import main as survey_autoencoder

arg_dict = {'size': 96, 'embedding': 64, 'num_layers': 2, 'model_type': 'gru',
            'sim_type': 'linear/n200', 'n_min': 200, 'n_max': 200,
            'lr': 1e-3, 'bidirectional': True, 'drop_frac': 0.25, 'period_fold': False,
            'survey_files': ['data/linear/full.pkl'], 'no_train': True}
run = get_run_id(**arg_dict)
log_dir = os.path.join(os.getcwd(), 'keras_logs', arg_dict['sim_type'], run)
weights_path = os.path.join(log_dir, 'weights.h5')
if not os.path.exists(weights_path):
    raise FileNotFoundError(weights_path)
X, X_raw, model, means, scales, wrong_units, args = survey_autoencoder(arg_dict)
[]
Loading /Users/brettnaul/Dropbox/Documents/timeflow/keras_logs/linear/n200/gru_096_x2_1m03_drop25_emb64_bidir/weights.h5...
In [19]:
pred  = model.predict({'main_input': X[:], 'aux_input': X[:, :, 0:1]})
ords = np.argsort(np.mean((pred.squeeze() - X[:, :, 1]) ** 2 / (X_raw[:, :, 2] / scales), axis=1))
inds, labels = zip(*[(1142, '25'), (3425, '75')])
#for i, label in zip(inds, labels):
#    t = X_raw[~wrong_units][i, :, 0]
#    m = X_raw[~wrong_units][i, :, 1]
#    e = X_raw[~wrong_units][i, :, 2]
#    m_est = pred[i] * scales[i, 0] + means[i]
#    plt.figure()
#    plt.errorbar(t, m, e, None, 'o');
#    plt.errorbar(t, m_est, None, None, 'o');
#    plt.xlabel('Time [day]')
#    plt.ylabel('Magnitude')
#    plt.legend(['Original', 'Reconstructed'])
#    plt.gca().invert_yaxis()
#    if SAVE_PLOTS:
#        plt.savefig(f'figures/linear_reconstruct_{label}.pdf')
In [20]:
%%time
import os
import joblib
from keras_util import get_run_id
from survey_autoencoder import main as survey_autoencoder

arg_dict = {'size': 96, 'embedding': 64, 'num_layers': 2, 'model_type': 'gru',
            'sim_type': 'asas_linear_fold/n200_ss0.7', 'n_min': 200, 'n_max': 200,
            'lr': 5e-4, 'bidirectional': True,
            'drop_frac': 0.25, 'survey_files': ['data/linear/full.pkl'],
            'period_fold': True, 'no_train': True}
run = get_run_id(**arg_dict)
log_dir = os.path.join(os.getcwd(), 'keras_logs', arg_dict['sim_type'], run)
weights_path = os.path.join(log_dir, 'weights.h5')
if not os.path.exists(weights_path):
    raise FileNotFoundError(weights_path)
X, X_raw, model, means, scales, wrong_units, args = survey_autoencoder(arg_dict)
pred_fold = model.predict({'main_input': X[:], 'aux_input': X[:, :, [0,]]}, batch_size=2000)
ords_fold = np.argsort(np.sum((pred_fold.squeeze() - X[:, :, 1]) ** 2, axis=1))
[]
Loading /Users/brettnaul/Dropbox/Documents/timeflow/keras_logs/asas_linear_fold/n200_ss0.7/gru_096_x2_5m04_drop25_emb64_bidir/weights.h5...
CPU times: user 4min 33s, sys: 57.6 s, total: 5min 30s
Wall time: 1min
In [21]:
full = joblib.load('data/linear/full.pkl')
split = [el for lc in full for el in lc.split(args.n_min, args.n_max)]
for ord_i, label in zip(inds, labels):
    i = ords[ord_i]
    print(i)
    t = X_raw[~wrong_units][i, :, 0]
    m = X_raw[~wrong_units][i, :, 1]
    e = X_raw[~wrong_units][i, :, 2]
    m_est = (pred[i] * scales[i, 0] + means[i])[np.argsort(split[i].times % split[i].p)]
    m_fold = pred_fold[i] * scales[i, 0] + means[i]
    plt.figure()
    plt.errorbar(t, m, e, None, 'o');
    plt.errorbar(t, m_est, None, None, 'o', alpha=0.6);
    plt.errorbar(t, m_fold, None, None, 'o');
    plt.xlabel('Phase [day]')
    plt.ylabel('Magnitude')
    plt.legend(['Original', 'Reconstructed (non-folded)', 'Reconstructed (folded)'])
    plt.title(f'{split[i].survey} {split[i].name} ({split[i].label})')
    plt.gca().invert_yaxis()
    if SAVE_PLOTS:
        plt.savefig(f'figures/linear_folded_reconstruct_{label}.pdf')
2761
39

LINEAR autoencoder random forests

In [ ]:
%%time
import os
import joblib
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from light_curve import LightCurve
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras_util import get_run_id
from survey_autoencoder import main as survey_autoencoder, preprocess

#arg_dict = {'size': 96, 'embedding': 64, 'num_layers': 2, 'model_type': 'gru',
#        'sim_type': 'asas_linear_fold/n200_ss0.7', 'n_min': 200, 'n_max': 200,
#        'lr': 5e-4, 'bidirectional': True, 'drop_frac': 0.25, 'period_fold': True,
#        'survey_files': ['data/linear/full.pkl'], 'no_train': True}
arg_dict = args.__dict__
run = get_run_id(**arg_dict)
log_dir = os.path.join(os.getcwd(), 'keras_logs', arg_dict['sim_type'], run)
weights_path = os.path.join(log_dir, 'weights.h5')
if not os.path.exists(weights_path):
    raise FileNotFoundError(weights_path)
X, X_raw, model, means, scales, wrong_units, args = survey_autoencoder(arg_dict)

full = joblib.load('data/linear/full.pkl')
split = [el for lc in full for el in lc.split(args.n_min, args.n_max)]
if args.period_fold:
    for lc in split:
        lc.period_fold()
X_list = [np.c_[lc.times, lc.measurements, lc.errors] for lc in split]
classnames, indices = np.unique([lc.label for lc in split], return_inverse=True)
periods = [lc.p for lc in split]
y = classnames[indices]
train, valid = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED).split(X_list, y))[0]

encode_model = Model(input=model.input, output=model.get_layer('encoding').output)
encoding = encode_model.predict({'main_input': X, 'aux_input': np.delete(X, 1, axis=2)})
encoding = np.c_[encoding, means, scales, periods]
linear_model = GridSearchCV(RandomForestClassifier(random_state=0), RF_PARAM_GRID)
linear_model.fit(encoding[train], y[train])
print("Training score: {}%".format(100 * linear_model.score(encoding[train], y[train])))
print("Validation score: {}%".format(100 * linear_model.score(encoding[valid], y[valid])))
plot_confusion_matrix(y[valid], cs_model.predict(encoding[valid]), classnames,
                      'figures/linear_confusion.pdf' if SAVE_PLOTS else None)

LINEAR Richards random forests

In [ ]:
%%time
from sklearn.model_selection import StratifiedKFold
from light_curve import LightCurve
import joblib
from cesium.features import LOMB_SCARGLE_FEATS, CADENCE_FEATS, GENERAL_FEATS
from cesium.featurize import featurize_time_series

from argparse import Namespace; args = Namespace(n_min=200, n_max=200)

full = joblib.load('data/linear/full.pkl')
split = [el for lc in full for el in lc.split(args.n_min, args.n_max)]
X_list = [np.c_[lc.times, lc.measurements, lc.errors] for lc in split]
classnames, indices = np.unique([lc.label for lc in split], return_inverse=True)
y = classnames[indices]
train, valid = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED).split(X_list, y))[0]

times, measurements, errors = zip(*[x.T for x in X_list])

features_to_use = LOMB_SCARGLE_FEATS + GENERAL_FEATS# + CADENCE_FEATS

fset = featurize_time_series(times, measurements, errors, features_to_use)
#joblib.dump(fset, 'linear_richards.pkl', compress=True)
#fset = joblib.load('linear_richards.pkl')
In [ ]:
%%time
from sklearn.ensemble import RandomForestClassifier

cs_model = GridSearchCV(RandomForestClassifier(random_state=0), RF_PARAM_GRID)
cs_model.fit(fset.iloc[train], y[train])
In [ ]:
cs_pred = cs_model.predict(fset)
print(f"Training score: {100 * np.mean(cs_pred[train] == y[train])}%")
print(f"Validation score: {100 * np.mean(cs_pred[valid] == y[valid])}%")
plot_confusion_matrix(y[valid], cs_model.predict(fset.iloc[valid]), classnames,
                      'figures/linear_richards_confusion.pdf' if SAVE_PLOTS else None)