import seaborn as sns
from precis.base import Dataset, Measure, AbbreviatedMeasure
from precis.abbreviate import TopNAbbreviator
from precis.evaluate import YarkoniEvaluator
from precis.generate import Generator
from precis import plot as sp
%matplotlib inline

# Load the German PPI-R item-level data into a Measure.
# NOTE(review): missing='drop' is handed straight to precis; presumably it
# discards records with missing responses -- confirm in the precis docs.
ppi = Measure(X='data/PPI-R_German_data.txt', missing='drop')
# Compute scale scores from the PPI-R scoring-key file. The eight labels in
# `columns` name the resulting scales; they are assumed to be listed in the
# same order as the columns of the key file -- TODO confirm.
# rescale=True is a precis option whose exact effect is not visible here.
ppi.score(key='data/PPI-R_scoring_key.txt', columns=['B','Ca','Co','F','M','R','So','St'], rescale=True)
# Print the Measure's summary representation.
print(ppi)

# Configure the abbreviation strategy: a TopNAbbreviator with max_items=5 and
# min_r=0.2 (presumably "keep at most five items per scale, each with an
# item-scale correlation of at least 0.2" -- confirm in the precis docs).
abb = TopNAbbreviator(max_items=5, min_r=0.2)
# Configure the evaluator with item_cost=0.02 (presumably the penalty applied
# per retained item when scoring a candidate abbreviation -- confirm).
ev = YarkoniEvaluator(item_cost=0.02)

# Bundle the abbreviator and evaluator into a single Generator.
gen = Generator(abbreviator=abb, evaluator=ev)

# First pass: start a fresh run (resume=False) of 200 generations with a
# fixed seed.
gen.run(ppi, n_gens=200, seed=64, resume=False)

# Plot the run history so far; the return value is bound to `_` and discarded.
_ = gen.plot_history(size=(14,4))

# Second pass: continue the existing run (resume=True) for 800 more
# generations, then re-plot the history.
gen.run(ppi, n_gens=800, seed=64, resume=True)
_ = gen.plot_history(size=(14,4))

# Build the abbreviated measure from the generator and print its summary.
abb_ppi = gen.abbreviate()
print(abb_ppi)

# Write the abbreviation under 'abbreviations/' with file prefix 'PPI',
# requesting the scoring key and the text summary but no pickle.
# NOTE(review): whether save() creates the directory is not visible here --
# verify it exists before running.
abb_ppi.save(path='abbreviations/', prefix='PPI', key=True, summary=True, pickle=False)

# Plot a grid of scatterplots (3 rows x 3 columns) showing the correlation
# between abbreviated and original scores, one panel per PPI-R scale.
# trend/totals/jitter/alpha/size are passed through to the precis plot helper.
sp.scale_scatter_plot(abb_ppi, rows=3, cols=3, trend=True, totals=True, jitter=0.3, alpha=0.3, size=(10,10))

# Draw a composite figure with the 'corr-original' and 'corr-cross' panels for
# the abbreviated measure, using the generator that produced it.
sp.composite(gen, ['corr-original', 'corr-cross'], measure=abb_ppi, size=(16,6))

# Create a full set of abbreviations: one fresh 1000-generation run for every
# combination of max_items (3, 5, 7, 9) and item_cost (0.02 .. 0.08), i.e. 16
# runs, each saved under 'abbreviations/' with the parameters in the prefix.
# NOTE(review): the loop rebinds the module-level names `abb`, `ev` and `gen`,
# so after it finishes `gen` refers to the last run (mi=9, ic=0.08), not to
# the generator that produced `abb_ppi`.
for mi in [3,5,7,9]:
    for ic in [0.02, 0.04, 0.06, 0.08]:
        abb = TopNAbbreviator(max_items=mi, min_r=0.2)
        ev = YarkoniEvaluator(item_cost=ic)
        gen = Generator(abbreviator=abb, evaluator=ev)
        gen.run(ppi, n_gens=1000, seed=64, resume=False)
        am = gen.abbreviate()
        # '%f' renders the cost with six decimals (e.g. 'PPI_mi=3_ic=0.020000').
        am.save(path='abbreviations/', prefix='PPI_mi=%d_ic=%f' % (mi, ic), key=True, summary=True)

# Format all abbreviated measures into a summary table
import re
import pandas as pd
def format_table(files):
    """Print a tab-separated summary table for abbreviated-measure summaries.

    Each path in ``files`` whose name contains ``mi=<int>_ic=<number>`` is
    read and reduced to one row: the max-items setting, the item cost, the
    total number of items, and the mean R^2 and mean alpha across all scale
    lines found in the file. Paths that do not carry the ``mi=.._ic=..`` tag
    are skipped without being opened.

    Args:
        files: Iterable of paths to summary text files.

    Returns:
        None. The table (header plus one row per matching file, in the order
        given) is written to standard output.

    Raises:
        ValueError: If a matching file has no ``Number of items: <n>`` line.
    """
    # Compile the patterns once instead of on every loop iteration; raw
    # strings avoid invalid-escape warnings on newer Python versions.
    name_re = re.compile(r'mi\=(\d+)_ic\=([0-9\.]+)')
    n_items_re = re.compile(r'Number of items:\s(\d+)')
    scale_re = re.compile(
        r'^(.*)\s+\((\d+)\s.*R\^2\=([\d\.]+).*alpha\=([\d\.]+)', re.MULTILINE)

    header = '\t'.join(['MI', 'IC', 'Items', 'Mean_R2', 'mean_alpha'])
    table = [header]
    for f in files:
        m = name_re.search(f)
        if not m:
            continue
        mi, ic = m.groups()

        # Use a context manager so the handle is closed deterministically.
        with open(f) as fh:
            c = fh.read()

        items_match = n_items_re.search(c)
        if items_match is None:
            raise ValueError("No 'Number of items' line found in %s" % f)
        n_items = int(items_match.group(1))

        scales = scale_re.findall(c)
        df = pd.DataFrame(scales, columns=['name', 'no_items', 'R^2', 'alpha'])
        # Captured values are strings; coerce them to numbers, turning
        # anything unparseable into NaN so it is ignored by mean().
        mean_r2 = pd.to_numeric(df['R^2'], errors='coerce').mean()
        mean_alpha = pd.to_numeric(df['alpha'], errors='coerce').mean()

        vals = (int(mi), float(ic), n_items, mean_r2, mean_alpha)
        table.append('%d\t%.2f\t%d\t%.2f\t%.2f' % vals)
    print('\n'.join(table))

from glob import glob
# Collect every summary file written under 'abbreviations/'. glob() returns
# matches in arbitrary, filesystem-dependent order, so sort the paths to make
# the printed table rows deterministic.
files = sorted(glob('abbreviations/*summary.txt'))
# Print one table row per summary file whose name carries the mi=/ic= tag.
format_table(files)

# Load the MTurk PPI-R data into its own Measure (same missing-data option as
# the German sample).
eng_ppi = Measure(X='data/PPI-R_MTurk_data.txt', missing='drop')
# Generate full-length PPI-R scale scores using the original PPI-R scoring key
eng_ppi.score(key='data/PPI-R_scoring_key.txt', columns=['B','Ca','Co','F','M','R','So','St'], rescale=True)
# Reuse the scoring key of the abbreviation generated above (described as a
# 40-item key -- presumably 8 scales x 5 items; confirm against the printed
# summary) to abbreviate the new measure directly, without re-running the
# generator on the new sample.
ppi_40_key = abb_ppi.key
# select= restricts the new measure to the items kept by the abbreviation.
abb_eng = AbbreviatedMeasure(eng_ppi, select=abb_ppi.original_items, key=ppi_40_key)
# Print summary of abbreviated measure to evaluate performance relative to full-length measure
print(abb_eng)

import numpy as np

# Add derivative scales to the scoring key
def add_factors(key):
    """Return ``key`` widened by three composite columns.

    The appended columns are, in order: the row-wise sum of columns 3, 6
    and 7; the row-wise sum of columns 0, 1, 4 and 5; and the row-wise sum
    of every column. (The caller labels these 'FD', 'IA' and 'Tot'.)

    Args:
        key: 2-D numpy array with at least eight columns
            (assumed to be items x scales -- TODO confirm).

    Returns:
        A new integer array with the original columns followed by the three
        composite columns.
    """
    first_composite_cols = [3, 6, 7]
    second_composite_cols = [0, 1, 4, 5]

    composites = [
        np.sum(key[:, first_composite_cols], axis=1),
        np.sum(key[:, second_composite_cols], axis=1),
        np.sum(key, axis=1),
    ]
    # column_stack turns each 1-D composite into a column beside `key`.
    widened = np.column_stack([key] + composites)
    return widened.astype(int)

# Extend the full-length scoring key with the three derivative columns, then
# reload the German data and score it on all eleven scales (the eight
# original scales plus 'FD', 'IA' and 'Tot').
old_key = add_factors(ppi.key)
full_ppi = Measure(X='data/PPI-R_German_data.txt', missing='drop')
full_ppi.score(key=old_key, columns=['B','Ca','Co','F','M','R','So','St','FD','IA','Tot'], rescale=True)

# Abbreviate: extend the abbreviated key the same way and apply it to the
# items retained by the abbreviation.
new_key = add_factors(abb_ppi.key)
with_factors = AbbreviatedMeasure(full_ppi, select=abb_ppi.original_items, key=new_key)
# Use the function-call form of print, matching the rest of the file; the
# bare print statement is a syntax error on Python 3.
print(with_factors)