import seaborn as sns
from precis.base import Dataset, Measure, AbbreviatedMeasure
from precis.abbreviate import TopNAbbreviator
from precis.evaluate import YarkoniEvaluator
from precis.generate import Generator
from precis import plot as sp

# Notebook-only: render matplotlib figures inline. This is the function-call
# form of the `%matplotlib inline` magic, so the file still parses as plain
# Python; it is skipped when the code is not running under IPython.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Initialize the Measure
ppi = Measure(X='data/PPI-R_German_data.txt', missing='drop')

# Generate scale scores from the PPI scoring key
ppi.score(key='data/PPI-R_scoring_key.txt',
          columns=['B', 'Ca', 'Co', 'F', 'M', 'R', 'So', 'St'],
          rescale=True)

# Display some information
print(ppi)

# Initialize an Abbreviator object
abb = TopNAbbreviator(max_items=5, min_r=0.2)

# Initialize an Evaluator object
ev = YarkoniEvaluator(item_cost=0.02)

# Run the generator in two stages with the same seed: 200 generations first,
# then a second call with resume=True for 800 more (presumably continuing
# from the first run's state -- confirm against Generator.run). The history
# is plotted after each stage.
gen = Generator(abbreviator=abb, evaluator=ev)
gen.run(ppi, n_gens=200, seed=64, resume=False)
_ = gen.plot_history(size=(14, 4))

gen.run(ppi, n_gens=800, seed=64, resume=True)
_ = gen.plot_history(size=(14, 4))

# Build the abbreviated measure from the generator, show it, and write its
# scoring key and summary under abbreviations/ with the 'PPI' prefix.
abb_ppi = gen.abbreviate()
print(abb_ppi)
abb_ppi.save(path='abbreviations/', prefix='PPI', key=True, summary=True,
             pickle=False)

# Plot a set of scatterplots displaying correlation between abbreviated and
# original measure, one for each PPI-R scale.
sp.scale_scatter_plot(abb_ppi, rows=3, cols=3, trend=True, totals=True,
                      jitter=0.3, alpha=0.3, size=(10, 10))

sp.composite(gen, ['corr-original', 'corr-cross'], measure=abb_ppi,
             size=(16, 6))

# Create a full set of abbreviations...
# Sweep every combination of maximum items per scale (mi) and per-item cost
# (ic). Each combination gets a fresh Generator run and is saved with a
# prefix that encodes both parameters; format_table() below parses that prefix.
for mi in [3, 5, 7, 9]:
    for ic in [0.02, 0.04, 0.06, 0.08]:
        abb = TopNAbbreviator(max_items=mi, min_r=0.2)
        ev = YarkoniEvaluator(item_cost=ic)
        gen = Generator(abbreviator=abb, evaluator=ev)
        gen.run(ppi, n_gens=1000, seed=64, resume=False)
        am = gen.abbreviate()
        am.save(path='abbreviations/', prefix='PPI_mi=%d_ic=%f' % (mi, ic),
                key=True, summary=True)

# Format all abbreviated measures into a summary table
import re
import pandas as pd


def format_table(files):
    """Print a tab-separated summary table built from saved summary files.

    Each path in ``files`` whose name contains ``mi=<int>_ic=<number>``
    contributes one row; paths without that pattern are skipped. The row
    holds the two parameters parsed from the file name, the item count read
    from the file's ``Number of items:`` line, and the mean R^2 and mean
    alpha over all scale lines found in the file.

    Args:
        files: iterable of paths to summary text files.

    Returns:
        None. The table (header plus one line per matching file) is printed.

    Raises:
        AttributeError: if a matching file has no ``Number of items:`` line.
    """
    header = '\t'.join(['MI', 'IC', 'Items', 'Mean_R2', 'mean_alpha'])
    table = [header]
    for f in files:
        m = re.search(r'mi\=(\d+)_ic\=([0-9\.]+)', f)
        if not m:
            continue
        mi, ic = m.groups()
        # Read via a context manager so the handle is closed promptly.
        with open(f) as fh:
            c = fh.read()
        n_items = int(re.search(r'Number of items:\s(\d+)', c).group(1))
        # One tuple per scale line: (name, no. of items, R^2, alpha).
        scales = re.findall(
            r'^(.*)\s+\((\d+)\s.*R\^2\=([\d\.]+).*alpha\=([\d\.]+)',
            c, re.MULTILINE)
        df = pd.DataFrame(scales, columns=['name', 'no_items', 'R^2', 'alpha'])
        # The regex captures are strings; coerce the two columns that are
        # averaged to numbers (unparseable values become NaN and are ignored
        # by mean()). Replaces DataFrame.convert_objects, which newer pandas
        # releases no longer provide.
        r2 = pd.to_numeric(df['R^2'], errors='coerce')
        alpha = pd.to_numeric(df['alpha'], errors='coerce')
        vals = (int(mi), float(ic), n_items, r2.mean(), alpha.mean())
        line = '%d\t%.2f\t%d\t%.2f\t%.2f' % vals
        table.append(line)
    print('\n'.join(table))


from glob import glob

# glob() returns paths in arbitrary order; sort so the table is reproducible.
files = sorted(glob('abbreviations/*summary.txt'))
format_table(files)

# Load the new data into its own measure
eng_ppi = Measure(X='data/PPI-R_MTurk_data.txt', missing='drop')

# Generate full-length PPI-R scale scores using the original PPI-R scoring key
eng_ppi.score(key='data/PPI-R_scoring_key.txt',
              columns=['B', 'Ca', 'Co', 'F', 'M', 'R', 'So', 'St'],
              rescale=True)

# Get the 40-item scoring key we generated above and use it to abbreviate the
# new measure directly
ppi_40_key = abb_ppi.key
abb_eng = AbbreviatedMeasure(eng_ppi, select=abb_ppi.original_items,
                             key=ppi_40_key)

# Print summary of abbreviated measure to evaluate performance relative to
# full-length measure
print(abb_eng)

import numpy as np


# Add derivative scales to the scoring key
def add_factors(key):
    """Return ``key`` with three derived scale columns appended.

    The appended columns are, in order, the row-wise sums of columns
    3, 6 and 7; of columns 0, 1, 4 and 5; and of all columns. With the scale
    order used in this file (B, Ca, Co, F, M, R, So, St) these correspond to
    F+So+St ('FD'), B+Ca+M+R ('IA') and the total ('Tot') -- this assumes the
    key's columns follow that order; confirm against the scoring key.

    Args:
        key: 2-D NumPy array (items x scales) with at least 8 columns.

    Returns:
        Integer array with the same rows as ``key`` and three extra columns.
    """
    fear_dom = np.sum(key[:, [3, 6, 7]], axis=1)
    imp_anti = np.sum(key[:, [0, 1, 4, 5]], axis=1)
    total = np.sum(key, axis=1)
    # [:, None] turns each 1-D sum into a column so hstack can append it.
    return np.hstack((key, fear_dom[:, None], imp_anti[:, None],
                      total[:, None])).astype(int)


# Load full PPI and score
old_key = add_factors(ppi.key)
full_ppi = Measure(X='data/PPI-R_German_data.txt', missing='drop')
full_ppi.score(key=old_key,
               columns=['B', 'Ca', 'Co', 'F', 'M', 'R', 'So', 'St',
                        'FD', 'IA', 'Tot'],
               rescale=True)

# Abbreviate
new_key = add_factors(abb_ppi.key)
with_factors = AbbreviatedMeasure(full_ppi, select=abb_ppi.original_items,
                                  key=new_key)
print(with_factors)