import bigml, csv, StringIO
from bigml.api import BigML
# Credentials: define BIGML_USERNAME and BIGML_API_KEY in your environment
# settings, or pass them explicitly instead:
# api = BigML(username, api_key)
api = BigML()

# Every remote call below is wrapped in api.check_resource so that each step
# runs synchronously before the next one starts.

# Step 1: create the diabetes source by fetching the ARFF file from S3.
source_options = {"name": "Diabetes"}
pending_source = api.create_source(
    "s3://bigml-public/arff/diabetes.arff", source_options)
diabetes_source = api.check_resource(pending_source, api.get_source)

# Step 2: create the dataset, leaving out the "weight" field (id 000009).
dataset_options = {"excluded_fields": ["000009"]}
pending_dataset = api.create_dataset(diabetes_source, dataset_options)
diabetes_dataset = api.check_resource(pending_dataset, api.get_dataset)
# Baseline: a random sample drawn over the entire dataset (sample_rate 1)
# restricted to the range 1 to 5.
random_sample_options = {
    "sample_rate": 1,
    "range": [1, 5],
    "name": "Diabetes Random Sample",
}
pending_random_sample = api.create_dataset(
    diabetes_dataset, random_sample_options)
diabetes_random_sample = api.check_resource(
    pending_random_sample, api.get_dataset)

# Train a model on the random sample.
pending_random_model = api.create_model(diabetes_random_sample)
diabetes_random_model = api.check_resource(
    pending_random_model, api.get_model)

# Evaluate the random-sample model against the entire dataset.
pending_random_eval = api.create_evaluation(
    diabetes_random_model, diabetes_dataset)
diabetes_random_eval = api.check_resource(
    pending_random_eval, api.get_evaluation)
# Active Learning demonstration starts here.
# First, cluster the instances of the diabetes dataset into k=5 clusters. The
# class field (the one we want to predict) must not influence the clustering,
# so it is given a scale of 0.
cluster_options = {
    "k": 5,
    "field_scales": {"class": 0},
}
pending_cluster = api.create_cluster(diabetes_dataset, cluster_options)
diabetes_cluster = api.check_resource(pending_cluster, api.get_cluster)

# Next, assign a centroid and a distance to every datapoint of the dataset.
# The result is a CSV that is fetched in the following step. All fields, the
# header and the distance score (in a column named "distance") are kept so
# the CSV can be sampled to create a new source/dataset/model.
batch_centroid_options = {
    "all_fields": True,
    "header": True,
    "distance": True,
    "distance_name": "distance",
}
pending_batch_centroid = api.create_batch_centroid(
    diabetes_cluster, diabetes_dataset, batch_centroid_options)
diabetes_batchcentroid = api.check_resource(
    pending_batch_centroid, api.get_batch_centroid)
# Fetch the centroid scores and parse them with the csv module into one
# dictionary per row. The dataset is small, so all CSV handling is done in
# memory instead of writing to disk and managing files.
centroid_scores = csv.DictReader(api.download_batch_centroid(diabetes_batchcentroid))

# Map each cluster name (Ex: "Cluster 0") to the row (instance) that lies
# closest to that cluster's centroid, i.e. the row with the smallest value in
# the "distance" field.
centroid_samples = {}
for row in centroid_scores:
    cluster_name = row["cluster"]
    closest = centroid_samples.get(cluster_name)
    # csv.DictReader yields every value as a string, so the distances must be
    # converted to float before comparing; comparing the raw strings would be
    # lexicographic (e.g. "10.2" < "9.5") and could keep the wrong row.
    # The stored rows themselves are left untouched (still all strings).
    if closest is None or float(row["distance"]) < float(closest["distance"]):
        centroid_samples[cluster_name] = row
# Rather than write to disk, build an in-memory CSV using StringIO. The header
# uses the field names of the centroid_scores CSV downloaded previously.
CSV = StringIO.StringIO()
# Named sample_writer (not "buffer") to avoid shadowing the Python 2 builtin.
sample_writer = csv.DictWriter(CSV, centroid_scores.fieldnames)
sample_writer.writeheader()
# Write the minimum-distance row of every cluster to the in-memory CSV.
# These are our intelligently chosen samples.
sample_writer.writerows(centroid_samples.values())
# After writing, the stream position is at the end of the data. Rewind so that
# an uploader reading from the current position sees the whole CSV rather than
# an empty stream.
CSV.seek(0)

# Now we repeat the source/dataset/model/eval steps for the cluster samples.
# First step: create the source from the in-memory CSV.
diabetes_cluster_source = api.check_resource(
    api.create_source(CSV, {"name": "Diabetes Cluster Sample"}),
    api.get_source)
# Build the dataset for the cluster sample. The cluster assignment and
# distance fields (ids 000009 and 00000a) have to be dropped.
cluster_sample_options = {
    "excluded_fields": ["000009", "00000a"],
    "name": "Diabetes Cluster Sample",
}
pending_cluster_sample = api.create_dataset(
    diabetes_cluster_source, cluster_sample_options)
diabetes_cluster_sample = api.check_resource(
    pending_cluster_sample, api.get_dataset)

# Train a model on the cluster sample dataset.
pending_cluster_model = api.create_model(diabetes_cluster_sample)
diabetes_cluster_model = api.check_resource(
    pending_cluster_model, api.get_model)

# Finally, evaluate that model against the original dataset.
pending_cluster_eval = api.create_evaluation(
    diabetes_cluster_model, diabetes_dataset)
diabetes_cluster_eval = api.check_resource(
    pending_cluster_eval, api.get_evaluation)
# Report the accuracy of both evaluations. Viewing the two evaluations
# side-by-side in the bigml.com UI is worthwhile as well; the original author
# expected all 5 metrics to be significantly better for the cluster sample.
random_accuracy = diabetes_random_eval['object']['result']['model']['accuracy']
print("Random Sample Accuracy: %s" % random_accuracy)
cluster_accuracy = diabetes_cluster_eval['object']['result']['model']['accuracy']
print("Cluster Sample Accuracy: %s" % cluster_accuracy)
# Example output:
# Random Sample Accuracy: 0.65495
# Cluster Sample Accuracy: 0.73568