import csv
import StringIO

from bigml.api import BigML

# You need to define BIGML_USERNAME and BIGML_API_KEY in your environment
# settings, or pass them explicitly here:
# api = BigML(username, api_key)
api = BigML()

# Create the diabetes source by fetching the data from S3. Note that we wrap
# each of these API calls with api.check_resource so that every step runs
# synchronously.
diabetes_source = api.check_resource(
    api.create_source("s3://bigml-public/arff/diabetes.arff",
                      {"name": "Diabetes"}),
    api.get_source)

# Create the dataset, excluding the "weight" field, which has id 000009.
diabetes_dataset = api.check_resource(
    api.create_dataset(diabetes_source, {"excluded_fields": ["000009"]}),
    api.get_dataset)

# Create the "random" sample: a sample_rate of 1 over rows 1 through 5, i.e.
# the first five instances of the dataset.
diabetes_random_sample = api.check_resource(
    api.create_dataset(diabetes_dataset,
                       {"sample_rate": 1,
                        "range": [1, 5],
                        "name": "Diabetes Random Sample"}),
    api.get_dataset)

# Now we create the model using the random sample.
diabetes_random_model = api.check_resource(
    api.create_model(diabetes_random_sample),
    api.get_model)

# And finally, evaluate the performance of the random sample model against
# the entire dataset.
diabetes_random_eval = api.check_resource(
    api.create_evaluation(diabetes_random_model, diabetes_dataset),
    api.get_evaluation)

# Here the Active Learning demonstration starts. The first step is to cluster
# the instances in the diabetes dataset. However, we have to ignore the class
# field (which we want to predict) by scaling it to 0.
diabetes_cluster = api.check_resource(
    api.create_cluster(diabetes_dataset,
                       {"k": 5, "field_scales": {"class": 0}}),
    api.get_cluster)

# Assign a centroid and a distance to each datapoint in the dataset using the
# cluster. This generates a CSV, which we will fetch next. Note that we keep
# all fields in the CSV output, as well as the header and the distance score.
# This will allow us to sample from this CSV to create a new
# source/dataset/model.
diabetes_batchcentroid = api.check_resource(
    api.create_batch_centroid(diabetes_cluster, diabetes_dataset,
                              {"all_fields": True,
                               "header": True,
                               "distance": True,
                               "distance_name": "distance"}),
    api.get_batch_centroid)

# Fetch the centroid scores and use the csv module to parse them into
# dictionaries. This dataset is small, so we do all the CSV steps in memory
# rather than write to disk and worry about handling files.
centroid_scores = csv.DictReader(
    api.download_batch_centroid(diabetes_batchcentroid))
centroid_samples = {}

# Examine each row in the CSV and build up a dictionary mapping the cluster
# name (e.g. "Cluster 0") to a sampled row. The idea is to find the row (that
# is, the instance) closest to the centroid of each cluster. All we need to
# do is compare the value in the "distance" field and keep the smallest.
# DictReader yields strings, so we compare the distances as floats.
for row in centroid_scores:
    if row['cluster'] not in centroid_samples:
        centroid_samples[row['cluster']] = row
    elif float(row['distance']) < float(centroid_samples[row['cluster']]['distance']):
        centroid_samples[row['cluster']] = row
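# (Optional sanity check, not part of the original flow: print which instance
# was picked for each cluster and its distance to the centroid, so you can
# eyeball the selection before uploading it.)
for cluster_name in sorted(centroid_samples):
    print "%s -> distance %s" % (cluster_name,
                                 centroid_samples[cluster_name]['distance'])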
# Rather than write to disk, we create an in-memory CSV using StringIO, and
# write a header using the field names from the centroid scores CSV we
# downloaded previously.
CSV = StringIO.StringIO()
writer = csv.DictWriter(CSV, centroid_scores.fieldnames)
writer.writeheader()

# For each of the rows with the minimum distance in centroid_samples, write
# the row to the in-memory CSV. These will be our intelligently chosen
# samples.
for cluster in centroid_samples:
    writer.writerow(centroid_samples[cluster])

# Rewind the buffer so the upload reads it from the beginning.
CSV.seek(0)

# Now we repeat the source/dataset/model/eval steps for the cluster samples.
# First step: create the source from the in-memory CSV.
diabetes_cluster_source = api.check_resource(
    api.create_source(CSV, {"name": "Diabetes Cluster Sample"}),
    api.get_source)

# Now create the dataset. We need to drop the cluster assignment and distance
# fields.
diabetes_cluster_sample = api.check_resource(
    api.create_dataset(diabetes_cluster_source,
                       {"excluded_fields": ["000009", "00000a"],
                        "name": "Diabetes Cluster Sample"}),
    api.get_dataset)

# Model the cluster sample dataset.
diabetes_cluster_model = api.check_resource(
    api.create_model(diabetes_cluster_sample),
    api.get_model)

# Last step: evaluate this model against the original dataset.
diabetes_cluster_eval = api.check_resource(
    api.create_evaluation(diabetes_cluster_model, diabetes_dataset),
    api.get_evaluation)

# Simple comparison of the accuracy. It's worth comparing these evaluations
# side-by-side in the bigml.com UI; all five metrics should be significantly
# better for the cluster sample.
print "Random Sample Accuracy: %s" % \
    diabetes_random_eval['object']['result']['model']['accuracy']
print "Cluster Sample Accuracy: %s" % \
    diabetes_cluster_eval['object']['result']['model']['accuracy']
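# To compare all five metrics (not just accuracy) without opening the UI,
# something like the sketch below should work. The metric keys are the ones
# BigML normally reports for classification evaluations; treat them as an
# assumption and adjust if your API version names them differently.
for metric in ["accuracy", "average_precision", "average_recall",
               "average_f_measure", "average_phi"]:
    print "%s: random=%s cluster=%s" % (
        metric,
        diabetes_random_eval['object']['result']['model'][metric],
        diabetes_cluster_eval['object']['result']['model'][metric])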