from IPython.display import HTML
# Markup rendered at the top of the notebook output.
# NOTE(review): the text below looks like a username/password pair -- confirm
# it is placeholder content and not a live credential before sharing.
input_form = """
Ajenti Administration Interface
User: root
Password: admin
"""
# Script portion of the snippet; it currently holds nothing but a newline.
javascript = """
"""
# Wrap the concatenated markup in an IPython HTML display object.
HTML(input_form + javascript)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.finance import candlestick, quotes_historical_yahoo, date2num
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from datetime import datetime, timedelta
# Show up to 50 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 50)
def download_data(symbol, days_delta=30):
    """Fetch recent daily quotes for one ticker from Yahoo! Finance.

    Args:
        symbol: Ticker symbol handed to ``quotes_historical_yahoo``.
        days_delta: Length of the look-back window in days, counted back
            from the moment of the call.

    Returns:
        A DataFrame with the columns ``n_date``, ``open``, ``close``,
        ``high``, ``low`` and ``volume``, one row per quote returned.
    """
    column_names = ["n_date", "open", "close", "high", "low", "volume"]
    # The window ends "now" and starts days_delta days earlier.
    finish_date = datetime.today()
    window = timedelta(days=days_delta)
    raw_quotes = quotes_historical_yahoo(symbol, finish_date - window, finish_date)
    return pd.DataFrame(raw_quotes, columns=column_names)
# First example: fetch Google quotes using download_data's default window.
stocks_df = download_data("GOOG")
# Bare expression -- shows the frame when it is the last line of a notebook cell.
stocks_df
def process_date(stocks_df):
    """Turn the numeric date column into whole ordinals and add real dates.

    The frame is modified in place: ``n_date`` is cast to 32-bit integers
    and a new ``date`` column is filled with ``datetime.fromordinal`` of
    each value.

    Args:
        stocks_df: DataFrame with a numeric ``n_date`` column.

    Returns:
        The same DataFrame object that was passed in.
    """
    ordinals = stocks_df["n_date"].astype(np.int32)
    stocks_df["n_date"] = ordinals
    stocks_df["date"] = ordinals.apply(datetime.fromordinal)
    return stocks_df
# Convert the numeric dates of the Google frame in place and add its "date" column.
process_date(stocks_df)
def calculate_stats(stocks_df):
    """Add derived price statistics to a quotes table, in place.

    Columns added, in this order:

    * ``average``         -- mean of close, high and low.
    * ``change_amount``   -- close minus open.
    * ``change_per``      -- ``change_amount`` relative to ``average``.
    * ``range``           -- high minus low, relative to ``average``.
    * ``change_1_amount`` -- ``average`` minus the previous row's
      ``average``; the first row is 0.0.
    * ``change_1_per``    -- ``change_1_amount`` relative to ``average``.

    Args:
        stocks_df: DataFrame with ``open``, ``close``, ``high`` and ``low``
            columns, one row per period in chronological order.

    Returns:
        The same DataFrame object that was passed in.
    """
    average = (stocks_df["close"] + stocks_df["high"] + stocks_df["low"]) / 3.0
    stocks_df["average"] = average
    stocks_df["change_amount"] = stocks_df["close"] - stocks_df["open"]
    stocks_df["change_per"] = stocks_df["change_amount"] / average
    stocks_df["range"] = (stocks_df["high"] - stocks_df["low"]) / average

    # Row-to-row difference computed positionally.  This avoids chained
    # assignment (which does not write through under copy-on-write) and does
    # not depend on the index containing the label 0.
    day_over_day = average.diff()
    if len(day_over_day):
        # The first row has no predecessor; report "no change" rather than NaN.
        day_over_day.iloc[0] = 0.0
    stocks_df["change_1_amount"] = day_over_day
    stocks_df["change_1_per"] = stocks_df["change_1_amount"] / average
    return stocks_df
# Add the derived statistic columns to the Google frame (modified in place).
calculate_stats(stocks_df)
# Line chart of the "average" column against the numeric dates.
plt.figure(figsize=(10,6))
plt.plot_date(stocks_df["n_date"], stocks_df["average"], fmt="-b", linewidth=3, alpha=.4, label="Average")
# Let matplotlib rotate/align the date tick labels.
plt.gcf().autofmt_xdate()
plt.title("Google Average Stock Price - Daily")
plt.grid()
plt.legend(loc="best")
plt.show();
# Two-axis chart: average price on the left axis, day-over-day relative
# change ("change_1_per") on the right axis.
fig = plt.figure(figsize=(10,6))
axes_1 = fig.add_subplot(111)
# twinx() creates a second y-axis that shares the x-axis of axes_1.
axes_2 = axes_1.twinx()
line1 = axes_1.plot_date(stocks_df["n_date"], stocks_df["average"], "o-b",
linewidth=3, alpha=.4, label="Average")
line2 = axes_2.plot_date(stocks_df["n_date"], stocks_df["change_1_per"], ".-r",
linewidth=2, alpha=.4, label="Change from 1 day")
# Thin black zero line on the change axis as a visual reference.
axes_2.plot_date(stocks_df["n_date"], np.zeros(len(stocks_df)), fmt="-k", linewidth=1, alpha=.4)
plt.gcf().autofmt_xdate()
axes_1.set_title("Google Average Stock Price - Daily")
axes_1.grid()
# Each axis carries its own legend, placed in different corners.
axes_1.legend(loc="upper right")
axes_2.legend(loc="lower right")
plt.show();
# Candlestick chart of the Google quotes.
fig = plt.figure(figsize=(10,6))
axes = fig.add_subplot(111)
# The quote columns are passed in the order n_date, open, close, high, low;
# rising candles are drawn green and falling ones red.
candlestick(axes, stocks_df[["n_date", "open", "close", "high", "low"]].values,
width=0.6, colorup='g', colordown='r')
# Interpret the x values as dates when formatting the axis.
axes.xaxis_date()
plt.gcf().autofmt_xdate()
plt.grid()
plt.show();
# Candlestick chart (left axis) overlaid with the day-over-day relative
# change on a second y-axis that shares the same x-axis.
fig = plt.figure(figsize=(10,6))
axes_1 = fig.add_subplot(111)
axes_2 = axes_1.twinx()
candlestick(axes_1, stocks_df[["n_date", "open", "close", "high", "low"]].values,
width=0.6, colorup='g', colordown='r')
axes_2.plot_date(stocks_df["n_date"], stocks_df["change_1_per"], "o--b", linewidth=3, alpha=0.4)
# Interpret the x values as dates when formatting the axis.
axes_1.xaxis_date()
plt.gcf().autofmt_xdate()
axes_1.grid()
plt.show();
# Lookup table of the stocks to analyse: ticker symbol -> company name.
# The keys drive the downloads below; the names are used when cluster
# members are printed.
stock_dict={"GOOG": "Google",
"FB": "Facebook, Inc.",
"AAPL": "Apple Inc.",
"MSFT": "Microsoft Corporation",
"HPQ": "Hewlett-Packard Company",
"INTC": "Intel Corporation",
"NVDA": "NVIDIA Corporation",
"TXN": "Texas Instruments Incorporated",
"IBM": "International Business Machines Corp. (IBM)",
"SAP": "SAP SE (ADR)",
"ADBE": "Adobe Systems Incorporated",
"ADSK": "Autodesk, Inc.",
"CRM": "salesforce.com, inc.",
"N": "NetSuite Inc",
"VMW": "VMware, Inc.",
"CTXS": "Citrix Systems, Inc.",
"RHT": "Red Hat Inc",
"RAX": "Rackspace Hosting, Inc.",
"AMZN": "Amazon.com, Inc.",
"NWSA": "News Corp",
"EBAY": "eBay Inc",
"CBS": "CBS Corporation",
"CMCSA": "Comcast Corporation",
"VIAB": "Viacom, Inc.",
"NFLX": "Netflix, Inc.",
"TWX": "Time Warner Inc",
"FOXA": "Twenty-First Century Fox Inc",
"NYT": "The New York Times Company",
"TRI": "Thomson Reuters Corporation (USA)",
"DIS": "The Walt Disney Company",
"SNE": "Sony Corp (ADR)",
"PCRFY": "Panasonic Corporation (ADR)",
"CAJ": "Canon Inc (ADR)",
"TOSYY": "Toshiba Corp (USA)",
"BBRY": "BlackBerry Ltd",
"CSC": "Computer Sciences Corporation",
"GE": "General Electric Company",
"HTHIY": "Hitachi, Ltd. (ADR)",
"SIEGY": "Siemens AG (ADR)",
"CVX": "Chevron Corporation",
"XOM": "Exxon Mobil Corporation",
"BP": "BP plc (ADR)",
"CAT": "Caterpillar Inc.",
"LXK": "Lexmark International Inc",
"BKS": "Barnes & Noble, Inc.",
"FJTSY": "Fujitsu Ltd (ADR)",
"EMC": "EMC Corporation",
"ORCL": "Oracle Corporation",
"CSCO": "Cisco Systems, Inc.",
"XRX": "Xerox Corp",
}
# Build a two-column table (ticker symbol, company name) from stock_dict.
# keys()/values() are materialised as lists: on Python 3 they are views,
# which are not reliable inputs for DataFrame construction/column assignment.
symbols = list(stock_dict.keys())
names = list(stock_dict.values())
stocks_data = pd.DataFrame({"symbol": symbols, "name": names}, columns=["symbol", "name"])
# Bare expression -- shows the table when it is the last line of a notebook cell.
stocks_data
# Download, date-convert and annotate the quotes of every listed symbol,
# then stack the per-symbol frames into one long table.
temp_list = []
for symbol in stocks_data["symbol"]:
    # process_date and calculate_stats both modify the frame in place and
    # return it, so the calls can be chained.
    temp_data = calculate_stats(process_date(download_data(symbol)))
    temp_data["symbol"] = symbol
    temp_list.append(temp_data)
stocks_df = pd.concat(temp_list)
stocks_df
# One translucent line per symbol: day-over-day relative change over time.
fig = plt.figure(figsize=(10,6))
axes = fig.add_subplot(111)
for symbol in stocks_df["symbol"].unique():
    # Select this symbol's rows once and read both plotted columns from them.
    symbol_rows = stocks_df[stocks_df["symbol"] == symbol]
    axes.plot_date(x=symbol_rows["n_date"], y=symbol_rows["change_1_per"],
                   fmt="-", alpha=0.2, linewidth=2)
axes.xaxis_date()
plt.gcf().autofmt_xdate()
plt.grid()
plt.show();
def pivot_data(stocks_df, values="change_1_per"):
    """Reshape the long quotes table into one row per symbol.

    Args:
        stocks_df: DataFrame with ``symbol`` and ``n_date`` columns plus the
            column named by ``values``.
        values: Name of the column whose entries fill the resulting table.

    Returns:
        A DataFrame indexed by ``symbol`` with one column per ``n_date``.
    """
    return stocks_df.pivot(index="symbol", columns="n_date", values=values)
# Pivot the day-over-day change into one row per symbol and plot every row.
clustering_data = pivot_data(stocks_df, values="change_1_per")
clustering_data
for symbol_series in clustering_data.values:
    plt.plot(symbol_series)
plt.show();
# Same plot after scaling each row with sklearn's normalize (row-wise, axis=1).
norm_data = pd.DataFrame(normalize(clustering_data.values, axis=1))
for symbol_series in norm_data.values:
    plt.plot(symbol_series)
plt.show();
def cluster_data(data, n_clusters=8, normalize_data=False, random_state=None):
    """Cluster the rows of ``data`` with K-Means.

    Args:
        data: Table with one row per item to cluster.  Must expose
            ``.values`` when ``normalize_data`` is true.
        n_clusters: Number of clusters requested from ``KMeans``.
        normalize_data: When true, each row is scaled to unit L2 norm before
            clustering.
        random_state: Forwarded to ``KMeans``; pass an int to make the
            clustering reproducible.  ``None`` keeps the unseeded behaviour.

    Returns:
        A tuple ``(prediction, cluster_model, data)``: the cluster label per
        row, the fitted model, and the data that was actually clustered.
        The last element is a NumPy array when ``normalize_data`` is true
        and the unchanged input otherwise.
    """
    if normalize_data:
        data = normalize(data.values, norm='l2', axis=1, copy=True)
    cluster_model = KMeans(n_clusters=n_clusters, random_state=random_state)
    prediction = cluster_model.fit_predict(data)
    return prediction, cluster_model, data
# Cluster the normalised day-over-day change series into eight groups.
prediction, model, data = cluster_data(clustering_data, n_clusters=8, normalize_data=True)
# print() with a single pre-formatted argument behaves the same on Python 2 and 3.
print("Cluster Count: %s" % len(np.unique(prediction)))
# Attach each symbol's cluster label to the pivoted table.
clustering_data["Cluster"] = prediction
clustering_data
def visualize_clusters(data_df, values="change_1_per", n_clusters=8, normalize_data=False):
    """Cluster the symbols on one feature and plot the result.

    Draws one figure with every cluster centre, then one figure per cluster
    showing its centre together with the series of its member symbols, and
    prints the members of each cluster.

    Args:
        data_df: Long quotes table accepted by ``pivot_data``.
        values: Column to cluster on.
        n_clusters: Number of clusters to request.
        normalize_data: Forwarded to ``cluster_data``.

    Returns:
        A tuple ``(prediction, model, c_data)``: the label per symbol, the
        fitted model, and the clustered data as a DataFrame indexed by
        symbol with an added ``Cluster`` column.
    """
    data = pivot_data(data_df, values)
    prediction, model, c_data = cluster_data(data, n_clusters=n_clusters, normalize_data=normalize_data)
    # cluster_data may hand back a bare array; restore the symbol/date labels.
    c_data = pd.DataFrame(c_data, index=data.index, columns=data.columns)
    c_data["Cluster"] = prediction
    cluster_ids = np.unique(prediction)

    # Overview: all cluster centres on one figure.
    plt.figure()
    for cluster in cluster_ids:
        plt.plot(model.cluster_centers_[cluster], "o-", alpha=0.5, linewidth=2)
    plt.show()

    # Detail: one figure per cluster with its centre and member series.
    for cluster in cluster_ids:
        members = c_data[c_data["Cluster"] == cluster]
        print("Cluster: %s" % cluster)
        print("Members: %s" % ["%s: %s" % (symbol, stock_dict[symbol]) for symbol in list(members.index)])
        plt.figure()
        plt.title("Cluster#: %s" % cluster)
        plt.plot(model.cluster_centers_[cluster], "o--", alpha=0.5, linewidth=2)
        # axis is passed by keyword: the positional form is not accepted by
        # current pandas releases.
        for member_series in members.drop("Cluster", axis=1).values:
            plt.plot(member_series, alpha=0.2, linewidth=2)
        plt.grid()
        plt.show()
    return prediction, model, c_data
# Cluster the symbols on the normalised "average" price into three groups and plot them.
prediction, model, c_data = visualize_clusters(stocks_df, values="average", n_clusters=3, normalize_data=True);
def measure_error(prediction, model, c_data):
    """Score a clustering by distance to the centres and by singleton clusters.

    For every row the mean absolute difference between the row and its
    cluster centre is divided by the mean of that centre; the first return
    value is the average of these ratios over all rows.

    Args:
        prediction: Unused; kept so existing callers keep working.
        model: Fitted clustering model exposing ``cluster_centers_``.
        c_data: DataFrame of the clustered features plus a ``Cluster``
            column holding each row's cluster label.

    Returns:
        A tuple ``(average_error, single_member_clusters)`` where the second
        element is the number of clusters that contain exactly one row.
    """
    # Drop the label column once (axis by keyword) instead of on every row.
    features = c_data.drop("Cluster", axis=1).values
    # Read labels positionally; c_data is indexed by symbol, so integer
    # lookups on the Series itself are not reliable.
    labels = np.asarray(c_data["Cluster"].values, dtype=int)
    centers = np.asarray(model.cluster_centers_)[labels]
    error_score = np.abs(features - centers).mean(axis=1) / centers.mean(axis=1)
    cluster_counts = c_data["Cluster"].value_counts()
    return np.average(error_score), len(cluster_counts[cluster_counts == 1])
# Score the three-cluster result produced by the visualize_clusters call.
measure_error(prediction, model, c_data)
from IPython.html import widgets
from IPython.html.widgets import interact
def visualize_clusters_widget(values="change_1_per", n_clusters=8, normalize_data=False):
    """Widget callback: cluster the global ``stocks_df`` and print its score.

    Args:
        values: Column to cluster on.
        n_clusters: Number of clusters to request.
        normalize_data: Forwarded to ``visualize_clusters``.
    """
    prediction, model, c_data = visualize_clusters(data_df=stocks_df,
                                                   values=values,
                                                   n_clusters=n_clusters,
                                                   normalize_data=normalize_data)
    # print() with one argument gives the same output on Python 2 and 3.
    print(measure_error(prediction, model, c_data))
# Interactive controls for the clustering view.  Each keyword must match a
# parameter name of visualize_clusters_widget, hence normalize_data.
interact(visualize_clusters_widget,
         values=["change_1_per","close","average","change_per"],
         n_clusters=(2,50),
         normalize_data=False
         );
# Sweep the number of clusters and chart, with and without normalisation,
# the average error and the number of single-member ("failed") clusters.
max_clusters = 30
feature = "average"
clustering_data = pivot_data(stocks_df, values=feature)
# Keep an untouched copy of the features so the label column never has to be
# dropped again inside the loops.
features = clustering_data.copy()
# Placeholder label column (all NaN).
clustering_data["Cluster"] = np.nan
for normalize_data in [True, False]:
    fig = plt.figure(figsize=(10,6))
    axes_1 = fig.add_subplot(111)
    axes_2 = axes_1.twinx()
    # Set the title on the axes that is actually drawn on, rather than via
    # pyplot before any axes exist.
    axes_1.set_title("K-Means - Feature: %s Normalized: %s" % (feature, normalize_data))
    score_error_list = []
    failed_clusters_list = []
    for n_clusters in range(2,max_clusters):
        prediction, model, data = cluster_data(features, n_clusters=n_clusters,
                                               normalize_data=normalize_data)
        # assign() works on a copy, so the shared features table is never
        # given a label column.
        data = pd.DataFrame(data, index=features.index,
                            columns=features.columns).assign(Cluster=prediction)
        score_error, failed_clusters = measure_error(prediction, model, data)
        score_error_list.append(score_error)
        failed_clusters_list.append(failed_clusters)
    axes_1.plot(range(2,max_clusters), score_error_list, "ro-", label = "Average Error")
    axes_2.plot(range(2,max_clusters), failed_clusters_list, "bo-", label = "Failed Cluster")
    axes_1.grid()
    axes_1.legend(loc = "lower center")
    axes_2.legend(loc = "upper center")
    axes_1.set_ylabel("Average Error")
    axes_2.set_ylabel("Failed Cluster")
    axes_1.set_xlabel("Clusters")
    plt.show()