# Apple product releases vs. AAPL stock price (IPython notebook export, Python 2).
#
# The script scrapes Wikipedia's "Timeline of Apple Inc. products", loads the
# releases into a pandas DataFrame, downloads daily quotes around each release
# from the Yahoo CSV endpoint and plots the price movement with a hover tooltip.
#
# NOTE(review): the variables `qtype`, `singleProduct`, `family` (as a string)
# and `timeRange` read near the end of the file are not assigned anywhere in
# this file; presumably they are injected into the kernel by the notebook's
# JavaScript form (see the `IPython.notebook.kernel.execute` fragment below).

# Equivalent of the notebook line magic ``%gui wx``, spelled as a call so the
# file is syntactically valid Python. Requires a running IPython session.
get_ipython().magic('gui wx')

import calendar
import datetime
import random
import re
import urllib
import urllib2

# The backend must be selected before pylab/pyplot is imported, otherwise the
# request is ignored. NOTE: `plt` is the top-level matplotlib package here
# (kept under its original alias), not matplotlib.pyplot.
import matplotlib as plt
plt.use('WXAgg')
plt.interactive(False)

import numpy as np
import pandas
import pylab as pl
import wx
from bs4 import BeautifulSoup
from IPython.core.display import HTML
from pylab import get_current_fig_manager as gcfm

_MONTHS = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
           'August', 'September', 'October', 'November', 'December']

# Lower bound applied to every date range ("earliest borderline check").
# NOTE(review): presumably the first day for which quotes exist -- confirm.
_EARLIEST_RANGE_START = datetime.date(1985, 9, 2)

_YAHOO_CSV_URL = ("http://ichart.yahoo.com/table.csv"
                  "?s=%s&a=%i&b=%i&c=%i&d=%i&e=%i&f=%i&g=%s&ignore=.csv")


def _dateFromDatetime(day):
    """Build a Date from a datetime.date using the 'M-D-YYYY' text form."""
    return Date('%d-%d-%d' % (day.month, day.day, day.year))


class Date:
    """Calendar date parsed from 'Mon D, YYYY' or 'M-D-YYYY' text.

    Attributes:
        m, d, y: month, day and year as ints (d may be 0, meaning the day
            was not given by the source).
        date: the 'M-D-YYYY' string form.
    """

    def __init__(self, date):
        if re.search(r'[a-zA-Z]{3} [0-9]{1,2}, [0-9]{4}', date):
            splitDate = date.split(' ')
            threeLetterMonths = [month[:3] for month in _MONTHS]
            # Only the first three letters are compared, so both 'Sep' and
            # 'September' are accepted.
            self.m = threeLetterMonths.index(splitDate[0][:3]) + 1
            # Always assign the day; the original left self.d unset when the
            # day token carried no comma.
            self.d = int(splitDate[1].replace(',', ''))
            self.y = int(splitDate[2])
            self.date = '%d-%d-%d' % (self.m, self.d, self.y)
        else:
            splitDate = date.split('-')
            self.date = date
            self.m = int(splitDate[0])
            self.d = int(splitDate[1])
            self.y = int(splitDate[2])

    def __repr__(self):
        return str(self.date)

    def dateRange(self, daysPadding):
        """Return (start Date, end Date) spanning daysPadding days each side.

        The start is clamped to 1985-09-02 (the end then lies two paddings
        after it). A start falling on a weekend is moved back to Friday and
        an end falling on a weekend is moved forward to Monday.

        Side effect: a day of 0 is replaced by 1 on this object (self.date is
        left untouched).
        """
        if self.d == 0:
            # Day 0 means the day was not indicated on the Wikipedia page.
            self.d = 1
        center = datetime.date(int(self.y), int(self.m), int(self.d))
        padding = datetime.timedelta(days=daysPadding)
        begin = center - padding
        end = center + padding

        if begin < _EARLIEST_RANGE_START:
            begin = _EARLIEST_RANGE_START
            end = begin + padding + padding

        # date.weekday(): Monday == 0 ... Saturday == 5, Sunday == 6.
        beginWeekday = begin.weekday()
        if beginWeekday == 6:
            begin = begin - datetime.timedelta(days=2)
        elif beginWeekday == 5:
            begin = begin - datetime.timedelta(days=1)

        endWeekday = end.weekday()
        if endWeekday == 6:
            end = end + datetime.timedelta(days=1)
        elif endWeekday == 5:
            end = end + datetime.timedelta(days=2)

        return (_dateFromDatetime(begin), _dateFromDatetime(end))

    def numericDate(self):
        """Return the date as the sortable integer YYYYMMDD."""
        return self.y * 10000 + self.m * 100 + self.d


def parse_yahoo_stock(line):
    """Parse one CSV row of the Yahoo quote table into a dict.

    Columns are taken in the order Date, Open, High, Low, Close, Volume,
    Adj Close. Date stays a string, Volume becomes an int and the prices
    become floats. Columns beyond the seventh are ignored; a short row yields
    only the keys that are present.
    """
    legends = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
    parts_dict = {}
    for legend, raw in zip(legends, line.split(',')):
        if legend == 'Volume':
            # The original converted Volume to int and then immediately
            # overwrote it with the raw string; keep the int.
            parts_dict[legend] = int(raw)
        elif legend == 'Date':
            parts_dict[legend] = raw
        else:
            parts_dict[legend] = float(raw)
    return parts_dict


def _toNumericDateString(date):
    """Convert 'Month D YYYY' to 'M-D-YYYY', or 'Month YYYY' to 'M-00-YYYY'.

    A trailing character is dropped from the second token when it contains a
    comma. Raises ValueError for an unknown month name and IndexError when
    the text has no space.
    """
    dateSplit = str(date).split(' ')
    if ',' in dateSplit[1]:
        dateSplit[1] = dateSplit[1][:-1]
    month = str(_MONTHS.index(dateSplit[0]) + 1)
    if len(dateSplit) == 3:
        return month + '-' + dateSplit[1] + '-' + dateSplit[2]
    # No day given: encode it as day 00.
    return month + '-' + '00' + '-' + dateSplit[1]


def getProductReleasesForApple():
    """Scrape Apple product releases from Wikipedia.

    Returns:
        dict mapping product name to [release Date, family, discontinued
        text], built from every 'wikitable' inside the page's bodyContent div.
    """
    article = urllib.quote("Timeline of Apple Inc. products")  # sanitize
    opener = urllib2.build_opener()
    # wikipedia blocks obvious bot attempts
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    resource = opener.open("http://en.wikipedia.org/wiki/" + article)
    try:
        data = resource.read()
    finally:
        resource.close()

    soup = BeautifulSoup(data)
    bodyContent = soup.find('div', id="bodyContent")
    wikitables = bodyContent.find_all('table', class_="wikitable")

    products = {}
    for wikitable in wikitables:
        isFirstRow = True
        year = ''
        date = ''
        productName = ''
        family = ''
        deathDate = ''
        for tr in wikitable.find_all('tr'):
            # Each row is re-parsed on its own, as in the original code.
            tr = BeautifulSoup(str(tr))
            tds = tr.find_all('td')
            if len(tds) == 0:
                # Header-only rows carry no product data.
                continue

            if isFirstRow:
                # First data row of a table: year, date, name, family,
                # discontinued.
                for count, td in enumerate(tds):
                    if count == 0:
                        # NOTE(review): the source read
                        # re.search('[0-9]+', ...) followed by year[3:-4],
                        # which always produces '' for a 4-digit year. The
                        # slice lengths match '<b>' and '</b>', so the
                        # pattern was presumably '<b>[0-9]+</b>' before
                        # markup was stripped from this file -- confirm
                        # against the live page.
                        m = re.search('<b>([0-9]+)</b>', str(td))
                        year = m.group(1)
                    elif count == 1:
                        date = td.text + ' ' + year
                    elif count == 2:
                        productName = td.text
                    elif count == 3:
                        family = td.text
                    elif count == 4:
                        deathDate = td.text
                isFirstRow = False
                products[productName] = [Date(_toNumericDateString(date)),
                                         family, deathDate]
            elif len(tds) == 3:
                # Same release date as the previous row: name, family,
                # discontinued.
                productName, family, deathDate = [td.text for td in tds]
                products[str(productName)] = [
                    Date(_toNumericDateString(date)),
                    str(family), str(deathDate)]
            elif len(tds) == 4:
                # New date within the same year: date, name, family,
                # discontinued.
                dateText, productName, family, deathDate = [
                    td.text for td in tds]
                date = dateText + ' ' + year
                products[str(productName)] = [
                    Date(_toNumericDateString(date)),
                    str(family), str(deathDate)]
    return products


def _timelineColumns(sortedTimeline):
    """Split sorted (name, [Date, family, discontinued]) items into columns.

    Returns (productName, family, releaseDate, discontinueDate) lists.
    """
    names = [item[0] for item in sortedTimeline]
    releases = [item[1][0] for item in sortedTimeline]
    families = [item[1][1] for item in sortedTimeline]
    discontinued = [item[1][2] for item in sortedTimeline]
    return names, families, releases, discontinued


# The page is fetched once; the original downloaded it three times.
timeline = getProductReleasesForApple()
sortedTimeline = sorted(timeline.items(),
                        key=lambda tup: tup[1][0].numericDate())
productName, family, releaseDate, discontinueDate = _timelineColumns(
    sortedTimeline)

print(productName[:10])
print('')
print(family[:10])


class Query:
    """Stock impact of a selection of product releases.

    queryType is one of "timerange", "family" or "product":
        timerange: arg is a (start Date, end Date) pair; releases strictly
            between the padded bounds are selected.
        family:    arg is a value of the 'Family' column.
        product:   arg is a product name (DataFrame index value).

    Reads the module-level timelineDataFrame and downloads quotes during
    construction. Raises ValueError for an unknown queryType.
    """

    def __init__(self, queryType, arg, daysPadding=5):
        self.daysPadding = daysPadding
        self.symbol = "AAPL"
        self.queryType = queryType
        self.arg = arg

        if queryType == "timerange":
            self.startDate = arg[0].dateRange(daysPadding)[0]
            self.endDate = arg[1].dateRange(daysPadding)[1]
            lower = self.startDate.numericDate()
            upper = self.endDate.numericDate()
            criterion = timelineDataFrame['Release Date'].map(
                lambda date: lower < date.numericDate() < upper)
            self.dataFrame = timelineDataFrame[criterion]
        elif queryType == "family":
            criterion = timelineDataFrame['Family'] == arg
            self.dataFrame = timelineDataFrame[criterion]
            releaseDates = self.dataFrame['Release Date']
            self.startDate = releaseDates[0].dateRange(daysPadding)[0]
            self.endDate = releaseDates[-1].dateRange(daysPadding)[1]
        elif queryType == "product":
            self.dataFrame = timelineDataFrame[timelineDataFrame.index == arg]
            dateRange = self.dataFrame['Release Date'][0].dateRange(
                daysPadding)
            self.startDate = dateRange[0]
            self.endDate = dateRange[1]
        else:
            # The original printed this message and then failed later with
            # an AttributeError; fail immediately instead.
            raise ValueError("invalid queryType")
        self.setStockData()

    def plotIndividualStockDifferences(self):
        """Plot the per-product price difference with hover tooltips."""
        # Keep a reference for the duration of pl.show() so the hover
        # handler owned by this object stays alive.
        plot = mouseHoverPlot(self.dataFrame['Individual Stock Difference'],
                              self.dataFrame, 'Individual Stock Difference')
        pl.ylabel('Stock Difference over %d day(s) in dollars'
                  % self.daysPadding)
        pl.xlabel('Product Names')
        pl.show()

    def plotSlopeChanges(self):
        """Plot the per-product slope change with hover tooltips."""
        # See plotIndividualStockDifferences for why `plot` is kept.
        plot = mouseHoverPlot(self.dataFrame['Stock Slope Change'],
                              self.dataFrame, 'Stock Slope Change')
        pl.ylabel('Stock slope changes over %d day(s) in dollars'
                  % self.daysPadding)
        pl.xlabel('Product Names')
        pl.show()

    def _closeDifference(self, startDate, endDate):
        """Download daily quotes for [startDate, endDate] and diff the closes.

        Returns the 'Close' of the first data row minus the 'Close' of the
        last data row of the CSV (the code calls them "end" and "start";
        presumably the table is ordered newest first -- confirm).
        Raises ValueError when the response holds no data row.
        """
        url = _YAHOO_CSV_URL % (
            self.symbol,
            startDate.m - 1, startDate.d, startDate.y,
            endDate.m - 1, endDate.d, endDate.y,
            'd')
        u = urllib.urlopen(url)
        try:
            ulines = u.read().split("\n")
        finally:
            u.close()
        # Expected layout: header line, data rows, trailing empty string.
        if len(ulines) < 3:
            raise ValueError("no stock data returned for %s" % url)
        start = ulines[-2]
        end = ulines[1]
        return (parse_yahoo_stock(end)['Close']
                - parse_yahoo_stock(start)['Close'])

    def getIndividualStock(self, releaseDate):
        """Close-price change over the padded range around releaseDate."""
        startIndividualDate, endIndividualDate = releaseDate.dateRange(
            self.daysPadding)
        return self._closeDifference(startIndividualDate, endIndividualDate)

    def getRangeStockData(self):
        """Close-price change over the whole query range."""
        return self._closeDifference(self.startDate, self.endDate)

    def getStockSlope(self, releaseDate):
        """Change in price slope across the release.

        Returns (close change from release to range end) / daysPadding minus
        (close change from range start to release) / daysPadding.
        """
        # dateRange() runs first: it also normalises a day of 0 on
        # releaseDate before releaseDate.d is put into the URLs.
        startDate, endDate = releaseDate.dateRange(self.daysPadding)
        leadingDifference = self._closeDifference(startDate, releaseDate)
        leadingSlope = leadingDifference / self.daysPadding
        leavingDifference = self._closeDifference(releaseDate, endDate)
        leavingSlope = leavingDifference / self.daysPadding
        return leavingSlope - leadingSlope

    def setStockData(self):
        """Fill the three stock columns of self.dataFrame."""
        self.dataFrame['Range Stock Difference'] = self.getRangeStockData()
        indivStocks = []
        stockSlopes = []
        for date in self.dataFrame['Release Date']:
            indivStocks.append(self.getIndividualStock(date))
            stockSlopes.append(self.getStockSlope(date))
        self.dataFrame['Individual Stock Difference'] = indivStocks
        self.dataFrame['Stock Slope Change'] = stockSlopes

    def getMostInfluencial(self):
        """Return the row with the largest 'Individual Stock Difference'."""
        maxIndex = self.dataFrame['Individual Stock Difference'].argmax()
        maxProduct = self.dataFrame.index[maxIndex]
        return self.dataFrame[self.dataFrame.index == maxProduct]


class mouseHoverPlot(object):
    """Line plot of dataY with a wx tooltip for the data point under the mouse.

    Args:
        dataY: values to plot, one per row of dataFrame.
        dataFrame: frame whose index supplies the x tick labels and whose
            'Release Date' and plotType columns supply the tooltip text.
        plotType: name of the dataFrame column shown in the tooltip.
    """

    def __init__(self, dataY, dataFrame, plotType):
        self.plotType = plotType
        self.dataFrame = dataFrame
        self.figure = pl.figure()
        self.axis = self.figure.add_subplot(111)

        # create a long tooltip with newline to get around wx bug (in
        # v2.6.3.3) where newlines aren't recognized on subsequent
        # self.tooltip.SetTip() calls
        self.tooltip = wx.ToolTip(
            tip='tip with a long %s line and a newline\n' % (' ' * 100))
        gcfm().canvas.SetToolTip(self.tooltip)
        self.tooltip.Enable(False)
        self.tooltip.SetDelay(0)
        self.figure.canvas.mpl_connect('motion_notify_event', self._onMotion)

        self.dataX = range(len(dataY))
        self.dataY = dataY
        self.xTicks = dataFrame.index
        pl.xticks(self.dataX, self.xTicks)
        self.axis.plot(self.dataX, self.dataY, linestyle='-', marker='o',
                       markersize=15, label='myplot')

    def _onMotion(self, event):
        """Show the tooltip while the mouse is within 0.2 of a data point."""
        collisionFound = False
        if event.xdata is not None and event.ydata is not None:
            # mouse is inside the axes
            radius = .2
            for i in range(len(self.dataX)):
                if (abs(event.xdata - self.dataX[i]) < radius
                        and abs(event.ydata - self.dataY[i]) < radius):
                    # dataX[i] == i and the mouse is within 0.2 of it, so
                    # the tick at i is the one round(event.xdata) selects.
                    name = self.xTicks[i]
                    row = self.dataFrame[self.dataFrame.index == name]
                    tip = ('Product: %s\nRelease Date: %s\n'
                           'Stock Price Difference: $%.2f'
                           % (name, row['Release Date'][0],
                              row[self.plotType][0]))
                    self.tooltip.SetTip(tip)
                    self.tooltip.Enable(True)
                    collisionFound = True
                    break
        if not collisionFound:
            self.tooltip.Enable(False)


# --- Notebook selection form -------------------------------------------------
# NOTE(review): the HTML markup of this form (headings, <select> elements,
# <script> blocks) appears to have been stripped from the source; only the
# visible text and line breaks survive. The literals below reproduce what is
# left, with the raw line breaks written as \n escapes. Restore the markup
# from the original notebook before relying on this form.
selectform = ""
selectform += "\n\nSingle Products\n\n"
selectform += ""
selectform += "\n\n\n"
# NOTE(review): the quotes inside the JavaScript below are not escaped, so
# Python reads this as several adjacent string literals and the double quotes
# around `selected` are lost. Kept token-for-token; fix when restoring markup.
selectform += "" "var selected=$('select#singleProducts').find(':selected').text();py1='selected = "'+selected+'"';IPython.notebook.kernel.execute(py1);"
#selectform += ""
selectform += "\n\nTime Range\n\n"
selectform += ""
script = ""
HTML(selectform + "")

# Translate the form's selection into a Query type and argument.
argument = ""
if qtype == 'singleProduct':
    print('Single Product')
    print(singleProduct)
    qtype = 'product'
    argument = singleProduct
elif qtype == 'family':
    print('Family')
    print(family)
    qtype = 'family'
    argument = family
elif qtype == 'timeRange':
    print('Time Range')
    print(timeRange)
    qtype = 'timerange'
    argument = timeRange

# Rebuild the column lists before creating the frame: `family` may have been
# rebound to the selected family name above. The original built the frame
# twice (the first result was overwritten) and re-downloaded the page here.
productName, family, releaseDate, discontinueDate = _timelineColumns(
    sortedTimeline)

timelineDataFrame = pandas.DataFrame({
    'Product Name': productName,
    'Release Date': releaseDate,
    'Family': family,
    'Date Discontinued': discontinueDate,
    'Individual Stock Difference': 0,
    'Range Stock Difference': 0,
    'Stock Slope Change': 0,
}).set_index('Product Name')


def removeOldItems(borderDate):
    """Return the rows of timelineDataFrame released after borderDate."""
    border = borderDate.numericDate()
    criterion = timelineDataFrame['Release Date'].map(
        lambda date: date.numericDate() > border)
    return timelineDataFrame[criterion]


earliestDate = Date("9-2-1985")
timelineDataFrame = removeOldItems(earliestDate)

# Example of a fixed query:
# sampleQuery = Query("timerange", (Date("1-1-1911"), Date("3-6-1992")))
# sampleQuery.plotSlopeChanges()

print(qtype)
print(argument)
sampleFamilyQuery = Query(qtype, argument)
sampleFamilyQuery.plotIndividualStockDifferences()
sampleFamilyQuery.plotSlopeChanges()