import os
os.getcwd()
os.chdir('..')
os.getcwd()
os.chdir('./week2/')
os.getcwd()

import pandas as pd

fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD'
# read_csv can download the file directly from the URL
# this is equivalent to combining R's download.file() with read.table() or read.csv()
cameraData = pd.read_csv(fileUrl)

# save the data locally
cameraData.to_csv('../data/cameras.csv', index=False)

# for simplicity I'll use IPython tricks to list the folder contents
!ls ../data

# get the current date and time
# this is equivalent to R's date() command
# note that the IPython ! prefix runs the command in the system shell
dateDownloaded = !date
print '\nDate downloaded: ' + str(dateDownloaded)

cameraData.head()

import urllib2

# download the file as camera.xls and save it in the ../data folder
fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.xls?accessType=DOWNLOAD'
f = urllib2.urlopen(fileUrl)
data = f.read()
with open('../data/camera.xls', 'wb') as w:
    w.write(data)

# load the Excel file as a pandas DataFrame
cameraData = pd.ExcelFile('../data/camera.xls')
cameraData = cameraData.parse('Baltimore Fixed Speed Cameras', index_col=None, na_values=['NA'])
cameraData.head()

import json

# first we get the JSON file from the website
fileUrl = 'https://data.baltimorecity.gov/api/views/dz54-2aru/rows.json?accessType=DOWNLOAD'
req = urllib2.Request(fileUrl)
opener = urllib2.build_opener()
f = opener.open(req)
# then we read it into a data structure
jsonCamera = json.loads(f.read())
# the JSON is loaded as a dictionary
print jsonCamera['meta']['view']['id']
print jsonCamera['meta']['view']['name']
print jsonCamera['meta']['view']['attribution']

# first read the CSV file
cameraData = pd.read_csv('../data/cameras.csv')
# take a subset of the columns (iloc replaces the deprecated ix for positional indexing)
tmpData = cameraData.iloc[:, 2:]
# then save it to a different CSV file
# this is equivalent to R's write.table() command
tmpData.to_csv('../data/camerasModified.csv', sep=',', index=False)
cameraData2 = pd.read_csv('../data/camerasModified.csv')
cameraData2.head()

# build a list of file names programmatically, like R's paste0()
print ['../data/file' + str(i) + '.csv' for i in range(1, 6)]

from lxml.html import parse

url = 'http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en'
# this is equivalent to combining R's open/read/close connection calls with htmlTreeParse()
html3 = parse(url).getroot()
# get the title text using an XPath expression
# this is equivalent to R's xpathSApply() command
title = html3.xpath('//title')
print [x.text_content() for x in title]
# get the text of the col-citedby cells using an XPath expression
citedby = html3.xpath("//td[@id='col-citedby']")
print [x.text_content() for x in citedby]
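
# a possible follow-up to the scraping example above: convert the scraped
# citation counts to integers for further analysis; this is only a sketch and
# assumes each col-citedby cell holds a plain number, which may not hold for
# every profile page
counts = [int(x.text_content()) for x in citedby if x.text_content().strip().isdigit()]
print counts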
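
# going back to the Excel example: pandas can also do the ExcelFile/parse pair
# in a single step with read_excel(); a minimal sketch, assuming
# '../data/camera.xls' was saved as shown earlier (the sheet name is passed
# positionally so it works across pandas versions)
cameraData = pd.read_excel('../data/camera.xls', 'Baltimore Fixed Speed Cameras', na_values=['NA'])
cameraData.head()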
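
# a small extension of the JSON example: the Socrata export appears to keep
# the row values under jsonCamera['data'] and the column metadata under
# jsonCamera['meta']['view']['columns'] -- an assumption about this particular
# API, not something guaranteed by the json module -- so the records can be
# loaded straight into a DataFrame
colNames = [c['name'] for c in jsonCamera['meta']['view']['columns']]
jsonFrame = pd.DataFrame(jsonCamera['data'], columns=colNames)
jsonFrame.head()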
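
# building on the file-name list above: a hypothetical sketch that reads a set
# of similarly named CSV files and stacks them into one DataFrame with
# pd.concat(); it assumes files ../data/file1.csv through ../data/file5.csv
# actually exist and share the same columns
fileNames = ['../data/file' + str(i) + '.csv' for i in range(1, 6)]
allData = pd.concat([pd.read_csv(f) for f in fileNames], ignore_index=True)
allData.head()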