"""Tutorial script: reading CSV, XLSX, XLS, DOCX and ZIP files.

The script targets Python 2: the csv files are opened in 'rb' mode, which is
what the Python 2 csv module expects.  Every print below uses the
single-argument parenthesised form, which produces the same output as the
Python 2 print statement.
"""

# csv is a module in the Python standard library; it has to be imported to
# gain access to the functions and objects the csv module provides.
import csv
import os
import sys  # not used by the examples below
import zipfile

# os.chdir expects a string argument.  The string can be either a path
# relative to your current working directory or an absolute path.  An
# absolute path is passed below; point it at the directory where you
# extracted the archive, e.g.
# os.chdir('/path/to/extracted/fileformat.zip')
os.chdir('/Users/rweiss/Dropbox/presentations/IRiSS2013/text1/fileformats/')

# os.listdir also expects a string.  os.getcwd() returns the current working
# directory as a string, so its return value can be passed directly to
# os.listdir.  os.listdir returns a list of strings, one per entry in the
# directory.
print(os.listdir(os.getcwd()))


# --------------------------------------------------------------------------
# CSV with csv.reader
# --------------------------------------------------------------------------

# Print the first line of 'sociology_2010.csv'.  Because a plain csv.reader
# is used and the file has field names, the first line is the field names.
with open('amazon/sociology_2010.csv', 'rb') as csvfile:
    # The with-statement calls csvfile.close() automatically when the code
    # below completes.
    # csv.reader is instantiated from the open file object, i.e. the data
    # from 'sociology_2010.csv'.
    amazon_reader = csv.reader(csvfile)
    # amazon_reader is iterable, so "for VALUE in ITERABLE" visits every row.
    for row in amazon_reader:
        print(row)  # each row of a csv.reader is a list
        # The file is big, so stop after one iteration instead of filling
        # the screen.
        break


# --------------------------------------------------------------------------
# CSV with csv.DictReader
# --------------------------------------------------------------------------

with open('amazon/sociology_2010.csv', 'rb') as csvfile:
    # A DictReader offers more than a regular csv.reader, though a
    # csv.reader can also be given a delimiter and quotechar; see the csv
    # module documentation.
    amazon_reader = csv.DictReader(csvfile, delimiter=',', quotechar='\"')
    for row in amazon_reader:
        # With a DictReader every row is a dictionary, not a list.
        print(row)
        break
    # fieldnames is an attribute (not a method) holding the list of strings
    # read from the first row of the file.
    print(amazon_reader.fieldnames)

with open('amazon/sociology_2010.csv', 'rb') as csvfile:
    amazon_reader = csv.DictReader(csvfile, delimiter=',')
    for row in amazon_reader:
        # Rows are dicts, so a value can be looked up by key; here the
        # "title" field.
        print(row['title'])
        break
    # To see every title instead of just the first:
    # print([row['title'] for row in amazon_reader])

# solution 1: collect every review text into a list
with open('amazon/sociology_2010.csv', 'rb') as csvfile:
    amazon_reader = csv.DictReader(csvfile, delimiter=',')
    amazon_reviews = [row['review_text'] for row in amazon_reader]
print(len(amazon_reviews))

# solution 2: one single-entry dict per row, mapping doi -> review text
with open('amazon/sociology_2010.csv', 'rb') as csvfile:
    amazon_reader = csv.DictReader(csvfile, delimiter=',')
    amazon_review_dicts = [{row['doi']: row['review_text']}
                           for row in amazon_reader]
# doc_tf_vectors = [tf(term, text) for term, text in amazon_reviews]


# --------------------------------------------------------------------------
# XLSX with openpyxl
# --------------------------------------------------------------------------
# NOTE(review): get_sheet_names(), cell('A1') and indexing rows[1] look like
# the openpyxl API of the time this was written; confirm against the
# installed openpyxl version before running.

# Imported here rather than at the top so the sections above still run when
# openpyxl is not installed.
from openpyxl import load_workbook

wb = load_workbook('example.xlsx')  # create object from xlsx workbook
print(type(wb))
# print(dir(wb))

print(wb.get_sheet_names())  # print out all sheet names
print(type(wb.get_sheet_names()))
print(type(wb.get_sheet_names()[0]))
print(type(wb.worksheets))

for sheet in wb.worksheets:
    print(type(sheet))
    # print(sheet.title)  # another way of printing all sheet names

# grab the first sheet (also look up .get_sheet_by_name())
sheet1 = wb.worksheets[0]
print(sheet1)
print(sheet1.cell('A1').value)  # print the text value of cell A1

row2 = sheet1.rows[1]  # the Cell objects of the second row
for cell in row2:
    print(cell.value)  # print the value of every cell in the row

print(dir(wb))
print(wb.encoding)
print(type(wb.encoding))
print(wb.properties)
print(dir(wb.properties))
print(wb.properties.creator)


# --------------------------------------------------------------------------
# XLS with xlrd
# --------------------------------------------------------------------------

from xlrd import open_workbook

wb = open_workbook('example.xls')  # create object from xls workbook
print(wb.sheets())
for sheet in wb.sheets():
    print(sheet.name)  # printing all sheet names

# grab the first sheet, also look up .sheet_by_index(NUM)
sheet1 = wb.sheets()[0]
# xlrd row/column indexes start at 0, so cell A1 is (0, 0); .value gives the
# cell's content rather than the Cell object.
print(sheet1.cell(0, 0).value)  # print the value of cell A1

row2 = sheet1.row(1)  # the Cell objects of the second row
for cell in row2:
    print(cell.value)  # print the value of every cell in the row

print(dir(wb))
print(wb.encoding)
print(wb.nsheets)
print(type(wb.sheets()))
print(type(wb.sheets()[0]))
print(type(sheet1.row(1)))
print(type(row2[0]))


# --------------------------------------------------------------------------
# DOCX with docx
# --------------------------------------------------------------------------

from docx import opendocx, getdocumenttext

document = opendocx('syllabus.docx')
print(type(document))

paragraphs = getdocumenttext(document)
print(type(paragraphs))
print(len(paragraphs))
print(paragraphs[0])
print(type(paragraphs[0]))
print(type(paragraphs[0].encode('utf-8')))
print(paragraphs[0])


# --------------------------------------------------------------------------
# ZIP with zipfile
# --------------------------------------------------------------------------

zfile = zipfile.ZipFile("jstor.zip")
dirname = 'jstor'
try:
    print(os.listdir(os.getcwd()))

    # this would be better as try-except, but that's for future learning
    if not os.path.isdir(dirname):
        os.makedirs(dirname)
    print('Files in directory ' + dirname + ': ' +
          ', '.join(os.listdir(os.path.join(os.getcwd(), dirname))))

    for name in zfile.namelist():
        print('Extracting file ' + name + '...')
        target = os.path.join(dirname, name)

        # Directory entries end with '/' and have no data to write.
        if name.endswith('/'):
            if not os.path.isdir(target):
                os.makedirs(target)
            continue

        # Members stored under a sub-directory need that directory to exist.
        parent = os.path.dirname(target)
        if not os.path.isdir(parent):
            os.makedirs(parent)

        # zfile.read() returns raw bytes, so the output file is opened in
        # binary mode; the with-statement closes it even if the write fails.
        with open(target, "wb") as fd:
            fd.write(zfile.read(name))
finally:
    zfile.close()

print('Files in directory ' + dirname + ': ' +
      ','.join(os.listdir(os.path.join(os.getcwd(), dirname))))