import os
#os.chdir('/path/to/extracted/fileformat.zip')

# os.chdir expects a string argument: either a path relative to the current
# working directory or an absolute path.  An absolute path is passed here, so
# it has to be adjusted to match the machine the script runs on.
os.chdir('/Users/rweiss/Dropbox/presentations/IRiSS2013/text1/fileformats/')

# os.getcwd() returns the current working directory as a string, and
# os.listdir() takes such a string and returns a list of strings, one for each
# entry in that directory.  The calls nest: python evaluates the inner call
# first and hands its return value to the outer call.
#
# The list is printed explicitly: a bare expression only displays its value in
# an interactive session, while in a script the result is silently discarded.
print(os.listdir(os.getcwd()))

'''
csv is a module in the python standard library
you need to import csv in order to gain access to the methods and objects available to you in the csv module
'''

import csv

# Print the first line of the file 'sociology_2010.csv'.
# csv.reader does not treat any line specially, so because this file starts
# with a line of field names, the row printed here is the list of field names.
#
# The 'with' statement automatically calls csvfile.close() once the indented
# code below it completes.
with open('amazon/sociology_2010.csv', 'rb') as csvfile:
    # Instantiate a csv.reader object called "amazon_reader".  It reads its
    # rows from the file argument "csvfile", i.e. from the data in
    # "sociology_2010.csv".
    amazon_reader = csv.reader(csvfile)
    # amazon_reader is an iterable object, so "for VALUE in ITERABLE" runs the
    # loop body once for every element it yields (here: once per row).
    for row in amazon_reader:
        print(row)  # prints whatever the value of row is
        # Stop after one iteration of the loop: the file is big, and printing
        # all of it would fill the screen.
        break

import csv
with open('amazon/sociology_2010.csv', 'rb') as csvfile:
    # delimiter and quotechar can be given to a plain csv.reader as well; the
    # available options are listed in the python docs for the csv module.
    amazon_reader = csv.DictReader(csvfile, delimiter=',', quotechar='\"')
    for row in amazon_reader:
        # Because this is a DictReader, every row of the csv is represented
        # as a dictionary, not as a list (as with the regular csv.reader).
        print(row)
        break

# "fieldnames" is an attribute of a DictReader (not a method, so it is not
# called): a list of strings corresponding to the first row of the file.
print(amazon_reader.fieldnames)

with open('amazon/sociology_2010.csv', 'rb') as csvfile:
    amazon_reader = csv.DictReader(csvfile, delimiter=',')
    for row in amazon_reader:
        # Rows are dicts, so a value is looked up by its key; in this case,
        # the "title" field.
        print(row['title'])
        break  # only the first row is shown
    # Left disabled: a list comprehension over the reader.
    #print [row['title'] for row in amazon_reader]
    

# Solution 1: collect the "review_text" field of every row into a flat list.
with open('amazon/sociology_2010.csv', 'rb') as csvfile:
    amazon_reader = csv.DictReader(csvfile, delimiter=',')
    amazon_reviews = [
        row['review_text']
        for row in amazon_reader
    ]

# Number of reviews read from the file.
print(len(amazon_reviews))

# Solution 2: build a list with one single-entry dict per row, each mapping
# the row's "doi" value to its "review_text" value.
with open('amazon/sociology_2010.csv', 'rb') as csvfile:
    amazon_reader = csv.DictReader(csvfile, delimiter=',')
    amazon_review_dicts = [
        {row['doi']: row['review_text']}
        for row in amazon_reader
    ]

    
#doc_tf_vectors = [tf(term, text) for term, text in amazon_reviews]

from openpyxl import load_workbook
import csv, sys

# Create a workbook object from the xlsx file.
wb = load_workbook('example.xlsx')
print(type(wb))
#print dir(wb)

# Print out all sheet names.  The call has to be wrapped in print: on its own
# the returned value is discarded and nothing is shown when run as a script.
print(wb.get_sheet_names())
# The next two lines show the type of the returned container and the type of
# its first element.
print(type(wb.get_sheet_names()))
print(type(wb.get_sheet_names()[0]))

# wb.worksheets holds the sheet objects themselves rather than their names.
print(type(wb.worksheets))
for sheet in wb.worksheets:
    print(type(sheet))
    #print sheet.title # another way of printing all sheet names


# Grab the first sheet (also look up .get_sheet_by_name()).
sheet1 = wb.worksheets[0]
print(sheet1)
# Print the value stored in cell A1.
print(sheet1.cell('A1').value)

# Index 1 selects the second row, because indexing starts at 0.
row2 = sheet1.rows[1]
# Each element of the row is a cell object; print the value of every one.
for cell in row2:
    print(cell.value)

# dir() lists the names of the attributes and methods available on an object.
print(dir(wb))

# The workbook's encoding, and the type of that value.
print(wb.encoding)
print(type(wb.encoding))

# The properties object attached to the workbook, the names it offers, and
# one of them in particular: the creator.
print(wb.properties)
print(dir(wb.properties))
print(wb.properties.creator)

from xlrd import open_workbook

# Create a workbook object from the xls file.
wb = open_workbook('example.xls')
print(wb.sheets())

# Print the name of every sheet in the workbook.
for sheet in wb.sheets():
    print(sheet.name)


# Grab the first sheet (also look up .sheet_by_index(NUM)).
sheet1 = wb.sheets()[0]

# Print the value of cell A1.  xlrd addresses cells as (row, column) counted
# from 0, so A1 is (0, 0); (1, 1) would be cell B2.  The .value attribute is
# needed to get the cell's content rather than the cell object itself.
print(sheet1.cell(0, 0).value)

# Row index 1 is the second row of the sheet; it is a sequence of cell objects.
row2 = sheet1.row(1)

# Print the value of every cell in that row.
for cell in row2:
    print(cell.value)

# dir() lists the names of the attributes and methods available on an object.
print(dir(wb))

# The workbook's encoding and its number of sheets.
print(wb.encoding)
print(wb.nsheets)

# Types of the objects used above: the collection of sheets, a single sheet,
# a row, and a single cell within a row.
print(type(wb.sheets()))
print(type(wb.sheets()[0]))
print(type(sheet1.row(1)))
print(type(row2[0]))

from docx import opendocx, getdocumenttext

# Open the Word document and look at the type of the object that comes back.
document = opendocx('syllabus.docx')
print(type(document))

# Pull the text out of the document.  The result supports len() and indexing,
# so show its type, how many items it holds, and the first item.
paragraphs = getdocumenttext(document)
print(type(paragraphs))
print(len(paragraphs))
print(paragraphs[0])

# Type of a single paragraph as returned ...
print(type(paragraphs[0]))

# ... compared with its type after encoding it as UTF-8.
print(type(paragraphs[0].encode('utf-8')))
# NOTE(review): this prints the un-encoded paragraph again, exactly like the
# earlier print of paragraphs[0]; presumably the encoded version was meant
# to be shown here -- confirm.
print(paragraphs[0])

import zipfile
import os

# Open the archive and choose the directory its contents will be written to.
zfile = zipfile.ZipFile("jstor.zip")
dirname = 'jstor'

# Show what is in the current working directory before extracting.
print(os.listdir(os.getcwd()))

# Create the target directory unless it is already there.
# (This would be better as try-except, but that's for future learning.)
if not os.path.isdir(dirname):
    os.makedirs(dirname)

# Show the contents of the target directory before extraction.
print(
    'Files in directory '+ dirname + ': '
    + ', '.join(os.listdir(os.path.join(os.getcwd(), dirname)))
)

# Extract every member of the archive into the target directory.
# ZipFile.extract is used instead of a hand-written open()/write() pair:
#   * it writes the data in binary mode, so the bytes are not altered by
#     newline translation on platforms where text mode does that;
#   * it creates any sub-directories a member name refers to, so archives
#     with folders do not fail;
#   * it always closes the output file, even if writing raises an error.
for name in zfile.namelist():
    print('Extracting file ' + name + '...')
    zfile.extract(name, dirname)

# Show the contents of the target directory after extraction.
print('Files in directory '+ dirname + ': '+ ','.join(os.listdir(os.path.join(os.getcwd(), dirname))))