from pandas import DataFrame, Series
import pandas as pd
!cat ch06/ex1.csv
a,b,c,d,message 1,2,3,4,hello 5,6,7,8,world 9,10,11,12,foo
df = pd.read_csv('ch06/ex1.csv')
df
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
type(df)
pandas.core.frame.DataFrame
pd.read_table('ch06/ex1.csv', sep=',')
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
!cat ch06/ex2.csv
1,2,3,4,hello 5,6,7,8,world 9,10,11,12,foo
pd.read_csv('ch06/ex2.csv', header=None)
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
pd.read_csv('ch06/ex2.csv', names=['a', 'b', 'c', 'message'])
a | b | c | message | |
---|---|---|---|---|
1 | 2 | 3 | 4 | hello |
5 | 6 | 7 | 8 | world |
9 | 10 | 11 | 12 | foo |
names = ['a', 'b', 'c', 'd', 'message']
# message -> index
pd.read_csv('ch06/ex2.csv', names=names, index_col='message')
a | b | c | d | |
---|---|---|---|---|
message | ||||
hello | 1 | 2 | 3 | 4 |
world | 5 | 6 | 7 | 8 |
foo | 9 | 10 | 11 | 12 |
!cat ch06/csv_mindex.csv
key1,key2,value1,value2 one,a,1,2 one,b,3,4 one,c,5,6 one,d,7,8 two,a,9,10 two,b,11,12 two,c,13,14 two,d,15,16
parsed = pd.read_csv('ch06/csv_mindex.csv', index_col=['key1', 'key2'])
parsed
value1 | value2 | ||
---|---|---|---|
key1 | key2 | ||
one | a | 1 | 2 |
b | 3 | 4 | |
c | 5 | 6 | |
d | 7 | 8 | |
two | a | 9 | 10 |
b | 11 | 12 | |
c | 13 | 14 | |
d | 15 | 16 |
list(open('ch06/ex3.txt'))
[' A B C\n', 'aaa -0.264438 -1.026059 -0.619500\n', 'bbb 0.927272 0.302904 -0.032399\n', 'ccc -0.264273 -0.386314 -0.217601\n', 'ddd -0.871858 -0.348382 1.100491\n']
result = pd.read_table('ch06/ex3.txt', sep='\s+')
result
A | B | C | |
---|---|---|---|
aaa | -0.264438 | -1.026059 | -0.619500 |
bbb | 0.927272 | 0.302904 | -0.032399 |
ccc | -0.264273 | -0.386314 | -0.217601 |
ddd | -0.871858 | -0.348382 | 1.100491 |
pd.read_csv('ch06/ex3.txt', delimiter='\s+')
A | B | C | |
---|---|---|---|
aaa | -0.264438 | -1.026059 | -0.619500 |
bbb | 0.927272 | 0.302904 | -0.032399 |
ccc | -0.264273 | -0.386314 | -0.217601 |
ddd | -0.871858 | -0.348382 | 1.100491 |
# Read CSV(comma-separated) file into DataFrame
pd.read_csv?
!cat ch06/ex4.csv
# hey! a,b,c,d,message # just wanted to make things more difficult for you # who reads CSV files with computers, anyway? 1,2,3,4,hello 5,6,7,8,world 9,10,11,12,foo
pd.read_csv('ch06/ex4.csv', skiprows=[0, 2, 3])
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
!cat ch06/ex5.csv
something,a,b,c,d,message one,1,2,3,4,NA two,5,6,,8,world three,9,10,11,12,foo
result = pd.read_csv('ch06/ex5.csv')
result
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3 | 4 | NaN |
1 | two | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11 | 12 | foo |
pd.isnull(result)
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | False | False | False | False | False | True |
1 | False | False | False | True | False | False |
2 | False | False | False | False | False | False |
result = pd.read_csv('ch06/ex5.csv', na_values=['NULL'])
result
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3 | 4 | NaN |
1 | two | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11 | 12 | foo |
# world를 NA값으로 처리하니 NaN으로 나온다.
# 특정한 값을 NA 처리할 수 있을것 같다.
pd.read_csv('ch06/ex5.csv', na_values=['world'])
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3 | 4 | NaN |
1 | two | 5 | 6 | NaN | 8 | NaN |
2 | three | 9 | 10 | 11 | 12 | foo |
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('ch06/ex5.csv', na_values=sentinels)
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3 | 4 | NaN |
1 | NaN | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11 | 12 | NaN |
result = pd.read_csv('ch06/ex6.csv')
result
<class 'pandas.core.frame.DataFrame'> Int64Index: 10000 entries, 0 to 9999 Data columns (total 5 columns): one 10000 non-null values two 10000 non-null values three 10000 non-null values four 10000 non-null values key 10000 non-null values dtypes: float64(4), object(1)
pd.read_csv('ch06/ex6.csv', nrows=5)
one | two | three | four | key | |
---|---|---|---|---|---|
0 | 0.467976 | -0.038649 | -0.295344 | -1.824726 | L |
1 | -0.358893 | 1.404453 | 0.704965 | -0.200638 | B |
2 | -0.501840 | 0.659254 | -0.421691 | -0.057688 | G |
3 | 0.204886 | 1.074134 | 1.388361 | -0.982404 | R |
4 | 0.354628 | -0.133116 | 0.283763 | -0.837063 | Q |
chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)
chunker
<pandas.io.parsers.TextFileReader at 0x109b87f90>
# Stream ex6.csv in 1000-row chunks and accumulate the value counts of
# the 'key' column across all chunks (add with fill_value=0 so keys that
# are missing from one chunk are treated as 0, not NaN).
chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)
tot = Series([])
for piece in chunker:
    counts = piece['key'].value_counts()
    tot = tot.add(counts, fill_value=0)
# NOTE(review): Series.order() is the pre-0.17 pandas API; it was later
# removed in favor of Series.sort_values(ascending=False).
tot = tot.order(ascending=False)
tot[:10]
E 368 X 364 L 346 O 343 Q 340 M 338 J 337 F 335 K 334 H 330 dtype: float64
data = pd.read_csv('ch06/ex5.csv')
data
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3 | 4 | NaN |
1 | two | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11 | 12 | foo |
data.to_csv('ch06/out.csv')
!cat ch06/out.csv
,something,a,b,c,d,message 0,one,1,2,3.0,4, 1,two,5,6,,8,world 2,three,9,10,11.0,12,foo
data.to_csv(sys.stdout, sep='|')
|something|a|b|c|d|message 0|one|1|2|3.0|4| 1|two|5|6||8|world 2|three|9|10|11.0|12|foo
# Write DataFrame to a comma-separated value (csv) file
# na_rep -> Missing data representation. NA REPresentation
data.to_csv?
data.to_csv(sys.stdout, na_rep='NULL')
,something,a,b,c,d,message 0,one,1,2,3.0,4,NULL 1,two,5,6,NULL,8,world 2,three,9,10,11.0,12,foo
data.to_csv(sys.stdout, na_rep='NaN')
,something,a,b,c,d,message 0,one,1,2,3.0,4,NaN 1,two,5,6,NaN,8,world 2,three,9,10,11.0,12,foo
data.to_csv(sys.stdout, index=False, header=False)
one,1,2,3.0,4, two,5,6,,8,world three,9,10,11.0,12,foo
data.to_csv(sys.stdout, index=False, cols=['a', 'b', 'c'])
a,b,c 1,2,3.0 5,6, 9,10,11.0
dates = pd.date_range('1/1/2000', periods=7)
ts = Series(np.arange(7), index=dates)
ts.to_csv('ch06/tseries.csv')
!cat ch06/tseries.csv
2000-01-01,0 2000-01-02,1 2000-01-03,2 2000-01-04,3 2000-01-05,4 2000-01-06,5 2000-01-07,6
pd.DataFrame.to_csv?
Series.from_csv('ch06/tseries.csv', parse_dates=True)
2000-01-01 0 2000-01-02 1 2000-01-03 2 2000-01-04 3 2000-01-05 4 2000-01-06 5 2000-01-07 6 dtype: int64
type( Series.from_csv('ch06/tseries.csv', parse_dates=True) )
pandas.core.series.Series
# parse dates: boolean, default True.
# Parse dates. Different default from read_table
Series.from_csv?
pd.read_csv('ch06/tseries.csv', header=None)
0 | 1 | |
---|---|---|
0 | 2000-01-01 | 0 |
1 | 2000-01-02 | 1 |
2 | 2000-01-03 | 2 |
3 | 2000-01-04 | 3 |
4 | 2000-01-05 | 4 |
5 | 2000-01-06 | 5 |
6 | 2000-01-07 | 6 |
type(pd.read_csv('ch06/tseries.csv', header=None))
pandas.core.frame.DataFrame
pd.read_csv?
!cat ch06/ex7.csv
"a","b","c" "1","2","3" "1","2","3","4"
import csv
f = open('ch06/ex7.csv')
reader = csv.reader(f)
for line in reader:
print line
['a', 'b', 'c'] ['1', '2', '3'] ['1', '2', '3', '4']
lines = list(csv.reader(open('ch06/ex7.csv')))
header, values = lines[0], lines[1:]
header
['a', 'b', 'c']
values
[['1', '2', '3'], ['1', '2', '3', '4']]
# header = a,b,c
# values를 1,1을 같이 묶는다. 2,2 묶고. 3,3 묶고. 4는 header가 a,b,c 3개 밖에 없기 때문에 포함되지 않는다.
data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict
{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}
class my_dialect(csv.Dialect):
    """CSV dialect using ';' as delimiter, '"' as quote char, '\\n' EOL.

    csv.Dialect subclasses must also define ``quoting`` as an integer;
    leaving it unset makes ``csv.reader(f, dialect=my_dialect)`` fail with
    ``TypeError: "quoting" must be an integer``.
    """
    lineterminator = '\n'
    delimiter = ';'
    quotechar = '"'
    # Required by csv.Dialect validation; QUOTE_MINIMAL is the csv default.
    quoting = csv.QUOTE_MINIMAL
reader = csv.reader
reader = csv.reader?
reader = csv.reader
reader = csv.reader
# quoting이 꼭 integer여야 한다는 오류가 발생해서 삽질하다가 뒤에 quoting keyword를 붙여줌..
reader = csv.reader(f, dialect=my_dialect)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-85-92557f61d368> in <module>() 1 # quoting이 꼭 integer여야 한다는 오류가 발생해서 삽질하다가 뒤에 quoting keyword를 붙여줌.. ----> 2 reader = csv.reader(f, dialect=my_dialect) TypeError: "quoting" must be an integer
reader = csv.reader(f, dialect=my_dialect, quoting=csv.QUOTE_NONE)
csv.QUOTE_NONE
3
reader = csv.reader(f, delimiter='|')
# 어떤 옵션들 있는지 보려고 했더니 안 보여주네...
csv.reader??
# Write a small ';'-delimited file using the custom dialect; QUOTE_NONE
# passed explicitly to work around the missing `quoting` attribute on
# my_dialect as defined above in this session.
with open('mydata.csv', 'w') as f:
    writer = csv.writer(f, dialect=my_dialect, quoting=csv.QUOTE_NONE)
    writer.writerows([('one', 'two', 'three'),
                      ('1', '2', '3'),
                      ('4', '5', '6'),
                      ('7', '8', '9')])
!cat mydata.csv
one;two;three 1;2;3 4;5;6 7;8;9
# json은 python에서처럼 '으로 하면 안된다. 현재 """로 감싸 문자열로 저장되어 있기 때문에 javascript에서는 '를 string 값으로 인식하지 않아서 에러 발생
obj = """
{
'name': 'Wes',
'places_lived': ['United States', 'Spain', 'Germany'],
'pet': null, 'siblings': [{'name': 'Scott', 'age':25, 'pet':'Zuko'},
{'name': 'Katie', 'age':33, 'pet': 'Cisco'}]
}
"""
# ValueError: Expecting property name: line 3 column 5 (char 7)
result = json.loads(obj)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-93-f05e1f9794f2> in <module>() 1 # ValueError: Expecting property name: line 3 column 5 (char 7) ----> 2 result = json.loads(obj) NameError: name 'json' is not defined
obj = """
{
"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null, "siblings": [{"name": "Scott", "age":25, "pet":"Zuko"},
{"name": "Katie", "age":33, "pet": "Cisco"}]
}
"""
obj
'\n{\n "name": "Wes",\n "places_lived": ["United States", "Spain", "Germany"],\n "pet": null, "siblings": [{"name": "Scott", "age":25, "pet":"Zuko"},\n {"name": "Katie", "age":33, "pet": "Cisco"}]\n}\n'
import json
# ValueError: Expecting property name: line 3 column 5 (char 7)
result = json.loads(obj)
result
{u'name': u'Wes', u'pet': None, u'places_lived': [u'United States', u'Spain', u'Germany'], u'siblings': [{u'age': 25, u'name': u'Scott', u'pet': u'Zuko'}, {u'age': 33, u'name': u'Katie', u'pet': u'Cisco'}]}
asjson = json.dumps(result)
# '가 아니라 "인 것을 확인하자
asjson
'{"pet": null, "siblings": [{"pet": "Zuko", "age": 25, "name": "Scott"}, {"pet": "Cisco", "age": 33, "name": "Katie"}], "name": "Wes", "places_lived": ["United States", "Spain", "Germany"]}'
siblings = DataFrame(result['siblings'], columns=['name', 'age'])
siblings
name | age | |
---|---|---|
0 | Scott | 25 |
1 | Katie | 33 |
from lxml.html import parse
from urllib2 import urlopen
# 데이터를 가져 올 url을 넘긴 후
# 데이터를 받아 온 후 parse
parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
doc = parsed.getroot()
links = doc.findall('.//a')
links[15:20]
[<Element a at 0x109d23590>, <Element a at 0x109d235f0>, <Element a at 0x109d23650>, <Element a at 0x109d236b0>, <Element a at 0x109d23710>]
lnk = links[28]
lnk
<Element a at 0x109d23a70>
lnk.get('href')
'https://edit.yahoo.com/mc2.0/eval_profile?.intl=us&.lang=en-US&.done=http://finance.yahoo.com/q/op%3fs=AAPL%2bOptions&.src=quote&.intl=us&.lang=en-US'
lnk.text_content()
'Account Info'
urls = [lnk.get('href') for lnk in doc.findall('.//a')]
urls[-10:]
['/q/op?s=AAPL&k=580.000000', '/q?s=AAPL140328P00580000', '/q/op?s=AAPL&k=600.000000', '/q?s=AAPL140328P00600000', '/q/os?s=AAPL&m=2014-03-28', 'http://help.yahoo.com/l/us/yahoo/finance/quotes/fitadelay.html', 'http://billing.finance.yahoo.com/realtime_quotes/signup?.src=quote&.refer=quote', 'http://www.capitaliq.com', 'http://www.csidata.com', 'http://www.morningstar.com/']
tables = doc.findall('.//table')
calls = tables[9]
puts = tables[13]
rows = calls.findall('.//tr')
def _unpack(row, kind='td'):
elts = row.findall('.//%s' % kind)
return [val.text_content() for val in elts]
_unpack(rows[0], kind='th')
['Strike', 'Symbol', 'Last', 'Chg', 'Bid', 'Ask', 'Vol', 'Open Int']
_unpack(rows[1], kind='td')
['440.00', 'AAPL7140328C00440000', '94.04', ' 0.00', '98.30', '102.15', '1', '1']
from pandas.io.parsers import TextParser
def parse_options_data(table):
    """Build a DataFrame from an lxml <table> element.

    The <th> cells of the first <tr> become the column names; every
    remaining <tr> becomes one data row.
    """
    all_rows = table.findall('.//tr')
    columns = _unpack(all_rows[0], kind='th')
    body = [_unpack(tr) for tr in all_rows[1:]]
    # TextParser applies the same type inference machinery as read_csv;
    # with no chunksize, get_chunk() presumably yields the whole frame —
    # TODO confirm against the pandas version in use.
    return TextParser(body, names=columns).get_chunk()
call_data = parse_options_data(calls)
put_data = parse_options_data(puts)
call_data[:10]
Strike | Symbol | Last | Chg | Bid | Ask | Vol | Open Int | |
---|---|---|---|---|---|---|---|---|
0 | 440.0 | AAPL7140328C00440000 | 94.04 | 0.00 | 98.30 | 102.15 | 1 | 1 |
1 | 450.0 | AAPL140328C00450000 | 86.25 | 0.00 | 89.40 | 91.75 | 5 | 11 |
2 | 450.0 | AAPL7140328C00450000 | 79.35 | 0.00 | 88.20 | 92.15 | 2 | 2 |
3 | 460.0 | AAPL140328C00460000 | 69.52 | 0.00 | 78.50 | 81.80 | 3 | 3 |
4 | 470.0 | AAPL140328C00470000 | 71.75 | 2.78 | 69.55 | 70.90 | 16 | 78 |
5 | 480.0 | AAPL140328C00480000 | 68.05 | 17.45 | 59.65 | 61.75 | 11 | 32 |
6 | 485.0 | AAPL140328C00485000 | 62.35 | 4.41 | 54.50 | 56.80 | 5 | 6 |
7 | 485.0 | AAPL7140328C00485000 | 64.00 | 0.00 | 53.20 | 57.20 | 0 | 1 |
8 | 490.0 | AAPL140328C00490000 | 58.50 | 7.09 | 49.30 | 50.90 | 1 | 60 |
9 | 492.5 | AAPL140328C00492500 | 37.15 | 0.00 | 46.30 | 49.10 | 5 | 5 |
put_data[:10]
Strike | Symbol | Last | Chg | Bid | Ask | Vol | Open Int | |
---|---|---|---|---|---|---|---|---|
0 | 430 | AAPL140328P00430000 | 0.01 | 0.00 | N/A | 0.01 | 6 | 164 |
1 | 440 | AAPL140328P00440000 | 0.01 | 0.00 | N/A | 0.01 | 105 | 236 |
2 | 440 | AAPL7140328P00440000 | 0.18 | 0.00 | N/A | 0.58 | 1 | 1 |
3 | 450 | AAPL140328P00450000 | 0.01 | 0.00 | N/A | 0.01 | 27 | 771 |
4 | 460 | AAPL140328P00460000 | 0.01 | 0.01 | N/A | 0.01 | 10 | 281 |
5 | 460 | AAPL7140328P00460000 | 0.15 | 0.00 | N/A | 0.07 | 20 | 20 |
6 | 470 | AAPL140328P00470000 | 0.01 | 0.01 | N/A | 0.02 | 2 | 493 |
7 | 480 | AAPL140328P00480000 | 0.02 | 0.00 | N/A | 0.02 | 1 | 1,065 |
8 | 485 | AAPL140328P00485000 | 0.01 | 0.02 | 0.01 | 0.02 | 63 | 781 |
9 | 485 | AAPL7140328P00485000 | 0.21 | 0.00 | N/A | 0.26 | 10 | 10 |
%%writefile Performance_MNR.xml
<INDICATOR>
<INDICATOR_SEQ>373889</INDICATOR_SEQ>
<PARENT_SEQ></PARENT_SEQ>
<AGENCY_NAME>MEtro-North Railroad</AGENCY_NAME>
<INDICATOR_NAME>Escalator Availability</INDICATOR_NAME>
<DESCRIPTION>Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009.</DESCRIPTION>
<PERIOD_YEAR>2011</PERIOD_YEAR>
<PERIOD_MONTH>12</PERIOD_MONTH>
<CATEGORY>Service Indicators</CATEGORY>
<FREQUENCY>M</FREQUENCY>
<DESIRED_CHANGE>U</DESIRED_CHANGE>
<INDICATOR_UNIT>%</INDICATOR_UNIT>
<DECIMAL_PLACES>1</DECIMAL_PLACES>
<YTD_TARGET>97.00</YTD_TARGET>
<YTD_ACTUAL></YTD_ACTUAL>
<MONTHLY_TARGET>97.00</MONTHLY_TARGET>
<MONTHLY_ACTUAL></MONTHLY_ACTUAL>
</INDICATOR>
Overwriting Performance_MNR.xml
from lxml import objectify
import urllib2
path = 'Performance_MNR.xml'
# online_path = 'http://www.mta.info/developers/data/lirr/lirr_gtfs.xml'
# data = urllib2.urlopen(online_path).read()
# f = open(path, 'w')
# f.write(data)
# f.close()
parsed = objectify.parse(open(path))
root = parsed.getroot()
# Flatten each child element of the parsed XML root into a dict of
# tag -> python value, skipping bookkeeping fields we do not need.
data = []
skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
               'DESIRED_CHANGE', 'DECIMAL_PLACES']
# Iterating `root` directly yields its child elements (the book's earlier
# edition iterated root.INDICATOR instead).
for indicator in root:
    record = {child.tag: child.pyval
              for child in indicator.getchildren()
              if child.tag not in skip_fields}
    data.append(record)
data
[{'AGENCY_NAME': 'MEtro-North Railroad', 'CATEGORY': 'Service Indicators', 'DESCRIPTION': 'Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009.', 'FREQUENCY': 'M', 'INDICATOR_NAME': 'Escalator Availability', 'INDICATOR_UNIT': '%', 'MONTHLY_ACTUAL': u'', 'MONTHLY_TARGET': 97.0, 'PERIOD_MONTH': 12, 'PERIOD_YEAR': 2011, 'YTD_ACTUAL': u'', 'YTD_TARGET': 97.0}]
# 위의 값과 비교하기 위해 테스트 해본 것
for elt in root:
for child in elt.getchildren():
print child.tag, child.pyval
INDICATOR_SEQ 373889 PARENT_SEQ AGENCY_NAME MEtro-North Railroad INDICATOR_NAME Escalator Availability DESCRIPTION Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009. PERIOD_YEAR 2011 PERIOD_MONTH 12 CATEGORY Service Indicators FREQUENCY M DESIRED_CHANGE U INDICATOR_UNIT % DECIMAL_PLACES 1 YTD_TARGET 97.0 YTD_ACTUAL MONTHLY_TARGET 97.0 MONTHLY_ACTUAL
perf = DataFrame(data)
perf
AGENCY_NAME | CATEGORY | DESCRIPTION | FREQUENCY | INDICATOR_NAME | INDICATOR_UNIT | MONTHLY_ACTUAL | MONTHLY_TARGET | PERIOD_MONTH | PERIOD_YEAR | YTD_ACTUAL | YTD_TARGET | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | MEtro-North Railroad | Service Indicators | Percent of the time that escalators are operat... | M | Escalator Availability | % | 97 | 12 | 2011 | 97 |
frame = pd.read_csv('ch06/ex1.csv')
frame
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
frame.save('ch06/frame_pickle')
/Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/core/generic.py:720: FutureWarning: save is deprecated, use to_pickle warnings.warn("save is deprecated, use to_pickle", FutureWarning)
pd.load('ch06/frame_pickle')
/Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/core/common.py:2306: FutureWarning: load is deprecated, use read_pickle warnings.warn("load is deprecated, use read_pickle", FutureWarning)
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
디스크에 이진 형식으로 저장된 대용량의 과학 자료를 효율적으로 읽고 쓸 수 있는 다양한 도구 존재
산업 기준에 맞는 인기 라이브러리중 하나가 HDF5(Hierarchical Data Format), 계층적 데이터 형식
내부적으로 파일 시스템 같은 노드 구조
여러 개의 데이터셋을 저장하고 부가 정보 기록 가능
다양한 압축 기술을 사용해서 on-the-fly(실시간) 압축 지원
반복되는 패턴을 가진 데이터 좀 더 효과적 저장
메모리에 모두 적재할 수 없는 엄청나게 큰 데이터를 아주 큰 배열에서 필요한 만큼의 작은 부분들만 효과적으로 읽고 쓸 수 있는 훌륭한 선택
PyTables: HDF5를 추상화하여 여러가지 유연한 데이터 컨테이너와 테이블 색인, 질의 기능 그리고 외부 메모리 연산(out-of-core, external memory algorithm) 지원
h5py: 직접적이지만 고수준의 HDF5 API에 대한 인터페이스 제공
pandas는 PyTable를 이용한 HDFStore라는 가벼운 사전 클래스를 통해 pandas 객체를 저장
store = pd.HDFStore('mydata.h5')
--------------------------------------------------------------------------- Exception Traceback (most recent call last) <ipython-input-137-35f4287dfd8f> in <module>() ----> 1 store = pd.HDFStore('mydata.h5') /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/io/pytables.pyc in __init__(self, path, mode, complevel, complib, fletcher32, **kwargs) 343 import tables as _ 344 except ImportError: # pragma: no cover --> 345 raise Exception('HDFStore requires PyTables') 346 347 self._path = path Exception: HDFStore requires PyTables
xls_file = pd.ExcelFile('data.xls')
--------------------------------------------------------------------------- ImportError Traceback (most recent call last) <ipython-input-138-42764eb0b1ec> in <module>() ----> 1 xls_file = pd.ExcelFile('data.xls') /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/io/excel.pyc in __init__(self, path_or_buf, **kwds) 71 def __init__(self, path_or_buf, **kwds): 72 ---> 73 import xlrd # throw an ImportError if we need to 74 75 ver = tuple(map(int, xlrd.__VERSION__.split(".")[:2])) ImportError: No module named xlrd
table = xls_file.parse('Sheet1')
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-139-b85a48af3b10> in <module>() ----> 1 table = xls_file.parse('Sheet1') NameError: name 'xls_file' is not defined
import requests
url = 'http://search.twitter.com/search.json?q=python%20pandas'
resp = requests.get(url)
resp
<Response [401]>
resp.text
u'{"errors":[{"message":"The Twitter REST API v1 is no longer active. Please migrate to API v1.1. https://dev.twitter.com/docs/api/1.1/overview.","code":64}]}'
import sqlite3
# Create a small test table in SQLite and populate it with sample rows.
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
c REAL, d INTEGER
);"""
# ':memory:' keeps the database in RAM; nothing persists across sessions.
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()
data = [('Atlanta', 'Georgia', 1.25, 6),
('Tallahassee', 'Florida', 2.6, 3),
('Sacramento', 'California', 1.7, 5)]
# Parameterized '?' placeholders let the driver handle quoting/escaping
# (never build SQL by string concatenation).
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()
대부분의 파이썬 SQL 드라이버(PyODBC, psycopg2, MySQLdb, pymssql 등)는 테이블에 대해 select 쿼리를 수행하면 튜플 리스트를 반환한다
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows
[(u'Atlanta', u'Georgia', 1.25, 6), (u'Tallahassee', u'Florida', 2.6, 3), (u'Sacramento', u'California', 1.7, 5)]
반환된 튜플 리스트를 DataFrame 생성자에 바로 전달해도 되지만 칼럼의 이름을 지정해주면 더 편하다. cursor의 description 속성을 활용하자.
cursor.description
(('a', None, None, None, None, None, None), ('b', None, None, None, None, None, None), ('c', None, None, None, None, None, None), ('d', None, None, None, None, None, None))
DataFrame(rows, columns=zip(*cursor.description)[0])
a | b | c | d | |
---|---|---|---|---|
0 | Atlanta | Georgia | 1.25 | 6 |
1 | Tallahassee | Florida | 2.60 | 3 |
2 | Sacramento | California | 1.70 | 5 |
import pandas.io.sql as sql
sql.read_frame('select * from test', con)
a | b | c | d | |
---|---|---|---|---|
0 | Atlanta | Georgia | 1.25 | 6 |
1 | Tallahassee | Florida | 2.60 | 3 |
2 | Sacramento | California | 1.70 | 5 |