#!/usr/bin/env python # coding: utf-8 # In[1]: import numpy as np import pandas as pd np.random.seed(12345) import matplotlib.pyplot as plt plt.rc("figure", figsize=(10, 6)) pd.options.display.max_colwidth = 75 pd.options.display.max_columns = 20 np.set_printoptions(precision=4, suppress=True) # In[2]: get_ipython().system('cat examples/ex1.csv') # In[3]: df = pd.read_csv("examples/ex1.csv") df # In[4]: get_ipython().system('cat examples/ex2.csv') # In[5]: pd.read_csv("examples/ex2.csv", header=None) pd.read_csv("examples/ex2.csv", names=["a", "b", "c", "d", "message"]) # In[6]: names = ["a", "b", "c", "d", "message"] pd.read_csv("examples/ex2.csv", names=names, index_col="message") # In[7]: get_ipython().system('cat examples/csv_mindex.csv') parsed = pd.read_csv("examples/csv_mindex.csv", index_col=["key1", "key2"]) parsed # In[8]: get_ipython().system('cat examples/ex3.txt') # In[9]: result = pd.read_csv("examples/ex3.txt", sep="\s+") result # In[10]: get_ipython().system('cat examples/ex4.csv') pd.read_csv("examples/ex4.csv", skiprows=[0, 2, 3]) # In[11]: get_ipython().system('cat examples/ex5.csv') result = pd.read_csv("examples/ex5.csv") result # In[12]: pd.isna(result) # In[13]: result = pd.read_csv("examples/ex5.csv", na_values=["NULL"]) result # In[14]: result2 = pd.read_csv("examples/ex5.csv", keep_default_na=False) result2 result2.isna() result3 = pd.read_csv("examples/ex5.csv", keep_default_na=False, na_values=["NA"]) result3 result3.isna() # In[15]: sentinels = {"message": ["foo", "NA"], "something": ["two"]} pd.read_csv("examples/ex5.csv", na_values=sentinels, keep_default_na=False) # In[16]: pd.options.display.max_rows = 10 # In[17]: result = pd.read_csv("examples/ex6.csv") result # In[18]: pd.read_csv("examples/ex6.csv", nrows=5) # In[19]: chunker = pd.read_csv("examples/ex6.csv", chunksize=1000) type(chunker) # In[20]: chunker = pd.read_csv("examples/ex6.csv", chunksize=1000) tot = pd.Series([], dtype='int64') for piece in chunker: tot = tot.add(piece["key"].value_counts(), fill_value=0) tot = tot.sort_values(ascending=False) # In[21]: tot[:10] # In[22]: data = pd.read_csv("examples/ex5.csv") data # In[23]: data.to_csv("examples/out.csv") get_ipython().system('cat examples/out.csv') # In[24]: import sys data.to_csv(sys.stdout, sep="|") # In[25]: data.to_csv(sys.stdout, na_rep="NULL") # In[26]: data.to_csv(sys.stdout, index=False, header=False) # In[27]: data.to_csv(sys.stdout, index=False, columns=["a", "b", "c"]) # In[28]: get_ipython().system('cat examples/ex7.csv') # In[29]: import csv f = open("examples/ex7.csv") reader = csv.reader(f) # In[30]: for line in reader: print(line) f.close() # In[31]: with open("examples/ex7.csv") as f: lines = list(csv.reader(f)) # In[32]: header, values = lines[0], lines[1:] # In[33]: data_dict = {h: v for h, v in zip(header, zip(*values))} data_dict # In[34]: obj = """ {"name": "Wes", "cities_lived": ["Akron", "Nashville", "New York", "San Francisco"], "pet": null, "siblings": [{"name": "Scott", "age": 34, "hobbies": ["guitars", "soccer"]}, {"name": "Katie", "age": 42, "hobbies": ["diving", "art"]}] } """ # In[35]: import json result = json.loads(obj) result # In[36]: asjson = json.dumps(result) asjson # In[37]: siblings = pd.DataFrame(result["siblings"], columns=["name", "age"]) siblings # In[38]: get_ipython().system('cat examples/example.json') # In[39]: data = pd.read_json("examples/example.json") data # In[40]: data.to_json(sys.stdout) data.to_json(sys.stdout, orient="records") # In[41]: tables = pd.read_html("examples/fdic_failed_bank_list.html") len(tables) failures = tables[0] failures.head() # In[42]: close_timestamps = pd.to_datetime(failures["Closing Date"]) close_timestamps.dt.year.value_counts() # In[43]: from lxml import objectify path = "datasets/mta_perf/Performance_MNR.xml" with open(path) as f: parsed = objectify.parse(f) root = parsed.getroot() # In[44]: data = [] skip_fields = ["PARENT_SEQ", "INDICATOR_SEQ", "DESIRED_CHANGE", "DECIMAL_PLACES"] for elt in root.INDICATOR: el_data = {} for child in elt.getchildren(): if child.tag in skip_fields: continue el_data[child.tag] = child.pyval data.append(el_data) # In[45]: perf = pd.DataFrame(data) perf.head() # In[46]: perf2 = pd.read_xml(path) perf2.head() # In[47]: frame = pd.read_csv("examples/ex1.csv") frame frame.to_pickle("examples/frame_pickle") # In[48]: pd.read_pickle("examples/frame_pickle") # In[49]: get_ipython().system('rm examples/frame_pickle') # In[50]: fec = pd.read_parquet('datasets/fec/fec.parquet') # In[51]: xlsx = pd.ExcelFile("examples/ex1.xlsx") # In[52]: xlsx.sheet_names # In[53]: xlsx.parse(sheet_name="Sheet1") # In[54]: xlsx.parse(sheet_name="Sheet1", index_col=0) # In[55]: frame = pd.read_excel("examples/ex1.xlsx", sheet_name="Sheet1") frame # In[56]: writer = pd.ExcelWriter("examples/ex2.xlsx") frame.to_excel(writer, "Sheet1") writer.close() # In[57]: frame.to_excel("examples/ex2.xlsx") # In[58]: get_ipython().system('rm examples/ex2.xlsx') # In[59]: get_ipython().system('rm -f examples/mydata.h5') # In[60]: frame = pd.DataFrame({"a": np.random.standard_normal(100)}) store = pd.HDFStore("examples/mydata.h5") store["obj1"] = frame store["obj1_col"] = frame["a"] store # In[61]: store["obj1"] # In[62]: store.put("obj2", frame, format="table") store.select("obj2", where=["index >= 10 and index <= 15"]) store.close() # In[63]: frame.to_hdf("examples/mydata.h5", "obj3", format="table") pd.read_hdf("examples/mydata.h5", "obj3", where=["index < 5"]) # In[64]: import os os.remove("examples/mydata.h5") # In[65]: import requests url = "https://api.github.com/repos/pandas-dev/pandas/issues" resp = requests.get(url) resp.raise_for_status() resp # In[66]: data = resp.json() data[0]["title"] # In[67]: issues = pd.DataFrame(data, columns=["number", "title", "labels", "state"]) issues # In[68]: import sqlite3 query = """ CREATE TABLE test (a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER );""" con = sqlite3.connect("mydata.sqlite") con.execute(query) con.commit() # In[69]: data = [("Atlanta", "Georgia", 1.25, 6), ("Tallahassee", "Florida", 2.6, 3), ("Sacramento", "California", 1.7, 5)] stmt = "INSERT INTO test VALUES(?, ?, ?, ?)" con.executemany(stmt, data) con.commit() # In[70]: cursor = con.execute("SELECT * FROM test") rows = cursor.fetchall() rows # In[71]: cursor.description pd.DataFrame(rows, columns=[x[0] for x in cursor.description]) # In[72]: import sqlalchemy as sqla db = sqla.create_engine("sqlite:///mydata.sqlite") pd.read_sql("SELECT * FROM test", db) # In[73]: get_ipython().system('rm mydata.sqlite') # In[74]: