import pandas as pd
import numpy as np
from ggplot import *
df = pd.read_excel("1033-program-foia-may-2014.xlsx")
df.columns
Index([u'State', u'County', u'NSN', u'Item Name', u'Quantity', u'UI', u'Acquisition Cost', u'Ship Date'], dtype='object')
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 73028 entries, 0 to 73027 Data columns (total 8 columns): State 73028 non-null object County 73028 non-null object NSN 72983 non-null object Item Name 71732 non-null object Quantity 73028 non-null int64 UI 73028 non-null object Acquisition Cost 73028 non-null float64 Ship Date 73028 non-null object dtypes: float64(1), int64(1), object(6)
df.describe()
Quantity | Acquisition Cost | |
---|---|---|
count | 73028.000000 | 73028.000000 |
mean | 15.006792 | 7967.575490 |
std | 384.623930 | 197293.243356 |
min | 1.000000 | 0.000000 |
25% | 1.000000 | 58.710000 |
50% | 1.000000 | 200.000000 |
75% | 5.000000 | 499.000000 |
max | 91000.000000 | 18000000.000000 |
# 91000? Is that right?
df.sort("Quantity", ascending=False)
State | County | NSN | Item Name | Quantity | UI | Acquisition Cost | Ship Date | |
---|---|---|---|---|---|---|---|---|
52530 | CA | VENTURA | 6145-01-254-4040 | WIRE,ELECTRICAL | 91000 | Foot | 0.58 | 2013-10-02 00:00:00 |
36221 | CA | LOS ANGELES | 5305-01-205-0041 | SCREW,CAP,SOCKET HEAD | 43822 | Each | 0.23 | 2014-03-11 00:00:00 |
52399 | CA | VENTURA | 5975-01-331-5006 | STRAP,TIEDOWN,ELECTRICAL COMPONENTS | 6000 | Each | 0.27 | 2013-06-05 00:00:00 |
52536 | CA | VENTURA | 6145-DS-00C-ABLE | CABLE COAX | 6000 | FT | 0.11 | 20120302 |
39189 | CA | LOS ANGELES | 9320-00-641-3051 | RUBBER SHEET,SOLID | 6000 | Each | 10.82 | 2013-03-14 00:00:00 |
36226 | CA | LOS ANGELES | 5310-01-283-6216 | WASHER,FLAT | 5181 | Each | 0.23 | 2014-02-22 00:00:00 |
53473 | CA | YOLO | 6510-01-515-7528 | BANDAGE KIT | 5000 | KT | 7.76 | 20120217 |
39190 | CA | LOS ANGELES | 9320-00-641-3051 | RUBBER SHEET,SOLID | 5000 | Each | 10.82 | 2013-03-14 00:00:00 |
65245 | FL | BRADFORD | 4240-00-165-5026 | FILTER ELEMENT SET, | 4245 | SE | 20.42 | 20090625 |
37344 | CA | LOS ANGELES | 7340-00-243-5390 | SPOON,FIELD MESS | 3824 | Each | 3.21 | 2014-03-19 00:00:00 |
3520 | AL | CHEROKEE | 5660-00-262-9914 | POST,FENCE,METAL | 3306 | Each | 8.49 | 2014-02-28 00:00:00 |
3517 | AL | CHEROKEE | 5660-00-262-9914 | POST,FENCE,METAL | 3306 | Each | 8.49 | 2014-02-27 00:00:00 |
3516 | AL | CHEROKEE | 5660-00-262-9914 | POST,FENCE,METAL | 3306 | Each | 8.49 | 2014-02-26 00:00:00 |
3518 | AL | CHEROKEE | 5660-00-262-9914 | POST,FENCE,METAL | 3306 | Each | 8.49 | 2014-02-28 00:00:00 |
5861 | AL | CULLMAN | 9540-01-014-0400 | BEAM,STRUCTURAL | 3000 | Foot | 27.79 | 2012-01-16 00:00:00 |
39621 | CA | MENDOCINO | 6145-01-254-4040 | WIRE,ELECTRICAL | 3000 | Foot | 0.57 | 2013-07-29 00:00:00 |
52535 | CA | VENTURA | 6145-DS-00C-ABLE | CABLE VIDEO | 3000 | FT | 0.10 | 20120302 |
36220 | CA | LOS ANGELES | 5305-01-174-7044 | SCREW,MACHINE | 2905 | Each | 9.74 | 2014-02-25 00:00:00 |
39188 | CA | LOS ANGELES | 8970-00-149-1094 | MEAL,INDIVIDUAL | 2873 | Box | 99.95 | 2014-04-02 00:00:00 |
36763 | CA | LOS ANGELES | 6515-00-967-6983 | SUTURE,NONABSORBABLE,SURGICAL | 2755 | Package | 42.98 | 2013-01-28 00:00:00 |
62059 | DE | KENT | 5305-01-253-2993 | SCREW,TAPPING | 2580 | Each | 0.35 | 2013-03-13 00:00:00 |
37482 | CA | LOS ANGELES | 7920-00-240-6350 | BROOM,WHISK | 2240 | Each | 5.37 | 2014-03-19 00:00:00 |
11002 | AL | JEFFERSON | 7340-00-240-7436 | KNIFE,FIELD MESS | 2217 | EA | 3.58 | 20120801 |
11000 | AL | JEFFERSON | 7340-00-240-7436 | KNIFE,FIELD MESS | 2217 | Each | 3.58 | 2012-07-06 00:00:00 |
11006 | AL | JEFFERSON | 7340-00-243-5390 | SPOON,FIELD MESS | 2205 | EA | 3.06 | 20120801 |
11003 | AL | JEFFERSON | 7340-00-243-5390 | SPOON,FIELD MESS | 2205 | Each | 3.06 | 2012-07-05 00:00:00 |
36049 | CA | LOS ANGELES | 5120-01-054-7141 | WRENCH,BOX AND OPEN END,COMBINATION | 2190 | Each | 15.01 | 2013-03-25 00:00:00 |
44883 | CA | SACRAMENTO | 9520-00-277-4901 | ANGLE,STRUCTURAL | 2092 | Foot | 2.12 | 2013-01-16 00:00:00 |
38709 | CA | LOS ANGELES | 8465-01-416-4634 | SPECTACLES,BALLISTIC AND LASER PROTECTIVE | 2043 | Each | 20.30 | 2012-03-25 00:00:00 |
21405 | AZ | MARICOPA | 1005-00-921-5004 | MAGAZINE,CARTRIDGE | 2000 | Each | 9.31 | 2013-06-21 00:00:00 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
29738 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29739 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29740 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29718 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29716 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29692 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-18 00:00:00 |
29703 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29693 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-18 00:00:00 |
29694 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-18 00:00:00 |
29695 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29696 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29697 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29698 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29699 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29700 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29701 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29702 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29704 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29715 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29705 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29706 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29707 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29708 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29709 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29710 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29711 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29712 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29713 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
29714 | CA | KERN | 1005-00-073-9421 | RIFLE,5.56 MILLIMETER | 1 | Each | 499.00 | 2006-12-26 00:00:00 |
73027 | FL | WASHINGTON | 7830-DS-TRE-ADM1 | TREADMILL | 1 | Each | 1500.00 | 2012-11-08 00:00:00 |
73028 rows × 8 columns
# What's wrong with the above table? (Foot, Ship Date, No county codes)
df.UI.value_counts()
Each 51581 EA 13370 Pair 1381 PR 1285 Unknown 982 Kit 732 Set 463 KT 441 SE 360 Package 355 PG 288 Box 246 BX 190 Assembly 140 Roll 114 ... ST 2 LG 1 Skein 1 CY 1 Bolt 1 MR 1 MX 1 Board Foot 1 JR 1 E4 1 Cartridge 1 SK 1 UU 1 SP 1 Hank 1 Length: 94, dtype: int64
df = df.replace({"UI":["EA", "EACH", "PR", "PG", "KT", "SE", "BX", "RO", "FT", "LT", "SL", "DZ", "HD", "AY", "CL", "LB", "RL", "SH", "CN"]},
{"UI":["Each", "Each", "Pair", "Package", "Kit", "Set", "Box", "Roll", "Foot", "Lot", "Spool", "Dozen", "Hundred", "Assembly", "Coil", "Pound", "Reel", "Sheet", "Container"]})
df.UI.value_counts()
Each 64956 Pair 2666 Kit 1173 Unknown 982 Set 823 Package 643 Box 436 Assembly 206 Roll 204 Foot 182 Lot 112 Dozen 80 Hundred 77 Spool 63 Pound 53 ... Square Foot 2 UU 1 SK 1 LG 1 Skein 1 E4 1 CY 1 JR 1 MX 1 Board Foot 1 Cartridge 1 Hank 1 Bolt 1 MR 1 SP 1 Length: 75, dtype: int64
df = df.replace({"UI":"Unknown"}, {"UI":np.NaN})
df.UI.value_counts()
Each 64956 Pair 2666 Kit 1173 Set 823 Package 643 Box 436 Assembly 206 Roll 204 Foot 182 Lot 112 Dozen 80 Hundred 77 Spool 63 Reel 53 Pound 53 ... Gross 2 LG 1 CY 1 SK 1 Hank 1 Bolt 1 MR 1 MX 1 Cartridge 1 Board Foot 1 JR 1 E4 1 Skein 1 UU 1 SP 1 Length: 74, dtype: int64
df.UI.value_counts()[20:]
Assortment 10 GL 9 Can 8 OT 7 BD 7 Drum 7 Plate 7 AT 6 PT 6 DR 5 Tube 5 Bundle 5 Strip 4 CO 4 Bale 4 Outfit 4 BP 4 BG 4 BE 4 YD 4 Thousand 3 SF 3 BA 3 Bag 3 CA 3 TU 3 Length 2 PM 2 Packet 2 ST 2 Pad 2 Gallon 2 Meter 2 Square yard 2 CT 2 QT 2 EE 2 Square Foot 2 SY 2 Gross 2 LG 1 CY 1 SK 1 Hank 1 Bolt 1 MR 1 MX 1 Cartridge 1 Board Foot 1 JR 1 E4 1 Skein 1 UU 1 SP 1 Length: 54, dtype: int64
dontcare = df.UI.value_counts()[20:]
df = df[~df.UI.isin(dontcare.index)]
date_col = df["Ship Date"].astype(str)
dates_a = pd.to_datetime(date_col[date_col.str.contains("(\d{4}-\d{2}-\d{2})")])
dates_b = pd.to_datetime(date_col[date_col.str.contains("(\d{4}\d{2}\d{2})")])
/Users/makmana/scicompute/env/lib/python2.7/site-packages/pandas/core/strings.py:188: UserWarning: This pattern has match groups. To actually get the groups, use str.extract. " groups, use str.extract.", UserWarning)
dates_new = pd.concat([dates_a.dropna(), dates_b.dropna()]).reindex_like(df)
dates_new
0 2012-08-30 1 2012-08-30 2 2012-08-30 3 2012-08-30 4 2012-08-30 5 2012-08-30 6 2012-08-30 7 2012-08-30 8 2012-08-30 9 2012-08-30 10 2012-08-30 11 2012-08-30 12 2012-08-30 13 2012-08-30 14 2012-08-30 ... 73013 2013-06-07 73014 2013-06-07 73015 2013-08-19 73016 2008-04-15 73017 2007-01-03 73018 2007-01-03 73019 2008-04-14 73020 2008-04-14 73021 2008-04-14 73022 2008-04-14 73023 2011-03-30 73024 2011-03-30 73025 2012-11-08 73026 2012-11-08 73027 2012-11-08 Name: Ship Date, Length: 72854, dtype: datetime64[ns]
df["Ship Date"] = dates_new
# See what the costs look like
df["Acquisition Cost"].hist(bins=100)
df.sort("Acquisition Cost")