import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# 데이터 읽기; 한번에 인덱스화 해서 읽는 방법은 나중에 소개됩니다.
df = pd.read_csv('data/apple_stock.csv')
df.head()
Date | High | Low | Open | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|---|
0 | 2009-12-31 | 30.478571 | 30.080000 | 30.447144 | 30.104286 | 88102700.0 | 20.159719 |
1 | 2010-01-04 | 30.642857 | 30.340000 | 30.490000 | 30.572857 | 123432400.0 | 20.473503 |
2 | 2010-01-05 | 30.798571 | 30.464285 | 30.657143 | 30.625713 | 150476200.0 | 20.508902 |
3 | 2010-01-06 | 30.747143 | 30.107143 | 30.625713 | 30.138571 | 138040000.0 | 20.182680 |
4 | 2010-01-07 | 30.285715 | 29.864286 | 30.250000 | 30.082857 | 119282800.0 | 20.145369 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2265 entries, 0 to 2264 Data columns (total 7 columns): Date 2265 non-null object High 2265 non-null float64 Low 2265 non-null float64 Open 2265 non-null float64 Close 2265 non-null float64 Volume 2265 non-null float64 Adj Close 2265 non-null float64 dtypes: float64(6), object(1) memory usage: 123.9+ KB
DatetimeIndex 아닌 RangeIndex 를 갖고 있으므로, Date 칼럼으로부터 DateTime 인덱스 생성.
df['Date'] = df['Date'].apply(pd.to_datetime)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2265 entries, 0 to 2264 Data columns (total 7 columns): Date 2265 non-null datetime64[ns] High 2265 non-null float64 Low 2265 non-null float64 Open 2265 non-null float64 Close 2265 non-null float64 Volume 2265 non-null float64 Adj Close 2265 non-null float64 dtypes: datetime64[ns](1), float64(6) memory usage: 123.9 KB
Date 칼럼이 object 에서 datetime 으로 형변환됨.
df.set_index('Date', inplace=True)
df.head()
High | Low | Open | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
Date | ||||||
2009-12-31 | 30.478571 | 30.080000 | 30.447144 | 30.104286 | 88102700.0 | 20.159719 |
2010-01-04 | 30.642857 | 30.340000 | 30.490000 | 30.572857 | 123432400.0 | 20.473503 |
2010-01-05 | 30.798571 | 30.464285 | 30.657143 | 30.625713 | 150476200.0 | 20.508902 |
2010-01-06 | 30.747143 | 30.107143 | 30.625713 | 30.138571 | 138040000.0 | 20.182680 |
2010-01-07 | 30.285715 | 29.864286 | 30.250000 | 30.082857 | 119282800.0 | 20.145369 |
Datetime 타입의 인덱스 생성됨
monthly 혹은 yearly groupby 후 aggregation 할 때의 문제점.
df['month'] = df.index.month
df.groupby('month').agg(sum)
High | Low | Open | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
month | ||||||
1 | 16141.351418 | 15840.451458 | 16007.617104 | 15989.401403 | 1.748822e+10 | 14038.491524 |
2 | 15739.942860 | 15445.045748 | 15572.844261 | 15607.478556 | 1.511651e+10 | 13799.143261 |
3 | 18676.318544 | 18358.607164 | 18522.812897 | 18516.759998 | 1.646522e+10 | 16387.278601 |
4 | 17763.422855 | 17444.350006 | 17612.804337 | 17595.992851 | 1.656252e+10 | 15549.686649 |
5 | 18874.642849 | 18552.902849 | 18711.531479 | 18731.747120 | 1.595049e+10 | 16741.255772 |
6 | 19040.630035 | 18743.354290 | 18909.838608 | 18881.168510 | 1.427367e+10 | 16927.685444 |
7 | 18908.574245 | 18616.531422 | 18760.998562 | 18775.181419 | 1.403693e+10 | 16779.291849 |
8 | 21322.417149 | 20943.891430 | 21113.692909 | 21152.527245 | 1.505109e+10 | 19060.226547 |
9 | 19596.267178 | 19222.517181 | 19423.325752 | 19400.800022 | 1.547466e+10 | 17478.810833 |
10 | 21670.588585 | 21245.169964 | 21466.111412 | 21471.372849 | 1.579270e+10 | 19412.855421 |
11 | 19932.064278 | 19547.734276 | 19761.048534 | 19746.541416 | 1.301104e+10 | 17929.619272 |
12 | 19514.165709 | 19137.724298 | 19344.287140 | 19309.390026 | 1.223923e+10 | 17473.007708 |
df.groupby(df.index.year).sum()
High | Low | Open | Close | Volume | Adj Close | month | |
---|---|---|---|---|---|---|---|
Date | |||||||
2009 | 30.478571 | 30.080000 | 30.447144 | 30.104286 | 8.810270e+07 | 20.159719 | 12 |
2010 | 9445.277117 | 9246.514246 | 9358.474289 | 9354.328560 | 3.775623e+10 | 6264.245874 | 1659 |
2011 | 13227.248539 | 12970.717155 | 13106.211399 | 13104.155712 | 3.101483e+10 | 8775.365667 | 1650 |
2012 | 20779.478600 | 20354.327164 | 20594.740028 | 20573.204269 | 3.299105e+10 | 14019.096821 | 1625 |
2013 | 17195.001381 | 16856.895695 | 17032.612881 | 17014.855690 | 2.560539e+10 | 13377.068882 | 1649 |
2014 | 23439.191429 | 23051.835686 | 23239.397186 | 23250.692886 | 1.591449e+10 | 21242.415085 | 1651 |
2015 | 30553.620010 | 29953.500031 | 30284.289993 | 30250.079948 | 1.306315e+10 | 28458.719048 | 1655 |
2016 | 26567.650009 | 26129.949989 | 26335.959984 | 26360.209991 | 9.680622e+09 | 25313.373260 | 1652 |
2017 | 38002.920044 | 37521.400078 | 37761.670013 | 37788.319946 | 6.810426e+09 | 36972.054916 | 1640 |
2018 | 47939.520004 | 46983.060043 | 47463.110077 | 47452.410126 | 8.537986e+09 | 47134.853607 | 1633 |
df.groupby([df.index.year, df.index.month]).agg(sum)
High | Low | Open | Close | Volume | Adj Close | month | ||
---|---|---|---|---|---|---|---|---|
Date | Date | |||||||
2009 | 12 | 30.478571 | 30.080000 | 30.447144 | 30.104286 | 8.810270e+07 | 20.159719 | 12 |
2010 | 1 | 573.331425 | 557.244289 | 568.397141 | 563.798571 | 3.792249e+09 | 377.554930 | 19 |
2 | 544.092857 | 532.875715 | 537.678570 | 539.588568 | 2.694020e+09 | 361.342400 | 38 | |
3 | 738.235716 | 727.080000 | 732.047146 | 734.064285 | 3.038543e+09 | 491.575537 | 69 | |
4 | 759.410004 | 744.359997 | 753.004292 | 753.445721 | 3.091782e+09 | 504.554579 | 84 | |
5 | 730.337135 | 699.494276 | 719.941429 | 718.461422 | 4.520664e+09 | 481.126873 | 100 | |
6 | 832.535709 | 812.074276 | 824.641434 | 820.742855 | 4.162813e+09 | 549.620947 | 132 | |
7 | 775.377140 | 753.262863 | 766.438568 | 764.849998 | 3.917426e+09 | 512.191607 | 147 | |
8 | 797.491417 | 783.245712 | 790.775719 | 789.872864 | 2.397280e+09 | 528.948475 | 176 | |
9 | 827.359993 | 811.739990 | 818.522850 | 821.552849 | 2.962480e+09 | 550.163418 | 189 | |
10 | 909.855713 | 893.007137 | 901.279999 | 902.802849 | 3.058644e+09 | 604.573469 | 210 | |
11 | 941.915722 | 926.407131 | 934.340004 | 934.764286 | 2.377022e+09 | 625.976896 | 231 | |
12 | 1015.334286 | 1005.722858 | 1011.407139 | 1010.384293 | 1.743309e+09 | 676.616745 | 264 | |
2011 | 1 | 973.799992 | 955.328583 | 964.618561 | 966.800003 | 2.710384e+09 | 647.430040 | 20 |
2 | 958.600002 | 944.887138 | 950.895706 | 953.210007 | 2.323987e+09 | 638.329285 | 38 | |
3 | 1151.114277 | 1131.195709 | 1143.562855 | 1141.755714 | 2.826615e+09 | 764.591324 | 69 | |
4 | 981.997139 | 965.321423 | 975.171432 | 972.615704 | 2.313457e+09 | 651.324560 | 80 | |
5 | 1032.851433 | 1018.240005 | 1025.958569 | 1025.354282 | 1.728015e+09 | 686.641624 | 105 | |
6 | 1051.567142 | 1031.604290 | 1043.190010 | 1040.541424 | 2.315963e+09 | 696.811869 | 132 | |
7 | 1071.921425 | 1050.395718 | 1058.758564 | 1063.537140 | 2.663487e+09 | 712.211285 | 140 | |
8 | 1254.649998 | 1219.489994 | 1239.569992 | 1237.934288 | 4.035650e+09 | 828.998539 | 184 | |
9 | 1191.664288 | 1164.200008 | 1177.138565 | 1177.480000 | 2.994362e+09 | 788.514557 | 189 | |
10 | 1205.449989 | 1174.577141 | 1190.231430 | 1191.690002 | 3.285357e+09 | 798.030468 | 210 | |
11 | 1166.638569 | 1145.058582 | 1158.804287 | 1154.445717 | 2.240925e+09 | 773.089329 | 231 | |
12 | 1186.994286 | 1170.418564 | 1178.311428 | 1178.791431 | 1.576633e+09 | 789.392788 | 252 | |
2012 | 1 | 1231.452858 | 1216.601433 | 1225.028564 | 1224.508568 | 1.714964e+09 | 820.007877 | 20 |
2 | 1430.997147 | 1404.107147 | 1413.722862 | 1421.631432 | 2.842139e+09 | 952.013706 | 40 | |
3 | 1829.345703 | 1792.201431 | 1812.982857 | 1815.024284 | 3.896084e+09 | 1215.454231 | 66 | |
4 | 1758.535728 | 1710.428566 | 1741.735703 | 1731.437141 | 3.899748e+09 | 1159.479080 | 80 | |
5 | 1798.158585 | 1754.962852 | 1778.181435 | 1774.687134 | 2.776586e+09 | 1188.442024 | 110 | |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2016 | 7 | 1983.279999 | 1960.049995 | 1969.019989 | 1971.129997 | 6.857796e+08 | 1891.232445 | 140 |
8 | 2487.230011 | 2460.689995 | 2471.650002 | 2476.300018 | 6.301285e+08 | 2387.152748 | 184 | |
9 | 2346.650017 | 2305.960007 | 2324.140015 | 2327.999992 | 9.680156e+08 | 2245.737114 | 189 | |
10 | 2444.150002 | 2413.769981 | 2430.619980 | 2429.850006 | 6.869143e+08 | 2343.988197 | 210 | |
11 | 2332.439995 | 2288.760017 | 2311.729988 | 2313.239990 | 7.213055e+08 | 2241.850868 | 231 | |
12 | 2415.529991 | 2384.449997 | 2393.909988 | 2401.050003 | 6.087717e+08 | 2328.097778 | 252 | |
2017 | 1 | 2397.619995 | 2375.710007 | 2381.869987 | 2391.399994 | 5.631220e+08 | 2318.740952 | 20 |
2 | 2549.070007 | 2523.899986 | 2531.460014 | 2540.569977 | 5.749686e+08 | 2470.778885 | 38 | |
3 | 3247.550003 | 3213.309982 | 3228.330002 | 3234.210007 | 5.616284e+08 | 3149.539642 | 69 | |
4 | 2728.360001 | 2704.219986 | 2717.570023 | 2714.849991 | 3.733041e+08 | 2643.776230 | 76 | |
5 | 3367.349991 | 3324.390015 | 3343.249985 | 3349.009979 | 6.539818e+08 | 3269.984604 | 110 | |
6 | 3279.949997 | 3224.819992 | 3260.730026 | 3252.289963 | 6.841781e+08 | 3180.219025 | 132 | |
7 | 2983.529999 | 2942.110031 | 2961.930008 | 2965.989990 | 4.220119e+08 | 2900.263428 | 140 | |
8 | 3683.490021 | 3629.480026 | 3655.780029 | 3657.490036 | 6.610690e+08 | 3586.263855 | 184 | |
9 | 3177.390030 | 3128.930008 | 3162.180008 | 3152.169998 | 6.803741e+08 | 3094.421860 | 180 | |
10 | 3488.910004 | 3446.750031 | 3461.829956 | 3471.979996 | 5.042913e+08 | 3408.372849 | 220 | |
11 | 3641.990005 | 3590.809998 | 3622.299988 | 3620.530014 | 6.006634e+08 | 3562.712875 | 231 | |
12 | 3457.709991 | 3416.970016 | 3434.439987 | 3437.830002 | 5.308335e+08 | 3386.980713 | 240 | |
2018 | 1 | 3679.270020 | 3629.180023 | 3654.449982 | 3654.109985 | 6.594873e+08 | 3600.061661 | 21 |
2 | 3227.869965 | 3144.110016 | 3178.250000 | 3185.139999 | 9.278943e+08 | 3146.956253 | 38 | |
3 | 3712.790009 | 3634.290039 | 3675.990005 | 3664.420029 | 7.130109e+08 | 3624.938492 | 63 | |
4 | 3604.809982 | 3534.140015 | 3565.070007 | 3566.520020 | 6.660579e+08 | 3528.093307 | 84 | |
5 | 4102.319992 | 4044.970001 | 4069.040039 | 4081.809998 | 6.209763e+08 | 4047.851425 | 110 | |
6 | 3986.470016 | 3935.799988 | 3963.130020 | 3961.050003 | 5.276245e+08 | 3933.482071 | 126 | |
7 | 4022.339996 | 3969.309967 | 3994.119995 | 3996.540009 | 3.936914e+08 | 3968.725082 | 147 | |
8 | 4934.909988 | 4860.029999 | 4883.340012 | 4906.960037 | 7.002737e+08 | 4884.851288 | 184 | |
9 | 4266.080002 | 4181.480011 | 4224.290009 | 4219.400024 | 6.789720e+08 | 4204.729004 | 171 | |
10 | 5150.640030 | 5011.109985 | 5087.600021 | 5079.449997 | 7.897485e+08 | 5061.788559 | 230 | |
11 | 4072.939987 | 3962.709976 | 4028.209991 | 4015.949997 | 9.613264e+08 | 4012.316437 | 231 | |
12 | 3179.080017 | 3075.930023 | 3139.619995 | 3121.060028 | 8.989225e+08 | 3121.060028 | 228 |
109 rows × 7 columns
시계열 데이터를 처리할 때 자주 하는 것 중 하나가 시간 인덱스를 기준으로 resampling 하는 것입니다. 그러기 위해 resample 메서드를 사용합니다.
Alias | Description |
---|---|
B | business day frequency |
C | custom business day frequency (experimental) |
D | calendar day frequency |
W | weekly frequency |
M | month end frequency |
SM | semi-month end frequency (15th and end of month) |
BM | business month end frequency |
CBM | custom business month end frequency |
MS | month start frequency |
SMS | semi-month start frequency (1st and 15th) |
BMS | business month start frequency |
CBMS | custom business month start frequency |
Q | quarter end frequency |
BQ | business quarter endfrequency |
QS | quarter start frequency |
BQS | business quarter start frequency |
A | year end frequency |
BA | business year end frequency |
AS | year start frequency |
BAS | business year start frequency |
BH | business hour frequency |
H | hourly frequency |
T, min | minutely frequency |
S | secondly frequency |
L, ms | milliseconds |
U, us | microseconds |
N | nanoseconds |
resample 호출 시 rule 파라메터를 입력해야 하는데, 입력한 rule 단위로 resampling 하게 됩니다. resampling 되어 리턴된 객체의 row 를 재조합하기 위해 aggregation 함수 (mean, sum, count, etc) 따위를 실행해서 결과를 도출합니다.
# Yearly Means
df.resample(rule='A').mean()
High | Low | Open | Close | Volume | Adj Close | month | |
---|---|---|---|---|---|---|---|
Date | |||||||
2009-12-31 | 30.478571 | 30.080000 | 30.447144 | 30.104286 | 8.810270e+07 | 20.159719 | 12.000000 |
2010-12-31 | 37.481258 | 36.692517 | 37.136803 | 37.120351 | 1.498263e+08 | 24.858119 | 6.583333 |
2011-12-31 | 52.489082 | 51.471100 | 52.008775 | 52.000618 | 1.230747e+08 | 34.822880 | 6.547619 |
2012-12-31 | 83.117914 | 81.417309 | 82.378960 | 82.292817 | 1.319642e+08 | 56.076387 | 6.500000 |
2013-12-31 | 68.234132 | 66.892443 | 67.589734 | 67.519269 | 1.016087e+08 | 53.083607 | 6.543651 |
2014-12-31 | 93.012664 | 91.475538 | 92.219830 | 92.264654 | 6.315273e+07 | 84.295298 | 6.551587 |
2015-12-31 | 121.244524 | 118.863095 | 120.175754 | 120.040000 | 5.183789e+07 | 112.931425 | 6.567460 |
2016-12-31 | 105.427183 | 103.690278 | 104.507778 | 104.604008 | 3.841516e+07 | 100.449894 | 6.555556 |
2017-12-31 | 151.406056 | 149.487650 | 150.444900 | 150.551075 | 2.713317e+07 | 147.299024 | 6.533865 |
2018-12-31 | 190.994104 | 187.183506 | 189.096056 | 189.053427 | 3.401588e+07 | 187.788261 | 6.505976 |
resampling 이후 호출할 함수를 임의로 만들 수도 있습니다.
def first_day(sample):
"""
샘플링 기간에 상관 없이 기간 내 첫번째 데이터를 리턴
"""
return sample[0]
df.resample(rule='A').apply(first_day)
High | Low | Open | Close | Volume | Adj Close | month | |
---|---|---|---|---|---|---|---|
Date | |||||||
2009-12-31 | 30.478571 | 30.080000 | 30.447144 | 30.104286 | 88102700.0 | 20.159719 | 12 |
2010-12-31 | 30.642857 | 30.340000 | 30.490000 | 30.572857 | 123432400.0 | 20.473503 | 1 |
2011-12-31 | 47.180000 | 46.405716 | 46.520000 | 47.081429 | 111284600.0 | 31.528688 | 1 |
2012-12-31 | 58.928570 | 58.428570 | 58.485714 | 58.747143 | 75555200.0 | 39.340778 | 1 |
2013-12-31 | 79.285713 | 77.375717 | 79.117142 | 78.432854 | 140129500.0 | 55.923737 | 1 |
2014-12-31 | 79.575714 | 78.860001 | 79.382858 | 79.018570 | 58671200.0 | 67.251503 | 1 |
2015-12-31 | 111.440002 | 107.349998 | 111.389999 | 109.330002 | 53204600.0 | 101.963631 | 1 |
2016-12-31 | 105.370003 | 102.000000 | 102.610001 | 105.349998 | 67649400.0 | 99.925850 | 1 |
2017-12-31 | 116.330002 | 114.760002 | 115.800003 | 116.150002 | 28781900.0 | 112.620964 | 1 |
2018-12-31 | 172.300003 | 169.259995 | 170.160004 | 172.259995 | 25555900.0 | 169.712067 | 1 |
df[['Close']].resample('A').mean().plot(kind='bar')
plt.title('Yearly Mean Close Price for Apple');
df[['Open']]['2015':].resample('M').max().plot(kind='bar', figsize=(15,8))
plt.title('Monthly Max Opening Price for Apple since 2015');
from datetime import datetime
daily_daterange = \
pd.date_range(start=datetime(2018,9,1),
end=datetime(2019,1,24),
freq='B')
daily_daterange
DatetimeIndex(['2018-09-03', '2018-09-04', '2018-09-05', '2018-09-06', '2018-09-07', '2018-09-10', '2018-09-11', '2018-09-12', '2018-09-13', '2018-09-14', ... '2019-01-11', '2019-01-14', '2019-01-15', '2019-01-16', '2019-01-17', '2019-01-18', '2019-01-21', '2019-01-22', '2019-01-23', '2019-01-24'], dtype='datetime64[ns]', length=104, freq='B')
daily_dataset = \
pd.DataFrame(
data = {'value': np.random.rand(len(daily_daterange))},
index=daily_daterange)
daily_dataset.head()
value | |
---|---|
2018-09-03 | 0.104522 |
2018-09-04 | 0.515069 |
2018-09-05 | 0.887659 |
2018-09-06 | 0.352635 |
2018-09-07 | 0.288954 |
daily_dataset.resample('W-MON').min()
value | |
---|---|
2018-09-03 | 0.104522 |
2018-09-10 | 0.288954 |
2018-09-17 | 0.196259 |
2018-09-24 | 0.116807 |
2018-10-01 | 0.319771 |
2018-10-08 | 0.318485 |
2018-10-15 | 0.078227 |
2018-10-22 | 0.150858 |
2018-10-29 | 0.092146 |
2018-11-05 | 0.595603 |
2018-11-12 | 0.036903 |
2018-11-19 | 0.187757 |
2018-11-26 | 0.028984 |
2018-12-03 | 0.420747 |
2018-12-10 | 0.040747 |
2018-12-17 | 0.066793 |
2018-12-24 | 0.228188 |
2018-12-31 | 0.123885 |
2019-01-07 | 0.203852 |
2019-01-14 | 0.185558 |
2019-01-21 | 0.057018 |
2019-01-28 | 0.144828 |
daily_dataset.resample('M').min()
value | |
---|---|
2018-09-30 | 0.104522 |
2018-10-31 | 0.078227 |
2018-11-30 | 0.028984 |
2018-12-31 | 0.040747 |
2019-01-31 | 0.057018 |