import numpy as np
import pandas as pd
%matplotlib inline
from sklearn import cross_validation
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
# Load the training set from data/train.csv and preview its first rows.
df_origin = pd.read_csv("data/train.csv")
df_origin.head()
datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0 | 3 | 13 | 16 |
1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0 | 8 | 32 | 40 |
2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0 | 5 | 27 | 32 |
3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0 | 3 | 10 | 13 |
4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0 | 0 | 1 | 1 |
df_origin.tail(24)
datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
10862 | 2012-12-19 00:00:00 | 4 | 0 | 1 | 1 | 12.30 | 15.910 | 61 | 0.0000 | 6 | 35 | 41 |
10863 | 2012-12-19 01:00:00 | 4 | 0 | 1 | 1 | 12.30 | 15.910 | 65 | 6.0032 | 1 | 14 | 15 |
10864 | 2012-12-19 02:00:00 | 4 | 0 | 1 | 1 | 11.48 | 15.150 | 65 | 6.0032 | 1 | 2 | 3 |
10865 | 2012-12-19 03:00:00 | 4 | 0 | 1 | 1 | 10.66 | 13.635 | 75 | 8.9981 | 0 | 5 | 5 |
10866 | 2012-12-19 04:00:00 | 4 | 0 | 1 | 1 | 9.84 | 12.120 | 75 | 8.9981 | 1 | 6 | 7 |
10867 | 2012-12-19 05:00:00 | 4 | 0 | 1 | 1 | 10.66 | 14.395 | 75 | 6.0032 | 2 | 29 | 31 |
10868 | 2012-12-19 06:00:00 | 4 | 0 | 1 | 1 | 9.84 | 12.880 | 75 | 6.0032 | 3 | 109 | 112 |
10869 | 2012-12-19 07:00:00 | 4 | 0 | 1 | 1 | 10.66 | 13.635 | 75 | 8.9981 | 3 | 360 | 363 |
10870 | 2012-12-19 08:00:00 | 4 | 0 | 1 | 1 | 9.84 | 12.880 | 87 | 7.0015 | 13 | 665 | 678 |
10871 | 2012-12-19 09:00:00 | 4 | 0 | 1 | 1 | 11.48 | 14.395 | 75 | 7.0015 | 8 | 309 | 317 |
10872 | 2012-12-19 10:00:00 | 4 | 0 | 1 | 1 | 13.12 | 16.665 | 70 | 7.0015 | 17 | 147 | 164 |
10873 | 2012-12-19 11:00:00 | 4 | 0 | 1 | 1 | 16.40 | 20.455 | 54 | 15.0013 | 31 | 169 | 200 |
10874 | 2012-12-19 12:00:00 | 4 | 0 | 1 | 1 | 16.40 | 20.455 | 54 | 19.0012 | 33 | 203 | 236 |
10875 | 2012-12-19 13:00:00 | 4 | 0 | 1 | 1 | 17.22 | 21.210 | 50 | 12.9980 | 30 | 183 | 213 |
10876 | 2012-12-19 14:00:00 | 4 | 0 | 1 | 1 | 17.22 | 21.210 | 50 | 12.9980 | 33 | 185 | 218 |
10877 | 2012-12-19 15:00:00 | 4 | 0 | 1 | 1 | 17.22 | 21.210 | 50 | 19.0012 | 28 | 209 | 237 |
10878 | 2012-12-19 16:00:00 | 4 | 0 | 1 | 1 | 17.22 | 21.210 | 50 | 23.9994 | 37 | 297 | 334 |
10879 | 2012-12-19 17:00:00 | 4 | 0 | 1 | 1 | 16.40 | 20.455 | 50 | 26.0027 | 26 | 536 | 562 |
10880 | 2012-12-19 18:00:00 | 4 | 0 | 1 | 1 | 15.58 | 19.695 | 50 | 23.9994 | 23 | 546 | 569 |
10881 | 2012-12-19 19:00:00 | 4 | 0 | 1 | 1 | 15.58 | 19.695 | 50 | 26.0027 | 7 | 329 | 336 |
10882 | 2012-12-19 20:00:00 | 4 | 0 | 1 | 1 | 14.76 | 17.425 | 57 | 15.0013 | 10 | 231 | 241 |
10883 | 2012-12-19 21:00:00 | 4 | 0 | 1 | 1 | 13.94 | 15.910 | 61 | 15.0013 | 4 | 164 | 168 |
10884 | 2012-12-19 22:00:00 | 4 | 0 | 1 | 1 | 13.94 | 17.425 | 61 | 6.0032 | 12 | 117 | 129 |
10885 | 2012-12-19 23:00:00 | 4 | 0 | 1 | 1 | 13.12 | 16.665 | 66 | 8.9981 | 4 | 84 | 88 |
df_origin.describe()
season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.00000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 | 10886.000000 |
mean | 2.506614 | 0.028569 | 0.680875 | 1.418427 | 20.23086 | 23.655084 | 61.886460 | 12.799395 | 36.021955 | 155.552177 | 191.574132 |
std | 1.116174 | 0.166599 | 0.466159 | 0.633839 | 7.79159 | 8.474601 | 19.245033 | 8.164537 | 49.960477 | 151.039033 | 181.144454 |
min | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.82000 | 0.760000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
25% | 2.000000 | 0.000000 | 0.000000 | 1.000000 | 13.94000 | 16.665000 | 47.000000 | 7.001500 | 4.000000 | 36.000000 | 42.000000 |
50% | 3.000000 | 0.000000 | 1.000000 | 1.000000 | 20.50000 | 24.240000 | 62.000000 | 12.998000 | 17.000000 | 118.000000 | 145.000000 |
75% | 4.000000 | 0.000000 | 1.000000 | 2.000000 | 26.24000 | 31.060000 | 77.000000 | 16.997900 | 49.000000 | 222.000000 | 284.000000 |
max | 4.000000 | 1.000000 | 1.000000 | 4.000000 | 41.00000 | 45.455000 | 100.000000 | 56.996900 | 367.000000 | 886.000000 | 977.000000 |
# Load the test set from data/test.csv and preview its first rows.
df_test = pd.read_csv('data/test.csv')
df_test.head()
datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | |
---|---|---|---|---|---|---|---|---|---|
0 | 2011-01-20 00:00:00 | 1 | 0 | 1 | 1 | 10.66 | 11.365 | 56 | 26.0027 |
1 | 2011-01-20 01:00:00 | 1 | 0 | 1 | 1 | 10.66 | 13.635 | 56 | 0.0000 |
2 | 2011-01-20 02:00:00 | 1 | 0 | 1 | 1 | 10.66 | 13.635 | 56 | 0.0000 |
3 | 2011-01-20 03:00:00 | 1 | 0 | 1 | 1 | 10.66 | 12.880 | 56 | 11.0014 |
4 | 2011-01-20 04:00:00 | 1 | 0 | 1 | 1 | 10.66 | 12.880 | 56 | 11.0014 |
df_test.shape
(6493, 9)
并没有发现nan和其他异常
df_origin.columns.values
array(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'], dtype=object)
def findNan(df):
    """Print each column's distinct values and report its missing entries.

    Scanning a long ``unique()`` dump by eye is an unreliable way to spot
    NaN, so the number of null entries per column is also counted
    explicitly.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        dict mapping each column name to its number of null (NaN/None)
        entries.
    """
    cols = df.columns.values
    print(cols)
    null_counts = {}
    for c in cols:
        # %-formatting prints the same text under Python 2 and Python 3.
        print("%s %s" % (c, df[c].unique()))
        null_counts[c] = int(df[c].isnull().sum())
    print("null counts: %s" % null_counts)
    return null_counts
findNan(df_origin)
['datetime' 'season' 'holiday' 'workingday' 'weather' 'temp' 'atemp' 'humidity' 'windspeed' 'casual' 'registered' 'count'] datetime ['2011-01-01 00:00:00' '2011-01-01 01:00:00' '2011-01-01 02:00:00' ..., '2012-12-19 21:00:00' '2012-12-19 22:00:00' '2012-12-19 23:00:00'] season [1 2 3 4] holiday [0 1] workingday [0 1] weather [1 2 3 4] temp [ 9.84 9.02 8.2 13.12 15.58 14.76 17.22 18.86 18.04 16.4 13.94 12.3 10.66 6.56 5.74 7.38 4.92 11.48 4.1 3.28 2.46 21.32 22.96 23.78 24.6 19.68 22.14 20.5 27.06 26.24 25.42 27.88 28.7 30.34 31.16 29.52 33.62 35.26 36.9 32.8 31.98 34.44 36.08 37.72 38.54 1.64 0.82 39.36 41. ] atemp [ 14.395 13.635 12.88 17.425 19.695 16.665 21.21 22.725 21.97 20.455 11.365 10.605 9.85 8.335 6.82 5.305 6.06 9.09 12.12 7.575 15.91 3.03 3.79 4.545 15.15 18.18 25. 26.515 27.275 29.545 23.485 25.76 31.06 30.305 24.24 18.94 31.82 32.575 33.335 28.79 34.85 35.605 37.12 40.15 41.665 40.91 39.395 34.09 28.03 36.365 37.88 42.425 43.94 38.635 1.515 0.76 2.275 43.18 44.695 45.455] humidity [ 81 80 75 86 76 77 72 82 88 87 94 100 71 66 57 46 42 39 44 47 50 43 40 35 30 32 64 69 55 59 63 68 74 51 56 52 49 48 37 33 28 38 36 93 29 53 34 54 41 45 92 62 58 61 60 65 70 27 25 26 31 73 21 24 23 22 19 15 67 10 8 12 14 13 17 16 18 20 85 0 83 84 78 79 89 97 90 96 91] windspeed [ 0. 
6.0032 16.9979 19.0012 19.9995 12.998 15.0013 8.9981 11.0014 22.0028 30.0026 23.9994 27.9993 26.0027 7.0015 32.9975 36.9974 31.0009 35.0008 39.0007 43.9989 40.9973 51.9987 46.0022 50.0021 43.0006 56.9969 47.9988] casual [ 3 8 5 0 2 1 12 26 29 47 35 40 41 15 9 6 11 4 7 16 20 19 10 13 14 18 17 21 33 23 22 28 48 52 42 24 30 27 32 58 62 51 25 31 59 45 73 55 68 34 38 102 84 39 36 43 46 60 80 83 74 37 70 81 100 99 54 88 97 144 149 124 98 50 72 57 71 67 95 90 126 174 168 170 175 138 92 56 111 89 69 139 166 219 240 147 148 78 53 63 79 114 94 85 128 93 121 156 135 103 44 49 64 91 119 167 181 179 161 143 75 66 109 123 113 65 86 82 132 129 196 142 122 106 61 107 120 195 183 206 158 137 76 115 150 188 193 180 127 154 108 96 110 112 169 131 176 134 162 153 210 118 141 146 159 178 177 136 215 198 248 225 194 237 242 235 224 236 222 77 87 101 145 182 171 160 133 105 104 187 221 201 205 234 185 164 200 130 155 116 125 204 186 214 245 218 217 152 191 256 251 262 189 212 272 223 208 165 229 151 117 199 140 226 286 352 357 367 291 233 190 283 295 232 173 184 172 320 355 326 321 354 299 227 254 260 207 274 308 288 311 253 197 163 275 298 282 266 220 241 230 157 293 257 269 255 228 276 332 361 356 331 279 203 250 259 297 265 267 192 239 238 213 264 244 243 246 289 287 209 263 249 247 284 327 325 312 350 258 362 310 317 268 202 294 280 216 292 304] registered [ 13 32 27 10 1 0 2 7 6 24 30 55 47 71 70 52 26 31 25 17 16 8 4 19 46 54 73 64 67 58 43 29 20 9 5 3 63 153 81 33 41 48 53 66 146 148 102 49 11 36 92 177 98 37 50 79 68 202 179 110 34 87 192 109 74 65 85 186 166 127 82 40 18 95 216 116 42 57 78 59 163 158 51 76 190 125 178 39 14 15 56 60 90 83 69 28 35 22 12 77 44 38 75 184 174 154 97 214 45 72 130 94 139 135 197 137 141 156 117 155 134 89 80 108 61 124 132 196 107 114 172 165 105 119 183 175 88 62 86 170 145 217 91 195 152 21 126 115 223 207 123 236 128 151 100 198 157 168 84 99 173 121 159 93 23 212 111 193 103 113 122 106 96 249 218 194 213 191 142 224 244 143 267 256 211 161 131 
246 118 164 275 204 230 243 112 238 144 185 101 222 138 206 104 200 129 247 140 209 136 176 120 229 210 133 259 147 227 150 282 162 265 260 189 237 245 205 308 283 248 303 291 280 208 286 352 290 262 203 284 293 160 182 316 338 279 187 277 362 321 331 372 377 350 220 472 450 268 435 169 225 464 485 323 388 367 266 255 415 233 467 456 305 171 470 385 253 215 240 235 263 221 351 539 458 339 301 397 271 532 480 365 241 421 242 234 341 394 540 463 361 429 359 180 188 261 254 366 181 398 272 167 149 325 521 426 298 428 487 431 288 239 453 454 345 417 434 278 285 442 484 451 252 471 488 270 258 264 281 410 516 500 343 311 432 475 479 355 329 199 400 414 423 232 219 302 529 510 348 346 441 473 335 445 555 527 273 364 299 269 257 342 324 226 391 466 297 517 486 489 492 228 289 455 382 380 295 251 418 412 340 433 231 333 514 483 276 478 287 381 334 347 320 493 491 369 201 408 378 443 460 465 313 513 292 497 376 326 413 328 525 296 452 506 393 368 337 567 462 349 319 300 515 373 399 507 396 512 503 386 427 312 384 530 310 536 437 505 371 375 534 469 474 553 402 274 523 448 409 387 438 407 250 459 425 422 379 392 430 401 306 370 449 363 389 374 436 356 317 446 294 508 315 522 494 327 495 404 447 504 318 579 551 498 533 332 554 509 573 545 395 440 547 557 623 571 614 638 628 642 647 602 634 648 353 322 357 314 563 615 681 601 543 577 354 661 653 304 645 646 419 610 677 618 595 565 586 670 656 626 581 546 604 596 383 621 564 309 360 330 549 589 461 631 673 358 651 663 538 616 662 344 640 659 770 608 617 584 307 667 605 641 594 629 603 518 665 769 749 499 719 734 696 688 570 675 405 411 643 733 390 680 764 679 531 637 652 778 703 537 576 613 715 726 598 625 444 672 782 548 682 750 716 609 698 572 669 633 725 704 658 620 542 575 511 741 790 644 740 735 560 739 439 660 697 336 619 712 624 580 678 684 468 649 786 718 775 636 578 746 743 481 664 711 689 751 745 424 699 552 709 591 757 768 767 723 558 561 403 502 692 780 622 761 690 744 857 562 702 802 727 811 886 406 787 496 708 758 
812 807 791 639 781 833 756 544 789 742 655 416 806 773 737 706 566 713 800 839 779 766 794 803 788 720 668 490 568 597 477 583 501 556 593 420 541 694 650 559 666 700 693 582] count [ 16 40 32 13 1 2 3 8 14 36 56 84 94 106 110 93 67 35 37 34 28 39 17 9 6 20 53 70 75 59 74 76 65 30 22 31 5 64 154 88 44 51 61 77 72 157 52 12 4 179 100 42 57 78 97 63 83 212 182 112 54 48 11 33 195 115 46 79 71 62 89 190 169 132 43 19 95 219 122 45 86 172 163 69 23 7 210 134 73 50 87 187 123 15 25 98 102 55 10 49 82 92 41 38 188 47 178 155 24 18 27 99 217 130 136 29 128 81 68 139 137 202 60 162 144 158 117 90 159 101 118 129 26 104 91 113 105 21 80 125 133 197 109 161 135 116 176 168 108 103 175 147 96 220 127 205 174 121 230 66 114 216 243 152 199 58 166 170 165 160 140 211 120 145 256 126 223 85 206 124 255 222 285 146 274 272 185 191 232 327 224 107 119 196 171 214 242 148 268 201 150 111 167 228 198 204 164 233 257 151 248 235 141 249 194 259 156 153 244 213 181 221 250 304 241 271 282 225 253 237 299 142 313 310 207 138 280 173 332 331 149 267 301 312 278 281 184 215 367 349 292 303 339 143 189 366 386 273 325 356 314 343 333 226 203 177 263 297 288 236 240 131 452 383 284 291 309 321 193 337 388 300 200 180 209 354 361 306 277 428 362 286 351 192 411 421 276 264 238 266 371 269 537 518 218 265 459 186 517 544 365 290 410 396 296 440 533 520 258 450 246 260 344 553 470 298 347 373 436 378 342 289 340 382 390 358 385 239 374 598 524 384 425 611 550 434 318 442 401 234 594 527 364 387 491 398 270 279 294 295 322 456 437 392 231 394 453 308 604 480 283 565 489 487 183 302 547 513 454 486 467 572 525 379 502 558 564 391 293 247 317 369 420 451 404 341 251 335 417 363 357 438 579 556 407 336 334 477 539 551 424 346 353 481 506 432 409 466 326 254 463 380 275 311 315 360 350 252 328 476 227 601 586 423 330 569 538 370 498 638 607 416 261 355 552 208 468 449 381 377 397 492 427 461 422 305 375 376 414 447 408 418 457 545 496 368 245 596 563 443 562 229 316 402 287 372 514 472 511 488 
419 595 578 400 348 587 497 433 475 406 430 324 262 323 412 530 543 413 435 555 523 441 529 532 585 399 584 559 307 582 571 426 516 465 329 483 600 570 628 531 455 389 505 359 431 460 590 429 599 338 566 482 568 540 495 345 591 593 446 485 393 500 473 352 320 479 444 462 405 620 499 625 395 528 319 519 445 512 471 508 526 509 484 448 515 549 501 612 597 464 644 712 676 734 662 782 749 623 713 746 651 686 690 679 685 648 560 503 521 554 541 721 801 561 573 589 729 618 494 757 800 684 744 759 822 698 490 536 655 643 626 615 567 617 632 646 692 704 624 656 610 738 671 678 660 658 635 681 616 522 673 781 775 576 677 748 776 557 743 666 813 504 627 706 641 575 639 769 680 546 717 710 458 622 705 630 732 770 439 779 659 602 478 733 650 873 846 474 634 852 868 745 812 669 642 730 672 645 694 493 668 647 702 665 834 850 790 415 724 869 700 793 723 534 831 613 653 857 719 867 823 403 693 603 583 542 614 580 811 795 747 581 722 689 849 872 631 649 819 674 830 814 633 825 629 835 667 755 794 661 772 657 771 777 837 891 652 739 865 767 741 469 605 858 843 640 737 862 810 577 818 854 682 851 848 897 832 791 654 856 839 725 863 808 792 696 701 871 968 750 970 877 925 977 758 884 766 894 715 783 683 842 774 797 886 892 784 687 809 917 901 887 785 900 761 806 507 948 844 798 827 670 637 619 592 943 838 817 888 890 788 588 606 608 691 711 663 731 708 609 688 636]
没有发现null
# The timestamps look like 'YYYY-MM-DD HH:MM:SS'; characters 11-12 are the
# two-digit hour, converted here to an integer feature.
df_origin['hour'] = df_origin['datetime'].str.slice(11, 13).astype(int)
df_origin.head()
datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | hour | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0 | 3 | 13 | 16 | 0 |
1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0 | 8 | 32 | 40 | 1 |
2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0 | 5 | 27 | 32 | 2 |
3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0 | 3 | 10 | 13 | 3 |
4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0 | 0 | 1 | 1 | 4 |
from datetime import datetime
def func(df):
    """Add a 'week' column with the weekday of each row's timestamp.

    The first ten characters of the 'datetime' column are parsed as a
    '%Y-%m-%d' date and mapped to the weekday number (Monday=0 ... Sunday=6).
    The computation is vectorized, so it does not depend on the frame having
    a 0..n-1 index and never appends rows.

    Args:
        df: pandas DataFrame with a string 'datetime' column.

    Returns:
        The same DataFrame object, modified in place with the new 'week'
        column.
    """
    dates = pd.to_datetime(df['datetime'].str[:10], format='%Y-%m-%d')
    df['week'] = dates.dt.weekday
    return df
df_origin = func(df_origin)
df_origin.head()
datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | hour | week | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0 | 3 | 13 | 16 | 0 | 5 |
1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0 | 8 | 32 | 40 | 1 | 5 |
2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0 | 5 | 27 | 32 | 2 | 5 |
3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0 | 3 | 10 | 13 | 3 | 5 |
4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0 | 0 | 1 | 1 | 4 | 5 |
# Year is characters 0-3 and month is characters 5-6 of the timestamp string;
# both are stored as integers.
df_origin['year'] = df_origin['datetime'].str.slice(0, 4).astype(int)
df_origin['month'] = df_origin['datetime'].str.slice(5, 7).astype(int)
df_origin.head()
datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | hour | week | year | month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0 | 3 | 13 | 16 | 0 | 5 | 2011 | 1 |
1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0 | 8 | 32 | 40 | 1 | 5 | 2011 | 1 |
2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0 | 5 | 27 | 32 | 2 | 5 | 2011 | 1 |
3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0 | 3 | 10 | 13 | 3 | 5 | 2011 | 1 |
4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0 | 0 | 1 | 1 | 4 | 5 | 2011 | 1 |
df_origin.columns.values
array(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count', 'hour', 'week', 'year', 'month'], dtype=object)
# Keep every column except the raw 'datetime' string. The explicit copy makes
# df_clean an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
df_clean = df_origin[['season', 'holiday', 'workingday', 'weather', 'temp',
                      'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
                      'hour', 'week', 'year', 'month']].copy()
df_clean.head()
season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | hour | week | year | month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0 | 3 | 13 | 16 | 0 | 5 | 2011 | 1 |
1 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0 | 8 | 32 | 40 | 1 | 5 | 2011 | 1 |
2 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0 | 5 | 27 | 32 | 2 | 5 | 2011 | 1 |
3 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0 | 3 | 10 | 13 | 3 | 5 | 2011 | 1 |
4 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0 | 0 | 1 | 1 | 4 | 5 | 2011 | 1 |
# Same hour extraction as for the training data: characters 11-12 of the
# timestamp string, converted to int.
df_test['hour'] = df_test['datetime'].str.slice(11, 13).astype(int)
df_test = func(df_test)
df_test.shape
(6493, 11)
# Year (characters 0-3) and month (characters 5-6) of the timestamp string,
# stored as integers.
df_test['year'] = df_test['datetime'].str.slice(0, 4).astype(int)
df_test['month'] = df_test['datetime'].str.slice(5, 7).astype(int)
df_test.head()
datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | hour | week | year | month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2011-01-20 00:00:00 | 1 | 0 | 1 | 1 | 10.66 | 11.365 | 56 | 26.0027 | 0 | 3 | 2011 | 1 |
1 | 2011-01-20 01:00:00 | 1 | 0 | 1 | 1 | 10.66 | 13.635 | 56 | 0.0000 | 1 | 3 | 2011 | 1 |
2 | 2011-01-20 02:00:00 | 1 | 0 | 1 | 1 | 10.66 | 13.635 | 56 | 0.0000 | 2 | 3 | 2011 | 1 |
3 | 2011-01-20 03:00:00 | 1 | 0 | 1 | 1 | 10.66 | 12.880 | 56 | 11.0014 | 3 | 3 | 2011 | 1 |
4 | 2011-01-20 04:00:00 | 1 | 0 | 1 | 1 | 10.66 | 12.880 | 56 | 11.0014 | 4 | 3 | 2011 | 1 |
df_clean_test = df_test.drop('datetime', axis=1)
df_clean_test.head()
season | holiday | workingday | weather | temp | atemp | humidity | windspeed | hour | week | year | month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 1 | 1 | 10.66 | 11.365 | 56 | 26.0027 | 0 | 3 | 2011 | 1 |
1 | 1 | 0 | 1 | 1 | 10.66 | 13.635 | 56 | 0.0000 | 1 | 3 | 2011 | 1 |
2 | 1 | 0 | 1 | 1 | 10.66 | 13.635 | 56 | 0.0000 | 2 | 3 | 2011 | 1 |
3 | 1 | 0 | 1 | 1 | 10.66 | 12.880 | 56 | 11.0014 | 3 | 3 | 2011 | 1 |
4 | 1 | 0 | 1 | 1 | 10.66 | 12.880 | 56 | 11.0014 | 4 | 3 | 2011 | 1 |
df_origin['casual'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7f04e10f2c90>
df_origin['registered'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7f04e16c91d0>
# Model log(count + 1) rather than the raw, heavily skewed counts.
# np.log1p is used because a bare `log` is not defined by this file's imports.
df_clean['log_cas'] = np.log1p(df_origin['casual'])
df_clean['log_reg'] = np.log1p(df_origin['registered'])
df_clean.head()
season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | hour | week | year | month | log_cas | log_reg | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0 | 3 | 13 | 16 | 0 | 5 | 2011 | 1 | 1.386294 | 2.639057 |
1 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0 | 8 | 32 | 40 | 1 | 5 | 2011 | 1 | 2.197225 | 3.496508 |
2 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0 | 5 | 27 | 32 | 2 | 5 | 2011 | 1 | 1.791759 | 3.332205 |
3 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0 | 3 | 10 | 13 | 3 | 5 | 2011 | 1 | 1.386294 | 2.397895 |
4 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0 | 0 | 1 | 1 | 4 | 5 | 2011 | 1 | 0.000000 | 0.693147 |
df_clean['log_cas'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7f04e1d5ec90>
df_clean['log_reg'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7f04e14a0f50>
df_clean['temp'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7f04e1207a10>
df_clean.head()
season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | hour | week | year | month | log_cas | log_reg | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0 | 3 | 13 | 16 | 0 | 5 | 2011 | 1 | 1.386294 | 2.639057 |
1 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0 | 8 | 32 | 40 | 1 | 5 | 2011 | 1 | 2.197225 | 3.496508 |
2 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0 | 5 | 27 | 32 | 2 | 5 | 2011 | 1 | 1.791759 | 3.332205 |
3 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0 | 3 | 10 | 13 | 3 | 5 | 2011 | 1 | 1.386294 | 2.397895 |
4 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0 | 0 | 1 | 1 | 4 | 5 | 2011 | 1 | 0.000000 | 0.693147 |
# Candidate predictor columns; 'month' and the target columns are not included.
fea_cols=['season', 'holiday', 'workingday', 'weather', 'temp',
'atemp', 'humidity', 'windspeed',
'hour', 'week', 'year']
# Pairwise correlation matrix of the candidate predictors.
df_clean[fea_cols].corr()
season | holiday | workingday | weather | temp | atemp | humidity | windspeed | hour | week | year | |
---|---|---|---|---|---|---|---|---|---|---|---|
season | 1.000000 | 0.029368 | -0.008126 | 0.008879 | 0.258689 | 0.264744 | 0.190610 | -0.147121 | -0.006546 | -0.010553 | -0.004797 |
holiday | 0.029368 | 1.000000 | -0.250491 | -0.007074 | 0.000295 | -0.005215 | 0.001929 | 0.008409 | -0.000354 | -0.191832 | 0.012021 |
workingday | -0.008126 | -0.250491 | 1.000000 | 0.033772 | 0.029966 | 0.024660 | -0.010880 | 0.013373 | 0.002780 | -0.704267 | -0.002482 |
weather | 0.008879 | -0.007074 | 0.033772 | 1.000000 | -0.055035 | -0.055376 | 0.406244 | 0.007261 | -0.022740 | -0.047692 | -0.012548 |
temp | 0.258689 | 0.000295 | 0.029966 | -0.055035 | 1.000000 | 0.984948 | -0.064949 | -0.017852 | 0.145430 | -0.038466 | 0.061226 |
atemp | 0.264744 | -0.005215 | 0.024660 | -0.055376 | 0.984948 | 1.000000 | -0.043536 | -0.057473 | 0.140343 | -0.040235 | 0.058540 |
humidity | 0.190610 | 0.001929 | -0.010880 | 0.406244 | -0.064949 | -0.043536 | 1.000000 | -0.318607 | -0.278011 | -0.026507 | -0.078606 |
windspeed | -0.147121 | 0.008409 | 0.013373 | 0.007261 | -0.017852 | -0.057473 | -0.318607 | 1.000000 | 0.146631 | -0.024804 | -0.015221 |
hour | -0.006546 | -0.000354 | 0.002780 | -0.022740 | 0.145430 | 0.140343 | -0.278011 | 0.146631 | 1.000000 | -0.002925 | -0.004234 |
week | -0.010553 | -0.191832 | -0.704267 | -0.047692 | -0.038466 | -0.040235 | -0.026507 | -0.024804 | -0.002925 | 1.000000 | -0.003785 |
year | -0.004797 | 0.012021 | -0.002482 | -0.012548 | 0.061226 | 0.058540 | -0.078606 | -0.015221 | -0.004234 | -0.003785 | 1.000000 |
# Random forest for the 'log_cas' target, fitted on all candidate features.
clf_cal = RandomForestRegressor(n_estimators=1000, min_samples_split=11, oob_score=True)
clf_cal.fit(df_clean[fea_cols].values, df_clean['log_cas'].values)
# Index the importances by feature name so each bar in the chart is labelled
# with its column instead of a bare position number.
pd.Series(clf_cal.feature_importances_, index=fea_cols).plot(kind='bar')
# Out-of-bag score of the fitted forest.
clf_cal.oob_score_
0.89344344503536677
# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(clf_cal.feature_importances_)
[ 0.01330289 0.00076692 0.04385451 0.01615629 0.20644448 0.05063536 0.04191187 0.01809409 0.5683129 0.02823818 0.01228251]
# Reduced feature set for the 'log_cas' model: fea_cols without 'holiday'
# and 'atemp'.
fea_cas = ['season', 'workingday', 'weather', 'temp',
           'humidity', 'windspeed', 'hour', 'week', 'year']
# Refit the same estimator on the reduced set; this is the model used later
# for prediction.
clf_cal.fit(df_clean[fea_cas].values, df_clean['log_cas'].values)
# Label each importance bar with its feature name.
pd.Series(clf_cal.feature_importances_, index=fea_cas).plot(kind='bar')
clf_cal.oob_score_
0.89297838303244015
# Random forest for the 'log_reg' target, fitted on all candidate features.
clf_reg = RandomForestRegressor(n_estimators=1000, min_samples_split=11, oob_score=True)
clf_reg.fit(df_clean[fea_cols].values, df_clean['log_reg'].values)
# Index the importances by feature name so each bar in the chart is labelled
# with its column instead of a bare position number.
pd.Series(clf_reg.feature_importances_, index=fea_cols).plot(kind='bar')
# Out-of-bag score of the fitted forest.
clf_reg.oob_score_
0.95359572173626694
# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(clf_reg.feature_importances_)
[ 0.03950298 0.00121528 0.05732649 0.0100157 0.02107919 0.01384416 0.01612237 0.00780388 0.75752831 0.03690614 0.0386555 ]
# Reduced feature set for the 'log_reg' model: fea_cols without 'holiday',
# 'atemp' and 'windspeed'.
fea_regs = ['season', 'workingday', 'weather', 'temp', 'humidity', 'hour', 'week', 'year']
# Refit the same estimator on the reduced set; this is the model used later
# for prediction.
clf_reg.fit(df_clean[fea_regs].values, df_clean['log_reg'].values)
# Label each importance bar with its feature name.
pd.Series(clf_reg.feature_importances_, index=fea_regs).plot(kind='bar')
clf_reg.oob_score_
0.95262817165263025
# Both models predict log(count + 1); np.expm1 undoes that transform for each
# part before the two are summed (equivalent to exp(a) + exp(b) - 2).
# np.expm1 is used because a bare `exp` is not defined by this file's imports.
# Predicting on .values matches how the models were fitted (plain arrays).
y_pred7 = (np.expm1(clf_cal.predict(df_clean_test[fea_cas].values))
           + np.expm1(clf_reg.predict(df_clean_test[fea_regs].values)))
y_pred7[:40]
array([ 12.6578609 , 5.11132596, 2.14958816, 1.92269641, 1.81692124, 5.57134727, 37.76960661, 105.51192167, 230.32781203, 132.20060507, 62.05271725, 64.3684902 , 78.37407 , 75.49564067, 71.52856154, 82.01572385, 92.11493153, 195.16373374, 184.1821498 , 108.26276277, 79.14253341, 51.47307386, 41.69604466, 29.04587166, 13.1373564 , 7.19038866, 2.18226466, 1.37555083, 1.17584422, 5.91155125, 30.8322952 , 98.04260523, 206.6726649 , 132.49579712, 48.25784851, 59.97465746, 71.71815482, 63.04166008, 58.55824695, 58.87427819])
# Round every prediction to the nearest whole number; y_pred7 becomes a plain list.
y_pred7 = [round(x) for x in y_pred7]
# Attach the rounded predictions to the test frame and store them as integers.
df_test['count'] = y_pred7
df_test['count'] = df_test['count'].astype(int)
df_test.head()
datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | hour | week | year | month | count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2011-01-20 00:00:00 | 1 | 0 | 1 | 1 | 10.66 | 11.365 | 56 | 26.0027 | 0 | 3 | 2011 | 1 | 13 |
1 | 2011-01-20 01:00:00 | 1 | 0 | 1 | 1 | 10.66 | 13.635 | 56 | 0.0000 | 1 | 3 | 2011 | 1 | 5 |
2 | 2011-01-20 02:00:00 | 1 | 0 | 1 | 1 | 10.66 | 13.635 | 56 | 0.0000 | 2 | 3 | 2011 | 1 | 2 |
3 | 2011-01-20 03:00:00 | 1 | 0 | 1 | 1 | 10.66 | 12.880 | 56 | 11.0014 | 3 | 3 | 2011 | 1 | 2 |
4 | 2011-01-20 04:00:00 | 1 | 0 | 1 | 1 | 10.66 | 12.880 | 56 | 11.0014 | 4 | 3 | 2011 | 1 | 2 |
df_test.shape
(6493, 14)
# Write only the 'datetime' and 'count' columns to the file 'result15'.
# The keyword is `sep` (the original `seq` was a typo that current pandas
# rejects with a TypeError).
df_test.to_csv('result15', sep=',', columns=['datetime', 'count'], header=['datetime', 'count'], index=False)
效果一般,进一步尝试