References
https://qiita.com/nkjm/items/e751e49c7d2c619cbeab
https://momonoki2017.blogspot.com/2018/04/r007-riris.html
http://d-m-l.jp/Rbiz/task_rf.html
https://funatsu-lab.github.io/open-course-ware/machine-learning/random-forest/
http://takenaka-akio.org/doc/r_auto/chapter_03.html
http://yut.hatenablog.com/entry/20120827/1346024147
https://mjin.doshisha.ac.jp/R/Chap_23/23.html
https://qiita.com/TsutomuNakamura/items/a1a6a02cb9bb0dcbb37f — what a confusion matrix is
http://d-m-l.jp/Rbiz/task_rf.html — what a random forest is

A random forest is a machine-learning algorithm that draws random samples from the training data, builds many decision trees on them, and decides the final result by majority vote over those trees. It is an accurate, general-purpose, and easy-to-use method.

Characteristics of random forests
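The core mechanism (bootstrap sampling plus majority voting) can be illustrated in a few lines. This is a minimal sketch on the built-in iris data, not part of the sewer analysis below; the real randomForest() additionally samples mtry candidate variables at each split, which this sketch omits:

library(rpart)
set.seed(1)
trees <- lapply(1:25, function(i) {
  boot <- iris[sample(nrow(iris), replace = TRUE), ] # bootstrap sample of the training data
  rpart(Species ~ ., data = boot)                    # grow one decision tree on it
})
votes <- sapply(trees, function(t) as.character(predict(t, iris, type = "class")))
pred <- apply(votes, 1, function(v) names(which.max(table(v)))) # majority vote over the 25 trees
mean(pred == iris$Species) # accuracy of the ensemble on the training data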
#Data considered for the random forest
- Titanic.rpart
- Titanic
- Titanich: a statistically processed version of train; not suitable for this exercise
- cordata: processed for plotting; Pclass in train was recoded into 3 levels and sex into 2 (partly made categorical), while Fare and age remain quantitative, names are left as-is, and missing values have been imputed
- dummy_var etc.: categorical data replaced with integer dummies; these reportedly do not fit the discussion below, so they are not used
- alldata could also be used, but take the version from before the dummy-variable step (makedummies()), i.e. dum joined with not_dum
- use train2
#Avoid character columns with randomForest http://ushi-goroshi.hatenablog.com/entry/2019/01/30/171259
library(car)
## Loading required package: carData
library(caret)
## Warning: package 'caret' was built under R version 3.6.2
## Loading required package: lattice
library(cluster)
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(data.table)
## Warning: package 'data.table' was built under R version 3.6.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(e1071)
## Warning: package 'e1071' was built under R version 3.6.2
library(epitools)
library(effects)
## Warning: package 'effects' was built under R version 3.6.3
## Registered S3 methods overwritten by 'lme4':
## method from
## cooks.distance.influence.merMod car
## influence.merMod car
## dfbeta.influence.merMod car
## dfbetas.influence.merMod car
## Use the command
## lattice::trellis.par.set(effectsTheme())
## to customize lattice options for effects plots.
## See ?effectsTheme for details.
library(ggplot2)
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.6.2
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(ranger)
## Warning: package 'ranger' was built under R version 3.6.2
##
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
##
## importance
library(rgl)
library(rattle)
## Warning: package 'rattle' was built under R version 3.6.2
## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:ranger':
##
## importance
## The following object is masked from 'package:randomForest':
##
## importance
library(readr)
## Warning: package 'readr' was built under R version 3.6.2
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.6.2
## Loading required package: rpart
library(rpart)
library(readr)
library(reshape)
## Warning: package 'reshape' was built under R version 3.6.2
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
## The following object is masked from 'package:data.table':
##
## melt
library(rsconnect)
## Warning: package 'rsconnect' was built under R version 3.6.2
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:reshape':
##
## colsplit, melt, recast
## The following objects are masked from 'package:data.table':
##
## dcast, melt
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
## The following objects are masked from 'package:reshape':
##
## expand, smiths
library(xtable)
library(nnet)
## Warning: package 'nnet' was built under R version 3.6.2
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(randomForest)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.2
## -- Attaching packages ------------------------------------------------------------------------------ tidyverse 1.3.0 --
## √ tibble 2.1.3 √ stringr 1.4.0
## √ purrr 0.3.3 √ forcats 0.4.0
## Warning: package 'stringr' was built under R version 3.6.2
## Warning: package 'forcats' was built under R version 3.6.2
## -- Conflicts --------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::between() masks data.table::between()
## x randomForest::combine() masks dplyr::combine()
## x tidyr::expand() masks reshape::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::lift() masks caret::lift()
## x randomForest::margin() masks ggplot2::margin()
## x dplyr::recode() masks car::recode()
## x reshape::rename() masks dplyr::rename()
## x purrr::some() masks car::some()
## x purrr::transpose() masks data.table::transpose()
gesui = read_csv("osui.csv")
## Parsed with column specification:
## cols(
## OBJECTID = col_double(),
## sys_name = col_double(),
## slope = col_double(),
## uedokaburi = col_double(),
## masuhonsuu = col_double(),
## long = col_double(),
## kubun = col_double(),
## did = col_double(),
## kouhou = col_double(),
## nendo = col_double(),
## ekijyouka = col_double(),
## kyouyounensuu = col_double(),
## kansyu = col_double(),
## kei = col_double(),
## kinkyuudo = col_double(),
## taisyo = col_double()
## )
gesui<- data.frame(gesui) # the textbook uses logit here
#drop the OBJECTID and sys_name columns
gesui <- gesui[-1:-2]
stargazer(as.data.frame(gesui),type = "html")
| Statistic | N | Mean | St. Dev. | Min | Pctl(25) | Pctl(75) | Max |
|---|---|---|---|---|---|---|---|
| slope | 1,423 | 3.437 | 2.323 | -6 | 1.8 | 4.5 | 10 |
| uedokaburi | 1,423 | 4.371 | 2.475 | 0.360 | 2.727 | 4.949 | 13.863 |
| masuhonsuu | 1,423 | 1.338 | 1.808 | 0 | 0 | 2 | 13 |
| long | 1,423 | 35.129 | 18.972 | 0.970 | 21.445 | 46.510 | 196.280 |
| kubun | 1,423 | 1.204 | 0.403 | 1 | 1 | 1 | 2 |
| did | 1,423 | 0.696 | 0.460 | 0 | 0 | 1 | 1 |
| kouhou | 1,423 | 0.415 | 0.493 | 0 | 0 | 1 | 1 |
| nendo | 1,423 | 1,982.967 | 5.973 | 1,974 | 1,978 | 1,990 | 2,006 |
| ekijyouka | 1,423 | 0.396 | 0.611 | 0 | 0 | 1 | 4 |
| kyouyounensuu | 1,423 | 33.033 | 5.973 | 10 | 26 | 38 | 42 |
| kansyu | 1,423 | 1.198 | 0.399 | 1 | 1 | 1 | 2 |
| kei | 1,423 | 517.182 | 308.765 | 200 | 250 | 800 | 1,650 |
| kinkyuudo | 1,423 | 1.297 | 1.457 | 0 | 0 | 3 | 3 |
| taisyo | 1,423 | 0.448 | 0.497 | 0 | 0 | 1 | 1 |
gesui$kansyu <- as.factor(gesui$kansyu)
gesui$taisyo <- as.factor(gesui$taisyo)
gesui$kubun <- as.factor(gesui$kubun)
gesui$did <- as.factor(gesui$did)
gesui$ekijyouka <- as.factor(gesui$ekijyouka)
stargazer(as.data.frame(gesui),type = "html")
| Statistic | N | Mean | St. Dev. | Min | Pctl(25) | Pctl(75) | Max |
|---|---|---|---|---|---|---|---|
| slope | 1,423 | 3.437 | 2.323 | -6 | 1.8 | 4.5 | 10 |
| uedokaburi | 1,423 | 4.371 | 2.475 | 0.360 | 2.727 | 4.949 | 13.863 |
| masuhonsuu | 1,423 | 1.338 | 1.808 | 0 | 0 | 2 | 13 |
| long | 1,423 | 35.129 | 18.972 | 0.970 | 21.445 | 46.510 | 196.280 |
| kouhou | 1,423 | 0.415 | 0.493 | 0 | 0 | 1 | 1 |
| nendo | 1,423 | 1,982.967 | 5.973 | 1,974 | 1,978 | 1,990 | 2,006 |
| kyouyounensuu | 1,423 | 33.033 | 5.973 | 10 | 26 | 38 | 42 |
| kei | 1,423 | 517.182 | 308.765 | 200 | 250 | 800 | 1,650 |
| kinkyuudo | 1,423 | 1.297 | 1.457 | 0 | 0 | 3 | 3 |
exclude_cols = c("OBJECTID","kinkyuudo") # OBJECTID is already gone; this effectively drops kinkyuudo
gesui = gesui[ !names(gesui) %in% exclude_cols ]
model = randomForest(taisyo ~ ., data = gesui)
model
##
## Call:
## randomForest(formula = taisyo ~ ., data = gesui)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 25.79%
## Confusion matrix:
## 0 1 class.error
## 0 630 156 0.1984733
## 1 211 426 0.3312402
prediction = predict(model, gesui)
prediction
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 0 1 1 0 1 0 1 1 1 1 0 0 1 1 0 0
## (... predicted class for each of the 1,423 rows; remaining output omitted ...)
## Levels: 0 1
#gesui$kinkyuudo <- as.factor(gesui$kinkyuudo)
names(gesui)
## [1] "slope" "uedokaburi" "masuhonsuu" "long"
## [5] "kubun" "did" "kouhou" "nendo"
## [9] "ekijyouka" "kyouyounensuu" "kansyu" "kei"
## [13] "taisyo"
train <- gesui[1:300,] # first 300 rows for training
test <- gesui[301:478,] # rows 301-478 for testing (rows 479-1423 are not used here)
model1 = randomForest(taisyo ~ ., data = gesui) # fit on the full data (model2 below uses train only)
model1
##
## Call:
## randomForest(formula = taisyo ~ ., data = gesui)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 26.14%
## Confusion matrix:
## 0 1 class.error
## 0 625 161 0.2048346
## 1 211 426 0.3312402
model2 = randomForest(taisyo ~ ., data = train)
model2
##
## Call:
## randomForest(formula = taisyo ~ ., data = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 25.33%
## Confusion matrix:
## 0 1 class.error
## 0 125 36 0.2236025
## 1 40 99 0.2877698
#The reference says importance(model1) returns the variable importances, but ranger and rattle mask randomForest::importance (see the package-attach messages above), so here it has to be accessed as model1$importance.
model1$importance
## MeanDecreaseGini
## slope 107.45536
## uedokaburi 123.55042
## masuhonsuu 41.80903
## long 115.89397
## kubun 10.66087
## did 17.42830
## kouhou 13.22427
## nendo 59.81277
## ekijyouka 26.57786
## kyouyounensuu 58.34269
## kansyu 19.81990
## kei 56.21831
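Because importance() is masked by ranger and rattle, calling it with an explicit namespace also works and is the safer habit; this returns the same MeanDecreaseGini table as model1$importance above:

randomForest::importance(model1) # explicit namespace sidesteps the masking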
model2$importance
## MeanDecreaseGini
## slope 22.538432
## uedokaburi 24.977515
## masuhonsuu 9.892594
## long 27.362053
## kubun 2.183548
## did 4.353573
## kouhou 2.463430
## nendo 11.895859
## ekijyouka 9.967479
## kyouyounensuu 11.395888
## kansyu 4.478837
## kei 12.134081
varImpPlot(model1)
varImpPlot(model2)
Random forest tuning (validation on the data) http://d-m-l.jp/Rbiz/task_rf.html
http://sfchaos.hatenablog.com/entry/20150628/p1
#Note 1: set.seed(123) fixes the random-number seed: https://qiita.com/aich_08_/items/6d885c91c9d461514018
First, simply run the tuneRF function with no special settings. Its first argument is the explanatory variables and its second the response; setting doBest = TRUE makes it return the best-scoring model.
dim(gesui)
## [1] 1423 13
sapply(gesui, class)
## slope uedokaburi masuhonsuu long kubun
## "numeric" "numeric" "numeric" "numeric" "factor"
## did kouhou nendo ekijyouka kyouyounensuu
## "factor" "numeric" "numeric" "factor" "numeric"
## kansyu kei taisyo
## "factor" "numeric" "factor"
head(gesui)
## slope uedokaburi masuhonsuu long kubun did kouhou nendo ekijyouka
## 1 0.00 3.758000 0 9.94 1 1 1 1983 1
## 2 3.90 3.763000 0 15.40 2 1 0 1976 1
## 3 1.32 3.538794 1 14.85 2 1 0 1976 1
## 4 1.22 1.054575 1 3.39 2 1 0 2004 1
## 5 2.50 1.533001 0 7.78 2 1 0 1988 0
## 6 3.50 4.122386 0 9.75 1 1 1 1976 1
## kyouyounensuu kansyu kei taisyo
## 1 33 1 1100 0
## 2 40 1 800 1
## 3 40 1 250 1
## 4 12 2 200 0
## 5 28 2 250 1
## 6 40 1 1100 0
set.seed(123) #Note 1
gesui.tune <- tuneRF(gesui %>% select(-taisyo), # explanatory variables
                     gesui$taisyo,              # response
                     doBest = T) # tune mtry (variables per split) and return the best model
## mtry = 3 OOB error = 26.84%
## Searching left ...
## mtry = 2 OOB error = 25.02%
## 0.06806283 0.05
## mtry = 1 OOB error = 30.01%
## -0.1994382 0.05
## Searching right ...
## mtry = 6 OOB error = 28.11%
## -0.1235955 0.05
From this result, the Out-of-Bag error (OOB error) is 26.84% with 3 candidate variables per split, 25.02% with 2, 30.01% with 1, and 28.11% with 6. The OOB error is therefore smallest with mtry = 2, and that setting looks best here.*1
Increasing the number of trees: the ntreeTry argument defaults to 50, i.e. 50 trees are built. Let us specify 500 trees instead.
set.seed(123) #Note 1
gesui.tune <- tuneRF(gesui %>% select(-taisyo), # explanatory variables
                     gesui$taisyo,              # response
                     ntreeTry = 500,            # number of trees
                     trace = TRUE,
                     doBest = T)
## mtry = 3 OOB error = 26.21%
## Searching left ...
## mtry = 2 OOB error = 26.21%
## 0 0.05
## Searching right ...
## mtry = 6 OOB error = 25.65%
## 0.02144772 0.05
With 500 trees the OOB error is 26.21% at mtry = 3 and mtry = 2, and 25.65% at mtry = 6, so this time mtry = 6 is the best setting. The tuned mtry (tuneRF stores it in the $mtry element of its result) is passed to the randomForest() call below.
gesui.rf <- randomForest( # build the classifier
  taisyo ~ .,             # model formula
  data = gesui,           # data
  mtry = gesui.tune$mtry) # number of variables tried at each split
gesui.rf
##
## Call:
## randomForest(formula = taisyo ~ ., data = gesui, mtry = gesui.tune$mtry)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 26.56%
## Confusion matrix:
## 0 1 class.error
## 0 615 171 0.2175573
## 1 207 430 0.3249608
x=gesui.rf$importance
How to read the output: "OOB estimate of error rate" is the misclassification rate. In the confusion matrix, rows are the actual classes and columns the predicted classes; in the output above, of the 786 pipes that are actually "0" (urgency 3 or lower), 615 were predicted "0" and 171 were predicted "1".
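The overall OOB accuracy can be read straight off this matrix (a quick check using the fitted object above):

cm <- gesui.rf$confusion[, 1:2] # drop the class.error column
sum(diag(cm)) / sum(cm)         # (615 + 430) / 1423 ≈ 0.734, i.e. 1 - 0.2656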
#Plot in order of importance
rank <- data.frame(x) # convert the importance matrix to a data frame
rank$factor <- rownames(rank) # add the variable names (row names) as a column
rank <- rank[order(rank[,1], decreasing=T),] # sort by importance (roughly analogous to partial regression coefficients)
rownames(rank) <- 1:nrow(rank) # use the rank as the row name
rank
## MeanDecreaseGini factor
## 1 148.152131 uedokaburi
## 2 136.057274 long
## 3 119.694564 slope
## 4 59.052743 kei
## 5 55.132503 nendo
## 6 53.761260 kyouyounensuu
## 7 43.242571 masuhonsuu
## 8 25.623134 ekijyouka
## 9 22.195448 kansyu
## 10 16.698539 did
## 11 10.761155 kouhou
## 12 9.886133 kubun
Plot the variables in order of importance.
varImpPlot(gesui.rf)
#plot(gesui, col=c(2, 3, 4)[gesui$kinkyuudo])
plot(gesui, col=c(2, 3)[gesui$taisyo])
Load the sewer data (gesuidou.csv) and show its basic statistics. # the textbook uses logit
gesui = read_csv("gesuidou.csv")
## Parsed with column specification:
## cols(
## OBJECTID = col_double(),
## slope = col_double(),
## long = col_double(),
## uedokaburi = col_double(),
## sitadokaburi = col_double(),
## masuhonsuu = col_double(),
## nendo = col_double(),
## kei = col_double(),
## kubun = col_double(),
## did = col_double(),
## kouhou = col_double(),
## ekijyouka = col_double(),
## kansyu = col_double(),
## kinkyuudo = col_double(),
## taisyo = col_double()
## )
gesui<- data.frame(gesui) # the textbook uses logit here
gesui$kansyu <- as.factor(gesui$kansyu)
gesui$taisyo <- as.factor(gesui$taisyo)
gesui$kubun <- as.factor(gesui$kubun)
gesui$did <- as.factor(gesui$did)
gesui$ekijyouka <- as.factor(gesui$ekijyouka)
#gesui <- gesui[-1] # (alternative) drop the OBJECTID column by position
exclude_cols = c("OBJECTID","sys_name")
gesui = gesui[ !names(gesui) %in% exclude_cols ]
set.seed(123) #Note 1
gesui.tune <- tuneRF(gesui %>% select(-kinkyuudo), # explanatory variables
                     gesui$kinkyuudo,              # response
                     ntreeTry = 500,               # number of trees
                     trace = TRUE,
                     doBest = T)
## Warning in randomForest.default(x, y, mtry = mtryStart, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 4 OOB error = 1.480305
## Searching left ...
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 2 OOB error = 1.486863
## -0.004430167 0.05
## Searching right ...
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 8 OOB error = 1.519686
## -0.02660376 0.05
## Warning in randomForest.default(x, y, mtry = res[which.min(res[, 2]), 1], :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
Because kinkyuudo was left numeric, tuneRF warns and fits a regression forest; the OOB error is now a mean squared error, and it is lowest at mtry = 4 (1.480). The tuned mtry (stored in the $mtry element of the tuneRF() result) is passed to the randomForest() call below.
gesui.rf <- randomForest( # build the predictor
  kinkyuudo ~ .,          # model formula
  data = gesui,           # data
  mtry = gesui.tune$mtry) # number of variables tried at each split
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
gesui.rf
##
## Call:
## randomForest(formula = kinkyuudo ~ ., data = gesui, mtry = gesui.tune$mtry)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 1.476213
## % Var explained: 30.78
x=gesui.rf$importance
How to read the output: since this is a regression forest, there is no confusion matrix; the fit is summarised by the mean of squared residuals (1.476) and the percentage of variance explained (30.78%).
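As a quick check, the square root of the out-of-bag mean squared error gives an RMSE on the kinkyuudo scale (a sketch; randomForest stores the per-tree OOB MSE of a regression forest in $mse):

sqrt(gesui.rf$mse[gesui.rf$ntree]) # ≈ sqrt(1.476) ≈ 1.21 urgency classes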
#Plot in order of importance
rank <- data.frame(x) # convert the importance matrix to a data frame
rank$factor <- rownames(rank) # add the variable names (row names) as a column
rank <- rank[order(rank[,1], decreasing=T),] # sort by importance (roughly analogous to partial regression coefficients)
rownames(rank) <- 1:nrow(rank) # use the rank as the row name
rank
## IncNodePurity factor
## 1 137.118695 long
## 2 132.486026 sitadokaburi
## 3 130.789376 slope
## 4 124.661796 uedokaburi
## 5 116.354171 kansyu
## 6 69.429390 nendo
## 7 62.659995 kei
## 8 47.334756 masuhonsuu
## 9 38.187225 ekijyouka
## 10 19.239761 did
## 11 14.344117 kouhou
## 12 9.550413 taisyo
## 13 5.626238 kubun
Plot the variables in order of importance.
varImpPlot(gesui.rf)
plot(gesui, col=c(2, 3, 4)[gesui$kinkyuudo])
Load the sewer data (enbi.csv this time) and show its basic statistics. # the textbook uses logit
#gesui = read_csv("osui2.csv")
gesui = read_csv("enbi.csv")
## Parsed with column specification:
## cols(
## OBJECTID = col_double(),
## sys_name = col_double(),
## slope = col_double(),
## uedokaburi = col_double(),
## masuhonsuu = col_double(),
## long = col_double(),
## kubun = col_double(),
## did = col_double(),
## kouhou = col_double(),
## nendo = col_double(),
## ekijyouka = col_double(),
## kyouyounensuu = col_double(),
## kansyu = col_double(),
## kei = col_double(),
## kinkyuudo = col_double(),
## taisyo = col_double()
## )
gesui <- data.frame(gesui) # the textbook uses logit here
#get the row numbers of the test data
#randomgesui<-sample(282,200)
#train <- gesui[randomgesui,]
#test <-gesui[-randomgesui,]
#cat(test$sys_name, file = "testrow.txt",append=FALSE)
#write.table(test,"testoutput.txt", quote=F,
# col.names=T, append=T)
gesui <- gesui[-1:-2] # drop the OBJECTID and sys_name columns
gesui <- gesui[-13] # drop kinkyuudo
gesui <- gesui[-8]  # drop nendo
gesui <- gesui[-10] # drop kansyu
stargazer(as.data.frame(gesui),type = "html")
| Statistic | N | Mean | St. Dev. | Min | Pctl(25) | Pctl(75) | Max |
|---|---|---|---|---|---|---|---|
| slope | 282 | 3.309 | 2.017 | 0.000 | 1.900 | 4.100 | 9.900 |
| uedokaburi | 282 | 4.218 | 2.570 | 1.009 | 2.462 | 5.397 | 13.385 |
| masuhonsuu | 282 | 1.284 | 1.765 | 0 | 0 | 2 | 11 |
| long | 282 | 31.300 | 15.309 | 0.970 | 21.325 | 40.492 | 96.820 |
| kubun | 282 | 1.209 | 0.407 | 1 | 1 | 1 | 2 |
| did | 282 | 0.766 | 0.424 | 0 | 1 | 1 | 1 |
| kouhou | 282 | 0.337 | 0.473 | 0 | 0 | 1 | 1 |
| ekijyouka | 282 | 0.202 | 0.402 | 0 | 0 | 0 | 1 |
| kyouyounensuu | 282 | 27.514 | 5.204 | 10 | 25 | 27 | 40 |
| kei | 282 | 390.248 | 162.287 | 200 | 250 | 600 | 900 |
| taisyo | 282 | 0.312 | 0.464 | 0 | 0 | 1 | 1 |
gesui$taisyo <- as.factor(gesui$taisyo)
#gesui$kansyu <- as.factor(gesui$kansyu)
gesui$kubun <- as.factor(gesui$kubun)
gesui$did <- as.factor(gesui$did)
gesui$ekijyouka <- as.factor(gesui$ekijyouka)
#gesui$kinkyuudo <- as.factor(gesui$kinkyuudo)
sapply(gesui, class)
## slope uedokaburi masuhonsuu long kubun
## "numeric" "numeric" "numeric" "numeric" "factor"
## did kouhou ekijyouka kyouyounensuu kei
## "factor" "numeric" "factor" "numeric" "numeric"
## taisyo
## "factor"
summary(gesui)
## slope uedokaburi masuhonsuu long kubun
## Min. :0.000 Min. : 1.009 Min. : 0.000 Min. : 0.97 1:223
## 1st Qu.:1.900 1st Qu.: 2.462 1st Qu.: 0.000 1st Qu.:21.32 2: 59
## Median :2.685 Median : 3.402 Median : 1.000 Median :30.06
## Mean :3.309 Mean : 4.218 Mean : 1.284 Mean :31.30
## 3rd Qu.:4.100 3rd Qu.: 5.397 3rd Qu.: 2.000 3rd Qu.:40.49
## Max. :9.900 Max. :13.385 Max. :11.000 Max. :96.82
## did kouhou ekijyouka kyouyounensuu kei taisyo
## 0: 66 Min. :0.0000 0:225 Min. :10.00 Min. :200.0 0:194
## 1:216 1st Qu.:0.0000 1: 57 1st Qu.:25.00 1st Qu.:250.0 1: 88
## Median :0.0000 Median :25.00 Median :250.0
## Mean :0.3369 Mean :27.51 Mean :390.2
## 3rd Qu.:1.0000 3rd Qu.:27.00 3rd Qu.:600.0
## Max. :1.0000 Max. :40.00 Max. :900.0
randomgesui<-sample(282,200) # draw 200 of the 282 row indices at random
train <- gesui[randomgesui,]
test <-gesui[-randomgesui,]
gesui <- train
sapply(gesui, class)
summary(gesui)
train <- gesui
stargazer(as.data.frame(gesui),type = "html")
| Statistic | N | Mean | St. Dev. | Min | Pctl(25) | Pctl(75) | Max |
|---|---|---|---|---|---|---|---|
| slope | 200 | 3.296 | 2.032 | 0.000 | 1.900 | 4.200 | 9.700 |
| uedokaburi | 200 | 4.121 | 2.418 | 1.055 | 2.476 | 5.261 | 13.385 |
| masuhonsuu | 200 | 1.270 | 1.781 | 0 | 0 | 2 | 11 |
| long | 200 | 31.977 | 15.610 | 0.970 | 21.745 | 41.785 | 96.820 |
| kouhou | 200 | 0.355 | 0.480 | 0 | 0 | 1 | 1 |
| kyouyounensuu | 200 | 27.615 | 5.313 | 10 | 25 | 27 | 40 |
| kei | 200 | 394.500 | 164.025 | 200 | 250 | 600 | 900 |
stargazer(as.data.frame(train),type = "html")
| Statistic | N | Mean | St. Dev. | Min | Pctl(25) | Pctl(75) | Max |
|---|---|---|---|---|---|---|---|
| slope | 200 | 3.296 | 2.032 | 0.000 | 1.900 | 4.200 | 9.700 |
| uedokaburi | 200 | 4.121 | 2.418 | 1.055 | 2.476 | 5.261 | 13.385 |
| masuhonsuu | 200 | 1.270 | 1.781 | 0 | 0 | 2 | 11 |
| long | 200 | 31.977 | 15.610 | 0.970 | 21.745 | 41.785 | 96.820 |
| kouhou | 200 | 0.355 | 0.480 | 0 | 0 | 1 | 1 |
| kyouyounensuu | 200 | 27.615 | 5.313 | 10 | 25 | 27 | 40 |
| kei | 200 | 394.500 | 164.025 | 200 | 250 | 600 | 900 |
stargazer(as.data.frame(test),type = "html")
| Statistic | N | Mean | St. Dev. | Min | Pctl(25) | Pctl(75) | Max |
|---|---|---|---|---|---|---|---|
| slope | 82 | 3.340 | 1.991 | 0.000 | 1.896 | 3.968 | 9.900 |
| uedokaburi | 82 | 4.454 | 2.910 | 1.009 | 2.455 | 5.852 | 12.289 |
| masuhonsuu | 82 | 1.317 | 1.735 | 0 | 0 | 2 | 10 |
| long | 82 | 29.647 | 14.509 | 2.710 | 19.992 | 39.252 | 65.070 |
| kouhou | 82 | 0.293 | 0.458 | 0 | 0 | 1 | 1 |
| kyouyounensuu | 82 | 27.268 | 4.952 | 17 | 25 | 27 | 40 |
| kei | 82 | 379.878 | 158.476 | 200 | 250 | 600 | 600 |
#model = randomForest(taisyo ~ ., data = gesui)
model = randomForest(taisyo ~ ., data = train,
importance = TRUE)
#model = randomForest(kinkyuudo ~ ., data = gesui)
model
##
## Call:
## randomForest(formula = taisyo ~ ., data = train, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 22.5%
## Confusion matrix:
## 0 1 class.error
## 0 124 11 0.08148148
## 1 34 31 0.52307692
#prediction = predict(model, gesui)
prediction = predict(model, test)
prediction
## 6 10 12 15 17 22 24 26 27 29 32 40 41 42 44 48 49 55 57 63
## 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1
## 64 67 72 76 78 81 86 89 90 92 93 96 97 100 105 107 109 110 114 116
## 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## 119 120 122 123 127 135 138 149 156 159 163 166 168 169 170 189 196 203 205 206
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1
## 213 214 218 224 231 234 235 236 240 241 243 247 250 251 252 256 263 265 267 268
## 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 269 279
## 0 0
## Levels: 0 1
summary(prediction)
## 0 1
## 67 15
table(prediction,test$taisyo)
##
## prediction 0 1
## 0 57 10
## 1 2 13
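Accuracy, precision, and recall for class "1" follow from the table above (a quick sketch using the objects already in scope):

cm <- table(prediction, test$taisyo) # rows: predicted, columns: actual
sum(diag(cm)) / sum(cm)              # accuracy: (57 + 13) / 82 ≈ 0.854
cm["1", "1"] / sum(cm["1", ])        # precision: 13 / 15
cm["1", "1"] / sum(cm[, "1"])        # recall: 13 / 23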
#sapply(gesui, class)
#summary(gesui)
model$importance
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## slope 0.0171904776 1.783538e-02 0.017284210 15.596312
## uedokaburi 0.0327665262 3.498829e-02 0.033685048 18.468203
## masuhonsuu -0.0043225492 4.831905e-03 -0.001419027 5.219948
## long 0.0122155909 6.330937e-02 0.028224996 18.533727
## kubun 0.0103212785 7.487213e-03 0.009435202 1.848433
## did -0.0005150767 1.915146e-02 0.005929484 1.863908
## kouhou 0.0093454066 4.849525e-03 0.008027985 1.566609
## ekijyouka 0.0149262893 9.548294e-05 0.010136057 2.815742
## kyouyounensuu 0.0294452124 1.036483e-01 0.052721334 12.823230
## kei 0.0064964598 5.752397e-02 0.022684056 5.278796
varImpPlot(model)
dim(gesui)
## [1] 200 11
sapply(gesui, class)
## slope uedokaburi masuhonsuu long kubun
## "numeric" "numeric" "numeric" "numeric" "factor"
## did kouhou ekijyouka kyouyounensuu kei
## "factor" "numeric" "factor" "numeric" "numeric"
## taisyo
## "factor"
head(gesui)
## slope uedokaburi masuhonsuu long kubun did kouhou ekijyouka kyouyounensuu
## 108 9.60 2.481001 3 39.08 2 1 0 0 26
## 115 4.12 4.084854 7 51.90 2 1 1 0 24
## 223 1.34 3.484858 4 20.20 1 1 0 0 25
## 65 3.60 1.572562 1 24.52 2 1 0 0 37
## 28 1.72 2.119883 0 17.00 1 1 0 0 40
## 79 1.82 1.366309 0 23.59 2 1 0 1 24
## kei taisyo
## 108 250 0
## 115 250 0
## 223 250 0
## 65 250 1
## 28 250 1
## 79 250 0
set.seed(123) #Note 1
#gesui.tune <- tuneRF(gesui %>% select(-kinkyuudo), # explanatory variables
#                     gesui$kinkyuudo,              # response
gesui.tune <- tuneRF(gesui %>% select(-taisyo), # explanatory variables
                     gesui$taisyo,              # response
                     doBest = T) # tune mtry (variables per split) and return the best model
## mtry = 3 OOB error = 23%
## Searching left ...
## mtry = 2 OOB error = 23%
## 0 0.05
## Searching right ...
## mtry = 6 OOB error = 24%
## -0.04347826 0.05
From this result, the Out-of-Bag error (OOB error) is 23% at mtry = 3, 23% at mtry = 2, and 24% at mtry = 6; mtry = 2 and mtry = 3 are tied for the smallest OOB error, so a small mtry looks adequate here.*1
Increasing the number of trees: the ntreeTry argument defaults to 50, i.e. 50 trees are built. Let us specify 2,500 trees instead.
set.seed(123) #Note 1
#gesui.tune <- tuneRF(gesui %>% select(-kinkyuudo), # explanatory variables
#                     gesui$kinkyuudo,              # response
gesui.tune <- tuneRF(gesui %>% select(-taisyo), # explanatory variables
                     gesui$taisyo,              # response
                     ntreeTry = 2500,           # number of trees
                     trace = TRUE,
                     doBest = T)
## mtry = 3 OOB error = 22.5%
## Searching left ...
## mtry = 2 OOB error = 20.5%
## 0.08888889 0.05
## mtry = 1 OOB error = 23.5%
## -0.1463415 0.05
## Searching right ...
## mtry = 6 OOB error = 24%
## -0.1707317 0.05
With 2,500 trees the OOB error is lowest at mtry = 2 (20.5%) and highest at mtry = 6 (24%). The tuned mtry (stored in the $mtry element of the tuneRF() result) is passed to the randomForest() call below. # Note (3/26): fit with importance = TRUE, otherwise the importance() function cannot be used.
gesui.rf2 <- randomForest( # build the classifier
  # kinkyuudo ~ .,         # (alternative model formula)
  taisyo ~ .,              # model formula
  data = gesui,            # data
  mtry = gesui.tune$mtry,  # number of variables tried at each split
  importance = TRUE)       # 3/26: importance() is unusable unless importance = TRUE
predrandam = predict(gesui.rf2, test)
predrandam
## 6 10 12 15 17 22 24 26 27 29 32 40 41 42 44 48 49 55 57 63
## 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1
## 64 67 72 76 78 81 86 89 90 92 93 96 97 100 105 107 109 110 114 116
## 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## 119 120 122 123 127 135 138 149 156 159 163 166 168 169 170 189 196 203 205 206
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1
## 213 214 218 224 231 234 235 236 240 241 243 247 250 251 252 256 263 265 267 268
## 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 269 279
## 0 0
## Levels: 0 1
summary(predrandam)
## 0 1
## 66 16
table(predrandam,test$taisyo)
##
## predrandam 0 1
## 0 57 9
## 1 2 14
Variable importance for predicting sewer deterioration
gesui.rf2
##
## Call:
## randomForest(formula = taisyo ~ ., data = gesui, mtry = gesui.tune$mtry, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 21%
## Confusion matrix:
## 0 1 class.error
## 0 128 7 0.05185185
## 1 35 30 0.53846154
x=gesui.rf2$importance
x
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## slope 0.019278350 0.006175289 0.0150091372 12.786018
## uedokaburi 0.033746693 0.032532627 0.0332667763 15.548922
## masuhonsuu -0.004367810 0.007278541 -0.0006865278 5.283975
## long 0.011615214 0.055539961 0.0258221275 14.811650
## kubun 0.010304180 0.003067069 0.0080297285 1.889741
## did -0.003405668 0.033601746 0.0082865121 2.016948
## kouhou 0.009793807 0.005519660 0.0083875405 1.385116
## ekijyouka 0.010575397 0.002003619 0.0079779208 2.613673
## kyouyounensuu 0.020940650 0.093122078 0.0445696937 12.140770
## kei 0.008438833 0.055213043 0.0231262079 5.200025
http://sfchaos.hatenablog.com/entry/20150628/p1 https://tjo.hatenablog.com/entry/2013/09/02/190449
How to read the output: "OOB estimate of error rate" is the misclassification rate. Rows of the confusion matrix are the actual classes and columns the predicted classes; in the output above, of the 135 pipes that are actually "0" (urgency 3 or lower), 128 were predicted "0" and 7 were predicted "1".
Sort in descending order of importance.

rank <- data.frame(x) # convert the importance matrix to a data frame
rank$factor <- rownames(rank) # add the variable names (row names) as a column
rank <- rank[order(rank[,1], decreasing=T),] # sort by the first column (class-"0" accuracy decrease)
rownames(rank) <- 1:nrow(rank) # use the rank as the row name
rank
## X0 X1 MeanDecreaseAccuracy MeanDecreaseGini factor
## 1 0.033746693 0.032532627 0.0332667763 15.548922 uedokaburi
## 2 0.020940650 0.093122078 0.0445696937 12.140770 kyouyounensuu
## 3 0.019278350 0.006175289 0.0150091372 12.786018 slope
## 4 0.011615214 0.055539961 0.0258221275 14.811650 long
## 5 0.010575397 0.002003619 0.0079779208 2.613673 ekijyouka
## 6 0.010304180 0.003067069 0.0080297285 1.889741 kubun
## 7 0.009793807 0.005519660 0.0083875405 1.385116 kouhou
## 8 0.008438833 0.055213043 0.0231262079 5.200025 kei
## 9 -0.003405668 0.033601746 0.0082865121 2.016948 did
## 10 -0.004367810 0.007278541 -0.0006865278 5.283975 masuhonsuu
plot(gesui.rf2)
varImpPlot(gesui.rf2)
See also: https://yolo-kiyoshi.com/2019/09/16/post-1226/ https://aotamasaki.hatenablog.com/entry/bias_in_feature_importances
#Reproducing another site's random-forest EDA in R: https://navaclass.com/random-forest-eda/
#set.seed(111)
#train the random-forest model
#boston.rf <- randomForest(kinkyuudo ~ .,
#boston.rf <- randomForest(taisyo ~ .,
#                          data = train,
#                          importance = TRUE)
#predict on the test data
#pred <- predict(boston.rf, newdata = test)
#plot observed vs. predicted values
#plot(test$taisyo, pred, main = boston.rf$call)
#curve(identity, add = TRUE)
#pred = predict(gesui.rf2, test)
pred = predict(gesui.rf2, train) #for the report
#plot(test$taisyo, pred, main = gesui.rf2$call)
plot(train$taisyo, pred, main = gesui.rf2$call) #for the report
curve(identity, add = TRUE)
#added 3/26
kekka<-table(pred,train$taisyo)
kekka
##
## pred 0 1
## 0 134 2
## 1 1 63
pred = predict(gesui.rf2, test) #for the report
kekka<-table(pred,test$taisyo)
kekka
##
## pred 0 1
## 0 57 9
## 1 2 14
#Prediction error (RMSE: root mean squared error)
#To estimate the prediction error the response has to be numeric; taisyo is still a factor here, so the call below warns and returns NA.
rms <- function(act, pred) {
  sqrt(mean((act - pred) ^ 2))
}
cat(" RMSE =", rms(test$taisyo, pred))
## Warning in Ops.factor(act, pred): '-' not meaningful for factors
## RMSE = NA
#compare with the prediction error of a linear regression model
cat(" RMSE = ",
rms(test$taisyo,
predict(lm(taisyo ~ ., data = train), newdata = test)))
## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
## Warning in Ops.factor(act, pred): '-' not meaningful for factors
## RMSE = NA
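Both calls return NA because taisyo is a factor. A minimal fix (a sketch: convert the "0"/"1" levels back to numbers via as.character, since as.numeric on a factor would return the internal level codes instead):

act <- as.numeric(as.character(test$taisyo))
prd <- as.numeric(as.character(pred)) # pred = predict(gesui.rf2, test) from above
cat(" RMSE =", rms(act, prd))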
https://funatsu-lab.github.io/open-course-ware/machine-learning/random-forest/
#Plot the feature importances; type = 1 selects the mean decrease in accuracy
boston.imp <-
  sort(randomForest::importance(gesui.rf2, type = 1)[, 1], decreasing = TRUE)
barplot(boston.imp, names.arg = names(boston.imp))
#Prediction with an SVM
ビジネスに活かすデータマイニング (尾崎隆) http://yut.hatenablog.com/entry/20120827/1346024147
stargazer(as.data.frame(gesui),type = "html")
| Statistic | N | Mean | St. Dev. | Min | Pctl(25) | Pctl(75) | Max |
|---|---|---|---|---|---|---|---|
| slope | 200 | 3.296 | 2.032 | 0.000 | 1.900 | 4.200 | 9.700 |
| uedokaburi | 200 | 4.121 | 2.418 | 1.055 | 2.476 | 5.261 | 13.385 |
| masuhonsuu | 200 | 1.270 | 1.781 | 0 | 0 | 2 | 11 |
| long | 200 | 31.977 | 15.610 | 0.970 | 21.745 | 41.785 | 96.820 |
| kouhou | 200 | 0.355 | 0.480 | 0 | 0 | 1 | 1 |
| kyouyounensuu | 200 | 27.615 | 5.313 | 10 | 25 | 27 | 40 |
| kei | 200 | 394.500 | 164.025 | 200 | 250 | 600 | 900 |
library(e1071)
d.svm<-svm(taisyo ~ ., data = train)
print(d.svm)
##
## Call:
## svm(formula = taisyo ~ ., data = train)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 133
predsvm<-predict(d.svm,newdata=test)
summary(predsvm)
## 0 1
## 70 12
kekka<-table(predsvm,test$taisyo)
kekka
##
## predsvm 0 1
## 0 57 13
## 1 2 10
library(nnet)
Prediction model
nn<-nnet(taisyo ~., data=train,size = 2, rang = .1, decay = 5e-4, maxit = 200 )
## # weights: 25
## initial value 133.863430
## final value 126.116369
## converged
nn
## a 10-2-1 network with 25 weights
## inputs: slope uedokaburi masuhonsuu long kubun2 did1 kouhou ekijyouka1 kyouyounensuu kei
## output(s): taisyo
## options were - entropy fitting decay=5e-04
nn_predict<-predict(nn,test,type="class")
table(nn_predict, test$taisyo)
##
## nn_predict 0 1
## 0 59 23
cat(test$taisyo, file = "testtaisyo2.txt", append =FALSE)
cat(nn_predict, file = "nnresult2.txt", append =FALSE)
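For a rough side-by-side comparison of the three classifiers on the same test set (a sketch computing accuracy from the prediction objects already in scope):

mean(predrandam == test$taisyo) # random forest: (57 + 14) / 82 ≈ 0.87
mean(predsvm == test$taisyo)    # SVM:           (57 + 10) / 82 ≈ 0.82
mean(nn_predict == test$taisyo) # neural net:     59 / 82       ≈ 0.72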
nn_predict<-predict(nn,test,type="raw")
nn_predict #output the raw predicted values: https://mjin.doshisha.ac.jp/R/Chap_23/23.html
nn_predict<-predict(nn,test,type="class") #output the predicted groups
#write the predicted groups to text files: http://takenaka-akio.org/doc/r_auto/chapter_03.html
nn_predict
cat(test$taisyo, file = "testtaisyo.txt", append =FALSE)
cat(predrandam, file = "lfresult.txt", append =FALSE)
cat(predsvm, file = "svmresult.txt", append =FALSE)
cat(nn_predict, file = "nnresult.txt", append =FALSE)
kekka<-table(nn_predict, test$taisyo)
kekka