Load libraries

library(mlbench)     # BostonHousing data set
library(caret)       # data splitting, resampling, and model training
library(plotly)      # interactive box plot
library(heatmaply)   # interactive clustered correlation heatmap
library(corrplot)    # correlation / MIC matrix plots
library(minerva)     # maximal information coefficient (MIC)

Load data

data(BostonHousing)
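
BostonHousing from mlbench contains 506 observations of 14 variables; medv (median home value in $1000s) is the response and chas is a factor. A quick look at the structure (an extra check, not in the original notebook):

str(BostonHousing)            # 506 obs. of 14 variables; chas is a factor
summary(BostonHousing$medv)   # the response to be modeled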

Split data

set.seed(71)
Index <- createDataPartition(BostonHousing$medv, p = .7,
                             list = FALSE,
                             times = 1)
Train <- BostonHousing[ Index,]
Test  <- BostonHousing[-Index,]
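
createDataPartition stratifies a numeric outcome by quantiles, so both subsets preserve the medv distribution. A quick sanity check of the split sizes (not in the original notebook) should show 356 training rows, matching the caret summary further below:

nrow(Train)   # ~70% of the 506 rows (356)
nrow(Test)    # remaining ~30%, held out as an external validation set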

Visualization 1

plot_ly(Train, x = ~chas, y = ~medv, type = "box")   # medv distribution by chas (Charles River indicator)

Visualization 2

corrplot(cor(Train[, c(1:3, 5:14)]), method = "circle")   # column 4 (chas) is a factor, so it is excluded

Visualization 3

M <- mine(Train[, c(1:3, 5:14)])     # maximal information coefficient (MIC) for all numeric variable pairs
corrplot(M$MIC, method = "circle")
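
Since MIC captures nonlinear as well as linear association while cor() measures only linear correlation, putting the two side by side for the response column shows where they diverge. A minimal sketch reusing M from above (not part of the original notebook):

num <- Train[, c(1:3, 5:14)]                        # same numeric columns as above
round(cbind(MIC     = M$MIC[, "medv"],
            pearson = abs(cor(num)[, "medv"])), 2)  # MIC vs. |Pearson r| against medv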

Visualization 4

heatmaply(cor(Train[, c(1:3, 5:14)]), k_row = 2, k_col = 2)   # clustered correlation heatmap, 2 row/column clusters

Training

set.seed(71)
# 10-fold cross-validation; with "repeatedcv" the number of repeats defaults to 1.
# Note: the original passed preProc = c("center", "scale") to trainControl(), but that is not a
# trainControl() argument (it partially matches preProcOptions) and was never applied, hence
# "No pre-processing" in the output below. To pre-process, pass preProcess = c("center", "scale")
# to train(); for a random forest it makes no practical difference.
con <- trainControl(method = "repeatedcv",
                    number = 10)
train_grid <- expand.grid(mtry = 1:10)   # number of predictors sampled at each split
rf_fit <- train(Train[, 1:13], Train$medv,
                method = "rf",
                tuneGrid = train_grid,
                trControl = con)
rf_fit
Random Forest 

356 samples
 13 predictor

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 1 times) 
Summary of sample sizes: 320, 321, 320, 320, 321, 320, ... 
Resampling results across tuning parameters:

  mtry  RMSE      Rsquared 
   1    4.627402  0.7942172
   2    3.804643  0.8452705
   3    3.554503  0.8586200
   4    3.399954  0.8664139
   5    3.396918  0.8655126
   6    3.396945  0.8641700
   7    3.360618  0.8663705
   8    3.355925  0.8667604
   9    3.335951  0.8683290
  10    3.347374  0.8682950

RMSE was used to select the optimal model using  the smallest value.
The final value used for the model was mtry = 9. 
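
Before scoring the test set, the tuning profile and variable importance of the fitted model are worth a look; caret exposes both through plot() and varImp() on the train object (a quick sketch, not in the original notebook):

plot(rf_fit)       # cross-validated RMSE as a function of mtry
varImp(rf_fit)     # variable importance from the final random forest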

Prediction results

pred <- predict(rf_fit, Test[, 1:13])    # predictions on the held-out test set
pred2 <- predict(rf_fit, Train[, 1:13])  # predictions on the training data
sqrt(mean((pred - Test$medv)^2))         # test-set RMSE
[1] 2.864187
cor(pred, Test$medv)^2                   # test-set R^2
[1] 0.905791
cor(pred2, Train$medv)^2                 # training-set R^2 (optimistic, as expected)
[1] 0.9806551
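
The same metrics can be computed in one call with caret::postResample, which returns RMSE, R-squared, and (in recent caret versions) MAE; this is just a convenience check of the values above, not part of the original notebook:

postResample(pred,  Test$medv)    # test-set RMSE / Rsquared / MAE
postResample(pred2, Train$medv)   # training-set metrics (optimistic, as expected)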

Plot of prediction results

# Test-set predictions (filled magenta circles) overlaid with training-set predictions (open circles)
plot(pred, Test$medv, pch = 16, col = 6, xlim = c(0, 60), ylim = c(0, 60), ann = FALSE)
par(new = TRUE)
plot(pred2, Train$medv, pch = 21, col = 1, xlim = c(0, 60), ylim = c(0, 60),
     xlab = "Predicted medv", ylab = "Observed medv")
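
A y = x reference line makes over- and under-prediction easier to read off the scatter plot; as an optional addition (not in the original code):

abline(a = 0, b = 1, lty = 2)                        # perfect-prediction line
legend("topleft", legend = c("Test", "Train"),
       pch = c(16, 21), col = c(6, 1), bty = "n")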

Validation (QSAR model validation metrics proposed by Alexander Tropsha)

Paper: The Importance of Being Earnest: Validation is the Absolute Essential for Successful Application and Interpretation of QSPR Models (http://onlinelibrary.wiley.com/doi/10.1002/qsar.200390007/abstract)

# Tropsha's R^2 (Acceptable: Tropsha's R^2 > 0.5)
R2_tr <- 1 - sum((Test$medv - pred)^2) / sum((Test$medv - mean(pred2))^2)
R2_tr
[1] 0.8994333
# k: slope of the regression of observed on predicted through the origin (Acceptable: 0.85 < k < 1.15)
k <- sum(pred * Test$medv) / sum(pred^2)
k
[1] 1.012774
# R^2_0 (Acceptable: (R2_tr - R_20)/R2_tr < 0.1)
R_20 <- 1 - sum((pred - k * Test$medv)^2) / sum((pred - mean(Test$medv))^2)
(R2_tr - R_20) / R2_tr
[1] 0.04388054
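
Collecting the three criteria into one logical check makes the pass/fail conclusion explicit; a small sketch using the values computed above (not part of the original notebook):

c(R2_ok    = R2_tr > 0.5,                     # Tropsha's R^2 > 0.5
  k_ok     = k > 0.85 && k < 1.15,            # 0.85 < k < 1.15
  ratio_ok = (R2_tr - R_20) / R2_tr < 0.1)    # relative difference < 0.1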

Since these criteria are met, the validation on the external (test) data set also looks good.
