library(mlbench)
library(caret)
library(plotly)
library(heatmaply)
library(corrplot)
library(minerva)
# Load the Boston housing data and split it into training and test rows.
data(BostonHousing)

# Fix the RNG state so the partition below is reproducible.
set.seed(71)

# Row indices of the training partition (p = 0.7 of the data), built by caret
# from the outcome column `medv`. `list = FALSE` asks for an index
# matrix instead of a list, so it can be used directly for row subsetting.
Index <- createDataPartition(
  y = BostonHousing$medv,
  p = 0.7,
  list = FALSE,
  times = 1
)

# Rows selected by Index form the training set; all remaining rows the test set.
Train <- BostonHousing[Index, ]
Test <- BostonHousing[-Index, ]
# Exploratory plots on the training set.

# Box plot of the outcome `medv` for each level of `chas`.
plot_ly(Train, x = Train$chas, y = Train$medv, type = "box")

# Columns 1-3 and 5-14 of the training data, i.e. everything except the
# 4th column, which is left out of the correlation analyses below.
corr_input <- Train[, c(1:3, 5:14)]
pearson_mat <- cor(corr_input)

# Correlation matrix from cor(), drawn as circles.
corrplot(pearson_mat, method = "circle")

# MIC matrix returned by minerva::mine(), drawn the same way.
M <- mine(corr_input)
corrplot(M$MIC, method = "circle")

# Interactive heatmap of the same correlation matrix (k_row = k_col = 2).
heatmaply(pearson_mat, k_row = 2, k_col = 2)

# Bare NULL kept from the original script; it has no effect beyond
# printing NULL when the script is run at top level.
NULL
# Tune a random forest on the training set with 10-fold cross-validation.
set.seed(71)

# Resampling scheme.
# The original call passed `preProc = c("center", "scale")` here, but
# trainControl() has no such argument: R partially matched it to
# `preProcOptions`, so nothing was ever centred or scaled (the printed model
# reports "No pre-processing"). The no-op argument is dropped, which leaves
# the fitted model unchanged. To actually standardise the predictors, pass
# `preProcess = c("center", "scale")` to train() instead.
con <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 1  # explicit; matches the "repeated 1 times" in the printed fit
)

# Candidate values of `mtry` to evaluate.
train_grid <- expand.grid(mtry = 1:10)

# Fit on the 13 predictor columns against the outcome `medv`.
rf_fit <- train(
  x = Train[, 1:13],
  y = Train$medv,
  method = "rf",
  tuneGrid = train_grid,
  trControl = con
)

# Print the resampling results and the selected `mtry`.
rf_fit
#> Random Forest
#> 356 samples
#> 13 predictor
#> No pre-processing
#> Resampling: Cross-Validated (10 fold, repeated 1 times)
#> Summary of sample sizes: 320, 321, 320, 320, 321, 320, ...
#> Resampling results across tuning parameters:
#>   mtry  RMSE      Rsquared
#>    1    4.627402  0.7942172
#>    2    3.804643  0.8452705
#>    3    3.554503  0.8586200
#>    4    3.399954  0.8664139
#>    5    3.396918  0.8655126
#>    6    3.396945  0.8641700
#>    7    3.360618  0.8663705
#>    8    3.355925  0.8667604
#>    9    3.335951  0.8683290
#>   10    3.347374  0.8682950
#> RMSE was used to select the optimal model using the smallest value.
#> The final value used for the model was mtry = 9.
# Predictions of the tuned model for the held-out rows (`pred`) and, for
# comparison, for the rows it was trained on (`pred2`).
pred <- predict(rf_fit, newdata = Test[, 1:13])
pred2 <- predict(rf_fit, newdata = Train[, 1:13])

# Root mean squared error on the test set.
test_err <- pred - Test$medv
sqrt(mean(test_err^2))
#> [1] 2.864187

# Squared correlation between predicted and observed values, test set.
cor(pred, Test$medv)^2
#> [1] 0.905791

# Same statistic on the training set.
cor(pred2, Train$medv)^2
#> [1] 0.9806551
# Observed vs. predicted medv for the test and training sets on one set of axes.
# The original overlaid two plot() calls via par(new = T): that draws the axes
# twice and labels them with the training vectors only. It also relied on the
# reassignable T/F shorthands and passed an `lty` that has no effect on a
# points plot. A single plot() plus points() gives the same picture with
# explicit labels.
plot(
  pred, Test$medv,
  pch = 16, col = 6,
  xlim = c(0, 60), ylim = c(0, 60),
  xlab = "Predicted medv", ylab = "Observed medv"
)

# Training points drawn on top, as in the original layering.
points(pred2, Train$medv, pch = 21, col = 1)

# Dashed reference line where prediction equals observation.
abline(a = 0, b = 1, lty = 2)

legend(
  "topleft",
  legend = c("Test", "Train"),
  pch = c(16, 21),
  col = c(6, 1),
  bty = "n"
)
# External-validation checks on the test-set predictions (Tropsha-style
# acceptance criteria).
obs_test <- Test$medv

# Tropsha's R^2 (acceptable: Tropsha's R^2 > 0.5).
# The baseline sum of squares is taken around the mean of the OBSERVED
# training responses. The original used mean(pred2), the mean of the training
# predictions, which is not the baseline this statistic is defined against.
R2_tr <- 1 - sum((obs_test - pred)^2) / sum((obs_test - mean(Train$medv))^2)
R2_tr
# Recorded with the earlier mean(pred2) baseline: [1] 0.8994333
# (re-run to refresh the value).

# k: slope of the through-origin regression of observed on predicted
# (acceptable: 0.85 < k < 1.15).
k <- sum(pred * obs_test) / sum(pred^2)
k
#> [1] 1.012774

# R0^2: coefficient of determination of that through-origin fit
# (acceptable: (R2_tr - R_20) / R2_tr < 0.1).
# Because k scales the predictions, the residuals are observed - k * predicted
# and the total sum of squares is taken around the mean of the observed
# values. The original applied k to the observed values and mixed the
# predictions with the observed mean in the denominator.
R_20 <- 1 - sum((obs_test - k * pred)^2) / sum((obs_test - mean(obs_test))^2)
(R2_tr - R_20) / R2_tr
# Recorded with the earlier R_20 formula: [1] 0.04388054
# (re-run to refresh the value).
# NOTE(review): the published criterion appears to use the squared Pearson
# correlation of the test set as R^2 in this ratio -- confirm whether R2_tr
# or cor(pred, Test$medv)^2 is intended here.
# 基準値を満たすため、外部データセットによるValidation結果も良好。