To carry out the machine learning workflow we will use the caret framework. The data will be the Sonar data set from the mlbench package, and the learning algorithm will be Random Forest. The goal is to build a classifier.
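If the packages are not yet installed, install them once up front (the package names here are simply the ones loaded by the library() calls below):
install.packages(c('caret', 'mlbench', 'dplyr'))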
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(mlbench)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data(Sonar)
str(Sonar)
## 'data.frame': 208 obs. of 61 variables:
## $ V1 : num 0.02 0.0453 0.0262 0.01 0.0762 0.0286 0.0317 0.0519 0.0223 0.0164 ...
## $ V2 : num 0.0371 0.0523 0.0582 0.0171 0.0666 0.0453 0.0956 0.0548 0.0375 0.0173 ...
## $ V3 : num 0.0428 0.0843 0.1099 0.0623 0.0481 ...
## $ V4 : num 0.0207 0.0689 0.1083 0.0205 0.0394 ...
## $ V5 : num 0.0954 0.1183 0.0974 0.0205 0.059 ...
## $ V6 : num 0.0986 0.2583 0.228 0.0368 0.0649 ...
## $ V7 : num 0.154 0.216 0.243 0.11 0.121 ...
## $ V8 : num 0.16 0.348 0.377 0.128 0.247 ...
## $ V9 : num 0.3109 0.3337 0.5598 0.0598 0.3564 ...
## $ V10 : num 0.211 0.287 0.619 0.126 0.446 ...
## $ V11 : num 0.1609 0.4918 0.6333 0.0881 0.4152 ...
## $ V12 : num 0.158 0.655 0.706 0.199 0.395 ...
## $ V13 : num 0.2238 0.6919 0.5544 0.0184 0.4256 ...
## $ V14 : num 0.0645 0.7797 0.532 0.2261 0.4135 ...
## $ V15 : num 0.066 0.746 0.648 0.173 0.453 ...
## $ V16 : num 0.227 0.944 0.693 0.213 0.533 ...
## $ V17 : num 0.31 1 0.6759 0.0693 0.7306 ...
## $ V18 : num 0.3 0.887 0.755 0.228 0.619 ...
## $ V19 : num 0.508 0.802 0.893 0.406 0.203 ...
## $ V20 : num 0.48 0.782 0.862 0.397 0.464 ...
## $ V21 : num 0.578 0.521 0.797 0.274 0.415 ...
## $ V22 : num 0.507 0.405 0.674 0.369 0.429 ...
## $ V23 : num 0.433 0.396 0.429 0.556 0.573 ...
## $ V24 : num 0.555 0.391 0.365 0.485 0.54 ...
## $ V25 : num 0.671 0.325 0.533 0.314 0.316 ...
## $ V26 : num 0.641 0.32 0.241 0.533 0.229 ...
## $ V27 : num 0.71 0.327 0.507 0.526 0.7 ...
## $ V28 : num 0.808 0.277 0.853 0.252 1 ...
## $ V29 : num 0.679 0.442 0.604 0.209 0.726 ...
## $ V30 : num 0.386 0.203 0.851 0.356 0.472 ...
## $ V31 : num 0.131 0.379 0.851 0.626 0.51 ...
## $ V32 : num 0.26 0.295 0.504 0.734 0.546 ...
## $ V33 : num 0.512 0.198 0.186 0.612 0.288 ...
## $ V34 : num 0.7547 0.2341 0.2709 0.3497 0.0981 ...
## $ V35 : num 0.854 0.131 0.423 0.395 0.195 ...
## $ V36 : num 0.851 0.418 0.304 0.301 0.418 ...
## $ V37 : num 0.669 0.384 0.612 0.541 0.46 ...
## $ V38 : num 0.61 0.106 0.676 0.881 0.322 ...
## $ V39 : num 0.494 0.184 0.537 0.986 0.283 ...
## $ V40 : num 0.274 0.197 0.472 0.917 0.243 ...
## $ V41 : num 0.051 0.167 0.465 0.612 0.198 ...
## $ V42 : num 0.2834 0.0583 0.2587 0.5006 0.2444 ...
## $ V43 : num 0.282 0.14 0.213 0.321 0.185 ...
## $ V44 : num 0.4256 0.1628 0.2222 0.3202 0.0841 ...
## $ V45 : num 0.2641 0.0621 0.2111 0.4295 0.0692 ...
## $ V46 : num 0.1386 0.0203 0.0176 0.3654 0.0528 ...
## $ V47 : num 0.1051 0.053 0.1348 0.2655 0.0357 ...
## $ V48 : num 0.1343 0.0742 0.0744 0.1576 0.0085 ...
## $ V49 : num 0.0383 0.0409 0.013 0.0681 0.023 0.0264 0.0507 0.0285 0.0777 0.0092 ...
## $ V50 : num 0.0324 0.0061 0.0106 0.0294 0.0046 0.0081 0.0159 0.0178 0.0439 0.0198 ...
## $ V51 : num 0.0232 0.0125 0.0033 0.0241 0.0156 0.0104 0.0195 0.0052 0.0061 0.0118 ...
## $ V52 : num 0.0027 0.0084 0.0232 0.0121 0.0031 0.0045 0.0201 0.0081 0.0145 0.009 ...
## $ V53 : num 0.0065 0.0089 0.0166 0.0036 0.0054 0.0014 0.0248 0.012 0.0128 0.0223 ...
## $ V54 : num 0.0159 0.0048 0.0095 0.015 0.0105 0.0038 0.0131 0.0045 0.0145 0.0179 ...
## $ V55 : num 0.0072 0.0094 0.018 0.0085 0.011 0.0013 0.007 0.0121 0.0058 0.0084 ...
## $ V56 : num 0.0167 0.0191 0.0244 0.0073 0.0015 0.0089 0.0138 0.0097 0.0049 0.0068 ...
## $ V57 : num 0.018 0.014 0.0316 0.005 0.0072 0.0057 0.0092 0.0085 0.0065 0.0032 ...
## $ V58 : num 0.0084 0.0049 0.0164 0.0044 0.0048 0.0027 0.0143 0.0047 0.0093 0.0035 ...
## $ V59 : num 0.009 0.0052 0.0095 0.004 0.0107 0.0051 0.0036 0.0048 0.0059 0.0056 ...
## $ V60 : num 0.0032 0.0044 0.0078 0.0117 0.0094 0.0062 0.0103 0.0053 0.0022 0.004 ...
## $ Class: Factor w/ 2 levels "M","R": 2 2 2 2 2 2 2 2 2 2 ...
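Before splitting, it is worth checking how balanced the two classes are. In Sonar, M stands for metal (mine) and R for rock, and the classes are close to balanced (111 M, 97 R):
table(Sonar$Class)  # class counts: mines (M) vs rocks (R)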
Let's split the data into a training set and a test set.
inTrain <- caret::createDataPartition(
  y = Sonar$Class,  # the outcome variable
  p = 0.75,         # proportion of the data used for training
  list = FALSE      # return the indices as a matrix rather than a list
)
The result comes back as row indices, so we select the rows to include with positive indexing and drop the rest with negative indexing.
training <- Sonar[inTrain,]
testing <- Sonar[-inTrain,]
nrow(training)
## [1] 157
nrow(testing)
## [1] 51
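Note that createDataPartition samples at random, so this split will differ from run to run. For a reproducible split, fix the seed first; a minimal sketch (the seed value 42 is arbitrary):
set.seed(42)  # fix the random number generator so the split is reproducible
inTrain <- caret::createDataPartition(y = Sonar$Class, p = 0.75, list = FALSE)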
rfFit <- train(
  Class ~ .,
  data = training,
  method = 'rf',
  preProc = c('center', 'scale')  # pre-processing: center and scale the predictors
)
Let's look at the result.
rfFit
## Random Forest
##
## 157 samples
## 60 predictor
## 2 classes: 'M', 'R'
##
## Pre-processing: centered (60), scaled (60)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 157, 157, 157, 157, 157, 157, ...
## Resampling results across tuning parameters:
##
##   mtry  Accuracy   Kappa
##      2  0.8116525  0.6205097
##     31  0.7624605  0.5227849
##     60  0.7525106  0.5037965
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
Of the three candidate values, mtry = 2 was judged the best. Let's confirm this with a plot.
plot(rfFit)
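As the output above shows, train() estimates performance with 25 bootstrap resamples by default. This can be changed through trainControl; a sketch of switching to 10-fold cross-validation instead (rfFitCV is a hypothetical name, and its numbers would differ from the bootstrap results above):
ctrl <- trainControl(method = 'cv', number = 10)  # 10-fold cross-validation
rfFitCV <- train(
  Class ~ .,
  data = training,
  method = 'rf',
  preProc = c('center', 'scale'),
  trControl = ctrl  # use the resampling scheme defined above
)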
Now let's have the trained model predict on new data.
rfClasses <- predict(rfFit,newdata = testing)
Let's compare the actual values against the predicted values with a confusion matrix.
rfClasses %>%
  confusionMatrix(testing$Class)
## Confusion Matrix and Statistics
##
##           Reference
## Prediction  M  R
##          M 25  4
##          R  2 20
##
##                Accuracy : 0.8824
##                  95% CI : (0.7613, 0.9556)
##     No Information Rate : 0.5294
##     P-Value [Acc > NIR] : 8.488e-08
##
##                   Kappa : 0.7628
##
##  Mcnemar's Test P-Value : 0.6831
##
##             Sensitivity : 0.9259
##             Specificity : 0.8333
##          Pos Pred Value : 0.8621
##          Neg Pred Value : 0.9091
##              Prevalence : 0.5294
##          Detection Rate : 0.4902
##    Detection Prevalence : 0.5686
##       Balanced Accuracy : 0.8796
##
##        'Positive' Class : M
##
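It can also be informative to see which predictors the model relies on. caret's varImp() extracts variable importance from the fitted random forest; a minimal sketch:
rfImp <- varImp(rfFit)  # importance of each predictor in the final model
plot(rfImp, top = 10)   # show the ten most important variables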
Let's look more closely by widening the search range for mtry to 1 through 10.
rfFit2 <- train(
  Class ~ .,
  data = training,
  method = 'rf',
  tuneGrid = expand.grid(mtry = 1:10)  # explicit grid of candidate mtry values
)
plot(rfFit2)
summary() on the result describes the underlying final model (the fitted randomForest object), listing its components:
summary(rfFit2)
##                 Length Class      Mode
## call                 5 -none-     call
## type                 1 -none-     character
## predicted          157 factor     numeric
## err.rate          1500 -none-     numeric
## confusion            6 -none-     numeric
## votes              314 matrix     numeric
## oob.times          157 -none-     numeric
## classes              2 -none-     character
## importance          60 -none-     numeric
## importanceSD         0 -none-     NULL
## localImportance      0 -none-     NULL
## proximity            0 -none-     NULL
## ntree                1 -none-     numeric
## mtry                 1 -none-     numeric
## forest              14 -none-     list
## y                  157 factor     numeric
## test                 0 -none-     NULL
## inbag                0 -none-     NULL
## xNames              60 -none-     character
## problemType          1 -none-     character
## tuneValue            1 data.frame list
## obsLevels            2 -none-     character
## param                1 -none-     list
rfFit2
## Random Forest
##
## 157 samples
## 60 predictor
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 157, 157, 157, 157, 157, 157, ...
## Resampling results across tuning parameters:
##
##   mtry  Accuracy   Kappa
##      1  0.7948900  0.5915493
##      2  0.7938766  0.5887885
##      3  0.7913668  0.5831011
##      4  0.7798276  0.5613612
##      5  0.7761242  0.5531880
##      6  0.7742547  0.5502895
##      7  0.7636352  0.5293168
##      8  0.7767408  0.5556543
##      9  0.7704425  0.5431659
##     10  0.7592693  0.5198157
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 1.
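As an aside, instead of spelling out the grid you can let caret pick the candidates: tuneLength = n asks train() to generate n values of the tuning parameter automatically (rfFit3 is a hypothetical name, and the generated mtry values need not match 1:10):
rfFit3 <- train(
  Class ~ .,
  data = training,
  method = 'rf',
  tuneLength = 10  # let caret choose 10 candidate mtry values
)
Now let's evaluate rfFit2 on the test set.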
rfFit2 %>%
  predict(newdata = testing) %>%
  confusionMatrix(testing$Class)
## Confusion Matrix and Statistics
##
##           Reference
## Prediction  M  R
##          M 26  4
##          R  1 20
##
##                Accuracy : 0.902
##                  95% CI : (0.7859, 0.9674)
##     No Information Rate : 0.5294
##     P-Value [Acc > NIR] : 1.209e-08
##
##                   Kappa : 0.8019
##
##  Mcnemar's Test P-Value : 0.3711
##
##             Sensitivity : 0.9630
##             Specificity : 0.8333
##          Pos Pred Value : 0.8667
##          Neg Pred Value : 0.9524
##              Prevalence : 0.5294
##          Detection Rate : 0.5098
##    Detection Prevalence : 0.5882
##       Balanced Accuracy : 0.8981
##
##        'Positive' Class : M
##
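Finally, predict() on a train object can return class probabilities instead of hard labels by passing type = 'prob', which is useful for ROC curves or custom decision thresholds; a sketch:
rfProbs <- predict(rfFit2, newdata = testing, type = 'prob')  # per-class probabilities
head(rfProbs)  # one row per test case, with columns M and R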