# Clear every object from the workspace, then list it to confirm it is empty.
rm(list = ls())
ls()
## character(0)
# Switch to the data directory so relative file paths are resolved against it.
setwd("c:/data")
getwd()
## [1] "c:/data"
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## 필요한 패키지를 로딩중입니다: ggplot2
## 필요한 패키지를 로딩중입니다: lattice
library(ggplot2)
library(caTools)
# Load the data and prepare it for classification:
#   - drop the `id` column, which carries no information for the classifier
#   - convert the target variable `diagnosis` to a factor
df <- read.csv("diagnosis.csv") %>%
  select(-id) %>%
  mutate(diagnosis = as.factor(diagnosis))
# Class proportions of the target variable.
# (Per-column missing-value counts are available via colSums(is.na(df)).)
prop.table(table(df$diagnosis))
##
## B M
## 0.6274165 0.3725835
# Check whether the data frame contains any missing values at all.
anyNA(df)
## [1] FALSE
# Split into training (75%) and test (25%) sets, partitioned on the target.
# The seed is fixed so the partition is reproducible.
set.seed(3)
inTraining <- createDataPartition(y = df$diagnosis, p = 0.75, list = FALSE)
training <- df[inTraining, ]
testing <- df[-inTraining, ]
# Resampling configuration for train(): 10-fold cross-validation repeated
# 10 times. Class probabilities are requested because twoClassSummary is used
# as the summary function (it reports ROC, Sens and Spec).
my_trainControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
# Random forest
# Fit a random forest on the training set, tuning `mtry` over 10 candidate
# values and selecting the best model by resampled ROC.
#
# The argument must be spelled `tuneLength`. The earlier spelling `tuneLenght`
# was not recognised by train(), so the default grid was used instead: any
# output recorded from that run shows only three mtry values (2, 16, 30) and
# has to be regenerated after this fix.
rffit <- train(
  diagnosis ~ .,
  data = training,
  method = "rf",
  tuneLength = 10,
  trControl = my_trainControl,
  verbose = FALSE,
  metric = "ROC"
)
# Print the resampling results for every candidate mtry and the chosen value.
rffit
## Random Forest
##
## 427 samples
## 30 predictor
## 2 classes: 'B', 'M'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 386, 384, 384, 384, 384, 384, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.9940791 0.9716809 0.9348333
## 16 0.9927258 0.9645726 0.9416667
## 30 0.9917769 0.9555698 0.9429167
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Plot the resampled performance of the fitted model across the tuning grid.
plot(rffit)

# ROC here is the area under the ROC curve (it is not the specificity);
# the candidate with the largest value is selected as the best model.
library(caret)
# Plot the 10 most important predictors using unscaled importance scores.
# NOTE(review): the title "rt" may be a typo for "rf" - confirm before changing.
plot(varImp(rffit, scale = FALSE), top = 10, main = "rt")

# Predict on the held-out test set and compare against the true labels.
confusionMatrix(predict(rffit, newdata = testing), testing$diagnosis)
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 88 6
## M 1 47
##
## Accuracy : 0.9507
## 95% CI : (0.9011, 0.98)
## No Information Rate : 0.6268
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8926
##
## Mcnemar's Test P-Value : 0.1306
##
## Sensitivity : 0.9888
## Specificity : 0.8868
## Pos Pred Value : 0.9362
## Neg Pred Value : 0.9792
## Prevalence : 0.6268
## Detection Rate : 0.6197
## Detection Prevalence : 0.6620
## Balanced Accuracy : 0.9378
##
## 'Positive' Class : B
##