rm(list=ls())
ls()
## character(0)
setwd("c:/data")
getwd()
## [1] "c:/data"
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(caTools)

df<-read.csv("diagnosis.csv")

#Drop columns that carry no information for the classification model
#Convert the target variable to a factor
#Check the class proportions of the target variable
#Check for missing values

df<-df %>% select(-id)
df$diagnosis<-as.factor(df$diagnosis)
prop.table(table(df$diagnosis)) #colSums(is.na(df))
## 
##         B         M 
## 0.6274165 0.3725835
any(is.na(df))
## [1] FALSE
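#any(is.na(df)) only says whether missing values exist at all; if it had
#returned TRUE, the colSums(is.na(df)) call noted above gives a per-column
#count. A minimal sketch:
colSums(is.na(df))[colSums(is.na(df)) > 0]
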
#Split the data into training and test sets
set.seed(3)
inTraining<-createDataPartition(y=df$diagnosis,p=.75,list=FALSE)
training<-df[inTraining,]
testing<-df[-inTraining,]
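#createDataPartition() samples within each class, so both sets should keep
#roughly the same B/M ratio as the full data; a quick sanity check (sketch):
prop.table(table(training$diagnosis))
prop.table(table(testing$diagnosis))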


#Configure resampling with trainControl()
my_trainControl<-trainControl(method="repeatedcv",
                              number=10,
                              repeats=10,
                              classProbs=TRUE,
                              summaryFunction = twoClassSummary)

#Random forest
rffit<-train(diagnosis~.,data=training,
             method="rf",
             tuneLength=10,
             trControl=my_trainControl,
             verbose=FALSE,
             metric="ROC")


rffit
## Random Forest 
## 
## 427 samples
##  30 predictor
##   2 classes: 'B', 'M' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 386, 384, 384, 384, 384, 384, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##    2    0.9940791  0.9716809  0.9348333
##   16    0.9927258  0.9645726  0.9416667
##   30    0.9917769  0.9555698  0.9429167
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
plot(rffit)
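#Besides the printed summary, the fitted caret object stores the selected
#tuning parameter and the per-resample metrics; a minimal sketch using the
#standard train fields bestTune and resample:
rffit$bestTune                #mtry chosen by the largest ROC
summary(rffit$resample$ROC)   #spread of ROC over the 10 x 10 = 100 resamples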

#ROC here is the area under the ROC curve estimated by resampling;
#the mtry with the largest ROC is selected (mtry = 2 above)

plot(varImp(rffit,scale=FALSE),top=10,main="Random forest variable importance")
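#The same importances can be read as numbers instead of a plot; a sketch,
#assuming the single importance column for method = "rf" is named "Overall":
vi <- varImp(rffit, scale = FALSE)$importance
head(vi[order(-vi$Overall), , drop = FALSE], 10)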

predict(rffit,newdata=testing) %>% confusionMatrix(testing$diagnosis)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  B  M
##          B 88  6
##          M  1 47
##                                         
##                Accuracy : 0.9507        
##                  95% CI : (0.9011, 0.98)
##     No Information Rate : 0.6268        
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.8926        
##                                         
##  Mcnemar's Test P-Value : 0.1306        
##                                         
##             Sensitivity : 0.9888        
##             Specificity : 0.8868        
##          Pos Pred Value : 0.9362        
##          Neg Pred Value : 0.9792        
##              Prevalence : 0.6268        
##          Detection Rate : 0.6197        
##    Detection Prevalence : 0.6620        
##       Balanced Accuracy : 0.9378        
##                                         
##        'Positive' Class : B             
##
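#Because classProbs = TRUE, class probabilities are also available on the test
#set, so a test-set AUC can be estimated; a sketch using caTools::colAUC()
#(caTools is already loaded above):
probs <- predict(rffit, newdata = testing, type = "prob")
caTools::colAUC(probs[, "M"], testing$diagnosis)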