library(dplyr)
## Warning: 패키지 'dplyr'는 R 버전 4.1.3에서 작성되었습니다
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## 다음의 패키지를 부착합니다: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(caret)
## Warning: 패키지 'caret'는 R 버전 4.1.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: ggplot2
## Warning: 패키지 'ggplot2'는 R 버전 4.1.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: lattice
library(recipes)
## Warning: 패키지 'recipes'는 R 버전 4.1.3에서 작성되었습니다
## 
## 다음의 패키지를 부착합니다: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
library(lubridate)
## Warning: 패키지 'lubridate'는 R 버전 4.1.3에서 작성되었습니다
## 
## 다음의 패키지를 부착합니다: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
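# 1. Understanding ROC and AUC
# AUC (Area Under the ROC Curve) is the area under the ROC curve; it is a
# metric that summarises the performance of a classification model (classifier).
# The ROC curve plots 1 - specificity (False Positive Rate, FPR) on the x-axis
# against sensitivity (True Positive Rate, TPR) on the y-axis.
# Each classification threshold yields one (FPR, TPR) point; connecting the
# points obtained across the thresholds traces out the curve.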

library(mlbench)
## Warning: 패키지 'mlbench'는 R 버전 4.1.3에서 작성되었습니다
# Load the BreastCancer data set (provided by the mlbench package).
data("BreastCancer")

# Count missing values per column; the recorded output shows that only
# Bare.nuclei has any (16).
colSums(is.na(BreastCancer))
##              Id    Cl.thickness       Cell.size      Cell.shape   Marg.adhesion 
##               0               0               0               0               0 
##    Epith.c.size     Bare.nuclei     Bl.cromatin Normal.nucleoli         Mitoses 
##               0              16               0               0               0 
##           Class 
##               0

# View() opens a data viewer, so only call it when a user is at the console;
# this keeps the script runnable under Rscript / knitting.
if (interactive()) {
  View(BreastCancer)
}

# Compact overview of column types and the first few values.
glimpse(BreastCancer)
## Rows: 699
## Columns: 11
## $ Id              <chr> "1000025", "1002945", "1015425", "1016277", "1017023",~
## $ Cl.thickness    <ord> 5, 5, 3, 6, 4, 8, 1, 2, 2, 4, 1, 2, 5, 1, 8, 7, 4, 4, ~
## $ Cell.size       <ord> 1, 4, 1, 8, 1, 10, 1, 1, 1, 2, 1, 1, 3, 1, 7, 4, 1, 1,~
## $ Cell.shape      <ord> 1, 4, 1, 8, 1, 10, 1, 2, 1, 1, 1, 1, 3, 1, 5, 6, 1, 1,~
## $ Marg.adhesion   <ord> 1, 5, 1, 1, 3, 8, 1, 1, 1, 1, 1, 1, 3, 1, 10, 4, 1, 1,~
## $ Epith.c.size    <ord> 2, 7, 2, 3, 2, 7, 2, 2, 2, 2, 1, 2, 2, 2, 7, 6, 2, 2, ~
## $ Bare.nuclei     <fct> 1, 10, 2, 4, 1, 10, 10, 1, 1, 1, 1, 1, 3, 3, 9, 1, 1, ~
## $ Bl.cromatin     <fct> 3, 3, 3, 3, 3, 9, 3, 3, 1, 2, 3, 2, 4, 3, 5, 4, 2, 3, ~
## $ Normal.nucleoli <fct> 1, 2, 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 4, 1, 5, 3, 1, 1, ~
## $ Mitoses         <fct> 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 4, 1, 1, 1, ~
## $ Class           <fct> benign, benign, benign, benign, benign, malignant, ben~

# The response has two levels; "benign" is the first (reference) level.
levels(BreastCancer$Class)
## [1] "benign"    "malignant"

# Full structure dump (factor / ordered-factor encoding of every column).
str(BreastCancer)
## 'data.frame':    699 obs. of  11 variables:
##  $ Id             : chr  "1000025" "1002945" "1015425" "1016277" ...
##  $ Cl.thickness   : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ...
##  $ Cell.size      : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ...
##  $ Cell.shape     : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ...
##  $ Marg.adhesion  : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ...
##  $ Epith.c.size   : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ...
##  $ Bare.nuclei    : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ...
##  $ Bl.cromatin    : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ...
##  $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ...
##  $ Mitoses        : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ...
##  $ Class          : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
# Replace missing Bare.nuclei values with the column median.
# Bare.nuclei is a factor, and as.numeric() on a factor returns the internal
# level codes. Converting through as.character() yields the numeric value of
# the labels themselves, so the result does not depend on the levels happening
# to be "1".."10" in order.
BreastCancer$Bare.nuclei <- as.numeric(as.character(BreastCancer$Bare.nuclei))
bare_nuclei_median <- median(BreastCancer$Bare.nuclei, na.rm = TRUE)
bare_nuclei_median
## [1] 1
# Use the computed median rather than a hard-coded constant so the imputation
# stays correct if the data change.
BreastCancer$Bare.nuclei[is.na(BreastCancer$Bare.nuclei)] <- bare_nuclei_median

# Fix the RNG seed so the train/test split below is reproducible.
set.seed(210615)

# Split the rows into training and test sets with caret (attached at the top
# of the script).
#   p            proportion of rows assigned to the training set
#   list = FALSE return the selected row indices as a matrix, not a list
idx1 <- createDataPartition(BreastCancer$Class, p = 0.8, list = FALSE)

# Rows picked by idx1 form the training set; all remaining rows form the test set.
train <- BreastCancer[idx1, ]
test <- BreastCancer[-idx1, ]
library(randomForest)
## Warning: 패키지 'randomForest'는 R 버전 4.1.3에서 작성되었습니다
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
## 
## 다음의 패키지를 부착합니다: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# Fit a random forest that predicts Class from every column except Id,
# then print the fitted model summary.
rffit <- randomForest(formula = Class ~ . - Id, data = train)
print(rffit)
## 
## Call:
##  randomForest(formula = Class ~ . - Id, data = train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 3.39%
## Confusion matrix:
##           benign malignant class.error
## benign       355        12  0.03269755
## malignant      7       186  0.03626943
# Predicted class label for every row of the test set.
prefit <- predict(object = rffit, newdata = test, type = "response")
print(prefit)
##         6        24        31        38        42        46        52        55 
## malignant    benign    benign    benign malignant    benign malignant malignant 
##        56        60        63        64        69        70        76        78 
## malignant malignant malignant    benign malignant    benign    benign    benign 
##        81        84        85        87        92        93       105       109 
##    benign    benign malignant malignant    benign    benign malignant    benign 
##       112       118       120       122       130       136       144       146 
## malignant malignant    benign    benign    benign    benign    benign    benign 
##       147       148       150       158       173       174       177       183 
## malignant    benign malignant    benign    benign malignant    benign    benign 
##       186       187       188       195       197       226       230       231 
##    benign malignant malignant    benign malignant    benign malignant malignant 
##       238       245       257       265       268       276       278       281 
## malignant    benign    benign malignant malignant    benign    benign    benign 
##       284       293       298       299       300       302       303       304 
## malignant malignant    benign    benign malignant    benign malignant    benign 
##       310       312       316       318       319       321       324       330 
##    benign    benign malignant malignant    benign malignant malignant malignant 
##       333       337       339       342       343       345       347       351 
##    benign malignant    benign    benign    benign malignant    benign    benign 
##       355       356       358       359       360       366       372       376 
##    benign    benign malignant malignant malignant    benign    benign    benign 
##       384       389       390       397       408       413       418       424 
##    benign    benign    benign    benign    benign malignant    benign    benign 
##       428       429       432       434       440       442       444       446 
## malignant    benign    benign    benign    benign    benign    benign    benign 
##       450       455       458       462       474       487       491       495 
## malignant    benign malignant    benign    benign    benign    benign malignant 
##       498       501       503       505       508       512       531       544 
##    benign    benign    benign    benign    benign    benign malignant    benign 
##       548       559       566       567       571       573       574       584 
##    benign    benign malignant    benign malignant    benign    benign    benign 
##       585       589       600       602       604       607       615       618 
##    benign malignant    benign    benign malignant    benign    benign    benign 
##       622       634       677 
##    benign malignant    benign 
## Levels: benign malignant
# Same predictions requested as type = "prob": one column per class
# (benign, malignant) for every row of the test set.
prefit1 <- predict(object = rffit, newdata = test, type = "prob")
print(prefit1)
##     benign malignant
## 6    0.002     0.998
## 24   0.546     0.454
## 31   1.000     0.000
## 38   0.766     0.234
## 42   0.106     0.894
## 46   0.998     0.002
## 52   0.354     0.646
## 55   0.008     0.992
## 56   0.044     0.956
## 60   0.270     0.730
## 63   0.022     0.978
## 64   0.538     0.462
## 69   0.020     0.980
## 70   1.000     0.000
## 76   0.938     0.062
## 78   0.998     0.002
## 81   0.834     0.166
## 84   0.880     0.120
## 85   0.004     0.996
## 87   0.188     0.812
## 92   1.000     0.000
## 93   1.000     0.000
## 105  0.110     0.890
## 109  0.996     0.004
## 112  0.052     0.948
## 118  0.008     0.992
## 120  1.000     0.000
## 122  1.000     0.000
## 130  0.854     0.146
## 136  0.966     0.034
## 144  0.984     0.016
## 146  1.000     0.000
## 147  0.236     0.764
## 148  1.000     0.000
## 150  0.008     0.992
## 158  1.000     0.000
## 173  1.000     0.000
## 174  0.000     1.000
## 177  1.000     0.000
## 183  0.998     0.002
## 186  1.000     0.000
## 187  0.016     0.984
## 188  0.006     0.994
## 195  1.000     0.000
## 197  0.044     0.956
## 226  1.000     0.000
## 230  0.000     1.000
## 231  0.038     0.962
## 238  0.030     0.970
## 245  1.000     0.000
## 257  1.000     0.000
## 265  0.052     0.948
## 268  0.090     0.910
## 276  1.000     0.000
## 278  1.000     0.000
## 281  1.000     0.000
## 284  0.026     0.974
## 293  0.232     0.768
## 298  0.766     0.234
## 299  0.658     0.342
## 300  0.182     0.818
## 302  1.000     0.000
## 303  0.000     1.000
## 304  1.000     0.000
## 310  0.842     0.158
## 312  1.000     0.000
## 316  0.204     0.796
## 318  0.010     0.990
## 319  0.932     0.068
## 321  0.012     0.988
## 324  0.078     0.922
## 330  0.010     0.990
## 333  1.000     0.000
## 337  0.134     0.866
## 339  1.000     0.000
## 342  1.000     0.000
## 343  1.000     0.000
## 345  0.008     0.992
## 347  0.888     0.112
## 351  1.000     0.000
## 355  1.000     0.000
## 356  0.986     0.014
## 358  0.000     1.000
## 359  0.016     0.984
## 360  0.178     0.822
## 366  1.000     0.000
## 372  1.000     0.000
## 376  1.000     0.000
## 384  1.000     0.000
## 389  1.000     0.000
## 390  0.972     0.028
## 397  1.000     0.000
## 408  1.000     0.000
## 413  0.002     0.998
## 418  1.000     0.000
## 424  1.000     0.000
## 428  0.040     0.960
## 429  1.000     0.000
## 432  0.954     0.046
## 434  1.000     0.000
## 440  1.000     0.000
## 442  0.758     0.242
## 444  1.000     0.000
## 446  1.000     0.000
## 450  0.000     1.000
## 455  0.998     0.002
## 458  0.010     0.990
## 462  0.960     0.040
## 474  1.000     0.000
## 487  1.000     0.000
## 491  1.000     0.000
## 495  0.482     0.518
## 498  1.000     0.000
## 501  0.998     0.002
## 503  1.000     0.000
## 505  1.000     0.000
## 508  1.000     0.000
## 512  1.000     0.000
## 531  0.020     0.980
## 544  1.000     0.000
## 548  1.000     0.000
## 559  1.000     0.000
## 566  0.000     1.000
## 567  1.000     0.000
## 571  0.122     0.878
## 573  1.000     0.000
## 574  1.000     0.000
## 584  1.000     0.000
## 585  0.948     0.052
## 589  0.070     0.930
## 600  0.980     0.020
## 602  1.000     0.000
## 604  0.256     0.744
## 607  1.000     0.000
## 615  1.000     0.000
## 618  1.000     0.000
## 622  0.510     0.490
## 634  0.036     0.964
## 677  1.000     0.000
## attr(,"class")
## [1] "matrix" "array"  "votes"
# Check the types of the two prediction objects and peek at the true labels.
print(class(prefit1))
## [1] "matrix" "array"  "votes"
print(class(prefit))
## [1] "factor"
print(head(test$Class))
## [1] malignant malignant benign    benign    malignant benign   
## Levels: benign malignant
library(pROC)

# ROC / AUC on the test set: true labels versus a numeric prediction score.
# Numeric encoding of the hard class predictions (1 = benign, 2 = malignant),
# kept for inspection only.
prfit_num <- as.numeric(prefit)
prfit_num
##   [1] 2 1 1 1 2 1 2 2 2 2 2 1 2 1 1 1 1 1 2 2 1 1 2 1 2 2 1 1 1 1 1 1 2 1 2 1 1
##  [38] 2 1 1 1 2 2 1 2 1 2 2 2 1 1 2 2 1 1 1 2 2 1 1 2 1 2 1 1 1 2 2 1 2 2 2 1 2
##  [75] 1 1 1 2 1 1 1 1 2 2 2 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 1 2 1 2 1 1 1 1
## [112] 2 1 1 1 1 1 1 2 1 1 1 2 1 2 1 1 1 1 2 1 1 2 1

# A ROC curve is traced by sweeping a threshold over a continuous score.
# Hard labels offer a single threshold (one operating point), so the curve is
# built from the predicted probability of the positive class instead.
malignant_prob <- as.numeric(prefit1[, "malignant"])

# State the control/case levels and the direction explicitly rather than
# letting roc() infer them from the data.
result <- roc(
  response = test$Class,
  predictor = malignant_prob,
  levels = c("benign", "malignant"),
  direction = "<"
)
class(test$Class)
## [1] "factor"
class(prfit_num)
## [1] "numeric"
plot(result, legacy.axes = TRUE)

# NOTE: an earlier run recorded an AUC of 0.9627, but that value was computed
# from the hard class labels; re-run to obtain the probability-based AUC.
result$auc
# Element-wise flag: TRUE where the predicted class equals the true class.
same <- (prefit == test$Class)
print(prefit)
##         6        24        31        38        42        46        52        55 
## malignant    benign    benign    benign malignant    benign malignant malignant 
##        56        60        63        64        69        70        76        78 
## malignant malignant malignant    benign malignant    benign    benign    benign 
##        81        84        85        87        92        93       105       109 
##    benign    benign malignant malignant    benign    benign malignant    benign 
##       112       118       120       122       130       136       144       146 
## malignant malignant    benign    benign    benign    benign    benign    benign 
##       147       148       150       158       173       174       177       183 
## malignant    benign malignant    benign    benign malignant    benign    benign 
##       186       187       188       195       197       226       230       231 
##    benign malignant malignant    benign malignant    benign malignant malignant 
##       238       245       257       265       268       276       278       281 
## malignant    benign    benign malignant malignant    benign    benign    benign 
##       284       293       298       299       300       302       303       304 
## malignant malignant    benign    benign malignant    benign malignant    benign 
##       310       312       316       318       319       321       324       330 
##    benign    benign malignant malignant    benign malignant malignant malignant 
##       333       337       339       342       343       345       347       351 
##    benign malignant    benign    benign    benign malignant    benign    benign 
##       355       356       358       359       360       366       372       376 
##    benign    benign malignant malignant malignant    benign    benign    benign 
##       384       389       390       397       408       413       418       424 
##    benign    benign    benign    benign    benign malignant    benign    benign 
##       428       429       432       434       440       442       444       446 
## malignant    benign    benign    benign    benign    benign    benign    benign 
##       450       455       458       462       474       487       491       495 
## malignant    benign malignant    benign    benign    benign    benign malignant 
##       498       501       503       505       508       512       531       544 
##    benign    benign    benign    benign    benign    benign malignant    benign 
##       548       559       566       567       571       573       574       584 
##    benign    benign malignant    benign malignant    benign    benign    benign 
##       585       589       600       602       604       607       615       618 
##    benign malignant    benign    benign malignant    benign    benign    benign 
##       622       634       677 
##    benign malignant    benign 
## Levels: benign malignant
# True class labels of the test set, for comparison with the predictions.
print(test$Class)
##   [1] malignant malignant benign    benign    malignant benign    malignant
##   [8] malignant malignant malignant malignant malignant malignant benign   
##  [15] benign    benign    benign    benign    malignant malignant benign   
##  [22] benign    malignant benign    malignant malignant benign    benign   
##  [29] benign    benign    benign    benign    malignant benign    malignant
##  [36] benign    benign    malignant benign    benign    benign    malignant
##  [43] malignant benign    benign    benign    malignant malignant malignant
##  [50] benign    benign    malignant malignant benign    benign    benign   
##  [57] malignant malignant benign    benign    malignant benign    malignant
##  [64] benign    benign    benign    benign    malignant benign    malignant
##  [71] malignant malignant benign    malignant benign    benign    benign   
##  [78] malignant benign    benign    benign    benign    malignant malignant
##  [85] malignant benign    benign    benign    benign    benign    benign   
##  [92] benign    benign    malignant benign    benign    malignant benign   
##  [99] benign    benign    benign    benign    benign    benign    malignant
## [106] benign    malignant benign    benign    benign    benign    benign   
## [113] benign    benign    benign    benign    benign    benign    malignant
## [120] benign    benign    benign    malignant benign    malignant benign   
## [127] benign    benign    benign    malignant benign    benign    malignant
## [134] benign    benign    benign    benign    malignant benign   
## Levels: benign malignant
# Accuracy is the share of test rows whose prediction matches the true label;
# the mean of a logical vector is exactly that proportion.
accuracy <- mean(same)
accuracy
## [1] 0.9640288