library(dplyr)
## Warning: 패키지 'dplyr'는 R 버전 4.1.3에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## 다음의 패키지를 부착합니다: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(caret)
## Warning: 패키지 'caret'는 R 버전 4.1.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: ggplot2
## Warning: 패키지 'ggplot2'는 R 버전 4.1.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: lattice
library(recipes)
## Warning: 패키지 'recipes'는 R 버전 4.1.3에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(lubridate)
## Warning: 패키지 'lubridate'는 R 버전 4.1.3에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
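# 1. Understanding ROC AUC
# AUC (Area Under the ROC Curve) is the area beneath the ROC curve; it is a
# single-number measure of a classification model's (classifier's) performance.
# The ROC curve plots 1 - specificity (False Positive Rate, FPR) on the x-axis
# against sensitivity (True Positive Rate, TPR) on the y-axis.
# Each classification threshold yields one (FPR, TPR) point; connecting the
# points obtained across thresholds traces out the curve.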
library(mlbench)
## Warning: 패키지 'mlbench'는 R 버전 4.1.3에서 작성되었습니다
# Load the BreastCancer data set (the mlbench package is attached just above)
# and take a first look at its structure.
data("BreastCancer")
# Number of missing values in each column.
colSums(is.na(BreastCancer))
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion
## 0 0 0 0 0
## Epith.c.size Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses
## 0 16 0 0 0
## Class
## 0
# View() opens a data viewer window, which only makes sense when a person is
# sitting at an interactive session; skip it when the script is run in batch
# mode (Rscript, knitting) so the run does not depend on a display.
if (interactive()) {
  View(BreastCancer)
}
glimpse(BreastCancer)
## Rows: 699
## Columns: 11
## $ Id <chr> "1000025", "1002945", "1015425", "1016277", "1017023",~
## $ Cl.thickness <ord> 5, 5, 3, 6, 4, 8, 1, 2, 2, 4, 1, 2, 5, 1, 8, 7, 4, 4, ~
## $ Cell.size <ord> 1, 4, 1, 8, 1, 10, 1, 1, 1, 2, 1, 1, 3, 1, 7, 4, 1, 1,~
## $ Cell.shape <ord> 1, 4, 1, 8, 1, 10, 1, 2, 1, 1, 1, 1, 3, 1, 5, 6, 1, 1,~
## $ Marg.adhesion <ord> 1, 5, 1, 1, 3, 8, 1, 1, 1, 1, 1, 1, 3, 1, 10, 4, 1, 1,~
## $ Epith.c.size <ord> 2, 7, 2, 3, 2, 7, 2, 2, 2, 2, 1, 2, 2, 2, 7, 6, 2, 2, ~
## $ Bare.nuclei <fct> 1, 10, 2, 4, 1, 10, 10, 1, 1, 1, 1, 1, 3, 3, 9, 1, 1, ~
## $ Bl.cromatin <fct> 3, 3, 3, 3, 3, 9, 3, 3, 1, 2, 3, 2, 4, 3, 5, 4, 2, 3, ~
## $ Normal.nucleoli <fct> 1, 2, 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 4, 1, 5, 3, 1, 1, ~
## $ Mitoses <fct> 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 4, 1, 1, 1, ~
## $ Class <fct> benign, benign, benign, benign, benign, malignant, ben~
# The outcome has two levels; "benign" is the first (reference) level.
levels(BreastCancer$Class)
## [1] "benign" "malignant"
str(BreastCancer)
## 'data.frame': 699 obs. of 11 variables:
## $ Id : chr "1000025" "1002945" "1015425" "1016277" ...
## $ Cl.thickness : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ...
## $ Cell.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ...
## $ Cell.shape : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ...
## $ Marg.adhesion : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ...
## $ Epith.c.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.nuclei : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ...
## $ Bl.cromatin : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ...
## $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ...
## $ Mitoses : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
# Replace missing Bare.nuclei values with the column median.
# Bare.nuclei is a factor, so convert through as.character(): as.numeric() on
# a factor returns the internal level codes, which match the labels only when
# the levels happen to be "1", "2", ... in order.
BreastCancer$Bare.nuclei <- as.numeric(as.character(BreastCancer$Bare.nuclei))
# Compute the median once and reuse it instead of hard-coding the value.
bare_nuclei_median <- median(BreastCancer$Bare.nuclei, na.rm = TRUE)
bare_nuclei_median
## [1] 1
# NOTE(review): the median is taken over the full data set, before the
# train/test split — confirm that is acceptable for this exercise.
BreastCancer$Bare.nuclei[is.na(BreastCancer$Bare.nuclei)] <- bare_nuclei_median
# Fix the RNG seed so the random train/test split is reproducible.
set.seed(210615)
library(caret)
# createDataPartition() picks the row indices that go into the training set,
# sampling on the outcome (Class).
#   p    : proportion of rows assigned to the training set (80% here).
#   list : FALSE returns the indices as a matrix instead of a list, so they
#          can be used directly for row subsetting. Spelled out as FALSE
#          because the shorthand F is an ordinary variable that can be
#          reassigned.
idx1 <- createDataPartition(BreastCancer$Class, p = 0.8, list = FALSE)
# Rows selected by idx1 form the training set; all remaining rows form the
# test set.
train <- BreastCancer[idx1, ]
test <- BreastCancer[-idx1, ]
library(randomForest)
## Warning: 패키지 'randomForest'는 R 버전 4.1.3에서 작성되었습니다
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## 다음의 패키지를 부착합니다: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# Fit a random forest classifier for Class on the training rows, using every
# column except the Id identifier as a predictor, then print the fit summary.
rffit <- randomForest(formula = Class ~ . - Id, data = train)
print(rffit)
##
## Call:
## randomForest(formula = Class ~ . - Id, data = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 3.39%
## Confusion matrix:
## benign malignant class.error
## benign 355 12 0.03269755
## malignant 7 186 0.03626943
# Predicted class label (factor) for each row of the test set.
prefit <- predict(object = rffit, newdata = test, type = "response")
print(prefit)
## 6 24 31 38 42 46 52 55
## malignant benign benign benign malignant benign malignant malignant
## 56 60 63 64 69 70 76 78
## malignant malignant malignant benign malignant benign benign benign
## 81 84 85 87 92 93 105 109
## benign benign malignant malignant benign benign malignant benign
## 112 118 120 122 130 136 144 146
## malignant malignant benign benign benign benign benign benign
## 147 148 150 158 173 174 177 183
## malignant benign malignant benign benign malignant benign benign
## 186 187 188 195 197 226 230 231
## benign malignant malignant benign malignant benign malignant malignant
## 238 245 257 265 268 276 278 281
## malignant benign benign malignant malignant benign benign benign
## 284 293 298 299 300 302 303 304
## malignant malignant benign benign malignant benign malignant benign
## 310 312 316 318 319 321 324 330
## benign benign malignant malignant benign malignant malignant malignant
## 333 337 339 342 343 345 347 351
## benign malignant benign benign benign malignant benign benign
## 355 356 358 359 360 366 372 376
## benign benign malignant malignant malignant benign benign benign
## 384 389 390 397 408 413 418 424
## benign benign benign benign benign malignant benign benign
## 428 429 432 434 440 442 444 446
## malignant benign benign benign benign benign benign benign
## 450 455 458 462 474 487 491 495
## malignant benign malignant benign benign benign benign malignant
## 498 501 503 505 508 512 531 544
## benign benign benign benign benign benign malignant benign
## 548 559 566 567 571 573 574 584
## benign benign malignant benign malignant benign benign benign
## 585 589 600 602 604 607 615 618
## benign malignant benign benign malignant benign benign benign
## 622 634 677
## benign malignant benign
## Levels: benign malignant
# Per-class predicted probabilities for each row of the test set; one column
# per class level (benign, malignant).
prefit1 <- predict(object = rffit, newdata = test, type = "prob")
print(prefit1)
## benign malignant
## 6 0.002 0.998
## 24 0.546 0.454
## 31 1.000 0.000
## 38 0.766 0.234
## 42 0.106 0.894
## 46 0.998 0.002
## 52 0.354 0.646
## 55 0.008 0.992
## 56 0.044 0.956
## 60 0.270 0.730
## 63 0.022 0.978
## 64 0.538 0.462
## 69 0.020 0.980
## 70 1.000 0.000
## 76 0.938 0.062
## 78 0.998 0.002
## 81 0.834 0.166
## 84 0.880 0.120
## 85 0.004 0.996
## 87 0.188 0.812
## 92 1.000 0.000
## 93 1.000 0.000
## 105 0.110 0.890
## 109 0.996 0.004
## 112 0.052 0.948
## 118 0.008 0.992
## 120 1.000 0.000
## 122 1.000 0.000
## 130 0.854 0.146
## 136 0.966 0.034
## 144 0.984 0.016
## 146 1.000 0.000
## 147 0.236 0.764
## 148 1.000 0.000
## 150 0.008 0.992
## 158 1.000 0.000
## 173 1.000 0.000
## 174 0.000 1.000
## 177 1.000 0.000
## 183 0.998 0.002
## 186 1.000 0.000
## 187 0.016 0.984
## 188 0.006 0.994
## 195 1.000 0.000
## 197 0.044 0.956
## 226 1.000 0.000
## 230 0.000 1.000
## 231 0.038 0.962
## 238 0.030 0.970
## 245 1.000 0.000
## 257 1.000 0.000
## 265 0.052 0.948
## 268 0.090 0.910
## 276 1.000 0.000
## 278 1.000 0.000
## 281 1.000 0.000
## 284 0.026 0.974
## 293 0.232 0.768
## 298 0.766 0.234
## 299 0.658 0.342
## 300 0.182 0.818
## 302 1.000 0.000
## 303 0.000 1.000
## 304 1.000 0.000
## 310 0.842 0.158
## 312 1.000 0.000
## 316 0.204 0.796
## 318 0.010 0.990
## 319 0.932 0.068
## 321 0.012 0.988
## 324 0.078 0.922
## 330 0.010 0.990
## 333 1.000 0.000
## 337 0.134 0.866
## 339 1.000 0.000
## 342 1.000 0.000
## 343 1.000 0.000
## 345 0.008 0.992
## 347 0.888 0.112
## 351 1.000 0.000
## 355 1.000 0.000
## 356 0.986 0.014
## 358 0.000 1.000
## 359 0.016 0.984
## 360 0.178 0.822
## 366 1.000 0.000
## 372 1.000 0.000
## 376 1.000 0.000
## 384 1.000 0.000
## 389 1.000 0.000
## 390 0.972 0.028
## 397 1.000 0.000
## 408 1.000 0.000
## 413 0.002 0.998
## 418 1.000 0.000
## 424 1.000 0.000
## 428 0.040 0.960
## 429 1.000 0.000
## 432 0.954 0.046
## 434 1.000 0.000
## 440 1.000 0.000
## 442 0.758 0.242
## 444 1.000 0.000
## 446 1.000 0.000
## 450 0.000 1.000
## 455 0.998 0.002
## 458 0.010 0.990
## 462 0.960 0.040
## 474 1.000 0.000
## 487 1.000 0.000
## 491 1.000 0.000
## 495 0.482 0.518
## 498 1.000 0.000
## 501 0.998 0.002
## 503 1.000 0.000
## 505 1.000 0.000
## 508 1.000 0.000
## 512 1.000 0.000
## 531 0.020 0.980
## 544 1.000 0.000
## 548 1.000 0.000
## 559 1.000 0.000
## 566 0.000 1.000
## 567 1.000 0.000
## 571 0.122 0.878
## 573 1.000 0.000
## 574 1.000 0.000
## 584 1.000 0.000
## 585 0.948 0.052
## 589 0.070 0.930
## 600 0.980 0.020
## 602 1.000 0.000
## 604 0.256 0.744
## 607 1.000 0.000
## 615 1.000 0.000
## 618 1.000 0.000
## 622 0.510 0.490
## 634 0.036 0.964
## 677 1.000 0.000
## attr(,"class")
## [1] "matrix" "array" "votes"
# Check the types of the objects that feed the ROC computation.
class(prefit1)
## [1] "matrix" "array" "votes"
class(prefit)
## [1] "factor"
head(test$Class)
## [1] malignant malignant benign benign malignant benign
## Levels: benign malignant
library(pROC)
# roc() takes the true labels and a numeric score for each observation.
# Use the predicted probability of the "malignant" class as the score.
# Converting the hard class labels to 1/2 (as.numeric(prefit)) leaves only a
# single usable threshold, so the "curve" collapses to one point and the AUC
# no longer reflects how well the model ranks cases.
prfit_num <- as.numeric(prefit1[, "malignant"])
head(prfit_num)
## [1] 0.998 0.454 0.000 0.234 0.894 0.002
# State the control/case levels and the direction explicitly rather than
# relying on pROC's automatic detection: benign = control, malignant = case,
# and cases are expected to have the higher score.
result <- roc(
  response = test$Class,
  predictor = prfit_num,
  levels = c("benign", "malignant"),
  direction = "<"
)
class(test$Class)
## [1] "factor"
class(prfit_num)
## [1] "numeric"
# legacy.axes = TRUE labels the x-axis as 1 - specificity (increasing).
plot(result, legacy.axes = TRUE)

result$auc
## NOTE(review): re-run to refresh this output. The value recorded earlier
## (Area under the curve: 0.9627) was computed from hard class labels.
# Element-wise comparison of predicted and true labels: TRUE where the
# prediction matches the actual class. The predictions are printed again for
# side-by-side inspection with the true labels.
same <- (prefit == test$Class)
print(prefit)
## 6 24 31 38 42 46 52 55
## malignant benign benign benign malignant benign malignant malignant
## 56 60 63 64 69 70 76 78
## malignant malignant malignant benign malignant benign benign benign
## 81 84 85 87 92 93 105 109
## benign benign malignant malignant benign benign malignant benign
## 112 118 120 122 130 136 144 146
## malignant malignant benign benign benign benign benign benign
## 147 148 150 158 173 174 177 183
## malignant benign malignant benign benign malignant benign benign
## 186 187 188 195 197 226 230 231
## benign malignant malignant benign malignant benign malignant malignant
## 238 245 257 265 268 276 278 281
## malignant benign benign malignant malignant benign benign benign
## 284 293 298 299 300 302 303 304
## malignant malignant benign benign malignant benign malignant benign
## 310 312 316 318 319 321 324 330
## benign benign malignant malignant benign malignant malignant malignant
## 333 337 339 342 343 345 347 351
## benign malignant benign benign benign malignant benign benign
## 355 356 358 359 360 366 372 376
## benign benign malignant malignant malignant benign benign benign
## 384 389 390 397 408 413 418 424
## benign benign benign benign benign malignant benign benign
## 428 429 432 434 440 442 444 446
## malignant benign benign benign benign benign benign benign
## 450 455 458 462 474 487 491 495
## malignant benign malignant benign benign benign benign malignant
## 498 501 503 505 508 512 531 544
## benign benign benign benign benign benign malignant benign
## 548 559 566 567 571 573 574 584
## benign benign malignant benign malignant benign benign benign
## 585 589 600 602 604 607 615 618
## benign malignant benign benign malignant benign benign benign
## 622 634 677
## benign malignant benign
## Levels: benign malignant
# True class labels of the test set, for comparison with the predictions.
print(test$Class)
## [1] malignant malignant benign benign malignant benign malignant
## [8] malignant malignant malignant malignant malignant malignant benign
## [15] benign benign benign benign malignant malignant benign
## [22] benign malignant benign malignant malignant benign benign
## [29] benign benign benign benign malignant benign malignant
## [36] benign benign malignant benign benign benign malignant
## [43] malignant benign benign benign malignant malignant malignant
## [50] benign benign malignant malignant benign benign benign
## [57] malignant malignant benign benign malignant benign malignant
## [64] benign benign benign benign malignant benign malignant
## [71] malignant malignant benign malignant benign benign benign
## [78] malignant benign benign benign benign malignant malignant
## [85] malignant benign benign benign benign benign benign
## [92] benign benign malignant benign benign malignant benign
## [99] benign benign benign benign benign benign malignant
## [106] benign malignant benign benign benign benign benign
## [113] benign benign benign benign benign benign malignant
## [120] benign benign benign malignant benign malignant benign
## [127] benign benign benign malignant benign benign malignant
## [134] benign benign benign benign malignant benign
## Levels: benign malignant
# Accuracy = number of correct predictions / number of predictions.
n_correct <- sum(same)
n_total <- length(same)
accuracy <- n_correct / n_total
print(accuracy)
## [1] 0.9640288