# Remove every object from the current environment, then list it to confirm
# that nothing is left.
# NOTE(review): this does not detach packages or reset options(); restarting
# the R session is the only way to get a truly clean state.
rm(list = ls())
ls()
## character(0)
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## 필요한 패키지를 로딩중입니다: ggplot2
## 필요한 패키지를 로딩중입니다: lattice
# Show the working directory before it is changed.
getwd()
## [1] "C:/data"
# Working environment: all data work in this script happens in the data folder.
setwd("c:/data")
# Read the diagnosis CSV from the working directory and inspect its columns.
df <- read.csv(file = "diagnosis.csv")
glimpse(df)
## Rows: 569
## Columns: 32
## $ id <int> 842302, 842517, 84300903, 84348301, 84358402, …
## $ diagnosis <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ radius_mean <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ concave.points_mean <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ concave.points_se <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ concave.points_worst <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…
# Remove the `id` column, then re-inspect the remaining columns.
df <- select(df, -id)
glimpse(df)
## Rows: 569
## Columns: 31
## $ diagnosis <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ radius_mean <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ concave.points_mean <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ concave.points_se <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ concave.points_worst <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…
# Convert the outcome column from character to factor, then confirm the new
# column type.
df[["diagnosis"]] <- as.factor(df[["diagnosis"]])
glimpse(df)
## Rows: 569
## Columns: 31
## $ diagnosis <fct> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M…
## $ radius_mean <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ concave.points_mean <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ concave.points_se <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ concave.points_worst <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…
# Number of rows in each outcome class.
table(df[["diagnosis"]])
##
## B M
## 357 212
# The same counts expressed as a share of all rows.
prop.table(table(df[["diagnosis"]]))
##
## B M
## 0.6274165 0.3725835
# Small example vector whose last element is a missing value (non-response).
tt <- c(1, 2, 3, NA)
# is.na() returns TRUE at every position that holds a missing value.
is.na(tt)
## [1] FALSE FALSE FALSE TRUE
# Load the built-in airquality data set and print per-column summaries;
# summary() also reports the NA count of each column that has any.
data(airquality)
summary(airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
# Missing values per column: flag each cell with is.na(), then add up the
# flags column by column.
airquality %>%
  is.na() %>%
  colSums()
## Ozone Solar.R Wind Temp Month Day
## 37 7 0 0 0 0
# Same check on the diagnosis data.
df %>%
  is.na() %>%
  colSums()
## diagnosis radius_mean texture_mean
## 0 0 0
## perimeter_mean area_mean smoothness_mean
## 0 0 0
## compactness_mean concavity_mean concave.points_mean
## 0 0 0
## symmetry_mean fractal_dimension_mean radius_se
## 0 0 0
## texture_se perimeter_se area_se
## 0 0 0
## smoothness_se compactness_se concavity_se
## 0 0 0
## concave.points_se symmetry_se fractal_dimension_se
## 0 0 0
## radius_worst texture_worst perimeter_worst
## 0 0 0
## area_worst smoothness_worst compactness_worst
## 0 0 0
## concavity_worst concave.points_worst symmetry_worst
## 0 0 0
## fractal_dimension_worst
## 0
# Fix the RNG seed so the random split below is reproducible.
set.seed(3)
# Sample 80% of the rows for training based on the outcome column;
# list = FALSE returns the selected row indices as a matrix, not a list.
inTraining <- createDataPartition(y = df$diagnosis, p = 0.8, list = FALSE)
# Selected rows form the training set; all remaining rows form the test set.
training <- df[inTraining, , drop = FALSE]
testing <- df[-inTraining, , drop = FALSE]
# Row and column counts of the full, training and testing data.
dim(df)
## [1] 569 31
dim(training)
## [1] 456 31
dim(testing)
## [1] 113 31
# Resampling settings for model fitting: 5 folds, with class probabilities
# enabled so that twoClassSummary can report ROC, Sens and Spec.
# NOTE(review): "repeatedcv" runs with a single repeat here, which is the same
# scheme as plain 5-fold CV; raise `repeats` if repeated CV was intended.
my_trainControl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 1,
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)
# Fit a random forest ("rf") on the training data with every other column as
# a predictor, using the resampling settings in my_trainControl.
rffit <- train(
  diagnosis ~ .,
  data = training,
  method = "rf",
  metric = "ROC", # specify which metric to optimize
  trControl = my_trainControl,
  verbose = FALSE
)
# Print the resampling results and the selected tuning value.
rffit
## Random Forest
##
## 456 samples
## 30 predictor
## 2 classes: 'B', 'M'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 1 times)
## Summary of sample sizes: 365, 365, 365, 365, 364
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.9912032 0.9789474 0.9352941
## 16 0.9865218 0.9684815 0.9411765
## 30 0.9862638 0.9684815 0.9294118
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Predicted class label for every row of the test set.
predict(rffit, newdata = testing)
## [1] M M M M M B M M M B M M B B B B B B M B B M B M B B B B B B B B M B B B M
## [38] M M B M M B M M M B B M M B B B B B B M B B M B B M B B B B B M B B B M B
## [75] B B B B B B M B B B B M B M B M M B M B B M B M B B B M B M M B B B B B B
## [112] B M
## Levels: B M
# Predicted probability of each class (columns B and M) for the same rows.
predict(rffit, newdata = testing, type = "prob")
## B M
## 7 0.002 0.998
## 9 0.078 0.922
## 26 0.016 0.984
## 27 0.050 0.950
## 34 0.004 0.996
## 39 0.586 0.414
## 43 0.024 0.976
## 48 0.194 0.806
## 55 0.258 0.742
## 56 0.994 0.006
## 57 0.004 0.996
## 65 0.062 0.938
## 67 0.980 0.020
## 68 0.988 0.012
## 72 0.978 0.022
## 81 0.976 0.024
## 94 0.988 0.012
## 98 0.996 0.004
## 100 0.380 0.620
## 111 0.998 0.002
## 114 0.988 0.012
## 120 0.184 0.816
## 127 0.516 0.484
## 133 0.028 0.972
## 138 0.994 0.006
## 140 0.970 0.030
## 141 1.000 0.000
## 155 0.928 0.072
## 160 1.000 0.000
## 164 0.988 0.012
## 167 0.992 0.008
## 189 0.998 0.002
## 191 0.152 0.848
## 192 0.762 0.238
## 193 0.958 0.042
## 196 0.994 0.006
## 198 0.208 0.792
## 202 0.024 0.976
## 208 0.272 0.728
## 217 0.962 0.038
## 219 0.000 1.000
## 240 0.002 0.998
## 250 0.996 0.004
## 255 0.004 0.996
## 256 0.500 0.500
## 266 0.000 1.000
## 271 0.998 0.002
## 272 0.996 0.004
## 281 0.000 1.000
## 284 0.062 0.938
## 286 1.000 0.000
## 302 0.992 0.008
## 304 0.994 0.006
## 305 1.000 0.000
## 307 0.998 0.002
## 317 0.990 0.010
## 329 0.006 0.994
## 332 0.948 0.052
## 335 0.996 0.004
## 336 0.002 0.998
## 341 0.632 0.368
## 345 0.994 0.006
## 352 0.034 0.966
## 355 0.982 0.018
## 361 0.988 0.012
## 375 0.980 0.020
## 376 0.608 0.392
## 379 0.934 0.066
## 380 0.400 0.600
## 384 0.934 0.066
## 385 0.982 0.018
## 387 0.998 0.002
## 390 0.004 0.996
## 391 1.000 0.000
## 399 0.990 0.010
## 402 0.992 0.008
## 403 0.986 0.014
## 405 0.992 0.008
## 408 0.906 0.094
## 414 0.646 0.354
## 418 0.000 1.000
## 419 0.994 0.006
## 421 0.998 0.002
## 426 0.992 0.008
## 428 0.964 0.036
## 431 0.126 0.874
## 432 0.976 0.024
## 436 0.154 0.846
## 438 0.984 0.016
## 442 0.038 0.962
## 445 0.094 0.906
## 456 0.938 0.062
## 461 0.008 0.992
## 473 0.722 0.278
## 479 0.994 0.006
## 480 0.058 0.942
## 482 0.948 0.052
## 485 0.408 0.592
## 496 0.890 0.110
## 497 0.816 0.184
## 501 0.658 0.342
## 504 0.014 0.986
## 506 0.798 0.202
## 513 0.066 0.934
## 517 0.002 0.998
## 526 0.982 0.018
## 527 0.764 0.236
## 535 0.988 0.012
## 540 0.932 0.068
## 542 0.530 0.470
## 554 0.990 0.010
## 555 0.912 0.088
## 568 0.000 1.000
# Predicted classes for the test set. They are stored under their own name so
# that the `df` data frame is not overwritten by a factor of predictions.
pred <- predict(rffit, newdata = testing)
pred
## [1] M M M M M B M M M B M M B B B B B B M B B M B M B B B B B B B B M B B B M
## [38] M M B M M B M M M B B M M B B B B B B M B B M B B M B B B B B M B B B M B
## [75] B B B B B B M B B B B M B M B M M B M B B M B M B B B M B M M B B B B B B
## [112] B M
## Levels: B M
# Observed classes for the same rows, for side-by-side comparison.
testing$diagnosis
## [1] M M M M M M M M M B M M B B B B B B M B B M M M B B B B B B B B M B B B M
## [38] M M B M M B M M M B B M M B B B B B B M B B M B B M B B B B B M B B B M B
## [75] B B B B B B M B B B B M B M B M M B M B B M B B B B B M B M M B B B B B B
## [112] B M
## Levels: B M
# confusionMatrix() expects the predictions as `data` and the observed classes
# as `reference`. Passing them the other way round transposes the table and
# computes sensitivity, specificity, PPV/NPV, prevalence and the
# no-information rate against the wrong axis.
# NOTE(review): the positive class defaults to the first level ("B"); pass
# positive = "M" if M is the class of interest - confirm with the analyst.
# NOTE(review): the output below was recomputed by hand for the corrected
# argument order from the counts of the original run - re-run to confirm.
confusionMatrix(data = pred, reference = testing$diagnosis)
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 70 2
## M 1 40
##
## Accuracy : 0.9735
## 95% CI : (0.9244, 0.9945)
## No Information Rate : 0.6283
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9429
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9859
## Specificity : 0.9524
## Pos Pred Value : 0.9722
## Neg Pred Value : 0.9756
## Prevalence : 0.6283
## Detection Rate : 0.6195
## Detection Prevalence : 0.6372
## Balanced Accuracy : 0.9691
##
## 'Positive' Class : B
##
# Manual accuracy check against the confusion matrix printed above:
# correct predictions (70 B + 40 M) divided by all 113 test rows.
# The cell counts are 70, 1, 2 and 40; the previous hand-typed values
# (4 and 38) did not match the matrix and gave 0.9558 instead of the
# reported accuracy of 0.9735.
d <- (70 + 1 + 2 + 40)
(70 + 40) / d
## [1] 0.9734513