Model Construction

# Remove every object from the current environment, then list what is
# left to confirm the workspace is empty.
# NOTE(review): this clears objects only; attached packages and options
# set earlier in the session are not reset.
rm(list = ls())
ls()

## character(0)

library(dplyr)

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(caret)

## 필요한 패키지를 로딩중입니다: ggplot2

## 필요한 패키지를 로딩중입니다: lattice

# Show the working directory that is active before it is set explicitly.
getwd()

## [1] "C:/data"

# Working environment: all data work is done in the data folder.
setwd("c:/data")

# Read the diagnosis data and inspect column names and types.
df <- read.csv("diagnosis.csv")
glimpse(df)

## Rows: 569
## Columns: 32
## $ id                      <int> 842302, 842517, 84300903, 84348301, 84358402, …
## $ diagnosis               <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ radius_mean             <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ concave.points_mean     <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ concave.points_se       <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst         <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst         <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ concave.points_worst    <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…

# Remove the id column before modelling; every other column is kept.
df <- select(df, -id)
glimpse(df)

## Rows: 569
## Columns: 31
## $ diagnosis               <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ radius_mean             <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ concave.points_mean     <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ concave.points_se       <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst         <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst         <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ concave.points_worst    <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…

# Convert the outcome column from character to factor so it is handled
# as a two-class label in the modelling steps that follow.
df[["diagnosis"]] <- as.factor(df[["diagnosis"]])
glimpse(df)

## Rows: 569
## Columns: 31
## $ diagnosis               <fct> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M…
## $ radius_mean             <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ concave.points_mean     <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ concave.points_se       <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst         <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst         <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ concave.points_worst    <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…

# Number of rows in each outcome class.
table(df[["diagnosis"]])

## 
##   B   M 
## 357 212

# The same counts expressed as a share of all rows.
prop.table(table(df[["diagnosis"]]))

## 
##         B         M 
## 0.6274165 0.3725835

# Small example vector containing one missing value (NA), e.g. a
# non-response.
tt <- c(1, 2, 3, NA)

# is.na() returns TRUE at every position that holds a missing value.
is.na(tt)

## [1] FALSE FALSE FALSE  TRUE

# Load the airquality example data set into the workspace.
data(airquality)

# Per-column summary; the NA's row shows how many values are missing.
summary(airquality)

##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
##

# Number of missing values in each column of airquality.
airquality %>%
  is.na() %>%
  colSums()

##   Ozone Solar.R    Wind    Temp   Month     Day 
##      37       7       0       0       0       0

# The same check on the diagnosis data.
df %>%
  is.na() %>%
  colSums()

##               diagnosis             radius_mean            texture_mean 
##                       0                       0                       0 
##          perimeter_mean               area_mean         smoothness_mean 
##                       0                       0                       0 
##        compactness_mean          concavity_mean     concave.points_mean 
##                       0                       0                       0 
##           symmetry_mean  fractal_dimension_mean               radius_se 
##                       0                       0                       0 
##              texture_se            perimeter_se                 area_se 
##                       0                       0                       0 
##           smoothness_se          compactness_se            concavity_se 
##                       0                       0                       0 
##       concave.points_se             symmetry_se    fractal_dimension_se 
##                       0                       0                       0 
##            radius_worst           texture_worst         perimeter_worst 
##                       0                       0                       0 
##              area_worst        smoothness_worst       compactness_worst 
##                       0                       0                       0 
##         concavity_worst    concave.points_worst          symmetry_worst 
##                       0                       0                       0 
## fractal_dimension_worst 
##                       0

# Fix the random seed so the split below is reproducible.
set.seed(3)

# Sample 80% of the rows, balanced on the outcome, as the training set;
# list = FALSE returns the row indices as a matrix rather than a list.
inTraining <- createDataPartition(y = df$diagnosis, p = 0.8, list = FALSE)
training <- df[inTraining, ]
testing <- df[-inTraining, ]


# Rows and columns of the full data set.
dim(df)

## [1] 569  31

# Rows and columns of the training partition.
dim(training)

## [1] 456  31

# Rows and columns of the held-out test partition.
dim(testing)

## [1] 113  31

# Resampling set-up passed to train(): 5-fold cross-validation with class
# probabilities enabled, summarised by twoClassSummary (ROC, Sens, Spec).
# NOTE(review): method is "repeatedcv" but `repeats` is not given, and the
# fitted model reports "repeated 1 times" - confirm whether several
# repeats were intended.
my_trainControl <- trainControl(
  method = "repeatedcv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)


# Fit a random forest ("rf") for diagnosis on all other columns of the
# training set, resampled according to my_trainControl.
rffit <- train(
  diagnosis ~ .,
  data = training,
  method = "rf",
  trControl = my_trainControl,
  verbose = FALSE,
  # Specify which metric to optimize when choosing the tuning value.
  metric = "ROC"
)

# Print the resampling results and the tuning value that was selected.
rffit

## Random Forest 
## 
## 456 samples
##  30 predictor
##   2 classes: 'B', 'M' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 1 times) 
## Summary of sample sizes: 365, 365, 365, 365, 364 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##    2    0.9912032  0.9789474  0.9352941
##   16    0.9865218  0.9684815  0.9411765
##   30    0.9862638  0.9684815  0.9294118
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

# Predicted class label for every row of the test set.
predict(rffit, newdata = testing)

##   [1] M M M M M B M M M B M M B B B B B B M B B M B M B B B B B B B B M B B B M
##  [38] M M B M M B M M M B B M M B B B B B B M B B M B B M B B B B B M B B B M B
##  [75] B B B B B B M B B B B M B M B M M B M B B M B M B B B M B M M B B B B B B
## [112] B M
## Levels: B M

# Predicted class probabilities (one column per class) for the same rows.
predict(rffit, newdata = testing, type = "prob")

##         B     M
## 7   0.002 0.998
## 9   0.078 0.922
## 26  0.016 0.984
## 27  0.050 0.950
## 34  0.004 0.996
## 39  0.586 0.414
## 43  0.024 0.976
## 48  0.194 0.806
## 55  0.258 0.742
## 56  0.994 0.006
## 57  0.004 0.996
## 65  0.062 0.938
## 67  0.980 0.020
## 68  0.988 0.012
## 72  0.978 0.022
## 81  0.976 0.024
## 94  0.988 0.012
## 98  0.996 0.004
## 100 0.380 0.620
## 111 0.998 0.002
## 114 0.988 0.012
## 120 0.184 0.816
## 127 0.516 0.484
## 133 0.028 0.972
## 138 0.994 0.006
## 140 0.970 0.030
## 141 1.000 0.000
## 155 0.928 0.072
## 160 1.000 0.000
## 164 0.988 0.012
## 167 0.992 0.008
## 189 0.998 0.002
## 191 0.152 0.848
## 192 0.762 0.238
## 193 0.958 0.042
## 196 0.994 0.006
## 198 0.208 0.792
## 202 0.024 0.976
## 208 0.272 0.728
## 217 0.962 0.038
## 219 0.000 1.000
## 240 0.002 0.998
## 250 0.996 0.004
## 255 0.004 0.996
## 256 0.500 0.500
## 266 0.000 1.000
## 271 0.998 0.002
## 272 0.996 0.004
## 281 0.000 1.000
## 284 0.062 0.938
## 286 1.000 0.000
## 302 0.992 0.008
## 304 0.994 0.006
## 305 1.000 0.000
## 307 0.998 0.002
## 317 0.990 0.010
## 329 0.006 0.994
## 332 0.948 0.052
## 335 0.996 0.004
## 336 0.002 0.998
## 341 0.632 0.368
## 345 0.994 0.006
## 352 0.034 0.966
## 355 0.982 0.018
## 361 0.988 0.012
## 375 0.980 0.020
## 376 0.608 0.392
## 379 0.934 0.066
## 380 0.400 0.600
## 384 0.934 0.066
## 385 0.982 0.018
## 387 0.998 0.002
## 390 0.004 0.996
## 391 1.000 0.000
## 399 0.990 0.010
## 402 0.992 0.008
## 403 0.986 0.014
## 405 0.992 0.008
## 408 0.906 0.094
## 414 0.646 0.354
## 418 0.000 1.000
## 419 0.994 0.006
## 421 0.998 0.002
## 426 0.992 0.008
## 428 0.964 0.036
## 431 0.126 0.874
## 432 0.976 0.024
## 436 0.154 0.846
## 438 0.984 0.016
## 442 0.038 0.962
## 445 0.094 0.906
## 456 0.938 0.062
## 461 0.008 0.992
## 473 0.722 0.278
## 479 0.994 0.006
## 480 0.058 0.942
## 482 0.948 0.052
## 485 0.408 0.592
## 496 0.890 0.110
## 497 0.816 0.184
## 501 0.658 0.342
## 504 0.014 0.986
## 506 0.798 0.202
## 513 0.066 0.934
## 517 0.002 0.998
## 526 0.982 0.018
## 527 0.764 0.236
## 535 0.988 0.012
## 540 0.932 0.068
## 542 0.530 0.470
## 554 0.990 0.010
## 555 0.912 0.088
## 568 0.000 1.000

# Store the test-set predictions in their own object. They were previously
# assigned to `df`, which replaced the loaded data frame with a factor.
pred <- predict(rffit, newdata = testing)
pred

##   [1] M M M M M B M M M B M M B B B B B B M B B M B M B B B B B B B B M B B B M
##  [38] M M B M M B M M M B B M M B B B B B B M B B M B B M B B B B B M B B B M B
##  [75] B B B B B B M B B B B M B M B M M B M B B M B M B B B M B M M B B B B B B
## [112] B M
## Levels: B M

# Observed classes of the same test rows, for comparison with `pred`.
testing$diagnosis

##   [1] M M M M M M M M M B M M B B B B B B M B B M M M B B B B B B B B M B B B M
##  [38] M M B M M B M M M B B M M B B B B B B M B B M B B M B B B B B M B B B M B
##  [75] B B B B B B M B B B B M B M B M M B M B B M B B B B B M B M M B B B B B B
## [112] B M
## Levels: B M

# confusionMatrix() takes the predictions as `data` and the observed
# classes as `reference`. Naming both arguments keeps the table from being
# transposed, so the "Prediction" rows really are the model's output.
# The positive class defaults to the first factor level ("B"); pass
# positive = "M" to report sensitivity/specificity for class M instead.
confusionMatrix(data = pred, reference = testing$diagnosis)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  B  M
##          B 70  2
##          M  1 40
##                                           
##                Accuracy : 0.9735          
##                  95% CI : (0.9244, 0.9945)
##     No Information Rate : 0.6283          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9429          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9859          
##             Specificity : 0.9524          
##          Pos Pred Value : 0.9722          
##          Neg Pred Value : 0.9756          
##              Prevalence : 0.6283          
##          Detection Rate : 0.6195          
##    Detection Prevalence : 0.6372          
##       Balanced Accuracy : 0.9691          
##                                           
##        'Positive' Class : B               
##

# Hand-check of the accuracy reported by confusionMatrix():
# accuracy = correctly classified test rows / all test rows.
# The counts are the four cells of the confusion matrix printed above:
# 70 and 40 on the diagonal (correct), 1 and 2 off the diagonal (wrong).
d <- 70 + 1 + 2 + 40
(70 + 40) / d

## [1] 0.9734513

Model Construction_01

Kim, Dong-Hyun

2023-10-21