#유방암 데이터 RandomForest 분석
library(dplyr)
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## 필요한 패키지를 로딩중입니다: ggplot2
## 필요한 패키지를 로딩중입니다: lattice
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## 다음의 패키지를 부착합니다: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
df<-read.csv('BreastCancer.csv')
glimpse(df)
## Rows: 569
## Columns: 33
## $ id                      <int> 842302, 842517, 84300903, 84348301, 84358402, …
## $ diagnosis               <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ radius_mean             <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ concave.points_mean     <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ concave.points_se       <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst         <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst         <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ concave.points_worst    <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…
## $ X                       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
colSums(is.na(df))
##                      id               diagnosis             radius_mean 
##                       0                       0                       0 
##            texture_mean          perimeter_mean               area_mean 
##                       0                       0                       0 
##         smoothness_mean        compactness_mean          concavity_mean 
##                       0                       0                       0 
##     concave.points_mean           symmetry_mean  fractal_dimension_mean 
##                       0                       0                       0 
##               radius_se              texture_se            perimeter_se 
##                       0                       0                       0 
##                 area_se           smoothness_se          compactness_se 
##                       0                       0                       0 
##            concavity_se       concave.points_se             symmetry_se 
##                       0                       0                       0 
##    fractal_dimension_se            radius_worst           texture_worst 
##                       0                       0                       0 
##         perimeter_worst              area_worst        smoothness_worst 
##                       0                       0                       0 
##       compactness_worst         concavity_worst    concave.points_worst 
##                       0                       0                       0 
##          symmetry_worst fractal_dimension_worst                       X 
##                       0                       0                     569
df$diagnosis<-as.factor(df$diagnosis)
glimpse(df)
## Rows: 569
## Columns: 33
## $ id                      <int> 842302, 842517, 84300903, 84348301, 84358402, …
## $ diagnosis               <fct> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M…
## $ radius_mean             <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ concave.points_mean     <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ concave.points_se       <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst         <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst         <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ concave.points_worst    <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…
## $ X                       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
df1<-df %>% select(diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean)
train_list <- createDataPartition(y=df1$diagnosis,p=0.8,list=FALSE)
train_df<-df1[train_list,]
test_df<-df1[-train_list,]
ntree<-c(400,500,600,700,800)
mtry<-c(2:6)
param<-data.frame(n=ntree,m=mtry)
for(i in param$n){
  cat('ntree=','\n')
  for(j in param$m){
    cat('mtry=','\n')
    model<-randomForest(diagnosis~.,data=df1,ntree=i,mtry=j)
  }
}
## ntree= 
## mtry= 
## mtry= 
## mtry= 
## mtry= 
## mtry= 
## ntree= 
## mtry= 
## mtry= 
## mtry= 
## mtry= 
## mtry= 
## ntree= 
## mtry= 
## mtry= 
## mtry= 
## mtry= 
## mtry= 
## ntree= 
## mtry= 
## mtry= 
## mtry= 
## mtry= 
## mtry= 
## ntree= 
## mtry= 
## mtry= 
## mtry= 
## mtry= 
## mtry=
model
## 
## Call:
##  randomForest(formula = diagnosis ~ ., data = df1, ntree = i,      mtry = j) 
##                Type of random forest: classification
##                      Number of trees: 800
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 6.68%
## Confusion matrix:
##     B   M class.error
## B 343  14  0.03921569
## M  24 188  0.11320755
model
## 
## Call:
##  randomForest(formula = diagnosis ~ ., data = df1, ntree = i,      mtry = j) 
##                Type of random forest: classification
##                      Number of trees: 800
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 6.68%
## Confusion matrix:
##     B   M class.error
## B 343  14  0.03921569
## M  24 188  0.11320755
result_model<-train(diagnosis~.,data=train_df,
                    preProcess=c("scale","center"),
                    method="rf", ntree=700,
                    trControl=trainControl(method='cv',number = 10, repeats=3))
## Warning: `repeats` has no meaning for this resampling method.
result_model
## Random Forest 
## 
## 456 samples
##   6 predictor
##   2 classes: 'B', 'M' 
## 
## Pre-processing: scaled (6), centered (6) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 411, 410, 410, 411, 410, 411, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9274879  0.8430467
##   4     0.9296618  0.8468287
##   6     0.9252174  0.8363513
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
confusionMatrix(
  predict(result_model,newdata = test_df,type='raw'),test_df$diagnosis
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  B  M
##          B 70  6
##          M  1 36
##                                           
##                Accuracy : 0.9381          
##                  95% CI : (0.8765, 0.9747)
##     No Information Rate : 0.6283          
##     P-Value [Acc > NIR] : 1.718e-14       
##                                           
##                   Kappa : 0.8641          
##                                           
##  Mcnemar's Test P-Value : 0.1306          
##                                           
##             Sensitivity : 0.9859          
##             Specificity : 0.8571          
##          Pos Pred Value : 0.9211          
##          Neg Pred Value : 0.9730          
##              Prevalence : 0.6283          
##          Detection Rate : 0.6195          
##    Detection Prevalence : 0.6726          
##       Balanced Accuracy : 0.9215          
##                                           
##        'Positive' Class : B               
##