# Show which objects already exist in the workspace before the analysis starts.
ls()
## character(0)
# NOTE: the workspace is deliberately not wiped with rm(list = ls()) here.
# Clearing the global environment from inside a script silently destroys the
# caller's objects; every object used below is created by this script itself.
# Show the working directory; the relative path "wbc.csv" is resolved against it.
getwd()
## [1] "D:/data"
#암 진단 데이터입니다. 양성과 악성을 분류하는 데이터입니다.
#유의사항: 독립변수 중 id, X 데이터는 제외한다. 데이터 분할은 7:3으로 한다.
#train/test로 분할한 뒤 test 데이터의 ROC_AUC 결과를 rmarkdown으로 작성하여 단톡방에 6월 18일까지 제출해주세요!
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.9
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.0     v forcats 0.5.1
## Warning: 패키지 'tidyr'는 R 버전 4.1.3에서 작성되었습니다
## Warning: 패키지 'dplyr'는 R 버전 4.1.3에서 작성되었습니다
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## 필요한 패키지를 로딩중입니다: lattice
## 
## 다음의 패키지를 부착합니다: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(recipes)
## Warning: 패키지 'recipes'는 R 버전 4.1.3에서 작성되었습니다
## 
## 다음의 패키지를 부착합니다: 'recipes'
## The following object is masked from 'package:stringr':
## 
##     fixed
## The following object is masked from 'package:stats':
## 
##     step
library(proxy)
## 
## 다음의 패키지를 부착합니다: 'proxy'
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## 다음의 패키지를 부착합니다: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Load the breast-cancer diagnosis data from the working directory.
df <- read.csv("wbc.csv")

# Column layout:
# 1) ID number, 2) diagnosis (M = malignant, B = benign),
# 3-32) ten real-valued features computed for each cell nucleus:
# a) radius (mean distance from the center to points on the perimeter)
# b) texture (standard deviation of gray-scale values) c) perimeter
# d) area e) smoothness (local variation in radius lengths)
# f) compactness (perimeter^2 / area - 1.0)
# g) concavity (severity of concave portions of the contour)
# h) concave points (number of concave portions of the contour)
# i) symmetry, j) fractal dimension ("coastline approximation" - 1)

# Fix the RNG state so the 70/30 split (and every result derived from it)
# is identical each time the document is knitted.
set.seed(2022)
# Draw row indices for the training partition (70% of the rows), sampled
# within the levels of `diagnosis`. list = FALSE returns a one-column matrix.
train_list <- createDataPartition(y = df$diagnosis, p = 0.7, list = FALSE)
head(train_list)
##      Resample1
## [1,]         1
## [2,]         2
## [3,]         3
## [4,]         6
## [5,]         7
## [6,]         8
# Rows listed in train_list form the training frame; all remaining rows
# form the test frame.
df_train <- df[train_list, ]
df_test <- df[-train_list, ]
# Report the size of each partition.
nrow(df_train)
## [1] 399
nrow(df_test)
## [1] 170
# Column-wise overview of the training frame.
glimpse(df_train)
## Rows: 399
## Columns: 33
## $ id                      <int> 842302, 842517, 84300903, 843786, 844359, 8445~
## $ diagnosis               <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "~
## $ radius_mean             <dbl> 17.99, 20.57, 19.69, 12.45, 18.25, 13.71, 13.0~
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 15.70, 19.98, 20.83, 21.8~
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 82.57, 119.60, 90.20, ~
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 477.1, 1040.0, 577.9, ~
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.12780, 0.09463, 0~
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.17000, 0.10900, 0~
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.15780, 0.11270, 0~
## $ concave.points_mean     <dbl> 0.14710, 0.07017, 0.12790, 0.08089, 0.07400, 0~
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2087, 0.1794, 0.2196~
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.07613, 0.05742, 0~
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.3345, 0.4467, 0.5835~
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 0.8902, 0.7732, 1.3770~
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 2.217, 3.180, 3.856, 2.40~
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.19, 53.91, 50.96, 24.~
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.007510, 0.0043~
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.033450, 0.0138~
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.03672, 0.02254, 0~
## $ concave.points_se       <dbl> 0.015870, 0.013400, 0.020580, 0.011370, 0.0103~
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.02165, 0.01369, 0~
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.005082, 0.0021~
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 15.47, 22.88, 17.06, 15.4~
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 23.75, 27.66, 28.14, 30.7~
## $ perimeter_worst         <dbl> 184.6, 158.8, 152.5, 103.4, 153.2, 110.6, 106.~
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 741.6, 1606.0, 897.0, ~
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.1791, 0.1442, 0.1654~
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.5249, 0.2576, 0.3682~
## $ concavity_worst         <dbl> 0.7119, 0.2416, 0.4504, 0.5355, 0.3784, 0.2678~
## $ concave.points_worst    <dbl> 0.26540, 0.18600, 0.24300, 0.17410, 0.19320, 0~
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.3985, 0.3063, 0.3196~
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.12440, 0.08368, 0~
## $ X                       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
# Column-wise overview of the test frame.
glimpse(df_test)
## Rows: 170
## Columns: 33
## $ id                      <int> 84348301, 84358402, 84501001, 84862001, 851065~
## $ diagnosis               <chr> "M", "M", "M", "M", "B", "B", "M", "M", "M", "~
## $ radius_mean             <dbl> 11.420, 20.290, 12.460, 16.130, 13.080, 9.504,~
## $ texture_mean            <dbl> 20.38, 14.34, 24.04, 20.68, 15.71, 12.44, 21.3~
## $ perimeter_mean          <dbl> 77.58, 135.10, 83.97, 108.10, 85.63, 60.34, 11~
## $ area_mean               <dbl> 386.1, 1297.0, 475.9, 798.8, 520.0, 273.9, 904~
## $ smoothness_mean         <dbl> 0.14250, 0.10030, 0.11860, 0.11700, 0.10750, 0~
## $ compactness_mean        <dbl> 0.28390, 0.13280, 0.23960, 0.20220, 0.12700, 0~
## $ concavity_mean          <dbl> 0.241400, 0.198000, 0.227300, 0.172200, 0.0456~
## $ concave.points_mean     <dbl> 0.105200, 0.104300, 0.085430, 0.102800, 0.0311~
## $ symmetry_mean           <dbl> 0.2597, 0.1809, 0.2030, 0.2164, 0.1967, 0.1815~
## $ fractal_dimension_mean  <dbl> 0.09744, 0.05883, 0.08243, 0.07356, 0.06811, 0~
## $ radius_se               <dbl> 0.4956, 0.7572, 0.2976, 0.5692, 0.1852, 0.2773~
## $ texture_se              <dbl> 1.1560, 0.7813, 1.5990, 1.0730, 0.7477, 0.9768~
## $ perimeter_se            <dbl> 3.445, 5.438, 2.039, 3.854, 1.383, 1.909, 5.45~
## $ area_se                 <dbl> 27.230, 94.440, 23.940, 54.180, 14.670, 15.700~
## $ smoothness_se           <dbl> 0.009110, 0.011490, 0.007149, 0.007026, 0.0040~
## $ compactness_se          <dbl> 0.074580, 0.024610, 0.072170, 0.025010, 0.0189~
## $ concavity_se            <dbl> 0.056610, 0.056880, 0.077430, 0.031880, 0.0169~
## $ concave.points_se       <dbl> 0.018670, 0.018850, 0.014320, 0.012970, 0.0064~
## $ symmetry_se             <dbl> 0.059630, 0.017560, 0.017890, 0.016890, 0.0167~
## $ fractal_dimension_se    <dbl> 0.009208, 0.005115, 0.010080, 0.004142, 0.0024~
## $ radius_worst            <dbl> 14.910, 22.540, 15.090, 20.960, 14.500, 10.230~
## $ texture_worst           <dbl> 26.50, 16.67, 40.68, 31.48, 20.49, 15.66, 31.5~
## $ perimeter_worst         <dbl> 98.87, 152.20, 97.65, 136.80, 96.09, 65.13, 17~
## $ area_worst              <dbl> 567.7, 1575.0, 711.4, 1315.0, 630.5, 314.9, 22~
## $ smoothness_worst        <dbl> 0.20980, 0.13740, 0.18530, 0.17890, 0.13120, 0~
## $ compactness_worst       <dbl> 0.86630, 0.20500, 1.05800, 0.42330, 0.27760, 0~
## $ concavity_worst         <dbl> 0.686900, 0.400000, 1.105000, 0.478400, 0.1890~
## $ concave.points_worst    <dbl> 0.25750, 0.16250, 0.22100, 0.20730, 0.07283, 0~
## $ symmetry_worst          <dbl> 0.6638, 0.2364, 0.4366, 0.3706, 0.3184, 0.2450~
## $ fractal_dimension_worst <dbl> 0.17300, 0.07678, 0.20750, 0.11420, 0.08183, 0~
## $ X                       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
# Label every row with the partition it came from so the two sets can be
# separated again after they have been stacked.
df_train <- mutate(df_train, index = "train")
df_test <- mutate(df_test, index = "test")
# Stack both partitions and drop the id and X columns, which the assignment
# excludes from the predictors.
full <- bind_rows(df_train, df_test) %>%
  select(-id, -X)
glimpse(full)
## Rows: 569
## Columns: 32
## $ diagnosis               <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "~
## $ radius_mean             <dbl> 17.99, 20.57, 19.69, 12.45, 18.25, 13.71, 13.0~
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 15.70, 19.98, 20.83, 21.8~
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 82.57, 119.60, 90.20, ~
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 477.1, 1040.0, 577.9, ~
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.12780, 0.09463, 0~
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.17000, 0.10900, 0~
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.15780, 0.11270, 0~
## $ concave.points_mean     <dbl> 0.14710, 0.07017, 0.12790, 0.08089, 0.07400, 0~
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2087, 0.1794, 0.2196~
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.07613, 0.05742, 0~
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.3345, 0.4467, 0.5835~
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 0.8902, 0.7732, 1.3770~
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 2.217, 3.180, 3.856, 2.40~
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.19, 53.91, 50.96, 24.~
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.007510, 0.0043~
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.033450, 0.0138~
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.03672, 0.02254, 0~
## $ concave.points_se       <dbl> 0.015870, 0.013400, 0.020580, 0.011370, 0.0103~
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.02165, 0.01369, 0~
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.005082, 0.0021~
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 15.47, 22.88, 17.06, 15.4~
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 23.75, 27.66, 28.14, 30.7~
## $ perimeter_worst         <dbl> 184.6, 158.8, 152.5, 103.4, 153.2, 110.6, 106.~
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 741.6, 1606.0, 897.0, ~
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.1791, 0.1442, 0.1654~
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.5249, 0.2576, 0.3682~
## $ concavity_worst         <dbl> 0.7119, 0.2416, 0.4504, 0.5355, 0.3784, 0.2678~
## $ concave.points_worst    <dbl> 0.26540, 0.18600, 0.24300, 0.17410, 0.19320, 0~
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.3985, 0.3063, 0.3196~
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.12440, 0.08368, 0~
## $ index                   <chr> "train", "train", "train", "train", "train", "~
# caret's classification models need the outcome as a factor, not character.
full <- full %>%
  mutate(diagnosis = as.factor(diagnosis))
glimpse(full)
## Rows: 569
## Columns: 32
## $ diagnosis               <fct> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, B~
## $ radius_mean             <dbl> 17.99, 20.57, 19.69, 12.45, 18.25, 13.71, 13.0~
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 15.70, 19.98, 20.83, 21.8~
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 82.57, 119.60, 90.20, ~
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 477.1, 1040.0, 577.9, ~
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.12780, 0.09463, 0~
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.17000, 0.10900, 0~
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.15780, 0.11270, 0~
## $ concave.points_mean     <dbl> 0.14710, 0.07017, 0.12790, 0.08089, 0.07400, 0~
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2087, 0.1794, 0.2196~
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.07613, 0.05742, 0~
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.3345, 0.4467, 0.5835~
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 0.8902, 0.7732, 1.3770~
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 2.217, 3.180, 3.856, 2.40~
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.19, 53.91, 50.96, 24.~
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.007510, 0.0043~
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.033450, 0.0138~
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.03672, 0.02254, 0~
## $ concave.points_se       <dbl> 0.015870, 0.013400, 0.020580, 0.011370, 0.0103~
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.02165, 0.01369, 0~
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.005082, 0.0021~
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 15.47, 22.88, 17.06, 15.4~
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 23.75, 27.66, 28.14, 30.7~
## $ perimeter_worst         <dbl> 184.6, 158.8, 152.5, 103.4, 153.2, 110.6, 106.~
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 741.6, 1606.0, 897.0, ~
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.1791, 0.1442, 0.1654~
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.5249, 0.2576, 0.3682~
## $ concavity_worst         <dbl> 0.7119, 0.2416, 0.4504, 0.5355, 0.3784, 0.2678~
## $ concave.points_worst    <dbl> 0.26540, 0.18600, 0.24300, 0.17410, 0.19320, 0~
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.3985, 0.3063, 0.3196~
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.12440, 0.08368, 0~
## $ index                   <chr> "train", "train", "train", "train", "train", "~
# Number of missing values in each column.
full %>%
  is.na() %>%
  colSums()
##               diagnosis             radius_mean            texture_mean 
##                       0                       0                       0 
##          perimeter_mean               area_mean         smoothness_mean 
##                       0                       0                       0 
##        compactness_mean          concavity_mean     concave.points_mean 
##                       0                       0                       0 
##           symmetry_mean  fractal_dimension_mean               radius_se 
##                       0                       0                       0 
##              texture_se            perimeter_se                 area_se 
##                       0                       0                       0 
##           smoothness_se          compactness_se            concavity_se 
##                       0                       0                       0 
##       concave.points_se             symmetry_se    fractal_dimension_se 
##                       0                       0                       0 
##            radius_worst           texture_worst         perimeter_worst 
##                       0                       0                       0 
##              area_worst        smoothness_worst       compactness_worst 
##                       0                       0                       0 
##         concavity_worst    concave.points_worst          symmetry_worst 
##                       0                       0                       0 
## fractal_dimension_worst                   index 
##                       0                       0
# Per-column tally of the missing / non-missing indicator.
full %>%
  is.na() %>%
  summary()
##  diagnosis       radius_mean     texture_mean    perimeter_mean 
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:569       FALSE:569       FALSE:569       FALSE:569      
##  area_mean       smoothness_mean compactness_mean concavity_mean 
##  Mode :logical   Mode :logical   Mode :logical    Mode :logical  
##  FALSE:569       FALSE:569       FALSE:569        FALSE:569      
##  concave.points_mean symmetry_mean   fractal_dimension_mean radius_se      
##  Mode :logical       Mode :logical   Mode :logical          Mode :logical  
##  FALSE:569           FALSE:569       FALSE:569              FALSE:569      
##  texture_se      perimeter_se     area_se        smoothness_se  
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:569       FALSE:569       FALSE:569       FALSE:569      
##  compactness_se  concavity_se    concave.points_se symmetry_se    
##  Mode :logical   Mode :logical   Mode :logical     Mode :logical  
##  FALSE:569       FALSE:569       FALSE:569         FALSE:569      
##  fractal_dimension_se radius_worst    texture_worst   perimeter_worst
##  Mode :logical        Mode :logical   Mode :logical   Mode :logical  
##  FALSE:569            FALSE:569       FALSE:569       FALSE:569      
##  area_worst      smoothness_worst compactness_worst concavity_worst
##  Mode :logical   Mode :logical    Mode :logical     Mode :logical  
##  FALSE:569       FALSE:569        FALSE:569         FALSE:569      
##  concave.points_worst symmetry_worst  fractal_dimension_worst   index        
##  Mode :logical        Mode :logical   Mode :logical           Mode :logical  
##  FALSE:569            FALSE:569       FALSE:569               FALSE:569
# List the column names of the combined frame.
colnames(full)
##  [1] "diagnosis"               "radius_mean"            
##  [3] "texture_mean"            "perimeter_mean"         
##  [5] "area_mean"               "smoothness_mean"        
##  [7] "compactness_mean"        "concavity_mean"         
##  [9] "concave.points_mean"     "symmetry_mean"          
## [11] "fractal_dimension_mean"  "radius_se"              
## [13] "texture_se"              "perimeter_se"           
## [15] "area_se"                 "smoothness_se"          
## [17] "compactness_se"          "concavity_se"           
## [19] "concave.points_se"       "symmetry_se"            
## [21] "fractal_dimension_se"    "radius_worst"           
## [23] "texture_worst"           "perimeter_worst"        
## [25] "area_worst"              "smoothness_worst"       
## [27] "compactness_worst"       "concavity_worst"        
## [29] "concave.points_worst"    "symmetry_worst"         
## [31] "fractal_dimension_worst" "index"
# Column-wise overview of the combined frame before preprocessing.
glimpse(full)
## Rows: 569
## Columns: 32
## $ diagnosis               <fct> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, B~
## $ radius_mean             <dbl> 17.99, 20.57, 19.69, 12.45, 18.25, 13.71, 13.0~
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 15.70, 19.98, 20.83, 21.8~
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 82.57, 119.60, 90.20, ~
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 477.1, 1040.0, 577.9, ~
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.12780, 0.09463, 0~
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.17000, 0.10900, 0~
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.15780, 0.11270, 0~
## $ concave.points_mean     <dbl> 0.14710, 0.07017, 0.12790, 0.08089, 0.07400, 0~
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2087, 0.1794, 0.2196~
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.07613, 0.05742, 0~
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.3345, 0.4467, 0.5835~
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 0.8902, 0.7732, 1.3770~
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 2.217, 3.180, 3.856, 2.40~
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.19, 53.91, 50.96, 24.~
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.007510, 0.0043~
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.033450, 0.0138~
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.03672, 0.02254, 0~
## $ concave.points_se       <dbl> 0.015870, 0.013400, 0.020580, 0.011370, 0.0103~
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.02165, 0.01369, 0~
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.005082, 0.0021~
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 15.47, 22.88, 17.06, 15.4~
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 23.75, 27.66, 28.14, 30.7~
## $ perimeter_worst         <dbl> 184.6, 158.8, 152.5, 103.4, 153.2, 110.6, 106.~
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 741.6, 1606.0, 897.0, ~
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.1791, 0.1442, 0.1654~
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.5249, 0.2576, 0.3682~
## $ concavity_worst         <dbl> 0.7119, 0.2416, 0.4504, 0.5355, 0.3784, 0.2678~
## $ concave.points_worst    <dbl> 0.26540, 0.18600, 0.24300, 0.17410, 0.19320, 0~
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.3985, 0.3063, 0.3196~
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.12440, 0.08368, 0~
## $ index                   <chr> "train", "train", "train", "train", "train", "~
# Preprocess the 30 numeric predictors: Yeo-Johnson transform, then center,
# then scale.
#
# The transformation parameters (Yeo-Johnson lambdas, means, standard
# deviations) are estimated from the TRAINING rows only and then applied to
# every row. Estimating them on train + test together would leak test-set
# information into the model inputs and bias the reported test ROC/AUC.

# Give `index` a fixed set of factor levels up front so that the level "test",
# which never occurs in the training rows, is still known when baking.
full_indexed <- full %>%
  mutate(index = factor(index, levels = c("train", "test")))

# Define and estimate the recipe on the training rows.
# all_numeric_predictors() selects the numeric predictor columns; the factor
# columns `diagnosis` (outcome) and `index` are left untouched.
fitted_recipe <- full_indexed %>%
  filter(index == "train") %>%
  recipe(diagnosis ~ ., data = .) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_center(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors()) %>%
  prep()

# Apply the trained preprocessing to all rows (train and test alike) and keep
# the outcome as the last column.
data <- fitted_recipe %>%
  bake(new_data = full_indexed) %>%
  relocate(diagnosis, .after = last_col())
# Column-wise overview of the preprocessed data.
glimpse(data)
## Rows: 569
## Columns: 32
## $ radius_mean             <dbl> 1.133883785, 1.617924232, 1.463509328, -0.3677~
## $ texture_mean            <dbl> -2.6763108, -0.2641451, 0.5473245, -0.8241080,~
## $ perimeter_mean          <dbl> 1.25871524, 1.52738014, 1.45338613, -0.2510152~
## $ area_mean               <dbl> 1.1254308234, 1.6325123335, 1.4603611261, -0.3~
## $ smoothness_mean         <dbl> 1.56708746, -0.82623545, 0.94138212, 2.2354545~
## $ compactness_mean        <dbl> 3.28062806, -0.48664348, 1.05199990, 1.2432415~
## $ concavity_mean          <dbl> 2.65054179, -0.02382489, 1.36227979, 0.8655400~
## $ concave.points_mean     <dbl> 2.53024886, 0.54766227, 2.03543978, 0.82393067~
## $ symmetry_mean           <dbl> 2.215565542, 0.001391139, 0.938858720, 1.00451~
## $ fractal_dimension_mean  <dbl> 2.2537638, -0.8678888, -0.3976580, 1.8883435, ~
## $ radius_se               <dbl> 1.891121233, 0.957773674, 1.456535070, 0.00581~
## $ texture_se              <dbl> -0.49776327, -0.99743439, -0.83324887, -0.5383~
## $ perimeter_se            <dbl> 1.97134979, 0.73176211, 1.21196297, -0.0841302~
## $ area_se                 <dbl> 1.8744596, 1.2757387, 1.4947914, 0.0432361, 0.~
## $ smoothness_se           <dbl> -0.21381351, -0.60481867, -0.29674391, 0.15620~
## $ compactness_se          <dbl> 1.31570389, -0.69231710, 0.81425704, 0.4451519~
## $ concavity_se            <dbl> 0.72338965, -0.44039256, 0.21288911, 0.1598845~
## $ concave.points_se       <dbl> 0.66023900, 0.25993335, 1.42357487, -0.0690627~
## $ symmetry_se             <dbl> 1.14774677, -0.80474229, 0.23682715, 0.1340009~
## $ fractal_dimension_se    <dbl> 0.90628565, -0.09935632, 0.29330133, 0.4864178~
## $ radius_worst            <dbl> 1.61821085, 1.57730024, 1.41850851, 0.05600571~
## $ texture_worst           <dbl> -1.48705971, -0.28812729, 0.07134428, -0.22876~
## $ perimeter_worst         <dbl> 1.808915062, 1.429358435, 1.320779419, 0.11951~
## $ area_worst              <dbl> 1.650761073, 1.608609957, 1.424055444, 0.07894~
## $ smoothness_worst        <dbl> 1.30653666, -0.37528175, 0.52694375, 2.0467119~
## $ compactness_worst       <dbl> 1.94102968, -0.29631980, 1.20863781, 1.5744313~
## $ concavity_worst         <dbl> 1.72866091, 0.07068341, 1.00462771, 1.28100735~
## $ concave.points_worst    <dbl> 1.93395138, 1.10062517, 1.72122879, 0.95470737~
## $ symmetry_worst          <dbl> 2.7482041, -0.2436753, 1.1512420, 1.7525273, 0~
## $ fractal_dimension_worst <dbl> 1.93531174, 0.28094279, 0.20121416, 2.23983079~
## $ index                   <fct> train, train, train, train, train, train, trai~
## $ diagnosis               <fct> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, B~
# List the column names of the preprocessed data.
colnames(data)
##  [1] "radius_mean"             "texture_mean"           
##  [3] "perimeter_mean"          "area_mean"              
##  [5] "smoothness_mean"         "compactness_mean"       
##  [7] "concavity_mean"          "concave.points_mean"    
##  [9] "symmetry_mean"           "fractal_dimension_mean" 
## [11] "radius_se"               "texture_se"             
## [13] "perimeter_se"            "area_se"                
## [15] "smoothness_se"           "compactness_se"         
## [17] "concavity_se"            "concave.points_se"      
## [19] "symmetry_se"             "fractal_dimension_se"   
## [21] "radius_worst"            "texture_worst"          
## [23] "perimeter_worst"         "area_worst"             
## [25] "smoothness_worst"        "compactness_worst"      
## [27] "concavity_worst"         "concave.points_worst"   
## [29] "symmetry_worst"          "fractal_dimension_worst"
## [31] "index"                   "diagnosis"
# Separate the preprocessed rows back into the two partitions using the
# `index` label, then drop the label so it is not used as a predictor.
train <- data %>%
  filter(index == "train") %>%
  select(-index)
test <- data %>%
  filter(index == "test") %>%
  select(-index)
glimpse(train)
## Rows: 399
## Columns: 31
## $ radius_mean             <dbl> 1.133883785, 1.617924232, 1.463509328, -0.3677~
## $ texture_mean            <dbl> -2.6763108, -0.2641451, 0.5473245, -0.8241080,~
## $ perimeter_mean          <dbl> 1.25871524, 1.52738014, 1.45338613, -0.2510152~
## $ area_mean               <dbl> 1.1254308234, 1.6325123335, 1.4603611261, -0.3~
## $ smoothness_mean         <dbl> 1.56708746, -0.82623545, 0.94138212, 2.2354545~
## $ compactness_mean        <dbl> 3.28062806, -0.48664348, 1.05199990, 1.2432415~
## $ concavity_mean          <dbl> 2.65054179, -0.02382489, 1.36227979, 0.8655400~
## $ concave.points_mean     <dbl> 2.53024886, 0.54766227, 2.03543978, 0.82393067~
## $ symmetry_mean           <dbl> 2.215565542, 0.001391139, 0.938858720, 1.00451~
## $ fractal_dimension_mean  <dbl> 2.2537638, -0.8678888, -0.3976580, 1.8883435, ~
## $ radius_se               <dbl> 1.891121233, 0.957773674, 1.456535070, 0.00581~
## $ texture_se              <dbl> -0.49776327, -0.99743439, -0.83324887, -0.5383~
## $ perimeter_se            <dbl> 1.97134979, 0.73176211, 1.21196297, -0.0841302~
## $ area_se                 <dbl> 1.8744596, 1.2757387, 1.4947914, 0.0432361, 0.~
## $ smoothness_se           <dbl> -0.21381351, -0.60481867, -0.29674391, 0.15620~
## $ compactness_se          <dbl> 1.31570389, -0.69231710, 0.81425704, 0.4451519~
## $ concavity_se            <dbl> 0.72338965, -0.44039256, 0.21288911, 0.1598845~
## $ concave.points_se       <dbl> 0.66023900, 0.25993335, 1.42357487, -0.0690627~
## $ symmetry_se             <dbl> 1.14774677, -0.80474229, 0.23682715, 0.1340009~
## $ fractal_dimension_se    <dbl> 0.90628565, -0.09935632, 0.29330133, 0.4864178~
## $ radius_worst            <dbl> 1.61821085, 1.57730024, 1.41850851, 0.05600571~
## $ texture_worst           <dbl> -1.48705971, -0.28812729, 0.07134428, -0.22876~
## $ perimeter_worst         <dbl> 1.808915062, 1.429358435, 1.320779419, 0.11951~
## $ area_worst              <dbl> 1.650761073, 1.608609957, 1.424055444, 0.07894~
## $ smoothness_worst        <dbl> 1.30653666, -0.37528175, 0.52694375, 2.0467119~
## $ compactness_worst       <dbl> 1.94102968, -0.29631980, 1.20863781, 1.5744313~
## $ concavity_worst         <dbl> 1.72866091, 0.07068341, 1.00462771, 1.28100735~
## $ concave.points_worst    <dbl> 1.93395138, 1.10062517, 1.72122879, 0.95470737~
## $ symmetry_worst          <dbl> 2.7482041, -0.2436753, 1.1512420, 1.7525273, 0~
## $ fractal_dimension_worst <dbl> 1.93531174, 0.28094279, 0.20121416, 2.23983079~
## $ diagnosis               <fct> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, B~
# Column-wise overview of the preprocessed test partition.
glimpse(test)
## Rows: 170
## Columns: 31
## $ radius_mean             <dbl> -0.75859441, 1.56988048, -0.36416505, 0.715646~
## $ texture_mean            <dbl> 0.35740629, -1.23243596, 1.10845128, 0.4237809~
## $ perimeter_mean          <dbl> -0.51443375, 1.58194943, -0.18125733, 0.803825~
## $ area_mean               <dbl> -0.83550310, 1.59371956, -0.37066306, 0.696029~
## $ smoothness_mean         <dbl> 3.28066684, 0.28012535, 1.58130803, 1.46754343~
## $ compactness_mean        <dbl> 3.39991742, 0.53886631, 2.56110495, 1.85294273~
## $ concavity_mean          <dbl> 1.9142129, 1.3698061, 1.7373434, 1.0461727, -0~
## $ concave.points_mean     <dbl> 1.4504311, 1.4272370, 0.9409324, 1.3885800, -0~
## $ symmetry_mean           <dbl> 2.864862154, -0.009552062, 0.796597103, 1.2853~
## $ fractal_dimension_mean  <dbl> 4.90660199, -0.56195552, 2.78064892, 1.5243395~
## $ radius_se               <dbl> 0.790980012, 1.477705479, -0.241027598, 1.0379~
## $ texture_se              <dbl> 0.09693632, -0.85015993, 0.88023086, -0.084873~
## $ perimeter_se            <dbl> 0.7554983, 1.4504094, -0.2602392, 0.9431884, -~
## $ area_se                 <dbl> 0.0454752, 1.4985659, -0.1561116, 0.9491189, -~
## $ smoothness_se           <dbl> 0.689095329, 1.481763364, 0.035976834, -0.0049~
## $ compactness_se          <dbl> 2.74186785, -0.04847723, 2.60729247, -0.026141~
## $ concavity_se            <dbl> 0.8187979283, 0.8277424542, 1.5085202629, -0.0~
## $ concave.points_se       <dbl> 1.114026779, 1.143198850, 0.409035052, 0.19024~
## $ symmetry_se             <dbl> 4.72851977, -0.36077483, -0.32085405, -0.44182~
## $ fractal_dimension_se    <dbl> 2.045710868, 0.498889164, 2.375256073, 0.13117~
## $ radius_worst            <dbl> -0.08361851, 1.29258941, -0.03778559, 1.078894~
## $ texture_worst           <dbl> 0.22788904, -1.63644398, 2.11071757, 0.9658412~
## $ perimeter_worst         <dbl> -0.03944504, 1.31542433, -0.08432038, 1.014820~
## $ area_worst              <dbl> -0.436477155, 1.308335943, 0.001718328, 1.0412~
## $ smoothness_worst        <dbl> 3.391290721, 0.220362270, 2.318255541, 2.03795~
## $ compactness_worst       <dbl> 2.28027115, -0.13171342, 2.48549100, 1.2035759~
## $ concavity_worst         <dbl> 1.673614727, 0.816755340, 2.353607940, 1.10082~
## $ concave.points_worst    <dbl> 1.8607404, 0.8063672, 1.4960429, 1.3469812, -0~
## $ symmetry_worst          <dbl> 6.04072615, -0.86758960, 2.36835989, 1.3015633~
## $ fractal_dimension_worst <dbl> 4.93067187, -0.39675052, 6.84083682, 1.6750863~
## $ diagnosis               <fct> M, M, M, M, B, B, M, M, M, M, M, M, B, M, M, B~
# Resampling setup: 10-fold cross-validation (the fold count is spelled out
# instead of relying on the default). twoClassSummary reports ROC,
# sensitivity and specificity, which requires class probabilities.
ctrl <- trainControl(
  method = "cv",
  number = 10,
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)

# Fix the RNG state so the fold assignment, and therefore the resampled
# ROC / Sens / Spec values and the chosen cp, are reproducible.
set.seed(2022)
# Fit a CART model (rpart) on the training partition, selecting the
# complexity parameter by cross-validated ROC.
rpfit <- train(
  diagnosis ~ .,
  data = train,
  method = "rpart",
  metric = "ROC",
  trControl = ctrl
)
rpfit
## CART 
## 
## 399 samples
##  30 predictor
##   2 classes: 'B', 'M' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 359, 359, 359, 359, 359, 359, ... 
## Resampling results across tuning parameters:
## 
##   cp          ROC        Sens   Spec     
##   0.02013423  0.9303429  0.928  0.8990476
##   0.04026846  0.9251429  0.928  0.9123810
##   0.82550336  0.7766667  0.940  0.6133333
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.02013423.
# Confusion matrix aggregated over the cross-validation resamples of the
# tuned model.
confusionMatrix(data = rpfit)
## Cross-Validated (10 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##           Reference
## Prediction    B    M
##          B 58.1  3.8
##          M  4.5 33.6
##                             
##  Accuracy (average) : 0.9173
# Quick structural overview (column names, types, first values) of the
# hold-out data.
glimpse(test)
## Rows: 170
## Columns: 31
## $ radius_mean             <dbl> -0.75859441, 1.56988048, -0.36416505, 0.715646~
## $ texture_mean            <dbl> 0.35740629, -1.23243596, 1.10845128, 0.4237809~
## $ perimeter_mean          <dbl> -0.51443375, 1.58194943, -0.18125733, 0.803825~
## $ area_mean               <dbl> -0.83550310, 1.59371956, -0.37066306, 0.696029~
## $ smoothness_mean         <dbl> 3.28066684, 0.28012535, 1.58130803, 1.46754343~
## $ compactness_mean        <dbl> 3.39991742, 0.53886631, 2.56110495, 1.85294273~
## $ concavity_mean          <dbl> 1.9142129, 1.3698061, 1.7373434, 1.0461727, -0~
## $ concave.points_mean     <dbl> 1.4504311, 1.4272370, 0.9409324, 1.3885800, -0~
## $ symmetry_mean           <dbl> 2.864862154, -0.009552062, 0.796597103, 1.2853~
## $ fractal_dimension_mean  <dbl> 4.90660199, -0.56195552, 2.78064892, 1.5243395~
## $ radius_se               <dbl> 0.790980012, 1.477705479, -0.241027598, 1.0379~
## $ texture_se              <dbl> 0.09693632, -0.85015993, 0.88023086, -0.084873~
## $ perimeter_se            <dbl> 0.7554983, 1.4504094, -0.2602392, 0.9431884, -~
## $ area_se                 <dbl> 0.0454752, 1.4985659, -0.1561116, 0.9491189, -~
## $ smoothness_se           <dbl> 0.689095329, 1.481763364, 0.035976834, -0.0049~
## $ compactness_se          <dbl> 2.74186785, -0.04847723, 2.60729247, -0.026141~
## $ concavity_se            <dbl> 0.8187979283, 0.8277424542, 1.5085202629, -0.0~
## $ concave.points_se       <dbl> 1.114026779, 1.143198850, 0.409035052, 0.19024~
## $ symmetry_se             <dbl> 4.72851977, -0.36077483, -0.32085405, -0.44182~
## $ fractal_dimension_se    <dbl> 2.045710868, 0.498889164, 2.375256073, 0.13117~
## $ radius_worst            <dbl> -0.08361851, 1.29258941, -0.03778559, 1.078894~
## $ texture_worst           <dbl> 0.22788904, -1.63644398, 2.11071757, 0.9658412~
## $ perimeter_worst         <dbl> -0.03944504, 1.31542433, -0.08432038, 1.014820~
## $ area_worst              <dbl> -0.436477155, 1.308335943, 0.001718328, 1.0412~
## $ smoothness_worst        <dbl> 3.391290721, 0.220362270, 2.318255541, 2.03795~
## $ compactness_worst       <dbl> 2.28027115, -0.13171342, 2.48549100, 1.2035759~
## $ concavity_worst         <dbl> 1.673614727, 0.816755340, 2.353607940, 1.10082~
## $ concave.points_worst    <dbl> 1.8607404, 0.8063672, 1.4960429, 1.3469812, -0~
## $ symmetry_worst          <dbl> 6.04072615, -0.86758960, 2.36835989, 1.3015633~
## $ fractal_dimension_worst <dbl> 4.93067187, -0.39675052, 6.84083682, 1.6750863~
## $ diagnosis               <fct> M, M, M, M, B, B, M, M, M, M, M, M, B, M, M, B~
# Score the hold-out set with the fitted tree twice: once as per-class
# probabilities and once as hard class labels.
rffit1 <- predict(rpfit, newdata = test, type = "prob")
rffit2 <- predict(rpfit, newdata = test, type = "raw")
head(rffit1)
##             B          M
## 1 0.962809917 0.03719008
## 2 0.008403361 0.99159664
## 3 0.962809917 0.03719008
## 4 0.008403361 0.99159664
## 5 0.962809917 0.03719008
## 6 0.962809917 0.03719008
# Peek at the predicted class labels for the hold-out set.
head(rffit2, n = 6L)
## [1] B M B M B B
## Levels: B M
# Compare predicted labels against the true diagnosis on the hold-out set.
# NOTE(review): no `positive` class is given, so the first factor level (B) is
# treated as the positive class; consider positive = "M" if sensitivity should
# describe malignant cases.
confusionMatrix(data = rffit2, reference = test$diagnosis)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   B   M
##          B 102  11
##          M   5  52
##                                           
##                Accuracy : 0.9059          
##                  95% CI : (0.8517, 0.9452)
##     No Information Rate : 0.6294          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7942          
##                                           
##  Mcnemar's Test P-Value : 0.2113          
##                                           
##             Sensitivity : 0.9533          
##             Specificity : 0.8254          
##          Pos Pred Value : 0.9027          
##          Neg Pred Value : 0.9123          
##              Prevalence : 0.6294          
##          Detection Rate : 0.6000          
##    Detection Prevalence : 0.6647          
##       Balanced Accuracy : 0.8893          
##                                           
##        'Positive' Class : B               
## 
# Variable importance of the fitted tree on its raw (unscaled) scale.
importance <- varImp(object = rpfit, scale = FALSE)
print(x = importance)
## rpart variable importance
## 
##   only 20 most important variables shown (out of 30)
## 
##                         Overall
## perimeter_worst         148.877
## radius_worst            145.295
## concave.points_worst    137.543
## area_worst              133.384
## concave.points_mean     132.678
## texture_worst            20.175
## smoothness_worst          9.799
## texture_mean              8.431
## symmetry_worst            6.225
## area_mean                 6.225
## smoothness_se             0.000
## radius_mean               0.000
## perimeter_mean            0.000
## fractal_dimension_mean    0.000
## compactness_se            0.000
## fractal_dimension_worst   0.000
## compactness_mean          0.000
## area_se                   0.000
## compactness_worst         0.000
## radius_se                 0.000
# Test-set ROC curve / AUC.
#
# The ROC must be built from a continuous score. The previous version fed the
# hard class labels (coerced to 1/2) into roc(), which yields a curve with a
# single operating point whose "AUC" is just the balanced accuracy of the
# default cut-off. Use the predicted probability of the malignant class (M)
# instead.
#
# NOTE: the recorded console output below this block was produced by the
# hard-label version; re-knit the document to refresh it.

# Numeric coding of the hard labels; no longer used for the ROC but kept in
# case code further down the script still refers to it.
rffit2_num <- as.numeric(rffit2)

# levels = c(control, case) and direction are stated explicitly so pROC does
# not have to guess them: controls (B) are expected to score lower than
# cases (M) on P(M).
result <- roc(
  response = test$diagnosis,
  predictor = rffit1$M,
  levels = c("B", "M"),
  direction = "<"
)
## Setting levels: control = B, case = M
## Setting direction: controls < cases
# Show the ROC object summary (call, class counts and AUC).
print(result)
## 
## Call:
## roc.default(response = test$diagnosis, predictor = rffit2_num)
## 
## Data: rffit2_num in 107 controls (test$diagnosis B) < 63 cases (test$diagnosis M).
## Area under the curve: 0.8893
# Report only the area under the curve stored on the ROC object.
result[["auc"]]
## Area under the curve: 0.8893