bigdata_part04

# titanic data
library(dplyr)

## Warning: 패키지 'dplyr'는 R 버전 4.1.3에서 작성되었습니다

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(recipes)

## Warning: 패키지 'recipes'는 R 버전 4.1.3에서 작성되었습니다

## 
## 다음의 패키지를 부착합니다: 'recipes'

## The following object is masked from 'package:stats':
## 
##     step

library(caret)

## Warning: 패키지 'caret'는 R 버전 4.1.3에서 작성되었습니다

## 필요한 패키지를 로딩중입니다: ggplot2

## Warning: 패키지 'ggplot2'는 R 버전 4.1.3에서 작성되었습니다

## 필요한 패키지를 로딩중입니다: lattice

read.delim("titanic3.txt",header=TRUE,sep=",")->full
full %>% glimpse

## Rows: 1,309
## Columns: 14
## $ pclass    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ survived  <int> 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, ~
## $ name      <chr> "Allen, Miss. Elisabeth Walton", "Allison, Master. Hudson Tr~
## $ sex       <chr> "female", "male", "female", "male", "female", "male", "femal~
## $ age       <dbl> 29.00, 0.92, 2.00, 30.00, 25.00, 48.00, 63.00, 39.00, 53.00,~
## $ sibsp     <int> 0, 1, 1, 1, 1, 0, 1, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ parch     <int> 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ~
## $ ticket    <chr> "24160", "113781", "113781", "113781", "113781", "19952", "1~
## $ fare      <dbl> 211.3375, 151.5500, 151.5500, 151.5500, 151.5500, 26.5500, 7~
## $ cabin     <chr> "B5", "C22 C26", "C22 C26", "C22 C26", "C22 C26", "E12", "D7~
## $ embarked  <chr> "S", "S", "S", "S", "S", "S", "S", "S", "S", "C", "C", "C", ~
## $ boat      <chr> "2", "11", "", "", "", "3", "10", "", "D", "", "", "4", "9",~
## $ body      <int> NA, NA, NA, 135, NA, NA, NA, NA, NA, 22, 124, NA, NA, NA, NA~
## $ home.dest <chr> "St Louis, MO", "Montreal, PQ / Chesterville, ON", "Montreal~

#.1 데이터 분할
train_list<-createDataPartition(full$survived,p=0.7,list=FALSE)
full_train<-full[train_list,]
full_test<-full[-train_list,]
NROW(full_train)

## [1] 917

NROW(full_test)

## [1] 392

train<-full_train
test<-full_test

train %>% mutate(index='train')->train
test %>% mutate(index='test')->test
bind_rows(train,test)->full
select <- dplyr::select
full %>% select(-boat,-body,-home.dest)->full
full %>% glimpse

## Rows: 1,309
## Columns: 12
## $ pclass   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ survived <int> 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1~
## $ name     <chr> "Allen, Miss. Elisabeth Walton", "Allison, Master. Hudson Tre~
## $ sex      <chr> "female", "male", "female", "male", "female", "male", "female~
## $ age      <dbl> 29.00, 0.92, 2.00, 30.00, 25.00, 48.00, 63.00, 71.00, 18.00, ~
## $ sibsp    <int> 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0~
## $ parch    <int> 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0~
## $ ticket   <chr> "24160", "113781", "113781", "113781", "113781", "19952", "13~
## $ fare     <dbl> 211.3375, 151.5500, 151.5500, 151.5500, 151.5500, 26.5500, 77~
## $ cabin    <chr> "B5", "C22 C26", "C22 C26", "C22 C26", "C22 C26", "E12", "D7"~
## $ embarked <chr> "S", "S", "S", "S", "S", "S", "S", "C", "C", "S", "S", "C", "~
## $ index    <chr> "train", "train", "train", "train", "train", "train", "train"~

# select <- dplyr::select<= select는 dplyr 패키지의 select 임을 명시적으로 지정
# unused arguments 
# 2. 목표변수, 기타변수 변환
full$survived<-ifelse(full$survived==0,"생존","사망")
full$survived<-as.factor(full$survived)
full$pclass<-as.factor(full$pclass)
full$sex<-as.factor(full$sex)
full$embarked<-as.factor(full$embarked) # 탑승지 

# 3. 결측치 확인
colSums(is.na(full))

##   pclass survived     name      sex      age    sibsp    parch   ticket 
##        0        0        0        0      263        0        0        0 
##     fare    cabin embarked    index 
##        1        0        0        0

summary(full)

##  pclass  survived       name               sex           age       
##  1:323   사망:500   Length:1309        female:466   Min.   : 0.17  
##  2:277   생존:809   Class :character   male  :843   1st Qu.:21.00  
##  3:709              Mode  :character                Median :28.00  
##                                                     Mean   :29.88  
##                                                     3rd Qu.:39.00  
##                                                     Max.   :80.00  
##                                                     NA's   :263    
##      sibsp            parch          ticket               fare        
##  Min.   :0.0000   Min.   :0.000   Length:1309        Min.   :  0.000  
##  1st Qu.:0.0000   1st Qu.:0.000   Class :character   1st Qu.:  7.896  
##  Median :0.0000   Median :0.000   Mode  :character   Median : 14.454  
##  Mean   :0.4989   Mean   :0.385                      Mean   : 33.295  
##  3rd Qu.:1.0000   3rd Qu.:0.000                      3rd Qu.: 31.275  
##  Max.   :8.0000   Max.   :9.000                      Max.   :512.329  
##                                                      NA's   :1        
##     cabin           embarked    index          
##  Length:1309         :  2    Length:1309       
##  Class :character   C:270    Class :character  
##  Mode  :character   Q:123    Mode  :character  
##                     S:914                      
##                                                
##                                                
##

table(full$embarked)

## 
##       C   Q   S 
##   2 270 123 914

levels(full$embarked)[1]<-NA
# table( ) 함수는 NA 값을 제외하고 값을 출력시키므로, useNA에 
# always를 지정해 NA에 대한 개수도 출력하도록 하여 빈도를 확인했다.
# 지정해 NA에 대한 개수도 출력하도록 하여 빈도를 확인했다.
table(full$embarked,useNA="always")

## 
##    C    Q    S <NA> 
##  270  123  914    2

full %>% filter(!is.na(age)&!is.na(fare)&!is.na(embarked))->full
colSums(is.na(full))

##   pclass survived     name      sex      age    sibsp    parch   ticket 
##        0        0        0        0        0        0        0        0 
##     fare    cabin embarked    index 
##        0        0        0        0

# 4 데이터 전처리
recipe(survived~.,data=full) %>% step_YeoJohnson(age,sibsp,parch,fare) %>% 
  step_center(age,sibsp,parch,fare) %>% 
  step_scale(age,sibsp,parch,fare) %>% 
  prep() %>% juice()->data
data %>% filter(index=="train") %>% select(-index,-name,-ticket,-cabin)->train
data %>% filter(index=='test') %>% select(-index,-name,-ticket,-cabin)->test
ctrl<-trainControl(method="cv",summaryFunction = twoClassSummary,
                   classProbs = TRUE)
ctrl  # 10-fold cv

## $method
## [1] "cv"
## 
## $number
## [1] 10
## 
## $repeats
## [1] NA
## 
## $search
## [1] "grid"
## 
## $p
## [1] 0.75
## 
## $initialWindow
## NULL
## 
## $horizon
## [1] 1
## 
## $fixedWindow
## [1] TRUE
## 
## $skip
## [1] 0
## 
## $verboseIter
## [1] FALSE
## 
## $returnData
## [1] TRUE
## 
## $returnResamp
## [1] "final"
## 
## $savePredictions
## [1] FALSE
## 
## $classProbs
## [1] TRUE
## 
## $summaryFunction
## function (data, lev = NULL, model = NULL) 
## {
##     if (length(lev) > 2) {
##         stop(paste("Your outcome has", length(lev), "levels. The twoClassSummary() function isn't appropriate."))
##     }
##     requireNamespaceQuietStop("pROC")
##     if (!all(levels(data[, "pred"]) == lev)) {
##         stop("levels of observed and predicted data do not match")
##     }
##     rocObject <- try(pROC::roc(data$obs, data[, lev[1]], direction = ">", 
##         quiet = TRUE), silent = TRUE)
##     rocAUC <- if (inherits(rocObject, "try-error")) 
##         NA
##     else rocObject$auc
##     out <- c(rocAUC, sensitivity(data[, "pred"], data[, "obs"], 
##         lev[1]), specificity(data[, "pred"], data[, "obs"], lev[2]))
##     names(out) <- c("ROC", "Sens", "Spec")
##     out
## }
## <bytecode: 0x000000002d51a618>
## <environment: namespace:caret>
## 
## $selectionFunction
## [1] "best"
## 
## $preProcOptions
## $preProcOptions$thresh
## [1] 0.95
## 
## $preProcOptions$ICAcomp
## [1] 3
## 
## $preProcOptions$k
## [1] 5
## 
## $preProcOptions$freqCut
## [1] 19
## 
## $preProcOptions$uniqueCut
## [1] 10
## 
## $preProcOptions$cutoff
## [1] 0.9
## 
## 
## $sampling
## NULL
## 
## $index
## NULL
## 
## $indexOut
## NULL
## 
## $indexFinal
## NULL
## 
## $timingSamps
## [1] 0
## 
## $predictionBounds
## [1] FALSE FALSE
## 
## $seeds
## [1] NA
## 
## $adaptive
## $adaptive$min
## [1] 5
## 
## $adaptive$alpha
## [1] 0.05
## 
## $adaptive$method
## [1] "gls"
## 
## $adaptive$complete
## [1] TRUE
## 
## 
## $trim
## [1] FALSE
## 
## $allowParallel
## [1] TRUE

train(survived~.,data=train,
      method="rpart",metric='ROC',
      trControl=ctrl)->rffit

rffit

## CART 
## 
## 737 samples
##   7 predictor
##   2 classes: '사망', '생존' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 664, 663, 662, 663, 664, 664, ... 
## Resampling results across tuning parameters:
## 
##   cp          ROC        Sens       Spec     
##   0.02159468  0.7840223  0.6416129  0.8719345
##   0.02657807  0.7505421  0.6674194  0.8396934
##   0.44186047  0.6303982  0.3633333  0.8974630
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.02159468.

confusionMatrix(rffit)

## Cross-Validated (10 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##           Reference
## Prediction 사망 생존
##       사망 26.2  7.6
##       생존 14.7 51.6
##                             
##  Accuracy (average) : 0.7775

predict(rffit,test,type="prob")->rffit1
predict(rffit,test,type="raw")->rffit2

confusionMatrix(rffit2,test$survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 사망 생존
##       사망   96   31
##       생존   28  151
##                                           
##                Accuracy : 0.8072          
##                  95% CI : (0.7585, 0.8499)
##     No Information Rate : 0.5948          
##     P-Value [Acc > NIR] : 1.654e-15       
##                                           
##                   Kappa : 0.6015          
##                                           
##  Mcnemar's Test P-Value : 0.7946          
##                                           
##             Sensitivity : 0.7742          
##             Specificity : 0.8297          
##          Pos Pred Value : 0.7559          
##          Neg Pred Value : 0.8436          
##              Prevalence : 0.4052          
##          Detection Rate : 0.3137          
##    Detection Prevalence : 0.4150          
##       Balanced Accuracy : 0.8019          
##                                           
##        'Positive' Class : 사망            
##

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## 다음의 패키지를 부착합니다: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

rffit2_num<-as.numeric(rffit2)
rffit2_num

##   [1] 2 1 2 1 2 1 2 2 1 2 1 1 2 1 1 2 2 1 2 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1
##  [38] 2 2 2 1 2 2 2 2 1 2 1 1 1 2 2 1 2 2 2 2 2 1 1 2 2 2 1 1 1 1 2 2 2 2 2 1 2
##  [75] 1 2 1 2 1 2 1 1 2 2 2 1 2 2 2 2 2 2 2 1 2 1 2 2 1 1 1 2 2 1 1 2 1 2 1 1 2
## [112] 2 2 2 2 2 1 1 2 2 1 2 2 2 2 1 1 1 2 1 2 1 1 2 2 1 1 1 2 2 1 1 2 1 2 1 1 2
## [149] 2 2 1 1 2 1 2 2 1 2 1 1 2 2 2 2 1 1 1 1 2 2 2 2 1 1 2 1 2 2 2 1 1 1 2 2 2
## [186] 2 2 2 1 1 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 1 1 1 2 2 2 2 2 2 2 1 2 1 2 2 2 1
## [223] 2 1 1 2 2 1 2 1 1 1 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 1 1 1 2 2 2 2 2 1 1
## [260] 1 2 2 1 1 2 2 1 2 1 1 2 1 2 2 2 2 2 2 1 2 2 1 2 2 1 2 2 2 2 1 1 2 2 2 2 2
## [297] 1 1 2 2 1 2 2 2 1 1

result<-roc(test$survived,rffit2_num)

## Setting levels: control = 사망, case = 생존

## Setting direction: controls < cases

result

## 
## Call:
## roc.default(response = test$survived, predictor = rffit2_num)
## 
## Data: rffit2_num in 124 controls (test$survived 사망) < 182 cases (test$survived 생존).
## Area under the curve: 0.8019

result$auc

## Area under the curve: 0.8019

# 4.목표변수가 연속형,RMSE,R^2
# RMSE는 예측값-관측값의 제곱된 갑의 평균에 루트
# 낮을수록 더 적합한 모델로 평가
# 결정계수(R^2) 데이터 예측의 정확도 성능 0~1 범위



df<-read.csv("nyc.csv")
# Our goal is to create a regression model with price as the response and some
# combination of the other five variables as predictors.
df %>% glimpse

## Rows: 165
## Columns: 9
## $ Case       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, ~
## $ Restaurant <chr> "Daniella Ristorante", "Tello's Ristorante", "Biricchino", ~
## $ Price      <int> 43, 32, 34, 41, 54, 52, 34, 34, 39, 44, 45, 47, 52, 35, 47,~
## $ Food       <int> 22, 20, 21, 20, 24, 22, 22, 20, 22, 21, 19, 21, 21, 19, 20,~
## $ Decor      <int> 18, 19, 13, 20, 19, 22, 16, 18, 19, 17, 17, 19, 19, 17, 18,~
## $ Service    <int> 20, 19, 18, 17, 21, 21, 21, 21, 22, 19, 20, 21, 20, 19, 21,~
## $ East       <int> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ latitude   <dbl> 40.74683, 40.74342, 40.74886, 40.74848, 40.73958, 40.74069,~
## $ longitude  <dbl> -73.99676, -73.99954, -73.99552, -74.00331, -73.99591, -73.~

# case,id 변수는 과적합을 유발, 반드시 제외하고 분석 
nyc<-df %>% select(3:7)
summary(nyc)

##      Price            Food           Decor          Service     
##  Min.   :19.00   Min.   :16.00   Min.   : 6.00   Min.   :14.00  
##  1st Qu.:36.00   1st Qu.:19.00   1st Qu.:16.00   1st Qu.:18.00  
##  Median :43.00   Median :21.00   Median :18.00   Median :20.00  
##  Mean   :42.67   Mean   :20.59   Mean   :17.68   Mean   :19.39  
##  3rd Qu.:50.00   3rd Qu.:22.00   3rd Qu.:19.00   3rd Qu.:21.00  
##  Max.   :65.00   Max.   :25.00   Max.   :25.00   Max.   :24.00  
##       East       
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :1.0000  
##  Mean   :0.6303  
##  3rd Qu.:1.0000  
##  Max.   :1.0000

# Example 1: Ordinary Least Squares Regression
model_1 <- train(Price ~ ., nyc, method="lm")
summary(model_1)

## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.9993  -3.8909   0.1715   3.3611  17.8236 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -24.244248   4.758009  -5.095 9.70e-07 ***
## Food          1.553922   0.372672   4.170 4.98e-05 ***
## Decor         1.897926   0.219100   8.662 4.79e-15 ***
## Service       0.005318   0.399493   0.013   0.9894    
## East          2.003432   0.961794   2.083   0.0388 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.776 on 160 degrees of freedom
## Multiple R-squared:  0.6275, Adjusted R-squared:  0.6181 
## F-statistic: 67.37 on 4 and 160 DF,  p-value: < 2.2e-16

predictions <-predict(model_1, nyc)
head(predictions)

##        1        2        3        4        5        6 
## 44.21108 42.99585 33.15689 44.88314 49.22217 51.80811

SSE <- sum((nyc$Price - predictions)^2) # 설명된 변동 
SST <- sum((nyc$Price - mean(nyc$Price))^2) # (yi-y바)^2 합, 개별 y의 편차제곱의합 
r2 <- 1 - SSE / SST
r2

## [1] 0.6274519

# Example 2: Using CV to Estimate Out-of-Sample r-Squared
# This is done by specifying a trainControl argument.
set.seed(1)
tc <- trainControl(method="cv", number=10)
model_2 <- train(Price ~ ., nyc, method="lm", trControl=tc)
summary(model_2)

## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.9993  -3.8909   0.1715   3.3611  17.8236 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -24.244248   4.758009  -5.095 9.70e-07 ***
## Food          1.553922   0.372672   4.170 4.98e-05 ***
## Decor         1.897926   0.219100   8.662 4.79e-15 ***
## Service       0.005318   0.399493   0.013   0.9894    
## East          2.003432   0.961794   2.083   0.0388 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.776 on 160 degrees of freedom
## Multiple R-squared:  0.6275, Adjusted R-squared:  0.6181 
## F-statistic: 67.37 on 4 and 160 DF,  p-value: < 2.2e-16

model_2

## Linear Regression 
## 
## 165 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 150, 148, 148, 147, 149, 149, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   5.732494  0.6139411  4.564717
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

# We can view the metrics for each of the 10 folds through the model’s resample attribut
model_2$resample

##        RMSE  Rsquared      MAE Resample
## 1  4.497467 0.8709394 3.658307   Fold01
## 2  4.097291 0.8456047 3.057666   Fold02
## 3  8.324965 0.3866326 6.572674   Fold03
## 4  5.134052 0.5815749 4.015348   Fold04
## 5  5.379182 0.5928145 4.085217   Fold05
## 6  5.469966 0.6578463 4.436465   Fold06
## 7  5.004219 0.6500137 4.164401   Fold07
## 8  6.020422 0.4652042 5.505438   Fold08
## 9  6.202349 0.6238497 4.562988   Fold09
## 10 7.195027 0.4649314 5.588667   Fold10

# Example 3: Using Cross-Validation for Hyperparameter Tuning
set.seed(1)
tc <- trainControl(method="cv", number=10, 
                   selectionFunction="Rsquared")
param_grid <- expand.grid(k = c(1:20))
model_3 <- train(Price ~ ., nyc, method="knn", 
                 preProcess=c("center", "scale"), 
                 tuneGrid=param_grid, trainControl=tc, 
                 metric="RMSE") #"Rsquared"
model_3

## k-Nearest Neighbors 
## 
## 165 samples
##   4 predictor
## 
## Pre-processing: centered (4), scaled (4) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 165, 165, 165, 165, 165, 165, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    1  8.223940  0.3658463  6.526515
##    2  7.681634  0.4097964  6.055992
##    3  7.252432  0.4503394  5.724564
##    4  6.937049  0.4822027  5.487844
##    5  6.815849  0.4923156  5.321114
##    6  6.699129  0.5068218  5.210046
##    7  6.631676  0.5131660  5.157253
##    8  6.584585  0.5187462  5.097153
##    9  6.524093  0.5272520  5.028651
##   10  6.532138  0.5271451  5.036258
##   11  6.530362  0.5274103  5.060232
##   12  6.510896  0.5322052  5.057949
##   13  6.491657  0.5372839  5.056959
##   14  6.476135  0.5422905  5.038480
##   15  6.489995  0.5421655  5.052868
##   16  6.505787  0.5407957  5.049296
##   17  6.492769  0.5440470  5.048905
##   18  6.513894  0.5423501  5.060370
##   19  6.531310  0.5412766  5.074675
##   20  6.535247  0.5409758  5.087226
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 14.

model_4 <- train(Price ~ ., nyc, method="knn", 
                 preProcess=c("center", "scale"), 
                 tuneGrid=param_grid, trainControl=tc, 
                 metric="Rsquared")
model_4

## k-Nearest Neighbors 
## 
## 165 samples
##   4 predictor
## 
## Pre-processing: centered (4), scaled (4) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 165, 165, 165, 165, 165, 165, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    1  8.236745  0.3483429  6.546623
##    2  7.668465  0.3897857  6.084771
##    3  7.379658  0.4166542  5.861675
##    4  7.167891  0.4363930  5.686774
##    5  6.913186  0.4610130  5.476801
##    6  6.835730  0.4681090  5.385797
##    7  6.750958  0.4762047  5.272789
##    8  6.695236  0.4803363  5.213516
##    9  6.634529  0.4872808  5.172904
##   10  6.599362  0.4915494  5.138621
##   11  6.568724  0.4954858  5.105011
##   12  6.526740  0.5014081  5.080169
##   13  6.508919  0.5033565  5.073107
##   14  6.486646  0.5077315  5.050160
##   15  6.465243  0.5113948  5.027572
##   16  6.484219  0.5087875  5.041698
##   17  6.463362  0.5128351  5.016095
##   18  6.461651  0.5142957  5.016941
##   19  6.473544  0.5130788  5.029638
##   20  6.482878  0.5121549  5.038150
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was k = 18.

# KNN 알고리즘은 classification 문제를 푸는데 유용한 알고리즘이지만, 동시에 regression 
# 문제도 풀 수 있습니다.
model_3

## k-Nearest Neighbors 
## 
## 165 samples
##   4 predictor
## 
## Pre-processing: centered (4), scaled (4) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 165, 165, 165, 165, 165, 165, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    1  8.223940  0.3658463  6.526515
##    2  7.681634  0.4097964  6.055992
##    3  7.252432  0.4503394  5.724564
##    4  6.937049  0.4822027  5.487844
##    5  6.815849  0.4923156  5.321114
##    6  6.699129  0.5068218  5.210046
##    7  6.631676  0.5131660  5.157253
##    8  6.584585  0.5187462  5.097153
##    9  6.524093  0.5272520  5.028651
##   10  6.532138  0.5271451  5.036258
##   11  6.530362  0.5274103  5.060232
##   12  6.510896  0.5322052  5.057949
##   13  6.491657  0.5372839  5.056959
##   14  6.476135  0.5422905  5.038480
##   15  6.489995  0.5421655  5.052868
##   16  6.505787  0.5407957  5.049296
##   17  6.492769  0.5440470  5.048905
##   18  6.513894  0.5423501  5.060370
##   19  6.531310  0.5412766  5.074675
##   20  6.535247  0.5409758  5.087226
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 14.

model_3$bestTune

##     k
## 14 14

model_3$results

##     k     RMSE  Rsquared      MAE    RMSESD RsquaredSD     MAESD
## 1   1 8.223940 0.3658463 6.526515 0.7719792 0.10552733 0.5959375
## 2   2 7.681634 0.4097964 6.055992 0.6395965 0.10020021 0.5026869
## 3   3 7.252432 0.4503394 5.724564 0.5375280 0.09156403 0.4853829
## 4   4 6.937049 0.4822027 5.487844 0.4663403 0.08719568 0.4130118
## 5   5 6.815849 0.4923156 5.321114 0.4662337 0.09179709 0.4024573
## 6   6 6.699129 0.5068218 5.210046 0.4858122 0.09695271 0.4560387
## 7   7 6.631676 0.5131660 5.157253 0.4847689 0.09536929 0.4521420
## 8   8 6.584585 0.5187462 5.097153 0.4927621 0.09437150 0.4651013
## 9   9 6.524093 0.5272520 5.028651 0.5449553 0.09969106 0.5007178
## 10 10 6.532138 0.5271451 5.036258 0.5654736 0.09938211 0.5078184
## 11 11 6.530362 0.5274103 5.060232 0.5631104 0.09792394 0.4993133
## 12 12 6.510896 0.5322052 5.057949 0.5518040 0.09870701 0.4733253
## 13 13 6.491657 0.5372839 5.056959 0.5524016 0.09809260 0.4650902
## 14 14 6.476135 0.5422905 5.038480 0.5186019 0.09415011 0.4389057
## 15 15 6.489995 0.5421655 5.052868 0.5433188 0.09687266 0.4535402
## 16 16 6.505787 0.5407957 5.049296 0.5492017 0.09597308 0.4611141
## 17 17 6.492769 0.5440470 5.048905 0.5358358 0.09359488 0.4555417
## 18 18 6.513894 0.5423501 5.060370 0.5280433 0.09428737 0.4446772
## 19 19 6.531310 0.5412766 5.074675 0.5217155 0.09244746 0.4399072
## 20 20 6.535247 0.5409758 5.087226 0.5094787 0.09245159 0.4399084

model_3$preProcess

## Created from 165 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)
##   - scaled (4)

nyc[1:4,2:5]

##   Food Decor Service East
## 1   22    18      20    0
## 2   20    19      19    0
## 3   21    13      18    0
## 4   20    20      17    0

predict(model_3,nyc[1:4,2:5])

## [1] 41.27778 38.78571 34.66667 40.57143

# he object returned by train also contains a preProcess attribute that contains our
# feature scaler.
scaled_data = predict(model_3$preProcess, nyc[1:4, 2:5])
scaled_data

##         Food      Decor    Service      East
## 1  0.7093251  0.1179645  0.2846613 -1.301762
## 2 -0.2952984  0.4852125 -0.1850299 -1.301762
## 3  0.2070133 -1.7182756 -0.6547211 -1.301762
## 4 -0.2952984  0.8524606 -1.1244122 -1.301762

predict(model_3$finalModel, scaled_data)

## [1] 41.27778 38.78571 34.66667 40.57143

# knn 장점 단순하고 효율적이다, 훈련단계가 빠르다.
# 적절한 k의 선택이 필요하다.
# 사례기반 학습하기 때문에 새로운 입력 패턴이 출력값을 요구할 때 까지 아무런
# 액션도 취하지 않는다. 게으른 학습, 어떠한 모형의 모수도 생성되지 않는다.
# k=하이퍼 파라미터, k값이 커질수록 전체 노이즈(이상치)가 줄어들면서 모델의 정확도 커짐
# 교차검증을 통해서 적합한 k값을 찾음
wbc<-read.csv("wbc.csv")
names(wbc)

##  [1] "id"                      "diagnosis"              
##  [3] "radius_mean"             "texture_mean"           
##  [5] "perimeter_mean"          "area_mean"              
##  [7] "smoothness_mean"         "compactness_mean"       
##  [9] "concavity_mean"          "concave.points_mean"    
## [11] "symmetry_mean"           "fractal_dimension_mean" 
## [13] "radius_se"               "texture_se"             
## [15] "perimeter_se"            "area_se"                
## [17] "smoothness_se"           "compactness_se"         
## [19] "concavity_se"            "concave.points_se"      
## [21] "symmetry_se"             "fractal_dimension_se"   
## [23] "radius_worst"            "texture_worst"          
## [25] "perimeter_worst"         "area_worst"             
## [27] "smoothness_worst"        "compactness_worst"      
## [29] "concavity_worst"         "concave.points_worst"   
## [31] "symmetry_worst"          "fractal_dimension_worst"
## [33] "X"

wbc$id <- NULL
wbc %>% glimpse

## Rows: 569
## Columns: 32
## $ diagnosis               <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "~
## $ radius_mean             <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450~
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9~
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, ~
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, ~
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0~
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0~
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0~
## $ concave.points_mean     <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0~
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087~
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0~
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345~
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902~
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18~
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.~
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114~
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246~
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0~
## $ concave.points_se       <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188~
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0~
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051~
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8~
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6~
## $ perimeter_worst         <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,~
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, ~
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791~
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249~
## $ concavity_worst         <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0~
## $ concave.points_worst    <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0~
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985~
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0~
## $ X                       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~

# id는 환자 아이디이기 때문에 모델에서 제외할 필요가 있다.
# 머신러닝 기법에서는 id 변수는 제외시켜야 한다. id 포함된 모델은 과적합 위험 
table(wbc$diagnosis)

## 
##   B   M 
## 357 212

# R 머신러닝 분류기는 목표 특징이 팩터로 코딩해야 한다.
wbc$diagnosis<-factor(wbc$diagnosis,levels=c("B","M"),
                         labels=c("Benign","Malignant"))
round(prop.table(table(wbc$diagnosis)),2)

## 
##    Benign Malignant 
##      0.63      0.37

wbc %>% select(radius_mean ,area_mean,smoothness_mean) %>% summary

##   radius_mean       area_mean      smoothness_mean  
##  Min.   : 6.981   Min.   : 143.5   Min.   :0.05263  
##  1st Qu.:11.700   1st Qu.: 420.3   1st Qu.:0.08637  
##  Median :13.370   Median : 551.1   Median :0.09587  
##  Mean   :14.127   Mean   : 654.9   Mean   :0.09636  
##  3rd Qu.:15.780   3rd Qu.: 782.7   3rd Qu.:0.10530  
##  Max.   :28.110   Max.   :2501.0   Max.   :0.16340

# knn 거리계산 입력 특징 측정, 척도에 종속되어 있다. 
# 특정값을 표준 범위로 재조정하고자 정규화 한다.

colSums(is.na(wbc))

##               diagnosis             radius_mean            texture_mean 
##                       0                       0                       0 
##          perimeter_mean               area_mean         smoothness_mean 
##                       0                       0                       0 
##        compactness_mean          concavity_mean     concave.points_mean 
##                       0                       0                       0 
##           symmetry_mean  fractal_dimension_mean               radius_se 
##                       0                       0                       0 
##              texture_se            perimeter_se                 area_se 
##                       0                       0                       0 
##           smoothness_se          compactness_se            concavity_se 
##                       0                       0                       0 
##       concave.points_se             symmetry_se    fractal_dimension_se 
##                       0                       0                       0 
##            radius_worst           texture_worst         perimeter_worst 
##                       0                       0                       0 
##              area_worst        smoothness_worst       compactness_worst 
##                       0                       0                       0 
##         concavity_worst    concave.points_worst          symmetry_worst 
##                       0                       0                       0 
## fractal_dimension_worst                       X 
##                       0                     569

wbc %>% select(-X)->wbc
names(wbc)

##  [1] "diagnosis"               "radius_mean"            
##  [3] "texture_mean"            "perimeter_mean"         
##  [5] "area_mean"               "smoothness_mean"        
##  [7] "compactness_mean"        "concavity_mean"         
##  [9] "concave.points_mean"     "symmetry_mean"          
## [11] "fractal_dimension_mean"  "radius_se"              
## [13] "texture_se"              "perimeter_se"           
## [15] "area_se"                 "smoothness_se"          
## [17] "compactness_se"          "concavity_se"           
## [19] "concave.points_se"       "symmetry_se"            
## [21] "fractal_dimension_se"    "radius_worst"           
## [23] "texture_worst"           "perimeter_worst"        
## [25] "area_worst"              "smoothness_worst"       
## [27] "compactness_worst"       "concavity_worst"        
## [29] "concave.points_worst"    "symmetry_worst"         
## [31] "fractal_dimension_worst"

idx<-createDataPartition(wbc$diagnosis,p=0.7,list=F)
train<-wbc[idx,]
test<-wbc[-idx,]

train %>% glimpse

## Rows: 399
## Columns: 31
## $ diagnosis               <fct> Malignant, Malignant, Malignant, Malignant, Ma~
## $ radius_mean             <dbl> 17.990, 20.570, 19.690, 12.450, 18.250, 13.000~
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 15.70, 19.98, 21.82, 23.2~
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 82.57, 119.60, 87.50, ~
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 477.1, 1040.0, 519.8, ~
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.12780, 0.09463, 0~
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.17000, 0.10900, 0~
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.15780, 0.11270, 0~
## $ concave.points_mean     <dbl> 0.14710, 0.07017, 0.12790, 0.08089, 0.07400, 0~
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2087, 0.1794, 0.2350~
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.07613, 0.05742, 0~
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.3345, 0.4467, 0.3063~
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 0.8902, 0.7732, 1.0020~
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 2.217, 3.180, 2.406, 2.46~
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.19, 53.91, 24.32, 40.~
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.007510, 0.0043~
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.033450, 0.0138~
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.03672, 0.02254, 0~
## $ concave.points_se       <dbl> 0.015870, 0.013400, 0.020580, 0.011370, 0.0103~
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.02165, 0.01369, 0~
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.005082, 0.0021~
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 15.47, 22.88, 15.49, 19.1~
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 23.75, 27.66, 30.73, 33.8~
## $ perimeter_worst         <dbl> 184.60, 158.80, 152.50, 103.40, 153.20, 106.20~
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 741.6, 1606.0, 739.3, ~
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.1791, 0.1442, 0.1703~
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.5249, 0.2576, 0.5401~
## $ concavity_worst         <dbl> 0.71190, 0.24160, 0.45040, 0.53550, 0.37840, 0~
## $ concave.points_worst    <dbl> 0.26540, 0.18600, 0.24300, 0.17410, 0.19320, 0~
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.3985, 0.3063, 0.4378~
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.12440, 0.08368, 0~

test %>% glimpse

## Rows: 170
## Columns: 31
## $ diagnosis               <fct> Malignant, Malignant, Malignant, Malignant, Ma~
## $ radius_mean             <dbl> 11.420, 20.290, 13.710, 12.460, 13.730, 16.130~
## $ texture_mean            <dbl> 20.38, 14.34, 20.83, 24.04, 22.61, 20.68, 22.1~
## $ perimeter_mean          <dbl> 77.58, 135.10, 90.20, 83.97, 93.60, 108.10, 13~
## $ area_mean               <dbl> 386.1, 1297.0, 577.9, 475.9, 578.3, 798.8, 126~
## $ smoothness_mean         <dbl> 0.14250, 0.10030, 0.11890, 0.11860, 0.11310, 0~
## $ compactness_mean        <dbl> 0.28390, 0.13280, 0.16450, 0.23960, 0.22930, 0~
## $ concavity_mean          <dbl> 0.24140, 0.19800, 0.09366, 0.22730, 0.21280, 0~
## $ concave.points_mean     <dbl> 0.105200, 0.104300, 0.059850, 0.085430, 0.0802~
## $ symmetry_mean           <dbl> 0.2597, 0.1809, 0.2196, 0.2030, 0.2069, 0.2164~
## $ fractal_dimension_mean  <dbl> 0.09744, 0.05883, 0.07451, 0.08243, 0.07682, 0~
## $ radius_se               <dbl> 0.4956, 0.7572, 0.5835, 0.2976, 0.2121, 0.5692~
## $ texture_se              <dbl> 1.1560, 0.7813, 1.3770, 1.5990, 1.1690, 1.0730~
## $ perimeter_se            <dbl> 3.445, 5.438, 3.856, 2.039, 2.061, 3.854, 5.86~
## $ area_se                 <dbl> 27.230, 94.440, 50.960, 23.940, 19.210, 54.180~
## $ smoothness_se           <dbl> 0.009110, 0.011490, 0.008805, 0.007149, 0.0064~
## $ compactness_se          <dbl> 0.074580, 0.024610, 0.030290, 0.072170, 0.0593~
## $ concavity_se            <dbl> 0.056610, 0.056880, 0.024880, 0.077430, 0.0550~
## $ concave.points_se       <dbl> 0.018670, 0.018850, 0.014480, 0.014320, 0.0162~
## $ symmetry_se             <dbl> 0.05963, 0.01756, 0.01486, 0.01789, 0.01961, 0~
## $ fractal_dimension_se    <dbl> 0.009208, 0.005115, 0.005412, 0.010080, 0.0080~
## $ radius_worst            <dbl> 14.910, 22.540, 17.060, 15.090, 15.030, 20.960~
## $ texture_worst           <dbl> 26.50, 16.67, 28.14, 40.68, 32.01, 31.48, 30.8~
## $ perimeter_worst         <dbl> 98.87, 152.20, 110.60, 97.65, 108.80, 136.80, ~
## $ area_worst              <dbl> 567.7, 1575.0, 897.0, 711.4, 697.7, 1315.0, 23~
## $ smoothness_worst        <dbl> 0.20980, 0.13740, 0.16540, 0.18530, 0.16510, 0~
## $ compactness_worst       <dbl> 0.86630, 0.20500, 0.36820, 1.05800, 0.77250, 0~
## $ concavity_worst         <dbl> 0.68690, 0.40000, 0.26780, 1.10500, 0.69430, 0~
## $ concave.points_worst    <dbl> 0.25750, 0.16250, 0.15560, 0.22100, 0.22080, 0~
## $ symmetry_worst          <dbl> 0.6638, 0.2364, 0.3196, 0.4366, 0.3596, 0.3706~
## $ fractal_dimension_worst <dbl> 0.17300, 0.07678, 0.11510, 0.20750, 0.14310, 0~

tc <- trainControl(method="cv", number=10)
train(diagnosis ~ ., wbc, method="knn",
      preProcess=c("center", "scale"), trControl=tc,metric="Accuracy")->knnfit
confusionMatrix(knnfit)

## Cross-Validated (10 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##            Reference
## Prediction  Benign Malignant
##   Benign      62.2       2.5
##   Malignant    0.5      34.8
##                             
##  Accuracy (average) : 0.9701

predict(knnfit,test,type='prob')->prefit1
predict(knnfit,test,type='raw')->prefit
NROW(prefit)

## [1] 170

class(prefit)

## [1] "factor"

class(test$diagnosis)

## [1] "factor"

levels(prefit)

## [1] "Benign"    "Malignant"

levels(test$diagnosis)

## [1] "Benign"    "Malignant"

confusionMatrix(prefit,test$diagnosis)

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign       106         1
##   Malignant      1        62
##                                           
##                Accuracy : 0.9882          
##                  95% CI : (0.9581, 0.9986)
##     No Information Rate : 0.6294          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9748          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9907          
##             Specificity : 0.9841          
##          Pos Pred Value : 0.9907          
##          Neg Pred Value : 0.9841          
##              Prevalence : 0.6294          
##          Detection Rate : 0.6235          
##    Detection Prevalence : 0.6294          
##       Balanced Accuracy : 0.9874          
##                                           
##        'Positive' Class : Benign          
##

library(pROC)
rffit2_num<-as.numeric(prefit)
rffit2_num

##   [1] 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 1 2 1 1 2 2 2 1 1 2 1 2 1 2 2 2 1 1 1 2 1 1
##  [38] 1 1 1 1 2 1 2 1 1 1 1 2 2 1 1 2 1 1 1 2 2 2 2 1 2 1 2 1 1 2 2 2 1 2 1 1 1
##  [75] 1 2 1 2 2 2 2 1 2 1 2 1 2 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1
## [112] 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 2 1 2 1 1 1 1 1 1
## [149] 1 2 2 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1

result<-roc(test$diagnosis,rffit2_num)

## Setting levels: control = Benign, case = Malignant
## Setting direction: controls < cases

result

## 
## Call:
## roc.default(response = test$diagnosis, predictor = rffit2_num)
## 
## Data: rffit2_num in 107 controls (test$diagnosis Benign) < 63 cases (test$diagnosis Malignant).
## Area under the curve: 0.9874

result$auc

## Area under the curve: 0.9874

bigdata_part04_final

kim kye chul

2022 6 6