# titanic data
library(dplyr)
## Warning: 패키지 'dplyr'는 R 버전 4.1.3에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(recipes)
## Warning: 패키지 'recipes'는 R 버전 4.1.3에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(caret)
## Warning: 패키지 'caret'는 R 버전 4.1.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: ggplot2
## Warning: 패키지 'ggplot2'는 R 버전 4.1.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: lattice
read.delim("titanic3.txt",header=TRUE,sep=",")->full
full %>% glimpse
## Rows: 1,309
## Columns: 14
## $ pclass <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ survived <int> 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, ~
## $ name <chr> "Allen, Miss. Elisabeth Walton", "Allison, Master. Hudson Tr~
## $ sex <chr> "female", "male", "female", "male", "female", "male", "femal~
## $ age <dbl> 29.00, 0.92, 2.00, 30.00, 25.00, 48.00, 63.00, 39.00, 53.00,~
## $ sibsp <int> 0, 1, 1, 1, 1, 0, 1, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ parch <int> 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ~
## $ ticket <chr> "24160", "113781", "113781", "113781", "113781", "19952", "1~
## $ fare <dbl> 211.3375, 151.5500, 151.5500, 151.5500, 151.5500, 26.5500, 7~
## $ cabin <chr> "B5", "C22 C26", "C22 C26", "C22 C26", "C22 C26", "E12", "D7~
## $ embarked <chr> "S", "S", "S", "S", "S", "S", "S", "S", "S", "C", "C", "C", ~
## $ boat <chr> "2", "11", "", "", "", "3", "10", "", "D", "", "", "4", "9",~
## $ body <int> NA, NA, NA, 135, NA, NA, NA, NA, NA, 22, 124, NA, NA, NA, NA~
## $ home.dest <chr> "St Louis, MO", "Montreal, PQ / Chesterville, ON", "Montreal~
#.1 데이터 분할
train_list<-createDataPartition(full$survived,p=0.7,list=FALSE)
full_train<-full[train_list,]
full_test<-full[-train_list,]
NROW(full_train)
## [1] 917
NROW(full_test)
## [1] 392
train<-full_train
test<-full_test
train %>% mutate(index='train')->train
test %>% mutate(index='test')->test
bind_rows(train,test)->full
select <- dplyr::select
full %>% select(-boat,-body,-home.dest)->full
full %>% glimpse
## Rows: 1,309
## Columns: 12
## $ pclass <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ survived <int> 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1~
## $ name <chr> "Allen, Miss. Elisabeth Walton", "Allison, Master. Hudson Tre~
## $ sex <chr> "female", "male", "female", "male", "female", "male", "female~
## $ age <dbl> 29.00, 0.92, 2.00, 30.00, 25.00, 48.00, 63.00, 71.00, 18.00, ~
## $ sibsp <int> 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0~
## $ parch <int> 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0~
## $ ticket <chr> "24160", "113781", "113781", "113781", "113781", "19952", "13~
## $ fare <dbl> 211.3375, 151.5500, 151.5500, 151.5500, 151.5500, 26.5500, 77~
## $ cabin <chr> "B5", "C22 C26", "C22 C26", "C22 C26", "C22 C26", "E12", "D7"~
## $ embarked <chr> "S", "S", "S", "S", "S", "S", "S", "C", "C", "S", "S", "C", "~
## $ index <chr> "train", "train", "train", "train", "train", "train", "train"~
# select <- dplyr::select<= select는 dplyr 패키지의 select 임을 명시적으로 지정
# unused arguments
# 2. 목표변수, 기타변수 변환
full$survived<-ifelse(full$survived==0,"생존","사망")
full$survived<-as.factor(full$survived)
full$pclass<-as.factor(full$pclass)
full$sex<-as.factor(full$sex)
full$embarked<-as.factor(full$embarked) # 탑승지
# 3. 결측치 확인
colSums(is.na(full))
## pclass survived name sex age sibsp parch ticket
## 0 0 0 0 263 0 0 0
## fare cabin embarked index
## 1 0 0 0
summary(full)
## pclass survived name sex age
## 1:323 사망:500 Length:1309 female:466 Min. : 0.17
## 2:277 생존:809 Class :character male :843 1st Qu.:21.00
## 3:709 Mode :character Median :28.00
## Mean :29.88
## 3rd Qu.:39.00
## Max. :80.00
## NA's :263
## sibsp parch ticket fare
## Min. :0.0000 Min. :0.000 Length:1309 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.:0.000 Class :character 1st Qu.: 7.896
## Median :0.0000 Median :0.000 Mode :character Median : 14.454
## Mean :0.4989 Mean :0.385 Mean : 33.295
## 3rd Qu.:1.0000 3rd Qu.:0.000 3rd Qu.: 31.275
## Max. :8.0000 Max. :9.000 Max. :512.329
## NA's :1
## cabin embarked index
## Length:1309 : 2 Length:1309
## Class :character C:270 Class :character
## Mode :character Q:123 Mode :character
## S:914
##
##
##
table(full$embarked)
##
## C Q S
## 2 270 123 914
levels(full$embarked)[1]<-NA
# table( ) 함수는 NA 값을 제외하고 값을 출력시키므로, useNA에
# always를 지정해 NA에 대한 개수도 출력하도록 하여 빈도를 확인했다.
# 지정해 NA에 대한 개수도 출력하도록 하여 빈도를 확인했다.
table(full$embarked,useNA="always")
##
## C Q S <NA>
## 270 123 914 2
full %>% filter(!is.na(age)&!is.na(fare)&!is.na(embarked))->full
colSums(is.na(full))
## pclass survived name sex age sibsp parch ticket
## 0 0 0 0 0 0 0 0
## fare cabin embarked index
## 0 0 0 0
# 4 데이터 전처리
recipe(survived~.,data=full) %>% step_YeoJohnson(age,sibsp,parch,fare) %>%
step_center(age,sibsp,parch,fare) %>%
step_scale(age,sibsp,parch,fare) %>%
prep() %>% juice()->data
data %>% filter(index=="train") %>% select(-index,-name,-ticket,-cabin)->train
data %>% filter(index=='test') %>% select(-index,-name,-ticket,-cabin)->test
ctrl<-trainControl(method="cv",summaryFunction = twoClassSummary,
classProbs = TRUE)
ctrl # 10-fold cv
## $method
## [1] "cv"
##
## $number
## [1] 10
##
## $repeats
## [1] NA
##
## $search
## [1] "grid"
##
## $p
## [1] 0.75
##
## $initialWindow
## NULL
##
## $horizon
## [1] 1
##
## $fixedWindow
## [1] TRUE
##
## $skip
## [1] 0
##
## $verboseIter
## [1] FALSE
##
## $returnData
## [1] TRUE
##
## $returnResamp
## [1] "final"
##
## $savePredictions
## [1] FALSE
##
## $classProbs
## [1] TRUE
##
## $summaryFunction
## function (data, lev = NULL, model = NULL)
## {
## if (length(lev) > 2) {
## stop(paste("Your outcome has", length(lev), "levels. The twoClassSummary() function isn't appropriate."))
## }
## requireNamespaceQuietStop("pROC")
## if (!all(levels(data[, "pred"]) == lev)) {
## stop("levels of observed and predicted data do not match")
## }
## rocObject <- try(pROC::roc(data$obs, data[, lev[1]], direction = ">",
## quiet = TRUE), silent = TRUE)
## rocAUC <- if (inherits(rocObject, "try-error"))
## NA
## else rocObject$auc
## out <- c(rocAUC, sensitivity(data[, "pred"], data[, "obs"],
## lev[1]), specificity(data[, "pred"], data[, "obs"], lev[2]))
## names(out) <- c("ROC", "Sens", "Spec")
## out
## }
## <bytecode: 0x000000002d51a618>
## <environment: namespace:caret>
##
## $selectionFunction
## [1] "best"
##
## $preProcOptions
## $preProcOptions$thresh
## [1] 0.95
##
## $preProcOptions$ICAcomp
## [1] 3
##
## $preProcOptions$k
## [1] 5
##
## $preProcOptions$freqCut
## [1] 19
##
## $preProcOptions$uniqueCut
## [1] 10
##
## $preProcOptions$cutoff
## [1] 0.9
##
##
## $sampling
## NULL
##
## $index
## NULL
##
## $indexOut
## NULL
##
## $indexFinal
## NULL
##
## $timingSamps
## [1] 0
##
## $predictionBounds
## [1] FALSE FALSE
##
## $seeds
## [1] NA
##
## $adaptive
## $adaptive$min
## [1] 5
##
## $adaptive$alpha
## [1] 0.05
##
## $adaptive$method
## [1] "gls"
##
## $adaptive$complete
## [1] TRUE
##
##
## $trim
## [1] FALSE
##
## $allowParallel
## [1] TRUE
train(survived~.,data=train,
method="rpart",metric='ROC',
trControl=ctrl)->rffit
rffit
## CART
##
## 737 samples
## 7 predictor
## 2 classes: '사망', '생존'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 664, 663, 662, 663, 664, 664, ...
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.02159468 0.7840223 0.6416129 0.8719345
## 0.02657807 0.7505421 0.6674194 0.8396934
## 0.44186047 0.6303982 0.3633333 0.8974630
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.02159468.
confusionMatrix(rffit)
## Cross-Validated (10 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction 사망 생존
## 사망 26.2 7.6
## 생존 14.7 51.6
##
## Accuracy (average) : 0.7775
predict(rffit,test,type="prob")->rffit1
predict(rffit,test,type="raw")->rffit2
confusionMatrix(rffit2,test$survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 사망 생존
## 사망 96 31
## 생존 28 151
##
## Accuracy : 0.8072
## 95% CI : (0.7585, 0.8499)
## No Information Rate : 0.5948
## P-Value [Acc > NIR] : 1.654e-15
##
## Kappa : 0.6015
##
## Mcnemar's Test P-Value : 0.7946
##
## Sensitivity : 0.7742
## Specificity : 0.8297
## Pos Pred Value : 0.7559
## Neg Pred Value : 0.8436
## Prevalence : 0.4052
## Detection Rate : 0.3137
## Detection Prevalence : 0.4150
## Balanced Accuracy : 0.8019
##
## 'Positive' Class : 사망
##
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## 다음의 패키지를 부착합니다: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
rffit2_num<-as.numeric(rffit2)
rffit2_num
## [1] 2 1 2 1 2 1 2 2 1 2 1 1 2 1 1 2 2 1 2 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1
## [38] 2 2 2 1 2 2 2 2 1 2 1 1 1 2 2 1 2 2 2 2 2 1 1 2 2 2 1 1 1 1 2 2 2 2 2 1 2
## [75] 1 2 1 2 1 2 1 1 2 2 2 1 2 2 2 2 2 2 2 1 2 1 2 2 1 1 1 2 2 1 1 2 1 2 1 1 2
## [112] 2 2 2 2 2 1 1 2 2 1 2 2 2 2 1 1 1 2 1 2 1 1 2 2 1 1 1 2 2 1 1 2 1 2 1 1 2
## [149] 2 2 1 1 2 1 2 2 1 2 1 1 2 2 2 2 1 1 1 1 2 2 2 2 1 1 2 1 2 2 2 1 1 1 2 2 2
## [186] 2 2 2 1 1 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 1 1 1 2 2 2 2 2 2 2 1 2 1 2 2 2 1
## [223] 2 1 1 2 2 1 2 1 1 1 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 1 1 1 2 2 2 2 2 1 1
## [260] 1 2 2 1 1 2 2 1 2 1 1 2 1 2 2 2 2 2 2 1 2 2 1 2 2 1 2 2 2 2 1 1 2 2 2 2 2
## [297] 1 1 2 2 1 2 2 2 1 1
result<-roc(test$survived,rffit2_num)
## Setting levels: control = 사망, case = 생존
## Setting direction: controls < cases
result
##
## Call:
## roc.default(response = test$survived, predictor = rffit2_num)
##
## Data: rffit2_num in 124 controls (test$survived 사망) < 182 cases (test$survived 생존).
## Area under the curve: 0.8019
result$auc
## Area under the curve: 0.8019
# 4.목표변수가 연속형,RMSE,R^2
# RMSE는 예측값-관측값의 제곱된 갑의 평균에 루트
# 낮을수록 더 적합한 모델로 평가
# 결정계수(R^2) 데이터 예측의 정확도 성능 0~1 범위
df<-read.csv("nyc.csv")
# Our goal is to create a regression model with price as the response and some
# combination of the other five variables as predictors.
df %>% glimpse
## Rows: 165
## Columns: 9
## $ Case <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, ~
## $ Restaurant <chr> "Daniella Ristorante", "Tello's Ristorante", "Biricchino", ~
## $ Price <int> 43, 32, 34, 41, 54, 52, 34, 34, 39, 44, 45, 47, 52, 35, 47,~
## $ Food <int> 22, 20, 21, 20, 24, 22, 22, 20, 22, 21, 19, 21, 21, 19, 20,~
## $ Decor <int> 18, 19, 13, 20, 19, 22, 16, 18, 19, 17, 17, 19, 19, 17, 18,~
## $ Service <int> 20, 19, 18, 17, 21, 21, 21, 21, 22, 19, 20, 21, 20, 19, 21,~
## $ East <int> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ latitude <dbl> 40.74683, 40.74342, 40.74886, 40.74848, 40.73958, 40.74069,~
## $ longitude <dbl> -73.99676, -73.99954, -73.99552, -74.00331, -73.99591, -73.~
# case,id 변수는 과적합을 유발, 반드시 제외하고 분석
nyc<-df %>% select(3:7)
summary(nyc)
## Price Food Decor Service
## Min. :19.00 Min. :16.00 Min. : 6.00 Min. :14.00
## 1st Qu.:36.00 1st Qu.:19.00 1st Qu.:16.00 1st Qu.:18.00
## Median :43.00 Median :21.00 Median :18.00 Median :20.00
## Mean :42.67 Mean :20.59 Mean :17.68 Mean :19.39
## 3rd Qu.:50.00 3rd Qu.:22.00 3rd Qu.:19.00 3rd Qu.:21.00
## Max. :65.00 Max. :25.00 Max. :25.00 Max. :24.00
## East
## Min. :0.0000
## 1st Qu.:0.0000
## Median :1.0000
## Mean :0.6303
## 3rd Qu.:1.0000
## Max. :1.0000
# Example 1: Ordinary Least Squares Regression
model_1 <- train(Price ~ ., nyc, method="lm")
summary(model_1)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.9993 -3.8909 0.1715 3.3611 17.8236
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -24.244248 4.758009 -5.095 9.70e-07 ***
## Food 1.553922 0.372672 4.170 4.98e-05 ***
## Decor 1.897926 0.219100 8.662 4.79e-15 ***
## Service 0.005318 0.399493 0.013 0.9894
## East 2.003432 0.961794 2.083 0.0388 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.776 on 160 degrees of freedom
## Multiple R-squared: 0.6275, Adjusted R-squared: 0.6181
## F-statistic: 67.37 on 4 and 160 DF, p-value: < 2.2e-16
predictions <-predict(model_1, nyc)
head(predictions)
## 1 2 3 4 5 6
## 44.21108 42.99585 33.15689 44.88314 49.22217 51.80811
SSE <- sum((nyc$Price - predictions)^2) # 설명된 변동
SST <- sum((nyc$Price - mean(nyc$Price))^2) # (yi-y바)^2 합, 개별 y의 편차제곱의합
r2 <- 1 - SSE / SST
r2
## [1] 0.6274519
# Example 2: Using CV to Estimate Out-of-Sample r-Squared
# This is done by specifying a trainControl argument.
set.seed(1)
tc <- trainControl(method="cv", number=10)
model_2 <- train(Price ~ ., nyc, method="lm", trControl=tc)
summary(model_2)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.9993 -3.8909 0.1715 3.3611 17.8236
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -24.244248 4.758009 -5.095 9.70e-07 ***
## Food 1.553922 0.372672 4.170 4.98e-05 ***
## Decor 1.897926 0.219100 8.662 4.79e-15 ***
## Service 0.005318 0.399493 0.013 0.9894
## East 2.003432 0.961794 2.083 0.0388 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.776 on 160 degrees of freedom
## Multiple R-squared: 0.6275, Adjusted R-squared: 0.6181
## F-statistic: 67.37 on 4 and 160 DF, p-value: < 2.2e-16
model_2
## Linear Regression
##
## 165 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 150, 148, 148, 147, 149, 149, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 5.732494 0.6139411 4.564717
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# We can view the metrics for each of the 10 folds through the model’s resample attribut
model_2$resample
## RMSE Rsquared MAE Resample
## 1 4.497467 0.8709394 3.658307 Fold01
## 2 4.097291 0.8456047 3.057666 Fold02
## 3 8.324965 0.3866326 6.572674 Fold03
## 4 5.134052 0.5815749 4.015348 Fold04
## 5 5.379182 0.5928145 4.085217 Fold05
## 6 5.469966 0.6578463 4.436465 Fold06
## 7 5.004219 0.6500137 4.164401 Fold07
## 8 6.020422 0.4652042 5.505438 Fold08
## 9 6.202349 0.6238497 4.562988 Fold09
## 10 7.195027 0.4649314 5.588667 Fold10
# Example 3: Using Cross-Validation for Hyperparameter Tuning
set.seed(1)
tc <- trainControl(method="cv", number=10,
selectionFunction="Rsquared")
param_grid <- expand.grid(k = c(1:20))
model_3 <- train(Price ~ ., nyc, method="knn",
preProcess=c("center", "scale"),
tuneGrid=param_grid, trainControl=tc,
metric="RMSE") #"Rsquared"
model_3
## k-Nearest Neighbors
##
## 165 samples
## 4 predictor
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 165, 165, 165, 165, 165, 165, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 8.223940 0.3658463 6.526515
## 2 7.681634 0.4097964 6.055992
## 3 7.252432 0.4503394 5.724564
## 4 6.937049 0.4822027 5.487844
## 5 6.815849 0.4923156 5.321114
## 6 6.699129 0.5068218 5.210046
## 7 6.631676 0.5131660 5.157253
## 8 6.584585 0.5187462 5.097153
## 9 6.524093 0.5272520 5.028651
## 10 6.532138 0.5271451 5.036258
## 11 6.530362 0.5274103 5.060232
## 12 6.510896 0.5322052 5.057949
## 13 6.491657 0.5372839 5.056959
## 14 6.476135 0.5422905 5.038480
## 15 6.489995 0.5421655 5.052868
## 16 6.505787 0.5407957 5.049296
## 17 6.492769 0.5440470 5.048905
## 18 6.513894 0.5423501 5.060370
## 19 6.531310 0.5412766 5.074675
## 20 6.535247 0.5409758 5.087226
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 14.
model_4 <- train(Price ~ ., nyc, method="knn",
preProcess=c("center", "scale"),
tuneGrid=param_grid, trainControl=tc,
metric="Rsquared")
model_4
## k-Nearest Neighbors
##
## 165 samples
## 4 predictor
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 165, 165, 165, 165, 165, 165, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 8.236745 0.3483429 6.546623
## 2 7.668465 0.3897857 6.084771
## 3 7.379658 0.4166542 5.861675
## 4 7.167891 0.4363930 5.686774
## 5 6.913186 0.4610130 5.476801
## 6 6.835730 0.4681090 5.385797
## 7 6.750958 0.4762047 5.272789
## 8 6.695236 0.4803363 5.213516
## 9 6.634529 0.4872808 5.172904
## 10 6.599362 0.4915494 5.138621
## 11 6.568724 0.4954858 5.105011
## 12 6.526740 0.5014081 5.080169
## 13 6.508919 0.5033565 5.073107
## 14 6.486646 0.5077315 5.050160
## 15 6.465243 0.5113948 5.027572
## 16 6.484219 0.5087875 5.041698
## 17 6.463362 0.5128351 5.016095
## 18 6.461651 0.5142957 5.016941
## 19 6.473544 0.5130788 5.029638
## 20 6.482878 0.5121549 5.038150
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was k = 18.
# KNN 알고리즘은 classification 문제를 푸는데 유용한 알고리즘이지만, 동시에 regression
# 문제도 풀 수 있습니다.
model_3
## k-Nearest Neighbors
##
## 165 samples
## 4 predictor
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 165, 165, 165, 165, 165, 165, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 8.223940 0.3658463 6.526515
## 2 7.681634 0.4097964 6.055992
## 3 7.252432 0.4503394 5.724564
## 4 6.937049 0.4822027 5.487844
## 5 6.815849 0.4923156 5.321114
## 6 6.699129 0.5068218 5.210046
## 7 6.631676 0.5131660 5.157253
## 8 6.584585 0.5187462 5.097153
## 9 6.524093 0.5272520 5.028651
## 10 6.532138 0.5271451 5.036258
## 11 6.530362 0.5274103 5.060232
## 12 6.510896 0.5322052 5.057949
## 13 6.491657 0.5372839 5.056959
## 14 6.476135 0.5422905 5.038480
## 15 6.489995 0.5421655 5.052868
## 16 6.505787 0.5407957 5.049296
## 17 6.492769 0.5440470 5.048905
## 18 6.513894 0.5423501 5.060370
## 19 6.531310 0.5412766 5.074675
## 20 6.535247 0.5409758 5.087226
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 14.
model_3$bestTune
## k
## 14 14
model_3$results
## k RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 8.223940 0.3658463 6.526515 0.7719792 0.10552733 0.5959375
## 2 2 7.681634 0.4097964 6.055992 0.6395965 0.10020021 0.5026869
## 3 3 7.252432 0.4503394 5.724564 0.5375280 0.09156403 0.4853829
## 4 4 6.937049 0.4822027 5.487844 0.4663403 0.08719568 0.4130118
## 5 5 6.815849 0.4923156 5.321114 0.4662337 0.09179709 0.4024573
## 6 6 6.699129 0.5068218 5.210046 0.4858122 0.09695271 0.4560387
## 7 7 6.631676 0.5131660 5.157253 0.4847689 0.09536929 0.4521420
## 8 8 6.584585 0.5187462 5.097153 0.4927621 0.09437150 0.4651013
## 9 9 6.524093 0.5272520 5.028651 0.5449553 0.09969106 0.5007178
## 10 10 6.532138 0.5271451 5.036258 0.5654736 0.09938211 0.5078184
## 11 11 6.530362 0.5274103 5.060232 0.5631104 0.09792394 0.4993133
## 12 12 6.510896 0.5322052 5.057949 0.5518040 0.09870701 0.4733253
## 13 13 6.491657 0.5372839 5.056959 0.5524016 0.09809260 0.4650902
## 14 14 6.476135 0.5422905 5.038480 0.5186019 0.09415011 0.4389057
## 15 15 6.489995 0.5421655 5.052868 0.5433188 0.09687266 0.4535402
## 16 16 6.505787 0.5407957 5.049296 0.5492017 0.09597308 0.4611141
## 17 17 6.492769 0.5440470 5.048905 0.5358358 0.09359488 0.4555417
## 18 18 6.513894 0.5423501 5.060370 0.5280433 0.09428737 0.4446772
## 19 19 6.531310 0.5412766 5.074675 0.5217155 0.09244746 0.4399072
## 20 20 6.535247 0.5409758 5.087226 0.5094787 0.09245159 0.4399084
model_3$preProcess
## Created from 165 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
## - scaled (4)
nyc[1:4,2:5]
## Food Decor Service East
## 1 22 18 20 0
## 2 20 19 19 0
## 3 21 13 18 0
## 4 20 20 17 0
predict(model_3,nyc[1:4,2:5])
## [1] 41.27778 38.78571 34.66667 40.57143
# he object returned by train also contains a preProcess attribute that contains our
# feature scaler.
scaled_data = predict(model_3$preProcess, nyc[1:4, 2:5])
scaled_data
## Food Decor Service East
## 1 0.7093251 0.1179645 0.2846613 -1.301762
## 2 -0.2952984 0.4852125 -0.1850299 -1.301762
## 3 0.2070133 -1.7182756 -0.6547211 -1.301762
## 4 -0.2952984 0.8524606 -1.1244122 -1.301762
predict(model_3$finalModel, scaled_data)
## [1] 41.27778 38.78571 34.66667 40.57143
# knn 장점 단순하고 효율적이다, 훈련단계가 빠르다.
# 적절한 k의 선택이 필요하다.
# 사례기반 학습하기 때문에 새로운 입력 패턴이 출력값을 요구할 때 까지 아무런
# 액션도 취하지 않는다. 게으른 학습, 어떠한 모형의 모수도 생성되지 않는다.
# k=하이퍼 파라미터, k값이 커질수록 전체 노이즈(이상치)가 줄어들면서 모델의 정확도 커짐
# 교차검증을 통해서 적합한 k값을 찾음
wbc<-read.csv("wbc.csv")
names(wbc)
## [1] "id" "diagnosis"
## [3] "radius_mean" "texture_mean"
## [5] "perimeter_mean" "area_mean"
## [7] "smoothness_mean" "compactness_mean"
## [9] "concavity_mean" "concave.points_mean"
## [11] "symmetry_mean" "fractal_dimension_mean"
## [13] "radius_se" "texture_se"
## [15] "perimeter_se" "area_se"
## [17] "smoothness_se" "compactness_se"
## [19] "concavity_se" "concave.points_se"
## [21] "symmetry_se" "fractal_dimension_se"
## [23] "radius_worst" "texture_worst"
## [25] "perimeter_worst" "area_worst"
## [27] "smoothness_worst" "compactness_worst"
## [29] "concavity_worst" "concave.points_worst"
## [31] "symmetry_worst" "fractal_dimension_worst"
## [33] "X"
wbc$id <- NULL
wbc %>% glimpse
## Rows: 569
## Columns: 32
## $ diagnosis <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "~
## $ radius_mean <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450~
## $ texture_mean <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9~
## $ perimeter_mean <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, ~
## $ area_mean <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, ~
## $ smoothness_mean <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0~
## $ compactness_mean <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0~
## $ concavity_mean <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0~
## $ concave.points_mean <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0~
## $ symmetry_mean <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087~
## $ fractal_dimension_mean <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0~
## $ radius_se <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345~
## $ texture_se <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902~
## $ perimeter_se <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18~
## $ area_se <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.~
## $ smoothness_se <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114~
## $ compactness_se <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246~
## $ concavity_se <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0~
## $ concave.points_se <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188~
## $ symmetry_se <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0~
## $ fractal_dimension_se <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051~
## $ radius_worst <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8~
## $ texture_worst <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6~
## $ perimeter_worst <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,~
## $ area_worst <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, ~
## $ smoothness_worst <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791~
## $ compactness_worst <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249~
## $ concavity_worst <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0~
## $ concave.points_worst <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0~
## $ symmetry_worst <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985~
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0~
## $ X <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
# id는 환자 아이디이기 때문에 모델에서 제외할 필요가 있다.
# 머신러닝 기법에서는 id 변수는 제외시켜야 한다. id 포함된 모델은 과적합 위험
table(wbc$diagnosis)
##
## B M
## 357 212
# R 머신러닝 분류기는 목표 특징이 팩터로 코딩해야 한다.
wbc$diagnosis<-factor(wbc$diagnosis,levels=c("B","M"),
labels=c("Benign","Malignant"))
round(prop.table(table(wbc$diagnosis)),2)
##
## Benign Malignant
## 0.63 0.37
wbc %>% select(radius_mean ,area_mean,smoothness_mean) %>% summary
## radius_mean area_mean smoothness_mean
## Min. : 6.981 Min. : 143.5 Min. :0.05263
## 1st Qu.:11.700 1st Qu.: 420.3 1st Qu.:0.08637
## Median :13.370 Median : 551.1 Median :0.09587
## Mean :14.127 Mean : 654.9 Mean :0.09636
## 3rd Qu.:15.780 3rd Qu.: 782.7 3rd Qu.:0.10530
## Max. :28.110 Max. :2501.0 Max. :0.16340
# knn 거리계산 입력 특징 측정, 척도에 종속되어 있다.
# 특정값을 표준 범위로 재조정하고자 정규화 한다.
colSums(is.na(wbc))
## diagnosis radius_mean texture_mean
## 0 0 0
## perimeter_mean area_mean smoothness_mean
## 0 0 0
## compactness_mean concavity_mean concave.points_mean
## 0 0 0
## symmetry_mean fractal_dimension_mean radius_se
## 0 0 0
## texture_se perimeter_se area_se
## 0 0 0
## smoothness_se compactness_se concavity_se
## 0 0 0
## concave.points_se symmetry_se fractal_dimension_se
## 0 0 0
## radius_worst texture_worst perimeter_worst
## 0 0 0
## area_worst smoothness_worst compactness_worst
## 0 0 0
## concavity_worst concave.points_worst symmetry_worst
## 0 0 0
## fractal_dimension_worst X
## 0 569
wbc %>% select(-X)->wbc
names(wbc)
## [1] "diagnosis" "radius_mean"
## [3] "texture_mean" "perimeter_mean"
## [5] "area_mean" "smoothness_mean"
## [7] "compactness_mean" "concavity_mean"
## [9] "concave.points_mean" "symmetry_mean"
## [11] "fractal_dimension_mean" "radius_se"
## [13] "texture_se" "perimeter_se"
## [15] "area_se" "smoothness_se"
## [17] "compactness_se" "concavity_se"
## [19] "concave.points_se" "symmetry_se"
## [21] "fractal_dimension_se" "radius_worst"
## [23] "texture_worst" "perimeter_worst"
## [25] "area_worst" "smoothness_worst"
## [27] "compactness_worst" "concavity_worst"
## [29] "concave.points_worst" "symmetry_worst"
## [31] "fractal_dimension_worst"
idx<-createDataPartition(wbc$diagnosis,p=0.7,list=F)
train<-wbc[idx,]
test<-wbc[-idx,]
train %>% glimpse
## Rows: 399
## Columns: 31
## $ diagnosis <fct> Malignant, Malignant, Malignant, Malignant, Ma~
## $ radius_mean <dbl> 17.990, 20.570, 19.690, 12.450, 18.250, 13.000~
## $ texture_mean <dbl> 10.38, 17.77, 21.25, 15.70, 19.98, 21.82, 23.2~
## $ perimeter_mean <dbl> 122.80, 132.90, 130.00, 82.57, 119.60, 87.50, ~
## $ area_mean <dbl> 1001.0, 1326.0, 1203.0, 477.1, 1040.0, 519.8, ~
## $ smoothness_mean <dbl> 0.11840, 0.08474, 0.10960, 0.12780, 0.09463, 0~
## $ compactness_mean <dbl> 0.27760, 0.07864, 0.15990, 0.17000, 0.10900, 0~
## $ concavity_mean <dbl> 0.30010, 0.08690, 0.19740, 0.15780, 0.11270, 0~
## $ concave.points_mean <dbl> 0.14710, 0.07017, 0.12790, 0.08089, 0.07400, 0~
## $ symmetry_mean <dbl> 0.2419, 0.1812, 0.2069, 0.2087, 0.1794, 0.2350~
## $ fractal_dimension_mean <dbl> 0.07871, 0.05667, 0.05999, 0.07613, 0.05742, 0~
## $ radius_se <dbl> 1.0950, 0.5435, 0.7456, 0.3345, 0.4467, 0.3063~
## $ texture_se <dbl> 0.9053, 0.7339, 0.7869, 0.8902, 0.7732, 1.0020~
## $ perimeter_se <dbl> 8.589, 3.398, 4.585, 2.217, 3.180, 2.406, 2.46~
## $ area_se <dbl> 153.40, 74.08, 94.03, 27.19, 53.91, 24.32, 40.~
## $ smoothness_se <dbl> 0.006399, 0.005225, 0.006150, 0.007510, 0.0043~
## $ compactness_se <dbl> 0.049040, 0.013080, 0.040060, 0.033450, 0.0138~
## $ concavity_se <dbl> 0.05373, 0.01860, 0.03832, 0.03672, 0.02254, 0~
## $ concave.points_se <dbl> 0.015870, 0.013400, 0.020580, 0.011370, 0.0103~
## $ symmetry_se <dbl> 0.03003, 0.01389, 0.02250, 0.02165, 0.01369, 0~
## $ fractal_dimension_se <dbl> 0.006193, 0.003532, 0.004571, 0.005082, 0.0021~
## $ radius_worst <dbl> 25.38, 24.99, 23.57, 15.47, 22.88, 15.49, 19.1~
## $ texture_worst <dbl> 17.33, 23.41, 25.53, 23.75, 27.66, 30.73, 33.8~
## $ perimeter_worst <dbl> 184.60, 158.80, 152.50, 103.40, 153.20, 106.20~
## $ area_worst <dbl> 2019.0, 1956.0, 1709.0, 741.6, 1606.0, 739.3, ~
## $ smoothness_worst <dbl> 0.1622, 0.1238, 0.1444, 0.1791, 0.1442, 0.1703~
## $ compactness_worst <dbl> 0.6656, 0.1866, 0.4245, 0.5249, 0.2576, 0.5401~
## $ concavity_worst <dbl> 0.71190, 0.24160, 0.45040, 0.53550, 0.37840, 0~
## $ concave.points_worst <dbl> 0.26540, 0.18600, 0.24300, 0.17410, 0.19320, 0~
## $ symmetry_worst <dbl> 0.4601, 0.2750, 0.3613, 0.3985, 0.3063, 0.4378~
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.12440, 0.08368, 0~
test %>% glimpse
## Rows: 170
## Columns: 31
## $ diagnosis <fct> Malignant, Malignant, Malignant, Malignant, Ma~
## $ radius_mean <dbl> 11.420, 20.290, 13.710, 12.460, 13.730, 16.130~
## $ texture_mean <dbl> 20.38, 14.34, 20.83, 24.04, 22.61, 20.68, 22.1~
## $ perimeter_mean <dbl> 77.58, 135.10, 90.20, 83.97, 93.60, 108.10, 13~
## $ area_mean <dbl> 386.1, 1297.0, 577.9, 475.9, 578.3, 798.8, 126~
## $ smoothness_mean <dbl> 0.14250, 0.10030, 0.11890, 0.11860, 0.11310, 0~
## $ compactness_mean <dbl> 0.28390, 0.13280, 0.16450, 0.23960, 0.22930, 0~
## $ concavity_mean <dbl> 0.24140, 0.19800, 0.09366, 0.22730, 0.21280, 0~
## $ concave.points_mean <dbl> 0.105200, 0.104300, 0.059850, 0.085430, 0.0802~
## $ symmetry_mean <dbl> 0.2597, 0.1809, 0.2196, 0.2030, 0.2069, 0.2164~
## $ fractal_dimension_mean <dbl> 0.09744, 0.05883, 0.07451, 0.08243, 0.07682, 0~
## $ radius_se <dbl> 0.4956, 0.7572, 0.5835, 0.2976, 0.2121, 0.5692~
## $ texture_se <dbl> 1.1560, 0.7813, 1.3770, 1.5990, 1.1690, 1.0730~
## $ perimeter_se <dbl> 3.445, 5.438, 3.856, 2.039, 2.061, 3.854, 5.86~
## $ area_se <dbl> 27.230, 94.440, 50.960, 23.940, 19.210, 54.180~
## $ smoothness_se <dbl> 0.009110, 0.011490, 0.008805, 0.007149, 0.0064~
## $ compactness_se <dbl> 0.074580, 0.024610, 0.030290, 0.072170, 0.0593~
## $ concavity_se <dbl> 0.056610, 0.056880, 0.024880, 0.077430, 0.0550~
## $ concave.points_se <dbl> 0.018670, 0.018850, 0.014480, 0.014320, 0.0162~
## $ symmetry_se <dbl> 0.05963, 0.01756, 0.01486, 0.01789, 0.01961, 0~
## $ fractal_dimension_se <dbl> 0.009208, 0.005115, 0.005412, 0.010080, 0.0080~
## $ radius_worst <dbl> 14.910, 22.540, 17.060, 15.090, 15.030, 20.960~
## $ texture_worst <dbl> 26.50, 16.67, 28.14, 40.68, 32.01, 31.48, 30.8~
## $ perimeter_worst <dbl> 98.87, 152.20, 110.60, 97.65, 108.80, 136.80, ~
## $ area_worst <dbl> 567.7, 1575.0, 897.0, 711.4, 697.7, 1315.0, 23~
## $ smoothness_worst <dbl> 0.20980, 0.13740, 0.16540, 0.18530, 0.16510, 0~
## $ compactness_worst <dbl> 0.86630, 0.20500, 0.36820, 1.05800, 0.77250, 0~
## $ concavity_worst <dbl> 0.68690, 0.40000, 0.26780, 1.10500, 0.69430, 0~
## $ concave.points_worst <dbl> 0.25750, 0.16250, 0.15560, 0.22100, 0.22080, 0~
## $ symmetry_worst <dbl> 0.6638, 0.2364, 0.3196, 0.4366, 0.3596, 0.3706~
## $ fractal_dimension_worst <dbl> 0.17300, 0.07678, 0.11510, 0.20750, 0.14310, 0~
tc <- trainControl(method="cv", number=10)
train(diagnosis ~ ., wbc, method="knn",
preProcess=c("center", "scale"), trControl=tc,metric="Accuracy")->knnfit
confusionMatrix(knnfit)
## Cross-Validated (10 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction Benign Malignant
## Benign 62.2 2.5
## Malignant 0.5 34.8
##
## Accuracy (average) : 0.9701
predict(knnfit,test,type='prob')->prefit1
predict(knnfit,test,type='raw')->prefit
NROW(prefit)
## [1] 170
class(prefit)
## [1] "factor"
class(test$diagnosis)
## [1] "factor"
levels(prefit)
## [1] "Benign" "Malignant"
levels(test$diagnosis)
## [1] "Benign" "Malignant"
confusionMatrix(prefit,test$diagnosis)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 106 1
## Malignant 1 62
##
## Accuracy : 0.9882
## 95% CI : (0.9581, 0.9986)
## No Information Rate : 0.6294
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9748
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9907
## Specificity : 0.9841
## Pos Pred Value : 0.9907
## Neg Pred Value : 0.9841
## Prevalence : 0.6294
## Detection Rate : 0.6235
## Detection Prevalence : 0.6294
## Balanced Accuracy : 0.9874
##
## 'Positive' Class : Benign
##
library(pROC)
rffit2_num<-as.numeric(prefit)
rffit2_num
## [1] 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 1 2 1 1 2 2 2 1 1 2 1 2 1 2 2 2 1 1 1 2 1 1
## [38] 1 1 1 1 2 1 2 1 1 1 1 2 2 1 1 2 1 1 1 2 2 2 2 1 2 1 2 1 1 2 2 2 1 2 1 1 1
## [75] 1 2 1 2 2 2 2 1 2 1 2 1 2 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1
## [112] 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 2 1 2 1 1 1 1 1 1
## [149] 1 2 2 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1
result<-roc(test$diagnosis,rffit2_num)
## Setting levels: control = Benign, case = Malignant
## Setting direction: controls < cases
result
##
## Call:
## roc.default(response = test$diagnosis, predictor = rffit2_num)
##
## Data: rffit2_num in 107 controls (test$diagnosis Benign) < 63 cases (test$diagnosis Malignant).
## Area under the curve: 0.9874
result$auc
## Area under the curve: 0.9874