Caret package
trainControl() - 훈련 과정의 parameter 설정 Ex) trainControl(method = "repeatedcv", number = 10, <- 훈련 데이터 fold 개수 repeats = 5) <- cv 반복 횟수
expand.grid() - factor 조합의 데이터 프레임 생성 Ex) expand.grid (k=1:10)
train() - 모형 학습 Ex) train(Class ~ ., data = <훈련 데이터>, method = <모형 방식>, trControl = <trainControl() 결과>, preProcess = c("center", "scale"), <- 표준화 tuneGrid = expand.grid(k = 1:10), <- 파라미터값 목록 metric = "Accuracy") <- 모형 평가 기준
Accuracy 정확도 = (TP + TN) / Total
Kappa 통계량 = (p0 - pe) / (1 - pe) p0: 관측 정확도 / pe: 기대 정확도
# Load the wine data set from the course data directory.
setwd('C:/Users/Administrator/Desktop/R Analysis/Fast Campus/Part 6/Ch02. k-Nearest Neighbor/Data')
rawdata <- read.csv("wine.csv", header = TRUE)
# Convert the class label to a factor.
rawdata[["Class"]] <- as.factor(rawdata[["Class"]])
str(rawdata)
## 'data.frame': 178 obs. of 14 variables:
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Alcalinity : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Total_phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid_phenols: num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ color_intensity : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ X0D280 : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
## $ Class : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
Train, Test 분할
# analdata is a copy of rawdata; it is only used here to count rows.
analdata <- rawdata
set.seed(2020)  # set the seed before sampling
# sample(a, b) draws b values from 1..a; here 70% of the row indices are
# drawn and then sorted.
n_rows <- nrow(analdata)
datatotal <- sort(sample(n_rows, n_rows * 0.7))
train <- rawdata[datatotal, ]
test <- rawdata[-datatotal, ]
# Columns 1-13 are the predictors, column 14 is the Class label.
train_x <- train[, 1:13]
train_y <- train[, 14]
test_x <- test[, 1:13]
test_y <- test[, 14]
# Model
# 10-fold cross-validation, repeated 5 times.
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
# Candidate neighbour counts k = 1..10.
customGrid <- expand.grid(k = 1:10)
# Fit k-NN on centred and scaled predictors; Accuracy is the selection metric.
knnFit <- train(
  Class ~ .,
  data = train,
  method = "knn",
  trControl = ctrl,
  preProcess = c("center", "scale"),
  tuneGrid = customGrid,
  metric = "Accuracy"
)
## k-Nearest Neighbors
##
## 124 samples
## 13 predictor
## 3 classes: '1', '2', '3'
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 111, 112, 112, 111, 110, 112, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.9726041 0.9585595
## 2 0.9658725 0.9482692
## 3 0.9707859 0.9558296
## 4 0.9681901 0.9520425
## 5 0.9776956 0.9662325
## 6 0.9603280 0.9401272
## 7 0.9567100 0.9348356
## 8 0.9517965 0.9274056
## 9 0.9644023 0.9465077
## 10 0.9673876 0.9507825
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
Prediction
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 14 2 0
## 2 0 24 0
## 3 0 1 13
##
## Overall Statistics
##
## Accuracy : 0.9444
## 95% CI : (0.8461, 0.9884)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 1.459e-12
##
## Kappa : 0.913
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 1.0000 0.8889 1.0000
## Specificity 0.9500 1.0000 0.9756
## Pos Pred Value 0.8750 1.0000 0.9286
## Neg Pred Value 1.0000 0.9000 1.0000
## Prevalence 0.2593 0.5000 0.2407
## Detection Rate 0.2593 0.4444 0.2407
## Detection Prevalence 0.2963 0.4444 0.2593
## Balanced Accuracy 0.9750 0.9444 0.9878
Importance Feature
# Load the heart-disease data set from the course data directory.
setwd('C:/Users/Administrator/Desktop/R Analysis/Fast Campus/Part 6/Ch03. Logistic Regression/Data')
heart <- read.csv("heart.csv", header = TRUE)
str(heart)
## 'data.frame': 303 obs. of 14 variables:
## $ age : int 63 37 41 56 57 57 56 44 52 57 ...
## $ sex : int 1 1 0 1 0 1 0 1 1 1 ...
## $ cp : int 3 2 1 1 0 0 1 1 2 2 ...
## $ trestbps: int 145 130 130 120 120 140 140 120 172 150 ...
## $ chol : int 233 250 204 236 354 192 294 263 199 168 ...
## $ fbs : int 1 0 0 0 0 0 0 0 1 0 ...
## $ restecg : int 0 1 0 1 1 1 0 1 1 1 ...
## $ thalach : int 150 187 172 178 163 148 153 173 162 174 ...
## $ exang : int 0 0 0 0 1 0 0 0 0 0 ...
## $ oldpeak : num 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
## $ slope : int 0 0 2 2 2 1 1 2 2 2 ...
## $ ca : int 0 0 0 0 0 0 0 0 0 0 ...
## $ thal : int 1 2 2 2 2 1 2 3 3 2 ...
## $ target : int 1 1 1 1 1 1 1 1 1 1 ...
## [1] 1 0
## Levels: 0 1
#---------------------------------------------------------
# Standardise the continuous variables
#--------------------------------------------------------
# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes. Assigning that directly into a data frame
# creates matrix columns, so as.numeric() flattens each result back to a
# plain numeric vector. The standardised values themselves are unchanged.
continuousVar <- c("age", "trestbps", "chol", "thalach", "oldpeak", "slope")
heart[continuousVar] <- lapply(
  heart[continuousVar],
  function(x) as.numeric(scale(x))
)
#---------------------------------------------------------
# Convert the categorical variables with factor()
#--------------------------------------------------------
newdata <- heart
factorVar <- c("sex", "cp", "restecg", "exang", "ca", "thal")
newdata[factorVar] <- lapply(newdata[factorVar], factor)
# Train/Test split
set.seed(2020)
# Draw 70% of the row indices for training. The variable name `sample` is
# kept for compatibility; it shadows base::sample() only as a variable, and
# function calls still resolve to the function.
sample <- sort(sample(nrow(newdata), nrow(newdata) * 0.7))
train <- newdata[sample, ]
test <- newdata[-sample, ]
# The heart data has 14 columns and `target` is the last one. The previous
# positional split (1:12 / 13) dropped `thal` from the predictors and used
# `thal` as the label, so predictors and label are now selected by name.
predictorVar <- setdiff(names(newdata), "target")
train_x <- train[, predictorVar]
train_y <- train[["target"]]
test_x <- test[, predictorVar]
test_y <- test[["target"]]
# Modelling
# Repeated cross-validation, 5 repeats (fold count left at caret's default).
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
# Fit boosted logistic regression. The control object must be passed as
# `trControl`; the earlier misspelling `trContrl` was silently ignored, so
# train() fell back to its default bootstrap resampling.
# NOTE: the printed results recorded below ("Bootstrapped (25 reps)") were
# produced with the misspelled argument; re-run to refresh them.
logitFit <- train(
  target ~ .,
  data = train,
  method = "LogitBoost",
  trControl = ctrl,
  metric = "Accuracy"
)
logitFit
## Boosted Logistic Regression
##
## 212 samples
## 13 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 212, 212, 212, 212, 212, 212, ...
## Resampling results across tuning parameters:
##
## nIter Accuracy Kappa
## 11 0.8065542 0.5981106
## 21 0.8094922 0.6078944
## 31 0.7919591 0.5711976
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was nIter = 21.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 33 7
## 1 15 36
##
## Accuracy : 0.7582
## 95% CI : (0.6572, 0.8419)
## No Information Rate : 0.5275
## P-Value [Acc > NIR] : 4.977e-06
##
## Kappa : 0.5197
##
## Mcnemar's Test P-Value : 0.1356
##
## Sensitivity : 0.6875
## Specificity : 0.8372
## Pos Pred Value : 0.8250
## Neg Pred Value : 0.7059
## Prevalence : 0.5275
## Detection Rate : 0.3626
## Detection Prevalence : 0.4396
## Balanced Accuracy : 0.7624
##
## 'Positive' Class : 0
##
Importance Variables
# Load the wine data set from the course data directory.
setwd('C:/Users/Administrator/Desktop/R Analysis/Fast Campus/Part 6/Ch02. k-Nearest Neighbor/Data')
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
wine <- read.csv("wine.csv", header = TRUE)
# Convert the class label to a factor.
wine$Class <- as.factor(wine$Class)
str(wine)
## 'data.frame': 178 obs. of 14 variables:
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Alcalinity : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Total_phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid_phenols: num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ color_intensity : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ X0D280 : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
## $ Class : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
TRAIN/TEST 나누기
# samdata is a copy of wine; it is only used here to count rows.
samdata <- wine
set.seed(2020)
# Sorted row indices for a 70% training sample.
n_wine <- nrow(samdata)
samples <- sort(sample(n_wine, n_wine * .7))
train <- wine[samples, ]
test <- wine[-samples, ]
# Columns 1-13 are the predictors, column 14 is the Class label.
train_x <- train[, 1:13]
train_y <- train[, 14]
test_x <- test[, 1:13]
test_y <- test[, 14]
# Modelling
# Repeated cross-validation with 5 repeats.
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
# Naive Bayes on centred and scaled predictors; Accuracy is the selection
# metric.
nbFit <- train(
  Class ~ .,
  data = train,
  method = "naive_bayes",
  trControl = ctrl,
  preProcess = c("center", "scale"),
  metric = "Accuracy"
)
nbFit
## Naive Bayes
##
## 124 samples
## 13 predictor
## 3 classes: '1', '2', '3'
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 111, 112, 112, 111, 110, 112, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.9808142 0.9708775
## TRUE 0.9756810 0.9631746
##
## Tuning parameter 'laplace' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were laplace = 0, usekernel = FALSE
## and adjust = 1.
usekernel = FALSE(커널 미사용)와 usekernel = TRUE(커널 사용) 각각에 대한 정확도를 보여줌
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 14 1 0
## 2 0 24 0
## 3 0 2 13
##
## Overall Statistics
##
## Accuracy : 0.9444
## 95% CI : (0.8461, 0.9884)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 1.459e-12
##
## Kappa : 0.913
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 1.0000 0.8889 1.0000
## Specificity 0.9750 1.0000 0.9512
## Pos Pred Value 0.9333 1.0000 0.8667
## Neg Pred Value 1.0000 0.9000 1.0000
## Prevalence 0.2593 0.5000 0.2407
## Detection Rate 0.2593 0.4444 0.2407
## Detection Prevalence 0.2778 0.4444 0.2778
## Balanced Accuracy 0.9875 0.9444 0.9756
ROC 커브 기준으로 변수 중요도를 선별해서 보여준다. ROC 커브 면적이 넓을 수록 중요도가 상승한다.
기본 트리 설정
#install.packages("tree")
library(tree)
# Load the wine data and convert the class label to a factor.
setwd('C:/Users/Administrator/Desktop/R Analysis/Fast Campus/Part 6/Ch02. k-Nearest Neighbor/Data')
rawdata <- read.csv("wine.csv", header = TRUE)
rawdata[["Class"]] <- as.factor(rawdata[["Class"]])
dedata <- rawdata
# Seeded 70/30 split on sorted row indices.
set.seed(2020)
n_dedata <- nrow(dedata)
sample_data <- sort(sample(n_dedata, n_dedata * .7))
train <- dedata[sample_data, ]
test <- dedata[-sample_data, ]
# Columns 1-13 are the predictors, column 14 is the Class label.
train_x <- train[, 1:13]
train_y <- train[, 14]
test_x <- test[, 1:13]
test_y <- test[, 14]
# Grow the unpruned classification tree on the training rows and draw it
# with its split labels.
treeRaw <- tree(Class ~ ., data = train)
plot(treeRaw)
text(treeRaw)
# Cross-Validation
Pruning 가지치기
# Prune the tree by misclassification; best = 4 requests the subtree of
# size 4 (terminal nodes, per the tree package docs - confirm).
prune_tree <- prune.misclass(treeRaw, best = 4)
plot(prune_tree)
# pretty = 0: per the original note, the names shown at the splits are
# left unchanged.
text(prune_tree, pretty = 0)
# Prediction
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 14 2 4
## 2 0 23 0
## 3 0 2 9
##
## Overall Statistics
##
## Accuracy : 0.8519
## 95% CI : (0.7288, 0.9338)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 6.922e-08
##
## Kappa : 0.7692
##
## Mcnemar's Test P-Value : 0.04601
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 1.0000 0.8519 0.6923
## Specificity 0.8500 1.0000 0.9512
## Pos Pred Value 0.7000 1.0000 0.8182
## Neg Pred Value 1.0000 0.8710 0.9070
## Prevalence 0.2593 0.5000 0.2407
## Detection Rate 0.2593 0.4259 0.1667
## Detection Prevalence 0.3704 0.4259 0.2037
## Balanced Accuracy 0.9250 0.9259 0.8218
# Repeated cross-validation with 5 repeats.
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
# Random forest on centred and scaled predictors; Accuracy is the selection
# metric.
rfFit <- train(
  Class ~ .,
  data = train,
  method = "rf",
  trControl = ctrl,
  preProcess = c("center", "scale"),
  metric = "Accuracy"
)
rfFit
## Random Forest
##
## 124 samples
## 13 predictor
## 3 classes: '1', '2', '3'
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 111, 110, 111, 112, 112, 111, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9904529 0.9855947
## 7 0.9807326 0.9708665
## 13 0.9594539 0.9385166
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Load the wine data set and convert the class label to a factor.
setwd('C:/Users/Administrator/Desktop/R Analysis/Fast Campus/Part 6/Ch02. k-Nearest Neighbor/Data')
wine <- read.csv("wine.csv", header = TRUE)
wine[["Class"]] <- as.factor(wine[["Class"]])
str(wine)
## 'data.frame': 178 obs. of 14 variables:
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Alcalinity : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Total_phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid_phenols: num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ color_intensity : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ X0D280 : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
## $ Class : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
svm_wine <- wine
# Seeded 70/30 split; this section uses seed 2021.
set.seed(2021)
n_svm <- nrow(svm_wine)
wine_sam <- sort(sample(n_svm, n_svm * .7))
train <- svm_wine[wine_sam, ]
test <- svm_wine[-wine_sam, ]
# Repeated cross-validation with 5 repeats.
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
# Linear-kernel SVM on centred and scaled predictors.
# Per the original note, "svmPoly" is the non-linear alternative.
svm_Fit <- train(
  Class ~ .,
  data = train,
  method = "svmLinear",
  trControl = ctrl,
  preProcess = c("center", "scale"),
  metric = "Accuracy"
)
svm_Fit
## Support Vector Machines with Linear Kernel
##
## 124 samples
## 13 predictor
## 3 classes: '1', '2', '3'
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 111, 111, 112, 113, 111, 112, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9474242 0.9207634
##
## Tuning parameter 'C' was held constant at a value of 1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 23 0 0
## 2 0 20 0
## 3 0 1 10
##
## Overall Statistics
##
## Accuracy : 0.9815
## 95% CI : (0.9011, 0.9995)
## No Information Rate : 0.4259
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9709
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 1.0000 0.9524 1.0000
## Specificity 1.0000 1.0000 0.9773
## Pos Pred Value 1.0000 1.0000 0.9091
## Neg Pred Value 1.0000 0.9706 1.0000
## Prevalence 0.4259 0.3889 0.1852
## Detection Rate 0.4259 0.3704 0.1852
## Detection Prevalence 0.4259 0.3704 0.2037
## Balanced Accuracy 1.0000 0.9762 0.9886
Heart Data 에 KNN 과 Logistic 을 사용해보자
범주형 변수는 factor 형으로 변환 시켜, df_1 테이블 생성
# Load the heart-disease data set.
setwd('C:/Users/Administrator/Desktop/R Analysis/Fast Campus/Part 6/Ch03. Logistic Regression/Data')
heart <- read.csv("heart.csv", header = TRUE)
# df_1: same data with the listed categorical columns and the target
# converted to factors.
df_1 <- mutate(
  heart,
  sex = as.factor(sex),
  cp = as.factor(cp),
  fbs = as.factor(fbs),
  exang = as.factor(exang),
  thal = as.factor(thal),
  target = as.factor(target)
)
# Count missing values per column
colSums(is.na(df_1))
## age sex cp trestbps chol fbs restecg thalach
## 0 0 0 0 0 0 0 0
## exang oldpeak slope ca thal target
## 0 0 0 0 0 0
## age sex cp trestbps chol fbs
## Min. :29.00 0: 96 0:143 Min. : 94.0 Min. :126.0 0:258
## 1st Qu.:47.50 1:207 1: 50 1st Qu.:120.0 1st Qu.:211.0 1: 45
## Median :55.00 2: 87 Median :130.0 Median :240.0
## Mean :54.37 3: 23 Mean :131.6 Mean :246.3
## 3rd Qu.:61.00 3rd Qu.:140.0 3rd Qu.:274.5
## Max. :77.00 Max. :200.0 Max. :564.0
## restecg thalach exang oldpeak slope
## Min. :0.0000 Min. : 71.0 0:204 Min. :0.00 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:133.5 1: 99 1st Qu.:0.00 1st Qu.:1.000
## Median :1.0000 Median :153.0 Median :0.80 Median :1.000
## Mean :0.5281 Mean :149.6 Mean :1.04 Mean :1.399
## 3rd Qu.:1.0000 3rd Qu.:166.0 3rd Qu.:1.60 3rd Qu.:2.000
## Max. :2.0000 Max. :202.0 Max. :6.20 Max. :2.000
## ca thal target
## Min. :0.0000 0: 2 0:138
## 1st Qu.:0.0000 1: 18 1:165
## Median :0.0000 2:166
## Mean :0.7294 3:117
## 3rd Qu.:1.0000
## Max. :4.0000
thal 변수에 0 이 2개 있는데 확인해보기
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1 53 0 2 128 216 0 0 115 0 0 2 0 0
## 2 52 1 0 128 204 1 1 156 1 1 1 0 0
## target
## 1 1
## 2 0
thal의 0 은 결측 인것으로 판단됨 thal 이 0 이 아닌것만 추출
###KNN Modelling
KNN 모델에 사용되는 변수들은 숫자형이어야 하기 때문에, 범주형 변수도 숫자형으로 변환해야 한다.
KNN 활용의 특징
데이터 분할
분할된 데이터를 숫자형으로 변환
목표 변수를 제외한 1-13번 변수에 scale 함수를 직접 적용 (train 은 자체 평균/표준편차로, test 는 train 의 center/scale 값으로 표준화)
Target 변수를 labeling 따로 지정(train, test 각각)
Optimal K 지정 가능 round(sqrt(nrow ))
knn 함수 예측 시작 (train/ test/ cl/ k) 지정
Confusion Matrix 정확도 확인
KNN 함수를 따로 적용하면 따로 따로 해야할 일이 많음 Caret 이 train 함수를 적용할 때에는 3가지 범주는 자동적으로 적용 아니면 이렇게 전부 손수 코드 작성해야한다.
#-----------------------------------------
# Train/test split
#------------------------------------------
# Seed the RNG so the 70/30 split is reproducible. Every other split in
# this file sets a seed before sampling; this one did not.
set.seed(2020)
flag <- sort(sample(nrow(df_1), nrow(df_1) * 0.7))
train_h <- df_1[flag, ]
test_h <- df_1[-flag, ]
#-----------------------------------------
# Apply as.numeric to the factor columns of each split
#------------------------------------------
# as.numeric() on a factor returns its integer level codes (1, 2, ...),
# not the original labels; e.g. the recorded predictions further down show
# the target levels as 1 and 2.
factor_cols_to_numeric <- function(df) {
  mutate(
    df,
    sex = as.numeric(sex),
    cp = as.numeric(cp),
    fbs = as.numeric(fbs),
    exang = as.numeric(exang),
    thal = as.numeric(thal),
    target = as.numeric(target)
  )
}
train_heart <- factor_cols_to_numeric(train_h)
test_heart <- factor_cols_to_numeric(test_h)
#-----------------------------------------
# Scale every column except the target (column 14)
#------------------------------------------
train <- scale(x = train_heart[, -14])
# Reuse the training centre and scale for the test rows so both sets are
# standardised with the same parameters.
train_center <- attr(train, "scaled:center")
train_scale <- attr(train, "scaled:scale")
test <- scale(
  x = test_heart[, -14],
  center = train_center,
  scale = train_scale
)
#-----------------------------------------
# Labels (target is column 14 of each split)
#------------------------------------------
tr_label <- train_heart[["target"]]
te_label <- test_heart[["target"]]
#-----------------------------------------
# Rule-of-thumb K = sqrt(number of training rows)
# NOTE(review): the recorded result is 14, while this header originally
# said 15 and the knn() call uses k = 15 - presumably rounded up to an odd
# number to avoid ties; confirm.
#------------------------------------------
round(sqrt(nrow(train_heart)))
## [1] 14
#-----------------------------------------
# Run knn()
#------------------------------------------
# Classify the scaled test rows from the scaled training rows and their
# labels, using 15 neighbours.
KNN_FIT <- knn(train = train, test = test, cl = tr_label, k = 15)
head(KNN_FIT)
## [1] 2 2 2 2 2 2
## Levels: 1 2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 35 5
## 2 10 41
##
## Accuracy : 0.8352
## 95% CI : (0.7427, 0.9047)
## No Information Rate : 0.5055
## P-Value [Acc > NIR] : 5.352e-11
##
## Kappa : 0.6699
##
## Mcnemar's Test P-Value : 0.3017
##
## Sensitivity : 0.8913
## Specificity : 0.7778
## Pos Pred Value : 0.8039
## Neg Pred Value : 0.8750
## Prevalence : 0.5055
## Detection Rate : 0.4505
## Detection Prevalence : 0.5604
## Balanced Accuracy : 0.8345
##
## 'Positive' Class : 2
##
의견) KNN 은 2 개의 범주형 모델에는 적합한 모델로 보여지지 않는다.