setwd("C:/Users/Administrator/Desktop/R Analysis/Fast Campus")
read.csv("bank.csv") -> raw_df
str(raw_df)
## 'data.frame': 600 obs. of 20 variables:
## $ age : int 28 68 31 31 77 39 35 41 30 43 ...
## $ job : chr "services" "retired" "services" "entrepreneur" ...
## $ marital : chr "single" "divorced" "married" "single" ...
## $ education : chr "high.school" "high.school" "high.school" "basic.9y" ...
## $ default : chr "no" "no" "no" "no" ...
## $ housing : chr "no" "yes" "yes" "no" ...
## $ loan : chr "no" "yes" "no" "no" ...
## $ contact : chr "cellular" "cellular" "cellular" "telephone" ...
## $ month : chr "may" "jul" "may" "jun" ...
## $ day_of_week : chr "wed" "wed" "tue" "mon" ...
## $ duration : int 649 340 398 1248 190 1176 194 229 648 926 ...
## $ campaign : int 2 1 6 3 1 1 1 2 1 2 ...
## $ previous : int 0 1 0 0 0 0 1 0 0 0 ...
## $ poutcome : chr "nonexistent" "success" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num -1.8 -1.7 -1.8 1.4 -1.1 1.4 -3.4 -1.8 -0.1 1.4 ...
## $ cons.price.idx: num 92.9 94.2 92.9 94.5 94.2 ...
## $ cons.conf.idx : num -46.2 -40.3 -46.2 -41.8 -37.5 -42.7 -26.9 -50 -42 -36.1 ...
## $ euribor3m : num 1.281 0.896 1.291 4.96 0.879 ...
## $ nr.employed : num 5099 4992 5099 5228 4964 ...
## $ target : chr "yes" "yes" "yes" "yes" ...
The data do not flag their missing values explicitly.
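The level listing below was presumably produced by inspecting the categorical column with unique(); a sketch, since the exact call is not shown in the source:

unique(factor(raw_df$job))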
## [1] services retired entrepreneur admin. technician
## [6] blue-collar unemployed housemaid management student
## [11] unknown self-employed
## 12 Levels: admin. blue-collar entrepreneur housemaid management ... unknown
We need to decide how to handle UNKNOWN -> usually treated as a missing value.
Reason: is.na() only shows genuinely empty cells, so unknown entries have to be caught with unique() instead.
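The remaining listings were presumably produced the same way (a sketch):

unique(factor(raw_df$month))
unique(factor(raw_df$poutcome))
unique(factor(raw_df$target))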
## [1] may jul jun sep oct mar nov aug apr dec
## Levels: apr aug dec jul jun mar may nov oct sep
## [1] nonexistent success failure
## Levels: failure nonexistent success
## [1] yes no
## Levels: no yes
#-------------------------------------------------
# Convert UNKNOWN to NA, then drop those rows
#-----------------------------------------------
raw_df[raw_df=="unknown"] <- NA
sum(is.na(raw_df))
## [1] 178
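The str() output below reflects two steps whose code is not shown: dropping the incomplete rows and converting the character columns to factors. A minimal sketch (note: the stray "unknown" levels that survive in the output suggest the original workflow built the factors before replacing "unknown" with NA):

df <- na.omit(raw_df)  # 144 incomplete rows dropped: 600 - 144 = 456 remain
df[] <- lapply(df, function(x) if (is.character(x)) factor(x) else x)
str(df)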
## 'data.frame': 456 obs. of 20 variables:
## $ age : int 28 68 31 31 77 35 41 30 32 40 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 8 6 8 3 6 1 10 8 10 2 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 3 1 2 3 2 2 2 3 2 2 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 4 4 4 3 1 4 6 4 6 3 ...
## $ default : Factor w/ 2 levels "no","unknown": 1 1 1 1 1 1 1 1 1 1 ...
## $ housing : Factor w/ 3 levels "no","unknown",..: 1 3 3 1 3 3 1 1 3 1 ...
## $ loan : Factor w/ 3 levels "no","unknown",..: 1 3 1 1 1 1 1 1 1 3 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 1 1 1 2 1 1 1 1 1 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 4 7 5 10 9 6 8 9 4 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 5 5 4 2 3 3 1 1 5 4 ...
## $ duration : int 649 340 398 1248 190 194 229 648 275 1135 ...
## $ campaign : int 2 1 6 3 1 1 2 1 2 2 ...
## $ previous : int 0 1 0 0 0 1 0 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 3 2 2 2 3 2 2 2 2 ...
## $ emp.var.rate : num -1.8 -1.7 -1.8 1.4 -1.1 -3.4 -1.8 -0.1 -1.1 1.4 ...
## $ cons.price.idx: num 92.9 94.2 92.9 94.5 94.2 ...
## $ cons.conf.idx : num -46.2 -40.3 -46.2 -41.8 -37.5 -26.9 -50 -42 -49.5 -42.7 ...
## $ euribor3m : num 1.281 0.896 1.291 4.96 0.879 ...
## $ nr.employed : num 5099 4992 5099 5228 4964 ...
## $ target : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:144] 6 10 13 14 15 24 25 27 32 37 ...
## ..- attr(*, "names")= chr [1:144] "6" "10" "13" "14" ...
par(mfrow=c(3,3), mar=c(5.1, 4.1, 4.1, 2.1))
hist(df$age, main="age histogram", xlab="age", col="orange")
hist(df$duration, main="duration histogram", xlab="duration", col="yellow")
hist(df$campaign, main="campaign histogram", xlab="campaign", col="green")
hist(df$previous, main="previous histogram", xlab="previous", col="blue")
hist(df$emp.var.rate, main="emp.var.rate histogram", xlab="emp.var.rate", col="navy")
hist(df$cons.price.idx, main="cons.price.idx histogram", xlab="cons.price.idx", col="purple")
hist(df$cons.conf.idx, main="cons.conf.idx histogram", xlab="cons.conf.idx", col="salmon")
hist(df$euribor3m, main="euribor3m histogram", xlab="euribor3m", col="gray")
hist(df$nr.employed, main="nr.employed histogram", xlab="nr.employed", col="black")
Because the variables take values on very different scales, the data need to be standardized = scaling.
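A minimal sketch of the scaling step, assuming standard scaling of the numeric columns into the scale_df used in the PCA code below:

library(dplyr)

scale_df <- df %>%
  mutate(across(where(is.numeric), ~ as.numeric(scale(.x))))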
PCA finds new axes that capture the variance and redraws the data along them (the new axes are built from the covariance matrix).
It makes visualization possible.
It lets us judge whether the data's dimensionality can be reduced.
scale_df %>%
  select(where(is.numeric)) -> num_data
prcomp(num_data) -> pca_num
plot(pca_num, type="l", main="Principal Component Analysis")
#-------------------------------------------------
# PC1 explains 39.37% of the total variance; PC1-PC3 cumulatively explain 64.53%
#-------------------------------------------------
summary(pca_num)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.8824 1.0951 1.0320 0.9932 0.9533 0.91503 0.6511
## Proportion of Variance 0.3937 0.1333 0.1183 0.1096 0.1010 0.09303 0.0471
## Cumulative Proportion 0.3937 0.5270 0.6453 0.7549 0.8559 0.94893 0.9960
## PC8 PC9
## Standard deviation 0.15724 0.10458
## Proportion of Variance 0.00275 0.00122
## Cumulative Proportion 0.99878 1.00000
When should we reduce the dimensionality?
The PC number says how many axes are drawn, e.g. PC6 means six axes that together explain 94% of the total variance. If just two axes already explained, say, 98%, dimensionality reduction would clearly be worthwhile.
#-------------------------------------------------
# Reduce to 3 dimensions
# rotation: extracts the actual principal component vectors
#-----------------------------------------------
pca_num$rotation -> pca_matrix
pca_data <- as.matrix(num_data) %*% pca_matrix
# tar is not defined above; presumably the target column as numeric codes, e.g.
# tar <- as.numeric(scale_df$target)
data.frame(cbind(pca_data[,1:3], tar)) -> reduced_data
as.factor(reduced_data$tar) -> reduced_data$tar
str(reduced_data)
## 'data.frame': 456 obs. of 4 variables:
## $ PC1: num 0.774 1.644 0.548 -2.84 1.392 ...
## $ PC2: num 0.838 0.321 1.145 0.746 -0.677 ...
## $ PC3: num -1.289 2.745 -1.591 0.521 2.743 ...
## $ tar: Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
reduced_data %>%
  mutate(PC1 = as.numeric(PC1),
         PC2 = as.numeric(PC2),
         PC3 = as.numeric(PC3)) -> pca_data
str(pca_data)
## 'data.frame': 456 obs. of 4 variables:
## $ PC1: num 0.774 1.644 0.548 -2.84 1.392 ...
## $ PC2: num 0.838 0.321 1.145 0.746 -0.677 ...
## $ PC3: num -1.289 2.745 -1.591 0.521 2.743 ...
## $ tar: Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
#------------------------------------------------------
# PCA 시각화하기
#----------------------------------------------------
ggplot(pca_data, aes(x=PC1, y=PC2)) +
  geom_point(aes(color=tar, shape=tar)) +
  scale_y_continuous(breaks = c(-2, 2, 4))

shapes <- c(16, 17)  # point shapes
shapes <- shapes[as.numeric(pca_data$tar)]
scatterplot3d(pca_data[,1:3],
              pch=shapes,
              angle=45)
The Target variable is (0, 1).
Why use set.seed()?
So that, when drawing a sample, the same values are drawn every time, i.e. the sample is reproducible.
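A quick illustration (sketch): seeding with the same value before sample() reproduces the identical draw.

set.seed(1); sample(5)
set.seed(1); sample(5)  # same permutation as the first call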
#---------------------------------------------------------------------
# Use the NA-removed df (factors converted) and the non-scaled data table
#------------------------------------------------------------------
sapply(df, function(x) if (is.factor(x)) length(levels(x)) else NA)
## age job marital education default
## NA 12 4 8 2
## housing loan contact month day_of_week
## 3 3 2 10 5
## duration campaign previous poutcome emp.var.rate
## NA NA NA 3 NA
## cons.price.idx cons.conf.idx euribor3m nr.employed target
## NA NA NA NA 2
set.seed(2020)
df-> new_data
sort(sample(nrow(new_data),nrow(new_data)*0.7)) -> flag
train <- new_data[flag,]
test <- new_data[-flag,]

ctrl <- trainControl(method = "repeatedcv", repeats = 5)
train(target~.,
data=train,
method = "glm",
trControl=ctrl,
metric = "Accuracy")-> logit_fit
predict(logit_fit, newdata = test) -> pred_test
confusionMatrix(pred_test, test$target)## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 50 17
## yes 10 60
##
## Accuracy : 0.8029
## 95% CI : (0.7264, 0.8659)
## No Information Rate : 0.562
## P-Value [Acc > NIR] : 2.596e-09
##
## Kappa : 0.6048
##
## Mcnemar's Test P-Value : 0.2482
##
## Sensitivity : 0.8333
## Specificity : 0.7792
## Pos Pred Value : 0.7463
## Neg Pred Value : 0.8571
## Prevalence : 0.4380
## Detection Rate : 0.3650
## Detection Prevalence : 0.4891
## Balanced Accuracy : 0.8063
##
## 'Positive' Class : no
##
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
train(target~.,
data=train,
method = "LogitBoost",
trControl=ctrl,
metric = "Accuracy")-> logit_boost_fit
logit_boost_fit## Boosted Logistic Regression
##
## 319 samples
## 19 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 287, 287, 287, 287, 288, 287, ...
## Resampling results across tuning parameters:
##
## nIter Accuracy Kappa
## 11 0.8633205 0.7260775
## 21 0.8595430 0.7189035
## 31 0.8558688 0.7114930
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was nIter = 11.
Accuracy is highest with 11 boosting iterations.
predict(logit_boost_fit, newdata = test) -> logit_boost_pred
confusionMatrix(logit_boost_pred, test$target)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 44 11
## yes 16 66
##
## Accuracy : 0.8029
## 95% CI : (0.7264, 0.8659)
## No Information Rate : 0.562
## P-Value [Acc > NIR] : 2.596e-09
##
## Kappa : 0.596
##
## Mcnemar's Test P-Value : 0.4414
##
## Sensitivity : 0.7333
## Specificity : 0.8571
## Pos Pred Value : 0.8000
## Neg Pred Value : 0.8049
## Prevalence : 0.4380
## Detection Rate : 0.3212
## Detection Prevalence : 0.4015
## Balanced Accuracy : 0.7952
##
## 'Positive' Class : no
##
Logistic + Tree model
ctrl <- trainControl(method="repeatedcv",repeats = 5)
logit_tree_fit <- train(target ~ .,
data = train,
method = "LMT",
trControl = ctrl,
metric="Accuracy")
logit_tree_fit
## Logistic Model Trees
##
## 319 samples
## 19 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 287, 287, 288, 287, 287, 287, ...
## Resampling results across tuning parameters:
##
## iter Accuracy Kappa
## 1 0.8427358 0.6850245
## 21 0.8785068 0.7563402
## 41 0.8671951 0.7339967
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was iter = 21.
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
train(target~.,
data=train,
method = "plr",
trControl=ctrl,
metric = "Accuracy") -> logit_plr_fit##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
##
## Convergence warning in plr: 2
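The confusion matrix below presumably comes from scoring the penalized logistic regression fit on the held-out set, mirroring the earlier models (a sketch; the prediction code is not shown in the source):

predict(logit_plr_fit, newdata = test) -> logit_plr_pred
confusionMatrix(logit_plr_pred, test$target)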
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 50 13
## yes 10 64
##
## Accuracy : 0.8321
## 95% CI : (0.7588, 0.8905)
## No Information Rate : 0.562
## P-Value [Acc > NIR] : 1.718e-11
##
## Kappa : 0.6609
##
## Mcnemar's Test P-Value : 0.6767
##
## Sensitivity : 0.8333
## Specificity : 0.8312
## Pos Pred Value : 0.7937
## Neg Pred Value : 0.8649
## Prevalence : 0.4380
## Detection Rate : 0.3650
## Detection Prevalence : 0.4599
## Balanced Accuracy : 0.8323
##
## 'Positive' Class : no
##
ctrl <- trainControl(method="repeatedcv",repeats = 5)
logit_reg_fit <- train(target ~ .,
data = train,
method = "regLogistic",
trControl = ctrl,
metric="Accuracy")
logit_reg_fit
## Regularized Logistic Regression
##
## 319 samples
## 19 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 287, 287, 288, 287, 287, 288, ...
## Resampling results across tuning parameters:
##
## cost loss epsilon Accuracy Kappa
## 0.5 L1 0.001 0.8765329 0.7524921
## 0.5 L1 0.010 0.8840769 0.7677478
## 0.5 L1 0.100 0.8815958 0.7627764
## 0.5 L2_dual 0.001 0.6363044 0.2695227
## 0.5 L2_dual 0.010 0.6304613 0.2589427
## 0.5 L2_dual 0.100 0.6289510 0.2562714
## 0.5 L2_primal 0.001 0.8796237 0.7586487
## 0.5 L2_primal 0.010 0.8674652 0.7345643
## 0.5 L2_primal 0.100 0.7303543 0.4630738
## 1.0 L1 0.001 0.8763374 0.7522424
## 1.0 L1 0.010 0.8840579 0.7677321
## 1.0 L1 0.100 0.8852914 0.7701064
## 1.0 L2_dual 0.001 0.6209586 0.2373865
## 1.0 L2_dual 0.010 0.6233675 0.2447831
## 1.0 L2_dual 0.100 0.6199389 0.2322231
## 1.0 L2_primal 0.001 0.8789205 0.7572210
## 1.0 L2_primal 0.010 0.8674652 0.7345643
## 1.0 L2_primal 0.100 0.7303543 0.4630738
## 2.0 L1 0.001 0.8795241 0.7587051
## 2.0 L1 0.010 0.8807942 0.7612854
## 2.0 L1 0.100 0.8833370 0.7662524
## 2.0 L2_dual 0.001 0.6205370 0.2351264
## 2.0 L2_dual 0.010 0.6492944 0.2951622
## 2.0 L2_dual 0.100 0.6323564 0.2629556
## 2.0 L2_primal 0.001 0.8776124 0.7545262
## 2.0 L2_primal 0.010 0.8668591 0.7333263
## 2.0 L2_primal 0.100 0.7303543 0.4630738
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were cost = 1, loss = L1 and epsilon = 0.1.
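The confusion matrix below presumably comes from the held-out evaluation of the regularized fit (a sketch; the prediction code is not shown):

predict(logit_reg_fit, newdata = test) -> logit_reg_pred
confusionMatrix(logit_reg_pred, test$target)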
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 50 14
## yes 10 63
##
## Accuracy : 0.8248
## 95% CI : (0.7506, 0.8844)
## No Information Rate : 0.562
## P-Value [Acc > NIR] : 6.474e-11
##
## Kappa : 0.6468
##
## Mcnemar's Test P-Value : 0.5403
##
## Sensitivity : 0.8333
## Specificity : 0.8182
## Pos Pred Value : 0.7812
## Neg Pred Value : 0.8630
## Prevalence : 0.4380
## Detection Rate : 0.3650
## Detection Prevalence : 0.4672
## Balanced Accuracy : 0.8258
##
## 'Positive' Class : no
##
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
train(target~.,
data=train,
method = "naive_bayes",
trControl = ctrl,
metric= "Accuracy") -> nb_fit
nb_fit
## Naive Bayes
##
## 319 samples
## 19 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 287, 286, 287, 288, 286, 287, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.7149578 0.4374612
## TRUE 0.7708388 0.5446199
##
## Tuning parameter 'laplace' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were laplace = 0, usekernel = TRUE
## and adjust = 1.
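The confusion matrix below presumably comes from scoring the naive Bayes fit on the test set (a sketch):

predict(nb_fit, newdata = test) -> nb_pred
confusionMatrix(nb_pred, test$target)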
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 53 33
## yes 7 44
##
## Accuracy : 0.708
## 95% CI : (0.6243, 0.7825)
## No Information Rate : 0.562
## P-Value [Acc > NIR] : 0.000314
##
## Kappa : 0.434
##
## Mcnemar's Test P-Value : 7.723e-05
##
## Sensitivity : 0.8833
## Specificity : 0.5714
## Pos Pred Value : 0.6163
## Neg Pred Value : 0.8627
## Prevalence : 0.4380
## Detection Rate : 0.3869
## Detection Prevalence : 0.6277
## Balanced Accuracy : 0.7274
##
## 'Positive' Class : no
##
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
train(target~.,
data=train,
method = "rf",
trControl = ctrl,
metric= "Accuracy") -> rf_fit
rf_fit
## Random Forest
##
## 319 samples
## 19 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 286, 287, 287, 288, 287, 287, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.7815054 0.5657565
## 26 0.8581739 0.7157607
## 51 0.8499481 0.6994562
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 26.
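The confusion matrix below presumably comes from scoring the random forest on the test set (a sketch):

predict(rf_fit, newdata = test) -> rf_pred
confusionMatrix(rf_pred, test$target)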
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 51 9
## yes 9 68
##
## Accuracy : 0.8686
## 95% CI : (0.8003, 0.9202)
## No Information Rate : 0.562
## P-Value [Acc > NIR] : 1.017e-14
##
## Kappa : 0.7331
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8500
## Specificity : 0.8831
## Pos Pred Value : 0.8500
## Neg Pred Value : 0.8831
## Prevalence : 0.4380
## Detection Rate : 0.3723
## Detection Prevalence : 0.4380
## Balanced Accuracy : 0.8666
##
## 'Positive' Class : no
##
ctrl <- trainControl(method="repeatedcv",repeats = 5)
svm_linear_fit <- train(target ~ .,
data = train,
method = "svmLinear",
trControl = ctrl,
metric="Accuracy")
svm_linear_fit
## Support Vector Machines with Linear Kernel
##
## 319 samples
## 19 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 287, 287, 287, 288, 288, 288, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8802456 0.7600278
##
## Tuning parameter 'C' was held constant at a value of 1
svm_linear_pred <- predict(svm_linear_fit, newdata=test)
confusionMatrix(svm_linear_pred, test$target)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 48 10
## yes 12 67
##
## Accuracy : 0.8394
## 95% CI : (0.767, 0.8965)
## No Information Rate : 0.562
## P-Value [Acc > NIR] : 4.336e-12
##
## Kappa : 0.6726
##
## Mcnemar's Test P-Value : 0.8312
##
## Sensitivity : 0.8000
## Specificity : 0.8701
## Pos Pred Value : 0.8276
## Neg Pred Value : 0.8481
## Prevalence : 0.4380
## Detection Rate : 0.3504
## Detection Prevalence : 0.4234
## Balanced Accuracy : 0.8351
##
## 'Positive' Class : no
##
ctrl <- trainControl(method="repeatedcv",repeats = 5)
svm_poly_fit <- train(target ~ .,
data = train,
method = "svmPoly",
trControl = ctrl,
metric="Accuracy")
svm_poly_fit
## Support Vector Machines with Polynomial Kernel
##
## 319 samples
## 19 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 288, 288, 288, 286, 288, 287, ...
## Resampling results across tuning parameters:
##
## degree scale C Accuracy Kappa
## 1 0.001 0.25 0.8661889 0.7316951
## 1 0.001 0.50 0.8668139 0.7329558
## 1 0.001 1.00 0.8630224 0.7255467
## 1 0.010 0.25 0.8648607 0.7291940
## 1 0.010 0.50 0.8673430 0.7342025
## 1 0.010 1.00 0.8748845 0.7491733
## 1 0.100 0.25 0.8775232 0.7545973
## 1 0.100 0.50 0.8849902 0.7693441
## 1 0.100 1.00 0.8794196 0.7581110
## 2 0.001 0.25 0.8762469 0.7517105
## 2 0.001 0.50 0.8713038 0.7419325
## 2 0.001 1.00 0.8675513 0.7344202
## 2 0.010 0.25 0.8601424 0.7197172
## 2 0.010 0.50 0.8627823 0.7253325
## 2 0.010 1.00 0.8631916 0.7260879
## 2 0.100 0.25 0.8676876 0.7351852
## 2 0.100 0.50 0.8707924 0.7414554
## 2 0.100 1.00 0.8658291 0.7316302
## 3 0.001 0.25 0.8598295 0.7193846
## 3 0.001 0.50 0.8625275 0.7250237
## 3 0.001 1.00 0.8650690 0.7299933
## 3 0.010 0.25 0.8129429 0.6253672
## 3 0.010 0.50 0.8166160 0.6327835
## 3 0.010 1.00 0.8117320 0.6227475
## 3 0.100 0.25 0.8173595 0.6347791
## 3 0.100 0.50 0.8223644 0.6446590
## 3 0.100 1.00 0.8109494 0.6217943
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were degree = 1, scale = 0.1 and C = 0.5.
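No held-out evaluation is shown for the polynomial-kernel fit; it could be scored the same way as the other models (a sketch):

predict(svm_poly_fit, newdata = test) -> svm_poly_pred
confusionMatrix(svm_poly_pred, test$target)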