2.1 Decision Tree
지니척도
엔트로피
2.1.1 Binary Prediction
# Point the session at the project folder, then load the raw auto-parts
# measurements into `autopart`.
# NOTE(review): the absolute path is machine-specific -- confirm it exists
# before running this on another computer.
setwd("C:/Users/Administrator/Desktop/R Analysis")
autopart <- read.csv(file = "autoparts.csv")
#---------------------------------------------------
# Feature selection (binary target)
#----------------------------------------------------
# Keep product "90784-76001", drop rows whose c_thickness or
# highpressure_time is 1000 or more, and label a row 1 when c_thickness is
# below 20 or above 32 (0 otherwise). The product number and the raw
# thickness are removed so only predictors and `target` remain.
df <- autopart %>%
  filter(
    prod_no == "90784-76001",
    c_thickness < 1000,
    highpressure_time < 1000
  ) %>%
  mutate(
    target = as.factor(ifelse(c_thickness < 20 | c_thickness > 32, 1, 0))
  ) %>%
  select(-prod_no, -c_thickness)
#---------------------------------------------------
# Train/test split (70% / 30%)
#----------------------------------------------------
# Fixed seed so the random split can be reproduced.
set.seed(2022)
# `flag` holds the sorted row indices that go into the training set.
flag <- sort(sample(x = nrow(df), size = nrow(df) * 0.7))
train <- df[flag, ]
test <- df[-flag, ]
#---------------------------------------------------
# Fit the unpruned classification tree
#----------------------------------------------------
# Model `target` on every other column of the training set, then draw the
# tree and label its splits.
treeRaw <- tree(target ~ ., data = train)
plot(treeRaw)
text(treeRaw)
# Pruning criterion: 9-10 branches
#---------------------------------------------------
# Tree pruned to 9 branches
#----------------------------------------------------
# `prune_tree_nine` was used here without being created anywhere in this
# file, so it is built from the unpruned tree first.
# NOTE(review): prune.misclass() with best = 9 is assumed to match the
# original pruning step -- confirm against the recorded output below.
prune_tree_nine <- prune.misclass(treeRaw, best = 9)
# Class predictions on the hold-out set, compared with the true labels.
pred_test_nine <- predict(prune_tree_nine, newdata = test, type = "class")
confusionMatrix(pred_test_nine, test$target)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5415 253
## 1 260 600
##
## Accuracy : 0.9214
## 95% CI : (0.9146, 0.9278)
## No Information Rate : 0.8693
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6553
##
## Mcnemar's Test P-Value : 0.7911
##
## Sensitivity : 0.9542
## Specificity : 0.7034
## Pos Pred Value : 0.9554
## Neg Pred Value : 0.6977
## Prevalence : 0.8693
## Detection Rate : 0.8295
## Detection Prevalence : 0.8683
## Balanced Accuracy : 0.8288
##
## 'Positive' Class : 0
##
#---------------------------------------------------
# Tree pruned to 3 branches
#----------------------------------------------------
# `prune_tree_three` was used here without being created anywhere in this
# file, so it is built from the unpruned tree first.
# NOTE(review): prune.misclass() with best = 3 is assumed to match the
# original pruning step -- confirm against the recorded output below.
prune_tree_three <- prune.misclass(treeRaw, best = 3)
# Class predictions on the hold-out set, compared with the true labels.
pred_test_three <- predict(prune_tree_three, newdata = test, type = "class")
confusionMatrix(pred_test_three, test$target)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5585 706
## 1 90 147
##
## Accuracy : 0.8781
## 95% CI : (0.8699, 0.8859)
## No Information Rate : 0.8693
## P-Value [Acc > NIR] : 0.0183
##
## Kappa : 0.2257
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9841
## Specificity : 0.1723
## Pos Pred Value : 0.8878
## Neg Pred Value : 0.6203
## Prevalence : 0.8693
## Detection Rate : 0.8555
## Detection Prevalence : 0.9637
## Balanced Accuracy : 0.5782
##
## 'Positive' Class : 0
##
library(Epi)
#---------------------------------------------------
# ROC curve: tree pruned to 9 branches
#----------------------------------------------------
# NOTE(review): the predictions passed as `test` are class labels, not
# probabilities -- confirm this is the intended input for the ROC curve.
ROC(test = pred_test_nine, stat = test$target, plot = "ROC", AUC = TRUE)
#---------------------------------------------------
# ROC curve: tree pruned to 3 branches
#----------------------------------------------------
ROC(test = pred_test_three, stat = test$target, plot = "ROC", AUC = TRUE)
# Prediction for a new observation
# Build a one-row data frame of predictor values and classify it with the
# 9-branch tree.
new.data <- data.frame(
  fix_time = 87,
  a_speed = 0.609,
  b_speed = 1.715,
  separation = 242.7,
  s_separation = 657.5,
  rate_terms = 95,
  mpa = 78,
  load_time = 18.1,
  highpressure_time = 82
)
predict(prune_tree_nine, newdata = new.data, type = "class")
## [1] 0
## Levels: 0 1
## [1] 0
## Levels: 0 1
2.1.1 Multi-level Prediction
Target 을 1,2,3 으로 나눈다.
# Three-level target: 1 = c_thickness below 20, 2 = within [20, 32],
# 3 = above 32. The upper bound uses <= so that a thickness of exactly 32 is
# treated as in-range, matching the binary target elsewhere in this file
# (which flags a row only when c_thickness > 32).
df1 <- autopart %>%
  filter(
    prod_no == "90784-76001",
    c_thickness < 1000,
    highpressure_time < 1000
  ) %>%
  mutate(
    target = as.factor(
      ifelse(c_thickness < 20, 1, ifelse(c_thickness <= 32, 2, 3))
    )
  ) %>%
  select(-prod_no, -c_thickness)
# 70/30 split of the three-level data; `fla` holds the sorted training rows.
fla <- sort(sample(x = nrow(df1), size = nrow(df1) * 0.7))
train <- df1[fla, ]
test <- df1[-fla, ]
# Grow the classification tree for the three-level target and plot it with
# split labels.
raw_tree <- tree(target ~ ., data = train)
plot(raw_tree)
text(raw_tree)
# NOTE(review): the code that produced the confusion matrix recorded below
# is not present in this file.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 363 185 0
## 2 265 5351 6
## 3 4 141 213
##
## Overall Statistics
##
## Accuracy : 0.9079
## 95% CI : (0.9007, 0.9148)
## No Information Rate : 0.8696
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6181
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 0.57437 0.9426 0.97260
## Specificity 0.96862 0.6816 0.97702
## Pos Pred Value 0.66241 0.9518 0.59497
## Neg Pred Value 0.95502 0.6402 0.99903
## Prevalence 0.09681 0.8696 0.03355
## Detection Rate 0.05561 0.8197 0.03263
## Detection Prevalence 0.08395 0.8612 0.05484
## Balanced Accuracy 0.77149 0.8121 0.97481
2.1.2 연속형 변수에 대한 Decision Tree
c_thickness 의 값을 예측한다.
# Regression data: every column of `autopart` except the product number.
df2 <- select(autopart, -prod_no)
#--------------------------------------
# Train/test split (70% / 30%)
#---------------------------------------
# `fl` holds the sorted row indices assigned to the training set.
fl <- sort(sample(x = nrow(df2), size = nrow(df2) * 0.7))
train <- df2[fl, ]
test <- df2[-fl, ]
#--------------------------------------
# Build the regression tree
#---------------------------------------
# c_thickness is modelled on the nine process measurements listed in the
# formula; the fitted tree is then drawn with split labels.
df2_tree <- tree(
  c_thickness ~ fix_time + a_speed + b_speed + separation + s_separation +
    rate_terms + mpa + load_time + highpressure_time,
  data = train
)
plot(df2_tree)
text(df2_tree)
#--------------------------------------
# Predict on the hold-out set
#---------------------------------------
pred_df2 <- predict(df2_tree, newdata = test)
# Show the first few true thickness values of the hold-out set.
head(test$c_thickness)
## [1] 24.1 23.1 20.5 34.3 36.7 33.8
#--------------------------------------
# Predict two new observations
#---------------------------------------
# Each column holds the values for the two rows to be scored by the
# regression tree.
new.data <- data.frame(
  fix_time = c(87, 85.6),
  a_speed = c(0.609, 0.472),
  b_speed = c(1.715, 1.685),
  separation = c(242.7, 243.4),
  s_separation = c(657.5, 657.9),
  rate_terms = c(95, 95),
  mpa = c(78, 28.8),
  load_time = c(18.1, 18.2),
  highpressure_time = c(82, 60)
)
predict(df2_tree, newdata = new.data)
## [1] 24.11343 24.11343
#--------------------------------------
# 1. Data preparation
#---------------------------------------
# Same binary labelling as the decision-tree section: one product only, rows
# with c_thickness or highpressure_time of 1000 or more removed, target = 1
# when c_thickness is below 20 or above 32.
df_knn <- autopart %>%
  filter(
    prod_no == "90784-76001",
    c_thickness < 1000,
    highpressure_time < 1000
  ) %>%
  mutate(
    target = as.factor(ifelse(c_thickness < 20 | c_thickness > 32, 1, 0))
  ) %>%
  select(-prod_no, -c_thickness)
#--------------------------------------
# 2. Train/test separation
#---------------------------------------
# 70% of the rows (sorted indices in `fl`) train the model; the rest are
# held out for evaluation.
fl <- sort(sample(x = nrow(df_knn), size = nrow(df_knn) * 0.7))
train <- df_knn[fl, ]
test <- df_knn[-fl, ]
#--------------------------------------
# 3. Modelling
#---------------------------------------
# Repeated cross-validation (5 repeats) over the candidate values k = 1..10,
# with predictors centred and scaled before fitting.
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
customGrid <- expand.grid(k = 1:10)
# `ctrl` was previously created but never handed to train(), so the
# resampling scheme it describes was not applied (the recorded output below
# reports "Bootstrapped (25 reps)"). Passing trControl makes train() use it.
knn_fit <- train(
  target ~ .,
  data = train,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = customGrid,
  metric = "Accuracy"
)
knn_fit
## k-Nearest Neighbors
##
## 15229 samples
## 9 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered (9), scaled (9)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 15229, 15229, 15229, 15229, 15229, 15229, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8865206 0.4733409
## 2 0.8819025 0.4517897
## 3 0.8850261 0.4539919
## 4 0.8872547 0.4541364
## 5 0.8912186 0.4596264
## 6 0.8930416 0.4608583
## 7 0.8949264 0.4620850
## 8 0.8952960 0.4589429
## 9 0.8965841 0.4591509
## 10 0.8963488 0.4546513
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
#--------------------------------------
# Optimal k picked by the tuning search
#---------------------------------------
knn_fit[["bestTune"]]
## k
## 9 9
#--------------------------------------
# 4. Predict on the hold-out set and evaluate
#---------------------------------------
pred_knn <- predict(knn_fit, newdata = test)
# Cross-tabulate predicted against true labels and print summary statistics.
confusionMatrix(data = pred_knn, reference = test$target)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5574 471
## 1 108 375
##
## Accuracy : 0.9113
## 95% CI : (0.9041, 0.9181)
## No Information Rate : 0.8704
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.519
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9810
## Specificity : 0.4433
## Pos Pred Value : 0.9221
## Neg Pred Value : 0.7764
## Prevalence : 0.8704
## Detection Rate : 0.8539
## Detection Prevalence : 0.9260
## Balanced Accuracy : 0.7121
##
## 'Positive' Class : 0
##
i{r}