library(e1071) # SVM 분석 패키지
library(Epi) # ROC, AUC
# Load the raw auto-parts production records; the first row of the CSV holds
# the column names. TRUE is spelled out because T is an ordinary variable that
# can be reassigned.
autoparts <- read.csv("autoparts.csv", header = TRUE)
dim(autoparts)
## [1] 34139 17
# Keep only the rows of product "90784-76001" and drop the first seven
# columns, then inspect the size and structure of the result.
autoparts1 <- autoparts[
  autoparts$prod_no == "90784-76001",
  -seq_len(7)
]
dim(autoparts1)
## [1] 21779 10
str(autoparts1)
## 'data.frame': 21779 obs. of 10 variables:
## $ fix_time : num 85.5 86.2 86 86.1 86.1 86.3 86.5 86.4 86.3 86 ...
## $ a_speed : num 0.611 0.606 0.609 0.61 0.603 0.606 0.606 0.607 0.604 0.608 ...
## $ b_speed : num 1.72 1.71 1.72 1.72 1.7 ...
## $ separation : num 242 245 243 242 242 ...
## $ s_separation : num 658 657 658 657 657 ...
## $ rate_terms : int 95 95 95 95 95 95 95 95 95 95 ...
## $ mpa : num 78.2 77.9 78 78.2 77.9 77.9 78.2 77.5 77.8 77.5 ...
## $ load_time : num 18.1 18.2 18.1 18.1 18.2 18 18.1 18.1 18 18.1 ...
## $ highpressure_time: int 58 58 82 74 56 78 55 57 50 60 ...
## $ c_thickness : num 24.7 22.5 24.1 25.1 24.5 22.9 24.3 23.9 22.2 19 ...
# Remove outliers: keep only rows whose c_thickness is below 1000.
autoparts2 <- autoparts1[autoparts1$c_thickness < 1000, ]
dim(autoparts2)
## [1] 21767 10
# Label each part: 1 (faulty) when c_thickness is below 20 or above 32,
# otherwise 0.
autoparts2$y_faulty <- with(
  autoparts2,
  as.numeric(c_thickness < 20 | c_thickness > 32)
)
table(autoparts2$y_faulty)
##
## 0 1
## 18925 2842
# Split the data into a training set (70%) and a test set (the remaining rows).
# The RNG seed is fixed so that the split, and every figure computed from it,
# is reproducible between runs.
set.seed(1234)
n_obs <- nrow(autoparts2)
# seq_len() stays correct even for zero rows; floor() makes the integer
# sample size explicit.
t_index <- sample(seq_len(n_obs), size = floor(n_obs * 0.7))
train <- autoparts2[t_index, ]
test <- autoparts2[-t_index, ]
# Use tune.svm() to search the given grid for the best values of the two main
# SVM hyper-parameters, gamma and cost.
# - gamma is the kernel parameter (used by every kernel except the linear
#   one); it is not the slope of the hyperplane.
# - cost is the penalty for constraint violations: the higher the cost, the
#   harder the model is pushed to fit the training data, at the risk of
#   overfitting.
# Outliers that would strongly distort the separating hyperplane are removed
# before this step.
# Tuning uses the training set only, so the held-out test set does not
# influence the chosen parameters and its accuracy stays an honest estimate.
# NOTE: this step is slow, and the selected values can differ between runs
# because the cross-validation folds are drawn at random.
svm_tune <- tune.svm(
  factor(y_faulty) ~ fix_time + a_speed + b_speed + separation +
    s_separation + rate_terms + mpa + load_time + highpressure_time,
  data = train,
  gamma = 2^(-1:1),
  cost = 2^(2:4)
)
print(svm_tune)
## (output recorded from an earlier run; it selected gamma = 0.5, cost = 16)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 0.5 16
##
## - best performance: 0.03730414
# Fit the SVM classifier on the training set with gamma = 1 and cost = 16,
# then print a summary of the fitted model.
m <- svm(
  factor(y_faulty) ~ fix_time + a_speed + b_speed + separation +
    s_separation + rate_terms + mpa + load_time + highpressure_time,
  data = train,
  gamma = 1,
  cost = 16
)
summary(m)
##
## Call:
## svm(formula = factor(y_faulty) ~ fix_time + a_speed + b_speed +
## separation + s_separation + rate_terms + mpa + load_time +
## highpressure_time, data = train, gamma = 1, cost = 16)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 16
## gamma: 1
##
## Number of Support Vectors: 2243
##
## ( 1194 1049 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# Predict the class of every test-set part with the fitted model.
yhat_test <- predict(m, test)
# Confusion matrix of actual vs. predicted labels. It is named conf_mat so
# that base::table() is not masked, and the label column is referenced by its
# full name (y_faulty) rather than through partial matching of `$`.
conf_mat <- table(real = test$y_faulty, predict = yhat_test)
conf_mat
## predict
## real 0 1
## 0 5587 78
## 1 155 711
# Accuracy: share of test parts on the diagonal (correctly classified).
sum(diag(conf_mat)) / sum(conf_mat)
## [1] 0.964324
# ROC curve and AUC. Epi::ROC() documents `test` as a numerical variable, so
# the predicted factor labels "0"/"1" are converted to the numbers 0/1.
# NOTE(review): these are hard class labels, so the curve has a single
# operating point; pass decision values or probabilities for a full curve.
ROC(
  test = as.numeric(as.character(yhat_test)),
  stat = test$y_faulty,
  plot = "ROC",
  AUC = TRUE,
  main = "SVM"
)
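# ------------------------------------------------------------------------
# Miscellaneous: alternative SVM configurations ----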
# Fit a second model with the same predictors and cost but a linear kernel,
# then print its summary. gamma is still passed, although e1071 documents it
# as a parameter needed by all kernels except the linear one.
m1 <- svm(
  factor(y_faulty) ~ fix_time + a_speed + b_speed + separation +
    s_separation + rate_terms + mpa + load_time + highpressure_time,
  data = train,
  gamma = 1,
  cost = 16,
  kernel = "linear"
)
summary(m1)
##
## Call:
## svm(formula = factor(y_faulty) ~ fix_time + a_speed + b_speed +
## separation + s_separation + rate_terms + mpa + load_time +
## highpressure_time, data = train, gamma = 1, cost = 16, kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 16
## gamma: 1
##
## Number of Support Vectors: 3178
##
## ( 1590 1588 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# Predict on the test set with the linear-kernel model and compute accuracy.
yhat_test <- predict(m1, test)
# The label column is referenced by its full name (y_faulty) rather than
# through partial matching of `$`.
table_1 <- table(real = test$y_faulty, predict = yhat_test)
table_1
## predict
## real 0 1
## 0 5582 83
## 1 485 381
# Accuracy: correctly classified parts (the diagonal) over all test parts.
sum(diag(table_1)) / sum(table_1)
## [1] 0.9130302
# Fit a third model that leaves gamma and cost at svm()'s default values,
# then print its summary.
m2 <- svm(
  factor(y_faulty) ~ fix_time + a_speed + b_speed + separation +
    s_separation + rate_terms + mpa + load_time + highpressure_time,
  data = train
)
summary(m2)
##
## Call:
## svm(formula = factor(y_faulty) ~ fix_time + a_speed + b_speed +
## separation + s_separation + rate_terms + mpa + load_time +
## highpressure_time, data = train)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.1111111
##
## Number of Support Vectors: 3402
##
## ( 1712 1690 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# Predict on the test set with the default-parameter model and compute
# accuracy.
yhat_test <- predict(m2, test)
# The label column is referenced by its full name (y_faulty) rather than
# through partial matching of `$`.
table_2 <- table(real = test$y_faulty, predict = yhat_test)
table_2
## predict
## real 0 1
## 0 5596 69
## 1 482 384
# Accuracy: correctly classified parts (the diagonal) over all test parts.
sum(diag(table_2)) / sum(table_2)
## [1] 0.9156331