# 반응변수(2개 범주) KNN 모형
library(FNN) # KNN 패키지
library(ROSE)
library(e1071) # SVM 분석 패키지
library(Epi) # ROC, AUC
library(class)
# Load the raw measurements; the first row of the CSV holds the column names.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
autoparts <- read.csv("autoparts.csv", header = TRUE)
dim(autoparts)
## [1] 34139 17
# Keep only the rows for product "90784-76001" and drop the first seven columns.
autoparts1 <- autoparts[autoparts$prod_no == "90784-76001", -c(1:7)]
dim(autoparts1)
## [1] 21779 10
str(autoparts1)
## 'data.frame': 21779 obs. of 10 variables:
## $ fix_time : num 85.5 86.2 86 86.1 86.1 86.3 86.5 86.4 86.3 86 ...
## $ a_speed : num 0.611 0.606 0.609 0.61 0.603 0.606 0.606 0.607 0.604 0.608 ...
## $ b_speed : num 1.72 1.71 1.72 1.72 1.7 ...
## $ separation : num 242 245 243 242 242 ...
## $ s_separation : num 658 657 658 657 657 ...
## $ rate_terms : int 95 95 95 95 95 95 95 95 95 95 ...
## $ mpa : num 78.2 77.9 78 78.2 77.9 77.9 78.2 77.5 77.8 77.5 ...
## $ load_time : num 18.1 18.2 18.1 18.1 18.2 18 18.1 18.1 18 18.1 ...
## $ highpressure_time: int 58 58 82 74 56 78 55 57 50 60 ...
## $ c_thickness : num 24.7 22.5 24.1 25.1 24.5 22.9 24.3 23.9 22.2 19 ...
# Remove outliers: keep only rows whose c_thickness is below 1000.
autoparts2 <- autoparts1[autoparts1$c_thickness < 1000, ]
dim(autoparts2)
## [1] 21767 10
# Binary response: 1 when c_thickness is below 20 or above 32, otherwise 0.
autoparts2$y_faulty <- ifelse(
  (autoparts2$c_thickness < 20) | (autoparts2$c_thickness > 32), 1, 0
)
table(autoparts2$y_faulty)
##
## 0 1
## 18925 2842
# Split the data into a training set (70% of rows) and a test set (the rest).
# The seed makes the random split and the tuning below repeatable. The outputs
# quoted in the "##" lines were recorded from the original unseeded run, so the
# exact numbers will differ.
set.seed(1234)
n_obs <- nrow(autoparts2)
# seq_len() is safe for zero rows; floor() makes the truncation of the
# fractional sample size explicit.
t_index <- sample(seq_len(n_obs), size = floor(n_obs * 0.7))
train <- autoparts2[t_index, ]
test <- autoparts2[-t_index, ]
# One formula shared by the training and test design matrices, so both are
# built from the same response and the same predictors.
knn_formula <- y_faulty ~ fix_time + a_speed + b_speed + separation +
  s_separation + rate_terms + mpa + load_time + highpressure_time
# Training predictor matrix ([, -1] drops the intercept column) and response.
xmat.train <- model.matrix(knn_formula, data = train)[, -1]
y_faulty.train <- train$y_faulty
# Test predictor matrix, built the same way.
xmat.test <- model.matrix(knn_formula, data = test)[, -1]
# NOTE(review): the predictors enter KNN on their raw scales (e.g. s_separation
# is in the hundreds while a_speed is below 1), so large-valued columns likely
# dominate the distance; consider standardising with training-set statistics.
# Tune k over 1..10 with e1071's tune.knn.
tune.out <- e1071::tune.knn(
  x = xmat.train, y = as.factor(y_faulty.train), k = 1:10
)
tune.out # the best value was k = 5 in the recorded run
##
## Parameter tuning of 'knn.wrapper':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## k
## 5
##
## - best performance: 0.0459446
plot(tune.out) # in the recorded run the error was lowest at k = 5
***
# Classify the test rows with k = 5, the value reported by the tuning step.
# class::knn is named explicitly because both FNN and class export a function
# called knn, and which one a bare knn() reaches depends on attach order.
yhat_test <- class::knn(xmat.train, xmat.test, y_faulty.train, k = 5)
# Confusion matrix: rows are the observed labels, columns the predictions.
# It is stored as conf_mat so the base function table() is not shadowed.
conf_mat <- table(real = test$y_faulty, predict = yhat_test)
conf_mat
## predict
## real 0 1
## 0 5560 89
## 1 189 693
# Accuracy: correctly classified rows (the diagonal) over all test rows.
sum(diag(conf_mat)) / sum(conf_mat)
## [1] 0.9574338
# ROC curve and AUC for the predicted labels against the observed labels.
# NOTE(review): yhat_test holds hard class labels rather than scores, so the
# curve presumably has a single operating point; confirm this is intended.
Epi::ROC(
  test = yhat_test, stat = test$y_faulty, plot = "ROC", AUC = TRUE,
  main = "KNN"
)
***
# 반응변수(다범주) KNN 모형
# Data preparation: split the response into three classes by c_thickness.
#   1 = below 20, 2 = from 20 to 32 inclusive, 3 = above 32.
# The upper bound of class 2 is inclusive so that a thickness of exactly 32 is
# treated the same way as in y_faulty, which flags only values above 32.
autoparts2$g_class <- as.factor(
  ifelse(
    autoparts2$c_thickness < 20, 1,
    ifelse(autoparts2$c_thickness <= 32, 2, 3)
  )
)
# 이하 생략
# 반응변수(연속형) KNN 모형
# Data: inspect the prepared data frame.
str(autoparts2)
## 'data.frame': 21767 obs. of 12 variables:
## $ fix_time : num 85.5 86.2 86 86.1 86.1 86.3 86.5 86.4 86.3 86 ...
## $ a_speed : num 0.611 0.606 0.609 0.61 0.603 0.606 0.606 0.607 0.604 0.608 ...
## $ b_speed : num 1.72 1.71 1.72 1.72 1.7 ...
## $ separation : num 242 245 243 242 242 ...
## $ s_separation : num 658 657 658 657 657 ...
## $ rate_terms : int 95 95 95 95 95 95 95 95 95 95 ...
## $ mpa : num 78.2 77.9 78 78.2 77.9 77.9 78.2 77.5 77.8 77.5 ...
## $ load_time : num 18.1 18.2 18.1 18.1 18.2 18 18.1 18.1 18 18.1 ...
## $ highpressure_time: int 58 58 82 74 56 78 55 57 50 60 ...
## $ c_thickness : num 24.7 22.5 24.1 25.1 24.5 22.9 24.3 23.9 22.2 19 ...
## $ y_faulty : num 0 0 0 0 0 0 0 0 0 1 ...
## $ g_class : Factor w/ 3 levels "1","2","3": 2 2 2 2 2 2 2 2 2 1 ...
# Split the data into a training set (70% of rows) and a test set (the rest).
# The seed makes the split repeatable. The MSE quoted at the end was recorded
# from the original unseeded run, so the exact number will differ.
set.seed(1234)
n_obs <- nrow(autoparts2)
# seq_len() is safe for zero rows; floor() makes the truncation of the
# fractional sample size explicit.
t_index <- sample(seq_len(n_obs), size = floor(n_obs * 0.7))
train <- autoparts2[t_index, ]
test <- autoparts2[-t_index, ]
# One formula shared by the training and test design matrices.
reg_formula <- c_thickness ~ fix_time + a_speed + b_speed + separation +
  s_separation + rate_terms + mpa + load_time + highpressure_time
# Training predictor matrix and response.
# [, -1] removes the intercept column that model.matrix adds.
xmat.train <- model.matrix(reg_formula, data = train)[, -1]
c_thickness.train <- train$c_thickness
# Test predictor matrix.
xmat.test <- model.matrix(reg_formula, data = test)[, -1]
# k is chosen the same way as in the classification section; 5 is used here.
yhat_test <- FNN::knn.reg(xmat.train, xmat.test, c_thickness.train, k = 5)
# Mean squared error of the test-set predictions.
mse <- mean((yhat_test$pred - test$c_thickness)^2)
mse
## [1] 1.479637