K-근접 이웃 분류 모형(K-Nearest Neighbors)




반응변수(2개 범주) KNN 모형


패키지 불러오기
library(FNN)     # KNN package; knn.reg() is used in the continuous-response section
library(ROSE)    # NOTE(review): no ROSE function is called in the visible code - confirm it is still needed
library(e1071)   # SVM package; also provides tune.knn(), used here to choose k
library(Epi)     # ROC, AUC
library(class)   # attached after FNN, so an unqualified knn() resolves to class::knn

자료 준비
# Load the raw process data; the first line of the CSV holds the column names.
autoparts <- read.csv("autoparts.csv", header = TRUE)
dim(autoparts)
## [1] 34139    17

# Keep a single product and drop the first seven columns, which leaves the
# ten columns listed by str() below.
is_target_product <- autoparts$prod_no == "90784-76001"
autoparts1 <- autoparts[is_target_product, -(1:7)]
dim(autoparts1)
## [1] 21779    10
str(autoparts1)
## 'data.frame':    21779 obs. of  10 variables:
##  $ fix_time         : num  85.5 86.2 86 86.1 86.1 86.3 86.5 86.4 86.3 86 ...
##  $ a_speed          : num  0.611 0.606 0.609 0.61 0.603 0.606 0.606 0.607 0.604 0.608 ...
##  $ b_speed          : num  1.72 1.71 1.72 1.72 1.7 ...
##  $ separation       : num  242 245 243 242 242 ...
##  $ s_separation     : num  658 657 658 657 657 ...
##  $ rate_terms       : int  95 95 95 95 95 95 95 95 95 95 ...
##  $ mpa              : num  78.2 77.9 78 78.2 77.9 77.9 78.2 77.5 77.8 77.5 ...
##  $ load_time        : num  18.1 18.2 18.1 18.1 18.2 18 18.1 18.1 18 18.1 ...
##  $ highpressure_time: int  58 58 82 74 56 78 55 57 50 60 ...
##  $ c_thickness      : num  24.7 22.5 24.1 25.1 24.5 22.9 24.3 23.9 22.2 19 ...
# Remove outliers: keep only rows whose c_thickness is below 1000.
autoparts2 <- autoparts1[autoparts1$c_thickness < 1000, ]
dim(autoparts2)
## [1] 21767    10

# Binary response: 1 (faulty) when c_thickness is below 20 or above 32, else 0.
too_thin <- autoparts2$c_thickness < 20
too_thick <- autoparts2$c_thickness > 32
autoparts2$y_faulty <- ifelse(too_thin | too_thick, 1, 0)
table(autoparts2$y_faulty)
## 
##     0     1 
## 18925  2842
# Split the rows into a training set (70%) and a test set (the remainder).
# The seed is fixed so that the random partition is the same on every run.
set.seed(1234)
n_obs <- nrow(autoparts2)
# seq_len() stays correct for a zero-row frame, where 1:nrow() would yield c(1, 0).
t_index <- sample(seq_len(n_obs), size = floor(n_obs * 0.7))
train <- autoparts2[t_index, ]
test <- autoparts2[-t_index, ]

Argument 준비
# One formula is shared by the training and test design matrices so that both
# are built from the same response and the same predictor columns.
knn_formula <- y_faulty ~ fix_time + a_speed + b_speed + separation +
  s_separation + rate_terms + mpa + load_time + highpressure_time

# Training design matrix and response vector.
# [, -1] drops the intercept column that model.matrix() puts first.
xmat.train <- model.matrix(knn_formula, data = train)[, -1]
y_faulty.train <- train$y_faulty

# Test design matrix.
xmat.test <- model.matrix(knn_formula, data = test)[, -1]

교차검증 수행을 통한 최적의 k값 찾기
# tune.knn() comes from e1071; it scores every candidate k by cross-validation
# on the training data and reports the best one.
# NOTE(review): the folds are drawn at random, so the selected k may differ
# between runs unless a seed is set beforehand - confirm.
candidate_k <- 1:10
train_labels <- as.factor(y_faulty.train)
tune.out <- tune.knn(x = xmat.train, y = train_labels, k = candidate_k)
tune.out    # the recorded run below selected k = 5
## 
## Parameter tuning of 'knn.wrapper':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  k
##  5
## 
## - best performance: 0.0459446
plot(tune.out)  # the error is lowest at k = 5 in the recorded run

***

예측값 생성 및 정확도
# Classify the test rows with k = 5. knn() is called through the class
# namespace because FNN, which is also attached, exports a knn() of its own;
# qualifying the call removes the dependence on the order of library() calls.
yhat_test <- class::knn(xmat.train, xmat.test, y_faulty.train, k = 5)

# Confusion matrix: rows are the observed labels, columns the predictions.
# It is stored as conf_mat so that the base function table() is not shadowed.
conf_mat <- table(real = test$y_faulty, predict = yhat_test)
conf_mat
##     predict
## real    0    1
##    0 5560   89
##    1  189  693
# Accuracy: share of test rows on the diagonal of the confusion matrix.
sum(diag(conf_mat)) / sum(conf_mat)
## [1] 0.9574338

ROC
# Draw the ROC curve and report the AUC for the KNN predictions (ROC() is from Epi).
# NOTE(review): yhat_test holds the class labels returned by knn(), so the curve
# is built from a two-valued predictor; passing neighbour vote proportions
# (knn(..., prob = TRUE)) would presumably give a finer curve - confirm.
ROC(test = yhat_test, stat = test$y_faulty, plot = "ROC", AUC = TRUE,
    main = "KNN")

***

반응변수(다범주) KNN 모형

데이터 준비
# Data preparation: split the response into three classes by c_thickness.
#   1 = below 20, 2 = from 20 to 32 inclusive, 3 = above 32.
# The upper bound is inclusive so that classes 1 and 3 are exactly the rows the
# binary rule for y_faulty marks as faulty (c_thickness < 20 or > 32).
thickness <- autoparts2$c_thickness
autoparts2$g_class <- as.factor(
  ifelse(thickness < 20, 1, ifelse(thickness <= 32, 2, 3))
)

이하 생략


반응변수(연속형) KNN 모형

  • 반응변수가 연속형인 경우에는 FNN 패키지의 knn.reg()를 사용하여 KNN 회귀를 수행한다.

자료 준비
# Inspect the data, which now also carries the y_faulty and g_class columns.
str(autoparts2)
## 'data.frame':    21767 obs. of  12 variables:
##  $ fix_time         : num  85.5 86.2 86 86.1 86.1 86.3 86.5 86.4 86.3 86 ...
##  $ a_speed          : num  0.611 0.606 0.609 0.61 0.603 0.606 0.606 0.607 0.604 0.608 ...
##  $ b_speed          : num  1.72 1.71 1.72 1.72 1.7 ...
##  $ separation       : num  242 245 243 242 242 ...
##  $ s_separation     : num  658 657 658 657 657 ...
##  $ rate_terms       : int  95 95 95 95 95 95 95 95 95 95 ...
##  $ mpa              : num  78.2 77.9 78 78.2 77.9 77.9 78.2 77.5 77.8 77.5 ...
##  $ load_time        : num  18.1 18.2 18.1 18.1 18.2 18 18.1 18.1 18 18.1 ...
##  $ highpressure_time: int  58 58 82 74 56 78 55 57 50 60 ...
##  $ c_thickness      : num  24.7 22.5 24.1 25.1 24.5 22.9 24.3 23.9 22.2 19 ...
##  $ y_faulty         : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ g_class          : Factor w/ 3 levels "1","2","3": 2 2 2 2 2 2 2 2 2 1 ...
# Split the rows into a training set (70%) and a test set (the remainder).
# The seed is fixed so that the random partition is the same on every run.
set.seed(1234)
n_obs <- nrow(autoparts2)
# seq_len() stays correct for a zero-row frame, where 1:nrow() would yield c(1, 0).
t_index <- sample(seq_len(n_obs), size = floor(n_obs * 0.7))
train <- autoparts2[t_index, ]
test <- autoparts2[-t_index, ]

# Regression set-up: c_thickness is the response and the nine process
# measurements are the predictors. The formula is written once and reused.
reg_formula <- c_thickness ~ fix_time + a_speed + b_speed + separation +
  s_separation + rate_terms + mpa + load_time + highpressure_time

# Training design matrix and response vector.
# [, -1] removes the intercept column that model.matrix() puts first.
xmat.train <- model.matrix(reg_formula, data = train)[, -1]
c_thickness.train <- train$c_thickness

# Test design matrix with the same predictor columns.
xmat.test <- model.matrix(reg_formula, data = test)[, -1]

예측값 생성 및 mse
# k is found in the same way as in the classification section.
# NOTE(review): k = 5 is the value selected for the binary classifier; confirm
# it was re-tuned for the regression task.
yhat_test <- FNN::knn.reg(xmat.train, xmat.test, c_thickness.train, k = 5)

# Mean squared error: average squared gap between predicted and observed
# c_thickness on the test rows.
pred_error <- yhat_test$pred - test$c_thickness
mse <- mean(pred_error^2)
mse
## [1] 1.479637