KNN을 사용한 iris 분류

KNN(K-최근접 이웃)은 아직 분류되지 않은 새로운 데이터에 대해, 이미 분류된 데이터 중 가장 유사한(가까운) 점들의 클래스를 할당하여 분류하는 방법이다.
KNN은 지도 학습 알고리즘이므로, 클래스가 이미 부여된 트레이닝 데이터 세트로부터 시작한다.
알고리즘은 테스트 데이터 세트의 각 데이터 포인트에 대해, 선택된 유사성(거리) 측정 방법에 따라 트레이닝 데이터에서 가장 가까운 K개의 이웃을 찾고, 그 이웃들의 다수결로 클래스를 결정한다.

데이터 파악

# Print per-column summary statistics for the built-in iris data set.
summary(object = iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

패키지 로드

library(ggvis) 
library(tidyverse)

## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0

## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(class)

종 도식화: `%>%` 연산자를 사용하므로 dplyr(또는 magrittr) 패키지가 설치 및 로드되어 있어야 한다.

# Scatter plot of petal length against petal width, with the point fill
# mapped to the species factor.
layer_points(
  ggvis(iris, ~Petal.Length, ~Petal.Width, fill = ~factor(Species))
)

정규화 함수 생성

# Min-max normalisation: linearly rescale a numeric vector onto [0, 1].
#
# Args:
#   x:     Numeric vector to rescale.
#   na.rm: If TRUE, missing values are ignored when computing the minimum
#          and maximum (they remain NA in the result). The default FALSE
#          keeps the previous behaviour, where any NA makes the whole
#          result NA.
#
# Returns:
#   A numeric vector of the same length as `x`. A vector whose values are
#   all equal is mapped to 0 instead of producing NaN from 0 / 0.
normalizer <- function(x, na.rm = FALSE) {
  # Nothing to rescale; also avoids the min()/max() warnings on empty input.
  if (length(x) == 0L) {
    return(numeric(0))
  }

  bounds <- range(x, na.rm = na.rm)
  lower <- bounds[1]
  upper <- bounds[2]

  # A zero-width range would divide by zero; map such input to 0 while
  # keeping any NA entries in place.
  if (isTRUE(lower == upper)) {
    return(x - lower)
  }

  (x - lower) / (upper - lower)
}

# Apply `normalizer` to each of the four measurement columns of iris and
# collect the rescaled columns into a data frame.
normal_iris <- as.data.frame(
  sapply(X = iris[, 1:4], FUN = normalizer)
)

training / test sampling

# Build the modelling frame from the min-max normalised measurements plus the
# species label. Using raw `iris` here would leave `normal_iris` unused and
# let the features enter the kNN distance on their original, unequal scales.
# Column order is kept: four features first, Species fifth.
df <- cbind(normal_iris, Species = iris$Species)

# Draw 70% of the row indices at random for training; every remaining row
# index goes to the test set.
# NOTE(review): no set.seed() call is visible in this file, so the split (and
# every result derived from it) differs between runs.
training_sampling <- sort(
  sample(seq_len(nrow(df)), size = floor(nrow(df) * 0.7))
)
test_sampling <- setdiff(seq_len(nrow(df)), training_sampling)

training_set / test_set

# Materialise the two partitions by row index, keeping all columns.
test_set <- df[test_sampling, , drop = FALSE]
training_set <- df[training_sampling, , drop = FALSE]

labeling

# Separate each partition into its features (columns 1 to 4) and its class
# label (column 5).
training_set_label <- training_set[[5]]
training_set_unlabel <- training_set[, seq_len(4)]
test_set_label <- test_set[[5]]
test_set_unlabel <- test_set[, seq_len(4)]

knn 함수 기본형태는 다음과 같다
knn(train = training_set_unlabel, test = test_set_unlabel, cl = training_set_label, k = 3)

# Predict a class for every test row with class::knn, using the labelled
# training rows and k = 3 neighbours.
knn_m <- knn(
  training_set_unlabel,
  test_set_unlabel,
  training_set_label,
  k = 3
)

혼동행렬 (ConfusionMatrix)

# Cross-tabulate the predicted classes against the true test labels and print
# caret's accuracy statistics.
confusionMatrix(data = knn_m, reference = test_set_label)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         17          0         0
##   versicolor      0         13         1
##   virginica       0          1        13
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9556          
##                  95% CI : (0.8485, 0.9946)
##     No Information Rate : 0.3778          
##     P-Value [Acc > NIR] : 2.61e-16        
##                                           
##                   Kappa : 0.933           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9286           0.9286
## Specificity                 1.0000            0.9677           0.9677
## Pos Pred Value              1.0000            0.9286           0.9286
## Neg Pred Value              1.0000            0.9677           0.9677
## Prevalence                  0.3778            0.3111           0.3111
## Detection Rate              0.3778            0.2889           0.2889
## Detection Prevalence        0.3778            0.3111           0.3111
## Balanced Accuracy           1.0000            0.9482           0.9482