Install (if needed) and load the required packages

#install.packages("readr")
#install.packages("dplyr")
#install.packages("caret")
#install.packages("caTools")
#install.packages("tidyverse")
#install.packages("XML")
#install.packages("rvest")
#install.packages("class")
#install.packages("gmodels")

library("readr")
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("caret")
## Loading required package: lattice
## Loading required package: ggplot2
library("caTools")
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.5     ✓ stringr 1.4.0
## ✓ tidyr   1.1.2     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::lift()   masks caret::lift()
library("XML")
library("rvest")
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
## 
##     xml
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library("class")
library("gmodels")

Import the iris flower data set from the file `iris flower dataset.txt`

# Read the comma-separated iris measurements from the project data directory.
iris.flower.dataset <- read.csv(
  file = "/cloud/project/Data/iris flower dataset.txt"
)
# Keep a separate working data frame. Note: the name `iris` masks R's
# built-in `iris` dataset for the rest of the session.
iris <- data.frame(iris.flower.dataset)

Inspect the species variable

# Compact structure summary of the species column as imported.
str(iris.flower.dataset[["species"]])
##  chr [1:150] "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" ...
# Print every species label of the working data frame.
print(iris[["species"]])
##   [1] "setosa"     "setosa"     "setosa"     "setosa"     "setosa"    
##   [6] "setosa"     "setosa"     "setosa"     "setosa"     "setosa"    
##  [11] "setosa"     "setosa"     "setosa"     "setosa"     "setosa"    
##  [16] "setosa"     "setosa"     "setosa"     "setosa"     "setosa"    
##  [21] "setosa"     "setosa"     "setosa"     "setosa"     "setosa"    
##  [26] "setosa"     "setosa"     "setosa"     "setosa"     "setosa"    
##  [31] "setosa"     "setosa"     "setosa"     "setosa"     "setosa"    
##  [36] "setosa"     "setosa"     "setosa"     "setosa"     "setosa"    
##  [41] "setosa"     "setosa"     "setosa"     "setosa"     "setosa"    
##  [46] "setosa"     "setosa"     "setosa"     "setosa"     "setosa"    
##  [51] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
##  [56] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
##  [61] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
##  [66] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
##  [71] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
##  [76] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
##  [81] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
##  [86] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
##  [91] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
##  [96] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
## [101] "virginica"  "virginica"  "virginica"  "virginica"  "virginica" 
## [106] "virginica"  "virginica"  "virginica"  "virginica"  "virginica" 
## [111] "virginica"  "virginica"  "virginica"  "virginica"  "virginica" 
## [116] "virginica"  "virginica"  "virginica"  "virginica"  "virginica" 
## [121] "virginica"  "virginica"  "virginica"  "virginica"  "virginica" 
## [126] "virginica"  "virginica"  "virginica"  "virginica"  "virginica" 
## [131] "virginica"  "virginica"  "virginica"  "virginica"  "virginica" 
## [136] "virginica"  "virginica"  "virginica"  "virginica"  "virginica" 
## [141] "virginica"  "virginica"  "virginica"  "virginica"  "virginica" 
## [146] "virginica"  "virginica"  "virginica"  "virginica"  "virginica"

Convert iris$species to a factor and show the percentage of each class

# Encode the character species labels as a categorical factor.
iris$species <- factor(iris$species)
# Percentage share of each species, rounded to one decimal place.
round(100 * prop.table(table(iris$species)), digits = 1)
## 
##     setosa versicolor  virginica 
##       33.3       33.3       33.3

check iris$species

str(iris[["species"]]) # view the structure after the factor conversion
##  Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

Create the split indicator, in this case `iris_split` (this begins the random partitioning of the data frame)

# Fix the RNG seed so the random train/test partition, and every result that
# depends on it (predictions, cross table), is reproducible from run to run.
set.seed(123)
# sample.split() returns a logical vector with TRUE marking the rows assigned
# to the training set (about 66% of the rows here).
iris_split <- sample.split(iris$species, SplitRatio = 0.66)

Randomly create the training set and the test set (this completes the partitioning)

# Partition the rows with the logical split vector: TRUE rows form the
# training set, FALSE rows form the test set.
iris_train <- iris[iris_split, ]
iris_test <- iris[!iris_split, ]
#view(iris_train)
# Keep the class labels (fifth column, species) apart from the features.
iris_training_label <- as.factor(iris_train[[5]])
iris_test_label <- as.factor(iris_test[[5]])

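Create a function that rescales values to the range [0, 1] (min-max normalization; the results are scaled values, not probabilities)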

# Min-max rescaling of a numeric vector.
#
# Arguments:
#   x      Numeric vector to rescale.
#   lower  Value mapped to 0. Defaults to min(x).
#   upper  Value mapped to 1. Defaults to max(x).
#          Supply both bounds to scale new data with limits taken from
#          another sample (for example, the training set).
#
# Returns a vector the same length as x holding (x - lower) / (upper - lower).
# When lower equals upper (for example, a constant column) it returns zeros
# instead of the NaN values that 0 / 0 would produce.
normalize <- function(x, lower = min(x), upper = max(x)) {
  span <- upper - lower
  # isTRUE() keeps an NA span (NA present in x) on the regular path, so NA
  # input still propagates to an all-NA result as before.
  if (isTRUE(span == 0)) {
    return(x - lower)
  }
  (x - lower) / span
}

Apply the min-max normalization to the four numeric feature columns

# Rescale the four numeric feature columns of the training set to [0, 1].
iris_train_n <- as.data.frame(lapply(iris_train[1:4], normalize))
#iris_n$petal_width
# Rescale the test set with the TRAINING set's per-column minimum and maximum.
# Scaling the test set by its own range would map the same raw measurement to
# different coordinates in the two sets and distort the distances kNN uses.
# Test values outside the training range may therefore fall outside [0, 1].
iris_test_n <- as.data.frame(Map(
  function(test_col, train_col) {
    train_min <- min(train_col)
    (test_col - train_min) / (max(train_col) - train_min)
  },
  iris_test[1:4],
  iris_train[1:4]
))

check & view

# Show the structure (row count and column types) of the training partition.
str(iris_train)
## 'data.frame':    99 obs. of  5 variables:
##  $ sepal_length: num  5.1 4.9 4.7 4.6 5 4.9 5.4 4.8 4.8 4.3 ...
##  $ sepal_width : num  3.5 3 3.2 3.1 3.4 3.1 3.7 3.4 3 3 ...
##  $ petal_length: num  1.4 1.4 1.3 1.5 1.5 1.5 1.5 1.6 1.4 1.1 ...
##  $ petal_width : num  0.2 0.2 0.2 0.2 0.2 0.1 0.2 0.2 0.1 0.1 ...
##  $ species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
#view(iris_train)
# Show the structure (row count and column types) of the test partition.
str(iris_test)
## 'data.frame':    51 obs. of  5 variables:
##  $ sepal_length: num  5 5.4 4.6 4.4 5.7 5.1 5.4 5.1 4.8 5 ...
##  $ sepal_width : num  3.6 3.9 3.4 2.9 4.4 3.5 3.4 3.3 3.4 3.4 ...
##  $ petal_length: num  1.4 1.7 1.4 1.4 1.5 1.4 1.7 1.7 1.9 1.6 ...
##  $ petal_width : num  0.2 0.4 0.3 0.2 0.4 0.3 0.2 0.5 0.2 0.4 ...
##  $ species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

Run the k-nearest-neighbours classifier on the prepared training sample to predict the test set

# Classify each test row by majority vote among its 13 nearest training rows,
# with distances computed on the normalized feature columns.
iris_train_process <- knn(
  train = iris_train_n,
  test = iris_test_n,
  cl = iris_training_label,
  k = 13
)

View iris_train_process (the predicted species for the test set)

# Print the predicted species returned by knn() for the test rows.
iris_train_process
##  [1] setosa     setosa     setosa     setosa     setosa     setosa    
##  [7] setosa     setosa     setosa     setosa     setosa     setosa    
## [13] setosa     setosa     setosa     setosa     setosa     versicolor
## [19] versicolor versicolor versicolor versicolor versicolor versicolor
## [25] versicolor virginica  virginica  versicolor versicolor versicolor
## [31] versicolor versicolor versicolor versicolor virginica  virginica 
## [37] virginica  virginica  virginica  virginica  virginica  virginica 
## [43] virginica  virginica  virginica  virginica  virginica  virginica 
## [49] virginica  virginica  virginica 
## Levels: setosa versicolor virginica

Build a cross table to check the predictions against the true test labels

# Cross-tabulate the true test labels (rows) against the kNN predictions
# (columns); per-cell chi-square contributions are left out of the output.
CrossTable(
  x = iris_test_label,
  y = iris_train_process,
  prop.chisq = FALSE
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  51 
## 
##  
##                 | iris_train_process 
## iris_test_label |     setosa | versicolor |  virginica |  Row Total | 
## ----------------|------------|------------|------------|------------|
##          setosa |         17 |          0 |          0 |         17 | 
##                 |      1.000 |      0.000 |      0.000 |      0.333 | 
##                 |      1.000 |      0.000 |      0.000 |            | 
##                 |      0.333 |      0.000 |      0.000 |            | 
## ----------------|------------|------------|------------|------------|
##      versicolor |          0 |         15 |          2 |         17 | 
##                 |      0.000 |      0.882 |      0.118 |      0.333 | 
##                 |      0.000 |      1.000 |      0.105 |            | 
##                 |      0.000 |      0.294 |      0.039 |            | 
## ----------------|------------|------------|------------|------------|
##       virginica |          0 |          0 |         17 |         17 | 
##                 |      0.000 |      0.000 |      1.000 |      0.333 | 
##                 |      0.000 |      0.000 |      0.895 |            | 
##                 |      0.000 |      0.000 |      0.333 |            | 
## ----------------|------------|------------|------------|------------|
##    Column Total |         17 |         15 |         19 |         51 | 
##                 |      0.333 |      0.294 |      0.373 |            | 
## ----------------|------------|------------|------------|------------|
## 
##