Install packages
#install.packages("readr")
#install.packages("dplyr")
#install.packages("caret")
#install.packages("caTools")
#install.packages("tidyverse")
#install.packages("XML")
#install.packages("rvest")
#install.packages("class")
#install.packages("gmodels")
library("readr")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("caret")
## Loading required package: lattice
## Loading required package: ggplot2
library("caTools")
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.5 ✓ stringr 1.4.0
## ✓ tidyr 1.1.2 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
library("XML")
library("rvest")
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
##
## xml
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
library("class")
library("gmodels")
Import the data set from the file iris flower dataset.txt
# Location of the comma-separated iris measurements inside the project.
iris_path <- "/cloud/project/Data/iris flower dataset.txt"
iris.flower.dataset <- read.csv(file = iris_path)
# Working copy of the imported table; the later steps operate on `iris`.
iris <- data.frame(iris.flower.dataset)
xem biến
str(iris.flower.dataset$species)
## chr [1:150] "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" ...
iris$species
## [1] "setosa" "setosa" "setosa" "setosa" "setosa"
## [6] "setosa" "setosa" "setosa" "setosa" "setosa"
## [11] "setosa" "setosa" "setosa" "setosa" "setosa"
## [16] "setosa" "setosa" "setosa" "setosa" "setosa"
## [21] "setosa" "setosa" "setosa" "setosa" "setosa"
## [26] "setosa" "setosa" "setosa" "setosa" "setosa"
## [31] "setosa" "setosa" "setosa" "setosa" "setosa"
## [36] "setosa" "setosa" "setosa" "setosa" "setosa"
## [41] "setosa" "setosa" "setosa" "setosa" "setosa"
## [46] "setosa" "setosa" "setosa" "setosa" "setosa"
## [51] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
## [56] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
## [61] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
## [66] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
## [71] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
## [76] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
## [81] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
## [86] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
## [91] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
## [96] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
## [101] "virginica" "virginica" "virginica" "virginica" "virginica"
## [106] "virginica" "virginica" "virginica" "virginica" "virginica"
## [111] "virginica" "virginica" "virginica" "virginica" "virginica"
## [116] "virginica" "virginica" "virginica" "virginica" "virginica"
## [121] "virginica" "virginica" "virginica" "virginica" "virginica"
## [126] "virginica" "virginica" "virginica" "virginica" "virginica"
## [131] "virginica" "virginica" "virginica" "virginica" "virginica"
## [136] "virginica" "virginica" "virginica" "virginica" "virginica"
## [141] "virginica" "virginica" "virginica" "virginica" "virginica"
## [146] "virginica" "virginica" "virginica" "virginica" "virginica"
change iris$species to factor
# Store the species names as a factor so they can serve as class labels.
iris$species <- as.factor(iris$species)
# Share of each species in the full data set, as a percentage with one decimal.
species_share <- prop.table(table(iris$species))
round(species_share * 100, digits = 1)
##
## setosa versicolor virginica
## 33.3 33.3 33.3
check iris$species
str(iris$species) # xem
## Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
tạo data frame_split, in this case is iris_split (bắt đầu quá trình phân chia data frame một cách ngẫu nhiên)
# Fix the RNG seed so the random split, and every result derived from it,
# can be reproduced from run to run.
set.seed(123)
# Logical vector with one entry per row of `iris`: TRUE marks rows assigned to
# the training set (about 66 %), FALSE marks rows held out for testing.
iris_split <- sample.split(iris$species, SplitRatio = 0.66)
randomly create the training set & test set (kết thúc quá trình này)
# Rows flagged TRUE in the split form the training set; the remaining rows
# are held out as the test set.
iris_train <- iris[iris_split, ]
iris_test <- iris[!iris_split, ]
#view(iris_train)
# Column 5 (species) supplies the class labels of each partition.
iris_training_label <- as.factor(iris_train[[5]])
iris_test_label <- as.factor(iris_test[[5]])
tạo function đổi về số prob
# Min-max rescale a numeric vector onto the interval [0, 1].
#
# x: numeric vector.
# Returns a numeric vector of the same length as `x`, where the smallest input
# maps to 0 and the largest to 1. If every element of `x` is the same finite
# value, all results are 0 rather than NaN. Missing values are not removed:
# if `x` contains NA, every element of the result is NA.
normalize <- function(x) {
  lo <- min(x)
  span <- max(x) - lo
  # A constant vector has zero span; dividing by it would turn every element
  # into NaN, so return the (all-zero) offsets instead.
  if (!is.na(span) && span == 0) {
    return(x - lo)
  }
  (x - lo) / span
}
áp dụng đổi về số prob
# Rescale the four measurement columns of the training set to [0, 1].
iris_train_n <- as.data.frame(lapply(iris_train[1:4], normalize))
#iris_n$petal_width
# Rescale the test set with the TRAINING minimum and range of each column.
# Using the test set's own min/max would put the two sets on different scales
# and distort the distances that knn() compares. Test values may therefore
# fall slightly outside [0, 1].
train_min <- vapply(iris_train[1:4], min, numeric(1))
train_range <- vapply(iris_train[1:4], max, numeric(1)) - train_min
iris_test_n <- as.data.frame(Map(
  function(column, lo, span) (column - lo) / span,
  iris_test[1:4], train_min, train_range
))
check & view
str(iris_train)
## 'data.frame': 99 obs. of 5 variables:
## $ sepal_length: num 5.1 4.9 4.7 4.6 5 4.9 5.4 4.8 4.8 4.3 ...
## $ sepal_width : num 3.5 3 3.2 3.1 3.4 3.1 3.7 3.4 3 3 ...
## $ petal_length: num 1.4 1.4 1.3 1.5 1.5 1.5 1.5 1.6 1.4 1.1 ...
## $ petal_width : num 0.2 0.2 0.2 0.2 0.2 0.1 0.2 0.2 0.1 0.1 ...
## $ species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
#view(iris_train)
str(iris_test)
## 'data.frame': 51 obs. of 5 variables:
## $ sepal_length: num 5 5.4 4.6 4.4 5.7 5.1 5.4 5.1 4.8 5 ...
## $ sepal_width : num 3.6 3.9 3.4 2.9 4.4 3.5 3.4 3.3 3.4 3.4 ...
## $ petal_length: num 1.4 1.7 1.4 1.4 1.5 1.4 1.7 1.7 1.9 1.6 ...
## $ petal_width : num 0.2 0.4 0.3 0.2 0.4 0.3 0.2 0.5 0.2 0.4 ...
## $ species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
Training thử mẫu đã tạo
# Classify every test row by majority vote among its 13 nearest training rows;
# the result is a factor of predicted species, one entry per test row.
iris_train_process <- knn(
  train = iris_train_n,
  test = iris_test_n,
  cl = iris_training_label,
  k = 13
)
xem iris_train_process
iris_train_process
## [1] setosa setosa setosa setosa setosa setosa
## [7] setosa setosa setosa setosa setosa setosa
## [13] setosa setosa setosa setosa setosa versicolor
## [19] versicolor versicolor versicolor versicolor versicolor versicolor
## [25] versicolor virginica virginica versicolor versicolor versicolor
## [31] versicolor versicolor versicolor versicolor virginica virginica
## [37] virginica virginica virginica virginica virginica virginica
## [43] virginica virginica virginica virginica virginica virginica
## [49] virginica virginica virginica
## Levels: setosa versicolor virginica
lập bảng kiểm tra
# Cross-tabulate the true test labels (rows) against the kNN predictions
# (columns); the chi-square contribution of each cell is left out of the table.
CrossTable(x= iris_test_label, y = iris_train_process, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 51
##
##
## | iris_train_process
## iris_test_label | setosa | versicolor | virginica | Row Total |
## ----------------|------------|------------|------------|------------|
## setosa | 17 | 0 | 0 | 17 |
## | 1.000 | 0.000 | 0.000 | 0.333 |
## | 1.000 | 0.000 | 0.000 | |
## | 0.333 | 0.000 | 0.000 | |
## ----------------|------------|------------|------------|------------|
## versicolor | 0 | 15 | 2 | 17 |
## | 0.000 | 0.882 | 0.118 | 0.333 |
## | 0.000 | 1.000 | 0.105 | |
## | 0.000 | 0.294 | 0.039 | |
## ----------------|------------|------------|------------|------------|
## virginica | 0 | 0 | 17 | 17 |
## | 0.000 | 0.000 | 1.000 | 0.333 |
## | 0.000 | 0.000 | 0.895 | |
## | 0.000 | 0.000 | 0.333 | |
## ----------------|------------|------------|------------|------------|
## Column Total | 17 | 15 | 19 | 51 |
## | 0.333 | 0.294 | 0.373 | |
## ----------------|------------|------------|------------|------------|
##
##