Load the required packages

#install.packages("tidyverse")
#install.packages("XML")
#install.packages("rvest")
#install.packages("class")
#install.packages("gmodels")

library("readr")
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("caret")
## Loading required package: lattice
## Loading required package: ggplot2
library("caTools")
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.5     ✓ stringr 1.4.0
## ✓ tidyr   1.1.2     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::lift()   masks caret::lift()
library("XML")
library("rvest")
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
## 
##     xml
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library("class")
library("gmodels")

Tạo một data frame mẫu

# Build a small example data frame: an integer id, a letter, and an "M"/"F" label.
A_hang <- seq_len(5) # integer ids 1..5
A_cot <- letters[seq_len(5)] # "a" through "e"
A_level <- c("M", "F", "M", "M", "F") # label for each row
A <- data.frame(
  A_hang = A_hang,
  A_cot = A_cot,
  A_level = A_level
)
print(A) # show the data frame
##   A_hang A_cot A_level
## 1      1     a       M
## 2      2     b       F
## 3      3     c       M
## 4      4     d       M
## 5      5     e       F

Đổi biến A_level thành factor vì đây là biến phân loại (nhãn lớp); các hàm phân loại, ví dụ class::knn(), yêu cầu nhãn ở dạng factor.

# Turn the label column of A into a factor, then inspect the result.
A[["A_level"]] <- as.factor(A[["A_level"]]) # label column becomes a factor
str(A[["A_level"]]) # compact structure of the factor
##  Factor w/ 2 levels "F","M": 2 1 2 2 1
dplyr::glimpse(A) # column-by-column overview of A
## Rows: 5
## Columns: 3
## $ A_hang  <int> 1, 2, 3, 4, 5
## $ A_cot   <chr> "a", "b", "c", "d", "e"
## $ A_level <fct> M, F, M, M, F

Tạo vector logic dùng để chia dữ liệu, đặt tên theo mẫu <tên data frame>_split cho dễ nhớ; trong trường hợp này là A_split.

# Draw a random train/test mask over the rows of A from the class labels.
# SplitRatio = 0.6 requests roughly 60% of the rows as TRUE (see caTools docs).
# No seed is set in the visible code, so the mask can differ between runs.
A_split <- caTools::sample.split(Y = A[["A_level"]], SplitRatio = 0.6)

Tạo tập train và tập test từ kết quả chia ngẫu nhiên A_split (TRUE → train, FALSE → test).

# Partition A with the logical mask A_split: TRUE rows go to the training set,
# FALSE rows to the test set. which() ignores NA mask entries, so such rows
# would land in neither set.
train_rows <- which(A_split)
test_rows <- which(!A_split)
A_train <- A[train_rows, , drop = FALSE]
A_test <- A[test_rows, , drop = FALSE]
print(A_train) # rows selected for training
##   A_hang A_cot A_level
## 1      1     a       M
## 2      2     b       F
## 3      3     c       M
print(A_test) # rows held out for testing
##   A_hang A_cot A_level
## 4      4     d       M
## 5      5     e       F