Nạp các package cần dùng
#install.packages("tidyverse")
#install.packages("XML")
#install.packages("rvest")
#install.packages("class")
#install.packages("gmodels")
library("readr")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("caret")
## Loading required package: lattice
## Loading required package: ggplot2
library("caTools")
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.5 ✓ stringr 1.4.0
## ✓ tidyr 1.1.2 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
library("XML")
library("rvest")
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
##
## xml
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
library("class")
library("gmodels")
Tạo một data frame mẫu
# Build a small five-row sample data frame to experiment with.
A_hang <- seq_len(5L) # integer row ids 1..5
A_cot <- letters[seq_len(5L)] # the first five lowercase letters, "a".."e"
A_level <- c("M", "F", "M", "M", "F") # two-valued label for each row
A <- data.frame(A_hang = A_hang, A_cot = A_cot, A_level = A_level)
A # print A
## A_hang A_cot A_level
## 1 1 a M
## 2 2 b F
## 3 3 c M
## 4 4 d M
## 5 5 e F
Đổi biến A_level thành factor: đây là biến phân loại (nhãn lớp), và các hàm phân loại như class::knn() nhận nhãn lớp ở dạng factor.
# Store the label column as a factor, then inspect its structure.
A[["A_level"]] <- as.factor(A[["A_level"]])
str(A[["A_level"]])
## Factor w/ 2 levels "F","M": 2 1 2 2 1
# Compact overview of A: one line per column with its type and values.
dplyr::glimpse(A)
## Rows: 5
## Columns: 3
## $ A_hang <int> 1, 2, 3, 4, 5
## $ A_cot <chr> "a", "b", "c", "d", "e"
## $ A_level <fct> M, F, M, M, F
Tạo vector logic A_split (đặt tên theo kiểu <tên data frame>_split cho dễ nhớ): TRUE đánh dấu các dòng thuộc tập train, FALSE đánh dấu các dòng thuộc tập test.
# Draw a train/test indicator from the labels in A_level, with a split
# ratio of 0.6. The result is used below as TRUE = train, FALSE = test.
# NOTE(review): no seed is set before this call, so the rows selected can
# differ between runs - consider calling set.seed() first if the split
# needs to be reproducible.
A_split <- caTools::sample.split(Y = A[["A_level"]], SplitRatio = 0.6)
Tách A thành train set và test set theo kết quả chia ngẫu nhiên trong A_split
# Rows flagged TRUE in A_split form the training set; the remaining rows
# form the test set. drop = FALSE keeps both results as data frames, and
# the original row names are preserved.
A_train <- A[A_split, , drop = FALSE]
A_test <- A[!A_split, , drop = FALSE]
# Show the training rows.
A_train
## A_hang A_cot A_level
## 1 1 a M
## 2 2 b F
## 3 3 c M
# Show the rows held out for the test set.
A_test
## A_hang A_cot A_level
## 4 4 d M
## 5 5 e F