USING R AS A CALCULATOR: SIMPLE EXPRESSIONS
# Evaluate the sum and the quotient of `x` and `y`.
#
# Neither value is printed: the sum is computed and discarded, and the
# quotient `x / y` is the function's value, returned invisibly.
# NOTE(review): this quotient used to be held in a local named `tich`
# (Vietnamese for "product") although the operator is `/` -- confirm whether
# division is really what is wanted here.
s <- function(x, y) {
  total <- x + y # evaluated, but not part of the result
  quotient <- x / y
  invisible(quotient)
}
# class() reports the class of a value: a number literal is "numeric",
# quoted text is "character".
class(36)
## [1] "numeric"
class("DLL&UD")
## [1] "character"
ASSIGNMENTS
# Assignment works in both directions: `<-` stores into the name on the left,
# `->` stores into the name on the right.
x <- 3
4 -> y
x^y # 3 raised to the power 4
## [1] 81
STRING
s1 <- "Du lieu lon"
length(s1) # vector length: one string is a vector with a single element
## [1] 1
nchar(s1) # number of characters in the string
## [1] 11
s2 <- "kho vaii"
s3 <- paste(s1, s2) # joins both into ONE string, separated by a space
s4 <- c(s1, s2) # keeps the two strings as separate vector elements
length(s3)
## [1] 1
length(s4)
## [1] 2
nchar(s3)
## [1] 20
INDEXING VECTORS
# Three vectors of different types: numeric, character and logical.
v1 <- c(1, 3, 5, 7, 9)
v2<-c("Mot tuan hoc 7 ngay van ngoo")
v3<-c(TRUE, TRUE, FALSE, FALSE)
sprintf("Do dai v1: %d", length(v1))
## [1] "Do dai v1: 5"
v1[2] # one element by position (positions start at 1)
## [1] 3
v1[c(1, 3)] # several positions at once, given as an index vector
## [1] 1 5
BOOLEAN INDEXING
v <- c(-11, 3, -4, 5, 7, 9, 10)
v%%2==0 # logical mask: TRUE where the element is even
## [1] FALSE FALSE TRUE FALSE FALSE FALSE TRUE
v1<-v[v%%2==0] # keep the even elements (drops the odd ones)
v1
## [1] -4 10
v2<-v[v%%3==0] # keep the elements divisible by 3
v2
## [1] 3 9
v3<-v[-(1: 5)] # negative index: drop positions 1 through 5 (not just 1 and 5)
v3
## [1] 9 10
v4<-v[c(2:4)] # take the elements at positions 2 to 4
v4
## [1] 3 -4 5
NAMED VECTORS
# Element names can be supplied when the vector is built ...
v<-c("class"="KHMT", "name"="ahn")
v["name"] # look an element up by its name
## name
## "ahn"
# ... or attached afterwards through names(x) <- value.
v1<-c(10, 9, 8)
names(v1)<-c("A", "B", "C")
names(v1)
## [1] "A" "B" "C"
VECTORIZED EXPRESSION
# Arithmetic operators act element by element; `**` is another spelling of `^`.
v1<-c(2, 4, 6, 8)
v1**2
## [1] 4 16 36 64
v2<-c(1, 2, 3, 4)
v2**2-v1 # the power is applied first: (v2^2) - v1
## [1] -1 0 3 8
FUNCTION
# Add two values with `+`.
#
# a, b: the operands; vectors are combined element by element.
# Returns a + b.
s1 <- function(a, b) {
  a + b
}
s1(1, 8)
## [1] 9
# Arithmetic mean of `x`: the sum of its elements divided by how many there are.
#
# Missing values are not removed, so any NA in `x` makes the result NA, and an
# empty vector yields NaN (0 / 0).
tbc <- function(x) {
  total <- sum(x)
  count <- length(x)
  total / count
}
tbc(1:10)
## [1] 5.5
# Square `x` and cube `y`.
#
# Only the cube of `y` is the function's value, and it is returned invisibly,
# so calling fc() at the prompt prints nothing. The square of `x` is computed
# and then discarded.
fc <- function(x, y) {
  x_squared <- x^2 # evaluated, but not part of the result
  y_cubed <- y^3
  invisible(y_cubed)
}
fc(1:3, rev(1:3)) # rev() reverses a vector, so y is 3, 2, 1; fc() ends in an assignment, so nothing is printed
# Test, element by element, whether `n` is a perfect square.
#
# n: numeric vector.
# Returns a logical vector the same length as `n`: TRUE where the element is
# the square of a whole number, FALSE otherwise (including negative and
# infinite values), NA where the element is NA.
#
# The check squares the rounded root and compares it with `n` instead of
# testing `sqrt(n) %% 1 == 0`: for large inputs sqrt() of a non-square can
# round to a whole number (e.g. 94906265^2 + 1), which the remainder test
# would wrongly accept.
ktcp <- function(n) {
  # Clamp negatives to 0 before sqrt() so no NaN (and no warning) is produced;
  # they are rejected by the `n >= 0` term below.
  root <- round(sqrt(pmax(n, 0)))
  n >= 0 & !is.infinite(n) & root * root == n
}
# Keep the elements of `x` that ktcp() accepts and replace every other element
# with the marker "S". Mixing kept values with the string marker makes the
# result a character vector whenever at least one element is replaced.
dscp <- function(x) {
  is_square <- ktcp(x)
  ifelse(is_square, x, "S")
}
dscp(1:10)
## [1] "1" "S" "S" "4" "S" "S" "S" "S" "9" "S"
##for while
# Sum the elements of `x` with an explicit loop.
#
# The running total starts at 0 and each element is added in position order,
# so an empty `x` gives 0. Elements are picked with `x[pos]`, exactly one
# position at a time.
tongw <- function(x) {
  total <- 0
  for (pos in seq_along(x)) {
    total <- total + x[pos]
  }
  total
}
tongw(1: 10)
## [1] 55
FACTOR
# Without `levels`, the levels are the distinct values in sorted order.
f<-factor(c("S", "M", "M", "S", "L", "M", "XL"))
f
## [1] S M M S L M XL
## Levels: L M S XL
# An explicit `levels` fixes the level order and may list a level ("L") that
# does not occur in the data.
ff<-factor(c("S", "M", "M", "S", "M", "XL"), levels = c("S", "M", "L", "XL"))
ff
## [1] S M M S M XL
## Levels: S M L XL
# Reversing `levels` changes only the level order; the data stay A to E.
f1<-factor(LETTERS[1:5], levels = rev(LETTERS[1:5]))
f1
## [1] A B C D E
## Levels: E D C B A
DATA FRAMES
# Build a data frame column by column; every column has four entries.
df<-data.frame(
name=c("Anh", "An", "lan", "Thanh"),
sex=c("FM", "M", "FM", "M"),
ad=c("Thai Nguyen", "Ha Noi", "Thai Nguyen", "Hai Phong"),
gpa=c(10, 7, 9, 8)
)
df
## name sex ad gpa
## 1 Anh FM Thai Nguyen 10
## 2 An M Ha Noi 7
## 3 lan FM Thai Nguyen 9
## 4 Thanh M Hai Phong 8
df$name # access a column by name
## [1] "Anh" "An" "lan" "Thanh"
df$TB<-c("A", "C", "B", "B") # add a new column
df
## name sex ad gpa TB
## 1 Anh FM Thai Nguyen 10 A
## 2 An M Ha Noi 7 C
## 3 lan FM Thai Nguyen 9 B
## 4 Thanh M Hai Phong 8 B
# Overwrite a single cell: the gpa value in row 2.
df$gpa[2]<-3
df
## name sex ad gpa TB
## 1 Anh FM Thai Nguyen 10 A
## 2 An M Ha Noi 3 C
## 3 lan FM Thai Nguyen 9 B
## 4 Thanh M Hai Phong 8 B
# Row filter: keep the rows whose gpa exceeds 7, with all columns.
df[df$gpa>7, ]
## name sex ad gpa TB
## 1 Anh FM Thai Nguyen 10 A
## 3 lan FM Thai Nguyen 9 B
## 4 Thanh M Hai Phong 8 B
DEALING WITH MISSING VALUES
n <- c(1, 2, 3, NA, 5)
any(is.na(n)) # does the vector contain at least one NA?
## [1] TRUE
sum(n, na.rm = TRUE) # na.rm = TRUE skips the NA while summing
## [1] 11
n[!is.na(n)] # the non-missing elements only
## [1] 1 2 3 5
n # the vector itself is unchanged by the calls above
## [1] 1 2 3 NA 5
# The same ideas on a data frame.
df<-data.frame(
name=c("Anh", "Ngoc", "Minh"),
age=c(20, 21, NA)
)
df
## name age
## 1 Anh 20
## 2 Ngoc 21
## 3 Minh NA
rowSums(is.na(df)) # number of missing cells in each row
## [1] 0 0 1
na.omit(df) # drop every row that has a missing value
## name age
## 1 Anh 20
## 2 Ngoc 21
DATA PIPELINES
library(magrittr) # provides the %>% pipe operator
n<-c(1, 2, 3, 4, NA, 6, 7, NA)
sum(na.omit(n)) # nested-call form
## [1] 23
n%>% # the same computation written as a pipeline
na.omit()%>%
sum()
## [1] 23
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df<-data.frame(
name=c("Anh", "Ngoc", "Minh", "Lan"),
age=c(20, 21, NA, 22),
score = c(8, NA, 7, 9)
)
# Mean age and mean score, computed over the rows that have no missing value
df%>%
na.omit()%>%
summarise(
tb_age=mean(age),
tb_d=mean(score)
)
## tb_age tb_d
## 1 21 8.5
# Name of the person with the highest score (among complete rows)
df%>%
na.omit()%>%
filter(score==max(score))%>%
select(name, score)
## name score
## 1 Lan 9
# Add a column derived from score
df%>%
na.omit()%>%
mutate(status=ifelse(score>=8, "Pass", "Fail"))
## name age score status
## 1 Anh 20 8 Pass
## 4 Lan 22 9 Pass
# The "magical dot": `.` stands for the value on the left-hand side of %>%
df%>%
na.omit()%>%
.[.$age>=21, ]
## name age score
## 4 Lan 22 9
ANONYMOUS FUNCTION
# Here `.` appears only inside a nested expression (`.^2`), so %>% still
# passes the piped vector as the first argument as well; that is where the
# column named `.` in the output comes from. The draws are random, so the
# numbers below differ from run to run.
rnorm(5)%>%
data.frame(x=1:5, y=.^2)
## . x y
## 1 -0.06978946 1 0.004870569
## 2 -0.25575491 2 0.065410573
## 3 -0.07192501 3 0.005173207
## 4 0.36318992 4 0.131906915
## 5 1.77234831 5 3.141218519
library(dplyr)
df<-data.frame(
name=c("Anh", "Ngoc", "Minh", "Lan"),
age=c(20, 21, NA, 22),
score = c(8, NA, 7, 9)
)
# Pipe into an anonymous function: wrapped in parentheses, it is called with
# the piped data frame as `d`. It adds a status column, then keeps the rows
# with age >= 22.
df %>%
na.omit()%>%
(function(d){
d$status<-ifelse(d$score>=8, "PASS", "FAIL")
d[d$age >= 22, ]
})
## name age score status
## 4 Lan 22 9 PASS
OTHER PIPELINE OPERATIONS
library(magrittr)
d<-data.frame(
x=c(1:4),
y=c(11:14)
)
# %$% makes the data frame's columns available by name on the right-hand side
d%$%
mean(x)
## [1] 2.5
# %T>% (tee): look at the data mid-way without interrupting the pipeline;
# print() runs for its side effect and `d` itself is passed on to nrow()
d %T>%
print() %>%
nrow()
## x y
## 1 1 11
## 2 2 12
## 3 3 13
## 4 4 14
## [1] 4
# %<>% pipes `d` through the right-hand side and overwrites `d` with the result
d %<>%
subset(x > 1)
Bai 1: MEAN OF POSITIVE VALUES
# Turn every negative draw into NA, then average what is left (zeros are
# kept). The braces make `.` the only place the piped vector is used. The
# draws are random, so the value below differs from run to run.
rnorm(10)%>%
{ifelse(.<0, NA, .)}%>%
mean(na.rm=TRUE)
## [1] 1.154066
BAI 2: ROOT MEAN SQUARE ERROR. Nếu bạn có các giá trị thực
\[ t = (t_1, \ldots, t_n) \]
và các giá trị dự đoán
\[ y = (y_1, \ldots, y_n), \]
thì sai số căn trung bình bình phương được định nghĩa là:
\[ RMSE(t, y) = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (t_i - y_i)^2} \]
# Ten random true values `t` and ten random predictions `y`, each drawn from
# 1..10 with replacement. No seed is set, so the result below differs per run.
df <- data.frame(
t = sample(1:10, 10, replace = TRUE),
y = sample(1:10, 10, replace = TRUE)
)
# RMSE as a pipeline: squared differences -> mean -> square root
df %>%
{ (.$t - .$y)^2 } %>%
mean() %>%
sqrt()
## [1] 3.794733