USING R AS A CALCULATOR: SIMPLE EXPRESSIONS

# Compute the sum and the product of two values.
#
# Args:
#   x, y: numeric vectors; combined element-wise with R's usual recycling.
#
# Returns:
#   A named list with two elements:
#     tong - the sum, x + y
#     tich - the product, x * y
s <- function(x, y) {
  # "tong" = sum, "tich" = product, so the product must use `*`.
  # Both results are returned explicitly: a function body that ends in an
  # assignment would return only that last value, and invisibly.
  list(
    tong = x + y,
    tich = x * y
  )
}

# class() reports the class of a value.
class(36)        # a bare number literal is reported as "numeric"
## [1] "numeric"
class("DLL&UD")  # quoted text is a character string
## [1] "character"

ASSIGNMENTS

# R accepts assignment in both directions: `<-` (left) and `->` (right).
x <- 3
4 -> y    # right assignment: stores 4 in y
x^y       # 3 raised to the power 4
## [1] 81

STRING

# A single string is a character vector of length 1; length() counts vector
# elements, nchar() counts characters.
s1 <- "Du lieu lon"
length(s1)   # vector length
## [1] 1
nchar(s1)    # number of characters
## [1] 11
s2 <- "kho vaii"
s3 <- paste(s1, s2)   # joins both into ONE string, separated by a space
s4 <- c(s1, s2)       # keeps them as TWO elements of one vector

length(s3)
## [1] 1
length(s4)
## [1] 2
nchar(s3)    # 11 + 1 (separator) + 8
## [1] 20

INDEXING VECTORS

# Three vectors of different types: numeric, character and logical.
# Only v1 is used in this section.
v1 <- c(1, 3, 5, 7, 9)
v2<-c("Mot tuan hoc 7 ngay van ngoo")
v3<-c(TRUE, TRUE, FALSE, FALSE)
sprintf("Do dai v1: %d", length(v1))   # %d is filled with the length of v1
## [1] "Do dai v1: 5"
v1[2]          # one element by position (positions start at 1)
## [1] 3
v1[c(1, 3)]    # several positions at once
## [1] 1 5

BOOLEAN INDEXING

# A logical vector used as an index keeps the elements where it is TRUE.
v <- c(-11, 3, -4, 5, 7, 9, 10)
v%%2==0
## [1] FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE
v1<-v[v%%2==0]      # keep the even elements (drops the odd ones)
v1
## [1] -4 10
v2<-v[v%%3==0]      # keep the multiples of 3
v2
## [1] 3 9
v3<-v[-(1: 5)]       # negative index: drop positions 1 through 5
v3
## [1]  9 10
v4<-v[c(2:4)]      # take the elements at positions 2 to 4
v4
## [1]  3 -4  5

NAMED VECTORS

# Vector elements can carry names and be looked up by name.
v<-c("class"="KHMT", "name"="ahn")
v["name"]      # index by name instead of by position
##  name 
## "ahn"
v1<-c(10, 9, 8)
names(v1)<-c("A", "B", "C")   # attach names to an existing vector
names(v1)
## [1] "A" "B" "C"

VECTORIZED EXPRESSION

# Arithmetic operators act element by element on whole vectors.
v1<-c(2, 4, 6, 8)
v1**2          # `**` is accepted as a synonym for `^`
## [1]  4 16 36 64
v2<-c(1, 2, 3, 4)
v2**2-v1       # squares v2 first, then subtracts v1 element-wise
## [1] -1  0  3  8

FUNCTION

# Add two values.
#
# Args:
#   a, b: values supporting `+` (element-wise for vectors).
#
# Returns:
#   a + b
s1 <- function(a, b) {
  # The value of the last expression is the function's result.
  a + b
}
s1(1, 8)
## [1] 9
# Arithmetic mean of a vector, computed as total divided by element count.
#
# Args:
#   x: numeric vector.
#
# Returns:
#   sum(x) / length(x). NA values are not removed.
tbc <- function(x) {
  total <- sum(x)
  count <- length(x)
  total / count
}
tbc(1:10)
## [1] 5.5
# Square `x` and cube `y`.
#
# Args:
#   x, y: numeric vectors.
#
# Returns:
#   A named list with two elements:
#     t  - x squared
#     t1 - y cubed
fc <- function(x, y) {
  # Return both results visibly. Ending the body with an assignment would
  # return only the last value, invisibly, so the call below would print
  # nothing and the square would be lost.
  list(
    t = x^2,
    t1 = y^3
  )
}
fc(1:3, rev(1:3))     # rev() reverses its argument

# Test whether each element of `n` is a perfect square.
#
# Args:
#   n: numeric vector.
#
# Returns:
#   Logical vector, TRUE where the square root of the element has no
#   fractional part.
ktcp <- function(n) {
  frac_part <- sqrt(n) %% 1
  frac_part == 0
}

# Keep the perfect squares in `x` and replace every other element with "S".
# Because the replacement is a string, the result is a character vector
# (see the recorded output below).
dscp <- function(x) {
  is_square <- ktcp(x)
  ifelse(is_square, x, "S")
}
dscp(1:10)
##  [1] "1" "S" "S" "4" "S" "S" "S" "S" "9" "S"
##for while
# Sum the elements of `x` with an explicit while loop.
#
# Args:
#   x: numeric vector.
#
# Returns:
#   The running total of all elements, starting from 0 (so an empty `x`
#   gives 0).
tongw <- function(x) {
  n <- length(x)
  total <- 0
  pos <- 1
  # Walk the positions 1..n, adding one element per iteration.
  while (pos <= n) {
    total <- total + x[pos]
    pos <- pos + 1
  }
  total
}
tongw(1: 10)
## [1] 55

FACTOR

# Without `levels`, the levels are the sorted unique values of the data.
f<-factor(c("S", "M", "M", "S", "L", "M", "XL"))
f
## [1] S  M  M  S  L  M  XL
## Levels: L M S XL
# Explicit `levels` fix the level order and may include a level ("L") that
# does not occur in the data.
ff<-factor(c("S", "M", "M", "S", "M", "XL"), levels = c("S", "M", "L", "XL"))
ff
## [1] S  M  M  S  M  XL
## Levels: S M L XL
# Levels supplied in reverse order; the data itself keeps its original order.
f1<-factor(LETTERS[1:5], levels = rev(LETTERS[1:5]))
f1
## [1] A B C D E
## Levels: E D C B A

DATA FRAMES

# Build a data frame column by column; all columns must have the same length.
df<-data.frame(
  name=c("Anh", "An", "lan", "Thanh"),
  sex=c("FM", "M", "FM", "M"),
  ad=c("Thai Nguyen", "Ha Noi", "Thai Nguyen", "Hai Phong"),
  gpa=c(10, 7, 9, 8)
)  
df
##    name sex          ad gpa
## 1   Anh  FM Thai Nguyen  10
## 2    An   M      Ha Noi   7
## 3   lan  FM Thai Nguyen   9
## 4 Thanh   M   Hai Phong   8
df$name     # access a column by name
## [1] "Anh"   "An"    "lan"   "Thanh"
df$TB<-c("A", "C", "B", "B")      # add a new column
df
##    name sex          ad gpa TB
## 1   Anh  FM Thai Nguyen  10  A
## 2    An   M      Ha Noi   7  C
## 3   lan  FM Thai Nguyen   9  B
## 4 Thanh   M   Hai Phong   8  B
df$gpa[2]<-3                      # overwrite one cell: gpa of row 2
df
##    name sex          ad gpa TB
## 1   Anh  FM Thai Nguyen  10  A
## 2    An   M      Ha Noi   3  C
## 3   lan  FM Thai Nguyen   9  B
## 4 Thanh   M   Hai Phong   8  B
df[df$gpa>7, ]                    # keep the rows with gpa > 7, all columns
##    name sex          ad gpa TB
## 1   Anh  FM Thai Nguyen  10  A
## 3   lan  FM Thai Nguyen   9  B
## 4 Thanh   M   Hai Phong   8  B

DEALING WITH MISSING VALUES

# NA marks a missing value; most summaries need it handled explicitly.
n <- c(1, 2, 3, NA, 5)

any(is.na(n))            # is there at least one missing value?
## [1] TRUE
sum(n, na.rm = TRUE)     # ignore NA while summing
## [1] 11
n[!is.na(n)]             # the non-missing elements only
## [1] 1 2 3 5
n                        # the vector itself is unchanged
## [1]  1  2  3 NA  5
df<-data.frame(
  name=c("Anh", "Ngoc", "Minh"),
  age=c(20, 21, NA)
)
df
##   name age
## 1  Anh  20
## 2 Ngoc  21
## 3 Minh  NA
rowSums(is.na(df))       # number of missing values in each row
## [1] 0 0 1
na.omit(df)     # drop every row that contains an NA
##   name age
## 1  Anh  20
## 2 Ngoc  21

DATA PIPELINES

library(magrittr)            # provides the %>% pipe operator
n<-c(1, 2, 3, 4, NA, 6, 7, NA)
sum(na.omit(n)) # nested-call form, for comparison with the pipeline below
## [1] 23
n%>%                       # the same computation written as a pipeline
  na.omit()%>%
  sum()
## [1] 23
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df<-data.frame(
  name=c("Anh", "Ngoc", "Minh", "Lan"),
  age=c(20, 21, NA, 22),
  score = c(8, NA, 7, 9)
)

# Mean age and mean score, computed over the complete rows only
df%>%
  na.omit()%>%
  summarise(
    tb_age=mean(age),
    tb_d=mean(score)
  )
##   tb_age tb_d
## 1     21  8.5
# Name of the person with the highest score
df%>%
  na.omit()%>%
  filter(score==max(score))%>%
  select(name, score)
##   name score
## 1  Lan     9
# Add a column
df%>%
  na.omit()%>%
  mutate(status=ifelse(score>=8, "Pass", "Fail"))
##   name age score status
## 1  Anh  20     8   Pass
## 4  Lan  22     9   Pass
# The "magical dot": `.` stands for the data on the left-hand side of %>%
df%>%
  na.omit()%>%
  .[.$age>=21, ]
##   name age score
## 4  Lan  22     9

ANONYMOUS FUNCTION

# The dot appears only inside the nested expression `.^2`, so the piped value
# is still passed as the first argument of data.frame() as well; that is why
# the result has a column named `.`. The values are random and differ per run.
rnorm(5)%>%
  data.frame(x=1:5, y=.^2)
##             . x           y
## 1 -0.06978946 1 0.004870569
## 2 -0.25575491 2 0.065410573
## 3 -0.07192501 3 0.005173207
## 4  0.36318992 4 0.131906915
## 5  1.77234831 5 3.141218519
library(dplyr)
df<-data.frame(
  name=c("Anh", "Ngoc", "Minh", "Lan"),
  age=c(20, 21, NA, 22),
  score = c(8, NA, 7, 9)
)

# An anonymous function wrapped in parentheses can be a pipeline step; it
# receives the piped data frame as `d`.
df %>%
  na.omit()%>%
  (function(d){
    d$status<-ifelse(d$score>=8, "PASS", "FAIL")   # label each row
    d[d$age >= 22, ]                               # then keep age >= 22
  })
##   name age score status
## 4  Lan  22     9   PASS

OTHER PIPELINE OPERATIONS

library(magrittr)
d<-data.frame(
  x=c(1:4), 
  y=c(11:14)
)
# %$% exposes the columns of the data frame to the right-hand expression
d%$%
  mean(x)
## [1] 2.5
# %T>% lets you inspect an intermediate value without interrupting the
# pipeline: print() runs for its side effect and `d` itself flows on to nrow()
d %T>%
  print() %>%
  nrow()
##   x  y
## 1 1 11
## 2 2 12
## 3 3 13
## 4 4 14
## [1] 4
# %<>% pipes `d` through the right-hand side and overwrites `d` with the result
d %<>%
  subset(x > 1)

BAI 1: MEAN OF POSITIVE VALUES

# Draw 10 random normal values, turn the negative ones into NA, then average
# what is left. The result is random and differs per run.
rnorm(10)%>%
  {ifelse(.<0, NA, .)}%>%     # braces: use `.` freely, no first-argument insertion
  mean(na.rm=TRUE)            # na.rm skips the values replaced by NA
## [1] 1.154066

BAI 2: ROOT MEAN SQUARE ERROR

Nếu bạn có các giá trị thực

\[ t = (t_1, \ldots, t_n) \]

và các giá trị dự đoán

\[ y = (y_1, \ldots, y_n), \]

thì sai số căn trung bình bình phương được định nghĩa là:

\[ RMSE(t, y) = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (t_i - y_i)^2} \]

# Ten random true values (t) and ten random predictions (y), each drawn from
# 1..10 with replacement; the result below differs per run.
df <- data.frame(
  t = sample(1:10, 10, replace = TRUE),
  y = sample(1:10, 10, replace = TRUE)
)

# RMSE as a pipeline: squared errors -> their mean -> square root
df %>%
  { (.$t - .$y)^2 } %>%
  mean() %>%
  sqrt()
## [1] 3.794733