USING R AS A CALCULATOR

SIMPLE EXPRESSION

# Compute the sum and the quotient of two numbers (or numeric vectors).
#
# Args:
#   x: numeric value(s), the first operand.
#   y: numeric value(s), the second operand (used as the divisor).
#
# Returns:
#   A named list with `tong` (x + y) and `tich` (x / y).
s <- function(x, y) {
  tong <- x + y
  # This line used to read `tich<x/y`: a comparison against an object that
  # does not exist, not an assignment, so every call stopped with an error.
  # NOTE(review): the name "tich" suggests a product, yet the expression
  # divides; the division is kept as written -- confirm the intended operation.
  tich <- x / y
  # Hand both results back so neither computation is thrown away.
  list(tong = tong, tich = tich)
}

# class() reports the class of a value: a bare number is "numeric" ...
class(36)
## [1] "numeric"
# ... and a quoted string is "character".
class("DLL&UD")
## [1] "character"

ASSIGNMENTS

# Assignment works in both directions: the arrow points at the variable.
x <- 3
4 -> y  # right-assignment, shown here only to illustrate the syntax
# Exponentiation: x raised to the power y
x^y
## [1] 81

STRING

s1 <- "Du lieu lon"
length(s1)  # vector length: a single string is a one-element vector
## [1] 1
nchar(s1)  # number of characters in the string
## [1] 11
s2 <- "kho vaii"
s3 <- paste(s1, s2)  # join the two strings into one
s4 <- c(s1, s2)  # build a vector holding both strings
length(s3)
## [1] 1
length(s4)
## [1] 2
nchar(s3)
## [1] 20

INDEXING VECTORS

v1 <- c(1, 3, 5, 7, 9)
v2 <- "Mot tuan hoc 7 ngay van ngoo"
v3 <- c(TRUE, TRUE, FALSE, FALSE)
sprintf("Do dai v1: %d", length(v1))
## [1] "Do dai v1: 5"
v1[2]  # a single position
## [1] 3
v1[c(1, 3)]  # several positions at once
## [1] 1 5

BOOLEAN INDEXING

v <- c(-11, 3, -4, 5, 7, 9, 10)
v %% 2 == 0
## [1] FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE
v1 <- v[v %% 2 == 0]  # keep the even elements, i.e. drop the odd ones
v1
## [1] -4 10
v2 <- v[v %% 3 == 0]  # keep the multiples of 3
v2
## [1] 3 9
v3 <- v[-(1:5)]  # negative indices drop positions 1 through 5
v3
## [1]  9 10
v4 <- v[2:4]  # take positions 2 through 4
v4
## [1]  3 -4  5

NAMED VECTORS

# Names can be given inline when the vector is built ...
v <- c(class = "KHMT", name = "ahn")
v["name"]
##  name 
## "ahn"
# ... or attached afterwards through names()
v1 <- c(10, 9, 8)
names(v1) <- c("A", "B", "C")
names(v1)
## [1] "A" "B" "C"

VECTORIZED EXPRESSION

# Arithmetic operators work element by element on whole vectors
v1 <- c(2, 4, 6, 8)
v1^2
## [1]  4 16 36 64
v2 <- c(1, 2, 3, 4)
v2^2 - v1
## [1] -1  0  3  8

FUNCTION

# Add two values.
#
# Args:
#   a, b: the operands.
#
# Returns:
#   a + b
s1 <- function(a, b) {
  a + b
}
s1(1, 8)
## [1] 9
# Arithmetic mean of a vector: the sum of the elements divided by their count.
#
# Args:
#   x: a vector accepted by sum().
#
# Returns:
#   sum(x) / length(x)
tbc <- function(x) {
  total <- sum(x)
  total / length(x)
}
tbc(1:10)
## [1] 5.5
# Square x and cube y.
#
# Args:
#   x, y: numeric vectors.
#
# Returns:
#   The cube of y, invisibly, because the last expression is an assignment.
#   NOTE(review): the square of x is computed but never returned -- confirm
#   whether both results were meant to be handed back.
fc <- function(x, y) {
  x_squared <- x^2
  y_cubed <- y^3
}
fc(1:3, rev(1:3))  # rev() reverses its argument; nothing prints (invisible)

# Test whether each element of n is a perfect square.
#
# Args:
#   n: a numeric vector.
#
# Returns:
#   A logical vector: TRUE where sqrt(n) has no fractional part.
ktcp <- function(n) {
  root <- sqrt(n)
  fractional_part <- root %% 1
  fractional_part == 0
}

# Keep the perfect squares of x and replace every other element with "S".
# Mixing numbers with the string "S" makes the result a character vector.
dscp <- function(x) {
  is_square <- ktcp(x)
  ifelse(is_square, x, "S")
}
dscp(1:10)
##  [1] "1" "S" "S" "4" "S" "S" "S" "S" "9" "S"
# Loops: summing a vector with while
#
# Args:
#   x: a numeric vector.
#
# Returns:
#   The running total of the elements of x (0 when x is empty).
tongw <- function(x) {
  total <- 0
  pos <- 1
  n <- length(x)
  while (pos <= n) {
    total <- total + x[pos]
    pos <- pos + 1
  }
  total
}
tongw(1:10)
## [1] 55

FACTOR

# Without `levels`, the levels are the sorted unique values
f <- factor(c("S", "M", "M", "S", "L", "M", "XL"))
f
## [1] S  M  M  S  L  M  XL
## Levels: L M S XL
# An explicit `levels` argument fixes the order and may list unused levels
ff <- factor(
  c("S", "M", "M", "S", "M", "XL"),
  levels = c("S", "M", "L", "XL")
)
ff
## [1] S  M  M  S  M  XL
## Levels: S M L XL
# The values keep their order; only the level order is reversed
f1 <- factor(LETTERS[1:5], levels = rev(LETTERS[1:5]))
f1
## [1] A B C D E
## Levels: E D C B A

DATA FRAMES

df <- data.frame(
  name = c("Anh", "An", "lan", "Thanh"),
  sex = c("FM", "M", "FM", "M"),
  ad = c("Thai Nguyen", "Ha Noi", "Thai Nguyen", "Hai Phong"),
  gpa = c(10, 7, 9, 8)
)
df
##    name sex          ad gpa
## 1   Anh  FM Thai Nguyen  10
## 2    An   M      Ha Noi   7
## 3   lan  FM Thai Nguyen   9
## 4 Thanh   M   Hai Phong   8
df$name  # access a column by name
## [1] "Anh"   "An"    "lan"   "Thanh"
df$TB <- c("A", "C", "B", "B")  # add a new column
df
##    name sex          ad gpa TB
## 1   Anh  FM Thai Nguyen  10  A
## 2    An   M      Ha Noi   7  C
## 3   lan  FM Thai Nguyen   9  B
## 4 Thanh   M   Hai Phong   8  B
df$gpa[2] <- 3  # overwrite a single cell
df
##    name sex          ad gpa TB
## 1   Anh  FM Thai Nguyen  10  A
## 2    An   M      Ha Noi   3  C
## 3   lan  FM Thai Nguyen   9  B
## 4 Thanh   M   Hai Phong   8  B
df[df$gpa > 7, ]  # keep the rows whose gpa is above 7
##    name sex          ad gpa TB
## 1   Anh  FM Thai Nguyen  10  A
## 3   lan  FM Thai Nguyen   9  B
## 4 Thanh   M   Hai Phong   8  B

DEALING WITH MISSING VALUES

n <- c(1, 2, 3, NA, 5)
n
## [1]  1  2  3 NA  5
any(is.na(n))  # is there at least one NA?
## [1] TRUE
sum(n, na.rm = TRUE)  # skip the NA while summing
## [1] 11
n[!is.na(n)]  # keep only the non-NA elements
## [1] 1 2 3 5
n  # the original vector is unchanged
## [1]  1  2  3 NA  5
df <- data.frame(
  name = c("Anh", "Ngoc", "Minh"),
  age = c(20, 21, NA)
)
df
##   name age
## 1  Anh  20
## 2 Ngoc  21
## 3 Minh  NA
rowSums(is.na(df))  # number of NA values in each row
## [1] 0 0 1
na.omit(df)  # drop the rows that contain an NA
##   name age
## 1  Anh  20
## 2 Ngoc  21

DATA PIPELINES (OR POINT-FREE PROGRAMMING)

library(magrittr)            #library(magrittr)
n <- c(1, 2, 3, 4, NA, 6, 7, NA)
sum(na.omit(n))  # without a pipeline: nested calls, read inside-out
## [1] 23
n %>%  # with a pipeline: the same steps, read top to bottom
  na.omit() %>%
  sum()
## [1] 23
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df <- data.frame(
  name = c("Anh", "Ngoc", "Minh", "Lan"),
  age = c(20, 21, NA, 22),
  score = c(8, NA, 7, 9)
)

# Mean age and mean score over the complete rows
df %>%
  na.omit() %>%
  summarise(
    tb_age = mean(age),
    tb_d = mean(score)
  )
##   tb_age tb_d
## 1     21  8.5
# Name of the person with the highest score
df %>%
  na.omit() %>%
  filter(score == max(score)) %>%
  select(name, score)
##   name score
## 1  Lan     9
# Add a derived column
df %>%
  na.omit() %>%
  mutate(status = ifelse(score >= 8, "Pass", "Fail"))
##   name age score status
## 1  Anh  20     8   Pass
## 4  Lan  22     9   Pass
# The "magical dot": `.` stands for the value on the left of %>%
df %>%
  na.omit() %>%
  .[.$age >= 21, ]
##   name age score
## 4  Lan  22     9
# The dot also lets the piped value go to an argument other than the first
df %>%
  na.omit() %>%
  lm(score ~ age, data = .)
## 
## Call:
## lm(formula = score ~ age, data = .)
## 
## Coefficients:
## (Intercept)          age  
##        -2.0          0.5

ANONYMOUS FUNCTION

# The piped vector is still passed as the first argument (the column named
# "."), and the dot is reused to build y from it. Values differ on every run.
rnorm(5) %>%
  data.frame(x = 1:5, y = .^2)
##             . x            y
## 1 -0.32651439 1 0.1066116477
## 2 -0.01482197 2 0.0002196908
## 3  0.68759767 3 0.4727905546
## 4  0.85200107 4 0.7259058166
## 5  0.81095626 5 0.6576500486
library(dplyr)
df <- data.frame(
  name = c("Anh", "Ngoc", "Minh", "Lan"),
  age = c(20, 21, NA, 22),
  score = c(8, NA, 7, 9)
)

# An anonymous function wrapped in parentheses can be a pipeline step:
# it receives the piped data frame as its only argument.
df %>%
  na.omit() %>%
  (function(rows) {
    rows$status <- ifelse(rows$score >= 8, "PASS", "FAIL")
    rows[rows$age >= 22, ]
  })
##   name age score status
## 4  Lan  22     9   PASS

OTHER PIPELINE OPERATIONS

library(magrittr)
d <- data.frame(
  x = 1:4,
  y = 11:14
)
# %$% exposes the columns of the data frame to the expression on the right
d %$%
  mean(x)
## [1] 2.5
# %T>% lets print() peek at the data without interrupting the pipeline
d %T>%
  print() %>%
  nrow()
##   x  y
## 1 1 11
## 2 2 12
## 3 3 13
## 4 4 14
## [1] 4
# %<>% pipes d through the step and overwrites d with the result
d %<>%
  subset(x > 1)

Bai 1: MEAN OF POSITIVE VALUES

# Replace the negative draws with NA, then average what is left.
# The draws are random, so the printed value differs on every run.
rnorm(10) %>%
  {
    ifelse(. < 0, NA, .)
  } %>%
  mean(na.rm = TRUE)
## [1] 0.9597316

Bai 2: ROOT MEAN SQUARE ERROR

Nếu bạn có các giá trị thực

\[ t = (t_1, \ldots, t_n) \]

và các giá trị dự đoán

\[ y = (y_1, \ldots, y_n), \]

thì căn bậc hai của sai số bình phương trung bình (RMSE) được định nghĩa là:

\[ RMSE(t, y) = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (t_i - y_i)^2} \]

# Simulated data: t holds the true values, y the predictions.
# sample() is random, so the printed RMSE differs on every run.
df <- data.frame(
  t = sample(1:10, 10, replace = TRUE),
  y = sample(1:10, 10, replace = TRUE)
)

# RMSE, one step at a time: errors -> squares -> mean -> square root
df %>%
  {
    .$t - .$y
  } %>%
  {
    .^2
  } %>%
  mean() %>%
  sqrt()
## [1] 3.646917