what is R? - https://zh.wikipedia.org/wiki/R%E8%AF%AD%E8%A8%80

R vs Python? - https://www.datacamp.com/community/tutorials/r-or-python-for-data-analysis - https://www.kdnuggets.com/2019/05/poll-top-data-science-machine-learning-platforms.html

Kaggle - https://www.kaggle.com/

UCI dataset - https://archive.ics.uci.edu/ml/datasets.html

電子檔講義 - https://www.dropbox.com/s/bhqmpndutktw401/201806%20-%20R%E8%AA%9E%E8%A8%80.pdf?dl=0

R intro

RDemo

# Load the built-in example dataset
data("anscombe")
# Draw a scatter plot of y1 against x1 from that dataset
plot(y1 ~ x1, data = anscombe)
# Fit a linear regression model and store it in lmfit
lmfit <- lm(y1 ~ x1, data = anscombe)
# Overlay the fitted regression line on the scatter plot
abline(lmfit, col = "red")

Basic type

  • numeric: 1,2,1.2
  • integer: 1L,2L,3L
  • character: "string"
  • logical: TRUE,FALSE,T,F
  • complex: 1+4i
  • Date: "2018-05-01"
  • POSIXct, POSIXlt: "2018-05-01 08:00:00 CST"

Basic Objects

(由相同資料型態組成)
atomic:
  • vector
  • matrix
  • factor
(可以有混合的資料型態)
recursive:
  • data.frame
  • list

R basic command

# Look up documentation: a whole package, a namespaced topic, a bare topic,
# and a search across the installed help pages
help(package="base")
?base::sum
?sum
help.search("sum")
??sum

# List the available demonstrations (no topic is given here)
demo()
# List the built-in datasets
data()
# List the variables currently defined in the workspace
ls()
## [1] "anscombe" "lmfit"
# Remove variables (called with no arguments, as here, nothing is removed)
rm()

x <- c(1, 2, 3)
# Inspect the class of the object
class(x)
## [1] "numeric"
# Inspect the internal structure of the object
str(x)
##  num [1:3] 1 2 3

Basic computing

# Arithmetic operators
3 + 8
## [1] 11
3 - 8
## [1] -5
3 * 8
## [1] 24
3 / 8
## [1] 0.375
3^2
## [1] 9
# Remainder of a division
11 %% 2
## [1] 1
# Comparisons return logical values
3 < 4
## [1] TRUE
2 == 5
## [1] FALSE
# T is a predefined alias for TRUE
T == TRUE
## [1] TRUE

Assignment

# Three equivalent ways to bind the value 3 to the name a
a = 3
a <- 3 # keyboard shortcut: Alt + -
assign("a", 3)

# Evaluating an expression does not modify a ...
a / 2
## [1] 1.5
# ... the result has to be assigned back to update it
a <- a / 2
a
## [1] 1.5

Vector

  • R語言最基本的物件
# Typed constructors create a vector of the requested length filled with
# the default value of that type
character(5)  ## character vector of length 5
## [1] "" "" "" "" ""
numeric(5)
## [1] 0 0 0 0 0
logical(5)
## [1] FALSE FALSE FALSE FALSE FALSE

# Arithmetic between vectors of equal length is element-wise
x <- c(1, 2, 3, 7)
y <- c(2, 3, 5, 1)
x + y
## [1] 3 5 8 8
x * y
## [1]  2  6 15  7
x - y
## [1] -1 -1 -2  6
x / y
## [1] 0.5000000 0.6666667 0.6000000 7.0000000

# A shorter operand is recycled to the length of the longer one
x <- c(1, 2, 3, 7)
x + 10
## [1] 11 12 13 17
x + c(10)
## [1] 11 12 13 17
x + c(1, 2)
## [1] 2 4 4 9
x + c(1, 2, 1, 2)
## [1] 2 4 4 9
x == c(1, 99, 3, 4)
## [1]  TRUE FALSE  TRUE FALSE

# c() combines values; mixed types are coerced to one common type
c(1, 2, 3)
## [1] 1 2 3
c(2, TRUE, 3+0i, "one")
## [1] "2"    "TRUE" "3+0i" "one"
c(2, TRUE, 3+0i)
## [1] 2+0i 1+0i 3+0i
c(c(1, 2, 3, 4), c(5))
## [1] 1 2 3 4 5

# NA propagates through sum() unless it is removed explicitly.
# Spell out TRUE: T is an ordinary variable that can be reassigned.
x <- c(1, 2, 3, 4, NA)
sum(x)
## [1] NA
sum(x, na.rm = TRUE)
## [1] 10
# The same result by filtering out the missing values by hand
x <- c(1, 2, 3, 4, NA)
is.na(x)
## [1] FALSE FALSE FALSE FALSE  TRUE
sum(x[!is.na(x)])
## [1] 10
# A plain numeric vector of heights
height_vec <- c(180, 169, 173)
height_vec
## [1] 180 169 173
# Attach a name to every element
names(height_vec) <- c("Brian", "Toby", "Sherry")
height_vec
##  Brian   Toby Sherry 
##    180    169    173
# The names can also come from another vector
name_vec <- c("Brian", "Toby", "Sherry")
names(height_vec) <- name_vec
# Comparisons and arithmetic keep the element names
height_vec > 175
##  Brian   Toby Sherry 
##   TRUE  FALSE  FALSE
height_vec / 100
##  Brian   Toby Sherry 
##   1.80   1.69   1.73
# Element-wise OR / AND of two conditions
height_vec > 175 | height_vec < 170
##  Brian   Toby Sherry 
##   TRUE   TRUE  FALSE
height_vec < 175 & height_vec > 170
##  Brian   Toby Sherry 
##  FALSE  FALSE   TRUE
# Indexing in R starts at 1
height_vec[c(1)] # by position
## Brian 
##   180
height_vec["Brian"] # by element name
## Brian 
##   180
height_vec[height_vec > 175] # by condition (logical vector)
## Brian 
##   180

p38 example

# Heights (cm) and weights (kg) of three people
h <- c(180, 169, 173)
w <- c(73, 87, 43)
# BMI = weight / height^2, with height converted from cm to m
bmi <- w / (h / 100)^2
names(bmi) <- c("Brian", "Toby", "Sherry")
# Flag everyone whose BMI is below 18.5 or at least 24
bmi < 18.5 | bmi >= 24
##  Brian   Toby Sherry 
##  FALSE   TRUE   TRUE
# Keep only the flagged values
bmi[bmi < 18.5 | bmi >= 24]
##     Toby   Sherry 
## 30.46112 14.36734

seq() & rep() & paste()

# Integer sequences: the colon operator and seq()
1:20
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
seq(1, 20)
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
20:1
##  [1] 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1
?seq
# Fixed step size
seq(from = 1, to = 20, by = 2)
##  [1]  1  3  5  7  9 11 13 15 17 19
# Fixed number of points; the argument is spelled out in full as length.out
# rather than relying on partial matching of "length"
seq(from = 1, to = 20, length.out = 2)
## [1]  1 20

# Repeating values with rep()
rep(1, 5)
## [1] 1 1 1 1 1
?rep
# Repeat the whole vector
rep(x = c(1, 2), times = 5)
##  [1] 1 2 1 2 1 2 1 2 1 2
# Repeat each element its own number of times
rep(x = c(1, 2), times = c(1, 2))
## [1] 1 2 2
# Repeat every element before moving on to the next one
rep(x = c(1, 2), each = 5)
##  [1] 1 1 1 1 1 2 2 2 2 2
# Repeat until the result has the requested length
rep(x = c(1, 2), length.out = 5)
## [1] 1 2 1 2 1
rep_len(x = c(1, 2), length.out = 5)
## [1] 1 2 1 2 1

# Joining strings with paste(); sep goes between the arguments
paste("the", "big", "bang", "theory")
## [1] "the big bang theory"
paste("big", "bang", sep = "-")
## [1] "big-bang"
length(paste("the", "big", "bang", "theory"))
## [1] 1
paste("big", "bang", sep = "")
## [1] "bigbang"
paste("big", "bang", sep = ";")
## [1] "big;bang"
# Vector arguments are pasted element by element
paste(c("big", "bang"), 1:2)
## [1] "big 1"  "bang 2"
# collapse then joins the resulting elements into a single string
paste(c("big", "bang"), 1:2, collapse = "+")
## [1] "big 1+bang 2"
length(paste(c("big", "bang"), 1:4, collapse = "+"))
## [1] 1

Matrix

# Fill the matrix row by row
matrix(1:9, nrow = 3, byrow = TRUE)
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
## [3,]    7    8    9
# Default behaviour: fill column by column
matrix(1:9, nrow = 3)
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
# Two exam scores for each of three students
kevin <- c(85, 73)
marry <- c(72, 64)
jerry <- c(59, 66)
# One row per student, one column per exam
mat <- matrix(c(kevin, marry, jerry), nrow = 3, byrow = TRUE)
colnames(mat) <- c("first", "second")
rownames(mat) <- c("kevin", "marry", "jerry")
mat
##       first second
## kevin    85     73
## marry    72     64
## jerry    59     66
# Dimensions of the matrix
dim(mat)
## [1] 3 2
# Number of rows
nrow(mat)
## [1] 3
# Number of columns
ncol(mat)
## [1] 2
# Transpose
t(mat)
##        kevin marry jerry
## first     85    72    59
## second    73    64    66
# First row
mat[1, ]
##  first second 
##     85     73
# First column
mat[, 1]
## kevin marry jerry 
##    85    72    59
# First and second rows
mat[1:2, ]
##       first second
## kevin    85     73
## marry    72     64
# Scores of kevin and jerry
mat[c("kevin", "jerry"), ]
##       first second
## kevin    85     73
## jerry    59     66
# First-exam scores of kevin and jerry
mat[c("kevin", "jerry"), "first"]
## kevin jerry 
##    85    59
# Who failed the first exam. drop = FALSE keeps the result a matrix, so the
# student's name is retained even when only one row matches.
mat[mat[, 1] < 60, "first", drop = FALSE]
##       first
## jerry    59

Matrix(續)

# Append a row and a column; naming the argument labels the new row/column
mat2 <- rbind(mat, sam = c(78, 63))
mat2
##       first second
## kevin    85     73
## marry    72     64
## jerry    59     66
## sam      78     63
mat3 <- cbind(mat2, third = c(82, 77, 70, 64))
mat3
##       first second third
## kevin    85     73    82
## marry    72     64    77
## jerry    59     66    70
## sam      78     63    64
# Mean score per student
rowMeans(mat3)
##    kevin    marry    jerry      sam 
## 80.00000 71.00000 65.00000 68.33333
# Mean score per exam
colMeans(mat3)
##  first second  third 
##  73.50  66.50  73.25
# Matrix arithmetic on two 2x2 matrices filled row by row
m1 <- matrix(1:4, nrow = 2, byrow = TRUE)
m2 <- matrix(5:8, nrow = 2, byrow = TRUE)

# +, -, * and / all work element by element
m1 + m2
##      [,1] [,2]
## [1,]    6    8
## [2,]   10   12
m1 - m2
##      [,1] [,2]
## [1,]   -4   -4
## [2,]   -4   -4
m1 * m2
##      [,1] [,2]
## [1,]    5   12
## [2,]   21   32
m1 / m2
##           [,1]      [,2]
## [1,] 0.2000000 0.3333333
## [2,] 0.4285714 0.5000000
# %*% is the matrix product
m1 %*% m2
##      [,1] [,2]
## [1,]   19   22
## [2,]   43   50

Factor

# Basic syntax: turn a character vector into a factor
weather <- c("sunny", "rainy", "cloudy", "rainy", "cloudy")
weather_category <- factor(weather)
weather_category
## [1] sunny  rainy  cloudy rainy  cloudy
## Levels: cloudy rainy sunny
class(weather)
## [1] "character"
class(weather_category)
## [1] "factor"
levels(weather_category)
## [1] "cloudy" "rainy"  "sunny"
# Ordered factor: levels are listed from lowest to highest. The argument is
# spelled out in full as ordered rather than relying on partial matching.
temperature <- c("Low", "High", "High", "Medium", "Low", "Medium")
temperature_category <- factor(
  temperature,
  ordered = TRUE,
  levels = c("Low", "Medium", "High")
)
temperature_category
## [1] Low    High   High   Medium Low    Medium
## Levels: Low < Medium < High
# Elements of an ordered factor can be compared
temperature_category[3] > temperature_category[1]
## [1] TRUE
temperature_category[4] > temperature_category[3]
## [1] FALSE
# Rename the levels; the new names replace the existing levels in their
# current order, here "c", "r", "s"
weather <- c("s", "r", "c", "r", "c")
weather_factor <- factor(weather)
levels(weather_factor) <- c("cloudy", "rainy", "sunny")
weather_factor
## [1] sunny  rainy  cloudy rainy  cloudy
## Levels: cloudy rainy sunny
# Coercion priority inside c(): character > complex > numeric > logical
c("string", 1+2i, 5.5, TRUE)
## [1] "string" "1+2i"   "5.5"    "TRUE"
c(1+2i, 5.5, TRUE)
## [1] 1.0+2i 5.5+0i 1.0+0i
c(5.5, TRUE, FALSE)
## [1] 5.5 1.0 0.0