R 常用技巧

Jying-Nan Wang

2016-10-05

關係式表達 (1):單一變數

# 布林值比較
TRUE==FALSE
## [1] FALSE
# 數值比較 (不等於)
-6 * 4 != 16-101
## [1] TRUE
# 比較字串
"useR"=="user"
## [1] FALSE
# 布林值與數值
TRUE==1
## [1] TRUE
FALSE==0
## [1] TRUE

關係式表達 (2) : 向量與矩陣

# 建立兩個向量 linkedin and facebook,表示兩種社交媒體中之人氣
linkedin <- c(16, 9, 13, 5, 2, 17, 14)
facebook <- c(17, 7, 5, 16, 8, 13, 14)

# 查看linkedin的人氣有沒有超過15
linkedin > 15
## [1]  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE
# 查看linkedin的人氣有沒有低於5
linkedin <= 5
## [1] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE
# Linkedin是否比Facebook更有人氣
linkedin > facebook
## [1] FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE
# 合併為一個矩陣
views <- matrix(c(linkedin, facebook), nrow = 2, byrow = TRUE)

# 檢查矩陣的內容
views <= 14
##       [,1] [,2] [,3]  [,4] [,5]  [,6] [,7]
## [1,] FALSE TRUE TRUE  TRUE TRUE FALSE TRUE
## [2,] FALSE TRUE TRUE FALSE TRUE  TRUE TRUE

關係式表達 (3) : and (&) 和 or (|)

TRUE & TRUE
## [1] TRUE
FALSE | TRUE
## [1] TRUE
5 <= 5 & 2 < 3
## [1] TRUE
3 < 4 | 7 < 6
## [1] TRUE

關係式表達 (4) : and (&) 和 or (|)

# linkedin exceeds 10 but facebook below 10
linkedin >10 & facebook <10
## [1] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
# When were one or both visited at least 12 times?
linkedin>=12 | facebook >=12
## [1]  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE
# When is views between 11 (exclusive) and 14 (inclusive)?
views >11 & views<=14
##       [,1]  [,2]  [,3]  [,4]  [,5]  [,6] [,7]
## [1,] FALSE FALSE  TRUE FALSE FALSE FALSE TRUE
## [2,] FALSE FALSE FALSE FALSE FALSE  TRUE TRUE

If Structures

If…Else Structures 是撰寫程式必備的技巧之一,可以控制在不同的情況下讓電腦執行不同的程式

if (condition) {
  expr
}

範例:

x <- 10
if(x > 8)  # 如果 x > 8 則執行 {} 內的程式
{
 print("x is larger than 8")
}
## [1] "x is larger than 8"

If…Else Structures

if…else架構如下:

if (condition1) {
  expr1
} else if (condition2) {
  expr2
} else if (condition3) {
  expr3
} else {
  expr4
}

範例:

x <- 10
if(x > 8) { 
# 如果 x > 8,執行以下程式
 print("x is larger than 8")
} else if(x > 6) { 
# 如果 x > 6 且 x <= 8,執行以下程式
   print("x is larger than 6, but not larger than 8")
} else { 
# 其它所有情況下執行以下程式
  print("x is not larger than 6")
}
## [1] "x is larger than 8"

範例:

number <- 4
if (number < 10) {
  if (number < 5) {
    result <- "extra small"
  } else {
    result <- "small"
  }
} else if (number < 100) {
  result <- "medium"
} else {
  result <- "large"
}
#print(result)

while loop (1)

The while loop can be used to repeat a set of instructions, and it is often used when you do not know in advance how often the instructions will be executed.

while (condition) {
  expr
}

範例:

# 設定 speed的教值
speed <- 100

# Code the while loop
while (speed>30) {
  print("慢一點!")
  speed <- speed-18
}
## [1] "慢一點!"
## [1] "慢一點!"
## [1] "慢一點!"
## [1] "慢一點!"
print(speed)
## [1] 28

while loop (2): 结合if structure

# 設定 speed的教值
speed <- 100

while (speed > 30) {
  print(paste("現在速度是",speed))
  if (speed>48) {
    print("剎車踩大力一點!")
    speed <- speed -15
  } else {
    print("小力踩剎車!")
    speed <- speed -6
  }
}
## [1] "現在速度是 100"
## [1] "剎車踩大力一點!"
## [1] "現在速度是 85"
## [1] "剎車踩大力一點!"
## [1] "現在速度是 70"
## [1] "剎車踩大力一點!"
## [1] "現在速度是 55"
## [1] "剎車踩大力一點!"
## [1] "現在速度是 40"
## [1] "小力踩剎車!"
## [1] "現在速度是 34"
## [1] "小力踩剎車!"

while loop (3): break

The break statement is a control statement. When R encounters it, the while loop is abandoned completely.

範例:

# 設定 speed的教值
speed <- 120

while (speed > 30) {
  print(paste("你的速度是", speed))
  # 當速度超過100時離開loop
  if (speed>100) {
    print("剎不住了!!")
    break
  }
  if (speed>48) {
    print("剎車踩大力一點!")
    speed <- speed -15
  } else {
    print("小力踩剎車!")
    speed <- speed -6
  }
}
## [1] "你的速度是 120"
## [1] "剎不住了!!"

範例:猜猜看結果是什麼?

i <- 1
while (i <= 30) {
  print(i)
  if (i%%5==0) {
    break
  }
  i <- i*2 + 1
}

for Loops (1)

The for loop can be used to repeat a set of instructions, and it is used when you know in advance the values that the loop variable will have each time it goes through the loop.

範例:讀取vector

# 向量 linkedin 
linkedin <- c(16, 29, 16, 5, 2)

# Loop version 1
for (i in linkedin){
    print(i)
}
## [1] 16
## [1] 29
## [1] 16
## [1] 5
## [1] 2
# Loop version 2
for (i in 1:length(linkedin)){
    print(linkedin[i])
}
## [1] 16
## [1] 29
## [1] 16
## [1] 5
## [1] 2

範例:讀取list

primes_list <- list(2, 3, 5, 7, 11, 13)

# loop version 1
for (p in primes_list) {
  print(p)
}
## [1] 2
## [1] 3
## [1] 5
## [1] 7
## [1] 11
## [1] 13
# loop version 2
for (i in 1:length(primes_list)) {
  print(primes_list[[i]])
}
## [1] 2
## [1] 3
## [1] 5
## [1] 7
## [1] 11
## [1] 13

for Loops (2): 兩層for loop

x1 <- matrix(c("A","B","C","D","E","F"),nrow=2,byrow = TRUE)
x1
##      [,1] [,2] [,3]
## [1,] "A"  "B"  "C" 
## [2,] "D"  "E"  "F"
for (i in 1:nrow(x1)){
  for (j in 1:ncol(x1)){
    print(paste("第 ",i," 列和第 ",j," 行是 ",x1[i,j],sep=""))
  }
}
## [1] "第 1 列和第 1 行是 A"
## [1] "第 1 列和第 2 行是 B"
## [1] "第 1 列和第 3 行是 C"
## [1] "第 2 列和第 1 行是 D"
## [1] "第 2 列和第 2 行是 E"
## [1] "第 2 列和第 3 行是 F"

for Loops (3): break

# The linkedin vector has already been defined for you
linkedin <- c(16, 9, 13, 5, 2, 17, 14)

# Extend the for loop
for (li in linkedin) {
  if (li > 10) {
    print("You're popular!")
  } else {
    print("Be more visible!")
  }
  
  # Add if statement with break
  if (li>16) {
      print("This is ridiculous, I'm outta here!")
      break
  }
  
  # Add if statement with next
  if (li<5){
      print("This is too embarrassing!")
      next
  }
  
  print(li)
}
## [1] "You're popular!"
## [1] 16
## [1] "Be more visible!"
## [1] 9
## [1] "You're popular!"
## [1] 13
## [1] "Be more visible!"
## [1] 5
## [1] "Be more visible!"
## [1] "This is too embarrassing!"
## [1] "You're popular!"
## [1] "This is ridiculous, I'm outta here!"

for Loops (4): 字串處理

# Pre-defined variables
rquote <- "r's internals are irrefutably intriguing"
chars <- strsplit(rquote, split = "")[[1]]

# Initialize rcount
rcount <- 0

# Finish the for loop
for (char in chars) {
  if (char=="r"){
      rcount <- rcount+1
  }
  if(char=="u"){
      break
  }
}

# Print out rcount
print(rcount)
## [1] 5

計算從 1+2+3+…+100 的值

 x=0
 for (i in 1:100)
 {
 x <- x+i
 }
 show(x)
## [1] 5050

Question

Answer

 x <- 0
 for (i in seq(1,101,by=2))
 {
 x <- x+i
 }
 show(x)
## [1] 2601
 x <- 0
 for (i in seq(1,101,by=2))
 {
  if (i%%2==1) {x <- x+i}
 }
 show(x)
## [1] 2601

Questions: Central limit theorem (C.L.T)

Answers: Code

data <- c(1,1,1,5,5,8,9,13,15,15,15,15)
result <- mean(sample(data, n, replace=T))

for (i in 1:1000){
  x <- sample(data, n, replace=T)
  result <- c(result,mean(x))
}

hist(result, col = "gray")
#qqnorm(result)

Answers: n=1

Answers: n=3

Answers: n=50

Functions (1)

查詢函數功能

help(mean)
?mean

查詢函數參數

args(mean)
## function (x, ...) 
## NULL
args(sample)
## function (x, size, replace = FALSE, prob = NULL) 
## NULL

Functions (2): 使用函數

# 設定 linkedin 和 facebook
linkedin <- c(16, 9, 13, 5, 2, 17, 14)
facebook <- c(17, 7, 5, 16, 8, 13, 14)

# 計算其平均數
mean_li <- mean(linkedin)
mean_fb <- mean(facebook)

# 列印結果
print(mean_li)
## [1] 10.85714
print(mean_fb)
## [1] 11.42857

Functions (3): 使用函數

# 設定 linkedin 
linkedin <- c(16, NA, 13, 5, 2, 17, 14)
# 計算其平均數
mean(linkedin)
## [1] NA
#移除NA值
mean(linkedin,na.rm=TRUE)
## [1] 11.16667

Functions (4): Write your own function

若處理資料的過程中,有重覆或複雜性的工作,我們可以自己撰寫函數來幫助我們完成任務。 函數 (function)的架構如下:

my_fun <- function(arg1, arg2) {
  body
}

範例:

# 創造一個函數,命名為 hello()
hello <- function(){
    print("您好!")
    return(TRUE)
}

# 測試一下
hello()
## [1] "您好!"
## [1] TRUE
# 創造一個函數,命名為 pow_two()
pow_two <- function(a){
    pow_two <- a^2
}
# 測試一下
pow_two(12)

# 創造一個函數,命名為 sum_abs()
sum_abs <- function(x,y){
    abs(x)+abs(y)
}
# 測試一下
sum_abs(-2, 3)
## [1] 5

(註) 函數中,回傳值的方式有很多種:

Functions (5): Write your own function

函數參數可以設定其 deafult value,其架構如下:

my_fun <- function(arg1, arg2 = val2) {
  body
}

範例

pow_two <- function(x, print_info=TRUE) {
  y <- x ^ 2
  if (print_info){
  print(paste(x, "to the power two equals", y))
  }
  return(y)
}
pow_two(10)
## [1] "10 to the power two equals 100"
## [1] 100
pow_two(10, FALSE)
## [1] 100

Probability Distributions (機率分配)

probability density function

\[ p(x) <- \frac{1}{\sigma \sqrt{2 \pi}} e^{-\frac{(x-\mu)^2}{2 \sigma^2}} \]

dnorm(0)
## [1] 0.3989423
dnorm(0)*sqrt(2*pi)
## [1] 1
dnorm(0,mean=4,sd=10)
## [1] 0.03682701
v <- c(0,1,2)
dnorm(v)
## [1] 0.39894228 0.24197072 0.05399097

Plot N(0,1)

x <- seq(-20,20,by=.1)
y <- dnorm(x)
plot(x,y)

Plot N(1,3)

x <- seq(-20,20,by=.1)
y <- dnorm(x,mean=1,sd=3)
plot(x,y,type="l")

Cumulative density function (CDF)

pnorm(0)
## [1] 0.5
pnorm(1)
## [1] 0.8413447
pnorm(0,mean=2)
## [1] 0.02275013
pnorm(0,mean=2,sd=3)
## [1] 0.2524925

Plot CDF of Normal Distribution

x <- seq(-5,5,by=.05)
y <- pnorm(x)
plot(x,y,type="l")

Inverse CDF (quantiles)

qnorm(0.5)
## [1] 0
qnorm(0.5,mean=1)
## [1] 1
qnorm(0.5,mean=1,sd=2)
## [1] 1
qnorm(0.25,mean=2,sd=2)
## [1] 0.6510205

Randomly generated numbers (Normal)

rnorm(4)
## [1]  0.44268722  0.64946309 -0.04129442 -0.04362132
rnorm(4,mean=3,sd=3)
## [1]  2.949999  3.159914 -1.796055  2.050207

Randomly generated numbers (Normal)

y <- rnorm(500)
hist(y)

Question: Monte Carlo Simulation

Answer

# 圓面積的估計
n <- 5000
x <- runif(n, min = -1, max = 1)
y <- runif(n, min = -1, max = 1)
r <- x^2+y^2
length(r[r<1])/n*4
## [1] 3.1496

apply 函數家族 (1): lapply()

lapply returns a list of the same length as X, each element of which is the result of applying FUN to the corresponding element of X.

lapply(X, FUN, ...)
# ...   optional arguments to FUN.

範例:

x <- list(a = 1:10, beta = exp(-3:3), logic = c(TRUE,FALSE,FALSE,TRUE))
# compute the list mean for each list element
lapply(x, mean)
## $a
## [1] 5.5
## 
## $beta
## [1] 4.535125
## 
## $logic
## [1] 0.5
# median and quartiles for each list element
lapply(x, quantile, probs = 1:3/4)
## $a
##  25%  50%  75% 
## 3.25 5.50 7.75 
## 
## $beta
##       25%       50%       75% 
## 0.2516074 1.0000000 5.0536690 
## 
## $logic
## 25% 50% 75% 
## 0.0 0.5 1.0

apply 函數家族 (2): lapply()

# Write function select_second()
select_second <- function(x) {
  x[2]
}
x <- list(a = 1:10, beta = exp(-3:3), logic = c(TRUE,FALSE,FALSE,TRUE))
# 使用 select_second() 取得需要的資料
lapply(x, select_second)
## $a
## [1] 2
## 
## $beta
## [1] 0.1353353
## 
## $logic
## [1] FALSE

apply 函數家族 (3): lapply()

# Generic select function
select_el <- function(x, index) {
  x[index]
}
x <- list(a = 1:10, beta = exp(-3:3), logic = c(TRUE,FALSE,FALSE,TRUE))
# 使用 select_second() 取得需要的資料
lapply(x, select_el, index=2)
## $a
## [1] 2
## 
## $beta
## [1] 0.1353353
## 
## $logic
## [1] FALSE

apply 函數家族 (4): sapply()

You can use sapply() similar to how you used lapply().

sapply(X, FUN, ...)

範例:See how lapply() and sapply() differ

temp <- list(a = 1:10, beta = exp(-3:3), logic = c(TRUE,FALSE,FALSE,TRUE))

# Use lapply() to find each factor's minimum
lapply(temp, min)
## $a
## [1] 1
## 
## $beta
## [1] 0.04978707
## 
## $logic
## [1] 0
# Use sapply() to find each factor's minimum
sapply(temp,min)
##          a       beta      logic 
## 1.00000000 0.04978707 0.00000000
# Use lapply() to find each factor's maximum 
lapply(temp, max)
## $a
## [1] 10
## 
## $beta
## [1] 20.08554
## 
## $logic
## [1] 1
# Use sapply() to find each factor's maximum
sapply(temp, max)
##        a     beta    logic 
## 10.00000 20.08554  1.00000

apply 函數家族 (5)

extremes <- function(x) {
  c(min = min(x), max = max(x))
}

temp <- list(a = 1:10, beta = exp(-3:3), logic = c(TRUE,FALSE,FALSE,TRUE))
# Apply extremes() over temp with sapply()
sapply(temp, extremes)
##      a        beta logic
## min  1  0.04978707     0
## max 10 20.08553692     1
# Apply extremes() over temp with lapply()
lapply(temp, extremes)
## $a
## min max 
##   1  10 
## 
## $beta
##         min         max 
##  0.04978707 20.08553692 
## 
## $logic
## min max 
##   0   1

其它常用技巧(1): 數值函數

範例:

errors <- c(1.9, -2.6, 4.0, -9.5, -3.4, 7.3)
sum(abs(round(errors)))
## [1] 29

其它常用技巧(2): 資料處理函數

範例:

x <- seq(1,20,by=2)
y <- append(x,rep(c(4,6),2))
sort(y)
##  [1]  1  3  4  4  5  6  6  7  9 11 13 15 17 19
sort(y,decreasing = TRUE)
##  [1] 19 17 15 13 11  9  7  6  6  5  4  4  3  1
list_y<-as.list(y)
str(list_y)
## List of 14
##  $ : num 1
##  $ : num 3
##  $ : num 5
##  $ : num 7
##  $ : num 9
##  $ : num 11
##  $ : num 13
##  $ : num 15
##  $ : num 17
##  $ : num 19
##  $ : num 4
##  $ : num 6
##  $ : num 4
##  $ : num 6
vector_y <- unlist(y)
str(vector_y)
##  num [1:14] 1 3 5 7 9 11 13 15 17 19 ...

其它常用技巧(3): Regular Expressions

其它常用技巧(4): Regular Expressions

範例

emails <- c("john.doe@ivyleague.edu", "education@world.gov", "dalai.lama@peace.org",
            "invalid.edu", "quant@bigdatacollege.edu", "cookie.monster@sesame.tv")
# Use grepl() to match for "edu"
grepl("edu",emails)
## [1]  TRUE  TRUE FALSE  TRUE  TRUE FALSE
# Use grep() to match for "edu", save result to hits
hits <- grep("edu",emails)
# Subset emails using hits
emails[hits]
## [1] "john.doe@ivyleague.edu"   "education@world.gov"     
## [3] "invalid.edu"              "quant@bigdatacollege.edu"

其它常用技巧(5): Regular Expressions

範例

emails <- c("john.doe@ivyleague.edu", "education@world.gov", "dalai.lama@peace.org",
            "invalid.edu", "quant@bigdatacollege.edu", "cookie.monster@sesame.tv")
# Use grepl() to match for .edu addresses more robustly
grepl("@.*\\.edu$",emails)
## [1]  TRUE FALSE FALSE FALSE  TRUE FALSE
# Use grep() to match for .edu addresses more robustly, save result to hits
hits <- grep("@.*\\.edu$",emails)

# Subset emails using hits
emails[hits]
## [1] "john.doe@ivyleague.edu"   "quant@bigdatacollege.edu"

說明:

其它常用技巧(6): Regular Expressions

emails <- c("john.doe@ivyleague.edu", "education@world.gov", "dalai.lama@peace.org", 
            "invalid.edu", "quant@bigdatacollege.edu", "cookie.monster@sesame.tv")

# Use sub() to convert the email domains to test.edu
sub("@.*\\.edu$","@test.edu",emails)
## [1] "john.doe@test.edu"        "education@world.gov"     
## [3] "dalai.lama@peace.org"     "invalid.edu"             
## [5] "quant@test.edu"           "cookie.monster@sesame.tv"

Question

若取得以下電影review字串資料 (from IMDB),我們想知道覺得該評論有用的比率是多少?

mydata <- c("3061 out of 4024 people found the following review useful",
            "1061 out of 2424 people found the following review useful",
            "61 out of 124 people found the following review useful",
            "9 out of 24 people found the following review useful")

Answer

mydata <- c("3061 out of 4024 people found the following review useful",
            "1061 out of 2424 people found the following review useful",
            "61 out of 124 people found the following review useful",
            "9 out of 24 people found the following review useful")
x1 <- as.numeric(sub(" out.*useful","",mydata))
y1 <- sub(" people found the following review useful","",mydata)
y1 <- as.numeric(sub(".*of ","",y1))
rating <- x1/y1
print(rating)
## [1] 0.7606859 0.4377063 0.4919355 0.3750000

其它常用技巧(7): 日期與時間

# Get the current date: today
today <- Sys.Date()

# See what today looks like under the hood
unclass(today)
## [1] 17079
# Get the current time: now
now <- Sys.time()

# See what now looks like under the hood
unclass(now)
## [1] 1475605132

其它常用技巧(8): 日期與時間

To create a Date object from a simple character string in R, you can use the as.Date() function.

# Definition of character strings representing dates
str1 <- "2012-03-15"
str2 <- "30/5/2016"

# Convert the strings to dates: date1, date2
date1 <- as.Date(str1, format="%Y-%m-%d")
date2 <- as.Date(str2, format="%d/%m/%Y")

# Convert dates to formatted strings
format(date1, "%A")
## [1] "周四"
format(date2, "%d")
## [1] "30"
format(date2, "%b %Y")
## [1] " 5 2016"

其它常用技巧(9): 日期與時間

# a useful package for creating a Date object
library(lubridate)
# Definition of character strings representing dates
str1 <- "2012-03-15"
str2 <- "30/5/2016"

ymd(str1)
## [1] "2012-03-15"
dmy(str2)
## [1] "2016-05-30"