Jying-Nan Wang
2016-10-05
# 布林值比較
TRUE==FALSE## [1] FALSE
# 數值比較 (不等於)
-6 * 4 != 16-101## [1] TRUE
# 比較字串
"useR"=="user"## [1] FALSE
# 布林值與數值
TRUE==1## [1] TRUE
FALSE==0## [1] TRUE
# 建立兩個向量 linkedin and facebook,表示兩種社交媒體中之人氣
linkedin <- c(16, 9, 13, 5, 2, 17, 14)
facebook <- c(17, 7, 5, 16, 8, 13, 14)
# 查看linkedin的人氣有沒有超過15
linkedin > 15## [1] TRUE FALSE FALSE FALSE FALSE TRUE FALSE
# 查看linkedin的人氣有沒有低於5
linkedin <= 5## [1] FALSE FALSE FALSE TRUE TRUE FALSE FALSE
# Linkedin是否比Facebook更有人氣
linkedin > facebook## [1] FALSE TRUE TRUE FALSE FALSE TRUE FALSE
# 合併為一個矩陣
views <- matrix(c(linkedin, facebook), nrow = 2, byrow = TRUE)
# 檢查矩陣的內容
views <= 14## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] FALSE TRUE TRUE TRUE TRUE FALSE TRUE
## [2,] FALSE TRUE TRUE FALSE TRUE TRUE TRUE
TRUE & TRUE## [1] TRUE
FALSE | TRUE## [1] TRUE
5 <= 5 & 2 < 3## [1] TRUE
3 < 4 | 7 < 6## [1] TRUE
# linkedin exceeds 10 but facebook below 10
linkedin >10 & facebook <10## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE
# When were one or both visited at least 12 times?
linkedin>=12 | facebook >=12## [1] TRUE FALSE TRUE TRUE FALSE TRUE TRUE
# When is views between 11 (exclusive) and 14 (inclusive)?
views >11 & views<=14## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] FALSE FALSE TRUE FALSE FALSE FALSE TRUE
## [2,] FALSE FALSE FALSE FALSE FALSE TRUE TRUE
If…Else Structures 是撰寫程式必備的技巧之一,可以控制在不同的情況下讓電腦執行不同的程式
if (condition) {
expr
}範例:
x <- 10
if(x > 8) # 如果 x > 8 則執行 {} 內的程式
{
print("x is larger than 8")
}## [1] "x is larger than 8"
if…else架構如下:
if (condition1) {
expr1
} else if (condition2) {
expr2
} else if (condition3) {
expr3
} else {
expr4
}範例:
x <- 10
if(x > 8) {
# 如果 x > 8,執行以下程式
print("x is larger than 8")
} else if(x > 6) {
# 如果 x > 6 且 x <= 8,執行以下程式
print("x is larger than 6, but not larger than 8")
} else {
# 其它所有情況下執行以下程式
print("x is not larger than 6")
}## [1] "x is larger than 8"
範例:
number <- 4
if (number < 10) {
if (number < 5) {
result <- "extra small"
} else {
result <- "small"
}
} else if (number < 100) {
result <- "medium"
} else {
result <- "large"
}
#print(result)The while loop can be used to repeat a set of instructions, and it is often used when you do not know in advance how often the instructions will be executed.
while (condition) {
expr
}範例:
# 設定 speed的教值
speed <- 100
# Code the while loop
while (speed>30) {
print("慢一點!")
speed <- speed-18
}## [1] "慢一點!"
## [1] "慢一點!"
## [1] "慢一點!"
## [1] "慢一點!"
print(speed)## [1] 28
# 設定 speed的教值
speed <- 100
while (speed > 30) {
print(paste("現在速度是",speed))
if (speed>48) {
print("剎車踩大力一點!")
speed <- speed -15
} else {
print("小力踩剎車!")
speed <- speed -6
}
}## [1] "現在速度是 100"
## [1] "剎車踩大力一點!"
## [1] "現在速度是 85"
## [1] "剎車踩大力一點!"
## [1] "現在速度是 70"
## [1] "剎車踩大力一點!"
## [1] "現在速度是 55"
## [1] "剎車踩大力一點!"
## [1] "現在速度是 40"
## [1] "小力踩剎車!"
## [1] "現在速度是 34"
## [1] "小力踩剎車!"
The break statement is a control statement. When R encounters it, the while loop is abandoned completely.
範例:
# 設定 speed的教值
speed <- 120
while (speed > 30) {
print(paste("你的速度是", speed))
# 當速度超過100時離開loop
if (speed>100) {
print("剎不住了!!")
break
}
if (speed>48) {
print("剎車踩大力一點!")
speed <- speed -15
} else {
print("小力踩剎車!")
speed <- speed -6
}
}## [1] "你的速度是 120"
## [1] "剎不住了!!"
範例:猜猜看結果是什麼?
i <- 1
while (i <= 30) {
print(i)
if (i%%5==0) {
break
}
i <- i*2 + 1
}The for loop can be used to repeat a set of instructions, and it is used when you know in advance the values that the loop variable will have each time it goes through the loop.
範例:讀取vector
# 向量 linkedin
linkedin <- c(16, 29, 16, 5, 2)
# Loop version 1
for (i in linkedin){
print(i)
}## [1] 16
## [1] 29
## [1] 16
## [1] 5
## [1] 2
# Loop version 2
for (i in 1:length(linkedin)){
print(linkedin[i])
}## [1] 16
## [1] 29
## [1] 16
## [1] 5
## [1] 2
範例:讀取list
primes_list <- list(2, 3, 5, 7, 11, 13)
# loop version 1
for (p in primes_list) {
print(p)
}## [1] 2
## [1] 3
## [1] 5
## [1] 7
## [1] 11
## [1] 13
# loop version 2
for (i in 1:length(primes_list)) {
print(primes_list[[i]])
}## [1] 2
## [1] 3
## [1] 5
## [1] 7
## [1] 11
## [1] 13
x1 <- matrix(c("A","B","C","D","E","F"),nrow=2,byrow = TRUE)
x1## [,1] [,2] [,3]
## [1,] "A" "B" "C"
## [2,] "D" "E" "F"
for (i in 1:nrow(x1)){
for (j in 1:ncol(x1)){
print(paste("第 ",i," 列和第 ",j," 行是 ",x1[i,j],sep=""))
}
}## [1] "第 1 列和第 1 行是 A"
## [1] "第 1 列和第 2 行是 B"
## [1] "第 1 列和第 3 行是 C"
## [1] "第 2 列和第 1 行是 D"
## [1] "第 2 列和第 2 行是 E"
## [1] "第 2 列和第 3 行是 F"
# The linkedin vector has already been defined for you
linkedin <- c(16, 9, 13, 5, 2, 17, 14)
# Extend the for loop
for (li in linkedin) {
if (li > 10) {
print("You're popular!")
} else {
print("Be more visible!")
}
# Add if statement with break
if (li>16) {
print("This is ridiculous, I'm outta here!")
break
}
# Add if statement with next
if (li<5){
print("This is too embarrassing!")
next
}
print(li)
}## [1] "You're popular!"
## [1] 16
## [1] "Be more visible!"
## [1] 9
## [1] "You're popular!"
## [1] 13
## [1] "Be more visible!"
## [1] 5
## [1] "Be more visible!"
## [1] "This is too embarrassing!"
## [1] "You're popular!"
## [1] "This is ridiculous, I'm outta here!"
# Pre-defined variables
rquote <- "r's internals are irrefutably intriguing"
chars <- strsplit(rquote, split = "")[[1]]
# Initialize rcount
rcount <- 0
# Finish the for loop
for (char in chars) {
if (char=="r"){
rcount <- rcount+1
}
if(char=="u"){
break
}
}
# Print out rcount
print(rcount)## [1] 5
x=0
for (i in 1:100)
{
x <- x+i
}
show(x)## [1] 5050
x <- 0
for (i in seq(1,101,by=2))
{
x <- x+i
}
show(x)## [1] 2601
x <- 0
for (i in seq(1,101,by=2))
{
if (i%%2==1) {x <- x+i}
}
show(x)## [1] 2601
data <- c(1,1,1,5,5,8,9,13,15,15,15,15)
result <- mean(sample(data, n, replace=T))
for (i in 1:1000){
x <- sample(data, n, replace=T)
result <- c(result,mean(x))
}
hist(result, col = "gray")
#qqnorm(result)查詢函數功能
help(mean)
?mean查詢函數參數
args(mean)## function (x, ...)
## NULL
args(sample)## function (x, size, replace = FALSE, prob = NULL)
## NULL
# 設定 linkedin 和 facebook
linkedin <- c(16, 9, 13, 5, 2, 17, 14)
facebook <- c(17, 7, 5, 16, 8, 13, 14)
# 計算其平均數
mean_li <- mean(linkedin)
mean_fb <- mean(facebook)
# 列印結果
print(mean_li)## [1] 10.85714
print(mean_fb)## [1] 11.42857
# 設定 linkedin
linkedin <- c(16, NA, 13, 5, 2, 17, 14)
# 計算其平均數
mean(linkedin)## [1] NA
#移除NA值
mean(linkedin,na.rm=TRUE)## [1] 11.16667
若處理資料的過程中,有重覆或複雜性的工作,我們可以自己撰寫函數來幫助我們完成任務。 函數 (function)的架構如下:
my_fun <- function(arg1, arg2) {
body
}範例:
# 創造一個函數,命名為 hello()
hello <- function(){
print("您好!")
return(TRUE)
}
# 測試一下
hello()## [1] "您好!"
## [1] TRUE
# 創造一個函數,命名為 pow_two()
pow_two <- function(a){
pow_two <- a^2
}
# 測試一下
pow_two(12)
# 創造一個函數,命名為 sum_abs()
sum_abs <- function(x,y){
abs(x)+abs(y)
}
# 測試一下
sum_abs(-2, 3)## [1] 5
(註) 函數中,回傳值的方式有很多種:
函數參數可以設定其 deafult value,其架構如下:
my_fun <- function(arg1, arg2 = val2) {
body
}範例
pow_two <- function(x, print_info=TRUE) {
y <- x ^ 2
if (print_info){
print(paste(x, "to the power two equals", y))
}
return(y)
}
pow_two(10)## [1] "10 to the power two equals 100"
## [1] 100
pow_two(10, FALSE)## [1] 100
We look at some of the basic operations associated with probability distributions. There are a large number of probability distributions available, but we only look at a few.
If you would like to know what distributions are available you can do a search using the command help(Distributions).
\[ p(x) <- \frac{1}{\sigma \sqrt{2 \pi}} e^{-\frac{(x-\mu)^2}{2 \sigma^2}} \]
dnorm(0)## [1] 0.3989423
dnorm(0)*sqrt(2*pi)## [1] 1
dnorm(0,mean=4,sd=10)## [1] 0.03682701
v <- c(0,1,2)
dnorm(v)## [1] 0.39894228 0.24197072 0.05399097
x <- seq(-20,20,by=.1)
y <- dnorm(x)
plot(x,y)x <- seq(-20,20,by=.1)
y <- dnorm(x,mean=1,sd=3)
plot(x,y,type="l")pnorm(0)## [1] 0.5
pnorm(1)## [1] 0.8413447
pnorm(0,mean=2)## [1] 0.02275013
pnorm(0,mean=2,sd=3)## [1] 0.2524925
x <- seq(-5,5,by=.05)
y <- pnorm(x)
plot(x,y,type="l")qnorm(0.5)## [1] 0
qnorm(0.5,mean=1)## [1] 1
qnorm(0.5,mean=1,sd=2)## [1] 1
qnorm(0.25,mean=2,sd=2)## [1] 0.6510205
rnorm(4)## [1] 0.44268722 0.64946309 -0.04129442 -0.04362132
rnorm(4,mean=3,sd=3)## [1] 2.949999 3.159914 -1.796055 2.050207
y <- rnorm(500)
hist(y)# 圓面積的估計
n <- 5000
x <- runif(n, min = -1, max = 1)
y <- runif(n, min = -1, max = 1)
r <- x^2+y^2
length(r[r<1])/n*4## [1] 3.1496
lapply returns a list of the same length as X, each element of which is the result of applying FUN to the corresponding element of X.
lapply(X, FUN, ...)
# ... optional arguments to FUN.範例:
x <- list(a = 1:10, beta = exp(-3:3), logic = c(TRUE,FALSE,FALSE,TRUE))
# compute the list mean for each list element
lapply(x, mean)## $a
## [1] 5.5
##
## $beta
## [1] 4.535125
##
## $logic
## [1] 0.5
# median and quartiles for each list element
lapply(x, quantile, probs = 1:3/4)## $a
## 25% 50% 75%
## 3.25 5.50 7.75
##
## $beta
## 25% 50% 75%
## 0.2516074 1.0000000 5.0536690
##
## $logic
## 25% 50% 75%
## 0.0 0.5 1.0
# Write function select_second()
select_second <- function(x) {
x[2]
}
x <- list(a = 1:10, beta = exp(-3:3), logic = c(TRUE,FALSE,FALSE,TRUE))
# 使用 select_second() 取得需要的資料
lapply(x, select_second)## $a
## [1] 2
##
## $beta
## [1] 0.1353353
##
## $logic
## [1] FALSE
# Generic select function
select_el <- function(x, index) {
x[index]
}
x <- list(a = 1:10, beta = exp(-3:3), logic = c(TRUE,FALSE,FALSE,TRUE))
# 使用 select_second() 取得需要的資料
lapply(x, select_el, index=2)## $a
## [1] 2
##
## $beta
## [1] 0.1353353
##
## $logic
## [1] FALSE
You can use sapply() similar to how you used lapply().
sapply(X, FUN, ...)範例:See how lapply() and sapply() differ
temp <- list(a = 1:10, beta = exp(-3:3), logic = c(TRUE,FALSE,FALSE,TRUE))
# Use lapply() to find each factor's minimum
lapply(temp, min)## $a
## [1] 1
##
## $beta
## [1] 0.04978707
##
## $logic
## [1] 0
# Use sapply() to find each factor's minimum
sapply(temp,min)## a beta logic
## 1.00000000 0.04978707 0.00000000
# Use lapply() to find each factor's maximum
lapply(temp, max)## $a
## [1] 10
##
## $beta
## [1] 20.08554
##
## $logic
## [1] 1
# Use sapply() to find each factor's maximum
sapply(temp, max)## a beta logic
## 10.00000 20.08554 1.00000
extremes <- function(x) {
c(min = min(x), max = max(x))
}
temp <- list(a = 1:10, beta = exp(-3:3), logic = c(TRUE,FALSE,FALSE,TRUE))
# Apply extremes() over temp with sapply()
sapply(temp, extremes)## a beta logic
## min 1 0.04978707 0
## max 10 20.08553692 1
# Apply extremes() over temp with lapply()
lapply(temp, extremes)## $a
## min max
## 1 10
##
## $beta
## min max
## 0.04978707 20.08553692
##
## $logic
## min max
## 0 1
範例:
errors <- c(1.9, -2.6, 4.0, -9.5, -3.4, 7.3)
sum(abs(round(errors)))## [1] 29
範例:
x <- seq(1,20,by=2)
y <- append(x,rep(c(4,6),2))
sort(y)## [1] 1 3 4 4 5 6 6 7 9 11 13 15 17 19
sort(y,decreasing = TRUE)## [1] 19 17 15 13 11 9 7 6 6 5 4 4 3 1
list_y<-as.list(y)
str(list_y)## List of 14
## $ : num 1
## $ : num 3
## $ : num 5
## $ : num 7
## $ : num 9
## $ : num 11
## $ : num 13
## $ : num 15
## $ : num 17
## $ : num 19
## $ : num 4
## $ : num 6
## $ : num 4
## $ : num 6
vector_y <- unlist(y)
str(vector_y)## num [1:14] 1 3 5 7 9 11 13 15 17 19 ...
範例
emails <- c("john.doe@ivyleague.edu", "education@world.gov", "dalai.lama@peace.org",
"invalid.edu", "quant@bigdatacollege.edu", "cookie.monster@sesame.tv")
# Use grepl() to match for "edu"
grepl("edu",emails)## [1] TRUE TRUE FALSE TRUE TRUE FALSE
# Use grep() to match for "edu", save result to hits
hits <- grep("edu",emails)
# Subset emails using hits
emails[hits]## [1] "john.doe@ivyleague.edu" "education@world.gov"
## [3] "invalid.edu" "quant@bigdatacollege.edu"
範例
emails <- c("john.doe@ivyleague.edu", "education@world.gov", "dalai.lama@peace.org",
"invalid.edu", "quant@bigdatacollege.edu", "cookie.monster@sesame.tv")
# Use grepl() to match for .edu addresses more robustly
grepl("@.*\\.edu$",emails)## [1] TRUE FALSE FALSE FALSE TRUE FALSE
# Use grep() to match for .edu addresses more robustly, save result to hits
hits <- grep("@.*\\.edu$",emails)
# Subset emails using hits
emails[hits]## [1] "john.doe@ivyleague.edu" "quant@bigdatacollege.edu"
說明:
emails <- c("john.doe@ivyleague.edu", "education@world.gov", "dalai.lama@peace.org",
"invalid.edu", "quant@bigdatacollege.edu", "cookie.monster@sesame.tv")
# Use sub() to convert the email domains to test.edu
sub("@.*\\.edu$","@test.edu",emails)## [1] "john.doe@test.edu" "education@world.gov"
## [3] "dalai.lama@peace.org" "invalid.edu"
## [5] "quant@test.edu" "cookie.monster@sesame.tv"
若取得以下電影review字串資料 (from IMDB),我們想知道覺得該評論有用的比率是多少?
mydata <- c("3061 out of 4024 people found the following review useful",
"1061 out of 2424 people found the following review useful",
"61 out of 124 people found the following review useful",
"9 out of 24 people found the following review useful")mydata <- c("3061 out of 4024 people found the following review useful",
"1061 out of 2424 people found the following review useful",
"61 out of 124 people found the following review useful",
"9 out of 24 people found the following review useful")
x1 <- as.numeric(sub(" out.*useful","",mydata))
y1 <- sub(" people found the following review useful","",mydata)
y1 <- as.numeric(sub(".*of ","",y1))
rating <- x1/y1
print(rating)## [1] 0.7606859 0.4377063 0.4919355 0.3750000
# Get the current date: today
today <- Sys.Date()
# See what today looks like under the hood
unclass(today)## [1] 17079
# Get the current time: now
now <- Sys.time()
# See what now looks like under the hood
unclass(now)## [1] 1475605132
To create a Date object from a simple character string in R, you can use the as.Date() function.
# Definition of character strings representing dates
str1 <- "2012-03-15"
str2 <- "30/5/2016"
# Convert the strings to dates: date1, date2
date1 <- as.Date(str1, format="%Y-%m-%d")
date2 <- as.Date(str2, format="%d/%m/%Y")
# Convert dates to formatted strings
format(date1, "%A")## [1] "周四"
format(date2, "%d")## [1] "30"
format(date2, "%b %Y")## [1] " 5 2016"
# a useful package for creating a Date object
library(lubridate)
# Definition of character strings representing dates
str1 <- "2012-03-15"
str2 <- "30/5/2016"
ymd(str1)## [1] "2012-03-15"
dmy(str2)## [1] "2016-05-30"