library(ggplot2)
library(lattice)
library(tidyverse)
library(dplyr)Explain what does this statement do: lapply(lapply(search(), ls), length)
search() [1] ".GlobalEnv" "package:forcats" "package:stringr"
[4] "package:dplyr" "package:purrr" "package:readr"
[7] "package:tidyr" "package:tibble" "package:tidyverse"
[10] "package:lattice" "package:ggplot2" "package:stats"
[13] "package:graphics" "package:grDevices" "package:utils"
[16] "package:datasets" "package:methods" "Autoloads"
[19] "package:base"
#若想查詢目前所有被載入的套件列表,可以使用 search函數列出所有的搜尋路徑
#這個輸出中列出了 R 在搜尋變數的時候,會搜尋到的環境空間與順序,全域環境空間(.GlobalEnv)一定是排在第一位,接著就是最近載入的一些套件,R 的基本核心函數則是放在最後面。# lapply(search(), ls)
# ls()列出物件名稱
# lapply(search(), ls)循環列處目前所有套件的物件名稱lapply(lapply(search(), ls), length)[[1]]
[1] 0
[[2]]
[1] 37
[[3]]
[1] 52
[[4]]
[1] 290
[[5]]
[1] 178
[[6]]
[1] 114
[[7]]
[1] 77
[[8]]
[1] 44
[[9]]
[1] 6
[[10]]
[1] 151
[[11]]
[1] 531
[[12]]
[1] 449
[[13]]
[1] 87
[[14]]
[1] 113
[[15]]
[1] 247
[[16]]
[1] 104
[[17]]
[1] 203
[[18]]
[1] 0
[[19]]
[1] 1255
# length 函數來檢查向量的長度ls(.GlobalEnv)character(0)
length(ls(.GlobalEnv))[1] 0
length(.GlobalEnv)[1] 0
結論
lapply(lapply(search(), ls), length)
1.search結果有10個載入的套件
2.內層lapply(, ls)找出來10個套件的list
3.外層lapply(, length)數出所有套件其內含物件名稱的個數
#發現knit出來和rstudio的結果不一致?原因?
knitr::include_graphics("lapply knit.png")ls(.GlobalEnv)在knit後應該會是0。如果在本機處執行的結果則會取決於你當下Environment 存了哪些東西,按掃把清完Environment後,再執行一次就會是0。
Convert the R script in the NZ schools example into a rmarkdown file and provide comments to each code chunk indicated by ‘##’. Give alternative code to perform the same calculation where appropriate.
## keep the school names with white spaces
dta <- read.csv("D://nzSchools.csv", as.is=2)
str(dta) #2571 obs. of 6 variables'data.frame': 2571 obs. of 6 variables:
$ ID : int 1015 1052 1062 1092 1130 1018 1029 1030 1588 1154 ...
$ Name: chr "Hora Hora School" "Morningside School" "Onerahi School" "Raurimu Avenue School" ...
$ City: Factor w/ 541 levels "Ahaura","Ahipara",..: 533 533 533 533 533 533 533 533 533 533 ...
$ Auth: Factor w/ 4 levels "Other","Private",..: 3 3 3 3 3 3 3 3 4 3 ...
$ Dec : int 2 3 4 2 4 8 5 5 6 1 ...
$ Roll: int 318 200 455 86 577 329 637 395 438 201 ...
dim(dta) #各維度長度 dim(),先row後column[1] 2571 6
head(dta) ID Name City Auth Dec Roll
1 1015 Hora Hora School Whangarei State 2 318
2 1052 Morningside School Whangarei State 3 200
3 1062 Onerahi School Whangarei State 4 455
4 1092 Raurimu Avenue School Whangarei State 2 86
5 1130 Whangarei School Whangarei State 4 577
6 1018 Hurupaki School Whangarei State 8 329
#先知道Roll的median是193
median(dta$Roll)[1] 193
#列舉Roll的前6筆
head(dta$Roll)[1] 318 200 455 86 577 329
#如果Roll的各筆>Roll的median,標示Large,否則標示Small
#並把資料存到dta$Size這個新的column
dta$Size <- ifelse(dta$Roll > median(dta$Roll), "Large", "Small")
#列舉Size前6筆,對照一下ifelse邏輯是一致的
head(dta$Size)[1] "Large" "Large" "Large" "Small" "Large" "Large"
#確認資料中有Size這個column
str(dta) #2571 obs. of 7 variables'data.frame': 2571 obs. of 7 variables:
$ ID : int 1015 1052 1062 1092 1130 1018 1029 1030 1588 1154 ...
$ Name: chr "Hora Hora School" "Morningside School" "Onerahi School" "Raurimu Avenue School" ...
$ City: Factor w/ 541 levels "Ahaura","Ahipara",..: 533 533 533 533 533 533 533 533 533 533 ...
$ Auth: Factor w/ 4 levels "Other","Private",..: 3 3 3 3 3 3 3 3 4 3 ...
$ Dec : int 2 3 4 2 4 8 5 5 6 1 ...
$ Roll: int 318 200 455 86 577 329 637 395 438 201 ...
$ Size: chr "Large" "Large" "Large" "Small" ...
head(dta) ID Name City Auth Dec Roll Size
1 1015 Hora Hora School Whangarei State 2 318 Large
2 1052 Morningside School Whangarei State 3 200 Large
3 1062 Onerahi School Whangarei State 4 455 Large
4 1092 Raurimu Avenue School Whangarei State 2 86 Small
5 1130 Whangarei School Whangarei State 4 577 Large
6 1018 Hurupaki School Whangarei State 8 329 Large
#把NULL存入dta$Size
dta$Size <- NULL
#確認存入之後真的NULL
dta$SizeNULL
str(dta) #2571 obs. of 6 variables'data.frame': 2571 obs. of 6 variables:
$ ID : int 1015 1052 1062 1092 1130 1018 1029 1030 1588 1154 ...
$ Name: chr "Hora Hora School" "Morningside School" "Onerahi School" "Raurimu Avenue School" ...
$ City: Factor w/ 541 levels "Ahaura","Ahipara",..: 533 533 533 533 533 533 533 533 533 533 ...
$ Auth: Factor w/ 4 levels "Other","Private",..: 3 3 3 3 3 3 3 3 4 3 ...
$ Dec : int 2 3 4 2 4 8 5 5 6 1 ...
$ Roll: int 318 200 455 86 577 329 637 395 438 201 ...
#dta$Size <- NULL 後,Size這個Column就消失了
head(dta) ID Name City Auth Dec Roll
1 1015 Hora Hora School Whangarei State 2 318
2 1052 Morningside School Whangarei State 3 200
3 1062 Onerahi School Whangarei State 4 455
4 1092 Raurimu Avenue School Whangarei State 2 86
5 1130 Whangarei School Whangarei State 4 577
6 1018 Hurupaki School Whangarei State 8 329
# cut()cut divides the range of x into intervals
# cut .default(x, breaks, labels = NULL, include.lowest = FALSE, right = TRUE, dig.lab = 3)
summary(dta$Roll) Min. 1st Qu. Median Mean 3rd Qu. Max.
5.0 79.0 193.0 295.5 383.5 5546.0
5546/2[1] 2773
#把dta$Roll的範圍切成2等分,編入Small或Large
dta$Size <- cut(dta$Roll, 2, labels=c("Small", "Large"))
head(dta$Size)[1] Small Small Small Small Small Small
Levels: Small Large
table(dta$Size)
Small Large
2569 2
#Large的資料只有1筆?
#畫histogram看一下dta的分布狀況,結果確實很偏態
hist(dta$Roll)histogram(~ Roll, dta,breaks = 2,type="count")#切成3等分
5546/3 #0-1848=Small,1848-3697=Mediam, >3697=Large[1] 1848.667
dta$Size <- cut(dta$Roll, 3, labels=c("Small", "Mediam", "Large"))
table(dta$Size)
Small Mediam Large
2555 15 1
#???這裡有疑問
#order()類似sort(),排序
#order()若將decreasing參數設定為TRUE,則會回傳由大到小的元素位置
#依據dta$Roll島序排序給予順序後存入dta$Rollord
dta$RollOrd <- order(dta$Roll, decreasing=T)
head(dta$RollOrd) #跟實際排序不太一致[1] 1726 301 376 2307 615 199
dta$RollOrd [2571][1] 1575
這邊對於dta$Rollord的排序有疑問
head(dta[dta$RollOrd, ]) #這邊的順序有些奇怪 ID Name City Auth Dec Roll Size RollOrd
1726 498 Correspondence School Wellington State NA 5546 Large 753
301 28 Rangitoto College Auckland State 10 3022 Mediam 353
376 78 Avondale College Auckland State 4 2613 Mediam 712
2307 319 Burnside High School Christchurch State 8 2588 Mediam 709
615 41 Macleans College Auckland State 10 2476 Mediam 1915
199 43 Massey High School Auckland State 5 2452 Mediam 1683
tail(dta[dta$RollOrd, ]) #這邊的順序有些奇怪 ID Name City Auth Dec Roll Size
2401 1641 Amana Christian School Dunedin Private 9 7 Small
1590 2461 Tangimoana School Manawatu State 4 6 Small
1996 3598 Woodbank School Kaikoura State 4 6 Small
2112 3386 Jacobs River School Jacobs River State 5 6 Small
1514 2407 Ngamatapouri School Sth Taranaki District State 9 5 Small
1575 2420 Papanui Junction School Taihape State 5 5 Small
RollOrd
2401 2562
1590 266
1996 2478
2112 1501
1514 2377
1575 1542
head(dta[order(dta$City, dta$Roll, decreasing=T), ]) #City降序+Roll降序 列出前6筆 ID Name City Auth Dec Roll Size RollOrd
2548 401 Menzies College Wyndham State 4 356 Small 859
2549 4054 Wyndham School Wyndham State 5 94 Small 1163
1611 2742 Woodville School Woodville State 3 147 Small 726
1630 2640 Papatawa School Woodville State 7 27 Small 2273
2041 3600 Woodend School Woodend State 9 375 Small 1401
1601 399 Central Southland College Winton State 7 549 Small 450
tail(dta[order(dta$City, dta$Roll, decreasing=T), ]) ID Name City Auth Dec Roll Size RollOrd
2169 3273 Albury School Albury State 8 30 Small 1010
2018 350 Akaroa Area School Akaroa State 8 125 Small 1051
2023 3332 Duvauchelle School Akaroa State 9 41 Small 749
335 1200 Ahuroa School Ahuroa State 7 22 Small 193
99 1000 Ahipara School Ahipara State 3 241 Small 1963
2117 2105 Awahono School - Grey Valley Ahaura State 4 119 Small 364
#City降序+Roll降序 列出最後6筆 所以是倒著列這邊對於head(dta[dta$RollOrd, ])的排序有疑問
# counting 數dta$Auth的這個column類別樹目
table(dta$Auth)
Other Private State State Integrated
1 99 2144 327
# table(variable_name, ...)
# 數column類別數目後存成一個table
authtbl <- table(dta$Auth);authtbl
Other Private State State Integrated
1 99 2144 327
# 另一種寫法
au=table(dta$Auth)
au
Other Private State State Integrated
1 99 2144 327
#兩種寫法都是table
class(authtbl)[1] "table"
class(au)[1] "table"
#選出資料dta$Auth就是Other的的那個row
dta[dta$Auth == "Other", ] ID Name City Auth Dec Roll Size RollOrd
2315 518 Kingslea School Christchurch Other 1 51 Small 1579
#xtabs()類似table的作法
#單一個~Auth,會算出dta$Auth的類別項目個數
xtabs(~ Auth , data=dta)Auth
Other Private State State Integrated
1 99 2144 327
#做成交叉表
xtabs(~ Auth + Dec, data=dta) Dec
Auth 1 2 3 4 5 6 7 8 9 10
Other 1 0 0 0 0 0 0 0 0 0
Private 0 0 2 6 2 2 6 11 12 38
State 259 230 208 219 214 215 188 200 205 205
State Integrated 12 22 35 28 38 34 45 45 37 31
mean(dta$Roll) #算出這個column的mean[1] 295.4737
mean(dta$Roll[dta$Auth == "Private"]) #算出Auth=Private的$Roll mean[1] 308.798
dtaPrivate<-dta[dta$Auth == "Private", ] #另一種寫法,提取Auth=Private的存成另一個data.frame
mean(dtaPrivate$Roll) #再算新的data.frame $Roll mean[1] 308.798
# aggregate 聚合資料
# 依據資料dta$Auth這個column的Roll算出mean
aggregate(dta["Roll"], by=list(dta$Auth), FUN=mean) Group.1 Roll
1 Other 51.0000
2 Private 308.7980
3 State 300.6301
4 State Integrated 258.3792
#其實也可以寫成4個語法,驗算一下
mean(dta$Roll[dta$Auth == "Other"]) [1] 51
mean(dta$Roll[dta$Auth == "Private"]) [1] 308.798
mean(dta$Roll[dta$Auth == "State"]) [1] 300.6301
mean(dta$Roll[dta$Auth == "State Integrated"]) [1] 258.3792
# 資料dta$Dec是否>5,結果存入dta$Rich
dta$Rich <- dta$Dec > 5
# 比對一下dta$Dec和dta$Rich
head(dta$Dec)[1] 2 3 4 2 4 8
head(dta$Rich)[1] FALSE FALSE FALSE FALSE FALSE TRUE
#依據dta$Auth, dta$Rich聚合 Roll的mean結果
#dta$Auth, dta$Rich=4x2=8,但結果只有7是因為dta$Auth==Other只有一筆資料
aggregate(dta["Roll"], by=list(dta$Auth, dta$Rich), FUN=mean) Group.1 Group.2 Roll
1 Other FALSE 51.0000
2 Private FALSE 151.4000
3 State FALSE 261.7487
4 State Integrated FALSE 183.2370
5 Private TRUE 402.5362
6 State TRUE 338.8243
7 State Integrated TRUE 311.2135
# 將data frame按照INDICES的factor拆分成若小區塊的data frames,在每個小區塊的data frame上运行函数FUN
#把Roll取出依據dta$Auth分成小塊,列出range
by(dta["Roll"], INDICES=list(dta$Auth), FUN=range): Other
[1] 51 51
------------------------------------------------------------
: Private
[1] 7 1663
------------------------------------------------------------
: State
[1] 5 5546
------------------------------------------------------------
: State Integrated
[1] 18 1475
Split the ChickWeight{datasets} data by individual chicks to extract separate slope estimates of regressing weight onto Time for each chick.
dta<-ChickWeight
str(dta)Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame': 578 obs. of 4 variables:
$ weight: num 42 51 59 64 76 93 106 125 149 171 ...
$ Time : num 0 2 4 6 8 10 12 14 16 18 ...
$ Chick : Ord.factor w/ 50 levels "18"<"16"<"15"<..: 15 15 15 15 15 15 15 15 15 15 ...
$ Diet : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
- attr(*, "formula")=Class 'formula' language weight ~ Time | Chick
.. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
- attr(*, "outer")=Class 'formula' language ~Diet
.. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
- attr(*, "labels")=List of 2
..$ x: chr "Time"
..$ y: chr "Body weight"
- attr(*, "units")=List of 2
..$ x: chr "(days)"
..$ y: chr "(gm)"
head(dta) weight Time Chick Diet
1 42 0 1 1
2 51 2 1 1
3 59 4 1 1
4 64 6 1 1
5 76 8 1 1
6 93 10 1 1
mypanel <- function(x, y){ #寫一個function畫xy plot panel.xyplot(x, y, pch=19) panel.rug(x,y) panel.grid # 添加水平和垂直的网格线 panel.lmline(x, y, col=“red”, lwd=1, lty=2) # 添加回歸線 } myplot<-xyplot(weight ~ Time | Chick, data=dta, aspect=1.5, layout = c(1, 1), main = “Regression of weight onto Time for each chick”, xlab = “Time”, ylab = “weight”, panel = mypanel )
library(lattice)
png("regression.png", width=1200, height=1200) #存成圖片
mypanel <- function(x, y){ #寫一個function畫xy plot
panel.xyplot(x, y, pch=19)
panel.rug(x,y)
panel.grid # 添加水平和垂直的參考線
panel.lmline(x, y, col="red", lwd=0.3, lty=1) # 添加回歸線
}
xyplot(weight ~ Time | Chick, data=dta,
aspect=0.5,
layout = c(5,10),
main = "Regression of weight onto Time for each chick",
xlab = "Time(day)",
ylab = "weight",
panel = mypanel
)
dev.off()png
2
knitr::include_graphics("regression.png")#不知道怎麼讓Chick照順序排列#寫一個回歸的function
#lapply(list apply)的函數簡化原本要寫的函數,這個函數會把data frame當作一個列表來執行函數lapply的寫法是lapply(data frame, 函數)
function(x)
lm(weight ~ Time, data = x)function(x)
lm(weight ~ Time, data = x)
Convert the script in the NCEA 2007 example into a rmarkdown file and provide comments to each code chunk indicated by ‘##’. Give alternative code to perform the same calculation where appropriate.
#讀資料
dta2 <- read.table("D://NCEA2007.txt", sep=":", quote="", h=T, as.is=T)
str(dta2) #88 obs. of 4 variables'data.frame': 88 obs. of 4 variables:
$ Name : chr "Al-Madinah School" "Alfriston College" "Ambury Park Centre for Riding Therapy" "Aorere College" ...
$ Level1: num 61.5 53.9 33.3 39.5 71.2 22.1 50.8 57.3 89.3 59.8 ...
$ Level2: num 75 44.1 20 50.2 78.9 30.8 34.8 49.8 89.7 65.7 ...
$ Level3: num 0 0 0 30.6 55.5 26.3 48.9 44.6 88.6 50.4 ...
head(dta2) Name Level1 Level2 Level3
1 Al-Madinah School 61.5 75.0 0.0
2 Alfriston College 53.9 44.1 0.0
3 Ambury Park Centre for Riding Therapy 33.3 20.0 0.0
4 Aorere College 39.5 50.2 30.6
5 Auckland Girls' Grammar School 71.2 78.9 55.5
6 Auckland Grammar 22.1 30.8 26.3
#維度長度 dim()
dim(dta2)[1] 88 4
#row 88 x column 4#apply(X, MARGIN, FUN)
#X: 矩陣或資料
#MARGIN: 1,對row應用;2,對column應用;both,對行列應用
#FUN:函數
apply(dta2[,2:4], MARGIN=2, FUN=mean) #只用column2~4 Level1 Level2 Level3
62.26705 61.06818 47.97614
a1<-apply(dta2[, -1], MARGIN=2, FUN=mean) #扣掉column1
a1#算出column 2~4的mean Level1 Level2 Level3
62.26705 61.06818 47.97614
#list apply:lapply(X, FUN)
#X: 列表或矩陣
#FUN: 函數
#輸出:列表
a2<-lapply(dta2[, -1], FUN=mean)
a2#算出column 2~4的mean,列成list$Level1
[1] 62.26705
$Level2
[1] 61.06818
$Level3
[1] 47.97614
## simplify the list apply:sapply(X, FUN, …, simplify = TRUE, USE.NAMES = TRUE)
#X: 列表或矩陣
#FUN: 函數
#輸出:向量、矩陣、列表
a3<-sapply(dta2[, -1], FUN=mean)
a3 Level1 Level2 Level3
62.26705 61.06818 47.97614
#比較apply跟lapply跟sapply
class(a1) #apply的結果是數值[1] "numeric"
str(a1) #一串數值 Named num [1:3] 62.3 61.1 48
- attr(*, "names")= chr [1:3] "Level1" "Level2" "Level3"
class(a2) #lapply的結果是list[1] "list"
str(a2) #List of 3List of 3
$ Level1: num 62.3
$ Level2: num 61.1
$ Level3: num 48
class(a3) #sapply應該是簡化版的lapply但目前的結果看起來跟apply一樣[1] "numeric"
str(a3) #一串數值 Named num [1:3] 62.3 61.1 48
- attr(*, "names")= chr [1:3] "Level1" "Level2" "Level3"
#把function改成range
r1<-apply(dta2[, -1], MARGIN=2, FUN=range)
r1 Level1 Level2 Level3
[1,] 2.8 0.0 0.0
[2,] 97.4 95.7 95.7
r2<-lapply(dta2[, -1], FUN=range)
r2$Level1
[1] 2.8 97.4
$Level2
[1] 0.0 95.7
$Level3
[1] 0.0 95.7
r3<-sapply(dta2[, -1], FUN=range)
r3 Level1 Level2 Level3
[1,] 2.8 0.0 0.0
[2,] 97.4 95.7 95.7
#比較apply跟lapply跟sapply
class(r1) #apply的結果是"matrix" "array" [1] "matrix" "array"
str(r1) #2x3的matrix num [1:2, 1:3] 2.8 97.4 0 95.7 0 95.7
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:3] "Level1" "Level2" "Level3"
dim(r1) #2x3的matrix[1] 2 3
class(r2) #lapply的結果依然是list[1] "list"
str(r2) #List of 3List of 3
$ Level1: num [1:2] 2.8 97.4
$ Level2: num [1:2] 0 95.7
$ Level3: num [1:2] 0 95.7
dim(r2) #list沒有dimensionNULL
class(r3) #結果看起來跟apply一樣[1] "matrix" "array"
str(r3) num [1:2, 1:3] 2.8 97.4 0 95.7 0 95.7
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:3] "Level1" "Level2" "Level3"
dim(r3)[1] 2 3
#換了一個資料
#splitting
dta <- read.csv("D://nzSchools.csv", as.is=2)
rollsByAuth <- split(dta$Roll, dta$Auth) #依照Roll、Auth分類存成物件rollsByAuth
str(rollsByAuth) #是listList of 4
$ Other : int 51
$ Private : int [1:99] 255 39 154 73 83 25 95 85 94 729 ...
$ State : int [1:2144] 318 200 455 86 577 329 637 395 201 267 ...
$ State Integrated: int [1:327] 438 26 191 560 151 114 126 171 211 57 ...
class(rollsByAuth) #是list[1] "list"
#依據Roll和Auth的分類算mean
s1<-lapply(split(dta$Roll, dta$Auth), mean)
s1$Other
[1] 51
$Private
[1] 308.798
$State
[1] 300.6301
$`State Integrated`
[1] 258.3792
s2<-sapply(split(dta$Roll, dta$Auth), mean)
s2 Other Private State State Integrated
51.0000 308.7980 300.6301 258.3792
#比較lapply跟sapply
class(s1) #lapply的結果是list [1] "list"
str(s1) #List of 4List of 4
$ Other : num 51
$ Private : num 309
$ State : num 301
$ State Integrated: num 258
dim(s1) #list沒有dimensionNULL
class(s2) #sapply的結果是數值[1] "numeric"
str(s2) #1x4的數值 Named num [1:4] 51 309 301 258
- attr(*, "names")= chr [1:4] "Other" "Private" "State" "State Integrated"
dim(s2) #只有一行數值,沒有dimensionNULL
The following R script uses Cushings{MASS} to demonstrates several ways to achieve the same objective in R. Explain the advantages or disadvantages of each method.
Use the data in the high schools example to solve the following problems: (a) test if any pairs of the five variables: read, write, math, science, and socst, are different in means. (b) test if the 4 different ethnic groups have the same mean scores for each of the 5 variables (individually): read, write, math, science, and socst. (c) Perform all pairwise simple regressions for these variables: read, write, math, science, and socst.
The formula P = L (r/(1-(1+r)^(-M)) describes the payment you have to make per month for M number of months if you take out a loan of L amount today at a monthly interest rate of r. Compute how much you will have to pay per month for 10, 15, 20, 25, or 30 years if you borrow NT$5,000,000, 10,000,000, or 15,000,000 from a bank that charges you 2%, 5%, or 7% for the monthly interest rate.
Modify this R script to create a function to compute the c-statistic illustrated with the data set in the article: Tryon, W.W. (1984). A simplified time-series analysis for evaluating treatment interventions. Journal of Applied Behavioral Analysis, 34(4), 230-233.