library(ggplot2)
library(lattice)
library(tidyverse)
library(dplyr)Explain what does this statement do: lapply(lapply(search(), ls), length)
search() [1] ".GlobalEnv" "package:forcats" "package:stringr"
[4] "package:dplyr" "package:purrr" "package:readr"
[7] "package:tidyr" "package:tibble" "package:tidyverse"
[10] "package:lattice" "package:ggplot2" "package:stats"
[13] "package:graphics" "package:grDevices" "package:utils"
[16] "package:datasets" "package:methods" "Autoloads"
[19] "package:base"
#若想查詢目前所有被載入的套件列表,可以使用 search函數列出所有的搜尋路徑
#這個輸出中列出了 R 在搜尋變數的時候,會搜尋到的環境空間與順序,全域環境空間(.GlobalEnv)一定是排在第一位,接著就是最近載入的一些套件,R 的基本核心函數則是放在最後面。# lapply(search(), ls)
# ls()列出物件名稱
# lapply(search(), ls)循環列處目前所有套件的物件名稱lapply(lapply(search(), ls), length)[[1]]
[1] 0
[[2]]
[1] 37
[[3]]
[1] 52
[[4]]
[1] 290
[[5]]
[1] 178
[[6]]
[1] 115
[[7]]
[1] 77
[[8]]
[1] 44
[[9]]
[1] 6
[[10]]
[1] 151
[[11]]
[1] 531
[[12]]
[1] 449
[[13]]
[1] 87
[[14]]
[1] 113
[[15]]
[1] 247
[[16]]
[1] 104
[[17]]
[1] 203
[[18]]
[1] 0
[[19]]
[1] 1255
# length 函數來檢查向量的長度ls(.GlobalEnv)character(0)
length(ls(.GlobalEnv))[1] 0
length(.GlobalEnv)[1] 0
結論
lapply(lapply(search(), ls), length)
1.search結果有10個載入的套件
2.內層lapply(, ls)找出來10個套件的list
3.外層lapply(, length)數出所有套件其內含物件名稱的個數
#發現knit出來和rstudio的結果不一致?原因?
knitr::include_graphics("lapply knit.png")ls(.GlobalEnv)在knit後應該會是0。如果在本機處執行的結果則會取決於你當下Environment 存了哪些東西,按掃把清完Environment後,再執行一次就會是0。
Convert the R script in the NZ schools example into a rmarkdown file and provide comments to each code chunk indicated by ‘##’. Give alternative code to perform the same calculation where appropriate.
## keep the school names with white spaces
dta <- read.csv("D://nzSchools.csv", as.is=2)
str(dta) #2571 obs. of 6 variables'data.frame': 2571 obs. of 6 variables:
$ ID : int 1015 1052 1062 1092 1130 1018 1029 1030 1588 1154 ...
$ Name: chr "Hora Hora School" "Morningside School" "Onerahi School" "Raurimu Avenue School" ...
$ City: Factor w/ 541 levels "Ahaura","Ahipara",..: 533 533 533 533 533 533 533 533 533 533 ...
$ Auth: Factor w/ 4 levels "Other","Private",..: 3 3 3 3 3 3 3 3 4 3 ...
$ Dec : int 2 3 4 2 4 8 5 5 6 1 ...
$ Roll: int 318 200 455 86 577 329 637 395 438 201 ...
dim(dta) #各維度長度 dim(),先row後column[1] 2571 6
head(dta) ID Name City Auth Dec Roll
1 1015 Hora Hora School Whangarei State 2 318
2 1052 Morningside School Whangarei State 3 200
3 1062 Onerahi School Whangarei State 4 455
4 1092 Raurimu Avenue School Whangarei State 2 86
5 1130 Whangarei School Whangarei State 4 577
6 1018 Hurupaki School Whangarei State 8 329
#先知道Roll的median是193
median(dta$Roll)[1] 193
#列舉Roll的前6筆
head(dta$Roll)[1] 318 200 455 86 577 329
#如果Roll的各筆>Roll的median,標示Large,否則標示Small
#並把資料存到dta$Size這個新的column
dta$Size <- ifelse(dta$Roll > median(dta$Roll), "Large", "Small")
#列舉Size前6筆,對照一下ifelse邏輯是一致的
head(dta$Size)[1] "Large" "Large" "Large" "Small" "Large" "Large"
#確認資料中有Size這個column
str(dta) #2571 obs. of 7 variables'data.frame': 2571 obs. of 7 variables:
$ ID : int 1015 1052 1062 1092 1130 1018 1029 1030 1588 1154 ...
$ Name: chr "Hora Hora School" "Morningside School" "Onerahi School" "Raurimu Avenue School" ...
$ City: Factor w/ 541 levels "Ahaura","Ahipara",..: 533 533 533 533 533 533 533 533 533 533 ...
$ Auth: Factor w/ 4 levels "Other","Private",..: 3 3 3 3 3 3 3 3 4 3 ...
$ Dec : int 2 3 4 2 4 8 5 5 6 1 ...
$ Roll: int 318 200 455 86 577 329 637 395 438 201 ...
$ Size: chr "Large" "Large" "Large" "Small" ...
head(dta) ID Name City Auth Dec Roll Size
1 1015 Hora Hora School Whangarei State 2 318 Large
2 1052 Morningside School Whangarei State 3 200 Large
3 1062 Onerahi School Whangarei State 4 455 Large
4 1092 Raurimu Avenue School Whangarei State 2 86 Small
5 1130 Whangarei School Whangarei State 4 577 Large
6 1018 Hurupaki School Whangarei State 8 329 Large
#把NULL存入dta$Size
dta$Size <- NULL
#確認存入之後真的NULL
dta$SizeNULL
str(dta) #2571 obs. of 6 variables'data.frame': 2571 obs. of 6 variables:
$ ID : int 1015 1052 1062 1092 1130 1018 1029 1030 1588 1154 ...
$ Name: chr "Hora Hora School" "Morningside School" "Onerahi School" "Raurimu Avenue School" ...
$ City: Factor w/ 541 levels "Ahaura","Ahipara",..: 533 533 533 533 533 533 533 533 533 533 ...
$ Auth: Factor w/ 4 levels "Other","Private",..: 3 3 3 3 3 3 3 3 4 3 ...
$ Dec : int 2 3 4 2 4 8 5 5 6 1 ...
$ Roll: int 318 200 455 86 577 329 637 395 438 201 ...
#dta$Size <- NULL 後,Size這個Column就消失了
head(dta) ID Name City Auth Dec Roll
1 1015 Hora Hora School Whangarei State 2 318
2 1052 Morningside School Whangarei State 3 200
3 1062 Onerahi School Whangarei State 4 455
4 1092 Raurimu Avenue School Whangarei State 2 86
5 1130 Whangarei School Whangarei State 4 577
6 1018 Hurupaki School Whangarei State 8 329
# cut()cut divides the range of x into intervals
# cut .default(x, breaks, labels = NULL, include.lowest = FALSE, right = TRUE, dig.lab = 3)
summary(dta$Roll) Min. 1st Qu. Median Mean 3rd Qu. Max.
5.0 79.0 193.0 295.5 383.5 5546.0
5546/2[1] 2773
#把dta$Roll的範圍切成2等分,編入Small或Large
dta$Size <- cut(dta$Roll, 2, labels=c("Small", "Large"))
head(dta$Size)[1] Small Small Small Small Small Small
Levels: Small Large
table(dta$Size)
Small Large
2569 2
#Large的資料只有1筆?
#畫histogram看一下dta的分布狀況,結果確實很偏態
hist(dta$Roll)histogram(~ Roll, dta,breaks = 2,type="count")#切成3等分
5546/3 #0-1848=Small,1848-3697=Mediam, >3697=Large[1] 1848.667
dta$Size <- cut(dta$Roll, 3, labels=c("Small", "Mediam", "Large"))
table(dta$Size)
Small Mediam Large
2555 15 1
#???這裡有疑問
#order()類似sort(),排序
#order()若將decreasing參數設定為TRUE,則會回傳由大到小的元素位置
#依據dta$Roll島序排序給予順序後存入dta$Rollord
dta$RollOrd <- order(dta$Roll, decreasing=T)
head(dta$RollOrd) #跟實際排序不太一致[1] 1726 301 376 2307 615 199
dta$RollOrd [2571][1] 1575
這邊對於dta$Rollord的排序有疑問
head(dta[dta$RollOrd, ]) #這邊的順序有些奇怪 ID Name City Auth Dec Roll Size RollOrd
1726 498 Correspondence School Wellington State NA 5546 Large 753
301 28 Rangitoto College Auckland State 10 3022 Mediam 353
376 78 Avondale College Auckland State 4 2613 Mediam 712
2307 319 Burnside High School Christchurch State 8 2588 Mediam 709
615 41 Macleans College Auckland State 10 2476 Mediam 1915
199 43 Massey High School Auckland State 5 2452 Mediam 1683
tail(dta[dta$RollOrd, ]) #這邊的順序有些奇怪 ID Name City Auth Dec Roll Size
2401 1641 Amana Christian School Dunedin Private 9 7 Small
1590 2461 Tangimoana School Manawatu State 4 6 Small
1996 3598 Woodbank School Kaikoura State 4 6 Small
2112 3386 Jacobs River School Jacobs River State 5 6 Small
1514 2407 Ngamatapouri School Sth Taranaki District State 9 5 Small
1575 2420 Papanui Junction School Taihape State 5 5 Small
RollOrd
2401 2562
1590 266
1996 2478
2112 1501
1514 2377
1575 1542
head(dta[order(dta$City, dta$Roll, decreasing=T), ]) #City降序+Roll降序 列出前6筆 ID Name City Auth Dec Roll Size RollOrd
2548 401 Menzies College Wyndham State 4 356 Small 859
2549 4054 Wyndham School Wyndham State 5 94 Small 1163
1611 2742 Woodville School Woodville State 3 147 Small 726
1630 2640 Papatawa School Woodville State 7 27 Small 2273
2041 3600 Woodend School Woodend State 9 375 Small 1401
1601 399 Central Southland College Winton State 7 549 Small 450
tail(dta[order(dta$City, dta$Roll, decreasing=T), ]) ID Name City Auth Dec Roll Size RollOrd
2169 3273 Albury School Albury State 8 30 Small 1010
2018 350 Akaroa Area School Akaroa State 8 125 Small 1051
2023 3332 Duvauchelle School Akaroa State 9 41 Small 749
335 1200 Ahuroa School Ahuroa State 7 22 Small 193
99 1000 Ahipara School Ahipara State 3 241 Small 1963
2117 2105 Awahono School - Grey Valley Ahaura State 4 119 Small 364
#City降序+Roll降序 列出最後6筆 所以是倒著列這邊對於head(dta[dta$RollOrd, ])的排序有疑問
# counting 數dta$Auth的這個column類別樹目
table(dta$Auth)
Other Private State State Integrated
1 99 2144 327
# table(variable_name, ...)
# 數column類別數目後存成一個table
authtbl <- table(dta$Auth);authtbl
Other Private State State Integrated
1 99 2144 327
# 另一種寫法
au=table(dta$Auth)
au
Other Private State State Integrated
1 99 2144 327
#兩種寫法都是table
class(authtbl)[1] "table"
class(au)[1] "table"
#選出資料dta$Auth就是Other的的那個row
dta[dta$Auth == "Other", ] ID Name City Auth Dec Roll Size RollOrd
2315 518 Kingslea School Christchurch Other 1 51 Small 1579
#xtabs()類似table的作法
#單一個~Auth,會算出dta$Auth的類別項目個數
xtabs(~ Auth , data=dta)Auth
Other Private State State Integrated
1 99 2144 327
#做成交叉表
xtabs(~ Auth + Dec, data=dta) Dec
Auth 1 2 3 4 5 6 7 8 9 10
Other 1 0 0 0 0 0 0 0 0 0
Private 0 0 2 6 2 2 6 11 12 38
State 259 230 208 219 214 215 188 200 205 205
State Integrated 12 22 35 28 38 34 45 45 37 31
mean(dta$Roll) #算出這個column的mean[1] 295.4737
mean(dta$Roll[dta$Auth == "Private"]) #算出Auth=Private的$Roll mean[1] 308.798
dtaPrivate<-dta[dta$Auth == "Private", ] #另一種寫法,提取Auth=Private的存成另一個data.frame
mean(dtaPrivate$Roll) #再算新的data.frame $Roll mean[1] 308.798
# aggregate 聚合資料
# 依據資料dta$Auth這個column的Roll算出mean
aggregate(dta["Roll"], by=list(dta$Auth), FUN=mean) Group.1 Roll
1 Other 51.0000
2 Private 308.7980
3 State 300.6301
4 State Integrated 258.3792
#其實也可以寫成4個語法,驗算一下
mean(dta$Roll[dta$Auth == "Other"]) [1] 51
mean(dta$Roll[dta$Auth == "Private"]) [1] 308.798
mean(dta$Roll[dta$Auth == "State"]) [1] 300.6301
mean(dta$Roll[dta$Auth == "State Integrated"]) [1] 258.3792
# 資料dta$Dec是否>5,結果存入dta$Rich
dta$Rich <- dta$Dec > 5
# 比對一下dta$Dec和dta$Rich
head(dta$Dec)[1] 2 3 4 2 4 8
head(dta$Rich)[1] FALSE FALSE FALSE FALSE FALSE TRUE
#依據dta$Auth, dta$Rich聚合 Roll的mean結果
#dta$Auth, dta$Rich=4x2=8,但結果只有7是因為dta$Auth==Other只有一筆資料
aggregate(dta["Roll"], by=list(dta$Auth, dta$Rich), FUN=mean) Group.1 Group.2 Roll
1 Other FALSE 51.0000
2 Private FALSE 151.4000
3 State FALSE 261.7487
4 State Integrated FALSE 183.2370
5 Private TRUE 402.5362
6 State TRUE 338.8243
7 State Integrated TRUE 311.2135
# 將data frame按照INDICES的factor拆分成若小區塊的data frames,在每個小區塊的data frame上运行函数FUN
#把Roll取出依據dta$Auth分成小塊,列出range
by(dta["Roll"], INDICES=list(dta$Auth), FUN=range): Other
[1] 51 51
------------------------------------------------------------
: Private
[1] 7 1663
------------------------------------------------------------
: State
[1] 5 5546
------------------------------------------------------------
: State Integrated
[1] 18 1475
Split the ChickWeight{datasets} data by individual chicks to extract separate slope estimates of regressing weight onto Time for each chick.
dta<-ChickWeight
str(dta)Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame': 578 obs. of 4 variables:
$ weight: num 42 51 59 64 76 93 106 125 149 171 ...
$ Time : num 0 2 4 6 8 10 12 14 16 18 ...
$ Chick : Ord.factor w/ 50 levels "18"<"16"<"15"<..: 15 15 15 15 15 15 15 15 15 15 ...
$ Diet : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
- attr(*, "formula")=Class 'formula' language weight ~ Time | Chick
.. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
- attr(*, "outer")=Class 'formula' language ~Diet
.. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
- attr(*, "labels")=List of 2
..$ x: chr "Time"
..$ y: chr "Body weight"
- attr(*, "units")=List of 2
..$ x: chr "(days)"
..$ y: chr "(gm)"
head(dta) weight Time Chick Diet
1 42 0 1 1
2 51 2 1 1
3 59 4 1 1
4 64 6 1 1
5 76 8 1 1
6 93 10 1 1
#依照建議,圖排序的問題,需改變Chick的level順序
# Chick level重新排序
dta$Chick <- factor(as.numeric(as.character(dta$Chick)))
levels(dta$Chick) [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15"
[16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
[31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
[46] "46" "47" "48" "49" "50"
mypanel <- function(x, y){ #寫一個function畫xy plot panel.xyplot(x, y, pch=19) panel.rug(x,y) panel.grid # 添加水平和垂直的网格线 panel.lmline(x, y, col=“red”, lwd=1, lty=2) # 添加回歸線 } myplot<-xyplot(weight ~ Time | Chick, data=dta, aspect=1.5, layout = c(1, 1), main = “Regression of weight onto Time for each chick”, xlab = “Time”, ylab = “weight”, panel = mypanel )
library(lattice)
png("regression.png", width=1200, height=1200) #存成圖片
mypanel <- function(x, y){ #寫一個function畫xy plot
panel.xyplot(x, y, pch=19)
panel.rug(x,y)
panel.grid # 添加水平和垂直的參考線
panel.lmline(x, y, col="red", lwd=0.3, lty=1) # 添加回歸線
}
xyplot(weight ~ Time | Chick, data=dta,
aspect=0.5,
layout = c(5,10),
main = "Regression of weight onto Time for each chick",
xlab = "Time(day)",
ylab = "weight",
panel = mypanel
)
dev.off()png
2
knitr::include_graphics("regression.png")#不知道怎麼讓Chick照順序排列#寫一個回歸的function
#lapply(list apply)的函數簡化原本要寫的函數,這個函數會把data frame當作一個列表來執行函數lapply的寫法是lapply(data frame, 函數)
function(x)
lm(weight ~ Time, data = x)function(x)
lm(weight ~ Time, data = x)
Convert the script in the NCEA 2007 example into a rmarkdown file and provide comments to each code chunk indicated by ‘##’. Give alternative code to perform the same calculation where appropriate.
#讀資料
dta2 <- read.table("D://NCEA2007.txt", sep=":", quote="", h=T, as.is=T)
str(dta2) #88 obs. of 4 variables'data.frame': 88 obs. of 4 variables:
$ Name : chr "Al-Madinah School" "Alfriston College" "Ambury Park Centre for Riding Therapy" "Aorere College" ...
$ Level1: num 61.5 53.9 33.3 39.5 71.2 22.1 50.8 57.3 89.3 59.8 ...
$ Level2: num 75 44.1 20 50.2 78.9 30.8 34.8 49.8 89.7 65.7 ...
$ Level3: num 0 0 0 30.6 55.5 26.3 48.9 44.6 88.6 50.4 ...
head(dta2) Name Level1 Level2 Level3
1 Al-Madinah School 61.5 75.0 0.0
2 Alfriston College 53.9 44.1 0.0
3 Ambury Park Centre for Riding Therapy 33.3 20.0 0.0
4 Aorere College 39.5 50.2 30.6
5 Auckland Girls' Grammar School 71.2 78.9 55.5
6 Auckland Grammar 22.1 30.8 26.3
#維度長度 dim()
dim(dta2)[1] 88 4
#row 88 x column 4#apply(X, MARGIN, FUN)
#X: 矩陣或資料
#MARGIN: 1,對row應用;2,對column應用;both,對行列應用
#FUN:函數
apply(dta2[,2:4], MARGIN=2, FUN=mean) #只用column2~4 Level1 Level2 Level3
62.26705 61.06818 47.97614
a1<-apply(dta2[, -1], MARGIN=2, FUN=mean) #扣掉column1
a1#算出column 2~4的mean Level1 Level2 Level3
62.26705 61.06818 47.97614
#list apply:lapply(X, FUN)
#X: 列表或矩陣
#FUN: 函數
#輸出:列表
a2<-lapply(dta2[, -1], FUN=mean)
a2#算出column 2~4的mean,列成list$Level1
[1] 62.26705
$Level2
[1] 61.06818
$Level3
[1] 47.97614
## simplify the list apply:sapply(X, FUN, …, simplify = TRUE, USE.NAMES = TRUE)
#X: 列表或矩陣
#FUN: 函數
#輸出:向量、矩陣、列表
a3<-sapply(dta2[, -1], FUN=mean)
a3 Level1 Level2 Level3
62.26705 61.06818 47.97614
#比較apply跟lapply跟sapply
class(a1) #apply的結果是數值[1] "numeric"
str(a1) #一串數值 Named num [1:3] 62.3 61.1 48
- attr(*, "names")= chr [1:3] "Level1" "Level2" "Level3"
class(a2) #lapply的結果是list[1] "list"
str(a2) #List of 3List of 3
$ Level1: num 62.3
$ Level2: num 61.1
$ Level3: num 48
class(a3) #sapply應該是簡化版的lapply但目前的結果看起來跟apply一樣[1] "numeric"
str(a3) #一串數值 Named num [1:3] 62.3 61.1 48
- attr(*, "names")= chr [1:3] "Level1" "Level2" "Level3"
#把function改成range
r1<-apply(dta2[, -1], MARGIN=2, FUN=range)
r1 Level1 Level2 Level3
[1,] 2.8 0.0 0.0
[2,] 97.4 95.7 95.7
r2<-lapply(dta2[, -1], FUN=range)
r2$Level1
[1] 2.8 97.4
$Level2
[1] 0.0 95.7
$Level3
[1] 0.0 95.7
r3<-sapply(dta2[, -1], FUN=range)
r3 Level1 Level2 Level3
[1,] 2.8 0.0 0.0
[2,] 97.4 95.7 95.7
#比較apply跟lapply跟sapply
class(r1) #apply的結果是"matrix" "array" [1] "matrix" "array"
str(r1) #2x3的matrix num [1:2, 1:3] 2.8 97.4 0 95.7 0 95.7
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:3] "Level1" "Level2" "Level3"
dim(r1) #2x3的matrix[1] 2 3
class(r2) #lapply的結果依然是list[1] "list"
str(r2) #List of 3List of 3
$ Level1: num [1:2] 2.8 97.4
$ Level2: num [1:2] 0 95.7
$ Level3: num [1:2] 0 95.7
dim(r2) #list沒有dimensionNULL
class(r3) #結果看起來跟apply一樣[1] "matrix" "array"
str(r3) num [1:2, 1:3] 2.8 97.4 0 95.7 0 95.7
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:3] "Level1" "Level2" "Level3"
dim(r3)[1] 2 3
#換了一個資料
#splitting
dta <- read.csv("D://nzSchools.csv", as.is=2)
rollsByAuth <- split(dta$Roll, dta$Auth) #依照Roll、Auth分類存成物件rollsByAuth
str(rollsByAuth) #是listList of 4
$ Other : int 51
$ Private : int [1:99] 255 39 154 73 83 25 95 85 94 729 ...
$ State : int [1:2144] 318 200 455 86 577 329 637 395 201 267 ...
$ State Integrated: int [1:327] 438 26 191 560 151 114 126 171 211 57 ...
class(rollsByAuth) #是list[1] "list"
#依據Roll和Auth的分類算mean
s1<-lapply(split(dta$Roll, dta$Auth), mean)
s1$Other
[1] 51
$Private
[1] 308.798
$State
[1] 300.6301
$`State Integrated`
[1] 258.3792
s2<-sapply(split(dta$Roll, dta$Auth), mean)
s2 Other Private State State Integrated
51.0000 308.7980 300.6301 258.3792
#比較lapply跟sapply
class(s1) #lapply的結果是list [1] "list"
str(s1) #List of 4List of 4
$ Other : num 51
$ Private : num 309
$ State : num 301
$ State Integrated: num 258
dim(s1) #list沒有dimensionNULL
class(s2) #sapply的結果是數值[1] "numeric"
str(s2) #1x4的數值 Named num [1:4] 51 309 301 258
- attr(*, "names")= chr [1:4] "Other" "Private" "State" "State Integrated"
dim(s2) #只有一行數值,沒有dimensionNULL
The following R script uses Cushings{MASS} to demonstrates several ways to achieve the same objective in R. Explain the advantages or disadvantages of each method.
library(MASS)
library(dplyr)
head(Cushings) Tetrahydrocortisone Pregnanetriol Type
a1 3.1 11.70 a
a2 3.0 1.30 a
a3 1.9 0.10 a
a4 3.8 0.04 a
a5 4.1 1.10 a
a6 1.9 0.40 a
tail(Cushings) Tetrahydrocortisone Pregnanetriol Type
u1 5.1 0.4 u
u2 12.9 5.0 u
u3 13.0 0.8 u
u4 2.6 0.1 u
u5 30.0 0.1 u
u6 20.5 0.8 u
str(Cushings) #27 obs. of 3 variables'data.frame': 27 obs. of 3 variables:
$ Tetrahydrocortisone: num 3.1 3 1.9 3.8 4.1 1.9 8.3 3.8 3.9 7.8 ...
$ Pregnanetriol : num 11.7 1.3 0.1 0.04 1.1 0.4 1 0.2 0.6 1.2 ...
$ Type : Factor w/ 4 levels "a","b","c","u": 1 1 1 1 1 1 2 2 2 2 ...
table(Cushings$Type)
a b c u
6 10 5 6
# method 1 aggregate {stats}
#把資料分組聚合,然後對聚合以後的資料進行加和、求平均等各種操作
#針對data Cushings以Type分組,計算其他變項的mean
m1<-aggregate( . ~ Type, data = Cushings, mean)
m1 Type Tetrahydrocortisone Pregnanetriol
1 a 2.966667 2.44
2 b 8.180000 1.12
3 c 19.720000 5.50
4 u 14.016667 1.20
class(m1) #data.frame[1] "data.frame"
str(m1) #4 obs. of 3 variables'data.frame': 4 obs. of 3 variables:
$ Type : Factor w/ 4 levels "a","b","c","u": 1 2 3 4
$ Tetrahydrocortisone: num 2.97 8.18 19.72 14.02
$ Pregnanetriol : num 2.44 1.12 5.5 1.2
###這是最直觀的方法,對於不會寫function的人...比較容易操作
###可變性較低,若資料多樣,就要逐一修改# method 2 sapply+function(x)
#split切割資料
# 依據Cushings$Type的定義切割資料Cushings[,-3]
a<-split(Cushings[,-3], Cushings$Type)
a$a
Tetrahydrocortisone Pregnanetriol
a1 3.1 11.70
a2 3.0 1.30
a3 1.9 0.10
a4 3.8 0.04
a5 4.1 1.10
a6 1.9 0.40
$b
Tetrahydrocortisone Pregnanetriol
b1 8.3 1.0
b2 3.8 0.2
b3 3.9 0.6
b4 7.8 1.2
b5 9.1 0.6
b6 15.4 3.6
b7 7.7 1.6
b8 6.5 0.4
b9 5.7 0.4
b10 13.6 1.6
$c
Tetrahydrocortisone Pregnanetriol
c1 10.2 6.4
c2 9.2 7.9
c3 9.6 3.1
c4 53.8 2.5
c5 15.8 7.6
$u
Tetrahydrocortisone Pregnanetriol
u1 5.1 0.4
u2 12.9 5.0
u3 13.0 0.8
u4 2.6 0.1
u5 30.0 0.1
u6 20.5 0.8
class(a)[1] "list"
## sapply(X, FUN, …, simplify = TRUE, USE.NAMES = TRUE)
#X: 列表或矩陣
#FUN: 函數
#輸出:向量、矩陣、列表
#這邊是針對split後的結果(a list)進行apply的function
m2<-sapply(split(Cushings[,-3], Cushings$Type),
#apply(X, MARGIN, FUN)
#X: 矩陣或資料
#MARGIN: 1,對row應用;2,對column應用;both,對行列應用
#FUN:函數
#其中x是Cushings$Type的分類,所以會產出四個資料
function(x) apply(x, 2, mean))
m2 a b c u
Tetrahydrocortisone 2.966667 8.18 19.72 14.01667
Pregnanetriol 2.440000 1.12 5.50 1.20000
class(m2) #"matrix" "array" [1] "matrix" "array"
str(m2) #List of 2 num [1:2, 1:4] 2.97 2.44 8.18 1.12 19.72 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:2] "Tetrahydrocortisone" "Pregnanetriol"
..$ : chr [1:4] "a" "b" "c" "u"
###function的寫法不難理解,但要我自己寫,我可能也是寫不出來
###若要計算別的函數,可以直接修改# method 3
# do.call 作為函數的傳回值
# rbind透過row合併
# 把Cushings列出$Type的資料當成list進行rbind,並以fun計算mean
m3<-do.call("rbind", as.list(
by(Cushings, list(Cushings$Type),
function(x) {
#不選Type此欄列入子集
y <- subset(x, select = -Type, )
apply(y, 2, mean)
}
)))
m3 Tetrahydrocortisone Pregnanetriol
a 2.966667 2.44
b 8.180000 1.12
c 19.720000 5.50
u 14.016667 1.20
class(m3) #"matrix" "array" [1] "matrix" "array"
str(m3) #List of 2 num [1:4, 1:2] 2.97 8.18 19.72 14.02 2.44 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:4] "a" "b" "c" "u"
..$ : chr [1:2] "Tetrahydrocortisone" "Pregnanetriol"
###因為是rbind,所以雖然是跟method 2一樣是list of 2,但為1:4、1:2的list組成的matrixlibrary(tidyr) #nest
library(purrr) #map
# method 4 %>% 運算子(pipe operator)
m4<-Cushings %>%
group_by(Type) %>% #依據Type做分組
#summarise()函式用來計算統計值
summarize(t_m = mean(Tetrahydrocortisone),
p_m = mean(Pregnanetriol))
#mean(Tetrahydrocortisone)存入t_m的column
m4# A tibble: 4 x 3
Type t_m p_m
<fct> <dbl> <dbl>
1 a 2.97 2.44
2 b 8.18 1.12
3 c 19.7 5.5
4 u 14.0 1.2
class(m4) #"tbl_df" "tbl""data.frame"[1] "tbl_df" "tbl" "data.frame"
#使用 dplyr 套件中基礎函數之後的輸出是一種叫做 tibble 的改良式資料框
#tibble 可以利用 as.data.frame() 函數轉換
str(m4) #tibble [4 x 3]tibble [4 x 3] (S3: tbl_df/tbl/data.frame)
$ Type: Factor w/ 4 levels "a","b","c","u": 1 2 3 4
$ t_m : num [1:4] 2.97 8.18 19.72 14.02
$ p_m : num [1:4] 2.44 1.12 5.5 1.2
### %>% 增加語法的可讀性,也很直觀# method 5
m5<-Cushings %>%
nest(-Type) #Nesting creates a list-column of data frames
m5# A tibble: 4 x 2
Type data
<fct> <list>
1 a <tibble [6 x 2]>
2 b <tibble [10 x 2]>
3 c <tibble [5 x 2]>
4 u <tibble [6 x 2]>
# 不包括Type把Cushings改變成一個list-column of data frames
class(m5)[1] "tbl_df" "tbl" "data.frame"
str(m5)tibble [4 x 2] (S3: tbl_df/tbl/data.frame)
$ Type: Factor w/ 4 levels "a","b","c","u": 1 2 3 4
$ data:List of 4
..$ : tibble [6 x 2] (S3: tbl_df/tbl/data.frame)
.. ..$ Tetrahydrocortisone: num [1:6] 3.1 3 1.9 3.8 4.1 1.9
.. ..$ Pregnanetriol : num [1:6] 11.7 1.3 0.1 0.04 1.1 0.4
..$ : tibble [10 x 2] (S3: tbl_df/tbl/data.frame)
.. ..$ Tetrahydrocortisone: num [1:10] 8.3 3.8 3.9 7.8 9.1 15.4 7.7 6.5 5.7 13.6
.. ..$ Pregnanetriol : num [1:10] 1 0.2 0.6 1.2 0.6 3.6 1.6 0.4 0.4 1.6
..$ : tibble [5 x 2] (S3: tbl_df/tbl/data.frame)
.. ..$ Tetrahydrocortisone: num [1:5] 10.2 9.2 9.6 53.8 15.8
.. ..$ Pregnanetriol : num [1:5] 6.4 7.9 3.1 2.5 7.6
..$ : tibble [6 x 2] (S3: tbl_df/tbl/data.frame)
.. ..$ Tetrahydrocortisone: num [1:6] 5.1 12.9 13 2.6 30 20.5
.. ..$ Pregnanetriol : num [1:6] 0.4 5 0.8 0.1 0.1 0.8
### 不太清楚nest的功能有甚麼資料處理的重要性m5.1<-Cushings %>%
nest(-Type) %>%
#使用mutate()增加新欄位
#map()函數家族的使用概念為map(需逐一計算的向量,計算所需的函數)
mutate(avg = map(data, ~ apply(., 2, mean)),
res_1 = map_dbl(avg, "Tetrahydrocortisone"),
res_2 = map_dbl(avg, "Pregnanetriol"))
m5.1# A tibble: 4 x 5
Type data avg res_1 res_2
<fct> <list> <list> <dbl> <dbl>
1 a <tibble [6 x 2]> <dbl [2]> 2.97 2.44
2 b <tibble [10 x 2]> <dbl [2]> 8.18 1.12
3 c <tibble [5 x 2]> <dbl [2]> 19.7 5.5
4 u <tibble [6 x 2]> <dbl [2]> 14.0 1.2
class(m5.1) #"tbl_df" "tbl""data.frame"[1] "tbl_df" "tbl" "data.frame"
#使用 dplyr 套件中基礎函數之後的輸出是一種叫做 tibble 的改良式資料框
#tibble 可以利用 as.data.frame() 函數轉換
str(m5.1) #tibble [4 x 5]tibble [4 x 5] (S3: tbl_df/tbl/data.frame)
$ Type : Factor w/ 4 levels "a","b","c","u": 1 2 3 4
$ data :List of 4
..$ : tibble [6 x 2] (S3: tbl_df/tbl/data.frame)
.. ..$ Tetrahydrocortisone: num [1:6] 3.1 3 1.9 3.8 4.1 1.9
.. ..$ Pregnanetriol : num [1:6] 11.7 1.3 0.1 0.04 1.1 0.4
..$ : tibble [10 x 2] (S3: tbl_df/tbl/data.frame)
.. ..$ Tetrahydrocortisone: num [1:10] 8.3 3.8 3.9 7.8 9.1 15.4 7.7 6.5 5.7 13.6
.. ..$ Pregnanetriol : num [1:10] 1 0.2 0.6 1.2 0.6 3.6 1.6 0.4 0.4 1.6
..$ : tibble [5 x 2] (S3: tbl_df/tbl/data.frame)
.. ..$ Tetrahydrocortisone: num [1:5] 10.2 9.2 9.6 53.8 15.8
.. ..$ Pregnanetriol : num [1:5] 6.4 7.9 3.1 2.5 7.6
..$ : tibble [6 x 2] (S3: tbl_df/tbl/data.frame)
.. ..$ Tetrahydrocortisone: num [1:6] 5.1 12.9 13 2.6 30 20.5
.. ..$ Pregnanetriol : num [1:6] 0.4 5 0.8 0.1 0.1 0.8
$ avg :List of 4
..$ : Named num [1:2] 2.97 2.44
.. ..- attr(*, "names")= chr [1:2] "Tetrahydrocortisone" "Pregnanetriol"
..$ : Named num [1:2] 8.18 1.12
.. ..- attr(*, "names")= chr [1:2] "Tetrahydrocortisone" "Pregnanetriol"
..$ : Named num [1:2] 19.7 5.5
.. ..- attr(*, "names")= chr [1:2] "Tetrahydrocortisone" "Pregnanetriol"
..$ : Named num [1:2] 14 1.2
.. ..- attr(*, "names")= chr [1:2] "Tetrahydrocortisone" "Pregnanetriol"
$ res_1: num [1:4] 2.97 8.18 19.72 14.02
$ res_2: num [1:4] 2.44 1.12 5.5 1.2
### map可以解決for迴圈速度很慢且程式碼很長的問題
### 語法結構覺得很複雜,我不會這樣做(是真的不理解的那種不會)Exercise 1心得
條條道路通羅馬,但我在道路中迷路了…
對於初學者,我會選擇Method 1或Method 4
Use the data in the high schools example to solve the following problems: (a) test if any pairs of the five variables: read, write, math, science, and socst, are different in means. (b) test if the 4 different ethnic groups have the same mean scores for each of the 5 variables (individually): read, write, math, science, and socst. (c) Perform all pairwise simple regressions for these variables: read, write, math, science, and socst.
# 讀資料
dta<-read.table("D://hs0.txt", sep="\t", header=T )
str(dta) #'data.frame': 200 obs. of 11 variables'data.frame': 200 obs. of 11 variables:
$ id : int 70 121 86 141 172 113 50 11 84 48 ...
$ female : chr "male" "female" "male" "male" ...
$ race : chr "white" "white" "white" "white" ...
$ ses : chr "low" "middle" "high" "high" ...
$ schtyp : chr "public" "public" "public" "public" ...
$ prog : chr "general" "vocation" "general" "vocation" ...
$ read : int 57 68 44 63 47 44 50 34 63 57 ...
$ write : int 52 59 33 44 52 52 59 46 57 55 ...
$ math : int 41 53 54 47 57 51 42 45 54 52 ...
$ science: int 47 63 58 53 53 63 53 39 58 NA ...
$ socst : int 57 61 31 56 61 61 61 36 51 51 ...
head(dta) id female race ses schtyp prog read write math science socst
1 70 male white low public general 57 52 41 47 57
2 121 female white middle public vocation 68 59 53 63 61
3 86 male white high public general 44 33 54 58 31
4 141 male white high public vocation 63 44 47 53 56
5 172 male white middle public academic 47 52 57 53 61
6 113 male white middle public academic 44 52 51 63 61
table(dta$race)
african-amer asian hispanic white
20 11 24 145
table(dta$ses)
high low middle
58 47 95
table(dta$schtyp)
private public
32 168
table(dta$prog)
academic general vocation
105 45 50
#Column 1: Student ID
#Column 2: Gender, M = Male, F = Female
#Column 3: Race
#Column 4: Socioeconomic status
#Column 5: School type
#Column 6: Program type
#Column 7: Reading score
#Column 8: Writing score
#Column 9: Math score
#Column 10: Science score
#Column 11: Social science studies score#test if any pairs of the five variables: read, write, math, science, and socst, are different in means.
#t.test(x,y,paired = T)
t.test(dta$read, dta$write,paired = T )
Paired t-test
data: dta$read and dta$write
t = -0.86731, df = 199, p-value = 0.3868
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-1.7841424 0.6941424
sample estimates:
mean of the differences
-0.545
t.test(dta$read, dta$math,paired = T )
Paired t-test
data: dta$read and dta$math
t = -0.72428, df = 199, p-value = 0.4697
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-1.5448905 0.7148905
sample estimates:
mean of the differences
-0.415
t.test(dta$read, dta$science,paired = T )
Paired t-test
data: dta$read and dta$science
t = 0.31406, df = 194, p-value = 0.7538
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-1.028890 1.418634
sample estimates:
mean of the differences
0.1948718
t.test(dta$read, dta$socst,paired = T )
Paired t-test
data: dta$read and dta$socst
t = -0.27074, df = 199, p-value = 0.7869
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-1.449639 1.099639
sample estimates:
mean of the differences
-0.175
t.test(dta$write, dta$math,paired = T )
Paired t-test
data: dta$write and dta$math
t = 0.22303, df = 199, p-value = 0.8237
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-1.01944 1.27944
sample estimates:
mean of the differences
0.13
t.test(dta$write, dta$science,paired = T )
Paired t-test
data: dta$write and dta$science
t = 1.1867, df = 194, p-value = 0.2368
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.5057821 2.0339872
sample estimates:
mean of the differences
0.7641026
t.test(dta$write, dta$socst,paired = T )
Paired t-test
data: dta$write and dta$socst
t = 0.5778, df = 199, p-value = 0.5641
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.8927696 1.6327696
sample estimates:
mean of the differences
0.37
t.test(dta$math, dta$science,paired = T )
Paired t-test
data: dta$math and dta$science
t = 1.0799, df = 194, p-value = 0.2815
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.5296709 1.8117222
sample estimates:
mean of the differences
0.6410256
t.test(dta$math, dta$socst,paired = T )
Paired t-test
data: dta$math and dta$socst
t = 0.35101, df = 199, p-value = 0.726
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-1.108304 1.588304
sample estimates:
mean of the differences
0.24
t.test(dta$science, dta$socst,paired = T )
Paired t-test
data: dta$science and dta$socst
t = -0.50692, df = 194, p-value = 0.6128
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-1.906119 1.126632
sample estimates:
mean of the differences
-0.3897436
# 最傻的方法,一個一個做...a<-do.call(“rbind”, as.list( by(dta, list(c(,7:11) ))) a class(a) #“matrix” “array” str(a) #List of 2
nest(-id,-race, -ses,-schtyp,-prog)%>% mutate(avg = map(data, ~ apply(., 2, mean)), score_r = map_dbl(avg, “read”), score_w = map_dbl(avg, “write”), score_m = map_dbl(avg, “math”), score_sci = map_dbl(avg, “science”), score_sc = map_dbl(avg, “socst”)) a
dta%>% group_by(variable) %>% #依據Type做分組 #summarise()函式用來計算統計值 summarize(t_m = mean(Tetrahydrocortisone), p_m = mean(Pregnanetriol)) #mean(Tetrahydrocortisone)存入t_m的column m4
The formula P = L (r/(1-(1+r)^(-M)) describes the payment you have to make per month for M number of months if you take out a loan of L amount today at a monthly interest rate of r. Compute how much you will have to pay per month for 10, 15, 20, 25, or 30 years if you borrow NT$5,000,000, 10,000,000, or 15,000,000 from a bank that charges you 2%, 5%, or 7% for the monthly interest rate. 如果您今天以 r 的月利率貸款 L 金額,您必須在 M 個月內每月支付的款項。
#名字<-function(參數1,參數2,...){
#程式碼本體
#最後一行的輸出自動設為回傳值
#}
amount <- function(L,r,M){
P<-L*(r/(1-(1+r)^(-M)))
return(P)
}
#r:利率
#L:借款金額
#M:M個月內繳清款項
#借貸5,000,000元,利率為2%,在10年內還完(10*12=120月)
amount(5*10^6,2/100, 10*12)[1] 110240.5
#rep(x, times = 1, length.out = NA, each = 1)
#x 使用者想要相同重複元素的數值向量.
#times 數值向量 x 重複的次數.
#each 數值向量 x 的每個元素重複的次數.
#length.out = NA 數值向量 x 重複後的長度.
#r:利率
#L:借款金額
#M:M個月內繳清款項
loan<-data.frame(
L=rep(c(5*10^6, 10*10^6, 15*10^6), each=15),
r=rep(c(2/100, 5/100, 7/100)),
M=rep(c(10*12, 15*12, 20*12, 25*12, 30*12))) %>%
mutate(money=amount(L,r,M))
loan L r M money
1 5.0e+06 0.02 120 110240.5
2 5.0e+06 0.05 180 250038.4
3 5.0e+06 0.07 240 350000.0
4 5.0e+06 0.02 300 100263.7
5 5.0e+06 0.05 360 250000.0
6 5.0e+06 0.07 120 350104.3
7 5.0e+06 0.02 180 102913.7
8 5.0e+06 0.05 240 250002.1
9 5.0e+06 0.07 300 350000.0
10 5.0e+06 0.02 360 100080.2
11 5.0e+06 0.05 120 250718.6
12 5.0e+06 0.07 180 350001.8
13 5.0e+06 0.02 240 100870.4
14 5.0e+06 0.05 300 250000.1
15 5.0e+06 0.07 360 350000.0
16 1.0e+07 0.02 120 220481.0
17 1.0e+07 0.05 180 500076.7
18 1.0e+07 0.07 240 700000.1
19 1.0e+07 0.02 300 200527.4
20 1.0e+07 0.05 360 500000.0
21 1.0e+07 0.07 120 700208.5
22 1.0e+07 0.02 180 205827.4
23 1.0e+07 0.05 240 500004.1
24 1.0e+07 0.07 300 700000.0
25 1.0e+07 0.02 360 200160.4
26 1.0e+07 0.05 120 501437.1
27 1.0e+07 0.07 180 700003.6
28 1.0e+07 0.02 240 201740.8
29 1.0e+07 0.05 300 500000.2
30 1.0e+07 0.07 360 700000.0
31 1.5e+07 0.02 120 330721.5
32 1.5e+07 0.05 180 750115.1
33 1.5e+07 0.07 240 1050000.1
34 1.5e+07 0.02 300 300791.1
35 1.5e+07 0.05 360 750000.0
36 1.5e+07 0.07 120 1050312.8
37 1.5e+07 0.02 180 308741.0
38 1.5e+07 0.05 240 750006.2
39 1.5e+07 0.07 300 1050000.0
40 1.5e+07 0.02 360 300240.7
41 1.5e+07 0.05 120 752155.7
42 1.5e+07 0.07 180 1050005.4
43 1.5e+07 0.02 240 302611.2
44 1.5e+07 0.05 300 750000.3
45 1.5e+07 0.07 360 1050000.0
str(loan)'data.frame': 45 obs. of 4 variables:
$ L : num 5e+06 5e+06 5e+06 5e+06 5e+06 5e+06 5e+06 5e+06 5e+06 5e+06 ...
$ r : num 0.02 0.05 0.07 0.02 0.05 0.07 0.02 0.05 0.07 0.02 ...
$ M : num 120 180 240 300 360 120 180 240 300 360 ...
$ money: num 110240 250038 350000 100264 250000 ...
head(loan) L r M money
1 5e+06 0.02 120 110240.5
2 5e+06 0.05 180 250038.4
3 5e+06 0.07 240 350000.0
4 5e+06 0.02 300 100263.7
5 5e+06 0.05 360 250000.0
6 5e+06 0.07 120 350104.3
###想畫個圖,但沒成功
names(loan)[c(1:4)] <- c("L", "r", "m", "money")
head(loan) L r m money
1 5e+06 0.02 120 110240.5
2 5e+06 0.05 180 250038.4
3 5e+06 0.07 240 350000.0
4 5e+06 0.02 300 100263.7
5 5e+06 0.05 360 250000.0
6 5e+06 0.07 120 350104.3
barplot(with(loan,
xtabs(L ~m +money)),
main="pay per month for loan",
ylab="money",
xlab="loan",
beside=T)loan%>%
ggplot( ) +
aes(x=money,
y=reorder(L,r)) +
geom_point(size=rel(3))+
scale_color_manual(values=c('lightblue',
'skyblue',
'steelblue',
'navyblue'),
guide=guide_legend(title=NULL))+
labs(x="",
y="") Modify this R script to create a function to compute the c-statistic illustrated with the data set in the article: Tryon, W.W. (1984). A simplified time-series analysis for evaluating treatment interventions. Journal of Applied Behavioral Analysis, 34(4), 230-233.
# c-stat example
# read in data
dta <- read.table("D://cstat.txt", header=T)
str(dta) #42 obs. of 1 variable'data.frame': 42 obs. of 1 variable:
$ nc: int 28 46 39 45 24 20 35 37 36 40 ...
head(dta) nc
1 28
2 46
3 39
4 45
5 24
6 20
plot(dta) #完全畫不出甚麼東西來plot(dta$nc) #畫出來沒有順序dta$order <- c(1:42) #要先給一個順序
head(dta) nc order
1 28 1
2 46 2
3 39 3
4 45 4
5 24 5
6 20 6
plot(x=dta$order, # X軸的值
y=dta$nc, # Y軸的值
xlab="Observations", # X軸名稱
ylab="number of children", # Y軸名稱
lines(1:42, dta[,1]) ) # plot figure 1
#plot(dta, xlab="Observations", ylab="Number of Children")
plot(1:42, dta[,1], xlab="Observations", ylab="Number of Children")
lines(1:42, dta[,1]) #把1~41連起來
abline(v=10, lty=2) #abline()函數給當前繪圖添加一條或多條直線
abline(v=32, lty=2)
#1~10做一個區段線
segments(1, mean(dta[1:10,1]),10, mean(dta[1:10,1]),col="red")
#11~32做一個區段線
segments(11, mean(dta[11:32,1]),32, mean(dta[11:32,1]),col="red")
#33~42做一個區段線
segments(33, mean(dta[33:42,1]),42, mean(dta[33:42,1]),col="red")# calculate c-stat for first baseline phase
cden <- 1-(sum(diff(dta[1:10,1])^2)/(2*(10-1)*var(dta[1:10,1])))
cden[1] 0.1601208
sc <- sqrt((10-2)/((10-1)*(10+1)))
sc[1] 0.2842676
pval <- 1-pnorm(cden/sc)
pval[1] 0.2866238
# calculate c-stat for first baseline plus group tokens
n <- 32
cden <- 1-(sum(diff(dta[1:n,1])^2)/(2*(n-1)*var(dta[1:n,1])))
sc <- sqrt((n-2)/((n-1)*(n+1)))
pval <- 1-pnorm(cden/sc)
list(z=cden/sc,pvalue=pval)$z
[1] 3.879054
$pvalue
[1] 5.243167e-05