Inclass exercise

library(ggplot2)
library(lattice)
library(tidyverse)
library(dplyr)

In-class 1

Explain what does this statement do: lapply(lapply(search(), ls), length)

search()

 [1] ".GlobalEnv"        "package:forcats"   "package:stringr"  
 [4] "package:dplyr"     "package:purrr"     "package:readr"    
 [7] "package:tidyr"     "package:tibble"    "package:tidyverse"
[10] "package:lattice"   "package:ggplot2"   "package:stats"    
[13] "package:graphics"  "package:grDevices" "package:utils"    
[16] "package:datasets"  "package:methods"   "Autoloads"        
[19] "package:base"

#若想查詢目前所有被載入的套件列表，可以使用 search函數列出所有的搜尋路徑
#這個輸出中列出了 R 在搜尋變數的時候，會搜尋到的環境空間與順序，全域環境空間（.GlobalEnv）一定是排在第一位，接著就是最近載入的一些套件，R 的基本核心函數則是放在最後面。

# lapply(search(), ls) 
# ls()列出物件名稱
# lapply(search(), ls)循環列處目前所有套件的物件名稱

lapply(lapply(search(), ls), length)

[[1]]
[1] 0

[[2]]
[1] 37

[[3]]
[1] 52

[[4]]
[1] 290

[[5]]
[1] 178

[[6]]
[1] 115

[[7]]
[1] 77

[[8]]
[1] 44

[[9]]
[1] 6

[[10]]
[1] 151

[[11]]
[1] 531

[[12]]
[1] 449

[[13]]
[1] 87

[[14]]
[1] 113

[[15]]
[1] 247

[[16]]
[1] 104

[[17]]
[1] 203

[[18]]
[1] 0

[[19]]
[1] 1255

# length 函數來檢查向量的長度

ls(.GlobalEnv)

character(0)

length(ls(.GlobalEnv))

[1] 0

length(.GlobalEnv)

[1] 0

結論

lapply(lapply(search(), ls), length)

1.search結果有10個載入的套件

2.內層lapply(, ls)找出來10個套件的list

3.外層lapply(, length)數出所有套件其內含物件名稱的個數

#發現knit出來和rstudio的結果不一致?原因?
knitr::include_graphics("lapply knit.png")

ls(.GlobalEnv)在knit後應該會是0。如果在本機處執行的結果則會取決於你當下Environment 存了哪些東西，按掃把清完Environment後，再執行一次就會是0。

In-class 2

Convert the R script in the NZ schools example into a rmarkdown file and provide comments to each code chunk indicated by ‘##’. Give alternative code to perform the same calculation where appropriate.

## keep the school names with white spaces
dta <- read.csv("D://nzSchools.csv", as.is=2)
str(dta) #2571 obs. of  6 variables

'data.frame':   2571 obs. of  6 variables:
 $ ID  : int  1015 1052 1062 1092 1130 1018 1029 1030 1588 1154 ...
 $ Name: chr  "Hora Hora School" "Morningside School" "Onerahi School" "Raurimu Avenue School" ...
 $ City: Factor w/ 541 levels "Ahaura","Ahipara",..: 533 533 533 533 533 533 533 533 533 533 ...
 $ Auth: Factor w/ 4 levels "Other","Private",..: 3 3 3 3 3 3 3 3 4 3 ...
 $ Dec : int  2 3 4 2 4 8 5 5 6 1 ...
 $ Roll: int  318 200 455 86 577 329 637 395 438 201 ...

dim(dta) #各維度長度 dim()，先row後column

[1] 2571    6

head(dta)

    ID                  Name      City  Auth Dec Roll
1 1015      Hora Hora School Whangarei State   2  318
2 1052    Morningside School Whangarei State   3  200
3 1062        Onerahi School Whangarei State   4  455
4 1092 Raurimu Avenue School Whangarei State   2   86
5 1130      Whangarei School Whangarei State   4  577
6 1018       Hurupaki School Whangarei State   8  329

#先知道Roll的median是193
median(dta$Roll)

[1] 193

#列舉Roll的前6筆
head(dta$Roll)

[1] 318 200 455  86 577 329

#如果Roll的各筆>Roll的median，標示Large，否則標示Small
#並把資料存到dta$Size這個新的column
dta$Size <- ifelse(dta$Roll > median(dta$Roll), "Large", "Small")
#列舉Size前6筆，對照一下ifelse邏輯是一致的
head(dta$Size)

[1] "Large" "Large" "Large" "Small" "Large" "Large"

#確認資料中有Size這個column
str(dta)  #2571 obs. of  7 variables

'data.frame':   2571 obs. of  7 variables:
 $ ID  : int  1015 1052 1062 1092 1130 1018 1029 1030 1588 1154 ...
 $ Name: chr  "Hora Hora School" "Morningside School" "Onerahi School" "Raurimu Avenue School" ...
 $ City: Factor w/ 541 levels "Ahaura","Ahipara",..: 533 533 533 533 533 533 533 533 533 533 ...
 $ Auth: Factor w/ 4 levels "Other","Private",..: 3 3 3 3 3 3 3 3 4 3 ...
 $ Dec : int  2 3 4 2 4 8 5 5 6 1 ...
 $ Roll: int  318 200 455 86 577 329 637 395 438 201 ...
 $ Size: chr  "Large" "Large" "Large" "Small" ...

head(dta)

    ID                  Name      City  Auth Dec Roll  Size
1 1015      Hora Hora School Whangarei State   2  318 Large
2 1052    Morningside School Whangarei State   3  200 Large
3 1062        Onerahi School Whangarei State   4  455 Large
4 1092 Raurimu Avenue School Whangarei State   2   86 Small
5 1130      Whangarei School Whangarei State   4  577 Large
6 1018       Hurupaki School Whangarei State   8  329 Large

#把NULL存入dta$Size
dta$Size <- NULL
#確認存入之後真的NULL
dta$Size

NULL

str(dta)  #2571 obs. of  6 variables

'data.frame':   2571 obs. of  6 variables:
 $ ID  : int  1015 1052 1062 1092 1130 1018 1029 1030 1588 1154 ...
 $ Name: chr  "Hora Hora School" "Morningside School" "Onerahi School" "Raurimu Avenue School" ...
 $ City: Factor w/ 541 levels "Ahaura","Ahipara",..: 533 533 533 533 533 533 533 533 533 533 ...
 $ Auth: Factor w/ 4 levels "Other","Private",..: 3 3 3 3 3 3 3 3 4 3 ...
 $ Dec : int  2 3 4 2 4 8 5 5 6 1 ...
 $ Roll: int  318 200 455 86 577 329 637 395 438 201 ...

#dta$Size <- NULL  後，Size這個Column就消失了
head(dta)

    ID                  Name      City  Auth Dec Roll
1 1015      Hora Hora School Whangarei State   2  318
2 1052    Morningside School Whangarei State   3  200
3 1062        Onerahi School Whangarei State   4  455
4 1092 Raurimu Avenue School Whangarei State   2   86
5 1130      Whangarei School Whangarei State   4  577
6 1018       Hurupaki School Whangarei State   8  329

# cut()cut divides the range of x into intervals
# cut .default(x, breaks, labels = NULL, include.lowest = FALSE, right = TRUE, dig.lab = 3)

summary(dta$Roll)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    5.0    79.0   193.0   295.5   383.5  5546.0

5546/2

[1] 2773

#把dta$Roll的範圍切成2等分，編入Small或Large
dta$Size <- cut(dta$Roll, 2, labels=c("Small", "Large"))
head(dta$Size)

[1] Small Small Small Small Small Small
Levels: Small Large

table(dta$Size)


Small Large 
 2569     2

#Large的資料只有1筆?
#畫histogram看一下dta的分布狀況，結果確實很偏態
hist(dta$Roll)

histogram(~ Roll, dta,breaks = 2,type="count")

#切成3等分
5546/3   #0-1848=Small，1848-3697=Mediam, >3697=Large

[1] 1848.667

dta$Size <- cut(dta$Roll, 3, labels=c("Small", "Mediam", "Large"))
table(dta$Size)


 Small Mediam  Large 
  2555     15      1

#???這裡有疑問
#order()類似sort()，排序
#order()若將decreasing參數設定為TRUE，則會回傳由大到小的元素位置
#依據dta$Roll島序排序給予順序後存入dta$Rollord
dta$RollOrd <- order(dta$Roll, decreasing=T)
head(dta$RollOrd)   #跟實際排序不太一致

[1] 1726  301  376 2307  615  199

dta$RollOrd [2571]

[1] 1575

這邊對於dta$Rollord的排序有疑問

head(dta[dta$RollOrd, ]) #這邊的順序有些奇怪

      ID                  Name         City  Auth Dec Roll   Size RollOrd
1726 498 Correspondence School   Wellington State  NA 5546  Large     753
301   28     Rangitoto College     Auckland State  10 3022 Mediam     353
376   78      Avondale College     Auckland State   4 2613 Mediam     712
2307 319  Burnside High School Christchurch State   8 2588 Mediam     709
615   41      Macleans College     Auckland State  10 2476 Mediam    1915
199   43    Massey High School     Auckland State   5 2452 Mediam    1683

tail(dta[dta$RollOrd, ]) #這邊的順序有些奇怪

       ID                    Name                  City    Auth Dec Roll  Size
2401 1641  Amana Christian School               Dunedin Private   9    7 Small
1590 2461       Tangimoana School              Manawatu   State   4    6 Small
1996 3598         Woodbank School              Kaikoura   State   4    6 Small
2112 3386     Jacobs River School          Jacobs River   State   5    6 Small
1514 2407     Ngamatapouri School Sth Taranaki District   State   9    5 Small
1575 2420 Papanui Junction School               Taihape   State   5    5 Small
     RollOrd
2401    2562
1590     266
1996    2478
2112    1501
1514    2377
1575    1542

head(dta[order(dta$City, dta$Roll, decreasing=T), ]) #City降序+Roll降序  列出前6筆

       ID                      Name      City  Auth Dec Roll  Size RollOrd
2548  401           Menzies College   Wyndham State   4  356 Small     859
2549 4054            Wyndham School   Wyndham State   5   94 Small    1163
1611 2742          Woodville School Woodville State   3  147 Small     726
1630 2640           Papatawa School Woodville State   7   27 Small    2273
2041 3600            Woodend School   Woodend State   9  375 Small    1401
1601  399 Central Southland College    Winton State   7  549 Small     450

tail(dta[order(dta$City, dta$Roll, decreasing=T), ])

       ID                         Name    City  Auth Dec Roll  Size RollOrd
2169 3273                Albury School  Albury State   8   30 Small    1010
2018  350           Akaroa Area School  Akaroa State   8  125 Small    1051
2023 3332           Duvauchelle School  Akaroa State   9   41 Small     749
335  1200                Ahuroa School  Ahuroa State   7   22 Small     193
99   1000               Ahipara School Ahipara State   3  241 Small    1963
2117 2105 Awahono School - Grey Valley  Ahaura State   4  119 Small     364

#City降序+Roll降序  列出最後6筆  所以是倒著列

這邊對於head(dta[dta$RollOrd, ])的排序有疑問

# counting 數dta$Auth的這個column類別樹目
table(dta$Auth)


           Other          Private            State State Integrated 
               1               99             2144              327

# table(variable_name, ...)
# 數column類別數目後存成一個table
authtbl <- table(dta$Auth);authtbl


           Other          Private            State State Integrated 
               1               99             2144              327

# 另一種寫法
au=table(dta$Auth)
au


           Other          Private            State State Integrated 
               1               99             2144              327

#兩種寫法都是table
class(authtbl)

[1] "table"

class(au)

[1] "table"

#選出資料dta$Auth就是Other的的那個row
dta[dta$Auth == "Other", ]

      ID            Name         City  Auth Dec Roll  Size RollOrd
2315 518 Kingslea School Christchurch Other   1   51 Small    1579

#xtabs()類似table的作法
#單一個~Auth，會算出dta$Auth的類別項目個數
xtabs(~ Auth , data=dta)

Auth
           Other          Private            State State Integrated 
               1               99             2144              327

#做成交叉表
xtabs(~ Auth + Dec, data=dta)

                  Dec
Auth                 1   2   3   4   5   6   7   8   9  10
  Other              1   0   0   0   0   0   0   0   0   0
  Private            0   0   2   6   2   2   6  11  12  38
  State            259 230 208 219 214 215 188 200 205 205
  State Integrated  12  22  35  28  38  34  45  45  37  31

mean(dta$Roll)  #算出這個column的mean

[1] 295.4737

mean(dta$Roll[dta$Auth == "Private"])   #算出Auth=Private的$Roll mean

[1] 308.798

dtaPrivate<-dta[dta$Auth == "Private", ] #另一種寫法，提取Auth=Private的存成另一個data.frame
mean(dtaPrivate$Roll) #再算新的data.frame $Roll mean

[1] 308.798

# aggregate 聚合資料
# 依據資料dta$Auth這個column的Roll算出mean
aggregate(dta["Roll"], by=list(dta$Auth), FUN=mean)

           Group.1     Roll
1            Other  51.0000
2          Private 308.7980
3            State 300.6301
4 State Integrated 258.3792

#其實也可以寫成4個語法，驗算一下
mean(dta$Roll[dta$Auth == "Other"])

[1] 51

mean(dta$Roll[dta$Auth == "Private"])

[1] 308.798

mean(dta$Roll[dta$Auth == "State"])

[1] 300.6301

mean(dta$Roll[dta$Auth == "State Integrated"])

[1] 258.3792

# 資料dta$Dec是否>5，結果存入dta$Rich
dta$Rich <- dta$Dec > 5
# 比對一下dta$Dec和dta$Rich
head(dta$Dec)

[1] 2 3 4 2 4 8

head(dta$Rich)

[1] FALSE FALSE FALSE FALSE FALSE  TRUE

#依據dta$Auth, dta$Rich聚合 Roll的mean結果
#dta$Auth, dta$Rich=4x2=8，但結果只有7是因為dta$Auth==Other只有一筆資料
aggregate(dta["Roll"], by=list(dta$Auth, dta$Rich), FUN=mean)

           Group.1 Group.2     Roll
1            Other   FALSE  51.0000
2          Private   FALSE 151.4000
3            State   FALSE 261.7487
4 State Integrated   FALSE 183.2370
5          Private    TRUE 402.5362
6            State    TRUE 338.8243
7 State Integrated    TRUE 311.2135

# 將data frame按照INDICES的factor拆分成若小區塊的data frames，在每個小區塊的data frame上运行函数FUN
#把Roll取出依據dta$Auth分成小塊，列出range
by(dta["Roll"], INDICES=list(dta$Auth), FUN=range)

: Other
[1] 51 51
------------------------------------------------------------ 
: Private
[1]    7 1663
------------------------------------------------------------ 
: State
[1]    5 5546
------------------------------------------------------------ 
: State Integrated
[1]   18 1475

In-class 3

Split the ChickWeight{datasets} data by individual chicks to extract separate slope estimates of regressing weight onto Time for each chick.

dta<-ChickWeight
str(dta)

Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame':  578 obs. of  4 variables:
 $ weight: num  42 51 59 64 76 93 106 125 149 171 ...
 $ Time  : num  0 2 4 6 8 10 12 14 16 18 ...
 $ Chick : Ord.factor w/ 50 levels "18"<"16"<"15"<..: 15 15 15 15 15 15 15 15 15 15 ...
 $ Diet  : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
 - attr(*, "formula")=Class 'formula'  language weight ~ Time | Chick
  .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
 - attr(*, "outer")=Class 'formula'  language ~Diet
  .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
 - attr(*, "labels")=List of 2
  ..$ x: chr "Time"
  ..$ y: chr "Body weight"
 - attr(*, "units")=List of 2
  ..$ x: chr "(days)"
  ..$ y: chr "(gm)"

head(dta)

  weight Time Chick Diet
1     42    0     1    1
2     51    2     1    1
3     59    4     1    1
4     64    6     1    1
5     76    8     1    1
6     93   10     1    1

#依照建議，圖排序的問題，需改變Chick的level順序
# Chick level重新排序
dta$Chick <- factor(as.numeric(as.character(dta$Chick)))
levels(dta$Chick)

 [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
[16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
[31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
[46] "46" "47" "48" "49" "50"

mypanel <- function(x, y){ #寫一個function畫xy plot panel.xyplot(x, y, pch=19) panel.rug(x,y) panel.grid # 添加水平和垂直的网格线 panel.lmline(x, y, col=“red”, lwd=1, lty=2) # 添加回歸線 } myplot<-xyplot(weight ~ Time | Chick, data=dta, aspect=1.5, layout = c(1, 1), main = “Regression of weight onto Time for each chick”, xlab = “Time”, ylab = “weight”, panel = mypanel )

library(lattice)
png("regression.png", width=1200, height=1200) #存成圖片
mypanel <- function(x, y){ #寫一個function畫xy plot
    panel.xyplot(x, y, pch=19)
    panel.rug(x,y)
    panel.grid # 添加水平和垂直的參考線
    panel.lmline(x, y, col="red", lwd=0.3, lty=1) # 添加回歸線
}
xyplot(weight ~ Time | Chick, data=dta,
       aspect=0.5,
       layout = c(5,10),
       main = "Regression of weight onto Time for each chick",
       xlab = "Time(day)",
       ylab = "weight",
       panel = mypanel
       )
dev.off()

png 
  2

knitr::include_graphics("regression.png")

#不知道怎麼讓Chick照順序排列

#寫一個回歸的function
#lapply(list apply)的函數簡化原本要寫的函數，這個函數會把data frame當作一個列表來執行函數lapply的寫法是lapply(data frame, 函數)
function(x) 
  lm(weight ~ Time, data = x)

function(x) 
  lm(weight ~ Time, data = x)

In-class 4

Convert the script in the NCEA 2007 example into a rmarkdown file and provide comments to each code chunk indicated by ‘##’. Give alternative code to perform the same calculation where appropriate.

#讀資料
dta2 <- read.table("D://NCEA2007.txt", sep=":", quote="", h=T, as.is=T)
str(dta2) #88 obs. of  4 variables

'data.frame':   88 obs. of  4 variables:
 $ Name  : chr  "Al-Madinah School" "Alfriston College" "Ambury Park Centre for Riding Therapy" "Aorere College" ...
 $ Level1: num  61.5 53.9 33.3 39.5 71.2 22.1 50.8 57.3 89.3 59.8 ...
 $ Level2: num  75 44.1 20 50.2 78.9 30.8 34.8 49.8 89.7 65.7 ...
 $ Level3: num  0 0 0 30.6 55.5 26.3 48.9 44.6 88.6 50.4 ...

head(dta2)

                                   Name Level1 Level2 Level3
1                     Al-Madinah School   61.5   75.0    0.0
2                     Alfriston College   53.9   44.1    0.0
3 Ambury Park Centre for Riding Therapy   33.3   20.0    0.0
4                        Aorere College   39.5   50.2   30.6
5        Auckland Girls' Grammar School   71.2   78.9   55.5
6                      Auckland Grammar   22.1   30.8   26.3

#維度長度 dim()
dim(dta2)

[1] 88  4

#row 88 x column 4

#apply(X, MARGIN, FUN)
#X: 矩陣或資料
#MARGIN: 1,對row應用;2,對column應用;both,對行列應用
#FUN:函數
apply(dta2[,2:4], MARGIN=2, FUN=mean) #只用column2~4

  Level1   Level2   Level3 
62.26705 61.06818 47.97614

a1<-apply(dta2[, -1], MARGIN=2, FUN=mean) #扣掉column1
a1#算出column 2~4的mean

  Level1   Level2   Level3 
62.26705 61.06818 47.97614

#list apply:lapply(X, FUN)
#X: 列表或矩陣
#FUN: 函數
#輸出:列表
a2<-lapply(dta2[, -1], FUN=mean)
a2#算出column 2~4的mean，列成list

$Level1
[1] 62.26705

$Level2
[1] 61.06818

$Level3
[1] 47.97614

## simplify the list apply:sapply(X, FUN, …, simplify = TRUE, USE.NAMES = TRUE)
#X: 列表或矩陣
#FUN: 函數
#輸出:向量、矩陣、列表
a3<-sapply(dta2[, -1], FUN=mean)
a3

  Level1   Level2   Level3 
62.26705 61.06818 47.97614

#比較apply跟lapply跟sapply
class(a1) #apply的結果是數值

[1] "numeric"

str(a1)   #一串數值

 Named num [1:3] 62.3 61.1 48
 - attr(*, "names")= chr [1:3] "Level1" "Level2" "Level3"

class(a2) #lapply的結果是list

[1] "list"

str(a2)   #List of 3

List of 3
 $ Level1: num 62.3
 $ Level2: num 61.1
 $ Level3: num 48

class(a3) #sapply應該是簡化版的lapply但目前的結果看起來跟apply一樣

[1] "numeric"

str(a3)   #一串數值

 Named num [1:3] 62.3 61.1 48
 - attr(*, "names")= chr [1:3] "Level1" "Level2" "Level3"

#把function改成range
r1<-apply(dta2[, -1], MARGIN=2, FUN=range)
r1

     Level1 Level2 Level3
[1,]    2.8    0.0    0.0
[2,]   97.4   95.7   95.7

r2<-lapply(dta2[, -1], FUN=range)
r2

$Level1
[1]  2.8 97.4

$Level2
[1]  0.0 95.7

$Level3
[1]  0.0 95.7

r3<-sapply(dta2[, -1], FUN=range)
r3

     Level1 Level2 Level3
[1,]    2.8    0.0    0.0
[2,]   97.4   95.7   95.7

#比較apply跟lapply跟sapply
class(r1) #apply的結果是"matrix" "array"

[1] "matrix" "array"

str(r1)   #2x3的matrix

 num [1:2, 1:3] 2.8 97.4 0 95.7 0 95.7
 - attr(*, "dimnames")=List of 2
  ..$ : NULL
  ..$ : chr [1:3] "Level1" "Level2" "Level3"

dim(r1)   #2x3的matrix

[1] 2 3

class(r2) #lapply的結果依然是list

[1] "list"

str(r2)   #List of 3

List of 3
 $ Level1: num [1:2] 2.8 97.4
 $ Level2: num [1:2] 0 95.7
 $ Level3: num [1:2] 0 95.7

dim(r2)   #list沒有dimension

NULL

class(r3) #結果看起來跟apply一樣

[1] "matrix" "array"

str(r3)

 num [1:2, 1:3] 2.8 97.4 0 95.7 0 95.7
 - attr(*, "dimnames")=List of 2
  ..$ : NULL
  ..$ : chr [1:3] "Level1" "Level2" "Level3"

dim(r3)

[1] 2 3

#換了一個資料
#splitting
dta <- read.csv("D://nzSchools.csv", as.is=2)
rollsByAuth <- split(dta$Roll, dta$Auth) #依照Roll、Auth分類存成物件rollsByAuth
str(rollsByAuth)    #是list

List of 4
 $ Other           : int 51
 $ Private         : int [1:99] 255 39 154 73 83 25 95 85 94 729 ...
 $ State           : int [1:2144] 318 200 455 86 577 329 637 395 201 267 ...
 $ State Integrated: int [1:327] 438 26 191 560 151 114 126 171 211 57 ...

class(rollsByAuth)  #是list

[1] "list"

#依據Roll和Auth的分類算mean
s1<-lapply(split(dta$Roll, dta$Auth), mean)
s1

$Other
[1] 51

$Private
[1] 308.798

$State
[1] 300.6301

$`State Integrated`
[1] 258.3792

s2<-sapply(split(dta$Roll, dta$Auth), mean)
s2

           Other          Private            State State Integrated 
         51.0000         308.7980         300.6301         258.3792

#比較lapply跟sapply
class(s1) #lapply的結果是list

[1] "list"

str(s1)   #List of 4

List of 4
 $ Other           : num 51
 $ Private         : num 309
 $ State           : num 301
 $ State Integrated: num 258

dim(s1)   #list沒有dimension

NULL

class(s2) #sapply的結果是數值

[1] "numeric"

str(s2)   #1x4的數值

 Named num [1:4] 51 309 301 258
 - attr(*, "names")= chr [1:4] "Other" "Private" "State" "State Integrated"

dim(s2)   #只有一行數值，沒有dimension

NULL

Exercise 1

The following R script uses Cushings{MASS} to demonstrates several ways to achieve the same objective in R. Explain the advantages or disadvantages of each method.

library(MASS)
library(dplyr)
head(Cushings)

   Tetrahydrocortisone Pregnanetriol Type
a1                 3.1         11.70    a
a2                 3.0          1.30    a
a3                 1.9          0.10    a
a4                 3.8          0.04    a
a5                 4.1          1.10    a
a6                 1.9          0.40    a

tail(Cushings)

   Tetrahydrocortisone Pregnanetriol Type
u1                 5.1           0.4    u
u2                12.9           5.0    u
u3                13.0           0.8    u
u4                 2.6           0.1    u
u5                30.0           0.1    u
u6                20.5           0.8    u

str(Cushings) #27 obs. of  3 variables

'data.frame':   27 obs. of  3 variables:
 $ Tetrahydrocortisone: num  3.1 3 1.9 3.8 4.1 1.9 8.3 3.8 3.9 7.8 ...
 $ Pregnanetriol      : num  11.7 1.3 0.1 0.04 1.1 0.4 1 0.2 0.6 1.2 ...
 $ Type               : Factor w/ 4 levels "a","b","c","u": 1 1 1 1 1 1 2 2 2 2 ...

table(Cushings$Type)


 a  b  c  u 
 6 10  5  6

# method 1  aggregate {stats}
#把資料分組聚合，然後對聚合以後的資料進行加和、求平均等各種操作
#針對data Cushings以Type分組，計算其他變項的mean
m1<-aggregate( . ~ Type, data = Cushings, mean)
m1

  Type Tetrahydrocortisone Pregnanetriol
1    a            2.966667          2.44
2    b            8.180000          1.12
3    c           19.720000          5.50
4    u           14.016667          1.20

class(m1) #data.frame

[1] "data.frame"

str(m1)   #4 obs. of  3 variables

'data.frame':   4 obs. of  3 variables:
 $ Type               : Factor w/ 4 levels "a","b","c","u": 1 2 3 4
 $ Tetrahydrocortisone: num  2.97 8.18 19.72 14.02
 $ Pregnanetriol      : num  2.44 1.12 5.5 1.2

###這是最直觀的方法，對於不會寫function的人...比較容易操作
###可變性較低，若資料多樣，就要逐一修改

# method 2  sapply+function(x)
#split切割資料
# 依據Cushings$Type的定義切割資料Cushings[,-3]
a<-split(Cushings[,-3], Cushings$Type)
a

$a
   Tetrahydrocortisone Pregnanetriol
a1                 3.1         11.70
a2                 3.0          1.30
a3                 1.9          0.10
a4                 3.8          0.04
a5                 4.1          1.10
a6                 1.9          0.40

$b
    Tetrahydrocortisone Pregnanetriol
b1                  8.3           1.0
b2                  3.8           0.2
b3                  3.9           0.6
b4                  7.8           1.2
b5                  9.1           0.6
b6                 15.4           3.6
b7                  7.7           1.6
b8                  6.5           0.4
b9                  5.7           0.4
b10                13.6           1.6

$c
   Tetrahydrocortisone Pregnanetriol
c1                10.2           6.4
c2                 9.2           7.9
c3                 9.6           3.1
c4                53.8           2.5
c5                15.8           7.6

$u
   Tetrahydrocortisone Pregnanetriol
u1                 5.1           0.4
u2                12.9           5.0
u3                13.0           0.8
u4                 2.6           0.1
u5                30.0           0.1
u6                20.5           0.8

class(a)

[1] "list"

## sapply(X, FUN, …, simplify = TRUE, USE.NAMES = TRUE)
#X: 列表或矩陣
#FUN: 函數
#輸出:向量、矩陣、列表
#這邊是針對split後的結果(a list)進行apply的function
m2<-sapply(split(Cushings[,-3], Cushings$Type), 
           #apply(X, MARGIN, FUN)
           #X: 矩陣或資料
           #MARGIN: 1,對row應用;2,對column應用;both,對行列應用
           #FUN:函數
           #其中x是Cushings$Type的分類,所以會產出四個資料
           function(x) apply(x, 2, mean))

m2

                           a    b     c        u
Tetrahydrocortisone 2.966667 8.18 19.72 14.01667
Pregnanetriol       2.440000 1.12  5.50  1.20000

class(m2) #"matrix" "array"

[1] "matrix" "array"

str(m2)   #List of 2

 num [1:2, 1:4] 2.97 2.44 8.18 1.12 19.72 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:2] "Tetrahydrocortisone" "Pregnanetriol"
  ..$ : chr [1:4] "a" "b" "c" "u"

###function的寫法不難理解，但要我自己寫，我可能也是寫不出來
###若要計算別的函數，可以直接修改

# method 3
# do.call 作為函數的傳回值
# rbind透過row合併
# 把Cushings列出$Type的資料當成list進行rbind，並以fun計算mean
m3<-do.call("rbind", as.list(
  by(Cushings, list(Cushings$Type), 
     function(x) {
       #不選Type此欄列入子集
    y <- subset(x, select =  -Type, )
    apply(y, 2, mean)
  }
)))
m3

  Tetrahydrocortisone Pregnanetriol
a            2.966667          2.44
b            8.180000          1.12
c           19.720000          5.50
u           14.016667          1.20

class(m3) #"matrix" "array"

[1] "matrix" "array"

str(m3)   #List of 2

 num [1:4, 1:2] 2.97 8.18 19.72 14.02 2.44 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:4] "a" "b" "c" "u"
  ..$ : chr [1:2] "Tetrahydrocortisone" "Pregnanetriol"

###因為是rbind，所以雖然是跟method 2一樣是list of 2，但為1:4、1:2的list組成的matrix

library(tidyr)  #nest
library(purrr)  #map
# method 4  %>% 運算子(pipe operator)
m4<-Cushings %>%
 group_by(Type) %>%   #依據Type做分組
  #summarise()函式用來計算統計值
 summarize(t_m = mean(Tetrahydrocortisone), 
           p_m = mean(Pregnanetriol))
  #mean(Tetrahydrocortisone)存入t_m的column
m4

# A tibble: 4 x 3
  Type    t_m   p_m
  <fct> <dbl> <dbl>
1 a      2.97  2.44
2 b      8.18  1.12
3 c     19.7   5.5 
4 u     14.0   1.2

class(m4) #"tbl_df" "tbl""data.frame"

[1] "tbl_df"     "tbl"        "data.frame"

#使用 dplyr 套件中基礎函數之後的輸出是一種叫做 tibble 的改良式資料框
#tibble 可以利用 as.data.frame() 函數轉換
str(m4)   #tibble [4 x 3]

tibble [4 x 3] (S3: tbl_df/tbl/data.frame)
 $ Type: Factor w/ 4 levels "a","b","c","u": 1 2 3 4
 $ t_m : num [1:4] 2.97 8.18 19.72 14.02
 $ p_m : num [1:4] 2.44 1.12 5.5 1.2

### %>% 增加語法的可讀性，也很直觀

# method 5 
m5<-Cushings %>%
 nest(-Type) #Nesting creates a list-column of data frames
m5

# A tibble: 4 x 2
  Type  data             
  <fct> <list>           
1 a     <tibble [6 x 2]> 
2 b     <tibble [10 x 2]>
3 c     <tibble [5 x 2]> 
4 u     <tibble [6 x 2]>

# 不包括Type把Cushings改變成一個list-column of data frames
class(m5)

[1] "tbl_df"     "tbl"        "data.frame"

str(m5)

tibble [4 x 2] (S3: tbl_df/tbl/data.frame)
 $ Type: Factor w/ 4 levels "a","b","c","u": 1 2 3 4
 $ data:List of 4
  ..$ : tibble [6 x 2] (S3: tbl_df/tbl/data.frame)
  .. ..$ Tetrahydrocortisone: num [1:6] 3.1 3 1.9 3.8 4.1 1.9
  .. ..$ Pregnanetriol      : num [1:6] 11.7 1.3 0.1 0.04 1.1 0.4
  ..$ : tibble [10 x 2] (S3: tbl_df/tbl/data.frame)
  .. ..$ Tetrahydrocortisone: num [1:10] 8.3 3.8 3.9 7.8 9.1 15.4 7.7 6.5 5.7 13.6
  .. ..$ Pregnanetriol      : num [1:10] 1 0.2 0.6 1.2 0.6 3.6 1.6 0.4 0.4 1.6
  ..$ : tibble [5 x 2] (S3: tbl_df/tbl/data.frame)
  .. ..$ Tetrahydrocortisone: num [1:5] 10.2 9.2 9.6 53.8 15.8
  .. ..$ Pregnanetriol      : num [1:5] 6.4 7.9 3.1 2.5 7.6
  ..$ : tibble [6 x 2] (S3: tbl_df/tbl/data.frame)
  .. ..$ Tetrahydrocortisone: num [1:6] 5.1 12.9 13 2.6 30 20.5
  .. ..$ Pregnanetriol      : num [1:6] 0.4 5 0.8 0.1 0.1 0.8

### 不太清楚nest的功能有甚麼資料處理的重要性

m5.1<-Cushings %>%
 nest(-Type) %>%
  #使用mutate()增加新欄位
  #map()函數家族的使用概念為map(需逐一計算的向量,計算所需的函數)
 mutate(avg = map(data, ~ apply(., 2, mean)), 
        res_1 = map_dbl(avg, "Tetrahydrocortisone"), 
        res_2 = map_dbl(avg, "Pregnanetriol")) 
m5.1

# A tibble: 4 x 5
  Type  data              avg       res_1 res_2
  <fct> <list>            <list>    <dbl> <dbl>
1 a     <tibble [6 x 2]>  <dbl [2]>  2.97  2.44
2 b     <tibble [10 x 2]> <dbl [2]>  8.18  1.12
3 c     <tibble [5 x 2]>  <dbl [2]> 19.7   5.5 
4 u     <tibble [6 x 2]>  <dbl [2]> 14.0   1.2

class(m5.1) #"tbl_df" "tbl""data.frame"

[1] "tbl_df"     "tbl"        "data.frame"

#使用 dplyr 套件中基礎函數之後的輸出是一種叫做 tibble 的改良式資料框
#tibble 可以利用 as.data.frame() 函數轉換
str(m5.1)   #tibble [4 x 5]

tibble [4 x 5] (S3: tbl_df/tbl/data.frame)
 $ Type : Factor w/ 4 levels "a","b","c","u": 1 2 3 4
 $ data :List of 4
  ..$ : tibble [6 x 2] (S3: tbl_df/tbl/data.frame)
  .. ..$ Tetrahydrocortisone: num [1:6] 3.1 3 1.9 3.8 4.1 1.9
  .. ..$ Pregnanetriol      : num [1:6] 11.7 1.3 0.1 0.04 1.1 0.4
  ..$ : tibble [10 x 2] (S3: tbl_df/tbl/data.frame)
  .. ..$ Tetrahydrocortisone: num [1:10] 8.3 3.8 3.9 7.8 9.1 15.4 7.7 6.5 5.7 13.6
  .. ..$ Pregnanetriol      : num [1:10] 1 0.2 0.6 1.2 0.6 3.6 1.6 0.4 0.4 1.6
  ..$ : tibble [5 x 2] (S3: tbl_df/tbl/data.frame)
  .. ..$ Tetrahydrocortisone: num [1:5] 10.2 9.2 9.6 53.8 15.8
  .. ..$ Pregnanetriol      : num [1:5] 6.4 7.9 3.1 2.5 7.6
  ..$ : tibble [6 x 2] (S3: tbl_df/tbl/data.frame)
  .. ..$ Tetrahydrocortisone: num [1:6] 5.1 12.9 13 2.6 30 20.5
  .. ..$ Pregnanetriol      : num [1:6] 0.4 5 0.8 0.1 0.1 0.8
 $ avg  :List of 4
  ..$ : Named num [1:2] 2.97 2.44
  .. ..- attr(*, "names")= chr [1:2] "Tetrahydrocortisone" "Pregnanetriol"
  ..$ : Named num [1:2] 8.18 1.12
  .. ..- attr(*, "names")= chr [1:2] "Tetrahydrocortisone" "Pregnanetriol"
  ..$ : Named num [1:2] 19.7 5.5
  .. ..- attr(*, "names")= chr [1:2] "Tetrahydrocortisone" "Pregnanetriol"
  ..$ : Named num [1:2] 14 1.2
  .. ..- attr(*, "names")= chr [1:2] "Tetrahydrocortisone" "Pregnanetriol"
 $ res_1: num [1:4] 2.97 8.18 19.72 14.02
 $ res_2: num [1:4] 2.44 1.12 5.5 1.2

### map可以解決for迴圈速度很慢且程式碼很長的問題
### 語法結構覺得很複雜，我不會這樣做(是真的不理解的那種不會)

Exercise 1心得

條條道路通羅馬，但我在道路中迷路了…

對於初學者，我會選擇Method 1或Method 4

Exercise 2(未完成)

Use the data in the high schools example to solve the following problems: (a) test if any pairs of the five variables: read, write, math, science, and socst, are different in means. (b) test if the 4 different ethnic groups have the same mean scores for each of the 5 variables (individually): read, write, math, science, and socst. (c) Perform all pairwise simple regressions for these variables: read, write, math, science, and socst.

# 讀資料

dta<-read.table("D://hs0.txt", sep="\t", header=T )
str(dta) #'data.frame': 200 obs. of  11 variables

'data.frame':   200 obs. of  11 variables:
 $ id     : int  70 121 86 141 172 113 50 11 84 48 ...
 $ female : chr  "male" "female" "male" "male" ...
 $ race   : chr  "white" "white" "white" "white" ...
 $ ses    : chr  "low" "middle" "high" "high" ...
 $ schtyp : chr  "public" "public" "public" "public" ...
 $ prog   : chr  "general" "vocation" "general" "vocation" ...
 $ read   : int  57 68 44 63 47 44 50 34 63 57 ...
 $ write  : int  52 59 33 44 52 52 59 46 57 55 ...
 $ math   : int  41 53 54 47 57 51 42 45 54 52 ...
 $ science: int  47 63 58 53 53 63 53 39 58 NA ...
 $ socst  : int  57 61 31 56 61 61 61 36 51 51 ...

head(dta)

   id female  race    ses schtyp     prog read write math science socst
1  70   male white    low public  general   57    52   41      47    57
2 121 female white middle public vocation   68    59   53      63    61
3  86   male white   high public  general   44    33   54      58    31
4 141   male white   high public vocation   63    44   47      53    56
5 172   male white middle public academic   47    52   57      53    61
6 113   male white middle public academic   44    52   51      63    61

table(dta$race)


african-amer        asian     hispanic        white 
          20           11           24          145

table(dta$ses)


  high    low middle 
    58     47     95

table(dta$schtyp)


private  public 
     32     168

table(dta$prog)


academic  general vocation 
     105       45       50

#Column 1: Student ID
#Column 2: Gender, M = Male, F = Female
#Column 3: Race
#Column 4: Socioeconomic status
#Column 5: School type
#Column 6: Program type
#Column 7: Reading score
#Column 8: Writing score
#Column 9: Math score
#Column 10: Science score
#Column 11: Social science studies score

#test if any pairs of the five variables: read, write, math, science, and socst, are different in means.
#t.test(x,y,paired = T)
t.test(dta$read, dta$write,paired = T )


    Paired t-test

data:  dta$read and dta$write
t = -0.86731, df = 199, p-value = 0.3868
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.7841424  0.6941424
sample estimates:
mean of the differences 
                 -0.545

t.test(dta$read, dta$math,paired = T )


    Paired t-test

data:  dta$read and dta$math
t = -0.72428, df = 199, p-value = 0.4697
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.5448905  0.7148905
sample estimates:
mean of the differences 
                 -0.415

t.test(dta$read, dta$science,paired = T )


    Paired t-test

data:  dta$read and dta$science
t = 0.31406, df = 194, p-value = 0.7538
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.028890  1.418634
sample estimates:
mean of the differences 
              0.1948718

t.test(dta$read, dta$socst,paired = T )


    Paired t-test

data:  dta$read and dta$socst
t = -0.27074, df = 199, p-value = 0.7869
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.449639  1.099639
sample estimates:
mean of the differences 
                 -0.175

t.test(dta$write, dta$math,paired = T )


    Paired t-test

data:  dta$write and dta$math
t = 0.22303, df = 199, p-value = 0.8237
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.01944  1.27944
sample estimates:
mean of the differences 
                   0.13

t.test(dta$write, dta$science,paired = T )


    Paired t-test

data:  dta$write and dta$science
t = 1.1867, df = 194, p-value = 0.2368
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.5057821  2.0339872
sample estimates:
mean of the differences 
              0.7641026

t.test(dta$write, dta$socst,paired = T )


    Paired t-test

data:  dta$write and dta$socst
t = 0.5778, df = 199, p-value = 0.5641
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.8927696  1.6327696
sample estimates:
mean of the differences 
                   0.37

t.test(dta$math, dta$science,paired = T )


    Paired t-test

data:  dta$math and dta$science
t = 1.0799, df = 194, p-value = 0.2815
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.5296709  1.8117222
sample estimates:
mean of the differences 
              0.6410256

t.test(dta$math, dta$socst,paired = T )


    Paired t-test

data:  dta$math and dta$socst
t = 0.35101, df = 199, p-value = 0.726
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.108304  1.588304
sample estimates:
mean of the differences 
                   0.24

t.test(dta$science, dta$socst,paired = T )


    Paired t-test

data:  dta$science and dta$socst
t = -0.50692, df = 194, p-value = 0.6128
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.906119  1.126632
sample estimates:
mean of the differences 
             -0.3897436

# 最傻的方法，一個一個做...

a<-do.call(“rbind”, as.list( by(dta, list(c(,7:11) ))) a class(a) #“matrix” “array” str(a) #List of 2

nest(-id,-race, -ses,-schtyp,-prog)%>% mutate(avg = map(data, ~ apply(., 2, mean)), score_r = map_dbl(avg, “read”), score_w = map_dbl(avg, “write”), score_m = map_dbl(avg, “math”), score_sci = map_dbl(avg, “science”), score_sc = map_dbl(avg, “socst”)) a

dta%>% group_by(variable) %>% #依據Type做分組 #summarise()函式用來計算統計值 summarize(t_m = mean(Tetrahydrocortisone), p_m = mean(Pregnanetriol)) #mean(Tetrahydrocortisone)存入t_m的column m4

Exercise 3

The formula P = L (r/(1-(1+r)^(-M)) describes the payment you have to make per month for M number of months if you take out a loan of L amount today at a monthly interest rate of r. Compute how much you will have to pay per month for 10, 15, 20, 25, or 30 years if you borrow NT$5,000,000, 10,000,000, or 15,000,000 from a bank that charges you 2%, 5%, or 7% for the monthly interest rate. 如果您今天以 r 的月利率貸款 L 金額，您必須在 M 個月內每月支付的款項。

#名字<-function(參數1,參數2,...){
#程式碼本體
#最後一行的輸出自動設為回傳值
#}


amount <- function(L,r,M){
  P<-L*(r/(1-(1+r)^(-M)))
  return(P)
}
#r:利率
#L:借款金額
#M:M個月內繳清款項


#借貸5,000,000元，利率為2%，在10年內還完(10*12=120月)
amount(5*10^6,2/100, 10*12)

[1] 110240.5

#rep(x, times = 1, length.out = NA, each = 1)
#x 使用者想要相同重複元素的數值向量.
#times 數值向量 x 重複的次數.
#each 數值向量 x 的每個元素重複的次數.
#length.out = NA 數值向量 x 重複後的長度.
#r:利率
#L:借款金額
#M:M個月內繳清款項
loan<-data.frame(
       L=rep(c(5*10^6, 10*10^6, 15*10^6), each=15),
       r=rep(c(2/100, 5/100, 7/100)),
       M=rep(c(10*12, 15*12, 20*12, 25*12, 30*12))) %>%
    mutate(money=amount(L,r,M)) 
loan

         L    r   M     money
1  5.0e+06 0.02 120  110240.5
2  5.0e+06 0.05 180  250038.4
3  5.0e+06 0.07 240  350000.0
4  5.0e+06 0.02 300  100263.7
5  5.0e+06 0.05 360  250000.0
6  5.0e+06 0.07 120  350104.3
7  5.0e+06 0.02 180  102913.7
8  5.0e+06 0.05 240  250002.1
9  5.0e+06 0.07 300  350000.0
10 5.0e+06 0.02 360  100080.2
11 5.0e+06 0.05 120  250718.6
12 5.0e+06 0.07 180  350001.8
13 5.0e+06 0.02 240  100870.4
14 5.0e+06 0.05 300  250000.1
15 5.0e+06 0.07 360  350000.0
16 1.0e+07 0.02 120  220481.0
17 1.0e+07 0.05 180  500076.7
18 1.0e+07 0.07 240  700000.1
19 1.0e+07 0.02 300  200527.4
20 1.0e+07 0.05 360  500000.0
21 1.0e+07 0.07 120  700208.5
22 1.0e+07 0.02 180  205827.4
23 1.0e+07 0.05 240  500004.1
24 1.0e+07 0.07 300  700000.0
25 1.0e+07 0.02 360  200160.4
26 1.0e+07 0.05 120  501437.1
27 1.0e+07 0.07 180  700003.6
28 1.0e+07 0.02 240  201740.8
29 1.0e+07 0.05 300  500000.2
30 1.0e+07 0.07 360  700000.0
31 1.5e+07 0.02 120  330721.5
32 1.5e+07 0.05 180  750115.1
33 1.5e+07 0.07 240 1050000.1
34 1.5e+07 0.02 300  300791.1
35 1.5e+07 0.05 360  750000.0
36 1.5e+07 0.07 120 1050312.8
37 1.5e+07 0.02 180  308741.0
38 1.5e+07 0.05 240  750006.2
39 1.5e+07 0.07 300 1050000.0
40 1.5e+07 0.02 360  300240.7
41 1.5e+07 0.05 120  752155.7
42 1.5e+07 0.07 180 1050005.4
43 1.5e+07 0.02 240  302611.2
44 1.5e+07 0.05 300  750000.3
45 1.5e+07 0.07 360 1050000.0

str(loan)

'data.frame':   45 obs. of  4 variables:
 $ L    : num  5e+06 5e+06 5e+06 5e+06 5e+06 5e+06 5e+06 5e+06 5e+06 5e+06 ...
 $ r    : num  0.02 0.05 0.07 0.02 0.05 0.07 0.02 0.05 0.07 0.02 ...
 $ M    : num  120 180 240 300 360 120 180 240 300 360 ...
 $ money: num  110240 250038 350000 100264 250000 ...

head(loan)

      L    r   M    money
1 5e+06 0.02 120 110240.5
2 5e+06 0.05 180 250038.4
3 5e+06 0.07 240 350000.0
4 5e+06 0.02 300 100263.7
5 5e+06 0.05 360 250000.0
6 5e+06 0.07 120 350104.3

###想畫個圖，但沒成功
names(loan)[c(1:4)] <- c("L", "r", "m", "money")
head(loan)

      L    r   m    money
1 5e+06 0.02 120 110240.5
2 5e+06 0.05 180 250038.4
3 5e+06 0.07 240 350000.0
4 5e+06 0.02 300 100263.7
5 5e+06 0.05 360 250000.0
6 5e+06 0.07 120 350104.3

barplot(with(loan, 
             xtabs(L ~m +money)),
        main="pay per month for loan", 
        ylab="money", 
        xlab="loan",
        beside=T)

loan%>%
ggplot( ) + 
  aes(x=money, 
      y=reorder(L,r)) +
  geom_point(size=rel(3))+
  scale_color_manual(values=c('lightblue',
                              'skyblue',
                              'steelblue',
                              'navyblue'),
                     guide=guide_legend(title=NULL))+
  labs(x="", 
       y="")

Exercise 4(未完成?)

Modify this R script to create a function to compute the c-statistic illustrated with the data set in the article: Tryon, W.W. (1984). A simplified time-series analysis for evaluating treatment interventions. Journal of Applied Behavioral Analysis, 34(4), 230-233.

# c-stat example
# read in data 
dta <- read.table("D://cstat.txt", header=T)
str(dta) #42 obs. of  1 variable

'data.frame':   42 obs. of  1 variable:
 $ nc: int  28 46 39 45 24 20 35 37 36 40 ...

head(dta)

plot(dta)     #完全畫不出甚麼東西來

plot(dta$nc)  #畫出來沒有順序

dta$order <- c(1:42)   #要先給一個順序
head(dta)

plot(x=dta$order,                # X軸的值
     y=dta$nc,                   # Y軸的值
     xlab="Observations",        # X軸名稱
     ylab="number of children",  # Y軸名稱
     lines(1:42, dta[,1]) )

# plot figure 1
#plot(dta, xlab="Observations", ylab="Number of Children")
plot(1:42, dta[,1], xlab="Observations", ylab="Number of Children")
lines(1:42, dta[,1]) #把1~41連起來
abline(v=10, lty=2)  #abline()函數給當前繪圖添加一條或多條直線
abline(v=32, lty=2)
#1~10做一個區段線
segments(1, mean(dta[1:10,1]),10, mean(dta[1:10,1]),col="red")
#11~32做一個區段線
segments(11, mean(dta[11:32,1]),32, mean(dta[11:32,1]),col="red")
#33~42做一個區段線
segments(33, mean(dta[33:42,1]),42, mean(dta[33:42,1]),col="red")

# calculate c-stat for first baseline phase
cden <- 1-(sum(diff(dta[1:10,1])^2)/(2*(10-1)*var(dta[1:10,1])))
cden

[1] 0.1601208

sc <- sqrt((10-2)/((10-1)*(10+1)))
sc

[1] 0.2842676

pval <- 1-pnorm(cden/sc)   
pval

[1] 0.2866238

# calculate c-stat for first baseline plus group tokens
n <- 32
cden <- 1-(sum(diff(dta[1:n,1])^2)/(2*(n-1)*var(dta[1:n,1])))
sc <- sqrt((n-2)/((n-1)*(n+1)))
pval <- 1-pnorm(cden/sc)
list(z=cden/sc,pvalue=pval)

$z
[1] 3.879054

$pvalue
[1] 5.243167e-05

Inclass exercise_Function

Ruei-Ning Su

2021-12-13