1 In-class 1

Task: Explain what does this statement do: lapply(lapply(search(), ls), length)

# the code means apply the length function on the result which apply ls function on the search()
# the result provide the number of function in each attached packages
## search(): gives a list of attached packages
lapply(lapply(search(), ls), length)
## [[1]]
## [1] 0
## 
## [[2]]
## [1] 449
## 
## [[3]]
## [1] 87
## 
## [[4]]
## [1] 113
## 
## [[5]]
## [1] 247
## 
## [[6]]
## [1] 104
## 
## [[7]]
## [1] 203
## 
## [[8]]
## [1] 0
## 
## [[9]]
## [1] 1255

2 In-class 2

Task: Convert the R script in the NZ schools example into a rmarkdown file and provide comments to each code chunk indicated by ‘##’. Give alternative code to perform the same calculation where appropriate.

The New Zealand Ministry of Education provides basic information for all primary and secondary schools in the country.

  • Column 1: School ID
  • Column 2: School name
  • Column 3: City where the school is located
  • Column 4: The authority of the school
  • Column 5: A socio-economic status of the families of the students of the school
  • Column 6: The number of students enrolled at the school as of July 2007

2.1 Data input and view

## keep the school names with white spaces
dta <- read.csv("nzSchools.csv", as.is=2)

## show display the structure of dta 
str(dta)
## 'data.frame':    2571 obs. of  6 variables:
##  $ ID  : int  1015 1052 1062 1092 1130 1018 1029 1030 1588 1154 ...
##  $ Name: chr  "Hora Hora School" "Morningside School" "Onerahi School" "Raurimu Avenue School" ...
##  $ City: Factor w/ 541 levels "Ahaura","Ahipara",..: 533 533 533 533 533 533 533 533 533 533 ...
##  $ Auth: Factor w/ 4 levels "Other","Private",..: 3 3 3 3 3 3 3 3 4 3 ...
##  $ Dec : int  2 3 4 2 4 8 5 5 6 1 ...
##  $ Roll: int  318 200 455 86 577 329 637 395 438 201 ...
## Dimension of dta (row, column)
dim(dta)
## [1] 2571    6

2.2 Binning

## binning

## create "Size" variable which based on "Roll" variable. If the value higher than the median will classed into "Large", the other is "Small" 

dta$Size <- ifelse(dta$Roll > median(dta$Roll), "Large", "Small")

## Remove "Size" column
dta$Size <- NULL

## show the first 6 row of dta
head(dta)
##     ID                  Name      City  Auth Dec Roll
## 1 1015      Hora Hora School Whangarei State   2  318
## 2 1052    Morningside School Whangarei State   3  200
## 3 1062        Onerahi School Whangarei State   4  455
## 4 1092 Raurimu Avenue School Whangarei State   2   86
## 5 1130      Whangarei School Whangarei State   4  577
## 6 1018       Hurupaki School Whangarei State   8  329
## create "Size" variable which based on "Roll" that divides the range of "Roll" into 3 intervals and codes the "Small", "Mediam", "Large" in "Roll" according to which interval they fall.
dta$Size <- cut(dta$Roll, 3, labels=c("Small", "Medium", "Large"))

## create cross tabulation show each number of level in the "Size" variable. 
table(dta$Size)
## 
##  Small Medium  Large 
##   2555     15      1

2.3 Sorting

## create "RollOrd" to show the position of Roll values with descending order 
dta$RollOrd <- order(dta$Roll, decreasing=T)

## Top 6 Roll value
## similar to the result: head(sort(dta$Roll, decreasing=T))
head(dta[dta$RollOrd, ])
##       ID                  Name         City  Auth Dec Roll   Size RollOrd
## 1726 498 Correspondence School   Wellington State  NA 5546  Large     753
## 301   28     Rangitoto College     Auckland State  10 3022 Medium     353
## 376   78      Avondale College     Auckland State   4 2613 Medium     712
## 2307 319  Burnside High School Christchurch State   8 2588 Medium     709
## 615   41      Macleans College     Auckland State  10 2476 Medium    1915
## 199   43    Massey High School     Auckland State   5 2452 Medium    1683
## Last 6 Roll value
tail(dta[dta$RollOrd, ])
##        ID                    Name                  City    Auth Dec Roll  Size
## 2401 1641  Amana Christian School               Dunedin Private   9    7 Small
## 1590 2461       Tangimoana School              Manawatu   State   4    6 Small
## 1996 3598         Woodbank School              Kaikoura   State   4    6 Small
## 2112 3386     Jacobs River School          Jacobs River   State   5    6 Small
## 1514 2407     Ngamatapouri School Sth Taranaki District   State   9    5 Small
## 1575 2420 Papanui Junction School               Taihape   State   5    5 Small
##      RollOrd
## 2401    2562
## 1590     266
## 1996    2478
## 2112    1501
## 1514    2377
## 1575    1542
## 先按城市名字再依Roll值,由大排到小 然後呈現前六列
## head(sort(dta$City, dta$Roll, decreasing=T))
head(dta[order(dta$City, dta$Roll, decreasing=T), ])
##        ID                      Name      City  Auth Dec Roll  Size RollOrd
## 2548  401           Menzies College   Wyndham State   4  356 Small     859
## 2549 4054            Wyndham School   Wyndham State   5   94 Small    1163
## 1611 2742          Woodville School Woodville State   3  147 Small     726
## 1630 2640           Papatawa School Woodville State   7   27 Small    2273
## 2041 3600            Woodend School   Woodend State   9  375 Small    1401
## 1601  399 Central Southland College    Winton State   7  549 Small     450
## 先按城市名字再依Roll值,由大排到小 然後呈現最後六列
tail(dta[order(dta$City, dta$Roll, decreasing=T), ])
##        ID                         Name    City  Auth Dec Roll  Size RollOrd
## 2169 3273                Albury School  Albury State   8   30 Small    1010
## 2018  350           Akaroa Area School  Akaroa State   8  125 Small    1051
## 2023 3332           Duvauchelle School  Akaroa State   9   41 Small     749
## 335  1200                Ahuroa School  Ahuroa State   7   22 Small     193
## 99   1000               Ahipara School Ahipara State   3  241 Small    1963
## 2117 2105 Awahono School - Grey Valley  Ahaura State   4  119 Small     364

2.4 Counting

## create cross tabulation show each number of level in the "Auth" variable. 
table(dta$Auth)
## 
##            Other          Private            State State Integrated 
##                1               99             2144              327
## save as "authtbl" object and show authtbl
authtbl <- table(dta$Auth); authtbl
## 
##            Other          Private            State State Integrated 
##                1               99             2144              327
## show the class of "authtbl" object
class(authtbl)
## [1] "table"
## only show the data with the "Auth" column equal to the value "Other"
dta[dta$Auth == "Other", ]
##       ID            Name         City  Auth Dec Roll  Size RollOrd
## 2315 518 Kingslea School Christchurch Other   1   51 Small    1579
## Create a contingency table from cross-classifying factors by Decile and authority types.  
## Decile: A schools decile indicated the extent to which the school draws its students from the low socio-economic communities.
## Decile 1 schools are the 10% of schools with the highest proportion of students from the low socio-economic communities, whereas decile 10 schools are the 10% of schools with the lowest proportion of these students. 
## A schools decile does not indicate the overall socio-economic mix of the school. 
xtabs(~ Auth + Dec, data=dta)
##                   Dec
## Auth                 1   2   3   4   5   6   7   8   9  10
##   Other              1   0   0   0   0   0   0   0   0   0
##   Private            0   0   2   6   2   2   6  11  12  38
##   State            259 230 208 219 214 215 188 200 205 205
##   State Integrated  12  22  35  28  38  34  45  45  37  31

2.5 Aggregating

## calculate the mean of "Roll"
mean(dta$Roll)
## [1] 295.4737
## calculate the mean "Roll" of the "Private" authority types
mean(dta$Roll[dta$Auth == "Private"])
## [1] 308.798
## calculate the mean "Roll" of the each types of authority
aggregate(dta["Roll"], by=list(dta$Auth), FUN=mean)
##            Group.1     Roll
## 1            Other  51.0000
## 2          Private 308.7980
## 3            State 300.6301
## 4 State Integrated 258.3792
## Create "Rich" variable which based on those decile higher than 5 will be TRUE, the other will be False
dta$Rich <- dta$Dec > 5 ; # dta$Rich

## calculate the mean "Roll" of the each types of authority and Rich group
aggregate(dta["Roll"], by=list(dta$Auth, dta$Rich), FUN=mean)
##            Group.1 Group.2     Roll
## 1            Other   FALSE  51.0000
## 2          Private   FALSE 151.4000
## 3            State   FALSE 261.7487
## 4 State Integrated   FALSE 183.2370
## 5          Private    TRUE 402.5362
## 6            State    TRUE 338.8243
## 7 State Integrated    TRUE 311.2135
## show the range of Roll in each types of authority
## by() tapply()
by(dta["Roll"], INDICES=list(dta$Auth), FUN=range)
## : Other
## [1] 51 51
## ------------------------------------------------------------ 
## : Private
## [1]    7 1663
## ------------------------------------------------------------ 
## : State
## [1]    5 5546
## ------------------------------------------------------------ 
## : State Integrated
## [1]   18 1475

3 In-class 3

Task: Split the ChickWeight{datasets} data by individual chicks to extract separate slope estimates of regressing weight onto Time for each chick.

3.1 Data input

pacman::p_load(dplyr, readr, tidyr, tidyverse, ggplot2)

dat <- ChickWeight

head(dat)
##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1
## 3     59    4     1    1
## 4     64    6     1    1
## 5     76    8     1    1
## 6     93   10     1    1
str(dat)
## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame':   578 obs. of  4 variables:
##  $ weight: num  42 51 59 64 76 93 106 125 149 171 ...
##  $ Time  : num  0 2 4 6 8 10 12 14 16 18 ...
##  $ Chick : Ord.factor w/ 50 levels "18"<"16"<"15"<..: 15 15 15 15 15 15 15 15 15 15 ...
##  $ Diet  : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "formula")=Class 'formula'  language weight ~ Time | Chick
##   .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
##  - attr(*, "outer")=Class 'formula'  language ~Diet
##   .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
##  - attr(*, "labels")=List of 2
##   ..$ x: chr "Time"
##   ..$ y: chr "Body weight"
##  - attr(*, "units")=List of 2
##   ..$ x: chr "(days)"
##   ..$ y: chr "(gm)"

3.2 slope

3.2.1 Group-by

with(dat, by(dat, Chick, function(x) lm(weight ~ Time, data=x)))|> sapply(coef)
##             18        16       15        13         9        20        10
## (Intercept) 39 43.392857 46.83333 43.384359 52.094086 37.667826 38.695054
## Time        -2  1.053571  1.89881  2.239601  2.663137  3.732718  4.066102
##                     8        17       19        4         6        11        3
## (Intercept) 43.727273 43.030706 31.21222 32.86568 44.123431 47.921948 23.17955
## Time         4.827273  4.531538  5.08743  6.08864  6.378006  7.510967  8.48737
##                     1        12         2        5       14         7        24
## (Intercept) 24.465436 21.939797 24.724853 16.89563 20.52488  5.842535 53.067766
## Time         7.987899  8.440629  8.719861 10.05536 11.98245 13.205264  1.207533
##                    30        22        23        27        28       26       25
## (Intercept) 39.109666 40.082590 38.428074 29.858569 23.984874 20.70715 19.65119
## Time         5.898351  5.877931  6.685978  7.379368  9.703676 10.10316 11.30676
##                    29       21        33        37       36       31       39
## (Intercept)  5.882771 15.56330 45.830283 29.608834 25.85403 19.13099 17.03661
## Time        12.453487 15.47512  5.855241  6.677053  9.99047 10.02617 10.73710
##                   38       32       40        34        35        44        45
## (Intercept) 10.67282 13.69173 10.83830  5.081682  4.757979 44.909091 35.673121
## Time        12.06051 13.18091 13.44229 15.000151 17.258811  6.354545  7.686432
##                    43        41        47        49        46       50       42
## (Intercept) 52.185751 39.337922 36.489790 31.662986 27.771744 23.78218 19.86507
## Time         8.318863  8.159885  8.374981  9.717894  9.738466 11.33293 11.83679
##                    48
## (Intercept)  7.947663
## Time        13.714718

3.2.2 Tidy approach

revision based on lecture code

slopetidy <- dat |>
  group_by(Chick)|> 
  do({res <- lm(weight ~ Time, data=.)
     broom::tidy(coef(res))
     }
    ) |>
  subset(names=="Time")|>
  arrange(x)


DT::datatable(slopetidy, rownames=FALSE, options=list(pageLength=5))

3.2.3 split

slopesplit <-with(dat, split(dat, Chick))|> 
  lapply(function(x) lm(weight ~ Time, data=x))|> 
  sapply(coef)|>
  reshape2::melt()|>
  reshape(idvar = "Var2", timevar = "Var1", direction = "wide")|>
  arrange(value.Time)

colnames(slopesplit)<-c("Chick", "Intercept", "Time")

DT::datatable(slopesplit, rownames=FALSE, options=list(pageLength=5))

3.3 plot

datshade <- dat[, -c(3,4)]
dat%>%
  mutate(Chick=factor(Chick, levels=slopesplit$Chick))%>%
  ggplot(., aes(x=Time, y=weight))+
  geom_point()+
  geom_smooth(method = lm, formula=y ~ x, se = FALSE)+
  geom_line(data=datshade, aes(x=Time, y=weight), 
            stat="smooth",method = "lm", formula = y ~ x,
            se = FALSE, color="firebrick", alpha=0.5)+
  facet_wrap(.~Chick, ncol=10, nrow=5)+
  theme_minimal()

4 In-class 4

Task: Convert the script in the NCEA 2007 example into a rmarkdown file and provide comments to each code chunk indicated by ‘##’. Give alternative code to perform the same calculation where appropriate.

  • a case study - II

4.1 Data input

## read data
dta2 <- read.table("NCEA2007.txt", sep=":", quote="", h=T, as.is=T)

## Dimension of dta2 (row, column)
dim(dta2)
## [1] 88  4
## show display the structure of dta2 
str(dta2)
## 'data.frame':    88 obs. of  4 variables:
##  $ Name  : chr  "Al-Madinah School" "Alfriston College" "Ambury Park Centre for Riding Therapy" "Aorere College" ...
##  $ Level1: num  61.5 53.9 33.3 39.5 71.2 22.1 50.8 57.3 89.3 59.8 ...
##  $ Level2: num  75 44.1 20 50.2 78.9 30.8 34.8 49.8 89.7 65.7 ...
##  $ Level3: num  0 0 0 30.6 55.5 26.3 48.9 44.6 88.6 50.4 ...
## show first 6 row of dta2
head(dta2)
##                                    Name Level1 Level2 Level3
## 1                     Al-Madinah School   61.5   75.0    0.0
## 2                     Alfriston College   53.9   44.1    0.0
## 3 Ambury Park Centre for Riding Therapy   33.3   20.0    0.0
## 4                        Aorere College   39.5   50.2   30.6
## 5        Auckland Girls' Grammar School   71.2   78.9   55.5
## 6                      Auckland Grammar   22.1   30.8   26.3

4.2 Mean

## calculate the mean of each level
# output is numeric object
apply(dta2[, -1], MARGIN=2, FUN=mean)
##   Level1   Level2   Level3 
## 62.26705 61.06818 47.97614
## list apply
## output is list object
lapply(dta2[, -1], FUN=mean)
## $Level1
## [1] 62.26705
## 
## $Level2
## [1] 61.06818
## 
## $Level3
## [1] 47.97614
## simplify the list apply
## output is numeric object
sapply(dta2[, -1], FUN=mean)
##   Level1   Level2   Level3 
## 62.26705 61.06818 47.97614

4.3 Range

Range of each level

## output is "matrix" "array" object
apply(dta2[, -1], MARGIN=2, FUN=range)
##      Level1 Level2 Level3
## [1,]    2.8    0.0    0.0
## [2,]   97.4   95.7   95.7
## output is list object
lapply(dta2[, -1], FUN=range)
## $Level1
## [1]  2.8 97.4
## 
## $Level2
## [1]  0.0 95.7
## 
## $Level3
## [1]  0.0 95.7
## output is "matrix" "array" object
sapply(dta2[, -1], FUN=range)
##      Level1 Level2 Level3
## [1,]    2.8    0.0    0.0
## [2,]   97.4   95.7   95.7

4.4 Splitting

## 
rollsByAuth <- split(dta$Roll, dta$Auth)

##
str(rollsByAuth)
## List of 4
##  $ Other           : int 51
##  $ Private         : int [1:99] 255 39 154 73 83 25 95 85 94 729 ...
##  $ State           : int [1:2144] 318 200 455 86 577 329 637 395 201 267 ...
##  $ State Integrated: int [1:327] 438 26 191 560 151 114 126 171 211 57 ...
##
class(rollsByAuth)
## [1] "list"
##
lapply(split(dta$Roll, dta$Auth), mean)
## $Other
## [1] 51
## 
## $Private
## [1] 308.798
## 
## $State
## [1] 300.6301
## 
## $`State Integrated`
## [1] 258.3792
##
sapply(split(dta$Roll, dta$Auth), mean) 
##            Other          Private            State State Integrated 
##          51.0000         308.7980         300.6301         258.3792

4.5 Alternative

4.5.1 dta2 mean and range

## colMeans
colMeans(dta2[,-1])
##   Level1   Level2   Level3 
## 62.26705 61.06818 47.97614
## range
range(dta2$Level1)
## [1]  2.8 97.4
## min and max
data.frame(min=sapply(dta2,min), max=sapply(dta2,max))
##                      min                     max
## Name   Al-Madinah School Zayed College for Girls
## Level1               2.8                    97.4
## Level2                 0                    95.7
## Level3                 0                    95.7

4.5.2 dta mean

## aggregate
aggregate(dta[,"Roll"], by=list(dta$Auth), FUN=mean)
##            Group.1        x
## 1            Other  51.0000
## 2          Private 308.7980
## 3            State 300.6301
## 4 State Integrated 258.3792
## dplyr
dta|>
  group_by(Auth)|>
  summarise_at(vars(Roll), list(mean = mean))
## # A tibble: 4 x 2
##   Auth              mean
##   <fct>            <dbl>
## 1 Other              51 
## 2 Private           309.
## 3 State             301.
## 4 State Integrated  258.

5 Homework 1

The following R script uses Cushings{MASS} to demonstrates several ways to achieve the same objective in R.

Task: Explain the advantages or disadvantages of each method.

# Cushings example
library(pacman)
pacman::p_load(MASS, tidyverse)

5.1 method 1

class: “data.frame”
+: 比較符合自己習慣的寫法?
-: 無法看到轉換前的資料樣態

aggregate( . ~ Type, data = Cushings, mean)
##   Type Tetrahydrocortisone Pregnanetriol
## 1    a            2.966667          2.44
## 2    b            8.180000          1.12
## 3    c           19.720000          5.50
## 4    u           14.016667          1.20

5.2 method 2

class: “matrix” “array”
similar to method 3
+: 資料量大時,演算比較有效率?
-: 不太容易上手的語法

sapply(split(Cushings[,-3], Cushings$Type), function(x) apply(x, 2, mean))
##                            a    b     c        u
## Tetrahydrocortisone 2.966667 8.18 19.72 14.01667
## Pregnanetriol       2.440000 1.12  5.50  1.20000

5.3 method 3

class: “matrix” “array”
+: 資料量大時,演算比較有效率?
-: 不太容易上手的語法

do.call("rbind", as.list(
  by(Cushings, list(Cushings$Type), function(x) {
    y <- subset(x, select =  -Type)
    apply(y, 2, mean)
  }
)))
##   Tetrahydrocortisone Pregnanetriol
## a            2.966667          2.44
## b            8.180000          1.12
## c           19.720000          5.50
## u           14.016667          1.20

5.4 method 4

class: “tbl_df” “tbl” “data.frame”
+: 比較符合目前自己習慣的寫法?
-: 無法看到轉換前的資料樣態

## dplyr
Cushings %>%
 group_by(Type) %>%
 summarize(t_m = mean(Tetrahydrocortisone), p_m = mean(Pregnanetriol))
## # A tibble: 4 x 3
##   Type    t_m   p_m
##   <fct> <dbl> <dbl>
## 1 a      2.97  2.44
## 2 b      8.18  1.12
## 3 c     19.7   5.5 
## 4 u     14.0   1.2

5.5 method 5

class: “tbl_df” “tbl” “data.frame”
+: 包含原本資料的樣態
-: 占用的記憶體較多, 語法較複雜

Cushings %>%
 nest(-Type) %>% # unnest?
 mutate(avg = map(data, ~ apply(., 2, mean)), 
        res_1 = map_dbl(avg, "Tetrahydrocortisone"), 
        res_2 = map_dbl(avg, "Pregnanetriol")) 
## Warning: All elements of `...` must be named.
## Did you want `data = c(Tetrahydrocortisone, Pregnanetriol)`?
## # A tibble: 4 x 5
##   Type  data              avg       res_1 res_2
##   <fct> <list>            <list>    <dbl> <dbl>
## 1 a     <tibble [6 x 2]>  <dbl [2]>  2.97  2.44
## 2 b     <tibble [10 x 2]> <dbl [2]>  8.18  1.12
## 3 c     <tibble [5 x 2]>  <dbl [2]> 19.7   5.5 
## 4 u     <tibble [6 x 2]>  <dbl [2]> 14.0   1.2

6 Homework 2

Task: Use the data in the high schools example to solve the following problems:

  1. test if any pairs of the five variables: read, write, math, science, and socst, are different in means.
  2. test if the 4 different ethnic groups have the same mean scores for each of the 5 variables (individually): read, write, math, science, and socst.
  3. Perform all pairwise simple regressions for these variables: read, write, math, science, and socst.
dat <- read.table("hs0.txt", sep="\t", quote="", header=T, as.is=T)

str(dat)
## 'data.frame':    200 obs. of  11 variables:
##  $ id     : int  70 121 86 141 172 113 50 11 84 48 ...
##  $ female : chr  "male" "female" "male" "male" ...
##  $ race   : chr  "white" "white" "white" "white" ...
##  $ ses    : chr  "low" "middle" "high" "high" ...
##  $ schtyp : chr  "public" "public" "public" "public" ...
##  $ prog   : chr  "general" "vocation" "general" "vocation" ...
##  $ read   : int  57 68 44 63 47 44 50 34 63 57 ...
##  $ write  : int  52 59 33 44 52 52 59 46 57 55 ...
##  $ math   : int  41 53 54 47 57 51 42 45 54 52 ...
##  $ science: int  47 63 58 53 53 63 53 39 58 NA ...
##  $ socst  : int  57 61 31 56 61 61 61 36 51 51 ...

6.1 Q(a)

  • test if any pairs of the five variables: read, write, math, science, and socst, are different in means.
datl <- dat %>% reshape2::melt(., measure.vars = colnames(.)[7:11])

res <- dat %>% 
  select_if(is.numeric) %>%
  reshape2::melt(., id='id')

pairwise.t.test(res$value, res$variable, p.adjust = "none")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  res$value and res$variable 
## 
##         read write math science
## write   0.58 -     -    -      
## math    0.68 0.90  -    -      
## science 0.76 0.39  0.47 -      
## socst   0.86 0.71  0.81 0.63   
## 
## P value adjustment method: none

6.1.1 Plot

datl %>% 
  group_by(variable) %>%
  summarise(group_MEAN = mean(value, na.rm = TRUE),
            group_SE = sd(value, na.rm = TRUE) / sqrt(sum(!is.na(value)))) %>%
  ggplot(aes(x = variable, y = group_MEAN)) +
  geom_point(aes(shape = variable), size = 4) +
  geom_errorbar(aes(ymax = group_MEAN + 1.96*group_SE,              
                    ymin = group_MEAN - 1.96*group_SE), width=.3, size=.4) +
  labs(x="Variable", y="Mean score")+
  theme_minimal() +
  theme(legend.position = 'none')

6.2 Q(b)

  • test if the 4 different ethnic groups have the same mean scores for each of the 5 variables (individually): read, write, math, science, and socst.
dat2<-dat[ ,c(3, 7, 8 ,9 , 10, 11)]

res2 <- dat2%>%
  reshape2::melt(., id='race')

lapply(split(res2, res2$variable), function(x) anova(lm(x$value~factor(x$race))))
## $read
## Analysis of Variance Table
## 
## Response: x$value
##                 Df  Sum Sq Mean Sq F value    Pr(>F)    
## factor(x$race)   3  1749.8  583.27  5.9637 0.0006539 ***
## Residuals      196 19169.6   97.80                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $write
## Analysis of Variance Table
## 
## Response: x$value
##                 Df  Sum Sq Mean Sq F value    Pr(>F)    
## factor(x$race)   3  1914.2  638.05  7.8334 5.785e-05 ***
## Residuals      196 15964.7   81.45                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $math
## Analysis of Variance Table
## 
## Response: x$value
##                 Df  Sum Sq Mean Sq F value   Pr(>F)    
## factor(x$race)   3  1842.1  614.05  7.7033 6.84e-05 ***
## Residuals      196 15623.7   79.71                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $science
## Analysis of Variance Table
## 
## Response: x$value
##                 Df  Sum Sq Mean Sq F value    Pr(>F)    
## factor(x$race)   3  3169.5 1056.51  13.063 8.505e-08 ***
## Residuals      191 15447.2   80.88                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $socst
## Analysis of Variance Table
## 
## Response: x$value
##                 Df  Sum Sq Mean Sq F value  Pr(>F)  
## factor(x$race)   3   943.9  314.63   2.804 0.04098 *
## Residuals      196 21992.3  112.21                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

6.2.1 Plot

datl %>% 
  group_by(race, variable) %>%
  summarise(group_MEAN = mean(value, na.rm = TRUE),
            group_SE = sd(value, na.rm = TRUE) / sqrt(sum(!is.na(value)))) %>%
  ggplot(aes(x = race, y = group_MEAN)) +
  geom_point(aes(shape = race), size = 4) +
  geom_errorbar(aes(ymax = group_MEAN + 1.96*group_SE,              
                    ymin = group_MEAN - 1.96*group_SE), width=.3, size=.4) +
  labs(x="Variable", y="Score")+
  facet_wrap(. ~ variable, nrow = 1) + 
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90))
## `summarise()` has grouped output by 'race'. You can override using the `.groups` argument.

6.3 Q(c)

  • Perform all pairwise simple regressions for these variables: read, write, math, science, and socst.
dat3<-dat[ ,c(7:11)]

lapply(dat3, function(x){ 
  coefficients( lm(cbind(dat3$read, 
                         dat3$write, 
                         dat3$math, 
                         dat3$science, 
                         dat3$socst)~x, data=dat3)) 
  } )
## $read
##                      [,1]       [,2]       [,3]       [,4]       [,5]
## (Intercept) -8.141284e-15 24.4095462 21.2425264 20.6602830 18.5561875
## x            1.000000e+00  0.5425249  0.6009356  0.5998076  0.6476622
## 
## $write
##                   [,1]          [,2]       [,3]       [,4]       [,5]
## (Intercept) 18.7309118 -8.955412e-14 20.6572869 21.1309429 16.5279046
## x            0.6336486  1.000000e+00  0.6055514  0.5843927  0.6791647
## 
## $math
##                   [,1]       [,2] [,3]     [,4]       [,5]
## (Intercept) 14.5396491 20.2651022    0 17.49497 19.9529454
## x            0.7148764  0.6167729    1  0.65494  0.6155894
## 
## $science
##                   [,1]       [,2]       [,3]          [,4]       [,5]
## (Intercept) 18.1571108 24.3565993 21.3916641 -8.141284e-15 26.1686196
## x            0.6540264  0.5455811  0.6003186  1.000000e+00  0.5034689
## 
## $socst
##                   [,1]      [,2]       [,3]       [,4]         [,5]
## (Intercept) 21.5368195 25.229771 28.1291585 30.1197077 1.628257e-14
## x            0.5845412  0.524823  0.4670406  0.4167311 1.000000e+00

7 Homework 3

The formula

\[P = L (\frac{r}{1-(1+r)^{-M}})\] describes the payment you have to make per month for M number of months if you take out a loan of L amount today at a monthly interest rate of r.

Task: Compute how much you will have to pay per month for 10, 15, 20, 25, or 30 years if you borrow NT$5,000,000, 10,000,000, or 15,000,000 from a bank that charges you 2%, 5%, or 7% for the monthly interest rate.

7.1 function

interest <- function(L, r, M) {
  P <- L * (r / (1 - (1 + r)^(-M)))
  return(P)}

7.2 Output

data.frame(L = rep(c(5*10^6, 10*10^6, 15*10^6), each = 15),
           r = rep(c(2, 5, 7) / 100, each = 5, 3),
           M = rep(c(10, 15, 20, 25, 30)*12, 9)) |>
  mutate(P = round(mapply(interest, L, r, M), digits = 2))|>
  DT::datatable(rownames=FALSE, options=list(pageLength=5))

8 Homework 4

Task: Modify this R script to create a function to compute the c-statistic illustrated with the data set in the article: Tryon, W.W. (1984). A simplified time-series analysis for evaluating treatment interventions. Journal of Applied Behavioral Analysis, 34(4), 230-233.

# c-stat example

# read in data 
dat <- read.table("cstat.txt", header=T)

str(dat)
## 'data.frame':    42 obs. of  1 variable:
##  $ nc: int  28 46 39 45 24 20 35 37 36 40 ...
head(dat)
##   nc
## 1 28
## 2 46
## 3 39
## 4 45
## 5 24
## 6 20
# plot figure 1
plot(1:42, dat[,1], xlab="Observations", ylab="Number of Children")
lines(1:42, dat[,1])
abline(v=10, lty=2)
abline(v=32, lty=2)
segments(1, mean(dat[1:10,1]),10, mean(dat[1:10,1]),col="red")
segments(11, mean(dat[11:32,1]),32, mean(dat[11:32,1]),col="red")
segments(33, mean(dat[33:42,1]),42, mean(dat[33:42,1]),col="red")

8.1 first baseline phase

# calculate c-stat for first baseline phase
cden <- 1-(sum(diff(dat[1:10,1])^2)/(2*(10-1)*var(dat[1:10,1])))

sc <- sqrt((10-2)/((10-1)*(10+1)))

pval <- 1-pnorm(cden/sc)
pval
## [1] 0.2866238
f_cstat <- function(x) {
  n <- length(x)
  cden <- 1 - (sum(diff(x)^2)/(2*(n-1)*var(x)))
  return(cden)
}

f_sc <- function(x) {
  n <- length(x)
  sc <- sqrt((n-2) / ((n-1) * (n+1)))
  return(sc)
}

f_pval <- function(x) {
  cden <- f_cstat(x)
  sc <- f_sc(x)
  return(1 - pnorm(cden / sc))
}

f_cstat(dat[1:10, 1])
## [1] 0.1601208
f_sc(dat[1:10, 1])
## [1] 0.2842676
f_pval(dat[1:10, 1])
## [1] 0.2866238

8.2 first baseline plus group tokens

# calculate c-stat for first baseline plus group tokens
n <- 32
cden <- 1-(sum(diff(dat[1:n,1])^2)/(2*(n-1)*var(dat[1:n,1])))
sc <- sqrt((n-2)/((n-1)*(n+1)))
pval <- 1-pnorm(cden/sc)
list(z=cden/sc,pvalue=pval)
## $z
## [1] 3.879054
## 
## $pvalue
## [1] 5.243167e-05
fgroup_cstat <- function(x) {
  n <- length(x)
  cden <- 1 - (sum(diff(x)^2)/(2*(n-1)*var(x)))
  sc <- sqrt((n-2)/((n-1)*(n+1)))
  pval <- 1-pnorm(cden/sc)
  return(list(c.stat = cden, sc = sc, z = cden/sc, p.value = pval))
}

fgroup_cstat(dat[1:32, 1])
## $c.stat
## [1] 0.6642762
## 
## $sc
## [1] 0.1712469
## 
## $z
## [1] 3.879054
## 
## $p.value
## [1] 5.243167e-05