hw1

The following R script uses Cushings{MASS} to demonstrates several ways to achieve the same objective in R. Explain the advantages or disadvantages of each method.

# Cushings example
#
library(pacman)

pacman::p_load(MASS, tidyverse)

method 1- aggregate by group to mean

aggregate( . ~ Type, data = Cushings, mean)
  Type Tetrahydrocortisone Pregnanetriol
1    a            2.966667          2.44
2    b            8.180000          1.12
3    c           19.720000          5.50
4    u           14.016667          1.20

method 2- split data and sapply each data to mean

# 若只寫mean(x)會不知道要row 還是column mean
sapply(split(Cushings[,-3], Cushings$Type), function(x) apply(x, 2, mean))
                           a    b     c        u
Tetrahydrocortisone 2.966667 8.18 19.72 14.01667
Pregnanetriol       2.440000 1.12  5.50  1.20000
# 可用colMeans取代apply(x, 2, mean)
sapply(split(Cushings[,-3], Cushings$Type), function(x) colMeans(x))
                           a    b     c        u
Tetrahydrocortisone 2.966667 8.18 19.72 14.01667
Pregnanetriol       2.440000 1.12  5.50  1.20000
  • 用split拆成小dataframe
  • 用sapply(data, function)計算

method 3- use do.call put rbind function into multiple subdata

# by(dta, indices, funtion)
do.call("rbind", as.list(
  by(Cushings, list(Cushings$Type), function(x) {
    y <- subset(x, select =  -Type)
    apply(y, 2, mean)
  }
)))
  Tetrahydrocortisone Pregnanetriol
a            2.966667          2.44
b            8.180000          1.12
c           19.720000          5.50
u           14.016667          1.20
  • 用by function 拆成四個小dataframe並計算除去type外的colMeans
  • 再透過do.call將rbind function 運用在前面的四個結果(list)
# 用colmeans取代前面function中的複雜方式
do.call("rbind", by(Cushings[,-3], Cushings[,"Type"], colMeans))
  Tetrahydrocortisone Pregnanetriol
a            2.966667          2.44
b            8.180000          1.12
c           19.720000          5.50
u           14.016667          1.20
# 同樣的方式,但rbind結果樣式不一樣
lapply(by(Cushings[,-3], Cushings[,"Type"], colMeans), rbind)
$a
     Tetrahydrocortisone Pregnanetriol
[1,]            2.966667          2.44

$b
     Tetrahydrocortisone Pregnanetriol
[1,]                8.18          1.12

$c
     Tetrahydrocortisone Pregnanetriol
[1,]               19.72           5.5

$u
     Tetrahydrocortisone Pregnanetriol
[1,]            14.01667           1.2

do.call的做法應該是: rbind( subdata[[1]], subdata[[2]])

lapply的做法應該是:list( rbind( subdata[[1]]) , rbind( subdata[[2]]) )

do.call才是我們想要的樣子

method 4- groupby, summarize

Cushings %>%
 group_by(Type) %>%
 summarize( t_m = mean(Tetrahydrocortisone), p_m = mean(Pregnanetriol))
# A tibble: 4 x 3
  Type    t_m   p_m
  <fct> <dbl> <dbl>
1 a      2.97  2.44
2 b      8.18  1.12
3 c     19.7   5.5 
4 u     14.0   1.2 

method 5-nested???

有看沒有懂?

Cushings %>%
 nest(-Type) %>%
 mutate(avg = map(data, ~ apply(., 2, mean)), 
        res_1 = map_dbl(avg, "Tetrahydrocortisone"), 
        res_2 = map_dbl(avg, "Pregnanetriol")) 
# A tibble: 4 x 5
  Type  data              avg       res_1 res_2
  <fct> <list>            <list>    <dbl> <dbl>
1 a     <tibble [6 x 2]>  <dbl [2]>  2.97  2.44
2 b     <tibble [10 x 2]> <dbl [2]>  8.18  1.12
3 c     <tibble [5 x 2]>  <dbl [2]> 19.7   5.5 
4 u     <tibble [6 x 2]>  <dbl [2]> 14.0   1.2 

hw2

Use the data in the high schools example to solve the following problems:

dta2 <- read.table("hs0.txt", sep="\t", quote="", header=T, as.is=T) #as.is=T 為 chr; as.is=T 為 factor

str(dta2)
'data.frame':   200 obs. of  11 variables:
 $ id     : int  70 121 86 141 172 113 50 11 84 48 ...
 $ female : chr  "male" "female" "male" "male" ...
 $ race   : chr  "white" "white" "white" "white" ...
 $ ses    : chr  "low" "middle" "high" "high" ...
 $ schtyp : chr  "public" "public" "public" "public" ...
 $ prog   : chr  "general" "vocation" "general" "vocation" ...
 $ read   : int  57 68 44 63 47 44 50 34 63 57 ...
 $ write  : int  52 59 33 44 52 52 59 46 57 55 ...
 $ math   : int  41 53 54 47 57 51 42 45 54 52 ...
 $ science: int  47 63 58 53 53 63 53 39 58 NA ...
 $ socst  : int  57 61 31 56 61 61 61 36 51 51 ...

test any pairs of the five variables

  1. test if any pairs of the five variables: read, write, math, science, and socst, are different in means.
# Apply a Function to Multiple List or Vector  Arguments
## x=dta[,7:11] y=dta[,11:7]
mapply(function(x, y) lm(y ~ x), dta2[,7:11], dta2[,11:7]) |> sapply(coef) |> round(2)
             read write math science socst
(Intercept) 18.42 21.13    0   24.36 21.13
x            0.65  0.58    1    0.55  0.59
## 實際上執行的欄位配對
mapply(c, names(dta2[,7:11]), names(dta2[,11:7]))
     read    write     math   science   socst  
[1,] "read"  "write"   "math" "science" "socst"
[2,] "socst" "science" "math" "write"   "read" 

好像不是這個,但感覺跟c小題一樣?

test race and 5 variables

  1. test if the 4 different ethnic groups have the same mean scores for each of the 5 variables (individually): read, write, math, science, and socst.
# race x 5 variables
by(dta2[,7:11], dta2["race"], colMeans, na.rm=T)
race: african-amer
    read    write     math  science    socst 
46.80000 48.20000 46.75000 42.42105 49.45000 
------------------------------------------------------------ 
race: asian
    read    write     math  science    socst 
51.90909 58.00000 57.27273 51.45455 51.00000 
------------------------------------------------------------ 
race: hispanic
    read    write     math  science    socst 
46.66667 46.45833 47.41667 46.21739 47.79167 
------------------------------------------------------------ 
race: white
    read    write     math  science    socst 
53.92414 54.05517 53.97241 54.14789 53.68276 
# anova test
paste0(names(dta2[7:11]), " ~ race")|> # string: read ~ Race,write ~ Race...
lapply(FUN=formula) |> #give it function to become formula 
lapply(function(f) lm(f, data=dta2)) |> #呈現5個lm結果
lapply(anova) ##呈現5個anova結果
[[1]]
Analysis of Variance Table

Response: read
           Df  Sum Sq Mean Sq F value    Pr(>F)    
race        3  1749.8  583.27  5.9637 0.0006539 ***
Residuals 196 19169.6   97.80                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

[[2]]
Analysis of Variance Table

Response: write
           Df  Sum Sq Mean Sq F value    Pr(>F)    
race        3  1914.2  638.05  7.8334 5.785e-05 ***
Residuals 196 15964.7   81.45                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

[[3]]
Analysis of Variance Table

Response: math
           Df  Sum Sq Mean Sq F value   Pr(>F)    
race        3  1842.1  614.05  7.7033 6.84e-05 ***
Residuals 196 15623.7   79.71                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

[[4]]
Analysis of Variance Table

Response: science
           Df  Sum Sq Mean Sq F value    Pr(>F)    
race        3  3169.5 1056.51  13.063 8.505e-08 ***
Residuals 191 15447.2   80.88                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

[[5]]
Analysis of Variance Table

Response: socst
           Df  Sum Sq Mean Sq F value  Pr(>F)  
race        3   943.9  314.63   2.804 0.04098 *
Residuals 196 21992.3  112.21                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Tidy approach
dta2 |> 
 gather(key=Test, value=Score, 7:11) |> #轉long form
 group_by(Test) |> 
 do({res <- lm(Score ~ race, data=.)
 broom::tidy(anova(res))
 }
 )
# A tibble: 10 x 7
# Groups:   Test [5]
   Test    term         df  sumsq meansq statistic       p.value
   <chr>   <chr>     <int>  <dbl>  <dbl>     <dbl>         <dbl>
 1 math    race          3  1842.  614.       7.70  0.0000684   
 2 math    Residuals   196 15624.   79.7     NA    NA           
 3 read    race          3  1750.  583.       5.96  0.000654    
 4 read    Residuals   196 19170.   97.8     NA    NA           
 5 science race          3  3170. 1057.      13.1   0.0000000850
 6 science Residuals   191 15447.   80.9     NA    NA           
 7 socst   race          3   944.  315.       2.80  0.0410      
 8 socst   Residuals   196 21992.  112.      NA    NA           
 9 write   race          3  1914.  638.       7.83  0.0000578   
10 write   Residuals   196 15965.   81.5     NA    NA           

all pairwise simple regressions

  1. Perform all pairwise simple regressions for these variables: read, write, math, science, and socst.
# lm test
paste0(rep(c("read~","write~","math~","science~","socst~"), each = 5), names(dta2[7:11]))|> 
lapply(FUN=formula) |> #give it function to become formula 
lapply(function(f) lm(f, data=dta2)) |> lapply(coef) 
[[1]]
(Intercept) 
      52.23 

[[2]]
(Intercept)       write 
   18.16215     0.64553 

[[3]]
(Intercept)        math 
  14.072537    0.724807 

[[4]]
(Intercept)     science 
 18.1571108   0.6540264 

[[5]]
(Intercept)       socst 
 21.1259457   0.5935322 

[[6]]
(Intercept)        read 
 23.9594444   0.5517051 

[[7]]
(Intercept) 
     52.775 

[[8]]
(Intercept)        math 
 19.8872379   0.6247082 

[[9]]
(Intercept)     science 
 24.3565993   0.5455811 

[[10]]
(Intercept)       socst 
 24.7923365   0.5339693 

[[11]]
(Intercept)        read 
 21.0381578   0.6051473 

[[12]]
(Intercept)       write 
 20.4377529   0.6102747 

[[13]]
(Intercept) 
     52.645 

[[14]]
(Intercept)     science 
 21.3916641   0.6003186 

[[15]]
(Intercept)       socst 
 27.7456280   0.4751335 

[[16]]
(Intercept)        read 
 20.6602830   0.5998076 

[[17]]
(Intercept)       write 
 21.1309429   0.5843927 

[[18]]
(Intercept)        math 
   17.49497     0.65494 

[[19]]
(Intercept) 
   51.91795 

[[20]]
(Intercept)       socst 
 30.1197077   0.4167311 

[[21]]
(Intercept)        read 
 18.4161841   0.6507527 

[[22]]
(Intercept)       write 
 16.2535476   0.6850109 

[[23]]
(Intercept)        math 
 19.5572360   0.6239484 

[[24]]
(Intercept)     science 
 26.1686196   0.5034689 

[[25]]
(Intercept) 
     52.405 
paste0(rep(c("read~","write~","math~","science~","socst~"), each = 5), names(dta2[7:11]))
 [1] "read~read"       "read~write"      "read~math"       "read~science"   
 [5] "read~socst"      "write~read"      "write~write"     "write~math"     
 [9] "write~science"   "write~socst"     "math~read"       "math~write"     
[13] "math~math"       "math~science"    "math~socst"      "science~read"   
[17] "science~write"   "science~math"    "science~science" "science~socst"  
[21] "socst~read"      "socst~write"     "socst~math"      "socst~science"  
[25] "socst~socst"    

hw3

The formula P = L (r/(1-(1+r)^(-M)) describes the payment you have to make per month for M number of months if you take out a loan of L amount today at a monthly interest rate of r. Compute how much you will have to pay per month for 10, 15, 20, 25, or 30 years if you borrow NT$5,000,000, 10,000,000, or 15,000,000 from a bank that charges you 2%, 5%, or 7% for the monthly interest rate.

create function

MIR <- function(L,r,M){
  P <- L * (r / (1 -(1 + r) ^ (-M)))
  return(P)
}
data.frame(L = rep(c(5*10^6, 10*10^6, 15*10^6), each = 15),
           r = rep(c(2, 5, 7) / 100, each = 5, 3),
           M = rep(c(10, 15, 20, 25, 30)*12, 9)) |>
  mutate(P = round(mapply(MIR, L, r, M), digits = 2))
         L    r   M         P
1  5.0e+06 0.02 120  110240.5
2  5.0e+06 0.02 180  102913.7
3  5.0e+06 0.02 240  100870.4
4  5.0e+06 0.02 300  100263.7
5  5.0e+06 0.02 360  100080.2
6  5.0e+06 0.05 120  250718.6
7  5.0e+06 0.05 180  250038.4
8  5.0e+06 0.05 240  250002.0
9  5.0e+06 0.05 300  250000.1
10 5.0e+06 0.05 360  250000.0
11 5.0e+06 0.07 120  350104.3
12 5.0e+06 0.07 180  350001.8
13 5.0e+06 0.07 240  350000.0
14 5.0e+06 0.07 300  350000.0
15 5.0e+06 0.07 360  350000.0
16 1.0e+07 0.02 120  220481.0
17 1.0e+07 0.02 180  205827.4
18 1.0e+07 0.02 240  201740.8
19 1.0e+07 0.02 300  200527.4
20 1.0e+07 0.02 360  200160.4
21 1.0e+07 0.05 120  501437.2
22 1.0e+07 0.05 180  500076.7
23 1.0e+07 0.05 240  500004.1
24 1.0e+07 0.05 300  500000.2
25 1.0e+07 0.05 360  500000.0
26 1.0e+07 0.07 120  700208.5
27 1.0e+07 0.07 180  700003.6
28 1.0e+07 0.07 240  700000.1
29 1.0e+07 0.07 300  700000.0
30 1.0e+07 0.07 360  700000.0
31 1.5e+07 0.02 120  330721.5
32 1.5e+07 0.02 180  308741.0
33 1.5e+07 0.02 240  302611.2
34 1.5e+07 0.02 300  300791.1
35 1.5e+07 0.02 360  300240.7
36 1.5e+07 0.05 120  752155.7
37 1.5e+07 0.05 180  750115.1
38 1.5e+07 0.05 240  750006.2
39 1.5e+07 0.05 300  750000.3
40 1.5e+07 0.05 360  750000.0
41 1.5e+07 0.07 120 1050312.8
42 1.5e+07 0.07 180 1050005.4
43 1.5e+07 0.07 240 1050000.1
44 1.5e+07 0.07 300 1050000.0
45 1.5e+07 0.07 360 1050000.0

hw4

Modify this R script to create a function to compute the c-statistic illustrated with the data set in the article: Tryon, W.W. (1984). A simplified time-series analysis for evaluating treatment interventions. Journal of Applied Behavioral Analysis, 34(4), 230-233.

# c-stat example
# read in data 
dta4 <- read.table("cstat.txt", header=T)

str(dta4)
'data.frame':   42 obs. of  1 variable:
 $ nc: int  28 46 39 45 24 20 35 37 36 40 ...
head(dta4)
  nc
1 28
2 46
3 39
4 45
5 24
6 20
# plot figure 1
plot(1:42, dta4[,1], xlab="Observations", ylab="Number of Children")
lines(1:42, dta4[,1])
abline(v=10, lty=2)
abline(v=32, lty=2)
segments(1, mean(dta4[1:10,1]),10, mean(dta4[1:10,1]),col="red")
segments(11, mean(dta4[11:32,1]),32, mean(dta4[11:32,1]),col="red")
segments(33, mean(dta4[33:42,1]),42, mean(dta4[33:42,1]),col="red")

# calculate c-stat for first baseline phase
cden <- 1-(sum(diff(dta4[1:10,1])^2)/(2*(10-1)*var(dta4[1:10,1])))
sc <- sqrt((10-2)/((10-1)*(10+1)))
pval <- 1-pnorm(cden/sc)
pval
[1] 0.2866238
# calculate c-stat for first baseline plus group tokens
n <- 32
cden <- 1-(sum(diff(dta4[1:n,1])^2)/(2*(n-1)*var(dta4[1:n,1])))
sc <- sqrt((n-2)/((n-1)*(n+1)))
pval <- 1-pnorm(cden/sc)
list(z=cden/sc,pvalue=pval)
$z
[1] 3.879054

$pvalue
[1] 5.243167e-05