The following R script uses Cushings{MASS} to demonstrates several ways to achieve the same objective in R. Explain the advantages or disadvantages of each method.
# Cushings example
#
library(pacman)
::p_load(MASS, tidyverse) pacman
aggregate( . ~ Type, data = Cushings, mean)
Type Tetrahydrocortisone Pregnanetriol
1 a 2.966667 2.44
2 b 8.180000 1.12
3 c 19.720000 5.50
4 u 14.016667 1.20
# 若只寫mean(x)會不知道要row 還是column mean
sapply(split(Cushings[,-3], Cushings$Type), function(x) apply(x, 2, mean))
a b c u
Tetrahydrocortisone 2.966667 8.18 19.72 14.01667
Pregnanetriol 2.440000 1.12 5.50 1.20000
# 可用colMeans取代apply(x, 2, mean)
sapply(split(Cushings[,-3], Cushings$Type), function(x) colMeans(x))
a b c u
Tetrahydrocortisone 2.966667 8.18 19.72 14.01667
Pregnanetriol 2.440000 1.12 5.50 1.20000
# by(dta, indices, funtion)
do.call("rbind", as.list(
by(Cushings, list(Cushings$Type), function(x) {
<- subset(x, select = -Type)
y apply(y, 2, mean)
} )))
Tetrahydrocortisone Pregnanetriol
a 2.966667 2.44
b 8.180000 1.12
c 19.720000 5.50
u 14.016667 1.20
# 用colmeans取代前面function中的複雜方式
do.call("rbind", by(Cushings[,-3], Cushings[,"Type"], colMeans))
Tetrahydrocortisone Pregnanetriol
a 2.966667 2.44
b 8.180000 1.12
c 19.720000 5.50
u 14.016667 1.20
# 同樣的方式,但rbind結果樣式不一樣
lapply(by(Cushings[,-3], Cushings[,"Type"], colMeans), rbind)
$a
Tetrahydrocortisone Pregnanetriol
[1,] 2.966667 2.44
$b
Tetrahydrocortisone Pregnanetriol
[1,] 8.18 1.12
$c
Tetrahydrocortisone Pregnanetriol
[1,] 19.72 5.5
$u
Tetrahydrocortisone Pregnanetriol
[1,] 14.01667 1.2
do.call的做法應該是: rbind( subdata[[1]], subdata[[2]])
lapply的做法應該是:list( rbind( subdata[[1]]) , rbind( subdata[[2]]) )
do.call才是我們想要的樣子
%>%
Cushings group_by(Type) %>%
summarize( t_m = mean(Tetrahydrocortisone), p_m = mean(Pregnanetriol))
# A tibble: 4 x 3
Type t_m p_m
<fct> <dbl> <dbl>
1 a 2.97 2.44
2 b 8.18 1.12
3 c 19.7 5.5
4 u 14.0 1.2
有看沒有懂?
%>%
Cushings nest(-Type) %>%
mutate(avg = map(data, ~ apply(., 2, mean)),
res_1 = map_dbl(avg, "Tetrahydrocortisone"),
res_2 = map_dbl(avg, "Pregnanetriol"))
# A tibble: 4 x 5
Type data avg res_1 res_2
<fct> <list> <list> <dbl> <dbl>
1 a <tibble [6 x 2]> <dbl [2]> 2.97 2.44
2 b <tibble [10 x 2]> <dbl [2]> 8.18 1.12
3 c <tibble [5 x 2]> <dbl [2]> 19.7 5.5
4 u <tibble [6 x 2]> <dbl [2]> 14.0 1.2
Use the data in the high schools example to solve the following problems:
<- read.table("hs0.txt", sep="\t", quote="", header=T, as.is=T) #as.is=T 為 chr; as.is=T 為 factor
dta2
str(dta2)
'data.frame': 200 obs. of 11 variables:
$ id : int 70 121 86 141 172 113 50 11 84 48 ...
$ female : chr "male" "female" "male" "male" ...
$ race : chr "white" "white" "white" "white" ...
$ ses : chr "low" "middle" "high" "high" ...
$ schtyp : chr "public" "public" "public" "public" ...
$ prog : chr "general" "vocation" "general" "vocation" ...
$ read : int 57 68 44 63 47 44 50 34 63 57 ...
$ write : int 52 59 33 44 52 52 59 46 57 55 ...
$ math : int 41 53 54 47 57 51 42 45 54 52 ...
$ science: int 47 63 58 53 53 63 53 39 58 NA ...
$ socst : int 57 61 31 56 61 61 61 36 51 51 ...
# Apply a Function to Multiple List or Vector Arguments
## x=dta[,7:11] y=dta[,11:7]
mapply(function(x, y) lm(y ~ x), dta2[,7:11], dta2[,11:7]) |> sapply(coef) |> round(2)
read write math science socst
(Intercept) 18.42 21.13 0 24.36 21.13
x 0.65 0.58 1 0.55 0.59
## 實際上執行的欄位配對
mapply(c, names(dta2[,7:11]), names(dta2[,11:7]))
read write math science socst
[1,] "read" "write" "math" "science" "socst"
[2,] "socst" "science" "math" "write" "read"
好像不是這個,但感覺跟c小題一樣?
# race x 5 variables
by(dta2[,7:11], dta2["race"], colMeans, na.rm=T)
race: african-amer
read write math science socst
46.80000 48.20000 46.75000 42.42105 49.45000
------------------------------------------------------------
race: asian
read write math science socst
51.90909 58.00000 57.27273 51.45455 51.00000
------------------------------------------------------------
race: hispanic
read write math science socst
46.66667 46.45833 47.41667 46.21739 47.79167
------------------------------------------------------------
race: white
read write math science socst
53.92414 54.05517 53.97241 54.14789 53.68276
# anova test
paste0(names(dta2[7:11]), " ~ race")|> # string: read ~ Race,write ~ Race...
lapply(FUN=formula) |> #give it function to become formula
lapply(function(f) lm(f, data=dta2)) |> #呈現5個lm結果
lapply(anova) ##呈現5個anova結果
[[1]]
Analysis of Variance Table
Response: read
Df Sum Sq Mean Sq F value Pr(>F)
race 3 1749.8 583.27 5.9637 0.0006539 ***
Residuals 196 19169.6 97.80
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
[[2]]
Analysis of Variance Table
Response: write
Df Sum Sq Mean Sq F value Pr(>F)
race 3 1914.2 638.05 7.8334 5.785e-05 ***
Residuals 196 15964.7 81.45
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
[[3]]
Analysis of Variance Table
Response: math
Df Sum Sq Mean Sq F value Pr(>F)
race 3 1842.1 614.05 7.7033 6.84e-05 ***
Residuals 196 15623.7 79.71
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
[[4]]
Analysis of Variance Table
Response: science
Df Sum Sq Mean Sq F value Pr(>F)
race 3 3169.5 1056.51 13.063 8.505e-08 ***
Residuals 191 15447.2 80.88
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
[[5]]
Analysis of Variance Table
Response: socst
Df Sum Sq Mean Sq F value Pr(>F)
race 3 943.9 314.63 2.804 0.04098 *
Residuals 196 21992.3 112.21
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Tidy approach
|>
dta2 gather(key=Test, value=Score, 7:11) |> #轉long form
group_by(Test) |>
do({res <- lm(Score ~ race, data=.)
::tidy(anova(res))
broom
} )
# A tibble: 10 x 7
# Groups: Test [5]
Test term df sumsq meansq statistic p.value
<chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
1 math race 3 1842. 614. 7.70 0.0000684
2 math Residuals 196 15624. 79.7 NA NA
3 read race 3 1750. 583. 5.96 0.000654
4 read Residuals 196 19170. 97.8 NA NA
5 science race 3 3170. 1057. 13.1 0.0000000850
6 science Residuals 191 15447. 80.9 NA NA
7 socst race 3 944. 315. 2.80 0.0410
8 socst Residuals 196 21992. 112. NA NA
9 write race 3 1914. 638. 7.83 0.0000578
10 write Residuals 196 15965. 81.5 NA NA
# lm test
paste0(rep(c("read~","write~","math~","science~","socst~"), each = 5), names(dta2[7:11]))|>
lapply(FUN=formula) |> #give it function to become formula
lapply(function(f) lm(f, data=dta2)) |> lapply(coef)
[[1]]
(Intercept)
52.23
[[2]]
(Intercept) write
18.16215 0.64553
[[3]]
(Intercept) math
14.072537 0.724807
[[4]]
(Intercept) science
18.1571108 0.6540264
[[5]]
(Intercept) socst
21.1259457 0.5935322
[[6]]
(Intercept) read
23.9594444 0.5517051
[[7]]
(Intercept)
52.775
[[8]]
(Intercept) math
19.8872379 0.6247082
[[9]]
(Intercept) science
24.3565993 0.5455811
[[10]]
(Intercept) socst
24.7923365 0.5339693
[[11]]
(Intercept) read
21.0381578 0.6051473
[[12]]
(Intercept) write
20.4377529 0.6102747
[[13]]
(Intercept)
52.645
[[14]]
(Intercept) science
21.3916641 0.6003186
[[15]]
(Intercept) socst
27.7456280 0.4751335
[[16]]
(Intercept) read
20.6602830 0.5998076
[[17]]
(Intercept) write
21.1309429 0.5843927
[[18]]
(Intercept) math
17.49497 0.65494
[[19]]
(Intercept)
51.91795
[[20]]
(Intercept) socst
30.1197077 0.4167311
[[21]]
(Intercept) read
18.4161841 0.6507527
[[22]]
(Intercept) write
16.2535476 0.6850109
[[23]]
(Intercept) math
19.5572360 0.6239484
[[24]]
(Intercept) science
26.1686196 0.5034689
[[25]]
(Intercept)
52.405
paste0(rep(c("read~","write~","math~","science~","socst~"), each = 5), names(dta2[7:11]))
[1] "read~read" "read~write" "read~math" "read~science"
[5] "read~socst" "write~read" "write~write" "write~math"
[9] "write~science" "write~socst" "math~read" "math~write"
[13] "math~math" "math~science" "math~socst" "science~read"
[17] "science~write" "science~math" "science~science" "science~socst"
[21] "socst~read" "socst~write" "socst~math" "socst~science"
[25] "socst~socst"
The formula P = L (r/(1-(1+r)^(-M)) describes the payment you have to make per month for M number of months if you take out a loan of L amount today at a monthly interest rate of r. Compute how much you will have to pay per month for 10, 15, 20, 25, or 30 years if you borrow NT$5,000,000, 10,000,000, or 15,000,000 from a bank that charges you 2%, 5%, or 7% for the monthly interest rate.
<- function(L,r,M){
MIR <- L * (r / (1 -(1 + r) ^ (-M)))
P return(P)
}
data.frame(L = rep(c(5*10^6, 10*10^6, 15*10^6), each = 15),
r = rep(c(2, 5, 7) / 100, each = 5, 3),
M = rep(c(10, 15, 20, 25, 30)*12, 9)) |>
mutate(P = round(mapply(MIR, L, r, M), digits = 2))
L r M P
1 5.0e+06 0.02 120 110240.5
2 5.0e+06 0.02 180 102913.7
3 5.0e+06 0.02 240 100870.4
4 5.0e+06 0.02 300 100263.7
5 5.0e+06 0.02 360 100080.2
6 5.0e+06 0.05 120 250718.6
7 5.0e+06 0.05 180 250038.4
8 5.0e+06 0.05 240 250002.0
9 5.0e+06 0.05 300 250000.1
10 5.0e+06 0.05 360 250000.0
11 5.0e+06 0.07 120 350104.3
12 5.0e+06 0.07 180 350001.8
13 5.0e+06 0.07 240 350000.0
14 5.0e+06 0.07 300 350000.0
15 5.0e+06 0.07 360 350000.0
16 1.0e+07 0.02 120 220481.0
17 1.0e+07 0.02 180 205827.4
18 1.0e+07 0.02 240 201740.8
19 1.0e+07 0.02 300 200527.4
20 1.0e+07 0.02 360 200160.4
21 1.0e+07 0.05 120 501437.2
22 1.0e+07 0.05 180 500076.7
23 1.0e+07 0.05 240 500004.1
24 1.0e+07 0.05 300 500000.2
25 1.0e+07 0.05 360 500000.0
26 1.0e+07 0.07 120 700208.5
27 1.0e+07 0.07 180 700003.6
28 1.0e+07 0.07 240 700000.1
29 1.0e+07 0.07 300 700000.0
30 1.0e+07 0.07 360 700000.0
31 1.5e+07 0.02 120 330721.5
32 1.5e+07 0.02 180 308741.0
33 1.5e+07 0.02 240 302611.2
34 1.5e+07 0.02 300 300791.1
35 1.5e+07 0.02 360 300240.7
36 1.5e+07 0.05 120 752155.7
37 1.5e+07 0.05 180 750115.1
38 1.5e+07 0.05 240 750006.2
39 1.5e+07 0.05 300 750000.3
40 1.5e+07 0.05 360 750000.0
41 1.5e+07 0.07 120 1050312.8
42 1.5e+07 0.07 180 1050005.4
43 1.5e+07 0.07 240 1050000.1
44 1.5e+07 0.07 300 1050000.0
45 1.5e+07 0.07 360 1050000.0
Modify this R script to create a function to compute the c-statistic illustrated with the data set in the article: Tryon, W.W. (1984). A simplified time-series analysis for evaluating treatment interventions. Journal of Applied Behavioral Analysis, 34(4), 230-233.
# c-stat example
# read in data
<- read.table("cstat.txt", header=T)
dta4
str(dta4)
'data.frame': 42 obs. of 1 variable:
$ nc: int 28 46 39 45 24 20 35 37 36 40 ...
head(dta4)
nc
1 28
2 46
3 39
4 45
5 24
6 20
# plot figure 1
plot(1:42, dta4[,1], xlab="Observations", ylab="Number of Children")
lines(1:42, dta4[,1])
abline(v=10, lty=2)
abline(v=32, lty=2)
segments(1, mean(dta4[1:10,1]),10, mean(dta4[1:10,1]),col="red")
segments(11, mean(dta4[11:32,1]),32, mean(dta4[11:32,1]),col="red")
segments(33, mean(dta4[33:42,1]),42, mean(dta4[33:42,1]),col="red")
# calculate c-stat for first baseline phase
<- 1-(sum(diff(dta4[1:10,1])^2)/(2*(10-1)*var(dta4[1:10,1])))
cden <- sqrt((10-2)/((10-1)*(10+1)))
sc <- 1-pnorm(cden/sc)
pval pval
[1] 0.2866238
# calculate c-stat for first baseline plus group tokens
<- 32
n <- 1-(sum(diff(dta4[1:n,1])^2)/(2*(n-1)*var(dta4[1:n,1])))
cden <- sqrt((n-2)/((n-1)*(n+1)))
sc <- 1-pnorm(cden/sc)
pval list(z=cden/sc,pvalue=pval)
$z
[1] 3.879054
$pvalue
[1] 5.243167e-05