The following R script uses Cushings{MASS} to demonstrates several ways to achieve the same objective in R. Explain the advantages or disadvantages of each method.
# tools
#install.packages("pacman")
library(pacman)
::p_load(MASS, tidyverse) pacman
# take a look
Cushings
## Tetrahydrocortisone Pregnanetriol Type
## a1 3.1 11.70 a
## a2 3.0 1.30 a
## a3 1.9 0.10 a
## a4 3.8 0.04 a
## a5 4.1 1.10 a
## a6 1.9 0.40 a
## b1 8.3 1.00 b
## b2 3.8 0.20 b
## b3 3.9 0.60 b
## b4 7.8 1.20 b
## b5 9.1 0.60 b
## b6 15.4 3.60 b
## b7 7.7 1.60 b
## b8 6.5 0.40 b
## b9 5.7 0.40 b
## b10 13.6 1.60 b
## c1 10.2 6.40 c
## c2 9.2 7.90 c
## c3 9.6 3.10 c
## c4 53.8 2.50 c
## c5 15.8 7.60 c
## u1 5.1 0.40 u
## u2 12.9 5.00 u
## u3 13.0 0.80 u
## u4 2.6 0.10 u
## u5 30.0 0.10 u
## u6 20.5 0.80 u
#
names(Cushings)
## [1] "Tetrahydrocortisone" "Pregnanetriol" "Type"
str(Cushings)
## 'data.frame': 27 obs. of 3 variables:
## $ Tetrahydrocortisone: num 3.1 3 1.9 3.8 4.1 1.9 8.3 3.8 3.9 7.8 ...
## $ Pregnanetriol : num 11.7 1.3 0.1 0.04 1.1 0.4 1 0.2 0.6 1.2 ...
## $ Type : Factor w/ 4 levels "a","b","c","u": 1 1 1 1 1 1 2 2 2 2 ...
# method 1
aggregate( . ~ Type, data = Cushings, mean)
## Type Tetrahydrocortisone Pregnanetriol
## 1 a 2.966667 2.44
## 2 b 8.180000 1.12
## 3 c 19.720000 5.50
## 4 u 14.016667 1.20
#優點:用公式formula,code可以很簡潔!
#缺點:但似乎沒那麼直觀,要記住那個點"."是代表其他所有變數。否則我就會寫成aggregate( Tetrahydrocortisone + Pregnanetriol ~ Type, data = Cushings, mean),但這樣的表達方式無效。
# method 2
#function(x) apply(x, 2, mean):apply()將FUN=mean應用在矩陣x的每一欄(2 is the column index)
#sapply(x,function(x) apply(x, 2, mean)):sapply()將FUN=function(x) apply(x, 2, mean)應用在x每一個元素上。
#split(Cushings[,-3], Cushings$Type)是依據Type分割Cushings[,-3]
sapply(split(Cushings[,-3], Cushings$Type), function(x) apply(x, 2, mean))
## a b c u
## Tetrahydrocortisone 2.966667 8.18 19.72 14.01667
## Pregnanetriol 2.440000 1.12 5.50 1.20000
#優點:簡潔
#缺點:不直觀。我知道sapply()是依據Type=a、b、c、u逐行逐列計算mean,但不會聯想到要寫function(x),我就會寫成 sapply(split(Cushings[,-3], Cushings$Type), mean)
# method 3
#do.call(what, args, quote = FALSE, envir = parent.frame())
#what可以是function,或是文字字串表示要R做的事情
#args=a list of arguments to the function call.
#If quote is FALSE, the default, then the arguments are evaluated (in the calling environment, not in envir).
#envir=an environment within which to evaluate the call.
do.call("rbind", as.list(
by(Cushings, list(Cushings$Type), function(x) {
<- subset(x, select = -Type)
y apply(y, 2, mean)
} )))
## Tetrahydrocortisone Pregnanetriol
## a 2.966667 2.44
## b 8.180000 1.12
## c 19.720000 5.50
## u 14.016667 1.20
# 缺點:依據Type分割dataset,然後rbind 各子集subset的mean。主要步驟都跟前幾個方法相同,但是它的語法串接好多個function,相當複雜。
%>%
Cushings group_by(Type) %>%
summarize( t_m = mean(Tetrahydrocortisone), p_m = mean(Pregnanetriol))
## # A tibble: 4 x 3
## Type t_m p_m
## <fct> <dbl> <dbl>
## 1 a 2.97 2.44
## 2 b 8.18 1.12
## 3 c 19.7 5.5
## 4 u 14.0 1.2
#優點:group_by資料分群,summarize變數平均,兩步驟完成,符合大腦直線思考的方式,比較容易使用。
# nest() creates a list-column of data frames
# mutate() creates new variables:avg、res_1、res_2
# map() Apply a function to each element of a list or atomic vector
# map_dbl() return vectors of the corresponding type (or die trying)
%>%
Cushings nest(-Type) %>%
mutate(avg = map(data, ~ apply(., 2, mean)),
res_1 = map_dbl(avg, "Tetrahydrocortisone"),
res_2 = map_dbl(avg, "Pregnanetriol"))
## Warning: All elements of `...` must be named.
## Did you want `data = c(Tetrahydrocortisone, Pregnanetriol)`?
## # A tibble: 4 x 5
## Type data avg res_1 res_2
## <fct> <list> <list> <dbl> <dbl>
## 1 a <tibble [6 x 2]> <dbl [2]> 2.97 2.44
## 2 b <tibble [10 x 2]> <dbl [2]> 8.18 1.12
## 3 c <tibble [5 x 2]> <dbl [2]> 19.7 5.5
## 4 u <tibble [6 x 2]> <dbl [2]> 14.0 1.2
# 優點:新增三個變數,每個元素都計算平均,這樣的思考很直接,但是在語法的使用上,我個人想不到 map(apply()) 這樣的組合方式。
# 缺點:多出兩個空白column。
Use the data in the high schools example to solve the following problems:
test if any pairs of the five variables: read, write, math, science, and socst, are different in means.
test if the 4 different ethnic groups have the same mean scores for each of the 5 variables (individually): read, write, math, science, and socst.
Perform all pairwise simple regressions for these variables: read, write, math, science, and socst.
The data set consists of measurements on 11 variables for a sample of 200 students.
Source: UCLA Academic Technology Service: R class note
Column 1: Student ID
Column 2: Gender, M = Male, F = Female
Column 3: Race
Column 4: Socioeconomic status
Column 5: School type
Column 6: Program type
Column 7: Reading score
Column 8: Writing score
Column 9: Math score
Column 10: Science score
Column 11: Social science studies score
# tools
library(dplyr)
library(MASS)
library (reshape)
##
## 載入套件:'reshape'
## 下列物件被遮斷自 'package:dplyr':
##
## rename
## 下列物件被遮斷自 'package:tidyr':
##
## expand, smiths
# input data
<- read.table("hs0.txt", header = T)
dta2
# take a look
head(dta2)
## id female race ses schtyp prog read write math science socst
## 1 70 male white low public general 57 52 41 47 57
## 2 121 female white middle public vocation 68 59 53 63 61
## 3 86 male white high public general 44 33 54 58 31
## 4 141 male white high public vocation 63 44 47 53 56
## 5 172 male white middle public academic 47 52 57 53 61
## 6 113 male white middle public academic 44 52 51 63 61
str(dta2)
## 'data.frame': 200 obs. of 11 variables:
## $ id : int 70 121 86 141 172 113 50 11 84 48 ...
## $ female : chr "male" "female" "male" "male" ...
## $ race : chr "white" "white" "white" "white" ...
## $ ses : chr "low" "middle" "high" "high" ...
## $ schtyp : chr "public" "public" "public" "public" ...
## $ prog : chr "general" "vocation" "general" "vocation" ...
## $ read : int 57 68 44 63 47 44 50 34 63 57 ...
## $ write : int 52 59 33 44 52 52 59 46 57 55 ...
## $ math : int 41 53 54 47 57 51 42 45 54 52 ...
## $ science: int 47 63 58 53 53 63 53 39 58 NA ...
## $ socst : int 57 61 31 56 61 61 61 36 51 51 ...
# wide format to long
<- reshape(dta2,
dta2_long idvar="id",
varying=list(7:11),
times=c("read", "write", "math", "science", "socst"),
v.names="scores",
direction="long")
head(dta2_long)
## id female race ses schtyp prog time scores
## 70.read 70 male white low public general read 57
## 121.read 121 female white middle public vocation read 68
## 86.read 86 male white high public general read 44
## 141.read 141 male white high public vocation read 63
## 172.read 172 male white middle public academic read 47
## 113.read 113 male white middle public academic read 44
# pairwise.t.test
pairwise.t.test(dta2_long$scores,dta2_long$time, p.adjust = "none")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: dta2_long$scores and dta2_long$time
##
## math read science socst
## read 0.68 - - -
## science 0.47 0.76 - -
## socst 0.81 0.86 0.63 -
## write 0.90 0.58 0.39 0.71
##
## P value adjustment method: none
pairwise.t.test(dta2_long$scores,dta2_long$time, p.adjust = "none",pool.sd = FALSE)
##
## Pairwise comparisons using t tests with non-pooled SD
##
## data: dta2_long$scores and dta2_long$time
##
## math read science socst
## read 0.67 - - -
## science 0.45 0.76 - -
## socst 0.81 0.87 0.64 -
## write 0.89 0.58 0.38 0.72
##
## P value adjustment method: none
# 比較四個種族五個科目平均分數是否有差異
lapply(split(dta2_long, dta2_long$time), function(x) anova(lm(x$score~factor(x$race))))
## $math
## Analysis of Variance Table
##
## Response: x$score
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(x$race) 3 1842.1 614.05 7.7033 6.84e-05 ***
## Residuals 196 15623.7 79.71
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $read
## Analysis of Variance Table
##
## Response: x$score
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(x$race) 3 1749.8 583.27 5.9637 0.0006539 ***
## Residuals 196 19169.6 97.80
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $science
## Analysis of Variance Table
##
## Response: x$score
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(x$race) 3 3169.5 1056.51 13.063 8.505e-08 ***
## Residuals 191 15447.2 80.88
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $socst
## Analysis of Variance Table
##
## Response: x$score
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(x$race) 3 943.9 314.63 2.804 0.04098 *
## Residuals 196 21992.3 112.21
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $write
## Analysis of Variance Table
##
## Response: x$score
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(x$race) 3 1914.2 638.05 7.8334 5.785e-05 ***
## Residuals 196 15964.7 81.45
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
五個科目的平均分數在四個族群之間具有統計上的顯著差異。
The formula P = L (r/(1-(1+r)^(-M)) describes the payment you have to make per month for M number of months if you take out a loan of L amount today at a monthly interest rate of r.
Compute how much you will have to pay per month for 10, 15, 20, 25, or 30 years if you borrow NT$5,000,000, 10,000,000, or 15,000,000 from a bank that charges you 2%, 5%, or 7% for the monthly interest rate.
Modify this R script to create a function to compute the c-statistic illustrated with the data set in the article:Tryon, W.W. (1984). A simplified time-series analysis for evaluating treatment interventions. Journal of Applied Behavioral Analysis, 34(4), 230-233.
# read in data
<- read.table("cstat.txt", header=T) dta4
str(dta4)
## 'data.frame': 42 obs. of 1 variable:
## $ nc: int 28 46 39 45 24 20 35 37 36 40 ...
head(dta4)
## nc
## 1 28
## 2 46
## 3 39
## 4 45
## 5 24
## 6 20
plot(1:42, dta4[,1], xlab="Observations", ylab="Number of Children")
lines(1:42, dta4[,1])
abline(v=10, lty=2)
abline(v=32, lty=2)
segments(1, mean(dta4[1:10,1]),10, mean(dta4[1:10,1]),col="red")
segments(11, mean(dta4[11:32,1]),32, mean(dta4[11:32,1]),col="blue")
segments(33, mean(dta4[33:42,1]),42, mean(dta4[33:42,1]),col="green")
<- 1-(sum(diff(dta4[1:10,1])^2)/(2*(10-1)*var(dta4[1:10,1])))
cden <- sqrt((10-2)/((10-1)*(10+1)))
sc <- 1-pnorm(cden/sc)
pval pval
## [1] 0.2866238
<- 32
n <- 1-(sum(diff(dta4[1:n,1])^2)/(2*(n-1)*var(dta4[1:n,1])))
cden <- sqrt((n-2)/((n-1)*(n+1)))
sc <- 1-pnorm(cden/sc)
pval list(z=cden/sc,pvalue=pval)
## $z
## [1] 3.879054
##
## $pvalue
## [1] 5.243167e-05