#tidyverse for data reshaping, car for Anova().
library(tidyverse); library(car)
## Warning: package 'car' was built under R version 3.5.3
#The following is required to calculate Type III SS when running ANOVA.
#See Note 0
#options("contrasts")
options(contrasts=c('contr.sum','contr.poly'))
paper <- c(1660,1820,1590,1440,1730,1680,1750,1720,1900,1570,1700,1900,1800,1770,2010,1580,1620,1690,1400,1590,1750,1900,1800,2010,1580,1690,1850,1720,1900,1590)
#H0: mean <= 1600
#Ha: mean > 1600
t.test(paper, mu=1600, alternative="greater")
##
## One Sample t-test
##
## data: paper
## t = 4.5171, df = 29, p-value = 4.843e-05
## alternative hypothesis: true mean is greater than 1600
## 95 percent confidence interval:
## 1677.149 Inf
## sample estimates:
## mean of x
## 1723.667
#The dataset provided by the books website and the data printed in the actual book are different.
#river <- read.csv("data/CH06/ex6-6.txt")
upstream <- c(5.2,4.8,5.1,5.0,4.9,4.8,5.0,4.7,4.7,5.0,4.6,5.2,5.0,4.9,4.7)
downstream <- c(3.2,3.4,3.7,3.9,3.6,3.8,3.9,3.6,4.1,3.3,4.5,3.7,3.9,3.8,3.7)
#Part A
#Equal variances will yield Pooled variance T test, whereas unequal will yield Satterthwaite.
#H0: dowmstream - upstream >= -.5
#Ha: downstream - upstream < -.5
t.test(downstream, upstream,
alternative="less", mu=-.5, conf.level=.01,
var.equal=T)
##
## Two Sample t-test
##
## data: downstream and upstream
## t = -6.9625, df = 28, p-value = 7.166e-08
## alternative hypothesis: true difference in means is less than -0.5
## 1 percent confidence interval:
## -Inf -1.402898
## sample estimates:
## mean of x mean of y
## 3.740000 4.906667
#Part B
shapiro.test(downstream)
##
## Shapiro-Wilk normality test
##
## data: downstream
## W = 0.95141, p-value = 0.5469
shapiro.test(upstream)
##
## Shapiro-Wilk normality test
##
## data: upstream
## W = 0.94347, p-value = 0.428
layout(mat = matrix(c(1,2), 2, 1), height = c(3,1)) #matrix(data, nrow, ncol)
par(mar=c(0, 3, 2, 1)) #margins for each plot
hist(downstream, xlim=c(3,5), col="pink", main="Distribution of Downstream")
boxplot(downstream, horizontal=T, outline=T, frame=F, col="green1", width=10, ylim=c(3,5), axes=F)
qqnorm(downstream)
qqline(downstream)
layout(mat = matrix(c(1,2), 2, 1), height = c(3,1))
par(mar=c(0, 3, 2, 1))
hist(upstream, xlim=c(4,6), col="pink", main="Distribution of Upstream")
boxplot(upstream, horizontal=T, outline=T, frame=F, col="green1", width=10, ylim=c(4,6), axes=F)
qqnorm(upstream)
qqline(upstream)
repairpersons <- read.csv("data/CH06/ex6-28.txt")
#Part A
#H0: After - Before >= 0
#Ha: After - Before < 0
t.test(repairpersons$X.After., repairpersons$X.Before.,
alternative="less", paired=T)
##
## Paired t-test
##
## data: repairpersons$X.After. and repairpersons$X.Before.
## t = -0.86098, df = 9, p-value = 0.2058
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf 2.917599
## sample estimates:
## mean of the differences
## -2.584
#Part B
#95% Confidence interval
t.test(repairpersons$X.After., repairpersons$X.Before., paired=T)
##
## Paired t-test
##
## data: repairpersons$X.After. and repairpersons$X.Before.
## t = -0.86098, df = 9, p-value = 0.4116
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -9.373261 4.205261
## sample estimates:
## mean of the differences
## -2.584
#Part C
repairpersons <- mutate(repairpersons, diff = X.After. - X.Before.)
shapiro.test(repairpersons$diff)
##
## Shapiro-Wilk normality test
##
## data: repairpersons$diff
## W = 0.9398, p-value = 0.5508
layout(mat = matrix(c(1,2), 2, 1), height = c(3,1))
par(mar=c(0, 3, 2, 1))
hist(repairpersons$diff, xlim=c(-17,19), col="pink", main="Distribution of After-Before")
boxplot(repairpersons$diff, horizontal=T, outline=T, frame=F, col="green1", width=10, ylim=c(-17,19), axes=F)
qqnorm(repairpersons$diff)
qqline(repairpersons$diff)
See Note 4 for processing hairy strings
golf_string <- "f 75 f 76 f 80 f 77 f 80 f 77 f 73 m 82 m 80 m 85 m 85 m 78 m 87 m 82"
golf <- read.table(text=golf_string, header=FALSE, col.names = c("gender", "score"))
#The base fn wilcox.test cannot handle ties, so another package is necessary.
library(exactRankTests)
## Warning: package 'exactRankTests' was built under R version 3.5.3
## Package 'exactRankTests' is no longer under development.
## Please consider using package 'coin' instead.
#H0: medians/distributions are the same
#Ha: medians/distributions are different
wilcox.exact(score ~ gender, data=golf)
##
## Exact Wilcoxon rank sum test
##
## data: score by gender
## W = 3, p-value = 0.003497
## alternative hypothesis: true mu is not equal to 0
#Dirty data step
Location_string <- "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15"
Before_string <- "10.02 10.16 9.96 10.01 9.87 10.05 10.07 10.08 10.05 10.04 10.09 10.09 9.92 10.05 10.13"
After_string <- "10.21 10.16 10.11 10.10 10.07 10.13 10.08 10.30 10.17 10.10 10.06 10.37 10.24 10.19 10.13"
location <- read.table(text=Location_string, header=FALSE, col.names = c("location"))
before <- read.table(text=Before_string, header=FALSE, col.names = c("before"))
after <- read.table(text=After_string, header=FALSE, col.names = c("after"))
mining <- as.data.frame(c(location, before, after))
rm(Location_string, Before_string, After_string, location, before, after)
wilcox.exact(mining$after, mining$before, paired=T)
##
## Exact Wilcoxon signed rank test
##
## data: mining$after and mining$before
## V = 89, p-value = 0.0007324
## alternative hypothesis: true mu is not equal to 0
#The value V = 89 corresponds to the sum of ranks assigned to the differences with positive sign. See Note 5
See Note 6 for references
oil_string <- "
CNVNTNL 44
CNVNTNL 49
CNVNTNL 37
CNVNTNL 38
CNVNTNL 54
CNVNTNL 29
CNVNTNL 39
CNVNTNL 65
SYNTHET 42
SYNTHET 59
SYNTHET 52
SYNTHET 57
SYNTHET 49
SYNTHET 45
SYNTHET 44
HYBRID 60
HYBRID 71
HYBRID 78
HYBRID 35
HYBRID 18
HYBRID 48
HYBRID 36
HYBRID 68
"
oil <- read.table(text=oil_string, header=F, col.names = c("type", "visc"))
Using car package
library(car) #easier to calculate type 3 SS using this package than base R
Anova(lm(visc ~ type, data=oil), type=3)
## Anova Table (Type III tests)
##
## Response: visc
## Sum Sq Df F value Pr(>F)
## (Intercept) 54140 1 255.8093 7.329e-13 ***
## type 231 2 0.5454 0.588
## Residuals 4233 20
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#HOV Levene test. The result for the ANOVA is the HOV test. See Note 6
oil.lm <- lm(visc ~ type, data = oil)
Anova(lm(residuals(oil.lm)^2 ~ oil$type))
## Anova Table (Type II tests)
##
## Response: residuals(oil.lm)^2
## Sum Sq Df F value Pr(>F)
## oil$type 512653 2 4.6628 0.02177 *
## Residuals 1099463 20
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
kruskal.test(visc ~ type, data = oil)
##
## Kruskal-Wallis rank sum test
##
## data: visc by type
## Kruskal-Wallis chi-squared = 1.2054, df = 2, p-value = 0.5473
de_string <- "
1 exposed Positive
2 exposed Positive
3 exposed Positive
4 exposed Positive
5 exposed negative
6 notExposed negative
7 exposed Positive
8 exposed Positive
9 exposed Positive
10 notExposed negative
11 exposed negative
12 exposed Positive
13 notExposed Positive
14 notExposed negative
15 exposed Positive
16 exposed Positive
17 exposed negative
18 notExposed negative
19 exposed Positive
20 exposed Positive
21 exposed Positive
22 notExposed negative
23 exposed negative
24 exposed Positive
25 notExposed negative
26 exposed negative
27 notExposed negative
28 exposed Positive
"
de <- read.table(text=de_string, header=F, col.names=c("subjectID", "exposure", "disease"))
#Chi-squared approximation may be incorrect because sample size is so small.
tbl <- table(de$exposure, de$disease)
#Chi Sq Test for Independence
chisq.test(tbl, correct=F) #Default of correct=T yields Continuity Adj Chi Sq
## Warning in chisq.test(tbl, correct = F): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: tbl
## X-squared = 9.1146, df = 1, p-value = 0.002536
#Contingency Table
library(gmodels)
CrossTable(de$exposure, de$disease)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 28
##
##
## | de$disease
## de$exposure | negative | Positive | Row Total |
## -------------|-----------|-----------|-----------|
## exposed | 5 | 15 | 20 |
## | 1.488 | 1.116 | |
## | 0.250 | 0.750 | 0.714 |
## | 0.417 | 0.938 | |
## | 0.179 | 0.536 | |
## -------------|-----------|-----------|-----------|
## notExposed | 7 | 1 | 8 |
## | 3.720 | 2.790 | |
## | 0.875 | 0.125 | 0.286 |
## | 0.583 | 0.062 | |
## | 0.250 | 0.036 | |
## -------------|-----------|-----------|-----------|
## Column Total | 12 | 16 | 28 |
## | 0.429 | 0.571 | |
## -------------|-----------|-----------|-----------|
##
##
#Fisher Exact Test
fisher.test(de$exposure, de$disease)
##
## Fisher's Exact Test for Count Data
##
## data: de$exposure and de$disease
## p-value = 0.004236
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.0009998694 0.5759906625
## sample estimates:
## odds ratio
## 0.05433825
gator_string <- "
1.24 I 1.30 I 1.30 I 1.32 F 1.32 F 1.40 F 1.42 I 1.42 F
1.45 I 1.45 O 1.47 I 1.47 F 1.50 I 1.52 I 1.55 I 1.60 I
1.63 I 1.65 O 1.65 I 1.65 F 1.65 F 1.68 F 1.70 I 1.73 O
1.78 I 1.78 I 1.78 O 1.80 I 1.80 F 1.85 F 1.88 I 1.93 I
1.98 I 2.03 F 2.03 O 2.16 F 2.26 F 2.31 F 2.31 F 2.36 F
3.36 F 3.39 F 3.41 F 2.44 O 2.46 F 2.56 O 2.67 O 2.72 I
3.79 F 2.84 F 3.25 O 3.28 O 3.33 F 3.56 F 3.58 F 3.66 F
3.68 O 3.71 F 3.89 F
"
#replace newlines with spaces, otherwise read.table thinks there are 16 columns.
gator <- read.table(text=gsub("\n", " ", gator_string, perl=TRUE), header=F, col.names=c("length", "type"))
#data must be summarized for chisq test
gator_count <- gator %>%
group_by(type) %>%
summarise(count = n())
#chisq.test(x = observed, p = expected)
#F I O
chisq.test(x=gator_count$count, p=c(1/2, 3/10, 1/5))
##
## Chi-squared test for given probabilities
##
## data: gator_count$count
## X-squared = 0.42938, df = 2, p-value = 0.8068
See Note 10 for reference
student_scores_string <- "
1 -3.4 -0.2 7.7 12
2 -2.8 5.2 5.5 4.1
3 2.2 6.6 -0.8 5.9
4 -0.8 5.2 7.4 13.5
5 2.8 -0.6 0.1 7.5
6 -5.9 5.4 11.7 9.3
7 7.8 3.1 1.2 7.1
8 -3.5 6.5 3.8 -0.9
9 2.9 2.4 5.1 8.3
10 1.9 6.2 4.3 9.8
11 -0.2 7.9 3.9 11.1
12 1.5 7.9 6.9 4.9
13 0.4 6.6 2.8 5.8
14 -0.5 0.2 5.4 2.8
15 1.1 1.9 2.5 12
16 5.3 1.3 5.2 8.6
17 -4 1.8 3.1 2
18 -1.3 3.1 6.6 5.9
19 2.6 1.4 0.2 5.6
20 -0.9 2.1 7.1 11.6
21 -0.6 6.6 9.2 7.8
22 -5 7 3 7.2
23 2.4 -0.7 2.3 8.3
24 -0.1 4.1 10.2 6.5
25 -4.7 3.8 4.7 8.3
"
student_scores <- read.table(text=gsub("\n", " ", student_scores_string, perl=TRUE), header=F, col.names=c("student", "control", "piano", "computer", "instructor"))
scores_long <- student_scores %>%
gather(key="method", value="score", control, piano, computer, instructor) %>%
arrange(method)
#The order of the contrast weights maps to the order of the levels. Here, I set the order to be alphabetical.
levels(scores_long$method) <- c("computer", "control", "instructor", "piano")
#Calculate inverse matrix of contrasts
c1 <- c(1, 0, 0, -1) #computer vs piano
c2 <- c(0, 1, -1, 0) #control vs instructor
c3 <- c(-1, 3, -1,-1) #control vs all
mat_temp <- rbind(constant=1/4, c1, c2, c3)
mat <- solve(mat_temp)
mat <- mat[, -1]
#Calculate contrasts.
model <- lm(score ~ method, data=scores_long, contrasts=list(method = mat))
summary(model)
##
## Call:
## lm(formula = score ~ method, data = scores_long, contrasts = list(method = mat))
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.300 -1.971 -0.028 2.352 7.912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9610 0.3150 12.576 < 2e-16 ***
## methodc1 0.9720 0.8908 1.091 0.278
## methodc2 -7.5120 0.8908 -8.433 3.42e-13 ***
## methodc3 -16.2920 2.1821 -7.466 3.76e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.15 on 96 degrees of freedom
## Multiple R-squared: 0.433, Adjusted R-squared: 0.4153
## F-statistic: 24.44 on 3 and 96 DF, p-value: 7.807e-12
Different package used, see Note 11
library(TukeyC)
## Warning: package 'TukeyC' was built under R version 3.5.3
## Loading required package: doBy
## Warning: package 'doBy' was built under R version 3.5.3
#Tukey won't take lm or Anova, only aov.
fit <- aov(score ~ method, data=scores_long)
#This package matches SAS output. TukeyC(x, which)
tukes <- TukeyC(fit, "method")
tukes
## Results
## Means G1 G2 G3
## instructor 7.40 a
## computer 4.76 b
## piano 3.79 b
## control -0.11 c
##
## Sig.level
## 0.05
##
## Diff_Prob
## instructor computer piano control
## instructor 0.000 2.636 3.608 7.512
## computer 0.020 0.000 0.972 4.876
## piano 0.001 0.696 0.000 3.904
## control 0.000 0.000 0.000 0.000
##
## MSD
## instructor computer piano control
## instructor 0.000 2.329 2.329 2.329
## computer 2.329 0.000 2.329 2.329
## piano 2.329 2.329 0.000 2.329
## control 2.329 2.329 2.329 0.000
plot(tukes)
See Note 12
soil_string <- "
5.2 100 4 5.9 100 4 6.3 100 4 7.1 100 5
7.4 100 5 7.5 100 5 7.6 100 6 7.2 100 6
7.4 100 6 7.2 100 7 7.5 100 7 7.2 100 7
7.4 200 4 7.0 200 4 7.6 200 4 7.4 200 5
7.3 200 5 7.1 200 5 7.6 200 6 7.5 200 6
7.8 200 6 7.4 200 7 7.0 200 7 6.9 200 7
6.3 300 4 6.7 300 4 6.1 300 4 7.3 300 5
7.5 300 5 7.2 300 5 7.2 300 6 7.3 300 6
7.0 300 6 6.8 300 7 6.6 300 7 6.4 300 7
"
soil <- read.table(text=gsub("\n", " ", soil_string, perl=TRUE), header=F, col.names=c("diameter", "calcium", "ph"))
soil$calcium <- as.factor(soil$calcium)
soil$ph <- as.factor(soil$ph)
Using car package
#Two way Anova
Anova(lm(diameter ~ calcium * ph, data=soil), type=3)
## Anova Table (Type III tests)
##
## Response: diameter
## Sum Sq Df F value Pr(>F)
## (Intercept) 1790.70 1 26420.1680 < 2.2e-16 ***
## calcium 1.47 2 10.8238 0.0004462 ***
## ph 4.46 3 21.9385 4.635e-07 ***
## calcium:ph 3.26 6 8.0041 8.186e-05 ***
## Residuals 1.63 24
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Output residuals and test them for normality
resids <- resid(lm(diameter ~ calcium * ph, data=soil))
shapiro.test(resids)
##
## Shapiro-Wilk normality test
##
## data: resids
## W = 0.98385, p-value = 0.8659
layout(mat = matrix(c(1,2), 2, 1), height = c(3,1))
par(mar=c(0, 3, 2, 1))
hist(resids, xlim=c(-1,1), col="pink", main="Distribution of Residuals")
boxplot(resids, horizontal=T, outline=T, frame=F, col="green1", width=10, ylim=c(-1,1), axes=F)
qqnorm(resids)
qqline(resids)
Another example from HW4 #14-8
#Another example from HW4 #14-8
attention <- read.csv("data/CH14/ex14-8.txt")
attention_stacked <- attention %>%
gather(key="key", value="span", X.A1.P1., X.A2.P1., X.A3.P1., X.A1.P2., X.A2.P2., X.A3.P2.)
attention_stacked$key <- gsub('^..|.$', '', attention_stacked$key)
attention_stacked <- separate(attention_stacked, key, c("age", "product"))
#Two way anova
Anova(lm(span ~ age * product, data=attention_stacked), type=3)
## Anova Table (Type III tests)
##
## Response: span
## Sum Sq Df F value Pr(>F)
## (Intercept) 44608 1 303.2284 < 2.2e-16 ***
## age 1303 2 4.4287 0.0165575 *
## product 2018 1 13.7202 0.0005001 ***
## age:product 1384 2 4.7049 0.0130735 *
## Residuals 7944 54
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
music <- read.csv("data/CH15/ex15-6.txt")
colnames(music) <- c("subject", "type_music", "typing_efficiency")
music$subject <- as.factor(music$subject)
Anova(lm(typing_efficiency ~ type_music + subject, data=music), type=3)
## Anova Table (Type III tests)
##
## Response: typing_efficiency
## Sum Sq Df F value Pr(>F)
## (Intercept) 9557.3 1 4041.0201 < 2.2e-16 ***
## type_music 31.0 2 6.5436 0.0119776 *
## subject 149.3 6 10.5235 0.0003462 ***
## Residuals 28.4 12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lights <- read.csv("data/CH15/ex15-40.txt")
colnames(lights) <- c("intersection", "minutes", "signaling", "period")
lights$intersection <- as.factor(lights$intersection)
lights$period <- as.factor(lights$period)
Anova(lm(minutes ~ signaling + intersection + period, data=lights), type=3)
## Anova Table (Type III tests)
##
## Response: minutes
## Sum Sq Df F value Pr(>F)
## (Intercept) 13372.6 1 2289.3339 4.545e-15 ***
## signaling 76.3 4 3.2648 0.04976 *
## intersection 18.2 4 0.7801 0.55927
## period 1091.7 4 46.7222 3.202e-07 ***
## Residuals 70.1 12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Bonus tukey
fit <- aov(minutes ~ signaling + intersection + period, data=lights)
tukes <- TukeyC(fit, "signaling")
tukes
## Results
## Means G1 G2
## 'B' 24.88 a
## 'E' 24.14 a b
## 'A' 24.12 a b
## 'D' 22.50 a b
## 'C' 20.00 b
##
## Sig.level
## 0.05
##
## Diff_Prob
## 'B' 'E' 'A' 'D' 'C'
## 'B' 0.000 0.740 0.760 2.380 4.88
## 'E' 0.987 0.000 0.020 1.640 4.14
## 'A' 0.986 1.000 0.000 1.620 4.12
## 'D' 0.549 0.817 0.823 0.000 2.50
## 'C' 0.050 0.111 0.113 0.504 0.00
##
## MSD
## 'B' 'E' 'A' 'D' 'C'
## 'B' 0.000 4.872 4.872 4.872 4.872
## 'E' 4.872 0.000 4.872 4.872 4.872
## 'A' 4.872 4.872 0.000 4.872 4.872
## 'D' 4.872 4.872 4.872 0.000 4.872
## 'C' 4.872 4.872 4.872 4.872 0.000
plot(tukes)
oysters_string <- "
1 1 27.2 32.6
1 2 32.0 36.6
1 3 33.0 37.7
1 4 26.8 31.0
2 1 28.6 33.8
2 2 26.8 31.7
2 3 26.5 30.7
2 4 26.8 30.4
3 1 28.6 35.2
3 2 22.4 29.1
3 3 23.2 28.9
3 4 24.4 30.2
4 1 29.3 35.0
4 2 21.8 27.0
4 3 30.3 36.4
4 4 24.3 30.5
5 1 20.4 24.6
5 2 19.6 23.4
5 3 25.1 30.3
5 4 18.1 21.8
"
oysters <- read.table(text=gsub("\n", " ", oysters_string, perl=TRUE), header=F, col.names=c("treatment", "replication", "initial_weight", "final_weight"))
oysters$treatment <- as.factor(oysters$treatment)
oysters$replication <- as.factor(oysters$replication)
#ANCOVA
Anova(lm(final_weight ~ initial_weight + treatment, data=oysters), type=3)
## Anova Table (Type III tests)
##
## Response: final_weight
## Sum Sq Df F value Pr(>F)
## (Intercept) 1.718 1 5.6959 0.0316708 *
## initial_weight 156.040 1 517.3840 1.867e-12 ***
## treatment 12.089 4 10.0212 0.0004819 ***
## Residuals 4.222 14
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Test for heteroskedasticity (interaction term)
Anova(lm(final_weight ~ initial_weight * treatment, data=oysters), type=3)
## Anova Table (Type III tests)
##
## Response: final_weight
## Sum Sq Df F value Pr(>F)
## (Intercept) 0.064 1 0.2275 0.6436
## initial_weight 68.529 1 241.8091 2.472e-08 ***
## treatment 1.696 4 1.4963 0.2752
## initial_weight:treatment 1.388 4 1.2247 0.3602
## Residuals 2.834 10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
See Note 16 for all the testing
pesticide <- read.csv("data/CH17/ex17-10.txt")
colnames(pesticide) <- c("location", "chemical", "number_killed")
pesticide$location <- as.factor(pesticide$location)
pesticide$chemical <- as.factor(pesticide$chemical)
#Make F table manually
fit <- lm(number_killed ~ location * chemical, data=pesticide)
results <- Anova(fit, type=3) #this is the anova table. The following replaces F tests with MS_main/MS_interaction
Df <- results$Df
SumSq <- results$"Sum Sq"
MeanSq <- SumSq/results$Df
Fvalue <- results$"F value"
Pvalue <- results$"Pr(>F)"
Error.Term <- MeanSq[4]
df.error <- Df[4]
Fvalue[2] <- MeanSq[2]/Error.Term
Pvalue[2] <- 1 - pf(Fvalue[2], Df[2], df.error)
Fvalue[3] <- MeanSq[3]/Error.Term
Pvalue[3] <- 1 - pf(Fvalue[3], Df[3], df.error)
Ftable <- cbind(Df, SumSq, MeanSq, Fvalue, Pvalue)
rownames(Ftable) <- c("Intercept", "Locations", "Chemicals", "Locations:Chemicals", "Residuals")
print(Ftable)
## Df SumSq MeanSq Fvalue Pvalue
## Intercept 1 1686.1022 1686.102250 4869.6093863 2.314014e-25
## Locations 4 3.8115 0.952875 0.7076461 6.020037e-01
## Chemicals 3 180.1327 60.044250 44.5914534 8.797523e-07
## Locations:Chemicals 12 16.1585 1.346542 3.8889290 3.652306e-03
## Residuals 20 6.9250 0.346250 NA NA
Varcomp, see Note 16 for reference.
library(VCA)
## Warning: package 'VCA' was built under R version 3.5.3
#Look at VC column in fit$aov.tab
fit <- remlMM(number_killed~(location)*chemical, pesticide, cov=T)
print(fit)
##
##
## REML-Estimation of Mixed Model:
## -------------------------------
##
## [Fixed Effects]
##
## int chemical1 chemical2 chemical3 chemical4
## 5.71 2.85 -2.34 2.62 0.00
##
##
## [Variance Components]
##
## Name DF VC %Total SD CV[%]
## 1 total 24.594337 0.797191 100 0.892856 13.752109
## 2 location 0 0 2.1e-05 0.00041 0.006316
## 3 location:chemical 5.989053 0.450943 56.566506 0.671523 10.343059
## 4 error 20 0.346248 43.433473 0.588428 9.063201
## Var(VC)
## 1 0.05168
## 2 0.016228
## 3 0.067907
## 4 0.011989
##
## Mean: 6.4925 (N = 40)
##
## Experimental Design: balanced | Method: REML
drug_string <- "
5.03
5.10
5.25
4.98
5.05
4.64
4.73
4.82
4.95
5.06
5.10
5.15
5.20
5.08
5.14
5.05
4.96
5.12
5.12
5.05
5.46
5.15
5.18
5.18
5.11
4.90
4.95
4.86
4.86
5.07
"
drug_y <- read.table(text=gsub("\n", " ", drug_string, perl=TRUE), header=F, col.names = c('y'))
counter_y = 0
site <- c()
batch <- c()
tablet <- c()
for (i in 1:2){ #site
for (j in 1:3){ #batch
for (k in 1:5){ #tablet
counter_y <- counter_y + 1
site[counter_y] <- i
batch[counter_y] <- j
tablet[counter_y] <- k
}
}
}
drug <- data.frame(site, batch, tablet, drug_y)
drug$site <- as.factor(drug$site)
drug$batch <- as.factor(drug$batch)
drug$tablet <- as.factor(drug$tablet)
rm(site, batch, tablet, drug_y)
#Remembering to convert the categorical variables to factors is important.
#Specifying nested term: site/batch == batch(site) in SAS
fit <- lm(y ~ site + site/batch, data=drug)
results <- Anova(fit, type=3)
Df <- results$Df
SumSq <- results$"Sum Sq"
MeanSq <- SumSq/results$Df
Fvalue <- results$"F value"
Pvalue <- results$"Pr(>F)"
Error.Term <- MeanSq[3]
df.error <- Df[3]
Fvalue[2] <- MeanSq[2]/Error.Term
Pvalue[2] <- 1 - pf(Fvalue[2], Df[2], df.error)
Ftable <- cbind(Df, SumSq, MeanSq, Fvalue, Pvalue)
rownames(Ftable) <- c("Intercept", "Site", "Batch(Site)", "Residuals")
print(Ftable)
## Df SumSq MeanSq Fvalue Pvalue
## Intercept 1 763.05633333 763.05633333 6.310597e+04 1.469251e-42
## Site 1 0.01825333 0.01825333 1.608176e-01 7.089034e-01
## Batch(Site) 4 0.45401333 0.11350333 9.386906e+00 1.028393e-04
## Residuals 24 0.29020000 0.01209167 NA NA
Week 8 lab, #2
y_string <- "
2.2 2.0 2.1 2.3 2.5 2.7 2.8 2.5
2.4 2.4 2.2 1.8 2.8 2.5 2.5 2.3
3.0 2.4 2.0 1.9 2.7 2.0 2.4 2.6
2.7 2.3 1.9 2.5 2.7 2.0 2.4 2.6
2.5 2.2 1.8 2.6 2.7 2.6 2.4 2.7
2.1 2.2 1.7 2.3 2.5 2.4 2.4 2.5
"
y_vector <- read.table(text=gsub("\n", " ", y_string, perl=TRUE), header=F, col.names = c('y'))
counter_y = 0
device <- c()
rep <- c()
facility <- c()
tester <- c()
for (i in 1:3){ #device
for (j in 1:2){ #rep
for (k in 1:2){ #facility
for (l in 1:4){ #tester
counter_y <- counter_y + 1
device[counter_y] <- i
rep[counter_y] <- j
facility[counter_y] <- k
tester[counter_y] <- l
}
}
}
}
engi <- data.frame(device, rep, facility, tester, y_vector)
engi$device <- as.factor(engi$device)
engi$rep <- as.factor(engi$rep)
engi$facility <- as.factor(engi$facility)
engi$tester <- as.factor(engi$tester)
rm(device, rep, facility, tester, y_vector)
fit <- lm(y ~ facility + device + facility:device + facility/tester, data=engi)
results <- Anova(fit, type=3)
Df <- results$Df
SumSq <- results$"Sum Sq"
MeanSq <- SumSq/results$Df
Fvalue <- results$"F value"
Pvalue <- results$"Pr(>F)"
Error.Term <- MeanSq[5]
df.error <- Df[5]
Fvalue[2] <- MeanSq[2]/Error.Term
Pvalue[2] <- 1 - pf(Fvalue[2], Df[2], df.error)
Ftable <- cbind(Df, SumSq, MeanSq, Fvalue, Pvalue)
rownames(Ftable) <- c("Intercept", "Facility", "Device", "Facility*Device", "Tester(Facility)", "Residuals")
print(Ftable)
## Df SumSq MeanSq Fvalue Pvalue
## Intercept 1 269.3268750 269.3268750 5.191843e+03 1.606325e-40
## Facility 1 0.9352083 0.9352083 5.072316e+00 6.524540e-02
## Device 2 0.0087500 0.0043750 8.433735e-02 9.193022e-01
## Facility*Device 2 0.2254167 0.1127083 2.172691e+00 1.285749e-01
## Tester(Facility) 6 1.1062500 0.1843750 3.554217e+00 7.237563e-03
## Residuals 36 1.8675000 0.0518750 NA NA
y_string <- "
1.5 2.2 3.4
2.0 2.6 3.1
1.6 2.7 3.2
1.1 2.3 2.9
2.5 3.5 1.9
2.8 3.1 1.5
2.7 2.9 2.4
2.4 2.6 2.3
3.3 1.9 2.7
3.1 1.6 2.5
3.6 2.3 2.2
3.0 2.5 2.0
"
y_vector <- read.table(text=gsub("\n", " ", y_string, perl=TRUE), header=F, col.names = c('y'))
counter_y = 0
sequence <- c()
patient <- c()
period <- c()
for (i in 1:3){ #sequence
for (j in 1:4){ #patient
for (k in 1:3){ #period
counter_y <- counter_y + 1
sequence[counter_y] <- i
patient[counter_y] <- j
period[counter_y] <- k
}
}
}
bp <- data.frame(sequence, patient, period, y_vector)
bp$sequence<- as.factor(bp$sequence)
bp$patient<- as.factor(bp$patient)
bp$period<- as.factor(bp$period)
rm(sequence, patient, period, y_vector)
trt_str <- data.frame(sequence=as.factor(c(1,2,3)),
period_1=c("T1", "T2", "T3"),
period_2=c("T2", "T3", "T1"),
period_3=c("T3", "T1", "T2")
)
trt_str <- gather(trt_str, key="period", value="treatment", period_1, period_2, period_3)
trt_str$period <- as.factor(gsub("period_", "", trt_str$period, perl=TRUE))
bp <- inner_join(bp, trt_str, c("sequence", "period"))
bp$treatment<- as.factor(bp$treatment)
#Because of perfect multicolinearity, singular.ok = T, see Note 18
fit <- lm(y ~ sequence + treatment + period + sequence/patient + treatment:period, data=bp)
results <- Anova(fit, type=3, singular.ok=T)
## Note: model has aliased coefficients
## sums of squares computed by model comparison
print(results)
## Anova Table (Type III tests)
##
## Response: y
## Sum Sq Df F values Pr(>F)
## sequence 0.0000 0
## treatment 9.5172 2 41.5129 1.81e-07 ***
## period 0.0172 2 0.0751 0.92792
## sequence:patient 0.6692 9 0.6486 0.74252
## treatment:period 0.6489 2 2.8304 0.08535 .
## Residuals 2.0633 18
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Since interaction is insignificant, rerun model without it.
fit <- lm(y ~ sequence + treatment + period + sequence/patient, data=bp)
results <- Anova(fit, type=3)
Df <- results$Df
SumSq <- results$"Sum Sq"
MeanSq <- SumSq/results$Df
Fvalue <- results$"F value"
Pvalue <- results$"Pr(>F)"
Error.Term <- MeanSq[5]
df.error <- Df[5]
Fvalue[2] <- MeanSq[2]/Error.Term
Pvalue[2] <- 1 - pf(Fvalue[2], Df[2], df.error)
Ftable <- cbind(Df, SumSq, MeanSq, Fvalue, Pvalue)
rownames(Ftable) <- c("Intercept", "Sequence", "Treatment", "Period", "Patient(Sequence)", "Residuals")
print(Ftable)
## Df SumSq MeanSq Fvalue Pvalue
## Intercept 1 224.50027778 2.245003e+02 1.655471e+03 1.040635e-20
## Sequence 2 0.23388889 1.169444e-01 1.572852e+00 2.595310e-01
## Treatment 2 9.51722222 4.758611e+00 3.509013e+01 2.878627e-07
## Period 2 0.01722222 8.611111e-03 6.349857e-02 9.386639e-01
## Patient(Sequence) 9 0.66916667 7.435185e-02 5.482726e-01 8.221441e-01
## Residuals 20 2.71222222 1.356111e-01 NA NA
Reference for changing options: https://www.r-bloggers.com/ensuring-r-generates-the-same-anova-f-values-as-spss/)
#Use gsub for messier strings. Here it's unnecessary.
#golf <- read.table(text=gsub("(?<=[a-z])\\s+", " ", golf_text, perl=TRUE), header=FALSE, col.names = c("gender", "score"))
#Calculation of V:
diff <- mining$after - mining$before
diff <- diff[diff!=0] #delete all differences = 0, as per Signed Rank procedure
diff.rank <- rank(abs(diff))
diff.rank.sign <- diff.rank * sign(diff[])
ranks.pos <- sum(diff.rank.sign[diff.rank.sign > 0])
ranks.neg <- -sum(diff.rank.sign[diff.rank.sign < 0])
Referenced used for ANOVA: https://www.r-bloggers.com/anova-%E2%80%93-type-iiiiii-ss-explained/
#SAS calculates Leven using Squared deviations. R and it's packages seem to do Absolute instead.
#The following replicates what SAS does.
#https://stackoverflow.com/questions/22396543/levenes-test-using-the-squared-residuals
oil.lm <- lm(visc ~ type, data = oil)
anova(lm(residuals(oil.lm)^2 ~ oil$type))
#HOV Levene using absolute deviations
leveneTest(visc ~ type, data=oil)
Using ezANOVA package
library(ez)
#ezANOVA requires an ID column, wid. This is so that it can handle repeated measures.
oil$id <- as.factor(c(1:length(oil$type)))
av <- ezANOVA(data=oil, dv=visc, wid=id ,between=type, detail=T, type=3, white.adjust=T)
print(av)
ANT example
data(ANT)
b_anova_full <- ezANOVA(data=ANT,
dv=rt,
wid=subnum,
between=group,
within_full=.(cue, flank),
type=3,
detailed=T,
return_aov=T)
print(b_anova_full)
Reference, see DIY Contrasts: https://rstudio-pubs-static.s3.amazonaws.com/65059_586f394d8eb84f84b1baaf56ffb6b47f.html
#Base R uses TukeyHSD, which yields less elegant output.
tukes <- TukeyHSD(fit, "method")
tukes
plot(tukes)
Using ezANOVA package
soil$id <- as.factor(c(1:length(soil$diameter))) #ezANOVA requires an id column
soil_anova <- ezANOVA(data=soil, dv=diameter, wid=id, between=.(calcium, ph), type=3, detailed=T, return_aov=T)
resids <- resid(soil_anova$aov) #bc we used return_aov=T, we can call the base aov() and output the residuals
shapiro.test(resids)
layout(mat = matrix(c(1,2), 2, 1), height = c(3,1))
par(mar=c(0, 3, 2, 1))
hist(resids, xlim=c(-1,1), col="pink", main="Distribution of Residuals")
boxplot(resids, horizontal=T, outline=T, frame=F, col="green1", width=10, ylim=c(-1,1), axes=F)
qqnorm(resids)
qqline(resids)
reference: https://mcfromnz.wordpress.com/2011/03/02/anova-type-iiiiii-ss-explained/ Varcomp reference: https://stats.stackexchange.com/questions/161225/estimates-of-the-variance-of-the-variance-component-of-a-mixed-effects-model Testing on paint
paint <- read.csv("data/CH17/ex17-1.txt")
colnames(paint) <- c("batch", "percentage")
Testing on pesticide
#Both of these attempts have chemical F = 48.11, but answer should be 173.41 if divided by ms_error, or 44.49 if divided by ms_interaction
#attempt 1 with interaction, wrong
library(afex)
mixed(number_killed ~ chemical + (1|location) + (1|location:chemical), data=pesticide)
#with no interaction, CORRECT, but that means interaction is wrong...
model2 <- lmer_alt(number_killed ~ chemical + (1|location), data=pesticide)
anova(model2)
#attempt 2 with interaction, wrong
model3 <- lmer_alt(number_killed ~ chemical + (1|location) + (1|chemical:location), data=pesticide)
summary(model3)
Working solution (but answer is type 2)
#Make F table manually
fit <- lm(number_killed ~ location * chemical, data=pesticide)
results <- Anova(fit) #this is the anova table. The following replaces F tests with MS_main/MS_interaction
Df <- results$Df
SumSq <- results$"Sum Sq"
MeanSq <- SumSq/results$Df
Fvalue <- results$"F value"
Pvalue <- results$"Pr(>F)"
Error.Term <- MeanSq[3]
df.error <- Df[3]
Fvalue[1] <- MeanSq[1]/Error.Term
Pvalue[1] <- 1 - pf(Fvalue[1], Df[1], df.error)
Fvalue[2] <- MeanSq[2]/Error.Term
Pvalue[2] <- 1 - pf(Fvalue[2], Df[2], df.error)
Ftable <- cbind(Df, SumSq, MeanSq, Fvalue, Pvalue)
rownames(Ftable) <- c("Locations", "Chemicals", "Locations:Chemicals", "Residuals")
print(Ftable)
Misc things
library(lmerTest)
coef(fit)
#other attempts
model <- lmerTest::lmer(number_killed ~ chemical + (1|location) + (1|location:chemical), data=pesticide)
rand(model)
linearHypothesis(model, "chemical=0") #doesnt work
For carryover, unused and unfinished
carryover <- trt_str
carryover$period <- as.factor(as.numeric(trt_str$period) + 1)
Regarding multicolinearity: https://stats.stackexchange.com/questions/112442/what-are-aliased-coefficients