First, please run this chunk. Then proceed as needed.

#tidyverse for data reshaping, car for Anova().
library(tidyverse); library(car)
## Warning: package 'car' was built under R version 3.5.3
#The following is required to calculate Type III SS when running ANOVA.
#See Note 0
#options("contrasts")
options(contrasts=c('contr.sum','contr.poly'))

1. One sample T test, Week 5 lab

paper <- c(1660,1820,1590,1440,1730,1680,1750,1720,1900,1570,1700,1900,1800,1770,2010,1580,1620,1690,1400,1590,1750,1900,1800,2010,1580,1690,1850,1720,1900,1590)
#H0: mean <= 1600
#Ha: mean > 1600

t.test(paper, mu=1600, alternative="greater")
## 
##  One Sample t-test
## 
## data:  paper
## t = 4.5171, df = 29, p-value = 4.843e-05
## alternative hypothesis: true mean is greater than 1600
## 95 percent confidence interval:
##  1677.149      Inf
## sample estimates:
## mean of x 
##  1723.667

2. Two sample T test, HW5 #6.6

#The dataset provided by the books website and the data printed in the actual book are different.
#river <- read.csv("data/CH06/ex6-6.txt")

upstream <- c(5.2,4.8,5.1,5.0,4.9,4.8,5.0,4.7,4.7,5.0,4.6,5.2,5.0,4.9,4.7)
downstream <- c(3.2,3.4,3.7,3.9,3.6,3.8,3.9,3.6,4.1,3.3,4.5,3.7,3.9,3.8,3.7)
#Part A
#Equal variances will yield Pooled variance T test, whereas unequal will yield Satterthwaite.

#H0: dowmstream - upstream >= -.5
#Ha: downstream - upstream < -.5

t.test(downstream, upstream, 
       alternative="less", mu=-.5, conf.level=.01,
       var.equal=T)
## 
##  Two Sample t-test
## 
## data:  downstream and upstream
## t = -6.9625, df = 28, p-value = 7.166e-08
## alternative hypothesis: true difference in means is less than -0.5
## 1 percent confidence interval:
##       -Inf -1.402898
## sample estimates:
## mean of x mean of y 
##  3.740000  4.906667
#Part B
shapiro.test(downstream)
## 
##  Shapiro-Wilk normality test
## 
## data:  downstream
## W = 0.95141, p-value = 0.5469
shapiro.test(upstream)
## 
##  Shapiro-Wilk normality test
## 
## data:  upstream
## W = 0.94347, p-value = 0.428
layout(mat = matrix(c(1,2), 2, 1),  height = c(3,1)) #matrix(data, nrow, ncol)
par(mar=c(0, 3, 2, 1)) #margins for each plot
hist(downstream, xlim=c(3,5), col="pink", main="Distribution of Downstream")
boxplot(downstream, horizontal=T, outline=T, frame=F, col="green1", width=10, ylim=c(3,5), axes=F)

qqnorm(downstream)
qqline(downstream)

layout(mat = matrix(c(1,2), 2, 1),  height = c(3,1))

par(mar=c(0, 3, 2, 1))
hist(upstream, xlim=c(4,6), col="pink", main="Distribution of Upstream")
boxplot(upstream, horizontal=T, outline=T, frame=F, col="green1", width=10, ylim=c(4,6), axes=F)

qqnorm(upstream)
qqline(upstream)

3. Paired T test, HW5 #6.28

repairpersons <- read.csv("data/CH06/ex6-28.txt")
#Part A
#H0: After - Before >= 0
#Ha: After - Before < 0

t.test(repairpersons$X.After., repairpersons$X.Before., 
       alternative="less", paired=T)
## 
##  Paired t-test
## 
## data:  repairpersons$X.After. and repairpersons$X.Before.
## t = -0.86098, df = 9, p-value = 0.2058
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##      -Inf 2.917599
## sample estimates:
## mean of the differences 
##                  -2.584
#Part B
#95% Confidence interval

t.test(repairpersons$X.After., repairpersons$X.Before., paired=T)
## 
##  Paired t-test
## 
## data:  repairpersons$X.After. and repairpersons$X.Before.
## t = -0.86098, df = 9, p-value = 0.4116
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -9.373261  4.205261
## sample estimates:
## mean of the differences 
##                  -2.584
#Part C
repairpersons <- mutate(repairpersons, diff = X.After. - X.Before.)
shapiro.test(repairpersons$diff)
## 
##  Shapiro-Wilk normality test
## 
## data:  repairpersons$diff
## W = 0.9398, p-value = 0.5508
layout(mat = matrix(c(1,2), 2, 1),  height = c(3,1))
par(mar=c(0, 3, 2, 1))
hist(repairpersons$diff, xlim=c(-17,19), col="pink", main="Distribution of After-Before")
boxplot(repairpersons$diff, horizontal=T, outline=T, frame=F, col="green1", width=10, ylim=c(-17,19), axes=F)

qqnorm(repairpersons$diff)
qqline(repairpersons$diff)

4. Wilcoxon Rank Sum / Mann Whitney U non-parametric test for 2 independent means, Week 7 lab

See Note 4 for processing hairy strings

golf_string <- "f 75 f 76 f 80 f 77 f 80 f 77 f 73 m 82 m 80 m 85 m 85 m 78 m 87 m 82"
golf <- read.table(text=golf_string, header=FALSE, col.names = c("gender", "score"))
#The base fn wilcox.test cannot handle ties, so another package is necessary.
library(exactRankTests)
## Warning: package 'exactRankTests' was built under R version 3.5.3
##  Package 'exactRankTests' is no longer under development.
##  Please consider using package 'coin' instead.
#H0: medians/distributions are the same
#Ha: medians/distributions are different
wilcox.exact(score ~ gender, data=golf)
## 
##  Exact Wilcoxon rank sum test
## 
## data:  score by gender
## W = 3, p-value = 0.003497
## alternative hypothesis: true mu is not equal to 0

5. Wilcoxon Signed Rank np test for paired continuous data, Week 7 lab

#Dirty data step

Location_string <- "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15"
Before_string <- "10.02 10.16 9.96 10.01 9.87 10.05 10.07 10.08 10.05 10.04 10.09 10.09 9.92 10.05 10.13"
After_string <- "10.21 10.16 10.11 10.10 10.07 10.13 10.08 10.30 10.17 10.10 10.06 10.37 10.24 10.19 10.13"

location <- read.table(text=Location_string, header=FALSE, col.names = c("location"))
before <- read.table(text=Before_string, header=FALSE, col.names = c("before"))
after <- read.table(text=After_string, header=FALSE, col.names = c("after"))

mining <- as.data.frame(c(location, before, after))
  
rm(Location_string, Before_string, After_string, location, before, after)
wilcox.exact(mining$after, mining$before, paired=T)
## 
##  Exact Wilcoxon signed rank test
## 
## data:  mining$after and mining$before
## V = 89, p-value = 0.0007324
## alternative hypothesis: true mu is not equal to 0
#The value V = 89 corresponds to the sum of ranks assigned to the differences with positive sign. See Note 5

6. One Way ANOVA, Week 9 lab

See Note 6 for references

oil_string <- "
CNVNTNL      44
CNVNTNL      49
CNVNTNL      37
CNVNTNL      38
CNVNTNL      54
CNVNTNL      29
CNVNTNL      39
CNVNTNL      65
SYNTHET      42
SYNTHET      59
SYNTHET      52
SYNTHET      57
SYNTHET      49
SYNTHET      45
SYNTHET      44
HYBRID       60
HYBRID       71
HYBRID       78
HYBRID       35
HYBRID       18
HYBRID       48
HYBRID       36
HYBRID       68
"

oil <- read.table(text=oil_string, header=F, col.names = c("type", "visc"))

Using car package

library(car) #easier to calculate type 3 SS using this package than base R

Anova(lm(visc ~ type, data=oil), type=3)
## Anova Table (Type III tests)
## 
## Response: visc
##             Sum Sq Df  F value    Pr(>F)    
## (Intercept)  54140  1 255.8093 7.329e-13 ***
## type           231  2   0.5454     0.588    
## Residuals     4233 20                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#HOV Levene test. The result for the ANOVA is the HOV test. See Note 6
oil.lm <- lm(visc ~ type, data = oil)
Anova(lm(residuals(oil.lm)^2 ~ oil$type))
## Anova Table (Type II tests)
## 
## Response: residuals(oil.lm)^2
##            Sum Sq Df F value  Pr(>F)  
## oil$type   512653  2  4.6628 0.02177 *
## Residuals 1099463 20                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

7. Kruskal Wallis np 1 way test, Week 9 lab

kruskal.test(visc ~ type, data = oil) 
## 
##  Kruskal-Wallis rank sum test
## 
## data:  visc by type
## Kruskal-Wallis chi-squared = 1.2054, df = 2, p-value = 0.5473

8. Chi Square test for independence, week 9 lab

de_string <- "
1 exposed Positive
2 exposed Positive
3 exposed Positive
4 exposed Positive
5 exposed negative
6 notExposed negative
7 exposed Positive
8 exposed Positive
9 exposed Positive
10 notExposed negative 
11 exposed negative
12 exposed Positive
13 notExposed Positive
14 notExposed negative 
15 exposed Positive     
16 exposed Positive
17 exposed negative
18 notExposed negative 
19 exposed Positive
20 exposed Positive
21 exposed Positive
22 notExposed negative 
23 exposed negative
24 exposed Positive
25 notExposed negative
26 exposed negative
27 notExposed negative
28 exposed Positive
"

de <- read.table(text=de_string, header=F, col.names=c("subjectID", "exposure", "disease"))
#Chi-squared approximation may be incorrect because sample size is so small.
tbl <- table(de$exposure, de$disease)

#Chi Sq Test for Independence
chisq.test(tbl, correct=F) #Default of correct=T yields Continuity Adj Chi Sq
## Warning in chisq.test(tbl, correct = F): Chi-squared approximation may be
## incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  tbl
## X-squared = 9.1146, df = 1, p-value = 0.002536
#Contingency Table
library(gmodels)
CrossTable(de$exposure, de$disease)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  28 
## 
##  
##              | de$disease 
##  de$exposure |  negative |  Positive | Row Total | 
## -------------|-----------|-----------|-----------|
##      exposed |         5 |        15 |        20 | 
##              |     1.488 |     1.116 |           | 
##              |     0.250 |     0.750 |     0.714 | 
##              |     0.417 |     0.938 |           | 
##              |     0.179 |     0.536 |           | 
## -------------|-----------|-----------|-----------|
##   notExposed |         7 |         1 |         8 | 
##              |     3.720 |     2.790 |           | 
##              |     0.875 |     0.125 |     0.286 | 
##              |     0.583 |     0.062 |           | 
##              |     0.250 |     0.036 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |        12 |        16 |        28 | 
##              |     0.429 |     0.571 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
#Fisher Exact Test
fisher.test(de$exposure, de$disease)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  de$exposure and de$disease
## p-value = 0.004236
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.0009998694 0.5759906625
## sample estimates:
## odds ratio 
## 0.05433825

9. Chi Square goodness of fit test, Week 10 lab

gator_string <- "
1.24 I  1.30 I  1.30 I  1.32 F  1.32 F  1.40 F  1.42 I  1.42 F
1.45 I  1.45 O  1.47 I  1.47 F  1.50 I  1.52 I  1.55 I  1.60 I
1.63 I  1.65 O  1.65 I  1.65 F  1.65 F  1.68 F  1.70 I  1.73 O
1.78 I  1.78 I  1.78 O  1.80 I  1.80 F  1.85 F  1.88 I  1.93 I
1.98 I  2.03 F  2.03 O  2.16 F  2.26 F  2.31 F  2.31 F  2.36 F
3.36 F  3.39 F  3.41 F  2.44 O  2.46 F  2.56 O  2.67 O  2.72 I
3.79 F  2.84 F  3.25 O  3.28 O  3.33 F  3.56 F  3.58 F  3.66 F
3.68 O  3.71 F  3.89 F
"

#replace newlines with spaces, otherwise read.table thinks there are 16 columns.
gator <- read.table(text=gsub("\n", " ", gator_string, perl=TRUE), header=F, col.names=c("length", "type"))

#data must be summarized for chisq test
gator_count <- gator %>% 
  group_by(type) %>% 
  summarise(count = n())
#chisq.test(x = observed, p = expected)
#F I O
chisq.test(x=gator_count$count, p=c(1/2, 3/10, 1/5))
## 
##  Chi-squared test for given probabilities
## 
## data:  gator_count$count
## X-squared = 0.42938, df = 2, p-value = 0.8068

10. Linear Contrasts, Week 3 lab (first lab)

See Note 10 for reference

student_scores_string <- "
1     -3.4  -0.2  7.7   12
2     -2.8  5.2   5.5   4.1
3     2.2   6.6   -0.8  5.9
4     -0.8  5.2   7.4   13.5
5     2.8   -0.6  0.1   7.5
6     -5.9  5.4   11.7  9.3
7     7.8   3.1   1.2   7.1
8     -3.5  6.5   3.8   -0.9
9     2.9   2.4   5.1   8.3
10    1.9   6.2   4.3   9.8
11    -0.2  7.9   3.9   11.1
12    1.5   7.9   6.9   4.9
13    0.4   6.6   2.8   5.8
14    -0.5  0.2   5.4   2.8
15    1.1   1.9   2.5   12
16    5.3   1.3   5.2   8.6
17    -4    1.8   3.1   2
18    -1.3  3.1   6.6   5.9
19    2.6   1.4   0.2   5.6
20    -0.9  2.1   7.1   11.6
21    -0.6  6.6   9.2   7.8
22    -5    7     3     7.2
23    2.4   -0.7  2.3   8.3
24    -0.1  4.1   10.2  6.5
25    -4.7  3.8   4.7   8.3
"

student_scores <- read.table(text=gsub("\n", " ", student_scores_string, perl=TRUE), header=F, col.names=c("student", "control", "piano", "computer", "instructor"))

scores_long <- student_scores %>% 
  gather(key="method", value="score", control, piano, computer, instructor) %>% 
  arrange(method)

#The order of the contrast weights maps to the order of the levels. Here, I set the order to be alphabetical.
levels(scores_long$method) <- c("computer", "control", "instructor", "piano")
#Calculate inverse matrix of contrasts
c1 <- c(1, 0, 0, -1) #computer vs piano
c2 <- c(0, 1, -1, 0) #control vs instructor
c3 <- c(-1, 3, -1,-1) #control vs all

mat_temp <- rbind(constant=1/4, c1, c2, c3)
mat <- solve(mat_temp)

mat <- mat[, -1]

#Calculate contrasts.
model <- lm(score ~ method, data=scores_long, contrasts=list(method = mat))
summary(model)
## 
## Call:
## lm(formula = score ~ method, data = scores_long, contrasts = list(method = mat))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.300 -1.971 -0.028  2.352  7.912 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.9610     0.3150  12.576  < 2e-16 ***
## methodc1      0.9720     0.8908   1.091    0.278    
## methodc2     -7.5120     0.8908  -8.433 3.42e-13 ***
## methodc3    -16.2920     2.1821  -7.466 3.76e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.15 on 96 degrees of freedom
## Multiple R-squared:  0.433,  Adjusted R-squared:  0.4153 
## F-statistic: 24.44 on 3 and 96 DF,  p-value: 7.807e-12

11. Tukey, Week 3 lab (first lab, same as above)

Different package used, see Note 11

library(TukeyC)
## Warning: package 'TukeyC' was built under R version 3.5.3
## Loading required package: doBy
## Warning: package 'doBy' was built under R version 3.5.3
#Tukey won't take lm or Anova, only aov.
fit <- aov(score ~ method, data=scores_long)

#This package matches SAS output. TukeyC(x, which)
tukes <- TukeyC(fit, "method")
tukes
## Results
##            Means G1 G2 G3
## instructor  7.40  a      
## computer    4.76     b   
## piano       3.79     b   
## control    -0.11        c
## 
## Sig.level
##  0.05
## 
## Diff_Prob
##            instructor computer piano control
## instructor      0.000    2.636 3.608   7.512
## computer        0.020    0.000 0.972   4.876
## piano           0.001    0.696 0.000   3.904
## control         0.000    0.000 0.000   0.000
## 
## MSD
##            instructor computer piano control
## instructor      0.000    2.329 2.329   2.329
## computer        2.329    0.000 2.329   2.329
## piano           2.329    2.329 0.000   2.329
## control         2.329    2.329 2.329   0.000
plot(tukes)

12. Two Way ANOVA, Week 4 lab (second lab)

See Note 12

soil_string <- "
5.2 100 4 5.9   100 4 6.3   100 4 7.1   100 5
7.4 100 5 7.5   100 5 7.6   100 6 7.2   100 6
7.4 100 6 7.2   100 7 7.5   100 7 7.2   100 7
7.4 200 4 7.0   200 4 7.6   200 4 7.4   200 5
7.3 200 5 7.1   200 5 7.6   200 6 7.5   200 6
7.8 200 6 7.4   200 7 7.0   200 7 6.9   200 7
6.3 300 4 6.7   300 4 6.1   300 4 7.3   300 5
7.5 300 5 7.2   300 5 7.2   300 6 7.3   300 6
7.0 300 6 6.8   300 7 6.6   300 7 6.4   300 7
"

soil <- read.table(text=gsub("\n", " ", soil_string, perl=TRUE), header=F, col.names=c("diameter", "calcium", "ph"))

soil$calcium <- as.factor(soil$calcium)
soil$ph <- as.factor(soil$ph)

Using car package

#Two way Anova
Anova(lm(diameter ~ calcium * ph, data=soil), type=3)
## Anova Table (Type III tests)
## 
## Response: diameter
##              Sum Sq Df    F value    Pr(>F)    
## (Intercept) 1790.70  1 26420.1680 < 2.2e-16 ***
## calcium        1.47  2    10.8238 0.0004462 ***
## ph             4.46  3    21.9385 4.635e-07 ***
## calcium:ph     3.26  6     8.0041 8.186e-05 ***
## Residuals      1.63 24                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Output residuals and test them for normality
resids <- resid(lm(diameter ~ calcium * ph, data=soil))

shapiro.test(resids)
## 
##  Shapiro-Wilk normality test
## 
## data:  resids
## W = 0.98385, p-value = 0.8659
layout(mat = matrix(c(1,2), 2, 1),  height = c(3,1))
par(mar=c(0, 3, 2, 1))
hist(resids, xlim=c(-1,1), col="pink", main="Distribution of Residuals")
boxplot(resids, horizontal=T, outline=T, frame=F, col="green1", width=10, ylim=c(-1,1), axes=F)

qqnorm(resids)
qqline(resids)

Another example from HW4 #14-8

#Another example from HW4 #14-8
attention <- read.csv("data/CH14/ex14-8.txt")

attention_stacked <- attention %>% 
  gather(key="key", value="span", X.A1.P1., X.A2.P1., X.A3.P1., X.A1.P2., X.A2.P2., X.A3.P2.) 

attention_stacked$key <- gsub('^..|.$', '', attention_stacked$key)

attention_stacked <- separate(attention_stacked, key, c("age", "product"))

#Two way anova
Anova(lm(span ~ age * product, data=attention_stacked), type=3)
## Anova Table (Type III tests)
## 
## Response: span
##             Sum Sq Df  F value    Pr(>F)    
## (Intercept)  44608  1 303.2284 < 2.2e-16 ***
## age           1303  2   4.4287 0.0165575 *  
## product       2018  1  13.7202 0.0005001 ***
## age:product   1384  2   4.7049 0.0130735 *  
## Residuals     7944 54                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

13. Completely Randomized Block Design, HW3 #15-6

music <- read.csv("data/CH15/ex15-6.txt")
colnames(music) <- c("subject", "type_music", "typing_efficiency")
music$subject <- as.factor(music$subject)
Anova(lm(typing_efficiency ~ type_music + subject, data=music), type=3)
## Anova Table (Type III tests)
## 
## Response: typing_efficiency
##             Sum Sq Df   F value    Pr(>F)    
## (Intercept) 9557.3  1 4041.0201 < 2.2e-16 ***
## type_music    31.0  2    6.5436 0.0119776 *  
## subject      149.3  6   10.5235 0.0003462 ***
## Residuals     28.4 12                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

14. Latin Square Design (LSD), HW3 #15-40

lights <- read.csv("data/CH15/ex15-40.txt")
colnames(lights) <- c("intersection", "minutes", "signaling", "period")
lights$intersection <- as.factor(lights$intersection)
lights$period <- as.factor(lights$period)
Anova(lm(minutes ~ signaling + intersection + period, data=lights), type=3)
## Anova Table (Type III tests)
## 
## Response: minutes
##               Sum Sq Df   F value    Pr(>F)    
## (Intercept)  13372.6  1 2289.3339 4.545e-15 ***
## signaling       76.3  4    3.2648   0.04976 *  
## intersection    18.2  4    0.7801   0.55927    
## period        1091.7  4   46.7222 3.202e-07 ***
## Residuals       70.1 12                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Bonus tukey
fit <- aov(minutes ~ signaling + intersection + period, data=lights)
tukes <- TukeyC(fit, "signaling")
tukes
## Results
##     Means G1 G2
## 'B' 24.88  a   
## 'E' 24.14  a  b
## 'A' 24.12  a  b
## 'D' 22.50  a  b
## 'C' 20.00     b
## 
## Sig.level
##  0.05
## 
## Diff_Prob
##       'B'   'E'   'A'   'D'  'C'
## 'B' 0.000 0.740 0.760 2.380 4.88
## 'E' 0.987 0.000 0.020 1.640 4.14
## 'A' 0.986 1.000 0.000 1.620 4.12
## 'D' 0.549 0.817 0.823 0.000 2.50
## 'C' 0.050 0.111 0.113 0.504 0.00
## 
## MSD
##       'B'   'E'   'A'   'D'   'C'
## 'B' 0.000 4.872 4.872 4.872 4.872
## 'E' 4.872 0.000 4.872 4.872 4.872
## 'A' 4.872 4.872 0.000 4.872 4.872
## 'D' 4.872 4.872 4.872 0.000 4.872
## 'C' 4.872 4.872 4.872 4.872 0.000
plot(tukes)

15. ANCOVA, Week 5 lab

oysters_string <- "
1 1 27.2 32.6
1 2 32.0 36.6
1 3 33.0 37.7
1 4 26.8 31.0
2 1 28.6 33.8
2 2 26.8 31.7
2 3 26.5 30.7
2 4 26.8 30.4
3 1 28.6 35.2
3 2 22.4 29.1
3 3 23.2 28.9
3 4 24.4 30.2
4 1 29.3 35.0
4 2 21.8 27.0
4 3 30.3 36.4
4 4 24.3 30.5
5 1 20.4 24.6
5 2 19.6 23.4
5 3 25.1 30.3
5 4 18.1 21.8
"

oysters <- read.table(text=gsub("\n", " ", oysters_string, perl=TRUE), header=F, col.names=c("treatment", "replication", "initial_weight", "final_weight"))

oysters$treatment <- as.factor(oysters$treatment)
oysters$replication <- as.factor(oysters$replication)
#ANCOVA
Anova(lm(final_weight ~ initial_weight + treatment, data=oysters), type=3)
## Anova Table (Type III tests)
## 
## Response: final_weight
##                 Sum Sq Df  F value    Pr(>F)    
## (Intercept)      1.718  1   5.6959 0.0316708 *  
## initial_weight 156.040  1 517.3840 1.867e-12 ***
## treatment       12.089  4  10.0212 0.0004819 ***
## Residuals        4.222 14                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Test for heteroskedasticity (interaction term)
Anova(lm(final_weight ~ initial_weight * treatment, data=oysters), type=3)
## Anova Table (Type III tests)
## 
## Response: final_weight
##                          Sum Sq Df  F value    Pr(>F)    
## (Intercept)               0.064  1   0.2275    0.6436    
## initial_weight           68.529  1 241.8091 2.472e-08 ***
## treatment                 1.696  4   1.4963    0.2752    
## initial_weight:treatment  1.388  4   1.2247    0.3602    
## Residuals                 2.834 10                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

16. Random Effect and Mixed Effect Models, HW5 #17-10

See Note 16 for all the testing

pesticide <- read.csv("data/CH17/ex17-10.txt")
colnames(pesticide) <- c("location", "chemical", "number_killed")
pesticide$location <- as.factor(pesticide$location)
pesticide$chemical <- as.factor(pesticide$chemical)
#Make F table manually
fit <- lm(number_killed ~ location * chemical, data=pesticide)
results <- Anova(fit, type=3) #this is the anova table. The following replaces F tests with MS_main/MS_interaction
Df <- results$Df
SumSq <- results$"Sum Sq"
MeanSq <- SumSq/results$Df
Fvalue <- results$"F value"
Pvalue <- results$"Pr(>F)"
Error.Term <- MeanSq[4]
df.error <- Df[4]

Fvalue[2] <- MeanSq[2]/Error.Term
Pvalue[2] <- 1 - pf(Fvalue[2], Df[2], df.error)

Fvalue[3] <- MeanSq[3]/Error.Term
Pvalue[3] <- 1 - pf(Fvalue[3], Df[3], df.error)

Ftable <- cbind(Df, SumSq, MeanSq, Fvalue, Pvalue)
rownames(Ftable) <- c("Intercept", "Locations", "Chemicals", "Locations:Chemicals", "Residuals")
print(Ftable)
##                     Df     SumSq      MeanSq       Fvalue       Pvalue
## Intercept            1 1686.1022 1686.102250 4869.6093863 2.314014e-25
## Locations            4    3.8115    0.952875    0.7076461 6.020037e-01
## Chemicals            3  180.1327   60.044250   44.5914534 8.797523e-07
## Locations:Chemicals 12   16.1585    1.346542    3.8889290 3.652306e-03
## Residuals           20    6.9250    0.346250           NA           NA

Varcomp, see Note 16 for reference.

library(VCA)
## Warning: package 'VCA' was built under R version 3.5.3
#Look at VC column in fit$aov.tab
fit <- remlMM(number_killed~(location)*chemical, pesticide, cov=T)
print(fit)
## 
## 
## REML-Estimation of Mixed Model:
## -------------------------------
## 
##  [Fixed Effects]
## 
##       int chemical1 chemical2 chemical3 chemical4 
##      5.71      2.85     -2.34      2.62      0.00 
## 
## 
##  [Variance Components]
## 
##   Name              DF        VC       %Total    SD       CV[%]    
## 1 total             24.594337 0.797191 100       0.892856 13.752109
## 2 location          0         0        2.1e-05   0.00041  0.006316 
## 3 location:chemical 5.989053  0.450943 56.566506 0.671523 10.343059
## 4 error             20        0.346248 43.433473 0.588428 9.063201 
##   Var(VC) 
## 1 0.05168 
## 2 0.016228
## 3 0.067907
## 4 0.011989
## 
## Mean: 6.4925 (N = 40) 
## 
## Experimental Design: balanced  |  Method: REML

17. Nested Factors, Week 8 lab #1

drug_string <- "
5.03
5.10
5.25
4.98
5.05
4.64
4.73
4.82
4.95
5.06
5.10
5.15
5.20
5.08
5.14
5.05
4.96
5.12
5.12
5.05
5.46
5.15
5.18
5.18
5.11
4.90
4.95
4.86
4.86
5.07
"

drug_y <- read.table(text=gsub("\n", " ", drug_string, perl=TRUE), header=F, col.names = c('y'))

counter_y = 0
site <- c()
batch <- c()
tablet <- c()

for (i in 1:2){ #site
  for (j in 1:3){ #batch
    for (k in 1:5){ #tablet
      counter_y <- counter_y + 1
      site[counter_y] <- i
      batch[counter_y] <- j
      tablet[counter_y] <- k
    }
  }
}

drug <- data.frame(site, batch, tablet, drug_y)
drug$site <- as.factor(drug$site)
drug$batch <- as.factor(drug$batch)
drug$tablet <- as.factor(drug$tablet)
rm(site, batch, tablet, drug_y)
#Remembering to convert the categorical variables to factors is important.
#Specifying nested term: site/batch == batch(site) in SAS
fit <- lm(y ~ site + site/batch, data=drug)
results <- Anova(fit, type=3) 

Df <- results$Df
SumSq <- results$"Sum Sq"
MeanSq <- SumSq/results$Df
Fvalue <- results$"F value"
Pvalue <- results$"Pr(>F)"
Error.Term <- MeanSq[3]
df.error <- Df[3]

Fvalue[2] <- MeanSq[2]/Error.Term
Pvalue[2] <- 1 - pf(Fvalue[2], Df[2], df.error)

Ftable <- cbind(Df, SumSq, MeanSq, Fvalue, Pvalue)
rownames(Ftable) <- c("Intercept", "Site", "Batch(Site)", "Residuals")
print(Ftable)
##             Df        SumSq       MeanSq       Fvalue       Pvalue
## Intercept    1 763.05633333 763.05633333 6.310597e+04 1.469251e-42
## Site         1   0.01825333   0.01825333 1.608176e-01 7.089034e-01
## Batch(Site)  4   0.45401333   0.11350333 9.386906e+00 1.028393e-04
## Residuals   24   0.29020000   0.01209167           NA           NA

Week 8 lab, #2

y_string <- "
2.2 2.0 2.1 2.3 2.5 2.7 2.8 2.5
2.4 2.4 2.2 1.8 2.8 2.5 2.5 2.3
3.0 2.4 2.0 1.9 2.7 2.0 2.4 2.6
2.7 2.3 1.9 2.5 2.7 2.0 2.4 2.6
2.5 2.2 1.8 2.6 2.7 2.6 2.4 2.7
2.1 2.2 1.7 2.3 2.5 2.4 2.4 2.5
"
y_vector <- read.table(text=gsub("\n", " ", y_string, perl=TRUE), header=F, col.names = c('y'))

counter_y = 0
device <- c()
rep <- c()
facility <- c()
tester <- c()

for (i in 1:3){ #device
  for (j in 1:2){ #rep
    for (k in 1:2){ #facility
      for (l in 1:4){ #tester
        counter_y <- counter_y + 1
        device[counter_y] <- i
        rep[counter_y] <- j
        facility[counter_y] <- k 
        tester[counter_y] <- l
      }
    }
  }
}

engi <- data.frame(device, rep, facility, tester, y_vector)
engi$device <- as.factor(engi$device)
engi$rep <- as.factor(engi$rep)
engi$facility <- as.factor(engi$facility)
engi$tester <- as.factor(engi$tester)

rm(device, rep, facility, tester, y_vector)
fit <- lm(y ~ facility + device + facility:device + facility/tester, data=engi)
results <- Anova(fit, type=3) 

Df <- results$Df
SumSq <- results$"Sum Sq"
MeanSq <- SumSq/results$Df
Fvalue <- results$"F value"
Pvalue <- results$"Pr(>F)"
Error.Term <- MeanSq[5]
df.error <- Df[5]

Fvalue[2] <- MeanSq[2]/Error.Term
Pvalue[2] <- 1 - pf(Fvalue[2], Df[2], df.error)

Ftable <- cbind(Df, SumSq, MeanSq, Fvalue, Pvalue)
rownames(Ftable) <- c("Intercept", "Facility", "Device", "Facility*Device", "Tester(Facility)", "Residuals")
print(Ftable)
##                  Df       SumSq      MeanSq       Fvalue       Pvalue
## Intercept         1 269.3268750 269.3268750 5.191843e+03 1.606325e-40
## Facility          1   0.9352083   0.9352083 5.072316e+00 6.524540e-02
## Device            2   0.0087500   0.0043750 8.433735e-02 9.193022e-01
## Facility*Device   2   0.2254167   0.1127083 2.172691e+00 1.285749e-01
## Tester(Facility)  6   1.1062500   0.1843750 3.554217e+00 7.237563e-03
## Residuals        36   1.8675000   0.0518750           NA           NA

18. Crossover Design, Week 9 lab

y_string <- "
1.5 2.2 3.4
2.0 2.6 3.1
1.6 2.7 3.2
1.1 2.3 2.9
2.5 3.5 1.9
2.8 3.1 1.5
2.7 2.9 2.4
2.4 2.6 2.3
3.3 1.9 2.7
3.1 1.6 2.5
3.6 2.3 2.2
3.0 2.5 2.0
"
y_vector <- read.table(text=gsub("\n", " ", y_string, perl=TRUE), header=F, col.names = c('y'))

counter_y = 0
sequence <- c()
patient <- c()
period <- c()

for (i in 1:3){ #sequence
  for (j in 1:4){ #patient
    for (k in 1:3){ #period
      counter_y <- counter_y + 1
      sequence[counter_y] <- i
      patient[counter_y] <- j
      period[counter_y] <- k 
    }
  }
}

bp <- data.frame(sequence, patient, period, y_vector)
bp$sequence<- as.factor(bp$sequence)
bp$patient<- as.factor(bp$patient)
bp$period<- as.factor(bp$period)

rm(sequence, patient, period, y_vector)

trt_str <- data.frame(sequence=as.factor(c(1,2,3)),
                        period_1=c("T1", "T2", "T3"),
                        period_2=c("T2", "T3", "T1"),
                        period_3=c("T3", "T1", "T2")
                        )
trt_str <- gather(trt_str, key="period", value="treatment", period_1, period_2, period_3)
trt_str$period <- as.factor(gsub("period_", "", trt_str$period, perl=TRUE))


bp <- inner_join(bp, trt_str, c("sequence", "period"))
bp$treatment<- as.factor(bp$treatment)
#Because of perfect multicolinearity, singular.ok = T, see Note 18
fit <- lm(y ~ sequence + treatment + period + sequence/patient + treatment:period, data=bp)
results <- Anova(fit, type=3, singular.ok=T) 
## Note: model has aliased coefficients
##       sums of squares computed by model comparison
print(results)
## Anova Table (Type III tests)
## 
## Response: y
##                  Sum Sq Df F values   Pr(>F)    
## sequence         0.0000  0                      
## treatment        9.5172  2  41.5129 1.81e-07 ***
## period           0.0172  2   0.0751  0.92792    
## sequence:patient 0.6692  9   0.6486  0.74252    
## treatment:period 0.6489  2   2.8304  0.08535 .  
## Residuals        2.0633 18                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Since interaction is insignificant, rerun model without it.
fit <- lm(y ~ sequence + treatment + period + sequence/patient, data=bp)
results <- Anova(fit, type=3) 

Df <- results$Df
SumSq <- results$"Sum Sq"
MeanSq <- SumSq/results$Df
Fvalue <- results$"F value"
Pvalue <- results$"Pr(>F)"
Error.Term <- MeanSq[5]
df.error <- Df[5]

Fvalue[2] <- MeanSq[2]/Error.Term
Pvalue[2] <- 1 - pf(Fvalue[2], Df[2], df.error)

Ftable <- cbind(Df, SumSq, MeanSq, Fvalue, Pvalue)
rownames(Ftable) <- c("Intercept", "Sequence", "Treatment", "Period", "Patient(Sequence)", "Residuals")
print(Ftable)
##                   Df        SumSq       MeanSq       Fvalue       Pvalue
## Intercept          1 224.50027778 2.245003e+02 1.655471e+03 1.040635e-20
## Sequence           2   0.23388889 1.169444e-01 1.572852e+00 2.595310e-01
## Treatment          2   9.51722222 4.758611e+00 3.509013e+01 2.878627e-07
## Period             2   0.01722222 8.611111e-03 6.349857e-02 9.386639e-01
## Patient(Sequence)  9   0.66916667 7.435185e-02 5.482726e-01 8.221441e-01
## Residuals         20   2.71222222 1.356111e-01           NA           NA

Note 4

#Use gsub for messier strings. Here it's unnecessary.
#golf <- read.table(text=gsub("(?<=[a-z])\\s+", " ", golf_text, perl=TRUE), header=FALSE, col.names = c("gender", "score"))

Note 5

#Calculation of V:
diff <- mining$after - mining$before
diff <- diff[diff!=0] #delete all differences = 0, as per Signed Rank procedure
diff.rank <- rank(abs(diff))
diff.rank.sign <- diff.rank * sign(diff[])
ranks.pos <- sum(diff.rank.sign[diff.rank.sign > 0]) 
ranks.neg <- -sum(diff.rank.sign[diff.rank.sign < 0]) 

Note 6

Referenced used for ANOVA: https://www.r-bloggers.com/anova-%E2%80%93-type-iiiiii-ss-explained/

#SAS calculates Leven using Squared deviations. R and it's packages seem to do Absolute instead.
#The following replicates what SAS does.
#https://stackoverflow.com/questions/22396543/levenes-test-using-the-squared-residuals

oil.lm <- lm(visc ~ type, data = oil)
anova(lm(residuals(oil.lm)^2 ~ oil$type))

#HOV Levene using absolute deviations
leveneTest(visc ~ type, data=oil)

Using ezANOVA package

library(ez)

#ezANOVA requires an ID column, wid. This is so that it can handle repeated measures.
oil$id <- as.factor(c(1:length(oil$type)))


av <- ezANOVA(data=oil, dv=visc, wid=id ,between=type, detail=T, type=3, white.adjust=T)
print(av)

ANT example

data(ANT)
b_anova_full <- ezANOVA(data=ANT,
                        dv=rt,
                        wid=subnum,
                        between=group,
                        within_full=.(cue, flank),
                        type=3,
                        detailed=T,
                        return_aov=T)
print(b_anova_full)

Note 11

#Base R uses TukeyHSD, which yields less elegant output.
tukes <- TukeyHSD(fit, "method")
tukes
plot(tukes)

Note 12

Using ezANOVA package

soil$id <- as.factor(c(1:length(soil$diameter))) #ezANOVA requires an id column
soil_anova <- ezANOVA(data=soil, dv=diameter, wid=id, between=.(calcium, ph), type=3, detailed=T, return_aov=T)

resids <- resid(soil_anova$aov) #bc we used return_aov=T, we can call the base aov() and output the residuals

shapiro.test(resids)

layout(mat = matrix(c(1,2), 2, 1),  height = c(3,1))
par(mar=c(0, 3, 2, 1))
hist(resids, xlim=c(-1,1), col="pink", main="Distribution of Residuals")
boxplot(resids, horizontal=T, outline=T, frame=F, col="green1", width=10, ylim=c(-1,1), axes=F)

qqnorm(resids)
qqline(resids)

Note 16

reference: https://mcfromnz.wordpress.com/2011/03/02/anova-type-iiiiii-ss-explained/ Varcomp reference: https://stats.stackexchange.com/questions/161225/estimates-of-the-variance-of-the-variance-component-of-a-mixed-effects-model Testing on paint

paint <- read.csv("data/CH17/ex17-1.txt")
colnames(paint) <- c("batch", "percentage")

Testing on pesticide

#Both of these attempts have chemical F = 48.11, but answer should be 173.41 if divided by ms_error, or 44.49 if divided by ms_interaction

#attempt 1 with interaction, wrong
library(afex)
mixed(number_killed ~ chemical + (1|location) + (1|location:chemical), data=pesticide)

#with no interaction, CORRECT, but that means interaction is wrong...
model2 <- lmer_alt(number_killed ~ chemical + (1|location), data=pesticide)
anova(model2)

#attempt 2 with interaction, wrong
model3 <- lmer_alt(number_killed ~ chemical + (1|location) + (1|chemical:location), data=pesticide)
summary(model3)

Working solution (but answer is type 2)

#Make F table manually
fit <- lm(number_killed ~ location * chemical, data=pesticide)
results <- Anova(fit) #this is the anova table. The following replaces F tests with MS_main/MS_interaction
Df <- results$Df
SumSq <- results$"Sum Sq"
MeanSq <- SumSq/results$Df
Fvalue <- results$"F value"
Pvalue <- results$"Pr(>F)"
Error.Term <- MeanSq[3]
df.error <- Df[3]

Fvalue[1] <- MeanSq[1]/Error.Term
Pvalue[1] <- 1 - pf(Fvalue[1], Df[1], df.error)

Fvalue[2] <- MeanSq[2]/Error.Term
Pvalue[2] <- 1 - pf(Fvalue[2], Df[2], df.error)

Ftable <- cbind(Df, SumSq, MeanSq, Fvalue, Pvalue)
rownames(Ftable) <- c("Locations", "Chemicals", "Locations:Chemicals", "Residuals")
print(Ftable)

Misc things

library(lmerTest) 

coef(fit)


#other attempts
model <- lmerTest::lmer(number_killed ~ chemical + (1|location) + (1|location:chemical), data=pesticide)
rand(model)

linearHypothesis(model, "chemical=0") #doesnt work

Note 18

For carryover, unused and unfinished

carryover <- trt_str
carryover$period <- as.factor(as.numeric(trt_str$period) + 1)

Regarding multicolinearity: https://stats.stackexchange.com/questions/112442/what-are-aliased-coefficients