######### HOMEWORK 2, STOR 665: COMPUTING PART, DUE FEB 24, 2016 #########
######### Student Name: Jie Huang
# INSTRUCTION: Edit this R file by adding your solution to the end of each
# question. Your submitted file should run smoothly in R and provide all
# required results. You are allowed to work with other students but the
# homework should be in your own words. Identical solutions will receive a 0
# in grade and will be investigated.
# This homework is an exercise on ANOVA. We will
# build up R code that 1) produces important ANOVA statistics, and
# 2) perform ANOVA on a real dataset.
# We will be negligent in the sense that we will ignore precautions
# that a numerical analyst would build into his code, and we will not
# consider measures to improve numerical conditioning such as pivoting
# (switching the order of the variables) either.
#----------------------------------------------------------------
# PROBLEM 1: Description of the coagulation data. Start by loading the data:
rm(list=ls(all=TRUE))
library(faraway) # install this package if needed
attach(coagulation) # load the coagulation data.
coagulation # Type help(coagulation) for more details
## coag diet
## 1 62 A
## 2 60 A
## 3 63 A
## 4 59 A
## 5 63 B
## 6 67 B
## 7 71 B
## 8 64 B
## 9 65 B
## 10 66 B
## 11 68 C
## 12 66 C
## 13 71 C
## 14 67 C
## 15 68 C
## 16 68 C
## 17 56 D
## 18 62 D
## 19 60 D
## 20 61 D
## 21 63 D
## 22 64 D
## 23 63 D
## 24 59 D
coag;diet
## [1] 62 60 63 59 63 67 71 64 65 66 68 66 71 67 68 68 56 62 60 61 63 64 63
## [24] 59
## [1] A A A A B B B B B B C C C C C C D D D D D D D D
## Levels: A B C D
# There are four groups of observations in the data. Make four plots comparing
# the mean, median, variance, and distribution of the observations within the
# four groups. Make comments.
# SOLUTION:
data = split(coag, diet) #split the data into 4 categories
#calculate mean median variance:
dmean = sapply(data,mean)
dmedian = sapply(data, median)
dvariance = sapply(data, var)
#plots:
par(mfrow=c(2,2))
boxplot(data,main="distribution of the observations") # or use plot(coag ~ diet)
barplot(dmean, main = "Mean")
barplot(dmedian, main = "Median")
barplot(dvariance, main = "variance")
# my comment:
#Group C has highest mean median and lowest variance
#Group D and A have the same mean, but group A has much smaller variance
#Group B has highest variance
#I did this before class, but Prof Zhang taught another method, here:
attributes(diet)
## $levels
## [1] "A" "B" "C" "D"
##
## $class
## [1] "factor"
plot.design(coagulation) ### Comparison the means
title("Comparison of the means")
plot.design(coagulation, fun=median) ### Comparing the medians
title("Comparison of the medians")
plot.design(coagulation, fun=var) ### Comparing the variances
title("Comparison of the variance")
boxplot(coag~diet) ### Outlier seen in Group C!
title("Boxplots for each treatment")
#----------------------------------------------------------------
# PROBLEM 2: Write a function my.anova(x,y) that takes x as a vector indicating
# the groups and y as a vector with observations. The function then computes a
# list with the following named elements:
# a) "Group_Mean": a matrix with one row per group and columns
# named "Mean" and "Std.Err.Est"
# b) "RMSE": a number, s
# c) "SSE": a number, SSE
# d) "df_SSE": a number, df_SSE
# e) "SST": a number, SST
# f) "df_SST": a number, df_SST
# g) "F": a number, F
# h) "p-value": a number, pv, for the F-test
# You may use matrix multiplication if convenient, but no high-level
# functions other than things like sqrt().
# Try your solution on the coagulation data and show the full result:
# my.anova(diet,coag)
# Compare with results from the canned functions
# anova(lm(coag~diet))
# and
# oneway.test(coag~diet,var.equal=T)
# You need to get the exact numbers for the F stat and the p-value.
# Are the group means all equal?
# SOLUTION:
my.anova <- function(x,y)
{
I = nlevels(x) #how many treatments --- I
n = length(x) # total number of observations
# new = data.frame(x,y)
# newdata=new[order(x),]
# x = newdata$x
# y = newdata$y
# a) "Group_Mean": a matrix with one row per group and columns
# named "Mean" and "Std.Err.Est"
matrixA <- matrix(0,I,5)
#initiate a matrix, will update later
colnames(matrixA) <- c("Mean" , "Std.Err.Est", "Sum", "Count", "Sum of Square")
rownames(matrixA) <- c(levels(x))
k=1
matrixA[1,3] = y[1]
matrixA[1,4] = 1
matrixA[1,5] = y[1]^2
for (i in 2:n)
{
if (x[i] != x[i-1]) #if start a different category, then
{
k = k+1
}
matrixA[k,3] = matrixA[k,3] + y[i]
matrixA[k,4] = matrixA[k,4] + 1
matrixA[k,5] = matrixA[k,5] + y[i]^2
}
for (k in 1:I)
{matrixA[k,1] = matrixA[k,3]/matrixA[k,4]
#sum[(y-ybar)^2] = sum[y^2] - n*ybar^2
#variance = sum[(y-ybar)^2] /(n-1)
matrixA[k,2] = sqrt((matrixA[k,5] - matrixA[k,4]* matrixA[k,1]^2)/(matrixA[k,4]-1))}
# c) "SSE": a number, SSE
SSE=0
for (k in 1:I)
{SSE = SSE + matrixA[k,5] - matrixA[k,4]*matrixA[k,1]^2}
#b) "RMSE": a number, s
s = sqrt(SSE/n)
# d) "df_SSE": a number, df_SSE
df_SSE = n-I
# e) "SST": a number, SST
SST = sum(matrixA[,4]* matrixA[,1]^2) - n * mean(y)^2
# f) "df_SST": a number, df_SST
df_SST = I-1
# g) "F": a number, F
F=(SST/df_SST)/(SSE/df_SSE)
# h) "p-value": a number, pv, for the F-test
p = 1-pf(F,df_SST,df_SSE)
returned = list(Group_Mean=matrixA, RMSE = s, SSE=SSE, df_SSE=df_SSE, SST=SST, df_SST=df_SST, F=F, "p-value"=p)
return(returned)
}
my.anova(diet,coag)
## $Group_Mean
## Mean Std.Err.Est Sum Count Sum of Square
## A 61 1.825742 244 4 14894
## B 66 2.828427 396 6 26176
## C 68 1.673320 408 6 27758
## D 61 2.618615 488 8 29816
##
## $RMSE
## [1] 2.160247
##
## $SSE
## [1] 112
##
## $df_SSE
## [1] 20
##
## $SST
## [1] 228
##
## $df_SST
## [1] 3
##
## $F
## [1] 13.57143
##
## $`p-value`
## [1] 4.658471e-05
anova(lm(coag~diet))
## Analysis of Variance Table
##
## Response: coag
## Df Sum Sq Mean Sq F value Pr(>F)
## diet 3 228 76.0 13.571 4.658e-05 ***
## Residuals 20 112 5.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
oneway.test(coag~diet,var.equal=T)
##
## One-way analysis of means
##
## data: coag and diet
## F = 13.5714, num df = 3, denom df = 20, p-value = 4.658e-05
#----------------------------------------------------------------
# PROBLEM 3: Test the equality of variance through the Bartlett test.
# bartlett.test(coag~diet)
# Can we assume that the group variances are all equal?
# If not, then the test of equality of group means can be done by
# oneway.test(coag~diet)
# Compare this test with the same test in Problem 2 assuming an equal
# variance. Make comments.
# SOLUTION:
bartlett.test(coag~diet)
##
## Bartlett test of homogeneity of variances
##
## data: coag by diet
## Bartlett's K-squared = 1.668, df = 3, p-value = 0.6441
not significant, so we can assume that the group variances are all equal
oneway.test(coag~diet)
##
## One-way analysis of means (not assuming equal variances)
##
## data: coag and diet
## F = 16.7281, num df = 3.000, denom df = 9.953, p-value = 0.0003249
both of these two assumptions (equal variance v.s. not assuming equal var) gives a highly significant p value
#----------------------------------------------------------------
# PROBLEM 4: Construct the Tukey CIs for the contrasts.
# Which group means are different if any?
# SOLUTION:
fm1 <- aov(coag~diet)
summary(fm1)
## Df Sum Sq Mean Sq F value Pr(>F)
## diet 3 228 76.0 13.57 4.66e-05 ***
## Residuals 20 112 5.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(fm1)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = coag ~ diet)
##
## $diet
## diff lwr upr p adj
## B-A 5 0.7245544 9.275446 0.0183283
## C-A 7 2.7245544 11.275446 0.0009577
## D-A 0 -4.0560438 4.056044 1.0000000
## C-B 2 -1.8240748 5.824075 0.4766005
## D-B -5 -8.5770944 -1.422906 0.0044114
## D-C -7 -10.5770944 -3.422906 0.0001268
It seems that B-A, C-A, D-B, D-C have different group means