######### HOMEWORK 2, STOR 665: COMPUTING PART, DUE FEB 24, 2016 #########

######### Student Name:                      Jie Huang

# INSTRUCTION: Edit this R file by adding your solution to the end of each 
# question. Your submitted file should run smoothly in R and provide all 
# required results. You are allowed to work with other students but the 
# homework should be in your own words. Identical solutions will receive a 0 
# in grade and will be investigated.

# This homework is an exercise on ANOVA.  We will
# build up R code that 1) produces important ANOVA statistics, and
# 2) perform ANOVA on a real dataset.

# We will be negligent in the sense that we will ignore precautions 
# that a numerical analyst would build into his code, and we will not
# consider measures to improve numerical conditioning such as pivoting
# (switching the order of the variables) either.

#----------------------------------------------------------------

# PROBLEM 1: Description of the coagulation data. Start by loading the data:
  rm(list=ls(all=TRUE))   
  library(faraway)                  # install this package if needed
  attach(coagulation)               # load the coagulation data.
  coagulation                       # Type help(coagulation) for more details
##    coag diet
## 1    62    A
## 2    60    A
## 3    63    A
## 4    59    A
## 5    63    B
## 6    67    B
## 7    71    B
## 8    64    B
## 9    65    B
## 10   66    B
## 11   68    C
## 12   66    C
## 13   71    C
## 14   67    C
## 15   68    C
## 16   68    C
## 17   56    D
## 18   62    D
## 19   60    D
## 20   61    D
## 21   63    D
## 22   64    D
## 23   63    D
## 24   59    D
  coag;diet
##  [1] 62 60 63 59 63 67 71 64 65 66 68 66 71 67 68 68 56 62 60 61 63 64 63
## [24] 59
##  [1] A A A A B B B B B B C C C C C C D D D D D D D D
## Levels: A B C D
# There are four groups of observations in the data. Make four plots comparing
# the mean, median, variance, and distribution of the observations within the
# four groups. Make comments.

# SOLUTION:
data = split(coag, diet)  #split the data into 4 categories
#calculate mean median variance:
dmean = sapply(data,mean) 
dmedian = sapply(data, median)
dvariance = sapply(data, var)

#plots:
par(mfrow=c(2,2))
boxplot(data,main="distribution of the observations")  # or use plot(coag ~ diet)
barplot(dmean, main = "Mean")
barplot(dmedian, main = "Median")
barplot(dvariance, main = "variance")

# my comment:
#Group C has highest mean median and lowest variance
#Group D and A have the same mean, but group A has much smaller variance
#Group B has highest variance


#I did this before class, but Prof Zhang taught another method, here:
attributes(diet)
## $levels
## [1] "A" "B" "C" "D"
## 
## $class
## [1] "factor"
plot.design(coagulation)            ### Comparison the means
title("Comparison of the means")
plot.design(coagulation, fun=median)  ### Comparing the medians
title("Comparison of the medians")
plot.design(coagulation, fun=var)   ### Comparing the variances
title("Comparison of the variance")
boxplot(coag~diet)          ### Outlier seen in Group C!
title("Boxplots for each treatment")

#----------------------------------------------------------------
# PROBLEM 2: Write a function my.anova(x,y) that takes x as a vector indicating
# the groups and y as a vector with observations. The function then computes a 
# list with the following named elements:

# a) "Group_Mean": a matrix with one row per group and columns
#    named "Mean" and "Std.Err.Est"
# b) "RMSE": a number, s
# c) "SSE": a number, SSE
# d) "df_SSE": a number, df_SSE
# e) "SST": a number, SST
# f) "df_SST": a number, df_SST
# g) "F": a number, F
# h) "p-value": a number, pv, for the F-test

# You may use matrix multiplication if convenient, but no high-level
# functions other than things like sqrt().

# Try your solution on the coagulation data and show the full result:
#   my.anova(diet,coag)
# Compare with results from the canned functions
#   anova(lm(coag~diet))
# and
#   oneway.test(coag~diet,var.equal=T)
# You need to get the exact numbers for the F stat and the p-value.
# Are the group means all equal?
 
# SOLUTION:

my.anova <- function(x,y)
  {
  I = nlevels(x) #how many treatments --- I
  n = length(x)  # total number of observations
#   new = data.frame(x,y)
#   newdata=new[order(x),]
#   x = newdata$x
#   y = newdata$y
  
# a) "Group_Mean": a matrix with one row per group and columns
#    named "Mean" and "Std.Err.Est"  
  matrixA <- matrix(0,I,5)

  #initiate a matrix, will update later
  colnames(matrixA) <- c("Mean" , "Std.Err.Est", "Sum", "Count", "Sum of Square")
  rownames(matrixA) <- c(levels(x))
  
  k=1
  matrixA[1,3] = y[1]
  matrixA[1,4] = 1
  matrixA[1,5] = y[1]^2

  for (i in 2:n)
    {
    if (x[i] != x[i-1])  #if start a different category, then 
      {
      k = k+1
       }
    matrixA[k,3] = matrixA[k,3] + y[i]
    matrixA[k,4] = matrixA[k,4] + 1
    matrixA[k,5] =  matrixA[k,5] + y[i]^2
     }

  for (k in 1:I)
    {matrixA[k,1] = matrixA[k,3]/matrixA[k,4]
     
     #sum[(y-ybar)^2] = sum[y^2] - n*ybar^2
     #variance = sum[(y-ybar)^2] /(n-1)
     matrixA[k,2] = sqrt((matrixA[k,5] - matrixA[k,4]* matrixA[k,1]^2)/(matrixA[k,4]-1))}

# c) "SSE": a number, SSE
SSE=0
for (k in 1:I)
  {SSE = SSE + matrixA[k,5] - matrixA[k,4]*matrixA[k,1]^2}

#b) "RMSE": a number, s
s = sqrt(SSE/n)

# d) "df_SSE": a number, df_SSE
df_SSE = n-I

# e) "SST": a number, SST
SST = sum(matrixA[,4]* matrixA[,1]^2) - n * mean(y)^2

# f) "df_SST": a number, df_SST
df_SST = I-1
# g) "F": a number, F
F=(SST/df_SST)/(SSE/df_SSE)

# h) "p-value": a number, pv, for the F-test
p = 1-pf(F,df_SST,df_SSE)

  returned = list(Group_Mean=matrixA, RMSE = s, SSE=SSE, df_SSE=df_SSE, SST=SST, df_SST=df_SST, F=F, "p-value"=p)
  return(returned)
  }
my.anova(diet,coag)
## $Group_Mean
##   Mean Std.Err.Est Sum Count Sum of Square
## A   61    1.825742 244     4         14894
## B   66    2.828427 396     6         26176
## C   68    1.673320 408     6         27758
## D   61    2.618615 488     8         29816
## 
## $RMSE
## [1] 2.160247
## 
## $SSE
## [1] 112
## 
## $df_SSE
## [1] 20
## 
## $SST
## [1] 228
## 
## $df_SST
## [1] 3
## 
## $F
## [1] 13.57143
## 
## $`p-value`
## [1] 4.658471e-05
anova(lm(coag~diet))
## Analysis of Variance Table
## 
## Response: coag
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## diet       3    228    76.0  13.571 4.658e-05 ***
## Residuals 20    112     5.6                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
oneway.test(coag~diet,var.equal=T)
## 
##  One-way analysis of means
## 
## data:  coag and diet
## F = 13.5714, num df = 3, denom df = 20, p-value = 4.658e-05
#----------------------------------------------------------------

# PROBLEM 3: Test the equality of variance through the Bartlett test.
#   bartlett.test(coag~diet)
# Can we assume that the group variances are all equal?

# If not, then the test of equality of group means can be done by
#  oneway.test(coag~diet)
# Compare this test with the same test in Problem 2 assuming an equal
# variance. Make comments.

# SOLUTION:

bartlett.test(coag~diet)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  coag by diet
## Bartlett's K-squared = 1.668, df = 3, p-value = 0.6441

not significant, so we can assume that the group variances are all equal

oneway.test(coag~diet)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  coag and diet
## F = 16.7281, num df = 3.000, denom df = 9.953, p-value = 0.0003249

both of these two assumptions (equal variance v.s. not assuming equal var) gives a highly significant p value

#----------------------------------------------------------------

# PROBLEM 4: Construct the Tukey CIs for the contrasts.
# Which group means are different if any?

# SOLUTION:

fm1 <- aov(coag~diet)
summary(fm1)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## diet         3    228    76.0   13.57 4.66e-05 ***
## Residuals   20    112     5.6                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(fm1)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = coag ~ diet)
## 
## $diet
##     diff         lwr       upr     p adj
## B-A    5   0.7245544  9.275446 0.0183283
## C-A    7   2.7245544 11.275446 0.0009577
## D-A    0  -4.0560438  4.056044 1.0000000
## C-B    2  -1.8240748  5.824075 0.4766005
## D-B   -5  -8.5770944 -1.422906 0.0044114
## D-C   -7 -10.5770944 -3.422906 0.0001268

It seems that B-A, C-A, D-B, D-C have different group means