Homework3.utf8.md

library(car) #leveneTest

## Loading required package: carData

# Load the RatWeightGain dataset.
# stringsAsFactors = TRUE is passed explicitly: since R 4.0.0 the default is
# FALSE, which would load SOURCE and TYPE as character vectors, while the
# grouping-based calls later in this script (leveneTest, TukeyHSD) expect
# factor grouping variables.
ratdata <- read.csv("RatWeightGain.csv", stringsAsFactors = TRUE)

# Mean and standard deviation of weight gain for each diet SOURCE
ratsourcem <- aggregate(WEIGHTGAIN ~ SOURCE, data = ratdata, FUN = mean)
ratsourcesd <- aggregate(WEIGHTGAIN ~ SOURCE, data = ratdata, FUN = sd)
ratsourcem

##   SOURCE WEIGHTGAIN
## 1   Beef       89.6
## 2 Cereal       84.9

ratsourcesd

##   SOURCE WEIGHTGAIN
## 1   Beef   17.71232
## 2 Cereal   14.99438

# Mean and standard deviation of weight gain for each diet TYPE
rattypem <- aggregate(WEIGHTGAIN ~ TYPE, data = ratdata, FUN = mean)
rattypesd <- aggregate(WEIGHTGAIN ~ TYPE, data = ratdata, FUN = sd)
rattypem

##   TYPE WEIGHTGAIN
## 1 High      92.95
## 2  Low      81.55

rattypesd

##   TYPE WEIGHTGAIN
## 1 High   16.36259
## 2  Low   14.63045

# Boxplot of weight gain by diet SOURCE.
# The original note quoted the plot as showing a mean of about 90 for Beef and
# below 90 for Cereal; the line inside each box is the group median, not the
# mean, so those figures should be read as medians.
boxplot(WEIGHTGAIN ~ SOURCE, data = ratdata, pch = 19,
        main = "WEIGHTGAIN Vs SOURCE", xlab = "SOURCE", ylab = "WEIGHTGAIN")

# Boxplot of weight gain by diet TYPE.
# The original note quoted the plot as showing a mean of about 95 for the High
# diet and below 85 for the Low diet; the line inside each box is the group
# median, not the mean, so those figures should be read as medians.
boxplot(WEIGHTGAIN ~ TYPE, data = ratdata, pch = 19,
        main = "WEIGHTGAIN Vs TYPE", xlab = "TYPE", ylab = "WEIGHTGAIN")

# Levene's test for homogeneity of variance across the SOURCE groups
# H0: the group variances are equal
# H1: the group variances are not all equal
# The p-value (0.5833) is greater than 0.05, so we fail to reject H0: there is
# no significant difference between the group variances in SOURCE.
leveneTest(WEIGHTGAIN ~ SOURCE, data = ratdata)

## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  1  0.3062 0.5833
##       38

# One-way ANOVA of weight gain on SOURCE
# H0: all group means are equal
# H1: the group means are not all equal
# Fit the linear model that the analysis of variance is computed from.
analysis <- lm(formula = WEIGHTGAIN ~ SOURCE, data = ratdata)
# The ANOVA table lists the degrees of freedom, sums of squares and mean
# squares (variability between sources and variability within a source).
# Here F < 1 and p = 0.3708 > 0.05, so we fail to reject H0: there is no
# evidence that mean weight gain differs between sources.
anova(object = analysis)

## Analysis of Variance Table
## 
## Response: WEIGHTGAIN
##           Df  Sum Sq Mean Sq F value Pr(>F)
## SOURCE     1   220.9  220.90  0.8203 0.3708
## Residuals 38 10232.6  269.28

# Diagnostic plots for the SOURCE model: residuals vs. fitted values ...
plot(analysis, which = 1L)

# ... and the normal Q-Q plot of the residuals
plot(analysis, which = 2L)

# Histogram of the standardized residuals.
# NOTE(review): the original note described the shape as positively skewed,
# but its wording was ambiguous - confirm against the plot.
resids <- rstandard(model = analysis)
hist(resids)

# H0 was not rejected, so no Tukey test is needed to compare the SOURCE means.

# Levene's test for homogeneity of variance across the TYPE groups
# H0: the group variances are equal
# H1: the group variances are not all equal
# The p-value (0.6542) is greater than 0.05, so we fail to reject H0: there is
# no significant difference between the group variances in TYPE.
leveneTest(WEIGHTGAIN ~ TYPE, data = ratdata)

## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  1  0.2038 0.6542
##       38

# One-way ANOVA of weight gain on TYPE
# H0: all group means are equal
# H1: the group means are not all equal
# Fit the linear model that the analysis of variance is computed from.
analysis2 <- lm(formula = WEIGHTGAIN ~ TYPE, data = ratdata)
# The ANOVA table lists the degrees of freedom, sums of squares and mean
# squares (variability between types and variability within a type).
# Here F > 1 and p = 0.02565 < 0.05, so we reject H0: mean weight gain
# differs between the diet types.
anova(object = analysis2)

## Analysis of Variance Table
## 
## Response: WEIGHTGAIN
##           Df Sum Sq Mean Sq F value  Pr(>F)  
## TYPE       1 1299.6 1299.60  5.3949 0.02565 *
## Residuals 38 9153.9  240.89                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Diagnostic plots for the TYPE model: residuals vs. fitted values ...
plot(analysis2, which = 1L)

# ... and the normal Q-Q plot of the residuals
plot(analysis2, which = 2L)

# Histogram of the standardized residuals; the original note describes the
# distribution as left-skewed.
resids <- rstandard(model = analysis2)
hist(resids)

# H0 was rejected, so mean weight gain differs between the diet types. To see
# how large the difference in means is, run Tukey's HSD on the aov fit of the
# same model.
# The Low-High difference is -11.4 with an adjusted p-value of about 0.026
# (< 0.05), so the two types differ significantly from one another.
TukeyHSD(x = aov(formula = analysis2))

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = analysis2)
## 
## $TYPE
##           diff       lwr       upr     p adj
## Low-High -11.4 -21.33588 -1.464119 0.0256485

#__________________________________________________________________________________________________________

# Research question: Does weight gain depend on the source of the diet, the
# type of diet, and the interaction between the two?
#H0 = There is no interaction between source and type of diet
#H1 = There is an interaction between source and type of diet

# Frequency table of SOURCE by TYPE. If every cell holds the same count, the
# design is balanced.
table(ratdata$SOURCE, ratdata$TYPE)

##         
##          High Low
##   Beef     10  10
##   Cereal   10  10

# Visualize the data. The original note reads a median of about 104 for the
# Beef/High group and about 84 for the Beef/Low group off the boxplot.
boxplot(WEIGHTGAIN ~ SOURCE * TYPE, data = ratdata)

# Interaction plot of mean weight gain by SOURCE, one trace per TYPE.
# The original note expected the traces to intersect but did not see a
# crossing point in the plot.
interaction.plot(x.factor = ratdata$SOURCE, trace.factor = ratdata$TYPE,
                 response = ratdata$WEIGHTGAIN)

# Two-way ANOVA. Start from the model that includes the SOURCE:TYPE
# interaction; if the interaction turns out not to be significant, an additive
# model can be used instead.
aovres3 <- aov(formula = WEIGHTGAIN ~ SOURCE * TYPE, data = ratdata)
summary(object = aovres3)

##             Df Sum Sq Mean Sq F value Pr(>F)  
## SOURCE       1    221   220.9   0.988 0.3269  
## TYPE         1   1300  1299.6   5.812 0.0211 *
## SOURCE:TYPE  1    884   883.6   3.952 0.0545 .
## Residuals   36   8049   223.6                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Interpret results:
# From ANOVA results, we see that based on p-values and significance level of 0.05:
# 1. The p-value for SOURCE (0.3269) is > 0.05, which indicates that the
# source of the diet is not associated with a significant difference in
# weight gain.

# 2. The p-value for TYPE (0.0211) is < 0.05, which indicates that the type of
# diet is associated with a significant difference in weight gain.

# 3. The p-value for the SOURCE:TYPE interaction is 0.0545, which is slightly
# above 0.05. At the 5% significance level we therefore fail to reject H0:
# there is no statistically significant interaction between source and type of
# diet. The result is borderline (significant only at the 10% level), so the
# effect of source on weight gain may still depend on the type of diet.

# The ANOVA is not significant for the source of the diet, but a significant
# difference exists for the type of diet, so Tukey's HSD is run for TYPE only.
# Low-High: adjusted p-value < 0.05, i.e. a significant difference.
TukeyHSD(x = aovres3, which = "TYPE", conf.level = 0.95)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = WEIGHTGAIN ~ SOURCE * TYPE, data = ratdata)
## 
## $TYPE
##           diff    lwr   upr     p adj
## Low-High -11.4 -20.99 -1.81 0.0211449

# 1. Check the homogeneity-of-variance assumption
# 1.1 Residuals vs. fitted plot
# The original note reads this plot as suggesting that the linearity
# assumption is reasonable.
# Observations 6 (residual about -30), 33 (about -29) and 11 (about -31) are
# flagged as outliers; removing outliers can help to meet the test assumptions.
plot(aovres3, which = 1)

# Levene's test across the SOURCE x TYPE treatment groups.
# The p-value (0.9202) is greater than 0.05, so we can assume homogeneity of
# variances in the different treatment groups.
# car is already attached by the library(car) call at the top of the script,
# so it is not loaded again here.
leveneTest(WEIGHTGAIN ~ SOURCE * TYPE, data = ratdata)

## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  3  0.1635 0.9202
##       36

# Check the normality assumption
# Normal Q-Q plot of the residuals: quantiles of the residuals are plotted
# against quantiles of the normal distribution, together with a reference
# line. If the residuals are normally distributed, the points should roughly
# follow that straight line. Here observations 6, 33 and 11 fall below the
# line and are flagged as outliers.
plot(aovres3, which = 2)

# Histogram of the residuals; the original note judges it to be roughly
# symmetric ("equally skewed").
aov_residuals <- residuals(object = aovres3)
hist(aov_residuals)

Homework3.R

arnabchakraboty

2020-02-12