library(reshape2)
library(car)
## Loading required package: carData
library(ggplot2)
#Loading the data.
crop <- read.csv("/Users/michaelcajigal/Desktop/MA552/Final/crop.csv")
#Checking for any missing data.
head(crop)
## density block fertilizer yield
## 1 1 1 1 177.2287
## 2 2 2 1 177.5500
## 3 1 3 1 176.4085
## 4 2 4 1 177.7036
## 5 1 1 1 177.1255
## 6 2 2 1 176.7783
summary(crop)
## density block fertilizer yield
## Min. :1.0 Min. :1.00 Min. :1 Min. :175.4
## 1st Qu.:1.0 1st Qu.:1.75 1st Qu.:1 1st Qu.:176.5
## Median :1.5 Median :2.50 Median :2 Median :177.1
## Mean :1.5 Mean :2.50 Mean :2 Mean :177.0
## 3rd Qu.:2.0 3rd Qu.:3.25 3rd Qu.:3 3rd Qu.:177.4
## Max. :2.0 Max. :4.00 Max. :3 Max. :179.1
colSums(is.na(crop))
## density block fertilizer yield
## 0 0 0 0
#Convert fertilizer as a factor.
crop$fertilizer <- as.factor(crop$fertilizer)
#Tests for Normality.
#a. Histogram per group.
ggplot(crop, aes(x = yield)) +
geom_histogram(binwidth = 0.3, fill = "gray", color = "black") +
facet_wrap(~ fertilizer) +
theme_minimal()
#b. Q-Q plots per group.
ggplot(crop, aes(sample = yield)) +
stat_qq() +
stat_qq_line() +
facet_wrap(~ fertilizer) +
theme_minimal()
#c. Shapiro-Wilk Tests per group.
by(crop$yield, crop$fertilizer, shapiro.test)
## crop$fertilizer: 1
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.97914, p-value = 0.7743
##
## ------------------------------------------------------------
## crop$fertilizer: 2
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.98329, p-value = 0.8875
##
## ------------------------------------------------------------
## crop$fertilizer: 3
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.95878, p-value = 0.2542
#Tests for Homegeneity of Variances.
#Levene’s Test.
leveneTest(yield ~ fertilizer, data = crop)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 0.8472 0.4319
## 93
#ANOVA test since necessary assumptions are met.
anova_result <- aov(yield ~ fertilizer, data = crop)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## fertilizer 2 6.07 3.0340 7.863 7e-04 ***
## Residuals 93 35.89 0.3859
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Q-Q plot for residuals
qqnorm(residuals(anova_result))
qqline(residuals(anova_result), col = "red")
#Histogram of ANOVA residuals
hist(residuals(anova_result))
# Tukey HSD test for multiple comparisons with confidence intervals
tukey_result <- TukeyHSD(anova_result)
print(tukey_result)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = yield ~ fertilizer, data = crop)
##
## $fertilizer
## diff lwr upr p adj
## 2-1 0.1761687 -0.19371896 0.5460564 0.4954705
## 3-1 0.5991256 0.22923789 0.9690133 0.0006125
## 3-2 0.4229569 0.05306916 0.7928445 0.0208735
plot(tukey_result)
Report your results.
A one-way ANOVA was conducted to examine the effect of fertilizer type on crop yield. The results revealed a statistically significant difference in mean yield across the three fertilizer groups, F(2, 93) = 7.863, p < .001. Post-hoc comparisons using Tukey’s HSD test indicated that Fertilizer 3 significantly yielded more crops than Fertilizer 1 (Mean Difference = 0.599 , p < .001) and Fertilizer 2 (Mean Difference = 0.423 , p = 0.021). Additionally, Fertilizer 2 did not significantly differ than Fertilizer 1 (p = 0.495). Residual diagnostics suggested normality and homogeneity of variances were reasonably met.
#Loading the data.
sprint <- read.csv("/Users/michaelcajigal/Desktop/MA552/Final/sprint.csv")
#Checking for missing data.
head(sprint)
## Smoking Sprint
## 1 0 7.978
## 2 0 8.004
## 3 0 NA
## 4 NA 8.473
## 5 2 NA
## 6 0 4.650
summary(sprint)
## Smoking Sprint
## Min. :0.0000 Min. :4.503
## 1st Qu.:0.0000 1st Qu.:5.588
## Median :0.0000 Median :6.569
## Mean :0.4307 Mean :6.582
## 3rd Qu.:1.0000 3rd Qu.:7.458
## Max. :2.0000 Max. :9.597
## NA's :24 NA's :61
colSums(is.na(sprint))
## Smoking Sprint
## 24 61
#Since there are missing data, we will handle by removing entire row with at least one column with missing data.
sprint_clean <- na.omit(sprint)
#Checking again.
colSums(is.na(sprint_clean))
## Smoking Sprint
## 0 0
#Convert Smoking as a factor.
sprint_clean$Smoking <- as.factor(sprint_clean$Smoking)
#Tests for Normality.
#a. Histogram per group.
ggplot(sprint_clean, aes(x = Sprint)) +
geom_histogram(binwidth = 0.3, fill = "gray", color = "black") +
facet_wrap(~ Smoking) +
theme_minimal()
#b. Q-Q plots per group.
ggplot(sprint_clean, aes(sample = Sprint)) +
stat_qq() +
stat_qq_line() +
facet_wrap(~ Smoking) +
theme_minimal()
#c. Shapiro-Wilk Tests per group.
by(sprint_clean$Sprint, sprint_clean$Smoking, shapiro.test)
## sprint_clean$Smoking: 0
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.96037, p-value = 1.385e-06
##
## ------------------------------------------------------------
## sprint_clean$Smoking: 1
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.96846, p-value = 0.4388
##
## ------------------------------------------------------------
## sprint_clean$Smoking: 2
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.96901, p-value = 0.137
#Tests for Homegeneity of Variances.
#Levene’s Test.
leveneTest(Sprint ~ Smoking, data = sprint_clean)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 2.3218 0.09961 .
## 350
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Conduct the appropriate test to address the research question.
Since only the non-smoker group violates the assumption of normality with Q-Q plots looking mostly symmetrical overall (besides the slightly left-skewed for non-smoker group), and the test for equal variances is not significant, I will proceed with ANOVA paired with a Kruskal-Wallis non-parametric test.
#ANOVA test since necessary assumptions are met.
anova_result <- aov(Sprint ~ Smoking, data = sprint_clean)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## Smoking 2 26.8 13.394 9.209 0.000127 ***
## Residuals 350 509.1 1.455
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Q-Q plot for residuals
qqnorm(residuals(anova_result))
qqline(residuals(anova_result), col = "red")
#Histogram of ANOVA residuals
hist(residuals(anova_result))
#Run Kruskal-Wallis test
#less sensitive to outliers
kruskal_result <- kruskal.test(Sprint ~ Smoking, data = sprint_clean)
print(kruskal_result)
##
## Kruskal-Wallis rank sum test
##
## data: Sprint by Smoking
## Kruskal-Wallis chi-squared = 18.379, df = 2, p-value = 0.0001021
# Tukey HSD test for multiple comparisons with confidence intervals
tukey_result <- TukeyHSD(anova_result)
print(tukey_result)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Sprint ~ Smoking, data = sprint_clean)
##
## $Smoking
## diff lwr upr p adj
## 1-0 0.4238467 -0.1006135 0.9483070 0.1395572
## 2-0 0.7094287 0.3002200 1.1186374 0.0001644
## 2-1 0.2855819 -0.3314776 0.9026415 0.5213170
plot(tukey_result)