library(reshape2)
library(car)
## Loading required package: carData
library(ggplot2)

Problem 3a: A researcher investigated whether three different types of fertilizer have an effect on crop yield (data file = crop).

#Loading the data.
crop <- read.csv("/Users/michaelcajigal/Desktop/MA552/Final/crop.csv")

#Checking for any missing data.
head(crop)
##   density block fertilizer    yield
## 1       1     1          1 177.2287
## 2       2     2          1 177.5500
## 3       1     3          1 176.4085
## 4       2     4          1 177.7036
## 5       1     1          1 177.1255
## 6       2     2          1 176.7783
summary(crop)
##     density        block        fertilizer     yield      
##  Min.   :1.0   Min.   :1.00   Min.   :1    Min.   :175.4  
##  1st Qu.:1.0   1st Qu.:1.75   1st Qu.:1    1st Qu.:176.5  
##  Median :1.5   Median :2.50   Median :2    Median :177.1  
##  Mean   :1.5   Mean   :2.50   Mean   :2    Mean   :177.0  
##  3rd Qu.:2.0   3rd Qu.:3.25   3rd Qu.:3    3rd Qu.:177.4  
##  Max.   :2.0   Max.   :4.00   Max.   :3    Max.   :179.1
colSums(is.na(crop))
##    density      block fertilizer      yield 
##          0          0          0          0
#Convert fertilizer as a factor.
crop$fertilizer <- as.factor(crop$fertilizer)

#Tests for Normality.
#a. Histogram per group.
ggplot(crop, aes(x = yield)) +
  geom_histogram(binwidth = 0.3, fill = "gray", color = "black") +
  facet_wrap(~ fertilizer) +
  theme_minimal()

#b. Q-Q plots per group.
ggplot(crop, aes(sample = yield)) +
  stat_qq() +
  stat_qq_line() +
  facet_wrap(~ fertilizer) +
  theme_minimal()

#c. Shapiro-Wilk Tests per group.
by(crop$yield, crop$fertilizer, shapiro.test)
## crop$fertilizer: 1
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.97914, p-value = 0.7743
## 
## ------------------------------------------------------------ 
## crop$fertilizer: 2
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.98329, p-value = 0.8875
## 
## ------------------------------------------------------------ 
## crop$fertilizer: 3
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.95878, p-value = 0.2542
#Tests for Homegeneity of Variances.
#Levene’s Test.
leveneTest(yield ~ fertilizer, data = crop)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  0.8472 0.4319
##       93
#ANOVA test since necessary assumptions are met.
anova_result <- aov(yield ~ fertilizer, data = crop)
summary(anova_result)
##             Df Sum Sq Mean Sq F value Pr(>F)    
## fertilizer   2   6.07  3.0340   7.863  7e-04 ***
## Residuals   93  35.89  0.3859                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Q-Q plot for residuals
qqnorm(residuals(anova_result))
qqline(residuals(anova_result), col = "red")

#Histogram of ANOVA residuals
hist(residuals(anova_result))

# Tukey HSD test for multiple comparisons with confidence intervals
tukey_result <- TukeyHSD(anova_result)
print(tukey_result)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = yield ~ fertilizer, data = crop)
## 
## $fertilizer
##          diff         lwr       upr     p adj
## 2-1 0.1761687 -0.19371896 0.5460564 0.4954705
## 3-1 0.5991256  0.22923789 0.9690133 0.0006125
## 3-2 0.4229569  0.05306916 0.7928445 0.0208735
plot(tukey_result)

Problem 3b: Does sprint time differ based on smoking status (Smoking: Nonsmoker = 0, Past smoker =1, Current smoker = 2) (data file = sprint)?

#Loading the data.
sprint <- read.csv("/Users/michaelcajigal/Desktop/MA552/Final/sprint.csv")

#Checking for missing data.
head(sprint)
##   Smoking Sprint
## 1       0  7.978
## 2       0  8.004
## 3       0     NA
## 4      NA  8.473
## 5       2     NA
## 6       0  4.650
summary(sprint)
##     Smoking           Sprint     
##  Min.   :0.0000   Min.   :4.503  
##  1st Qu.:0.0000   1st Qu.:5.588  
##  Median :0.0000   Median :6.569  
##  Mean   :0.4307   Mean   :6.582  
##  3rd Qu.:1.0000   3rd Qu.:7.458  
##  Max.   :2.0000   Max.   :9.597  
##  NA's   :24       NA's   :61
colSums(is.na(sprint))
## Smoking  Sprint 
##      24      61
#Since there are missing data, we will handle by removing entire row with at least one column with missing data.
sprint_clean <- na.omit(sprint)

#Checking again.
colSums(is.na(sprint_clean))
## Smoking  Sprint 
##       0       0
#Convert Smoking as a factor.
sprint_clean$Smoking <- as.factor(sprint_clean$Smoking)


#Tests for Normality.
#a. Histogram per group.
ggplot(sprint_clean, aes(x = Sprint)) +
  geom_histogram(binwidth = 0.3, fill = "gray", color = "black") +
  facet_wrap(~ Smoking) +
  theme_minimal()

#b. Q-Q plots per group.
ggplot(sprint_clean, aes(sample = Sprint)) +
  stat_qq() +
  stat_qq_line() +
  facet_wrap(~ Smoking) +
  theme_minimal()

#c. Shapiro-Wilk Tests per group.
by(sprint_clean$Sprint, sprint_clean$Smoking, shapiro.test)
## sprint_clean$Smoking: 0
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.96037, p-value = 1.385e-06
## 
## ------------------------------------------------------------ 
## sprint_clean$Smoking: 1
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.96846, p-value = 0.4388
## 
## ------------------------------------------------------------ 
## sprint_clean$Smoking: 2
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.96901, p-value = 0.137
#Tests for Homegeneity of Variances.
#Levene’s Test.
leveneTest(Sprint ~ Smoking, data = sprint_clean)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value  Pr(>F)  
## group   2  2.3218 0.09961 .
##       350                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#ANOVA test since necessary assumptions are met.
anova_result <- aov(Sprint ~ Smoking, data = sprint_clean)
summary(anova_result)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## Smoking       2   26.8  13.394   9.209 0.000127 ***
## Residuals   350  509.1   1.455                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Q-Q plot for residuals
qqnorm(residuals(anova_result))
qqline(residuals(anova_result), col = "red")

#Histogram of ANOVA residuals
hist(residuals(anova_result))

#Run Kruskal-Wallis test
#less sensitive to outliers
kruskal_result <- kruskal.test(Sprint ~ Smoking, data = sprint_clean)
print(kruskal_result)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Sprint by Smoking
## Kruskal-Wallis chi-squared = 18.379, df = 2, p-value = 0.0001021
# Tukey HSD test for multiple comparisons with confidence intervals
tukey_result <- TukeyHSD(anova_result)
print(tukey_result)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Sprint ~ Smoking, data = sprint_clean)
## 
## $Smoking
##          diff        lwr       upr     p adj
## 1-0 0.4238467 -0.1006135 0.9483070 0.1395572
## 2-0 0.7094287  0.3002200 1.1186374 0.0001644
## 2-1 0.2855819 -0.3314776 0.9026415 0.5213170
plot(tukey_result)