# INDEPENDENT T-TEST & MANN-WHITNEY U TEST

# HYPOTHESIS TESTED:
# To determine whether Medication A is more effective at reducing headaches than the current medication on the market (Medication B).

# H0: There is no difference in the mean number of headache days between participants taking Medication A and those taking Medication B.
# H1: Medication A is more effective than Medication B, meaning the mean number of headache days for the Medication A group is less than that for the Medication B group.

# PACKAGE SETUP
# Point install.packages() at the CRAN cloud mirror so it never prompts for one.
options(repos = c(CRAN = "https://cloud.r-project.org"))

# Install readxl only when it is missing. Reinstalling on every run is slow,
# needs a network connection, and on Windows fails to overwrite a package that
# is already installed and in use (the earlier unconditional install produced
# "cannot remove prior installation" / "Permission denied" warnings).
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}

# LOAD THE PACKAGE

library(readxl)

# IMPORT THE DATASET
# The columns used later in this script are Medication and HeadacheDays.
A6R1 <- read_excel("C:/Users/chris/Downloads/A6R1.xlsx")

# DESCRIPTIVE STATISTICS

# Install dplyr only when it is missing. The earlier unconditional install tried
# to overwrite the copy that was already installed and produced
# "cannot remove prior installation" / "Permission denied" warnings.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

# LOAD THE PACKAGE

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Descriptive statistics of HeadacheDays for each Medication group.
# Mean, Median and SD drop missing values (na.rm = TRUE), so N counts only the
# non-missing HeadacheDays values; this keeps the reported N equal to the
# number of observations the other statistics were computed from. When no
# values are missing this is the same as the number of rows in the group.
A6R1 %>%
  group_by(Medication) %>%
  summarise(
    Mean = mean(HeadacheDays, na.rm = TRUE),
    Median = median(HeadacheDays, na.rm = TRUE),
    SD = sd(HeadacheDays, na.rm = TRUE),
    N = sum(!is.na(HeadacheDays))
  )
## # A tibble: 2 × 5
##   Medication  Mean Median    SD     N
##   <chr>      <dbl>  <dbl> <dbl> <int>
## 1 A            8.1    8    2.81    50
## 2 B           12.6   12.5  3.59    50
# HISTOGRAMS



# CREATE THE HISTOGRAMS 
# Replace "dataset" with your dataset name (without .xlsx)
# Replace "score" with your dependent variable R code name (example: USD)
# Replace "group" with your independent variable R code name (example: Country)
# Replace "Group1" with the R code name for your first group (example: USA)
# Replace "Group2" with the R code name for your second group (example: India)

# Histogram of HeadacheDays for the rows where Medication is "A" (Group 1).
with(
  A6R1,
  hist(
    HeadacheDays[Medication == "A"],
    breaks = 20,
    col = "lightblue",
    border = "black",
    main = "Histogram of Group 1 Scores",
    xlab = "Value",
    ylab = "Frequency"
  )
)

# Histogram of HeadacheDays for the rows where Medication is "B" (Group 2).
with(
  A6R1,
  hist(
    HeadacheDays[Medication == "B"],
    breaks = 20,
    col = "lightgreen",
    border = "black",
    main = "Histogram of Group 2 Scores",
    xlab = "Value",
    ylab = "Frequency"
  )
)

# 1) The SKEWNESS of the VARIABLE 1 histogram.
# The histogram of VARIABLE 1 (Group 1 Scores) appears fairly symmetrical with a single peak.
# Therefore, its skewness is likely to be close to zero.

# 2) The KURTOSIS of the VARIABLE 1 histogram.
# The histogram has a moderate peak and tails, suggesting a mesokurtic distribution.
# It resembles a proper bell curve.

# 3) The SKEWNESS of the VARIABLE 2 histogram.
# The histogram of VARIABLE 2 (Group 2 Scores) shows multiple peaks and a slight right tail.
# This suggests it is slightly positively skewed.

# 4) The KURTOSIS of the VARIABLE 2 histogram.
# The histogram has sharper peaks and heavier tails, indicating a leptokurtic distribution.
# It looks too tall compared to a normal bell curve.

# SHAPIRO-WILK TEST
# Purpose: Check the normality for each group's score statistically.
# The Shapiro-Wilk Test is a test that checks skewness and kurtosis at the same time.
# The test is checking "Is this variable the SAME as normal data (null hypothesis) or DIFFERENT from normal data (alternate hypothesis)?"
# For this test, if p is GREATER than .05 (p > .05), the data is NORMAL.
# If p is LESS than .05 (p < .05), the data is NOT normal.

# CONDUCT THE SHAPIRO-WILK TEST
# Replace "dataset" with your dataset name (without .xlsx)
# Replace "score" with your dependent variable R code name (example: USD)
# Replace "group" with your independent variable R code name (example: Country)
# Replace "Group1" with the R code name for your first group (example: USA)
# Replace "Group2" with the R code name for your second group (example: India)

# Shapiro-Wilk normality test on the HeadacheDays values of the rows where
# Medication is "A". The "data:" line of the printed result echoes this exact
# expression, so the call is left as written to match the recorded output.
shapiro.test(A6R1$HeadacheDays[A6R1$Medication == "A"])
## 
##  Shapiro-Wilk normality test
## 
## data:  A6R1$HeadacheDays[A6R1$Medication == "A"]
## W = 0.97852, p-value = 0.4913
# Same test on the HeadacheDays values of the rows where Medication is "B".
# Both recorded p-values are above .05, so normality is not rejected for
# either group.
shapiro.test(A6R1$HeadacheDays[A6R1$Medication == "B"])
## 
##  Shapiro-Wilk normality test
## 
## data:  A6R1$HeadacheDays[A6R1$Medication == "B"]
## W = 0.98758, p-value = 0.8741
# Medication A: p = .491 (p > .05), so the data are normally distributed for Variable 1.
# Medication B: p = .874 (p > .05), so the data are normally distributed for Variable 2.

# Continue to the boxplot below.

# BOXPLOT

# INSTALL REQUIRED PACKAGE

# Install the plotting packages only when they are missing; reinstalling them
# on every run is slow, needs a network connection, and can fail on Windows
# when the package is already installed and in use.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
if (!requireNamespace("ggpubr", quietly = TRUE)) {
  install.packages("ggpubr")
}

# LOAD THE PACKAGE
# Always reload the package you want to use.

library(ggplot2)
library(ggpubr)

# CREATE THE BOXPLOT
# Replace "dataset" with your dataset name (without .xlsx)
# Replace "score" with your dependent variable R code name (example: USD)
# Replace "group" with your independent variable R code name (example: Country)


# Boxplot of HeadacheDays by Medication, coloured by group ("jco" palette).
# add = "jitter" overlays the individual observations as points.
# NOTE(review): because every observation is drawn as a point, a point lying
# outside a box is not automatically an outlier; presumably only points beyond
# the whiskers should be read as extreme scores. Confirm when interpreting.
ggboxplot(
  data = A6R1,
  x = "Medication",
  y = "HeadacheDays",
  color = "Medication",
  palette = "jco",
  add = "jitter"
)

# Q1) Were there any dots outside of the boxplots? These dots represent participants with extreme scores.
# Yes, both boxplots (for Medication A and B) show several dots outside the boxes, indicating the presence of outliers.

# Q2) If there are outliers, in your opinion are the scores of those dots changing the mean so much that the mean no longer accurately represents the average score?
# Although there are outliers in both groups, they are relatively few compared to the total number of data points.
# Therefore, the mean still appears to be a reasonably accurate representation of the average score.

# If there were no extreme outliers, this means the data is NORMAL. Continue to the Independent t-test.

# INDEPENDENT T-TEST 
#============================================================================
# PURPOSE: Test if there was a difference between the means of the two groups.

# Replace "dataset" with your dataset name (without .xlsx)
# Replace "score" with your dependent variable R code name (example: USD)
# Replace "group" with your independent variable R code name (example: Country)

# Independent (two-sample) t-test of HeadacheDays across the two Medication
# groups, assuming equal variances (var.equal = TRUE).
# NOTE(review): H1 at the top of this script is directional (Medication A has
# fewer headache days), but this call uses the default two-sided alternative.
# Consider alternative = "less" and re-running to refresh the recorded output.
t.test(
  HeadacheDays ~ Medication,
  var.equal = TRUE,
  data = A6R1
)
## 
##  Two Sample t-test
## 
## data:  HeadacheDays by Medication
## t = -6.9862, df = 98, p-value = 3.431e-10
## alternative hypothesis: true difference in means between group A and group B is not equal to 0
## 95 percent confidence interval:
##  -5.778247 -3.221753
## sample estimates:
## mean in group A mean in group B 
##             8.1            12.6
# DETERMINE STATISTICAL SIGNIFICANCE

# Results were statistically significant (p < .05), continue to effect size section below.

# EFFECT-SIZE

# Install effectsize only when it is missing; reinstalling it on every run is
# slow, needs a network connection, and can fail on Windows when the package
# is already installed and in use.
if (!requireNamespace("effectsize", quietly = TRUE)) {
  install.packages("effectsize")
}

library(effectsize)

# COHEN’S D

# Cohen's d for HeadacheDays between the two Medication groups, standardised
# by the pooled SD (pooled_sd = TRUE). The result is stored, then printed.
cohen_d_result <- cohens_d(
  HeadacheDays ~ Medication,
  pooled_sd = TRUE,
  data = A6R1
)
print(cohen_d_result)
## Cohen's d |         95% CI
## --------------------------
## -1.40     | [-1.83, -0.96]
## 
## - Estimated using pooled SD.
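# Q1) What is the size of the effect?

# Very large: d = -1.40 indicates a very large difference between the means of Medication A and Medication B.

# Q2) Which group had the higher average score?
# Group B (Medication B) had the higher average (M = 12.6 vs. M = 8.1 for Medication A).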



# WRITTEN REPORT FOR INDEPENDENT T-TEST

#  OUTPUT
#    1. The name of the inferential test used: Independent t-test
#    2. The names of the IV (Medication) and DV (HeadacheDays)
#    3. The sample size for each group: n = 50
#    4. The inferential test results were statistically significant (p < .05)
#    5. Medication A: M = 8.10, SD = 2.81
#    6. Medication B: M = 12.60, SD = 3.59
#    7. Degrees of freedom = 98
#    8. t-value = -6.9862
#    9. p-value: p < .001 (exact value from the output: p = 3.431e-10)
#   10. Effect size (Cohen’s d) = -1.40


# 2) REPORT 

# REPORT

# An Independent t-test was conducted to compare 
# the number of headache days between participants who took Medication A (n = 50) and those who took Medication B (n = 50). 
# Participants taking Medication A reported significantly fewer headache days (M = 8.10, SD = 2.81) than 
# those taking Medication B (M = 12.60, SD = 3.59), t(98) = -6.99, p < .001.
# The effect size was very large (d = -1.40), indicating a substantial difference in headache reduction between the two medications.
# Overall, Medication A was more effective at reducing headache days than Medication B.