############################################################
# Assignment 6 — Research Scenario 3
# Training: Pre vs Post Communication
# DEPENDENT T-TEST & NORMALITY CHECK
############################################################

# NULL HYPOTHESIS (H0)
# There is no difference between the PreTraining scores and PostTraining scores.

# ALTERNATE HYPOTHESIS (H1)
# There is a difference between the PreTraining scores and PostTraining scores.


############################
# IMPORT EXCEL FILE
############################

# Load required package
# If never installed, remove the hashtag before the install code.
# If previously installed, leave the hashtag in front of the code.
# install.packages("readxl")

library(readxl)

# Import the Excel dataset (the path is relative to the working directory)
dataset <- read_excel("A6R3.xlsx")

# Fail fast if the expected score columns are missing or non-numeric.
# read_excel() returns a tibble, and `$` on a missing tibble column only
# warns and returns NULL, so without this check the problem would surface
# later, inside the plotting or testing calls, with a less helpful message.
required_cols <- c("PreTraining", "PostTraining")
missing_cols <- setdiff(required_cols, names(dataset))
if (length(missing_cols) > 0) {
  stop(
    "A6R3.xlsx is missing required column(s): ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}
stopifnot(
  is.numeric(dataset[["PreTraining"]]),
  is.numeric(dataset[["PostTraining"]])
)

# Paired score vectors: row i of each vector belongs to the same participant
Before <- dataset[["PreTraining"]]
After  <- dataset[["PostTraining"]]

# Difference scores: After minus Before
# (a positive value means the PostTraining score was higher)
Differences <- After - Before


############################
# HISTOGRAM OF DIFFERENCE SCORES
############################

# Plot the distribution of the difference scores (After - Before) so its
# symmetry and peakedness can be judged by eye; 20 bins are requested.
hist(
  x = Differences,
  breaks = 20,
  col = "blue",
  border = "black",
  main = "Histogram of Difference Scores",
  xlab = "After - Before",
  ylab = "Frequency"
)

# QUESTION 1: Is the histogram symmetrical, positively skewed, or negatively skewed?
# ANSWER: It looks nearly symmetrical.

# QUESTION 2: Did the histogram look too flat, too tall, or did it have a proper bell curve?
# ANSWER: It resembled a proper bell curve, not too flat or too peaked.


############################
# SHAPIRO-WILK NORMALITY TEST
############################

# Shapiro-Wilk test on the difference scores; the p-value is compared with
# .05 below to choose between the dependent t-test and its alternative.
shapiro.test(x = Differences)
## Recorded output:
##
##  Shapiro-Wilk normality test
##
## data:  Differences
## W = 0.98773, p-value = 0.21

# QUESTION: Was the data normally distributed or abnormally distributed?
# If p > 0.05, there is no significant departure from normality: treat the data as NORMAL (use Dependent t-test).
# If p < 0.05, the data are NOT normal (switch to the Wilcoxon Signed-Rank test).
# ANSWER: The data were normally distributed (W = 0.988, p = 0.210 > .05), 
#         so it is appropriate to use a Dependent t-test.


############################
# BOXPLOT OF DIFFERENCE SCORES
############################

# Boxplot of the difference scores; any points drawn beyond the whiskers are
# the "dots" referred to in the questions below.
boxplot(
  x = Differences,
  col = "blue",
  border = "darkblue",
  main = "Distribution of Score Differences (After - Before)",
  ylab = "Difference in Scores"
)

# QUESTION 1: How many dots are in your boxplot?
# A) No dots.
# B) One or two dots.
# C) Many dots.
# ANSWER: B

# QUESTION 2: Where are the dots in your boxplot?
# A) There are no dots.
# B) Very close to the whiskers (lines of the boxplot).
# C) Far from the whiskers (lines of the boxplot).
# ANSWER: B

# QUESTION 3: Based on the dots and their location, is the data normal?
# - If there are no dots, or only one/two close to the whiskers, data is likely normal.
# - Many dots far from the whiskers = likely NOT normal.
# ANSWER: The boxplot did not show problematic/extreme outliers, 
#         which is consistent with the Shapiro-Wilk result indicating normal data.


############################################################
# DEPENDENT T-TEST (PAIRED SAMPLES T-TEST)
############################################################

# Run the paired test as After minus Before so that its sign matches the
# `Differences` vector (After - Before) and the interpretation guide below.
# t.test(x, y, paired = TRUE) analyses x - y, so the previous call,
# t.test(Before, After, paired = TRUE), reported Before - After: a negative
# mean difference even though the PostTraining scores were the higher ones.
t.test(After, Before, paired = TRUE)
## Output of the original run, restated in the After - Before direction.
## Reversing the argument order only flips the sign of t, of the mean
## difference, and of the confidence limits; df and the p-value are unchanged.
## Re-run the script to regenerate this output.
##
##  Paired t-test
##
## data:  After and Before
## t = 23.285, df = 149, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##   8.699909 10.313424
## sample estimates:
## mean difference
##        9.506667

# DETERMINE STATISTICAL SIGNIFICANCE
# If p < .05 → statistically significant difference.
# If p > .05 → NOT statistically significant.

# Here, p-value < .001 (reported as < 2.2e-16), so:
# The difference between PreTraining and PostTraining scores IS statistically significant.


############################################################
# EFFECT SIZE FOR DEPENDENT T-TEST (COHEN'S D)
############################################################

# INSTALL REQUIRED PACKAGE (if not already installed)
# install.packages("effectsize")

# LOAD THE PACKAGE
library(effectsize)

# CALCULATE COHEN'S D
# Same argument order as the t-test above (After first), so the sign of d
# agrees with the sign of the mean difference.
cohens_d(After, Before, paired = TRUE)
## Output of the original run, restated in the After - Before direction
## (only the signs change). Re-run the script to regenerate this output.
## For paired samples, 'repeated_measures_d()' provides more options.
## Cohen's d |       95% CI
## ------------------------
## 1.90      | [1.63, 2.17]

# QUESTION 1: What is the size of the effect?
# ± 0.00 to 0.19 = ignore
# ± 0.20 to 0.49 = small
# ± 0.50 to 0.79 = moderate
# ± 0.80 to 1.29 = large
# ± 1.30 and above = very large
# ANSWER:
# Cohen's d was 1.90, which is a VERY LARGE effect size.
# This indicates a very large difference between the pre- and post-training scores.

# QUESTION 2: Which group had the higher average score?
# With the way we calculated differences (After minus Before):
# - If the mean difference is POSITIVE → After scores were higher.
# - If the mean difference is NEGATIVE → Before scores were higher.
# From the output: mean difference = 9.506667 (positive).
# ANSWER:
# The After (PostTraining) scores were higher on average than the Before (PreTraining) scores.


############################################################
# RESEARCH REPORT ON RESULTS: DEPENDENT T-TEST
############################################################


# A dependent t-test was conducted to compare communication scores before and after the
# training among 150 participants. Results showed that post-training scores were
# significantly higher than pre-training scores, t(149) = 23.29, p < .001, with a
# mean difference of 9.51 points (After - Before). The 95% confidence interval for the
# mean difference ranged from 8.70 to 10.31. The effect size was very large,
# Cohen's d = 1.90, indicating a substantial increase in scores after the training.