Assignment 4 Markdown

#What is the relationship between how much students study (hours) and their exam score (percentage)?
library(readxl)
library(ggpubr)

## Loading required package: ggplot2

library(rmarkdown)
# Import Dataset A from its Excel workbook.
# NOTE: the location is an absolute, machine-specific Windows path.
DatasetA <- read_excel(
  path = "C:/Users/Admin/Downloads/DatasetA.xlsx"
)

# Independent variable: StudyHours (mean, then standard deviation)
mean(DatasetA[["StudyHours"]])

## [1] 6.135609

sd(DatasetA[["StudyHours"]])

## [1] 1.369224

# Dependent variable: ExamScore (mean, then standard deviation)
mean(DatasetA[["ExamScore"]])

## [1] 90.06906

sd(DatasetA[["ExamScore"]])

## [1] 6.795224

# Histogram of StudyHours, used to judge the shape of its distribution.
# All text-scaling factors (cex.*) are left at 1.
hist(
  DatasetA$StudyHours,
  breaks = 20,
  main = "StudyHours",
  border = "white",
  col = "lightblue",
  cex.lab = 1,
  cex.axis = 1,
  cex.main = 1
)

#The variable StudyHours appears normally distributed. The data is mostly in the middle (around 6 hours) and looks fairly symmetrical, forming a bell-shaped curve.

# Histogram of ExamScore, used to judge the shape of its distribution.
# All text-scaling factors (cex.*) are left at 1.
hist(
  DatasetA$ExamScore,
  breaks = 20,
  main = "ExamScore",
  border = "white",
  col = "lightblue",
  cex.lab = 1,
  cex.axis = 1,
  cex.main = 1
)

# The distribution of ExamScore is not normal. Most values are clustered near the high end (around 90–100, especially close to 100), with a tail of fewer values toward the low end, so it looks negatively skewed (most data is on the right) rather than forming a symmetrical bell curve.


# Shapiro-Wilk normality test for the independent variable, StudyHours.
shapiro.test(x = DatasetA$StudyHours)

## 
##  Shapiro-Wilk normality test
## 
## data:  DatasetA$StudyHours
## W = 0.99388, p-value = 0.9349

# The Shapiro-Wilk p-value for the StudyHours normality test is greater than .05 (p = .9349), so normality is not rejected and the data can be treated as normally distributed.

# Shapiro-Wilk normality test for the dependent variable, ExamScore.
shapiro.test(x = DatasetA$ExamScore)

## 
##  Shapiro-Wilk normality test
## 
## data:  DatasetA$ExamScore
## W = 0.96286, p-value = 0.006465

# The Shapiro-Wilk p-value for the ExamScore normality test is smaller than .05 (p = .006465), so the data is not normally distributed.


# Spearman rank correlation between StudyHours and ExamScore.
# The data contain tied values, so an exact p-value cannot be computed.
# exact = FALSE requests the asymptotic approximation explicitly: this is the
# same p-value R falls back to on its own when ties are present, but it no
# longer raises the "Cannot compute exact p-value with ties" warning.
cor.test(
  DatasetA$StudyHours,
  DatasetA$ExamScore,
  method = "spearman",
  exact = FALSE
)

## 
##  Spearman's rank correlation rho
## 
## data:  DatasetA$StudyHours and DatasetA$ExamScore
## S = 16518, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.9008825

# The Spearman correlation test was selected because the variable ExamScore was not normally distributed according to its histogram and Shapiro-Wilk test.
# The p-value (probability value) is < 2.2e-16, which is below .05. This means the result is statistically significant, and the alternative hypothesis is supported.
# The rho-value is 0.90.
# The correlation is positive, which means as StudyHours increases, ExamScore increases.
# The correlation value is greater than 0.50, which means the relationship is strong.


# Scatter plot of ExamScore against StudyHours with a fitted regression line.
ggscatter(
  data = DatasetA,
  x = "StudyHours", y = "ExamScore",
  xlab = "StudyHours", ylab = "ExamScore",
  add = "reg.line"
)

# The line of best fit is pointing to the top right. This means the direction of the relationship is positive: as StudyHours increases, ExamScore increases.
# The dots closely hug the line. This means there is a strong relationship between the variables.
# The dots form a straight-line pattern. This means the relationship is linear.
# There is possibly one outlier (the individual who studied around 4 hours but still got above 90%). However, the dot is toward the center of the line of best fit. Therefore, it does not appear to impact the relationship between the independent and dependent variables.


# StudyHours (M = 6.14, SD = 1.37) was correlated with ExamScore (M = 90.07, SD = 6.80), ρ (rho) = 0.90, p < 2.2e-16.
# The relationship was positive and strong. As StudyHours increased, ExamScore increased.













#What is the relationship between how much a person uses their phone (hours) and how much they sleep (hours)?
library(readxl)
library(ggpubr)
library(rmarkdown)
# Import Dataset B from its Excel workbook.
# NOTE: the location is an absolute, machine-specific Windows path.
DatasetB <- read_excel(
  path = "C:/Users/Admin/Downloads/DatasetB.xlsx"
)

# Independent variable: ScreenTime (mean, then standard deviation)
mean(DatasetB[["ScreenTime"]])

## [1] 5.063296

sd(DatasetB[["ScreenTime"]])

## [1] 2.056833

# Dependent variable: SleepingHours (mean, then standard deviation)
mean(DatasetB[["SleepingHours"]])

## [1] 6.938459

sd(DatasetB[["SleepingHours"]])

## [1] 1.351332

# Histogram of ScreenTime, used to judge the shape of its distribution.
# All text-scaling factors (cex.*) are left at 1.
hist(
  DatasetB$ScreenTime,
  breaks = 20,
  main = "ScreenTime",
  border = "white",
  col = "lightblue",
  cex.lab = 1,
  cex.axis = 1,
  cex.main = 1
)

# The distribution of ScreenTime is not normal. Most values are clustered near the low end (around 3-5), with a tail of fewer values toward the high end, so it looks positively skewed (most data is on the left) rather than forming a symmetrical bell curve.

# Histogram of SleepingHours, used to judge the shape of its distribution.
# All text-scaling factors (cex.*) are left at 1.
hist(
  DatasetB$SleepingHours,
  breaks = 20,
  main = "SleepingHours",
  border = "white",
  col = "lightblue",
  cex.lab = 1,
  cex.axis = 1,
  cex.main = 1
)

#The variable SleepingHours appears normally distributed. The data is mostly in the middle (around 7 hours) and looks fairly symmetrical, forming a bell-shaped curve.


# Shapiro-Wilk normality test for the independent variable, ScreenTime.
shapiro.test(x = DatasetB$ScreenTime)

## 
##  Shapiro-Wilk normality test
## 
## data:  DatasetB$ScreenTime
## W = 0.90278, p-value = 1.914e-06

# The Shapiro-Wilk p-value for the ScreenTime normality test is smaller than .05 (p = 1.914e-06), so the data is not normally distributed.

# Shapiro-Wilk normality test for the dependent variable, SleepingHours.
shapiro.test(x = DatasetB$SleepingHours)

## 
##  Shapiro-Wilk normality test
## 
## data:  DatasetB$SleepingHours
## W = 0.98467, p-value = 0.3004

# The Shapiro-Wilk p-value for the SleepingHours normality test is greater than .05 (p = .3004), so normality is not rejected and the data can be treated as normally distributed.


# Spearman rank correlation between ScreenTime and SleepingHours.
cor.test(
  x = DatasetB$ScreenTime,
  y = DatasetB$SleepingHours,
  method = "spearman"
)

## 
##  Spearman's rank correlation rho
## 
## data:  DatasetB$ScreenTime and DatasetB$SleepingHours
## S = 259052, p-value = 3.521e-09
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.5544674

# The Spearman correlation test was selected because the variable ScreenTime was not normally distributed according to its histogram and Shapiro-Wilk test.
# The p-value (probability value) is 3.521e-09, which is below .05. This means the result is statistically significant, and the alternative hypothesis is supported.
# The rho-value is -0.55.
# The correlation is negative, which means as ScreenTime increases, SleepingHours decreases.
# The absolute value of the correlation (0.55) is greater than 0.50, which means the relationship is strong.


# Scatter plot of SleepingHours against ScreenTime with a fitted regression line.
ggscatter(
  data = DatasetB,
  x = "ScreenTime", y = "SleepingHours",
  xlab = "ScreenTime", ylab = "SleepingHours",
  add = "reg.line"
)

# The line of best fit is pointing to the bottom right. This means the direction of the relationship is negative: as ScreenTime increases, SleepingHours decreases.
# The dots closely hug the line. This means there is a strong relationship between the variables.
# The dots form a straight-line pattern. This means the relationship is linear.
# There are no clear outliers.


# ScreenTime (M = 5.06, SD = 2.06) was correlated with SleepingHours (M = 6.94, SD = 1.35), ρ (rho) = -0.55, p = 3.521e-09.
# The relationship was negative and strong. As ScreenTime increased, SleepingHours decreased.

Assignment 4 Markdown

Sinh Kiet Nguyen

2026-02-04