Correlation Analysis: Academic Habits and Lifestyle

library("readxl")
library("ggpubr")
## Loading required package: ggplot2
# Load both workbooks from the current user's Downloads folder.
# The folder is derived from the home directory instead of being hard-coded to
# one account, so the script also runs for anyone who saves the two files in
# their own Downloads folder.
data_dir <- file.path(path.expand("~"), "Downloads")
DataSetA <- read_excel(file.path(data_dir, "DatasetA.xlsx"))
DataSetB <- read_excel(file.path(data_dir, "DatasetB.xlsx"))

# Descriptive statistics: mean and standard deviation of every variable.
# The "##" lines are the values recorded when the analysis was run.

# DataSetA
# Independent variable: StudyHours
mean(DataSetA$StudyHours)
## [1] 6.135609
sd(DataSetA$StudyHours)
## [1] 1.369224
# Dependent variable: ExamScore
mean(DataSetA$ExamScore)
## [1] 90.06906
sd(DataSetA$ExamScore)
## [1] 6.795224

# DataSetB
# Independent variable: ScreenTime
mean(DataSetB$ScreenTime)
## [1] 5.063296
sd(DataSetB$ScreenTime)
## [1] 2.056833
# Dependent variable: SleepingHours
mean(DataSetB$SleepingHours)
## [1] 6.938459
sd(DataSetB$SleepingHours)
## [1] 1.351332

Histograms

# Histograms
# plot_histogram() draws one histogram with the settings shared by every plot
# in this section (breaks = 20 and unit text scaling).
#   values : numeric vector to plot
#   title  : main title of the plot
#   x_label: x-axis label; without it hist() labels the axis with the raw
#            R expression (e.g. "DataSetA$StudyHours")
#   fill   : bar fill colour
#   outline: bar border colour
plot_histogram <- function(values, title, x_label, fill, outline = "white") {
  hist(values,
       main = title,
       xlab = x_label,
       breaks = 20,
       col = fill,
       border = outline,
       cex.main = 1,
       cex.axis = 1,
       cex.lab = 1)
}

# DataSetA
plot_histogram(DataSetA$StudyHours,
               title = "Study Hours",
               x_label = "Hours Spent Studying",
               fill = "lightgreen")

# The variable "Study Hours" appears normally distributed. The data looks
# symmetrical, with most of the data in the middle.


plot_histogram(DataSetA$ExamScore,
               title = "Exam Score",
               x_label = "Exam Score (%)",
               fill = "lightblue")

# The variable "Exam Score" appears not normally distributed. The data looks
# negatively skewed, with most of the data to the right.


# DataSetB
plot_histogram(DataSetB$ScreenTime,
               title = "Screen Time",
               x_label = "Daily Screen Time (Hours)",
               fill = "lightcyan",
               outline = "cyan")

# The variable "Screen Time" appears not normally distributed. The data looks
# positively skewed, with most of the data to the left.


plot_histogram(DataSetB$SleepingHours,
               title = "Sleeping Hours",
               x_label = "Total Sleep (Hours)",
               fill = "gray",
               outline = "black")

# NOTE(review): no interpretation was recorded for "Sleeping Hours". The
# Shapiro-Wilk result reported in Part 3 (p = 0.3004) is consistent with an
# approximately normal shape -- confirm against the plot.

PART 3: CHECK NORMALITY (SHAPIRO-WILK)

# Shapiro-Wilk normality test for each of the four variables.
# Decision rule: if p < 0.05, the variable is treated as NOT normally
# distributed.

# Dataset A
shapiro.test(DataSetA$StudyHours)
## 
##  Shapiro-Wilk normality test
## 
## data:  DataSetA$StudyHours
## W = 0.99388, p-value = 0.9349
# StudyHours: p = 0.9349 (> 0.05), so normality is not rejected.
shapiro.test(DataSetA$ExamScore) 
## 
##  Shapiro-Wilk normality test
## 
## data:  DataSetA$ExamScore
## W = 0.96286, p-value = 0.006465
# ExamScore: p = 0.006465 (< 0.05), so normality is rejected. This agrees with
# the negative skew noted for the Exam Score histogram and is the reason a
# Spearman test is used for Research Question 1.

# Dataset B
shapiro.test(DataSetB$ScreenTime)
## 
##  Shapiro-Wilk normality test
## 
## data:  DataSetB$ScreenTime
## W = 0.90278, p-value = 1.914e-06
# ScreenTime: p = 1.914e-06 (< 0.05), so normality is rejected. This agrees
# with the positive skew noted for the Screen Time histogram.
shapiro.test(DataSetB$SleepingHours)
## 
##  Shapiro-Wilk normality test
## 
## data:  DataSetB$SleepingHours
## W = 0.98467, p-value = 0.3004
# SleepingHours: p = 0.3004 (> 0.05), so normality is not rejected.

PART 4: CORRELATION ANALYSIS

### Research Question 1: Study Hours vs Exam Score
# Spearman's rank correlation is used because ExamScore showed skewness
# (Shapiro-Wilk p = 0.006465).
# exact = FALSE requests an approximate rather than an exact p-value
# (presumably to avoid the warning cor.test() gives for tied values -- confirm).
cor_test_A <- cor.test(DataSetA$StudyHours, DataSetA$ExamScore, 
                       method = "spearman", exact = FALSE)
print(cor_test_A)
## 
##  Spearman's rank correlation rho
## 
## data:  DataSetA$StudyHours and DataSetA$ExamScore
## S = 16518, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.9008825
# Result: rho = 0.90 with p < 2.2e-16, i.e. a statistically significant
# positive monotonic association between study hours and exam score.

### Research Question 2: Screen Time vs Sleep
# Spearman's rank correlation is used because ScreenTime showed skewness
# (Shapiro-Wilk p = 1.914e-06). The same exact = FALSE setting is applied.
cor_test_B <- cor.test(DataSetB$ScreenTime, DataSetB$SleepingHours, 
                       method = "spearman", exact = FALSE)
print(cor_test_B)
## 
##  Spearman's rank correlation rho
## 
## data:  DataSetB$ScreenTime and DataSetB$SleepingHours
## S = 259052, p-value = 2.161e-09
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.5544674
# Result: rho = -0.55 with p = 2.161e-09, i.e. a statistically significant
# negative monotonic association between screen time and sleeping hours.

PART 5: SCATTERPLOTS (GGPUBR)

# Scatterplots drawn with ggpubr::ggscatter(). Each plot shows the points, a
# fitted regression line with its confidence band, and the Spearman
# correlation coefficient with its p-value.
# cor.coef = TRUE is what makes ggscatter() print the correlation on the plot;
# cor.method on its own only selects which coefficient would be reported.

# Dataset A Scatterplot
ggscatter(DataSetA, x = "StudyHours", y = "ExamScore",
          add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "spearman",
          main = "Relationship: Study Hours vs Exam Score",
          xlab = "Hours Spent Studying", ylab = "Exam Score (%)",
          color = "darkgreen", shape = 21, fill = "lightgreen")

# Dataset B Scatterplot
ggscatter(DataSetB, x = "ScreenTime", y = "SleepingHours",
          add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "spearman",
          main = "Relationship: Screen Time vs Sleeping Hours",
          xlab = "Daily Screen Time (Hours)", ylab = "Total Sleep (Hours)",
          color = "darkblue", shape = 21, fill = "lightblue")

Part 6: Research Interpretation

Study Hours and Exam Scores

Students studied for an average of 6.14 hours (SD = 1.37), while the average exam score was 90.07% (SD = 6.80). A Spearman’s rank correlation was conducted to assess the relationship between study hours and exam scores. The analysis revealed a statistically significant, very strong positive relationship (rho(98) = 0.90, p < .001). This indicates that students who studied for more hours tended to achieve higher exam scores.

Screen Time and Sleeping Hours

Participants averaged 5.06 hours of screen time (SD = 2.06) and 6.94 hours of sleep (SD = 1.35). A Spearman’s rank correlation was conducted to assess the relationship between screen time and sleeping hours. The analysis revealed a statistically significant, moderate negative relationship (rho(98) = -0.55, p < .001). This indicates that as screen time increases, sleeping hours tend to decrease.