library(readxl)
library(ggpubr)
## Loading required package: ggplot2
# Import the Dataset A workbook as a tibble.
DatasetA <- read_excel(path = "C:/Users/user/Downloads/DatasetA.xlsx")

# Print the tibble to confirm the import (first 10 of 100 rows are shown).
DatasetA
## # A tibble: 100 × 2
##    StudyHours ExamScore
##         <dbl>     <dbl>
##  1       5.16      83.7
##  2       5.65      89.0
##  3       8.34     100  
##  4       6.11      89.5
##  5       6.19      88.1
##  6       8.57     100  
##  7       6.69      91.1
##  8       4.10      75.5
##  9       4.97      83.7
## 10       5.33      89.4
## # ℹ 90 more rows
# Descriptive statistics: mean and standard deviation of each variable.
mean(DatasetA[["StudyHours"]])
## [1] 6.135609
sd(DatasetA[["StudyHours"]])
## [1] 1.369224
mean(DatasetA[["ExamScore"]])
## [1] 90.06906
sd(DatasetA[["ExamScore"]])
## [1] 6.795224
# Histogram of StudyHours, used for a visual check of the distribution shape.
# `breaks = 20` asks for roughly 20 bins; all text scaling factors are set to 1.
hist(DatasetA$StudyHours,
     main = "StudyHours",
     breaks = 20,
     col = "lightblue",
     border = "white",
     cex.main = 1,
     cex.axis = 1,
     cex.lab = 1)

# Histogram of ExamScore with the same settings, apart from title and fill colour.
hist(DatasetA$ExamScore,
     main = "ExamScore",
     breaks = 20,
     col = "lightcoral",
     border = "white",
     cex.main = 1,
     cex.axis = 1,
     cex.lab = 1)

The variable “StudyHours” appears normally distributed. The data looks symmetrical (most data is in the middle). The data also appears to have a proper bell curve. The variable “ExamScore” appears normally distributed. The data looks symmetrical (most data is in the middle). The data also appears to have a proper bell curve.

# Shapiro-Wilk normality test for each Dataset A variable.
# The `##` lines are the knitted console output of the call above them.
shapiro.test(DatasetA$StudyHours) 
## 
##  Shapiro-Wilk normality test
## 
## data:  DatasetA$StudyHours
## W = 0.99388, p-value = 0.9349
shapiro.test(DatasetA$ExamScore)
## 
##  Shapiro-Wilk normality test
## 
## data:  DatasetA$ExamScore
## W = 0.96286, p-value = 0.006465

The Shapiro-Wilk p-value for the StudyHours normality test is greater than .05 (0.9349), so the data is normal. The Shapiro-Wilk p-value for the ExamScore normality test is less than .05 (0.006465), so the data is not normal.

correlation test

# Spearman rank correlation between StudyHours and ExamScore.
# The warning in the output below is raised because the data contain tied
# values, so an exact p-value cannot be computed.
cor.test(DatasetA$StudyHours, DatasetA$ExamScore, method = "spearman")
## Warning in cor.test.default(DatasetA$StudyHours, DatasetA$ExamScore, method =
## "spearman"): Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  DatasetA$StudyHours and DatasetA$ExamScore
## S = 16518, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.9008825

Output Spearman’s rank correlation rho data: DatasetA$StudyHours and DatasetA$ExamScore S = 16518, p-value < 2.2e-16 alternative hypothesis: true rho is not equal to 0 sample estimates: rho 0.9008825 (for comparison, Pearson’s product-moment correlation on the same variables: t = 20.959, df = 98, p-value < 2.2e-16, 95 percent confidence interval: 0.8606509 0.9346369, cor 0.904214) # The Spearman Correlation test was selected because at least one variable (ExamScore) was not normally distributed according to the Shapiro-Wilk test. # The correlation value (rho = 0.90) is greater than 0.50, which means the relationship is strong.

# Scatter plot of ExamScore against StudyHours with a fitted regression line.
ggscatter(
  data = DatasetA,
  x    = "StudyHours",
  y    = "ExamScore",
  xlab = "StudyHours",
  ylab = "ExamScore",
  add  = "reg.line"
)

# The line of best fit is pointing to the top right. This means the direction of the data is positive. # The dots closely hug the line. This means there is a strong relationship between the variables. # The dots form a straight-line pattern. This means the data is linear. # There is possibly an outlier. However, the dot is towards the center of the line of best fit. Therefore, it does not appear to impact the relationship between the independent and dependent variables.

Dataset B

# Import the Dataset B workbook.
DatasetB <- read_excel(path = "C:/Users/user/Downloads/DatasetB.xlsx")

# Descriptive statistics: mean and standard deviation of each variable.
mean(DatasetB[["ScreenTime"]])
## [1] 5.063296
sd(DatasetB[["ScreenTime"]])
## [1] 2.056833
mean(DatasetB[["SleepingHours"]])
## [1] 6.938459
sd(DatasetB[["SleepingHours"]])
## [1] 1.351332
# Histogram of ScreenTime, used for a visual check of the distribution shape.
# `breaks = 20` asks for roughly 20 bins; all text scaling factors are set to 1.
hist(DatasetB$ScreenTime,
     main = "ScreenTime",
     breaks = 20,
     col = "lightblue",
     border = "white",
     cex.main = 1,
     cex.axis = 1,
     cex.lab = 1)

# Histogram of SleepingHours with the same settings, apart from title and fill colour.
hist(DatasetB$SleepingHours,
     main = "SleepingHours",
     breaks = 20,
     col = "lightcoral",
     border = "white",
     cex.main = 1,
     cex.axis = 1,
     cex.lab = 1)

The variable “ScreenTime” appears normally distributed. The data looks symmetrical (most data is in the middle). The data also appears to have a proper bell curve. The variable “SleepingHours” appears normally distributed. The data looks symmetrical (most data is in the middle). The data also appears to have a proper bell curve.

# Shapiro-Wilk normality test for each Dataset B variable.
# The `##` lines are the knitted console output of the call above them.
shapiro.test(DatasetB$ScreenTime) 
## 
##  Shapiro-Wilk normality test
## 
## data:  DatasetB$ScreenTime
## W = 0.90278, p-value = 1.914e-06
shapiro.test(DatasetB$SleepingHours)
## 
##  Shapiro-Wilk normality test
## 
## data:  DatasetB$SleepingHours
## W = 0.98467, p-value = 0.3004

The Shapiro-Wilk p-value for the ScreenTime normality test is less than .05 (1.914e-06), so the data is not normal. The Shapiro-Wilk p-value for the SleepingHours normality test is greater than .05 (0.3004), so the data is normal.

correlation test

# Spearman rank correlation between ScreenTime and SleepingHours.
# The `##` lines are the knitted console output; the estimate (rho) is negative.
cor.test(DatasetB$ScreenTime, DatasetB$SleepingHours, method = "spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  DatasetB$ScreenTime and DatasetB$SleepingHours
## S = 259052, p-value = 3.521e-09
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.5544674

Output Spearman’s rank correlation rho data: DatasetB$ScreenTime and DatasetB$SleepingHours S = 259052, p-value = 3.521e-09 alternative hypothesis: true rho is not equal to 0 sample estimates: rho -0.5544674 (for comparison, Pearson’s product-moment correlation on the same variables: t = -8.2538, df = 98, p-value = 7.27e-13, 95 percent confidence interval: -0.7433008 -0.5078341, cor -0.6403761) # The p-value (probability value) is 3.521e-09, which is below .05. This means the results are statistically significant. The alternate hypothesis is supported. # The correlation is negative, which means that as ScreenTime increases, hours sleeping decreases.

# Scatter plot of SleepingHours against ScreenTime with a fitted regression line.
ggscatter(
  data = DatasetB,
  x    = "ScreenTime",
  y    = "SleepingHours",
  xlab = "ScreenTime",
  ylab = "SleepingHours",
  add  = "reg.line"
)

# The line of best fit is pointing to the bottom right. This means the direction of the data is negative. # The dots closely hug the line. This means there is a strong relationship between the variables. # The dots form a straight-line pattern. This means the data is linear. # There is possibly an outlier. However, the dot is towards the center of the line of best fit. Therefore, it does not appear to impact the relationship between the independent and dependent variables.