This R Markdown report simulates a data set in which
ExamScore is positively associated with
HoursStudied. It then examines the relationship using a
Pearson correlation test, a simple linear regression model, and a set of
diagnostic plots.
# Set the random seed so results are reproducible
set.seed(123)
# Define sample size
n <- 250
# Generate hours studied as an integer from 1 to 20
data <- data.frame(
HoursStudied = round(runif(n, 1, 20), 0)
)
# Generate exam scores so they are positively related to hours studied
# The random error term adds realistic variation
data$ExamScore <- round(
45 + 2.5 * data$HoursStudied + rnorm(n, mean = 0, sd = 8),
0
)
# Keep exam scores in a realistic range from 0 to 100
data$ExamScore[data$ExamScore < 0] <- 0
data$ExamScore[data$ExamScore > 100] <- 100
# View the first few rows
head(data)
## HoursStudied ExamScore
## 1 6 55
## 2 16 87
## 3 9 68
## 4 18 82
## 5 19 92
## 6 2 62
summary(data)
## HoursStudied ExamScore
## Min. : 1.00 Min. : 30.00
## 1st Qu.: 6.00 1st Qu.: 61.00
## Median :10.00 Median : 71.50
## Mean :10.62 Mean : 71.79
## 3rd Qu.:15.00 3rd Qu.: 83.75
## Max. :20.00 Max. :100.00
# Test whether HoursStudied and ExamScore are linearly correlated
cor_test_result <- cor.test(
data$HoursStudied,
data$ExamScore,
method = "pearson"
)
cor_test_result
##
## Pearson's product-moment correlation
##
## data: data$HoursStudied and data$ExamScore
## t = 26.381, df = 248, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8221683 0.8881066
## sample estimates:
## cor
## 0.8586496
# Fit a simple least squares regression model
model <- lm(ExamScore ~ HoursStudied, data = data)
summary(model)
##
## Call:
## lm(formula = ExamScore ~ HoursStudied, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.7974 -4.8614 -0.3246 5.3847 25.4506
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.69656 1.14256 39.12 <2e-16 ***
## HoursStudied 2.55040 0.09667 26.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.915 on 248 degrees of freedom
## Multiple R-squared: 0.7373, Adjusted R-squared: 0.7362
## F-statistic: 696 on 1 and 248 DF, p-value: < 2.2e-16
ggplot(data, aes(x = HoursStudied, y = ExamScore)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", color = "blue", se = TRUE) +
labs(
title = "Relationship between Hours Studied and Exam Score",
x = "Hours Studied",
y = "Exam Score"
) +
theme_minimal()
# Add fitted values and residual diagnostics to the data set
data$fitted <- fitted(model)
data$residuals <- resid(model)
data$std_resid <- rstandard(model)
data$cooks_d <- cooks.distance(model)
# Create grouped study-hour categories for boxplots
data$StudyGroup <- cut(
data$HoursStudied,
breaks = c(0, 5, 10, 15, 20),
labels = c("1-5", "6-10", "11-15", "16-20")
)
# Create observation order for plotting residuals by row number
data$Obs <- 1:nrow(data)
head(data)
## HoursStudied ExamScore fitted residuals std_resid cooks_d
## 1 6 55 59.99895 -4.9989544 -0.63388294 1.454956e-03
## 2 16 87 85.50295 1.4970543 0.18993848 1.511899e-04
## 3 9 68 67.65015 0.3498482 0.04429949 4.330007e-06
## 4 18 82 90.60374 -8.6037439 -1.09369866 7.335919e-03
## 5 19 92 93.15414 -1.1541431 -0.14688825 1.583633e-04
## 6 2 62 49.79736 12.2026421 1.55353093 1.849613e-02
## StudyGroup Obs
## 1 6-10 1
## 2 16-20 2
## 3 6-10 3
## 4 16-20 4
## 5 16-20 5
## 6 1-5 6
ggplot(data, aes(x = fitted, y = residuals)) +
geom_point(alpha = 0.7) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_smooth(se = FALSE, color = "blue") +
labs(
title = "Residuals vs Fitted Values",
x = "Fitted Values",
y = "Residuals"
) +
theme_minimal()
ggplot(data, aes(x = HoursStudied, y = residuals)) +
geom_point(alpha = 0.7) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_smooth(se = FALSE, color = "blue") +
labs(
title = "Residuals vs Hours Studied",
x = "Hours Studied",
y = "Residuals"
) +
theme_minimal()
ggplot(data, aes(x = residuals)) +
geom_histogram(bins = 30, color = "black", fill = "lightblue") +
labs(
title = "Histogram of Residuals",
x = "Residuals",
y = "Frequency"
) +
theme_minimal()
ggplot(data, aes(sample = residuals)) +
stat_qq() +
stat_qq_line(color = "blue") +
labs(title = "Q-Q Plot of Residuals") +
theme_minimal()
ggplot(data, aes(x = StudyGroup, y = residuals)) +
geom_boxplot(fill = "lightblue") +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(
title = "Residuals by Hours Studied Group",
x = "Hours Studied Group",
y = "Residuals"
) +
theme_minimal()
ggplot(data, aes(x = Obs, y = residuals)) +
geom_point(alpha = 0.7) +
geom_line(alpha = 0.5) +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(
title = "Residuals by Observation Order",
x = "Observation",
y = "Residuals"
) +
theme_minimal()
ggplot(data, aes(x = Obs, y = cooks_d)) +
geom_col() +
labs(
title = "Cook's Distance",
x = "Observation",
y = "Cook's Distance"
) +
theme_minimal()
shapiro.test(residuals(model))
##
## Shapiro-Wilk normality test
##
## data: residuals(model)
## W = 0.99474, p-value = 0.5432
par(mfrow = c(2, 2))
plot(model)
par(mfrow = c(1, 1))
ggplot(data, aes(x = StudyGroup, y = ExamScore)) +
geom_boxplot(fill = "lightblue") +
labs(
title = "Exam Scores by Hours Studied Group",
x = "Hours Studied Group",
y = "Exam Score"
) +
theme_minimal()
The simulated data show a positive relationship between hours studied and exam score, which is confirmed by both the Pearson correlation test and the linear regression model. The diagnostic plots help assess linearity, constant variance, residual normality, and influential observations. These checks are important because least squares regression can be affected by outliers, heteroscedasticity, nonlinearity, and influential cases.