Overview

This R Markdown report simulates a data set in which ExamScore is positively associated with HoursStudied. It then examines the relationship using a Pearson correlation test, a simple linear regression model, and a set of diagnostic plots.

Generate Simulated Data

# Set the random seed so results are reproducible
set.seed(123)

# Define sample size
n <- 250

# Generate hours studied as an integer from 1 to 20
data <- data.frame(
  HoursStudied = round(runif(n, 1, 20), 0)
)

# Generate exam scores so they are positively related to hours studied
# The random error term adds realistic variation
data$ExamScore <- round(
  45 + 2.5 * data$HoursStudied + rnorm(n, mean = 0, sd = 8),
  0
)

# Keep exam scores in a realistic range from 0 to 100
data$ExamScore[data$ExamScore < 0] <- 0
data$ExamScore[data$ExamScore > 100] <- 100

# View the first few rows
head(data)

##   HoursStudied ExamScore
## 1            6        55
## 2           16        87
## 3            9        68
## 4           18        82
## 5           19        92
## 6            2        62

Descriptive Summary

summary(data)

##   HoursStudied     ExamScore     
##  Min.   : 1.00   Min.   : 30.00  
##  1st Qu.: 6.00   1st Qu.: 61.00  
##  Median :10.00   Median : 71.50  
##  Mean   :10.62   Mean   : 71.79  
##  3rd Qu.:15.00   3rd Qu.: 83.75  
##  Max.   :20.00   Max.   :100.00

Pearson Correlation Test

# Test whether HoursStudied and ExamScore are linearly correlated
cor_test_result <- cor.test(
  data$HoursStudied,
  data$ExamScore,
  method = "pearson"
)

cor_test_result

## 
##  Pearson's product-moment correlation
## 
## data:  data$HoursStudied and data$ExamScore
## t = 26.381, df = 248, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8221683 0.8881066
## sample estimates:
##       cor 
## 0.8586496

Fit a Simple Linear Regression Model

# Fit a simple least squares regression model
model <- lm(ExamScore ~ HoursStudied, data = data)

summary(model)

## 
## Call:
## lm(formula = ExamScore ~ HoursStudied, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.7974  -4.8614  -0.3246   5.3847  25.4506 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  44.69656    1.14256   39.12   <2e-16 ***
## HoursStudied  2.55040    0.09667   26.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.915 on 248 degrees of freedom
## Multiple R-squared:  0.7373, Adjusted R-squared:  0.7362 
## F-statistic:   696 on 1 and 248 DF,  p-value: < 2.2e-16

Scatterplot with Regression Line

ggplot(data, aes(x = HoursStudied, y = ExamScore)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", color = "blue", se = TRUE) +
  labs(
    title = "Relationship between Hours Studied and Exam Score",
    x = "Hours Studied",
    y = "Exam Score"
  ) +
  theme_minimal()

Create Diagnostic Variables

# Add fitted values and residual diagnostics to the data set
data$fitted <- fitted(model)
data$residuals <- resid(model)
data$std_resid <- rstandard(model)
data$cooks_d <- cooks.distance(model)

# Create grouped study-hour categories for boxplots
data$StudyGroup <- cut(
  data$HoursStudied,
  breaks = c(0, 5, 10, 15, 20),
  labels = c("1-5", "6-10", "11-15", "16-20")
)

# Create observation order for plotting residuals by row number
data$Obs <- 1:nrow(data)

head(data)

##   HoursStudied ExamScore   fitted  residuals   std_resid      cooks_d
## 1            6        55 59.99895 -4.9989544 -0.63388294 1.454956e-03
## 2           16        87 85.50295  1.4970543  0.18993848 1.511899e-04
## 3            9        68 67.65015  0.3498482  0.04429949 4.330007e-06
## 4           18        82 90.60374 -8.6037439 -1.09369866 7.335919e-03
## 5           19        92 93.15414 -1.1541431 -0.14688825 1.583633e-04
## 6            2        62 49.79736 12.2026421  1.55353093 1.849613e-02
##   StudyGroup Obs
## 1       6-10   1
## 2      16-20   2
## 3       6-10   3
## 4      16-20   4
## 5      16-20   5
## 6        1-5   6

Residuals vs Fitted Values

ggplot(data, aes(x = fitted, y = residuals)) +
  geom_point(alpha = 0.7) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_smooth(se = FALSE, color = "blue") +
  labs(
    title = "Residuals vs Fitted Values",
    x = "Fitted Values",
    y = "Residuals"
  ) +
  theme_minimal()

Residuals vs Hours Studied

ggplot(data, aes(x = HoursStudied, y = residuals)) +
  geom_point(alpha = 0.7) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_smooth(se = FALSE, color = "blue") +
  labs(
    title = "Residuals vs Hours Studied",
    x = "Hours Studied",
    y = "Residuals"
  ) +
  theme_minimal()

Histogram of Residuals

ggplot(data, aes(x = residuals)) +
  geom_histogram(bins = 30, color = "black", fill = "lightblue") +
  labs(
    title = "Histogram of Residuals",
    x = "Residuals",
    y = "Frequency"
  ) +
  theme_minimal()

Normal Q-Q Plot

ggplot(data, aes(sample = residuals)) +
  stat_qq() +
  stat_qq_line(color = "blue") +
  labs(title = "Q-Q Plot of Residuals") +
  theme_minimal()

Boxplots of Residuals by Study Group

ggplot(data, aes(x = StudyGroup, y = residuals)) +
  geom_boxplot(fill = "lightblue") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    title = "Residuals by Hours Studied Group",
    x = "Hours Studied Group",
    y = "Residuals"
  ) +
  theme_minimal()

Residuals by Observation Order

ggplot(data, aes(x = Obs, y = residuals)) +
  geom_point(alpha = 0.7) +
  geom_line(alpha = 0.5) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    title = "Residuals by Observation Order",
    x = "Observation",
    y = "Residuals"
  ) +
  theme_minimal()

Cook’s Distance

ggplot(data, aes(x = Obs, y = cooks_d)) +
  geom_col() +
  labs(
    title = "Cook's Distance",
    x = "Observation",
    y = "Cook's Distance"
  ) +
  theme_minimal()

Shapiro-Wilk Normality Test

shapiro.test(residuals(model))

## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.99474, p-value = 0.5432

Standard Base R Diagnostic Panel

par(mfrow = c(2, 2))
plot(model)

par(mfrow = c(1, 1))

Boxplot of Exam Score by Study Group

ggplot(data, aes(x = StudyGroup, y = ExamScore)) +
  geom_boxplot(fill = "lightblue") +
  labs(
    title = "Exam Scores by Hours Studied Group",
    x = "Hours Studied Group",
    y = "Exam Score"
  ) +
  theme_minimal()

Interpretation

The simulated data show a positive relationship between hours studied and exam score, which is confirmed by both the Pearson correlation test and the linear regression model. The diagnostic plots help assess linearity, constant variance, residual normality, and influential observations. These checks are important because least squares regression can be affected by outliers, heteroscedasticity, nonlinearity, and influential cases.

Correlation, Regression, and Diagnostic Analysis

Irene Tsapara