Module10_Exercise: Statistical Modeling

Author

Cienna Kim

Published

June 17, 2026

Introduction: Data Loading and Setup

# Check files in the working directory
list.files()
[1] "data"                                
[2] "u1412840_Module10_Exercise.html"     
[3] "u1412840_Module10_Exercise.qmd"      
[4] "u1412840_Module10_Exercise.rmarkdown"
# Load data
enrollment <- read.csv("./data/enrollmentForecast.csv")

# Examine data structure
str(enrollment)
'data.frame':   29 obs. of  5 variables:
 $ YEAR : int  1 2 3 4 5 6 7 8 9 10 ...
 $ ROLL : int  5501 5945 6629 7556 8716 9369 9920 10167 11084 12504 ...
 $ UNEM : num  8.1 7 7.3 7.5 7 6.4 6.5 6.4 6.3 7.7 ...
 $ HGRAD: int  9552 9680 9731 11666 14675 15265 15484 15723 16501 16890 ...
 $ INC  : int  1923 1961 1979 2030 2112 2192 2235 2351 2411 2475 ...
head(enrollment)
  YEAR ROLL UNEM HGRAD  INC
1    1 5501  8.1  9552 1923
2    2 5945  7.0  9680 1961
3    3 6629  7.3  9731 1979
4    4 7556  7.5 11666 2030
5    5 8716  7.0 14675 2112
6    6 9369  6.4 15265 2192
# Summary Statistics
summary(enrollment)
      YEAR         ROLL            UNEM            HGRAD            INC      
 Min.   : 1   Min.   : 5501   Min.   : 5.700   Min.   : 9552   Min.   :1923  
 1st Qu.: 8   1st Qu.:10167   1st Qu.: 7.000   1st Qu.:15723   1st Qu.:2351  
 Median :15   Median :14395   Median : 7.500   Median :17203   Median :2863  
 Mean   :15   Mean   :12707   Mean   : 7.717   Mean   :16528   Mean   :2729  
 3rd Qu.:22   3rd Qu.:14969   3rd Qu.: 8.200   3rd Qu.:18266   3rd Qu.:3127  
 Max.   :29   Max.   :16081   Max.   :10.100   Max.   :19800   Max.   :3345  

1. Scatterplots: Enrollment vs Predictor Variables

# ROLL vs UNEMPLOYMENT
plot(enrollment$UNEM,
     enrollment$ROLL,
     xlab = "UNEM",
     ylab = "ROLL",
     main = "ROLL vs UNEMPLOYMENT")

# ROLL vs HGRAD
plot(enrollment$HGRAD,
     enrollment$ROLL,
     xlab = "HGRAD",
     ylab = "ROLL",
     main = "ROLL vs HIGH SCHOOL GRAD")

# ROLL vs INC
plot(enrollment$INC,
     enrollment$ROLL,
     xlab = "INC",
     ylab = "ROLL",
     main = "ROLL vs INCOME")


2. Linear Model: ROLL ~ UNEM + HGRAD

# Build linear model
fit1 <- lm(ROLL ~ UNEM + HGRAD,
           data = enrollment)

# Model output
fit1

Call:
lm(formula = ROLL ~ UNEM + HGRAD, data = enrollment)

Coefficients:
(Intercept)         UNEM        HGRAD  
 -8255.7511     698.2681       0.9423  
# Model summary
summary(fit1)

Call:
lm(formula = ROLL ~ UNEM + HGRAD, data = enrollment)

Residuals:
    Min      1Q  Median      3Q     Max 
-2102.2  -861.6  -349.4   374.5  3603.5 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -8.256e+03  2.052e+03  -4.023  0.00044 ***
UNEM         6.983e+02  2.244e+02   3.111  0.00449 ** 
HGRAD        9.423e-01  8.613e-02  10.941 3.16e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1313 on 26 degrees of freedom
Multiple R-squared:  0.8489,    Adjusted R-squared:  0.8373 
F-statistic: 73.03 on 2 and 26 DF,  p-value: 2.144e-11
# ANOVA table
anova(fit1)
Analysis of Variance Table

Response: ROLL
          Df    Sum Sq   Mean Sq F value    Pr(>F)    
UNEM       1  45407767  45407767  26.349 2.366e-05 ***
HGRAD      1 206279143 206279143 119.701 3.157e-11 ***
Residuals 26  44805568   1723291                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4. Residual Plot and Bias Check

# Residuals vs fitted values
plot(fit1, which = 1)

Residual Plot Assignment There is no obvious pattern in the residual plot. Therefore, there is little evidence of model bias.


5. Predict Fall Enrollment

# New observation
newdata <- data.frame(
  UNEM = 9,
  HGRAD = 25000
)

# Predicted enrollment
predict(fit1,
        newdata,
        interval = "prediction")
       fit      lwr     upr
1 21585.58 18452.36 24718.8

Predicted Enrollment The predicted fall enrollment is approximately 21,586 students.


6. Second Model: Add Income (INC)

# Build second model
fit2 <- lm(ROLL ~ UNEM + HGRAD + INC,
           data = enrollment)

# Model summary
summary(fit2)

Call:
lm(formula = ROLL ~ UNEM + HGRAD + INC, data = enrollment)

Residuals:
     Min       1Q   Median       3Q      Max 
-1148.84  -489.71    -1.88   387.40  1425.75 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -9.153e+03  1.053e+03  -8.691 5.02e-09 ***
UNEM         4.501e+02  1.182e+02   3.809 0.000807 ***
HGRAD        4.065e-01  7.602e-02   5.347 1.52e-05 ***
INC          4.275e+00  4.947e-01   8.642 5.59e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 670.4 on 25 degrees of freedom
Multiple R-squared:  0.9621,    Adjusted R-squared:  0.9576 
F-statistic: 211.5 on 3 and 25 DF,  p-value: < 2.2e-16
# ANOVA table
anova(fit2)
Analysis of Variance Table

Response: ROLL
          Df    Sum Sq   Mean Sq F value    Pr(>F)    
UNEM       1  45407767  45407767  101.02 2.894e-10 ***
HGRAD      1 206279143 206279143  458.92 < 2.2e-16 ***
INC        1  33568255  33568255   74.68 5.594e-09 ***
Residuals 25  11237313    449493                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

7. Compare the Two Models

anova(fit1, fit2)
Analysis of Variance Table

Model 1: ROLL ~ UNEM + HGRAD
Model 2: ROLL ~ UNEM + HGRAD + INC
  Res.Df      RSS Df Sum of Sq     F    Pr(>F)    
1     26 44805568                                 
2     25 11237313  1  33568255 74.68 5.594e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model Comparison

(p < 0.05)

Therefore, including INC significantly improves the model because the comparison ANOVA is significant and the model explains more variation in enrollment.