Lab 6

Catholic DAT-4313 - Data Viz in Model Development

Author

Erick Xavier Maldonado

Code for Catholic with depvar=read12

DATA

options(scipen=999)

library(socviz)


load("Catholic_viz.RData")

Partition 60% Train / 40% Test

library(caret)
set.seed(1)
index <- createDataPartition(Catholic_viz$read12, p=0.6, list=FALSE)
train <- Catholic_viz[index,]
test  <- Catholic_viz[-index,]

Training set number of observations: 3583
Test dataset number of observations: 2387

EDA

Descriptive Statistics

library(psych)
library(tidyverse)
library(flextable)
psych::describe(train, fast=TRUE)  # (class note --> remove flextable)

         vars    n       mean         sd       min        max      range
id          1 3583 4642399.08 2719725.86 124915.00 7979086.00 7854171.00
read12      2 3583      51.56       9.39     29.15      68.09      38.94
math12      3 3583      52.00       9.59     30.14      71.37      41.23
female      4 3583       0.53       0.50      0.00       1.00       1.00
asian       5 3583       0.06       0.25      0.00       1.00       1.00
hispan      6 3583       0.11       0.31      0.00       1.00       1.00
black       7 3583       0.08       0.26      0.00       1.00       1.00
motheduc    8 3583      13.24       2.03      8.00      18.00      10.00
fatheduc    9 3583      13.55       2.26      8.00      18.00      10.00
lfaminc    10 3583      10.31       0.80      6.21      12.35       6.13
hsgrad     11 3583       0.93       0.26      0.00       1.00       1.00
cathhs     12 3583       0.07       0.26      0.00       1.00       1.00
parcath    13 3583       0.36       0.48      0.00       1.00       1.00
               se
id       45436.17
read12       0.16
math12       0.16
female       0.01
asian        0.00
hispan       0.01
black        0.00
motheduc     0.03
fatheduc     0.04
lfaminc      0.01
hsgrad       0.00
cathhs       0.00
parcath      0.01

Boxplots – All Numeric

data_long <- train %>%
  select(where(is.numeric)) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")

ggplot(data_long, aes(x = Variable, y = Value)) +
  geom_boxplot() +
  coord_flip() + 
  facet_wrap(~ Variable, scales = "free", ncol=3) +
  theme_minimal() +
  theme(axis.text.x = element_blank()) + 
  labs(title = "Horizontal Boxplot for Each Numeric Variable")

library(DataExplorer)
plot_boxplot(train, by="female")

Histograms

plot_histogram(train)

Scatterplots (depvar ~ all x)

Look at the shape of the relationship between the dependent variable and all of the continuous potential independent variables.

plot_scatterplot(test, by="read12")

Correlation Matrix

plot_correlation(train, type="continuous")

MODEL

Linear Regression Model

From the above EDA, we chose the following variables to start. Note that, given the shape of the relationship between read12 and gander

Estimate the following model:
\(log(read12) = female + asian + black + hispan + motherduc + fatherduc\)

Estimate Coefficients and show coefficients table

# Build model
model <- lm(log(read12) ~ female + asian + black + hispan + motheduc + fatheduc, data = train)



model %>% as_flextable

	Estimate	Standard Error	t value	Pr(>\|t\|)
(Intercept)	3.494	0.023	153.141	0.0000	***
female	0.033	0.006	5.556	0.0000	***
asian	0.021	0.012	1.722	0.0851	.
black	-0.122	0.011	-10.611	0.0000	***
hispan	-0.028	0.010	-2.825	0.0048	**
motheduc	0.012	0.002	6.583	0.0000	***
fatheduc	0.020	0.002	11.839	0.0000	***
Signif. codes: 0 <= '*' < 0.001 < '' < 0.01 < '*' < 0.05

Residual standard error: 0.1795 on 3576 degrees of freedom
Multiple R-squared: 0.1453, Adjusted R-squared: 0.1439
F-statistic: 101.4 on 3576 and 6 DF, p-value: 0.0000

Coefficient Magnitude Plot

# note this simple code using coefplot package produces chart like Figure 6.6 in one short line. 
library(coefplot)
coefplot(model, soft="magnitude", intercept=FALSE)

Check for predictor independence

Using Variance Inflation Factors (VIF)

library(car)
vif(model)

  female    asian    black   hispan motheduc fatheduc 
1.004622 1.027676 1.024268 1.087508 1.542893 1.549068

Residual Analysis

Residual Range

quantile(round(residuals(model, type = "deviance"),2))

   0%   25%   50%   75%  100% 
-0.59 -0.12  0.03  0.13  0.45

Residual Plots

We are looking for: - Random distribution of residuals vs fitted values - Normally distributed residuals : Normal Q-Q plot with values along line - Homoskedasticity with a Scale-Location line that is horizontal and no residual pattern - Minimal influential obs - that is, those outside the borders of Cook’s distance

plot(model)

Plot Fitted Value by Actual Value

library(broom)
train2 <- augment(model, data=train)  # this appends predicted to original dataset to build plots on your own

p <- ggplot(train2, mapping = aes(y=.fitted, x=read12))
p  + geom_point()

Plot Residuals by Fitted Values

p <- ggplot(train2, mapping = aes(y=.resid, x=.fitted))
p  + geom_point()

Performance Evaluation

Use Model to Score `test` dataset (Display First 10 values - depvar and fitted values only)

pred <- predict(model, newdata = test, interval="predict")  #score test dataset wtih model
test_w_predict <- cbind(test, pred)  # append score to original test dataset

test_w_predict %>%
  head(10) %>%
    select(read12, fit, lwr,upr) %>% 
  as_flextable(show_coltype = FALSE)

read12	fit	lwr	upr
61.4	3.9	3.5	4.2
59.3	3.9	3.6	4.3
57.6	3.9	3.6	4.3
52.5	3.9	3.5	4.2
42.1	3.9	3.6	4.3
64.0	3.9	3.6	4.3
38.5	3.9	3.6	4.3
60.3	4.0	3.6	4.3
63.6	3.9	3.5	4.2
52.6	4.0	3.6	4.3
n: 10

Plot Actual vs Fitted (`test`)

plot(test_w_predict$read12, test_w_predict$fit, pch=16, cex=1)

Performance Metrics

library(Metrics)
metric_label <- c("MAE","RMSE", "MAPE")
metrics <- c(round(mae(test_w_predict$read12, test_w_predict$fit),4),
                         round(rmse(test_w_predict$read12, test_w_predict$fit),4),
                         round(mape(test_w_predict$read12, test_w_predict$fit),4))
pmtable <- data.frame(Metric=metric_label, Value = metrics)
flextable(pmtable)

Metric	Value
MAE	47.6018
RMSE	48.5153
MAPE	0.9210

Model Fit by Gender

# more in depth plots of performance 
p <- ggplot(data = subset(test_w_predict, female %in% c(1, 0)),
            aes(x = read12,
                y = fit, ymin = lwr, ymax = upr,
                color = female,
                fill = female,
                group = female))
p +  geom_point(alpha = 0.5) + 
       geom_line() + geom_ribbon(alpha = 0.2, color = FALSE) +
       labs(title="Actual vs Fitted with Upper and Lower CI",
              subtitle="Gender being Female 1 and Male 0",
              caption="Catholic Dataset") +
       xlab(NULL) + ylab("fitted") +
       theme(legend.position = "bottom")

`ggplot2` explore (Intro Section)

This is just replication of the first section within Chapter 6 (as FYI)

# Simple model
pip <- lm(read12 ~ female, data=train)
summary(pip)


Call:
lm(formula = read12 ~ female, data = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-22.387  -7.398   1.201   7.677  17.121 

Coefficients:
            Estimate Std. Error t value             Pr(>|t|)    
(Intercept)  50.9694     0.2278 223.763 < 0.0000000000000002 ***
female        1.1279     0.3138   3.594             0.000329 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9.378 on 3581 degrees of freedom
Multiple R-squared:  0.003595,  Adjusted R-squared:  0.003317 
F-statistic: 12.92 on 1 and 3581 DF,  p-value: 0.0003294

library(ggplot2)

# Plot models
p <- ggplot(data=train, mapping=aes(y=read12, x=math12))

p + geom_point(alpha = 0.2) +  
       geom_smooth(method = "lm", aes(color = "OLS", fill = "OLS"))

p + geom_point(alpha=0.1) +
    geom_smooth(color = "tomato", fill="tomato", method = MASS::rlm) + 
    geom_smooth(color = "steelblue", fill="steelblue", method = "lm")

p + geom_point(alpha=0.1) +
    geom_smooth(color = "tomato", method = "lm", linewidth = 1.2,  
                formula = y ~ splines::bs(x, 3), se = FALSE)

p + geom_point(alpha=0.1) +
    geom_quantile(color = "tomato", size = 1.2, method = "rqss",
                  lambda = 1, quantiles = c(0.20, 0.5, 0.85))

SUMMARY ASSESSMENT AND EVALUATION OF THE MODEL

Being female is associated with scoring 0.033 points higher on read12 compared to males, holding other predictors constant (p<0.001).

Asian students score 0.021 points higher than the reference group (Caucasian students), though this effect is only marginally significant (p=0.085).

Black students score 0.122 points lower compared to Caucasians (p<0.001).

Hispanic students score 0.028 points lower than Caucasians (p=0.005).

Mother’s education level significantly predicts read12 scores, with each additional year of education associated with a 0.012 increase in read12 (p<0.001).

Similarly, father’s education is positively associated with read12, with each extra year predicting a 0.020 increase in score (p<0.001).

The predictors in the model explain 14.5% of the variation in read12 scores, based on the multiple R-squared value. The overall F-test indicates the model provides a statistically significant improvement over a null model with no predictors (F=101.4, p<0.001).

In summary, the results show demographic factors like gender and race/ethnicity impact reading scores, with females and Asian students scoring higher on average. Parent education levels also emerge as significant positive predictors of reading achievement. The visualizations were helpful for identifying these relationships during the exploratory analysis. However, over 85% of variability in read12 remains unexplained by this simple linear model, highlighting opportunities to consider additional relevant predictors to improve model fit.