options(scipen=999)
load("Catholic_viz.RData")LAB #6
DATA
Partition 60% Train / 40% Test
library(caret)Loading required package: ggplot2
Loading required package: lattice
set.seed(1)
index <- createDataPartition(Catholic_viz$read12, p=0.6, list=FALSE)
train <- Catholic_viz[index,]
test <- Catholic_viz[-index,]EDA
Descriptive Statistics
library(psych)
Attaching package: 'psych'
The following objects are masked from 'package:ggplot2':
%+%, alpha
library(tidyverse)── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.3 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ lubridate 1.9.3 ✔ tibble 3.2.1
✔ purrr 1.0.2 ✔ tidyr 1.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ psych::%+%() masks ggplot2::%+%()
✖ psych::alpha() masks ggplot2::alpha()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ purrr::lift() masks caret::lift()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
psych::describe(train, fast=TRUE) vars n mean sd min max range
id 1 3583 4642399.08 2719725.86 124915.00 7979086.00 7854171.00
read12 2 3583 51.56 9.39 29.15 68.09 38.94
math12 3 3583 52.00 9.59 30.14 71.37 41.23
female 4 3583 0.53 0.50 0.00 1.00 1.00
asian 5 3583 0.06 0.25 0.00 1.00 1.00
hispan 6 3583 0.11 0.31 0.00 1.00 1.00
black 7 3583 0.08 0.26 0.00 1.00 1.00
motheduc 8 3583 13.24 2.03 8.00 18.00 10.00
fatheduc 9 3583 13.55 2.26 8.00 18.00 10.00
lfaminc 10 3583 10.31 0.80 6.21 12.35 6.13
hsgrad 11 3583 0.93 0.26 0.00 1.00 1.00
cathhs 12 3583 0.07 0.26 0.00 1.00 1.00
parcath 13 3583 0.36 0.48 0.00 1.00 1.00
se
id 45436.17
read12 0.16
math12 0.16
female 0.01
asian 0.00
hispan 0.01
black 0.00
motheduc 0.03
fatheduc 0.04
lfaminc 0.01
hsgrad 0.00
cathhs 0.00
parcath 0.01
Boxplots – All Numeric
data_long <- train %>%
select(where(is.numeric)) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")
ggplot(data_long, aes(x = Variable, y = Value)) +
geom_boxplot() +
coord_flip() +
facet_wrap(~ Variable, scales = "free", ncol=3) +
theme_minimal() +
theme(axis.text.x = element_blank()) +
labs(title = "Horizontal Boxplot for Each Numeric Variable")library(DataExplorer)
plot_boxplot(train, by= "lfaminc") Histograms
plot_histogram(train)Scatterplots (depvar ~ all x)
plot_scatterplot(test, by="read12")Correlation Matrix
plot_correlation(train, type="continuous")MODEL
Linear Regression Model
Estimate Coefficients and show coefficients table
library(flextable)
Attaching package: 'flextable'
The following object is masked from 'package:purrr':
compose
model <- lm((read12^2) ~ lfaminc + I(math12^2) + fatheduc + motheduc, data=train)
model %>% as_flextableEstimate | Standard Error | t value | Pr(>|t|) | ||
|---|---|---|---|---|---|
(Intercept) | 134.609 | 141.528 | 0.951 | 0.3416 |
|
lfaminc | 40.634 | 15.497 | 2.622 | 0.0088 | ** |
I(math12^2) | 0.657 | 0.012 | 54.233 | 0.0000 | *** |
fatheduc | 22.207 | 6.244 | 3.557 | 0.0004 | *** |
motheduc | 4.159 | 6.689 | 0.622 | 0.5341 |
|
Signif. codes: 0 <= '***' < 0.001 < '**' < 0.01 < '*' < 0.05 | |||||
Residual standard error: 646.9 on 3578 degrees of freedom | |||||
Multiple R-squared: 0.5295, Adjusted R-squared: 0.529 | |||||
F-statistic: 1007 on 3578 and 4 DF, p-value: 0.0000 | |||||
Coefficient Magnitude Plot
library(coefplot)
coefplot(model, soft="magnitude", intercept=FALSE)Check for predictor independence
Using Variance Inflation Factors (VIF)
library(car)Loading required package: carData
Attaching package: 'car'
The following object is masked from 'package:dplyr':
recode
The following object is masked from 'package:purrr':
some
The following object is masked from 'package:psych':
logit
vif(model) lfaminc I(math12^2) fatheduc motheduc
1.331495 1.228852 1.709087 1.579509
Residual Analysis
Residual Range
quantile(round(residuals(model, type = "deviance"),2)) 0% 25% 50% 75% 100%
-2655.93 -438.40 -5.67 426.02 1991.32
Residual Plots
We are looking for: - Random distribution of residuals vs fitted values - Normally distributed residuals : Normal Q-Q plot with values along line - Homoskedasticity with a Scale-Location line that is horizontal and no residual pattern - Minimal influential obs - that is, those outside the borders of Cook’s distance
plot(model)Plot Fitted Value by Actual Value
library(broom)
train2 <- augment(model, data=train) # this appends predicted to original dataset to build plots on your own
p <- ggplot(train2, mapping = aes(y=.fitted, x=read12))
p + geom_point()Plot Residuals by Fitted Values
p <- ggplot(train2, mapping = aes(y=.resid, x=.fitted))
p + geom_point()Performance Evaluation
Use Model to Score test dataset (Display First 10 values - depvar and fitted values only)
pred <- predict(model, newdata = test, interval="predict") #score test dataset wtih model
test_w_predict <- cbind(test, pred) # append score to original test dataset
test_w_predict %>%
head(10) %>%
select(id, read12, fit, lwr,upr) %>%
as_flextable(show_coltype = FALSE) id | read12 | fit | lwr | upr |
|---|---|---|---|---|
124,902 | 61.4 | 2,506.0 | 1,237.2 | 3,774.8 |
124,916 | 59.3 | 2,523.9 | 1,254.8 | 3,793.0 |
124,944 | 57.6 | 2,818.7 | 1,550.0 | 4,087.5 |
124,947 | 52.5 | 2,992.4 | 1,723.1 | 4,261.6 |
124,972 | 42.1 | 2,837.4 | 1,568.8 | 4,106.0 |
124,990 | 64.0 | 3,558.1 | 2,288.9 | 4,827.3 |
124,992 | 38.5 | 3,284.1 | 2,014.2 | 4,554.0 |
175,585 | 60.3 | 3,497.9 | 2,228.7 | 4,767.0 |
180,607 | 63.6 | 3,117.7 | 1,848.8 | 4,386.6 |
180,660 | 52.6 | 3,308.3 | 2,039.5 | 4,577.2 |
n: 10 | ||||
Plot Actual vs Fitted (test)
plot(test_w_predict$read12, test_w_predict$fit, pch=16, cex=1)Performance Metrics
library(Metrics)
Attaching package: 'Metrics'
The following objects are masked from 'package:caret':
precision, recall
metric_label <- c("MAE","RMSE", "MAPE")
metrics <- c(round(mae(test_w_predict$read12, test_w_predict$fit),4),
round(rmse(test_w_predict$read12, test_w_predict$fit),4),
round(mape(test_w_predict$read12, test_w_predict$fit),4))
pmtable <- data.frame(Metric=metric_label, Value = metrics)
flextable(pmtable)Metric | Value |
|---|---|
MAE | 2,665.8955 |
RMSE | 2,745.3778 |
MAPE | 51.9644 |