LAB #6

Author

Jose Gabriel Rodriguez

DATA

options(scipen=999)
load("Catholic_viz.RData")

Partition 60% Train / 40% Test

library(caret)

Loading required package: ggplot2

Loading required package: lattice

set.seed(1)
index <- createDataPartition(Catholic_viz$read12, p=0.6, list=FALSE)
train <- Catholic_viz[index,]
test  <- Catholic_viz[-index,]

EDA

Descriptive Statistics

library(psych)


Attaching package: 'psych'

The following objects are masked from 'package:ggplot2':

    %+%, alpha

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.0

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ psych::%+%()    masks ggplot2::%+%()
✖ psych::alpha()  masks ggplot2::alpha()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ purrr::lift()   masks caret::lift()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

psych::describe(train, fast=TRUE)

         vars    n       mean         sd       min        max      range
id          1 3583 4642399.08 2719725.86 124915.00 7979086.00 7854171.00
read12      2 3583      51.56       9.39     29.15      68.09      38.94
math12      3 3583      52.00       9.59     30.14      71.37      41.23
female      4 3583       0.53       0.50      0.00       1.00       1.00
asian       5 3583       0.06       0.25      0.00       1.00       1.00
hispan      6 3583       0.11       0.31      0.00       1.00       1.00
black       7 3583       0.08       0.26      0.00       1.00       1.00
motheduc    8 3583      13.24       2.03      8.00      18.00      10.00
fatheduc    9 3583      13.55       2.26      8.00      18.00      10.00
lfaminc    10 3583      10.31       0.80      6.21      12.35       6.13
hsgrad     11 3583       0.93       0.26      0.00       1.00       1.00
cathhs     12 3583       0.07       0.26      0.00       1.00       1.00
parcath    13 3583       0.36       0.48      0.00       1.00       1.00
               se
id       45436.17
read12       0.16
math12       0.16
female       0.01
asian        0.00
hispan       0.01
black        0.00
motheduc     0.03
fatheduc     0.04
lfaminc      0.01
hsgrad       0.00
cathhs       0.00
parcath      0.01

Boxplots – All Numeric

data_long <- train %>%
  select(where(is.numeric)) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")

ggplot(data_long, aes(x = Variable, y = Value)) +
  geom_boxplot() +
  coord_flip() + 
  facet_wrap(~ Variable, scales = "free", ncol=3) +
  theme_minimal() +
  theme(axis.text.x = element_blank()) + 
  labs(title = "Horizontal Boxplot for Each Numeric Variable")

library(DataExplorer)
plot_boxplot(train, by= "lfaminc")

Histograms

plot_histogram(train)

Scatterplots (depvar ~ all x)

plot_scatterplot(test, by="read12")

Correlation Matrix

plot_correlation(train, type="continuous")

MODEL

Linear Regression Model

Estimate Coefficients and show coefficients table

library(flextable)


Attaching package: 'flextable'

The following object is masked from 'package:purrr':

    compose

model <- lm((read12^2) ~ lfaminc + I(math12^2) + fatheduc + motheduc, data=train)
model %>% as_flextable

	Estimate	Standard Error	t value	Pr(>\|t\|)
(Intercept)	134.609	141.528	0.951	0.3416
lfaminc	40.634	15.497	2.622	0.0088	**
I(math12^2)	0.657	0.012	54.233	0.0000	***
fatheduc	22.207	6.244	3.557	0.0004	***
motheduc	4.159	6.689	0.622	0.5341
Signif. codes: 0 <= '*' < 0.001 < '' < 0.01 < '*' < 0.05

Residual standard error: 646.9 on 3578 degrees of freedom
Multiple R-squared: 0.5295, Adjusted R-squared: 0.529
F-statistic: 1007 on 3578 and 4 DF, p-value: 0.0000

Coefficient Magnitude Plot

library(coefplot)
coefplot(model, soft="magnitude", intercept=FALSE)

Check for predictor independence

Using Variance Inflation Factors (VIF)

library(car)

Loading required package: carData


Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some

The following object is masked from 'package:psych':

    logit

vif(model)

    lfaminc I(math12^2)    fatheduc    motheduc 
   1.331495    1.228852    1.709087    1.579509

Residual Analysis

Residual Range

quantile(round(residuals(model, type = "deviance"),2))

      0%      25%      50%      75%     100% 
-2655.93  -438.40    -5.67   426.02  1991.32

Residual Plots

We are looking for: - Random distribution of residuals vs fitted values - Normally distributed residuals : Normal Q-Q plot with values along line - Homoskedasticity with a Scale-Location line that is horizontal and no residual pattern - Minimal influential obs - that is, those outside the borders of Cook’s distance

plot(model)

Plot Fitted Value by Actual Value

library(broom)
train2 <- augment(model, data=train)  # this appends predicted to original dataset to build plots on your own

p <- ggplot(train2, mapping = aes(y=.fitted, x=read12))
p  + geom_point()

Plot Residuals by Fitted Values

p <- ggplot(train2, mapping = aes(y=.resid, x=.fitted))
p  + geom_point()

Performance Evaluation

Use Model to Score `test` dataset (Display First 10 values - depvar and fitted values only)

pred <- predict(model, newdata = test, interval="predict")  #score test dataset wtih model
test_w_predict <- cbind(test, pred)  # append score to original test dataset

test_w_predict %>%
  head(10) %>%
    select(id, read12, fit, lwr,upr) %>% 
  as_flextable(show_coltype = FALSE)

id	read12	fit	lwr	upr
124,902	61.4	2,506.0	1,237.2	3,774.8
124,916	59.3	2,523.9	1,254.8	3,793.0
124,944	57.6	2,818.7	1,550.0	4,087.5
124,947	52.5	2,992.4	1,723.1	4,261.6
124,972	42.1	2,837.4	1,568.8	4,106.0
124,990	64.0	3,558.1	2,288.9	4,827.3
124,992	38.5	3,284.1	2,014.2	4,554.0
175,585	60.3	3,497.9	2,228.7	4,767.0
180,607	63.6	3,117.7	1,848.8	4,386.6
180,660	52.6	3,308.3	2,039.5	4,577.2
n: 10

Plot Actual vs Fitted (`test`)

plot(test_w_predict$read12, test_w_predict$fit, pch=16, cex=1)

Performance Metrics

library(Metrics)


Attaching package: 'Metrics'

The following objects are masked from 'package:caret':

    precision, recall

metric_label <- c("MAE","RMSE", "MAPE")
metrics <- c(round(mae(test_w_predict$read12, test_w_predict$fit),4),
                         round(rmse(test_w_predict$read12, test_w_predict$fit),4),
                         round(mape(test_w_predict$read12, test_w_predict$fit),4))
pmtable <- data.frame(Metric=metric_label, Value = metrics)
flextable(pmtable)

Metric	Value
MAE	2,665.8955
RMSE	2,745.3778
MAPE	51.9644