LAB #6

Author

Jose Gabriel Rodriguez

DATA

options(scipen=999)
load("Catholic_viz.RData")

Partition 60% Train / 40% Test

library(caret)
Loading required package: ggplot2
Loading required package: lattice
set.seed(1)
index <- createDataPartition(Catholic_viz$read12, p=0.6, list=FALSE)
train <- Catholic_viz[index,]
test  <- Catholic_viz[-index,]

EDA

Descriptive Statistics

library(psych)

Attaching package: 'psych'
The following objects are masked from 'package:ggplot2':

    %+%, alpha
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ psych::%+%()    masks ggplot2::%+%()
✖ psych::alpha()  masks ggplot2::alpha()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ purrr::lift()   masks caret::lift()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
psych::describe(train, fast=TRUE)
         vars    n       mean         sd       min        max      range
id          1 3583 4642399.08 2719725.86 124915.00 7979086.00 7854171.00
read12      2 3583      51.56       9.39     29.15      68.09      38.94
math12      3 3583      52.00       9.59     30.14      71.37      41.23
female      4 3583       0.53       0.50      0.00       1.00       1.00
asian       5 3583       0.06       0.25      0.00       1.00       1.00
hispan      6 3583       0.11       0.31      0.00       1.00       1.00
black       7 3583       0.08       0.26      0.00       1.00       1.00
motheduc    8 3583      13.24       2.03      8.00      18.00      10.00
fatheduc    9 3583      13.55       2.26      8.00      18.00      10.00
lfaminc    10 3583      10.31       0.80      6.21      12.35       6.13
hsgrad     11 3583       0.93       0.26      0.00       1.00       1.00
cathhs     12 3583       0.07       0.26      0.00       1.00       1.00
parcath    13 3583       0.36       0.48      0.00       1.00       1.00
               se
id       45436.17
read12       0.16
math12       0.16
female       0.01
asian        0.00
hispan       0.01
black        0.00
motheduc     0.03
fatheduc     0.04
lfaminc      0.01
hsgrad       0.00
cathhs       0.00
parcath      0.01

Boxplots – All Numeric

data_long <- train %>%
  select(where(is.numeric)) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")

ggplot(data_long, aes(x = Variable, y = Value)) +
  geom_boxplot() +
  coord_flip() + 
  facet_wrap(~ Variable, scales = "free", ncol=3) +
  theme_minimal() +
  theme(axis.text.x = element_blank()) + 
  labs(title = "Horizontal Boxplot for Each Numeric Variable")

library(DataExplorer)
plot_boxplot(train, by= "lfaminc") 

Histograms

plot_histogram(train)

Scatterplots (depvar ~ all x)

plot_scatterplot(test, by="read12")

Correlation Matrix

plot_correlation(train, type="continuous")

MODEL

Linear Regression Model

Estimate Coefficients and show coefficients table

library(flextable)

Attaching package: 'flextable'
The following object is masked from 'package:purrr':

    compose
model <- lm((read12^2) ~ lfaminc + I(math12^2) + fatheduc + motheduc, data=train)
model %>% as_flextable

Estimate

Standard Error

t value

Pr(>|t|)

(Intercept)

134.609

141.528

0.951

0.3416

lfaminc

40.634

15.497

2.622

0.0088

**

I(math12^2)

0.657

0.012

54.233

0.0000

***

fatheduc

22.207

6.244

3.557

0.0004

***

motheduc

4.159

6.689

0.622

0.5341

Signif. codes: 0 <= '***' < 0.001 < '**' < 0.01 < '*' < 0.05

Residual standard error: 646.9 on 3578 degrees of freedom

Multiple R-squared: 0.5295, Adjusted R-squared: 0.529

F-statistic: 1007 on 3578 and 4 DF, p-value: 0.0000

Coefficient Magnitude Plot

library(coefplot)
coefplot(model, soft="magnitude", intercept=FALSE)

Check for predictor independence

Using Variance Inflation Factors (VIF)

library(car)
Loading required package: carData

Attaching package: 'car'
The following object is masked from 'package:dplyr':

    recode
The following object is masked from 'package:purrr':

    some
The following object is masked from 'package:psych':

    logit
vif(model)
    lfaminc I(math12^2)    fatheduc    motheduc 
   1.331495    1.228852    1.709087    1.579509 

Residual Analysis

Residual Range

quantile(round(residuals(model, type = "deviance"),2))
      0%      25%      50%      75%     100% 
-2655.93  -438.40    -5.67   426.02  1991.32 

Residual Plots

We are looking for: - Random distribution of residuals vs fitted values - Normally distributed residuals : Normal Q-Q plot with values along line - Homoskedasticity with a Scale-Location line that is horizontal and no residual pattern - Minimal influential obs - that is, those outside the borders of Cook’s distance

plot(model)

Plot Fitted Value by Actual Value

library(broom)
train2 <- augment(model, data=train)  # this appends predicted to original dataset to build plots on your own

p <- ggplot(train2, mapping = aes(y=.fitted, x=read12))
p  + geom_point()

Plot Residuals by Fitted Values

p <- ggplot(train2, mapping = aes(y=.resid, x=.fitted))
p  + geom_point()

Performance Evaluation

Use Model to Score test dataset (Display First 10 values - depvar and fitted values only)

pred <- predict(model, newdata = test, interval="predict")  #score test dataset wtih model
test_w_predict <- cbind(test, pred)  # append score to original test dataset

test_w_predict %>%
  head(10) %>%
    select(id, read12, fit, lwr,upr) %>% 
  as_flextable(show_coltype = FALSE) 

id

read12

fit

lwr

upr

124,902

61.4

2,506.0

1,237.2

3,774.8

124,916

59.3

2,523.9

1,254.8

3,793.0

124,944

57.6

2,818.7

1,550.0

4,087.5

124,947

52.5

2,992.4

1,723.1

4,261.6

124,972

42.1

2,837.4

1,568.8

4,106.0

124,990

64.0

3,558.1

2,288.9

4,827.3

124,992

38.5

3,284.1

2,014.2

4,554.0

175,585

60.3

3,497.9

2,228.7

4,767.0

180,607

63.6

3,117.7

1,848.8

4,386.6

180,660

52.6

3,308.3

2,039.5

4,577.2

n: 10

Plot Actual vs Fitted (test)

plot(test_w_predict$read12, test_w_predict$fit, pch=16, cex=1)

Performance Metrics

library(Metrics)

Attaching package: 'Metrics'
The following objects are masked from 'package:caret':

    precision, recall
metric_label <- c("MAE","RMSE", "MAPE")
metrics <- c(round(mae(test_w_predict$read12, test_w_predict$fit),4),
                         round(rmse(test_w_predict$read12, test_w_predict$fit),4),
                         round(mape(test_w_predict$read12, test_w_predict$fit),4))
pmtable <- data.frame(Metric=metric_label, Value = metrics)
flextable(pmtable)

Metric

Value

MAE

2,665.8955

RMSE

2,745.3778

MAPE

51.9644