options(scipen=999)
library(socviz)
load("Catholic_viz.RData")Lab 6
Catholic dataset
DATA
Partition 60% Train / 40% Test
library(caret)
set.seed(1)
index <- createDataPartition(Catholic_viz$read12, p=0.6, list=FALSE)
train <- Catholic_viz[index,]
test <- Catholic_viz[-index,]Training set number of observations: 3583
Test dataset number of observations: 2387
EDA
Descriptive Statistics
library(psych)
library(tidyverse)
psych::describe(train, fast=TRUE) # (class note --> remove flextable) vars n mean sd min max range
id 1 3583 4642399.08 2719725.86 124915.00 7979086.00 7854171.00
read12 2 3583 51.56 9.39 29.15 68.09 38.94
math12 3 3583 52.00 9.59 30.14 71.37 41.23
female 4 3583 0.53 0.50 0.00 1.00 1.00
asian 5 3583 0.06 0.25 0.00 1.00 1.00
hispan 6 3583 0.11 0.31 0.00 1.00 1.00
black 7 3583 0.08 0.26 0.00 1.00 1.00
motheduc 8 3583 13.24 2.03 8.00 18.00 10.00
fatheduc 9 3583 13.55 2.26 8.00 18.00 10.00
lfaminc 10 3583 10.31 0.80 6.21 12.35 6.13
hsgrad 11 3583 0.93 0.26 0.00 1.00 1.00
cathhs 12 3583 0.07 0.26 0.00 1.00 1.00
parcath 13 3583 0.36 0.48 0.00 1.00 1.00
se
id 45436.17
read12 0.16
math12 0.16
female 0.01
asian 0.00
hispan 0.01
black 0.00
motheduc 0.03
fatheduc 0.04
lfaminc 0.01
hsgrad 0.00
cathhs 0.00
parcath 0.01
Boxplots – All Numeric
data_long <- train %>%
select(where(is.numeric)) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")
ggplot(data_long, aes(x = Variable, y = Value)) +
geom_boxplot() +
coord_flip() +
facet_wrap(~ Variable, scales = "free", ncol=3) +
theme_minimal() +
theme(axis.text.x = element_blank()) +
labs(title = "Horizontal Boxplot for Each Numeric Variable")library(DataExplorer)
plot_boxplot(train, by="female") Histograms
plot_histogram(train)ALl of the previous boxplots demostrate that the variable “lfaminc” is the one with the most outliers.
Scatterplots (depvar ~ all x)
plot_scatterplot(test, by="read12")The relation between the dependent variable and the independent variables suggested a positive link between a student’s reading score and their math performance, meaning that if the do good in one, they are likely to do good in the other. The correlation matrix revealed a correlation coefficient of 0.73 between reading and math scores, indicating a strong positive correlation. Furthermore, there is a slight relationship between parental education levels, with a correlation coefficient of 0.58 between father and mother education.
Correlation Matrix
plot_correlation(train, type="continuous")MODEL
Linear Regression Model
Estimate Coefficients and show coefficients table
install.packages("/path/to/flextable.tar.gz", repos = NULL, type = "source")
library(flextable)
library(magrittr)
model <- lm(log(read12) ~ female + asian + black + motheduc + fatheduc, data = train)
model %>% as_flextableEstimate | Standard Error | t value | Pr(>|t|) | ||
|---|---|---|---|---|---|
(Intercept) | 3.475 | 0.022 | 158.987 | 0.0000 | *** |
female | 0.033 | 0.006 | 5.494 | 0.0000 | *** |
asian | 0.024 | 0.012 | 1.949 | 0.0514 | . |
black | -0.118 | 0.011 | -10.353 | 0.0000 | *** |
motheduc | 0.013 | 0.002 | 7.012 | 0.0000 | *** |
fatheduc | 0.020 | 0.002 | 12.164 | 0.0000 | *** |
Signif. codes: 0 <= '***' < 0.001 < '**' < 0.01 < '*' < 0.05 | |||||
Residual standard error: 0.1797 on 3577 degrees of freedom | |||||
Multiple R-squared: 0.1434, Adjusted R-squared: 0.1422 | |||||
F-statistic: 119.8 on 3577 and 5 DF, p-value: 0.0000 | |||||
This model suggests that the variables that are statistically most significant with a 95% level of confidence are their sex, if they are black and the level of their mother’s and father’s education.
Coefficient Magnitude Plot
install.packages("/path/to/coefplot.tar.gz", repos = NULL, type = "source")
library(coefplot)
coefplot(model, soft="magnitude", intercept=FALSE)The coefficient plot confirmed that the primary factors that can possible influence student’s reading performance is their family’s sex, Asian gender, income and level of education. On the other hand, the ones with the most negative impact where if they were black or hispan.
Check for predictor independence
library(car)
vif(model) female asian black motheduc fatheduc
1.004205 1.021323 1.011811 1.515853 1.534045
Residual Analysis
Residual Range
quantile(round(residuals(model, type = "deviance"),2)) 0% 25% 50% 75% 100%
-0.59 -0.12 0.03 0.13 0.43
Residual Plots
plot(model)This residual plots demonstrated that there is no collinearity between the variables that were used to predict.
Plot Fitted Value by Actual Value
library(ggplot2)
library(broom)
train2 <- augment(model, data=train)
p <- ggplot(train2, mapping = aes(y=.fitted, x=read12))
p + geom_point()Plot Residuals by Fitted Values
p <- ggplot(train2, mapping = aes(y=.resid, x=.fitted))
p + geom_point()Performance Evaluation
Use Model to Score test dataset (Display First 10 values - depvar and fitted values only)
library(flextable)
set_flextable_defaults(col_keys = list(col_key(read12 = ""), col_key(fit = ""), col_key(lwr = ""), col_key(upr = "")))
pred <- predict(model, newdata = test, interval = "predict")
test_w_predict <- cbind(test, pred)
flextable(head(test_w_predict[, c("read12", "fit", "lwr", "upr")], 10))read12 | fit | lwr | upr |
|---|---|---|---|
61.41 | 3.893477 | 3.540999 | 4.245954 |
59.33 | 3.906591 | 3.554048 | 4.259135 |
57.62 | 3.901052 | 3.548628 | 4.253476 |
52.53 | 3.881066 | 3.528606 | 4.233525 |
42.09 | 3.933449 | 3.581033 | 4.285866 |
63.95 | 3.926578 | 3.574107 | 4.279048 |
38.51 | 3.906591 | 3.554048 | 4.259135 |
60.28 | 3.992076 | 3.639549 | 4.344603 |
63.57 | 3.867951 | 3.515512 | 4.220390 |
52.56 | 3.973422 | 3.620950 | 4.325895 |
Plot Actual vs Fitted (test)
plot(test_w_predict$read12, test_w_predict$fit, pch=16, cex=1)The preceding charts illustrate the predicted and observed values, revealing a favorable alignment between the two.
Performance Metrics
library(Metrics)
metric_label <- c("MAE","RMSE", "MAPE")
metrics <- c(round(mae(test_w_predict$read12, test_w_predict$fit),4),
round(rmse(test_w_predict$read12, test_w_predict$fit),4),
round(mape(test_w_predict$read12, test_w_predict$fit),4))
pmtable <- data.frame(Metric=metric_label, Value = metrics)
flextable(pmtable)Metric | Value |
|---|---|
MAE | 47.6017 |
RMSE | 48.5153 |
MAPE | 0.9210 |
The performance metrics for this model are as follows: it yielded a mean absolute error (MAE) of 47.6018, a root mean squared error (RMSE) of 48.5153, and a mean absolute percentage error (MAPE) of 0.9210. Overall, these figures suggest that the final model was satisfactory but still necessitated further refinement.
Replication of first section of chapter 6
pip <- lm(log(read12) ~ female, data=train)
summary(pip)
Call:
lm(formula = log(read12) ~ female, data = train)
Residuals:
Min 1Q Median 3Q Max
-0.54485 -0.13729 0.04129 0.15693 0.30883
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.912001 0.004704 831.656 < 0.0000000000000002 ***
female 0.024337 0.006480 3.756 0.000176 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.1937 on 3581 degrees of freedom
Multiple R-squared: 0.003924, Adjusted R-squared: 0.003645
F-statistic: 14.11 on 1 and 3581 DF, p-value: 0.0001756
library(ggplot2)
p <- ggplot(data=train, mapping=aes(y=log(read12), x=female))
p + geom_point(alpha = 0.2) +
geom_smooth(method = "lm", aes(color = "OLS", fill = "OLS"))p + geom_point(alpha=0.1) +
geom_smooth(color = "darkgreen", fill="darkgreen", method = MASS::rlm) +
geom_smooth(color = "orange", fill="orange", method = "lm")p + geom_point(alpha=0.1) +
geom_smooth(color = "darkgreen", method = "lm", linewidth = 1.2,
formula = y ~ splines::bs(x, 3), se = FALSE)p + geom_point(alpha=0.1) +
geom_quantile(color = "darkgreen", size = 1.2, method = "rqss",
lambda = 1, quantiles = c(0.20, 0.5, 0.85))SUMMARY ASSESSMENT AND EVALUATION OF THE MODEL
The developed model demonstrated some effectiveness, but it fell short of being truly robust. To improve predictive accuracy and confidence levels, further testing and the inclusion of variables would be necessary. The analysis highlighted several key factors influencing students’ reading scores, including their performance in math, gender, racial background, paternal education level, and household income.