Lab 6

Author

Jason Flores

options(scipen=999)

library(socviz)
load("Catholic_viz.RData")
head(Catholic_viz, n=20)
       id read12 math12 female asian hispan black motheduc fatheduc   lfaminc
1  124902  61.41  49.77      0     0      0     0       14     12.0 10.308950
2  124915  58.34  59.84      0     0      0     0       14     14.0 10.308950
3  124916  59.33  50.38      1     0      0     0       14     11.0 10.308950
4  124932  49.59  45.03      1     0      0     0       12     14.0 10.308950
5  124944  57.62  54.26      1     0      0     0       12     12.0 10.657260
6  124947  52.53  56.73      1     0      0     0       12     11.0 11.042920
7  124966  60.12  55.37      1     0      0     0       14     14.0  9.433484
8  124968  64.69  65.07      0     0      0     0       14     14.0 10.308950
9  124972  42.09  53.98      0     0      0     0       14     14.0 10.308950
10 124974  60.23  62.72      0     0      0     0       14     12.0  9.769957
11 124981  53.03  46.86      0     0      0     0       12     14.0 10.308950
12 124985  58.25  52.05      1     0      0     0       12     11.5 10.657260
13 124990  63.95  63.86      1     0      0     0       14     12.0 10.308950
14 124992  38.51  61.41      1     0      0     0       14     11.0  9.076809
15 124998  43.15  45.54      1     0      0     0       12     12.0  9.769957
16 175585  60.28  62.50      1     0      0     0       16     14.0 10.308950
17 180607  63.57  58.77      0     0      0     0       12     12.0  9.769957
18 180608  61.73  59.25      0     0      0     0       14     16.0 10.308950
19 180610  55.40  63.43      0     0      0     0       12     16.0 10.657260
20 180625  45.79  51.51      0     0      0     0       12     14.0 10.657260
   hsgrad cathhs parcath
1       1      0       1
2       1      0       1
3       1      0       1
4       1      0       1
5       1      0       1
6       1      0       1
7       1      0       1
8       1      0       1
9       1      0       1
10      1      0       1
11      1      0       1
12      1      0       1
13      1      0       1
14      1      0       1
15      1      0       1
16      1      0       1
17      1      1       1
18      1      1       0
19      1      1       0
20      1      1       1
library(caret)
Loading required package: ggplot2
Loading required package: lattice
set.seed(1)
index <- createDataPartition(Catholic_viz$read12, p=0.6, list=FALSE)
train <- Catholic_viz[index,]
test  <- Catholic_viz[-index,]

Training set number of observations: 3583
Test data set number of observations: 2387

library(psych)

Attaching package: 'psych'
The following objects are masked from 'package:ggplot2':

    %+%, alpha
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ lubridate 1.9.2     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ psych::%+%()    masks ggplot2::%+%()
✖ psych::alpha()  masks ggplot2::alpha()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ purrr::lift()   masks caret::lift()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(flextable)

Attaching package: 'flextable'

The following object is masked from 'package:purrr':

    compose
psych::describe(train, fast=TRUE) 
         vars    n       mean         sd       min        max      range
id          1 3583 4642399.08 2719725.86 124915.00 7979086.00 7854171.00
read12      2 3583      51.56       9.39     29.15      68.09      38.94
math12      3 3583      52.00       9.59     30.14      71.37      41.23
female      4 3583       0.53       0.50      0.00       1.00       1.00
asian       5 3583       0.06       0.25      0.00       1.00       1.00
hispan      6 3583       0.11       0.31      0.00       1.00       1.00
black       7 3583       0.08       0.26      0.00       1.00       1.00
motheduc    8 3583      13.24       2.03      8.00      18.00      10.00
fatheduc    9 3583      13.55       2.26      8.00      18.00      10.00
lfaminc    10 3583      10.31       0.80      6.21      12.35       6.13
hsgrad     11 3583       0.93       0.26      0.00       1.00       1.00
cathhs     12 3583       0.07       0.26      0.00       1.00       1.00
parcath    13 3583       0.36       0.48      0.00       1.00       1.00
               se
id       45436.17
read12       0.16
math12       0.16
female       0.01
asian        0.00
hispan       0.01
black        0.00
motheduc     0.03
fatheduc     0.04
lfaminc      0.01
hsgrad       0.00
cathhs       0.00
parcath      0.01
data_long <- train %>%
  select(where(is.numeric)) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")

ggplot(data_long, aes(x = Variable, y = Value)) +
  geom_boxplot() +
  coord_flip() + 
  facet_wrap(~ Variable, scales = "free", ncol=3) +
  theme_minimal() +
  theme(axis.text.x = element_blank()) + 
  labs(title = "Horizontal Boxplot for Each Numeric Variable")

library(DataExplorer)
plot_boxplot(train, by="read12") 

plot_histogram(train)

plot_scatterplot(test, by="read12")

plot_correlation(train, type="continuous")

model <- lm(log(read12) ~ id + female + asian +  hispan + black + motheduc + 
                                   fatheduc + lfaminc, data=train)
model %>% as_flextable

Estimate

Standard Error

t value

Pr(>|t|)

(Intercept)

3.220

0.041

79.478

0.0000

***

id

0.000

0.000

2.050

0.0404

*

female

0.035

0.006

5.846

0.0000

***

asian

0.022

0.012

1.811

0.0701

.

hispan

-0.023

0.010

-2.351

0.0188

*

black

-0.105

0.012

-9.083

0.0000

***

motheduc

0.009

0.002

4.997

0.0000

***

fatheduc

0.016

0.002

9.246

0.0000

***

lfaminc

0.034

0.004

7.919

0.0000

***

Signif. codes: 0 <= '***' < 0.001 < '**' < 0.01 < '*' < 0.05

Residual standard error: 0.1779 on 3574 degrees of freedom

Multiple R-squared: 0.1614, Adjusted R-squared: 0.1595

F-statistic: 85.99 on 3574 and 8 DF, p-value: 0.0000

library(coefplot)
coefplot(model, soft="magnitude", intercept=FALSE)

library(car)
Loading required package: carData

Attaching package: 'car'
The following object is masked from 'package:dplyr':

    recode
The following object is masked from 'package:purrr':

    some
The following object is masked from 'package:psych':

    logit
vif(model)
      id   female    asian   hispan    black motheduc fatheduc  lfaminc 
1.007639 1.006261 1.028496 1.094507 1.060495 1.598620 1.676037 1.343525 
quantile(round(residuals(model, type = "deviance"),2))
   0%   25%   50%   75%  100% 
-0.60 -0.12  0.03  0.13  0.43 
plot(model)

library(broom)
train2 <- augment(model, data=train)  

p <- ggplot(train2, mapping = aes(y=.fitted, x=read12))
p  + geom_point()

p <- ggplot(train2, mapping = aes(y=.resid, x=.fitted))
p  + geom_point()

pred <- predict(model, newdata = test, interval="predict")  #score test dataset wtih model
test_w_predict <- cbind(test, pred) 

test_w_predict %>%
  head(10) %>%
    select(id, read12) %>% 
  as_flextable(show_coltype = FALSE) 

id

read12

124,902

61.4

124,916

59.3

124,944

57.6

124,947

52.5

124,972

42.1

124,990

64.0

124,992

38.5

175,585

60.3

180,607

63.6

180,660

52.6

n: 10

plot(test_w_predict$read12, test_w_predict$fit, pch=16, cex=1)

library(Metrics)

Attaching package: 'Metrics'
The following objects are masked from 'package:caret':

    precision, recall
metric_label <- c("MAE","RMSE", "MAPE")
metrics <- c(round(mae(test_w_predict$read12, test_w_predict$fit),4),
                         round(rmse(test_w_predict$read12, test_w_predict$fit),4),
                         round(mape(test_w_predict$read12, test_w_predict$fit),4))
pmtable <- data.frame(Metric=metric_label, Value = metrics)
flextable(pmtable)

Metric

Value

MAE

47.6012

RMSE

48.5145

MAPE

0.9210

pip <- lm(log(read12) ~ id, data=train)
summary(pip)

Call:
lm(formula = log(read12) ~ id, data = train)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.54595 -0.13452  0.04246  0.15703  0.31191 

Coefficients:
                  Estimate     Std. Error t value             Pr(>|t|)    
(Intercept) 3.906049100011 0.006403335009 610.002 < 0.0000000000000002 ***
id          0.000000004044 0.000000001190   3.398             0.000686 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1937 on 3581 degrees of freedom
Multiple R-squared:  0.003214,  Adjusted R-squared:  0.002936 
F-statistic: 11.55 on 1 and 3581 DF,  p-value: 0.0006858
library(ggplot2)
p <- ggplot(data=train, mapping=aes(y=log(read12), x=id))

p +  geom_point(alpha = 0.2) + 
       geom_smooth(method = "lm", aes(color = "OLS", fill = "OLS"))
`geom_smooth()` using formula = 'y ~ x'

p + geom_point(alpha=0.1) +
    geom_smooth(color = "tomato", fill="tomato", method = MASS::rlm) +
    geom_smooth(color = "steelblue", fill="steelblue", method = "lm")
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

p + geom_point(alpha=0.1) +
    geom_smooth(color = "tomato", method = "lm", linewidth = 1.2, 
                formula = y ~ splines::bs(x, 3), se = FALSE)

p + geom_point(alpha=0.1) +
    geom_quantile(color = "tomato", size = 1.2, method = "rqss",
                  lambda = 1, quantiles = c(0.20, 0.5, 0.85))
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
Smoothing formula not specified. Using: y ~ qss(x, lambda = 1)

Reflection

After viewing the data set and interpreting the results, we can really see the difference in the factors of the Catholic_Viz data. This data shows us many numerical data on if certain types of people are Catholic or not. We can see the amount of people that are Catholic from every race. We used “read12” as the dependent variable to split the data set into train and test categories. We created different models that walk through the performance of each variable. Each model helps us understand the data more and more./