Lab 6

Author

Jason Flores

options(scipen=999)

library(socviz)
load("Catholic_viz.RData")
head(Catholic_viz, n=20)

       id read12 math12 female asian hispan black motheduc fatheduc   lfaminc
1  124902  61.41  49.77      0     0      0     0       14     12.0 10.308950
2  124915  58.34  59.84      0     0      0     0       14     14.0 10.308950
3  124916  59.33  50.38      1     0      0     0       14     11.0 10.308950
4  124932  49.59  45.03      1     0      0     0       12     14.0 10.308950
5  124944  57.62  54.26      1     0      0     0       12     12.0 10.657260
6  124947  52.53  56.73      1     0      0     0       12     11.0 11.042920
7  124966  60.12  55.37      1     0      0     0       14     14.0  9.433484
8  124968  64.69  65.07      0     0      0     0       14     14.0 10.308950
9  124972  42.09  53.98      0     0      0     0       14     14.0 10.308950
10 124974  60.23  62.72      0     0      0     0       14     12.0  9.769957
11 124981  53.03  46.86      0     0      0     0       12     14.0 10.308950
12 124985  58.25  52.05      1     0      0     0       12     11.5 10.657260
13 124990  63.95  63.86      1     0      0     0       14     12.0 10.308950
14 124992  38.51  61.41      1     0      0     0       14     11.0  9.076809
15 124998  43.15  45.54      1     0      0     0       12     12.0  9.769957
16 175585  60.28  62.50      1     0      0     0       16     14.0 10.308950
17 180607  63.57  58.77      0     0      0     0       12     12.0  9.769957
18 180608  61.73  59.25      0     0      0     0       14     16.0 10.308950
19 180610  55.40  63.43      0     0      0     0       12     16.0 10.657260
20 180625  45.79  51.51      0     0      0     0       12     14.0 10.657260
   hsgrad cathhs parcath
1       1      0       1
2       1      0       1
3       1      0       1
4       1      0       1
5       1      0       1
6       1      0       1
7       1      0       1
8       1      0       1
9       1      0       1
10      1      0       1
11      1      0       1
12      1      0       1
13      1      0       1
14      1      0       1
15      1      0       1
16      1      0       1
17      1      1       1
18      1      1       0
19      1      1       0
20      1      1       1

library(caret)

Loading required package: ggplot2

Loading required package: lattice

set.seed(1)
index <- createDataPartition(Catholic_viz$read12, p=0.6, list=FALSE)
train <- Catholic_viz[index,]
test  <- Catholic_viz[-index,]

Training set number of observations: 3583
Test data set number of observations: 2387

library(psych)


Attaching package: 'psych'

The following objects are masked from 'package:ggplot2':

    %+%, alpha

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ lubridate 1.9.2     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.0

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ psych::%+%()    masks ggplot2::%+%()
✖ psych::alpha()  masks ggplot2::alpha()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ purrr::lift()   masks caret::lift()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(flextable)


Attaching package: 'flextable'

The following object is masked from 'package:purrr':

    compose

psych::describe(train, fast=TRUE)

         vars    n       mean         sd       min        max      range
id          1 3583 4642399.08 2719725.86 124915.00 7979086.00 7854171.00
read12      2 3583      51.56       9.39     29.15      68.09      38.94
math12      3 3583      52.00       9.59     30.14      71.37      41.23
female      4 3583       0.53       0.50      0.00       1.00       1.00
asian       5 3583       0.06       0.25      0.00       1.00       1.00
hispan      6 3583       0.11       0.31      0.00       1.00       1.00
black       7 3583       0.08       0.26      0.00       1.00       1.00
motheduc    8 3583      13.24       2.03      8.00      18.00      10.00
fatheduc    9 3583      13.55       2.26      8.00      18.00      10.00
lfaminc    10 3583      10.31       0.80      6.21      12.35       6.13
hsgrad     11 3583       0.93       0.26      0.00       1.00       1.00
cathhs     12 3583       0.07       0.26      0.00       1.00       1.00
parcath    13 3583       0.36       0.48      0.00       1.00       1.00
               se
id       45436.17
read12       0.16
math12       0.16
female       0.01
asian        0.00
hispan       0.01
black        0.00
motheduc     0.03
fatheduc     0.04
lfaminc      0.01
hsgrad       0.00
cathhs       0.00
parcath      0.01

data_long <- train %>%
  select(where(is.numeric)) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")

ggplot(data_long, aes(x = Variable, y = Value)) +
  geom_boxplot() +
  coord_flip() + 
  facet_wrap(~ Variable, scales = "free", ncol=3) +
  theme_minimal() +
  theme(axis.text.x = element_blank()) + 
  labs(title = "Horizontal Boxplot for Each Numeric Variable")

library(DataExplorer)
plot_boxplot(train, by="read12")

plot_histogram(train)

plot_scatterplot(test, by="read12")

plot_correlation(train, type="continuous")

model <- lm(log(read12) ~ id + female + asian +  hispan + black + motheduc + 
                                   fatheduc + lfaminc, data=train)
model %>% as_flextable

	Estimate	Standard Error	t value	Pr(>\|t\|)
(Intercept)	3.220	0.041	79.478	0.0000	***
id	0.000	0.000	2.050	0.0404	*
female	0.035	0.006	5.846	0.0000	***
asian	0.022	0.012	1.811	0.0701	.
hispan	-0.023	0.010	-2.351	0.0188	*
black	-0.105	0.012	-9.083	0.0000	***
motheduc	0.009	0.002	4.997	0.0000	***
fatheduc	0.016	0.002	9.246	0.0000	***
lfaminc	0.034	0.004	7.919	0.0000	***
Signif. codes: 0 <= '*' < 0.001 < '' < 0.01 < '*' < 0.05

Residual standard error: 0.1779 on 3574 degrees of freedom
Multiple R-squared: 0.1614, Adjusted R-squared: 0.1595
F-statistic: 85.99 on 3574 and 8 DF, p-value: 0.0000

library(coefplot)
coefplot(model, soft="magnitude", intercept=FALSE)

library(car)

Loading required package: carData


Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some

The following object is masked from 'package:psych':

    logit

vif(model)

      id   female    asian   hispan    black motheduc fatheduc  lfaminc 
1.007639 1.006261 1.028496 1.094507 1.060495 1.598620 1.676037 1.343525

quantile(round(residuals(model, type = "deviance"),2))

   0%   25%   50%   75%  100% 
-0.60 -0.12  0.03  0.13  0.43

plot(model)

library(broom)
train2 <- augment(model, data=train)  

p <- ggplot(train2, mapping = aes(y=.fitted, x=read12))
p  + geom_point()

p <- ggplot(train2, mapping = aes(y=.resid, x=.fitted))
p  + geom_point()

pred <- predict(model, newdata = test, interval="predict")  #score test dataset wtih model
test_w_predict <- cbind(test, pred) 

test_w_predict %>%
  head(10) %>%
    select(id, read12) %>% 
  as_flextable(show_coltype = FALSE)

id	read12
124,902	61.4
124,916	59.3
124,944	57.6
124,947	52.5
124,972	42.1
124,990	64.0
124,992	38.5
175,585	60.3
180,607	63.6
180,660	52.6
n: 10

plot(test_w_predict$read12, test_w_predict$fit, pch=16, cex=1)

library(Metrics)


Attaching package: 'Metrics'

The following objects are masked from 'package:caret':

    precision, recall

metric_label <- c("MAE","RMSE", "MAPE")
metrics <- c(round(mae(test_w_predict$read12, test_w_predict$fit),4),
                         round(rmse(test_w_predict$read12, test_w_predict$fit),4),
                         round(mape(test_w_predict$read12, test_w_predict$fit),4))
pmtable <- data.frame(Metric=metric_label, Value = metrics)
flextable(pmtable)

Metric	Value
MAE	47.6012
RMSE	48.5145
MAPE	0.9210

pip <- lm(log(read12) ~ id, data=train)
summary(pip)


Call:
lm(formula = log(read12) ~ id, data = train)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.54595 -0.13452  0.04246  0.15703  0.31191 

Coefficients:
                  Estimate     Std. Error t value             Pr(>|t|)    
(Intercept) 3.906049100011 0.006403335009 610.002 < 0.0000000000000002 ***
id          0.000000004044 0.000000001190   3.398             0.000686 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1937 on 3581 degrees of freedom
Multiple R-squared:  0.003214,  Adjusted R-squared:  0.002936 
F-statistic: 11.55 on 1 and 3581 DF,  p-value: 0.0006858

library(ggplot2)
p <- ggplot(data=train, mapping=aes(y=log(read12), x=id))

p +  geom_point(alpha = 0.2) + 
       geom_smooth(method = "lm", aes(color = "OLS", fill = "OLS"))

`geom_smooth()` using formula = 'y ~ x'

p + geom_point(alpha=0.1) +
    geom_smooth(color = "tomato", fill="tomato", method = MASS::rlm) +
    geom_smooth(color = "steelblue", fill="steelblue", method = "lm")

`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

p + geom_point(alpha=0.1) +
    geom_smooth(color = "tomato", method = "lm", linewidth = 1.2, 
                formula = y ~ splines::bs(x, 3), se = FALSE)

p + geom_point(alpha=0.1) +
    geom_quantile(color = "tomato", size = 1.2, method = "rqss",
                  lambda = 1, quantiles = c(0.20, 0.5, 0.85))

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

Smoothing formula not specified. Using: y ~ qss(x, lambda = 1)

Reflection

After viewing the data set and interpreting the results, we can really see the difference in the factors of the Catholic_Viz data. This data shows us many numerical data on if certain types of people are Catholic or not. We can see the amount of people that are Catholic from every race. We used “read12” as the dependent variable to split the data set into train and test categories. We created different models that walk through the performance of each variable. Each model helps us understand the data more and more./