options (scipen= 999 )
library (socviz)
load ("Catholic_viz.RData" )
head (Catholic_viz, n= 20 )
id read12 math12 female asian hispan black motheduc fatheduc lfaminc
1 124902 61.41 49.77 0 0 0 0 14 12.0 10.308950
2 124915 58.34 59.84 0 0 0 0 14 14.0 10.308950
3 124916 59.33 50.38 1 0 0 0 14 11.0 10.308950
4 124932 49.59 45.03 1 0 0 0 12 14.0 10.308950
5 124944 57.62 54.26 1 0 0 0 12 12.0 10.657260
6 124947 52.53 56.73 1 0 0 0 12 11.0 11.042920
7 124966 60.12 55.37 1 0 0 0 14 14.0 9.433484
8 124968 64.69 65.07 0 0 0 0 14 14.0 10.308950
9 124972 42.09 53.98 0 0 0 0 14 14.0 10.308950
10 124974 60.23 62.72 0 0 0 0 14 12.0 9.769957
11 124981 53.03 46.86 0 0 0 0 12 14.0 10.308950
12 124985 58.25 52.05 1 0 0 0 12 11.5 10.657260
13 124990 63.95 63.86 1 0 0 0 14 12.0 10.308950
14 124992 38.51 61.41 1 0 0 0 14 11.0 9.076809
15 124998 43.15 45.54 1 0 0 0 12 12.0 9.769957
16 175585 60.28 62.50 1 0 0 0 16 14.0 10.308950
17 180607 63.57 58.77 0 0 0 0 12 12.0 9.769957
18 180608 61.73 59.25 0 0 0 0 14 16.0 10.308950
19 180610 55.40 63.43 0 0 0 0 12 16.0 10.657260
20 180625 45.79 51.51 0 0 0 0 12 14.0 10.657260
hsgrad cathhs parcath
1 1 0 1
2 1 0 1
3 1 0 1
4 1 0 1
5 1 0 1
6 1 0 1
7 1 0 1
8 1 0 1
9 1 0 1
10 1 0 1
11 1 0 1
12 1 0 1
13 1 0 1
14 1 0 1
15 1 0 1
16 1 0 1
17 1 1 1
18 1 1 0
19 1 1 0
20 1 1 1
Loading required package: ggplot2
Loading required package: lattice
set.seed (1 )
index <- createDataPartition (Catholic_viz$ read12, p= 0.6 , list= FALSE )
train <- Catholic_viz[index,]
test <- Catholic_viz[- index,]
Training set number of observations: 3583
Test data set number of observations: 2387
Attaching package: 'psych'
The following objects are masked from 'package:ggplot2':
%+%, alpha
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.3 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ lubridate 1.9.2 ✔ tibble 3.2.1
✔ purrr 1.0.2 ✔ tidyr 1.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ psych::%+%() masks ggplot2::%+%()
✖ psych::alpha() masks ggplot2::alpha()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ purrr::lift() masks caret::lift()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Attaching package: 'flextable'
The following object is masked from 'package:purrr':
compose
psych:: describe (train, fast= TRUE )
vars n mean sd min max range
id 1 3583 4642399.08 2719725.86 124915.00 7979086.00 7854171.00
read12 2 3583 51.56 9.39 29.15 68.09 38.94
math12 3 3583 52.00 9.59 30.14 71.37 41.23
female 4 3583 0.53 0.50 0.00 1.00 1.00
asian 5 3583 0.06 0.25 0.00 1.00 1.00
hispan 6 3583 0.11 0.31 0.00 1.00 1.00
black 7 3583 0.08 0.26 0.00 1.00 1.00
motheduc 8 3583 13.24 2.03 8.00 18.00 10.00
fatheduc 9 3583 13.55 2.26 8.00 18.00 10.00
lfaminc 10 3583 10.31 0.80 6.21 12.35 6.13
hsgrad 11 3583 0.93 0.26 0.00 1.00 1.00
cathhs 12 3583 0.07 0.26 0.00 1.00 1.00
parcath 13 3583 0.36 0.48 0.00 1.00 1.00
se
id 45436.17
read12 0.16
math12 0.16
female 0.01
asian 0.00
hispan 0.01
black 0.00
motheduc 0.03
fatheduc 0.04
lfaminc 0.01
hsgrad 0.00
cathhs 0.00
parcath 0.01
data_long <- train %>%
select (where (is.numeric)) %>%
pivot_longer (cols = everything (), names_to = "Variable" , values_to = "Value" )
ggplot (data_long, aes (x = Variable, y = Value)) +
geom_boxplot () +
coord_flip () +
facet_wrap (~ Variable, scales = "free" , ncol= 3 ) +
theme_minimal () +
theme (axis.text.x = element_blank ()) +
labs (title = "Horizontal Boxplot for Each Numeric Variable" )
library (DataExplorer)
plot_boxplot (train, by= "read12" )
plot_scatterplot (test, by= "read12" )
plot_correlation (train, type= "continuous" )
model <- lm (log (read12) ~ id + female + asian + hispan + black + motheduc +
fatheduc + lfaminc, data= train)
model %>% as_flextable
Estimate
Standard Error
t value
Pr(>|t|)
(Intercept)
3.220
0.041
79.478
0.0000
***
id
0.000
0.000
2.050
0.0404
*
female
0.035
0.006
5.846
0.0000
***
asian
0.022
0.012
1.811
0.0701
.
hispan
-0.023
0.010
-2.351
0.0188
*
black
-0.105
0.012
-9.083
0.0000
***
motheduc
0.009
0.002
4.997
0.0000
***
fatheduc
0.016
0.002
9.246
0.0000
***
lfaminc
0.034
0.004
7.919
0.0000
***
Signif. codes: 0 <= '***' < 0.001 < '**' < 0.01 < '*' < 0.05
Residual standard error: 0.1779 on 3574 degrees of freedom
Multiple R-squared: 0.1614, Adjusted R-squared: 0.1595
F-statistic: 85.99 on 3574 and 8 DF, p-value: 0.0000
library (coefplot)
coefplot (model, soft= "magnitude" , intercept= FALSE )
Loading required package: carData
The following object is masked from 'package:dplyr':
recode
The following object is masked from 'package:purrr':
some
The following object is masked from 'package:psych':
logit
id female asian hispan black motheduc fatheduc lfaminc
1.007639 1.006261 1.028496 1.094507 1.060495 1.598620 1.676037 1.343525
quantile (round (residuals (model, type = "deviance" ),2 ))
0% 25% 50% 75% 100%
-0.60 -0.12 0.03 0.13 0.43
library (broom)
train2 <- augment (model, data= train)
p <- ggplot (train2, mapping = aes (y= .fitted, x= read12))
p + geom_point ()
p <- ggplot (train2, mapping = aes (y= .resid, x= .fitted))
p + geom_point ()
pred <- predict (model, newdata = test, interval= "predict" ) #score test dataset wtih model
test_w_predict <- cbind (test, pred)
test_w_predict %>%
head (10 ) %>%
select (id, read12) %>%
as_flextable (show_coltype = FALSE )
id
read12
124,902
61.4
124,916
59.3
124,944
57.6
124,947
52.5
124,972
42.1
124,990
64.0
124,992
38.5
175,585
60.3
180,607
63.6
180,660
52.6
n: 10
plot (test_w_predict$ read12, test_w_predict$ fit, pch= 16 , cex= 1 )
Attaching package: 'Metrics'
The following objects are masked from 'package:caret':
precision, recall
metric_label <- c ("MAE" ,"RMSE" , "MAPE" )
metrics <- c (round (mae (test_w_predict$ read12, test_w_predict$ fit),4 ),
round (rmse (test_w_predict$ read12, test_w_predict$ fit),4 ),
round (mape (test_w_predict$ read12, test_w_predict$ fit),4 ))
pmtable <- data.frame (Metric= metric_label, Value = metrics)
flextable (pmtable)
Metric
Value
MAE
47.6012
RMSE
48.5145
MAPE
0.9210
pip <- lm (log (read12) ~ id, data= train)
summary (pip)
Call:
lm(formula = log(read12) ~ id, data = train)
Residuals:
Min 1Q Median 3Q Max
-0.54595 -0.13452 0.04246 0.15703 0.31191
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.906049100011 0.006403335009 610.002 < 0.0000000000000002 ***
id 0.000000004044 0.000000001190 3.398 0.000686 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.1937 on 3581 degrees of freedom
Multiple R-squared: 0.003214, Adjusted R-squared: 0.002936
F-statistic: 11.55 on 1 and 3581 DF, p-value: 0.0006858
library (ggplot2)
p <- ggplot (data= train, mapping= aes (y= log (read12), x= id))
p + geom_point (alpha = 0.2 ) +
geom_smooth (method = "lm" , aes (color = "OLS" , fill = "OLS" ))
`geom_smooth()` using formula = 'y ~ x'
p + geom_point (alpha= 0.1 ) +
geom_smooth (color = "tomato" , fill= "tomato" , method = MASS:: rlm) +
geom_smooth (color = "steelblue" , fill= "steelblue" , method = "lm" )
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
p + geom_point (alpha= 0.1 ) +
geom_smooth (color = "tomato" , method = "lm" , linewidth = 1.2 ,
formula = y ~ splines:: bs (x, 3 ), se = FALSE )
p + geom_point (alpha= 0.1 ) +
geom_quantile (color = "tomato" , size = 1.2 , method = "rqss" ,
lambda = 1 , quantiles = c (0.20 , 0.5 , 0.85 ))
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
Smoothing formula not specified. Using: y ~ qss(x, lambda = 1)
Reflection
After viewing the data set and interpreting the results, we can really see the difference in the factors of the Catholic_Viz data. This data shows us many numerical data on if certain types of people are Catholic or not. We can see the amount of people that are Catholic from every race. We used “read12” as the dependent variable to split the data set into train and test categories. We created different models that walk through the performance of each variable. Each model helps us understand the data more and more./