# List of packages
packages <- c("tidyverse", "fst", "modelsummary", "broom", "sjPlot", "ggplot2", "car", "Lock5Data", "pandoc", "mosaic") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
## backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
##
## Revert to `kableExtra` for one session:
##
## options(modelsummary_factory_default = 'kableExtra')
## options(modelsummary_factory_latex = 'kableExtra')
## options(modelsummary_factory_html = 'kableExtra')
##
## Silence this message forever:
##
## config_modelsummary(startup_message = FALSE)
##
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
##
## Loading required package: carData
##
##
## Attaching package: 'car'
##
##
## The following object is masked from 'package:dplyr':
##
## recode
##
##
## The following object is masked from 'package:purrr':
##
## some
##
##
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
##
## Attaching package: 'mosaic'
##
##
## The following object is masked from 'package:Matrix':
##
## mean
##
##
## The following objects are masked from 'package:car':
##
## deltaMethod, logit
##
##
## The following object is masked from 'package:modelsummary':
##
## msummary
##
##
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
##
##
## The following object is masked from 'package:purrr':
##
## cross
##
##
## The following object is masked from 'package:ggplot2':
##
## stat
##
##
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
##
##
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "fst" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "broom" "modelsummary" "fst" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "sjPlot" "broom" "modelsummary" "fst" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
##
## [[6]]
## [1] "sjPlot" "broom" "modelsummary" "fst" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
##
## [[7]]
## [1] "car" "carData" "sjPlot" "broom" "modelsummary"
## [6] "fst" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[8]]
## [1] "Lock5Data" "car" "carData" "sjPlot" "broom"
## [6] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
##
## [[9]]
## [1] "pandoc" "Lock5Data" "car" "carData" "sjPlot"
## [6] "broom" "modelsummary" "fst" "lubridate" "forcats"
## [11] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [16] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [21] "grDevices" "utils" "datasets" "methods" "base"
##
## [[10]]
## [1] "mosaic" "mosaicData" "ggformula" "Matrix" "lattice"
## [6] "pandoc" "Lock5Data" "car" "carData" "sjPlot"
## [11] "broom" "modelsummary" "fst" "lubridate" "forcats"
## [16] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [21] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [26] "grDevices" "utils" "datasets" "methods" "base"
# Load the data
sat_gpa <- read_csv("sat_gpa.csv")
## Rows: 1000 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): sex
## dbl (5): sat_verbal, sat_math, sat_total, gpa_hs, gpa_fy
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Calculate and interpret the correlation between SAT math scores (sat_math) and freshman GPA (gpa_fy). Next, visualize this relationship with a scatterplot, including a regression line.
# Calculate the relationship between math SAT scores and freshman GPA
cor(sat_gpa$sat_math, sat_gpa$gpa_fy)
## [1] 0.3871178
Interpretation: The value is at an approximate of 0.38, showing a moderate and positive relationship.
## Visualizing it
ggplot(sat_gpa, aes(x = sat_math, y = gpa_fy)) +
geom_point(size = 0.5) +
geom_smooth(method = "lm", se = FALSE) +
labs(x = "SAT Math Score", y = "Freshman GPA")
## `geom_smooth()` using formula = 'y ~ x'
Create regression tables to analyze the relationship between SAT scores (total and verbal) and freshman GPA, using both the modelsummary and sjPlot packages. Interpret.
m1 <- lm(gpa_fy ~ sat_total, data = sat_gpa)
m2 <- lm(gpa_fy ~ sat_verbal, data = sat_gpa)
m3 <- lm(gpa_fy ~ sat_total + sat_verbal, data = sat_gpa)
models <- list(m1, m2, m3)
modelsummary(models)
| (1) | (2) | (3) | |
|---|---|---|---|
| (Intercept) | 0.002 | 0.701 | 0.007 |
| (0.152) | (0.129) | (0.152) | |
| sat_total | 0.024 | 0.022 | |
| (0.001) | (0.003) | ||
| sat_verbal | 0.036 | 0.003 | |
| (0.003) | (0.005) | ||
| Num.Obs. | 1000 | 1000 | 1000 |
| R2 | 0.212 | 0.161 | 0.212 |
| R2 Adj. | 0.211 | 0.160 | 0.211 |
| AIC | 2004.8 | 2067.2 | 2006.4 |
| BIC | 2019.5 | 2081.9 | 2026.0 |
| Log.Lik. | -999.382 | -1030.580 | -999.189 |
| RMSE | 0.66 | 0.68 | 0.66 |
tab_model(m1, m2, m3)
| Â | gpa_fy | gpa_fy | gpa_fy | ||||||
|---|---|---|---|---|---|---|---|---|---|
| Predictors | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p |
| (Intercept) | 0.00 | -0.30 – 0.30 | 0.990 | 0.70 | 0.45 – 0.95 | <0.001 | 0.01 | -0.29 – 0.31 | 0.961 |
| sat total | 0.02 | 0.02 – 0.03 | <0.001 | 0.02 | 0.02 – 0.03 | <0.001 | |||
| sat verbal | 0.04 | 0.03 – 0.04 | <0.001 | 0.00 | -0.01 – 0.01 | 0.536 | |||
| Observations | 1000 | 1000 | 1000 | ||||||
| R2 / R2 adjusted | 0.212 / 0.211 | 0.161 / 0.160 | 0.212 / 0.211 | ||||||
Interpretations:
m1 (SAT total and freshman GPA): First, we have an intercept of 0.00 and a slope of 0.02, indicating that for every 1 unit increase in SAT score freshman GPA also increased by 0.02 units. That means this is a moderate and positive relationship. We then have a p-value of 0.990, indicating a non-significant relationship statistically (because 0.990>0.05). Lastly, the R square value equals 0.212 and the adjusted value of R square equals 0.211, meaning that SAT scores explain abput 21.1% of variability of freshman GPA.
m2 (SAT verbal and freshman GPA): First, we have an intercept of 0.70 and a slope of 0.04, indicating that for every 1 unit increase in SAT verbal score freshman GPA also increased by 0.04 units. That means this is a moderate and positive relationship. We then have a p-value of <0.001, indicating a significant relationship statistically (because <0.001<0.05). Lastly, the R square value equals 0.161 and the adjusted value of R square equals 0.160, meaning that SAT scores explain about 16% of variability of freshman GPA.
m3 (SAT total and verbal scores and freshman GPA):First, we have an intercept of 0.01 and a slope of 0.00, indicating that there is no relationship. We then have a p-value of 0.961, indicating a non-significant relationship statistically (because 0.961>0.05). Lastly, the R square value equals 0.212 and the adjusted value of R square equals 0.211, meaning that SAT verbal and total scores explain about 21.1% of variability of freshman GPA.
Explore the relationship between Happy Life Years (HLY) and GDP per Capita using the HappyPlanetIndex dataset. Visualize the coefficients using both the modelsummary and Sjplot packages.
data("HappyPlanetIndex")
World <- HappyPlanetIndex
plot(World$GDPperCapita, World$HLY, xlab = "GDP per Capita", ylab = "Happy Life Years")
m1 <- lm(HLY ~ GDPperCapita, data = World)
modelsummary(m1)
| (1) | |
|---|---|
| (Intercept) | 31.182 |
| (1.114) | |
| GDPperCapita | 0.001 |
| (0.000) | |
| Num.Obs. | 141 |
| R2 | 0.566 |
| R2 Adj. | 0.563 |
| AIC | 1043.2 |
| BIC | 1052.0 |
| Log.Lik. | -518.576 |
| RMSE | 9.57 |
tab_model(m1)
| Â | HLY | ||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 31.18 | 28.98 – 33.38 | <0.001 |
| GDPperCapita | 0.00 | 0.00 – 0.00 | <0.001 |
| Observations | 141 | ||
| R2 / R2 adjusted | 0.566 / 0.563 | ||
Visualize the coefficients of the models specified below using both modelplot from the modelsummary package and plot_model from the sjPlot package. Fit three models to predict happiness (Happiness) using different predictors:
Model 1: Life Expectancy (LifeExpectancy)
Model 2: Ecological Footprint (Footprint)
Model 3: GDP per Capita (GDPperCapita), Human Development Index (HDI), and Population (Population)
Now, let’s visualize with modelplot():
m1 <- lm(Happiness ~ LifeExpectancy, data = World)
m2 <- lm(Happiness ~ Footprint, data = World)
m3 <- lm(Happiness ~ GDPperCapita + HDI + Population, data = World)
Regression tables for the models using modelsummary:
models_world <- list(m1, m2, m3)
modelsummary(models_world)
| (1) | (2) | (3) | |
|---|---|---|---|
| (Intercept) | -1.104 | 4.713 | 1.528 |
| (0.399) | (0.150) | (0.357) | |
| LifeExpectancy | 0.104 | ||
| (0.006) | |||
| Footprint | 0.419 | ||
| (0.042) | |||
| GDPperCapita | 0.000 | ||
| (0.000) | |||
| HDI | 5.778 | ||
| (0.577) | |||
| Population | 0.001 | ||
| (0.000) | |||
| Num.Obs. | 143 | 143 | 141 |
| R2 | 0.693 | 0.414 | 0.707 |
| R2 Adj. | 0.691 | 0.410 | 0.700 |
| AIC | 332.7 | 425.0 | 327.6 |
| BIC | 341.6 | 433.9 | 342.3 |
| Log.Lik. | -163.359 | -209.524 | -158.779 |
| RMSE | 0.76 | 1.05 | 0.75 |
sjPlot:
tab_model(m1, m2, m3)
| Â | Happiness | Happiness | Happiness | ||||||
|---|---|---|---|---|---|---|---|---|---|
| Predictors | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p |
| (Intercept) | -1.10 | -1.89 – -0.32 | 0.006 | 4.71 | 4.42 – 5.01 | <0.001 | 1.53 | 0.82 – 2.23 | <0.001 |
| LifeExpectancy | 0.10 | 0.09 – 0.11 | <0.001 | ||||||
| Footprint | 0.42 | 0.34 – 0.50 | <0.001 | ||||||
| GDPperCapita | 0.00 | -0.00 – 0.00 | 0.100 | ||||||
| HDI | 5.78 | 4.64 – 6.92 | <0.001 | ||||||
| Population | 0.00 | -0.00 – 0.00 | 0.247 | ||||||
| Observations | 143 | 143 | 141 | ||||||
| R2 / R2 adjusted | 0.693 / 0.691 | 0.414 / 0.410 | 0.707 / 0.700 | ||||||
Improved presentation of the regression table with modelsummary:
# Display the models summary table
modelsummary(models_world,
statistic = "conf.int",
coef_map = c("LifeExpectancy" = "Life Expectancy",
"Footprint" = "Ecological Footprint",
"GDPperCapita" = "GDP per Capita",
"HDI" = "Human Development Index",
"Population" = "Population (millions)"),
gof_map = c("nobs", "r.squared", "adj.r.squared", "aic", "bic"))
| (1) | (2) | (3) | |
|---|---|---|---|
| Life Expectancy | 0.104 | ||
| [0.092, 0.115] | |||
| Ecological Footprint | 0.419 | ||
| [0.336, 0.502] | |||
| GDP per Capita | 0.000 | ||
| [0.000, 0.000] | |||
| Human Development Index | 5.778 | ||
| [4.636, 6.920] | |||
| Population (millions) | 0.001 | ||
| [0.000, 0.001] | |||
| Num.Obs. | 143 | 143 | 141 |
| R2 | 0.693 | 0.414 | 0.707 |
| R2 Adj. | 0.691 | 0.410 | 0.700 |
| AIC | 332.7 | 425.0 | 327.6 |
| BIC | 341.6 | 433.9 | 342.3 |
# Plot model coefficients
modelplot(models_world, coef_omit = "Intercept") +
labs(x = 'Coefficient Estimate',
y = 'Term',
title = 'Model Coefficients with Confidence Intervals',
caption = 'Comparison of Models 1, 2, 3') +
theme_minimal()
and we can customize the plot:
# Customize the plot with ggplot2
modelplot(models_world, coef_omit = "Intercept") +
geom_point(aes(x = estimate, y = term, color = model, shape = model), size = 3) +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high, y = term, color = model), height = 0.2) +
scale_color_brewer(palette = "Dark2") +
labs(x = 'Coefficient Estimate',
y = 'Term',
title = 'Model Coefficients with Confidence Intervals',
caption = 'Comparison of Models 1, 2, and 3') +
theme_minimal() +
theme(legend.position = "bottom")
Other plotting options:
# Facet the plot by model
modelplot(models_world, coef_omit = "Intercept") +
geom_point(aes(x = estimate, y = term, color = model, shape = model), size = 3) +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high, y = term, color = model), height = 0.2) +
scale_color_brewer(palette = "Dark2") +
labs(x = 'Coefficient Estimate',
y = 'Term',
title = 'Model Coefficients with Confidence Intervals',
caption = 'Comparison of Models 1, 2, and 3') +
theme_minimal() +
theme(legend.position = "bottom") +
facet_wrap(~ model, scales = "free_x")
sj plot and standardized coefficient:
# Plot standardized coefficients using Gelman's approach
plot_model(m3,
type = "std2", # Gelman's standardization
show.values = TRUE,
show.p = FALSE,
ci.lvl = 0.95,
title = "Standardized Coefficients using Gelman Approach",
m.labels = c("Model 1: Happiness ~ LifeExpectancy",
"Model 2: Happiness ~ Footprint",
"Model 3: Happiness ~ GDPperCapita + HDI + Population")) +
theme_minimal() +
theme(legend.position = "bottom")
Predict murder rates using both internet penetration and GDP, from the Violence dataset. Create regression tables for the model using both the modelsummary and sjPlot packages. Interpret.
# Load the data
load("Violence.RData")
data("Violence")
## Warning in data("Violence"): data set 'Violence' not found
nrow(Violence)
## [1] 8
ncol(Violence)
## [1] 19
mean(Violence$MurderRate)
## [1] 22.55
Interpretation: the intercept, which is at a 37.85 value represents murder rates if GDP is at 0. This example is hard to find in real life, but serves as a baseline. The slope which is at a -0.0007 value, which indicates that murder rates actually decrease by this much for per 1% increase in GDP. The t-value is a -2.51 with an associated p-value of 0.0455. Therefore, we conclude that there is a statistically significant relationship between GDP and murder rates at a 5% significance level (because 0.0455<0.05). The residuals (differences between predicted vs. observed values) tange from -29.34 to 25.20, indicating a lot of variability within the model. The R square value is at a 51.33%, meaning approximately 51% of murder rates variability is explained by GDP. The adjusted R square value being slightly lower, is at a 43.22%, adjusting the number of predictors.
Interpretation: the intercept, which is at a 38.26 value represents murder rates if internet penetration is at 0. This example is hard to find in real life, but serves as a baseline. The slope which is at a -0.3668 value, which indicates that murder rates actually decrease by this much for per 1% increase in internet penetration. The t-value is a -1.305 with an associated p-value of 0.2397. Therefore, we conclude that there is no statistically significant relationship between internet penetration and murder rates at a 5% significance level (because 0.2397>0.05). The residuals (differences between predicted vs. observed values) tange from -23.81 to 41.755, indicating a lot of variability within the model. The R square value is at a 22.11%, meaning approximately 22% of murder rates variability is explained by GDP. The adjusted R square value being much lower, is at a 9.12%, adjusting the number of predictors.
model <- lm(MurderRate ~ Internet + GDP, data = Violence)
summary(model)
##
## Call:
## lm(formula = MurderRate ~ Internet + GDP, data = Violence)
##
## Residuals:
## Austria Belgium Guatemala Jamaica
## -2.507 -2.822 14.174 10.453
## Dominican Republic South Africa Ukraine United States
## -6.930 13.338 -29.231 3.526
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.9842977 11.9299808 2.430 0.0594 .
## Internet 0.4629043 0.4385000 1.056 0.3394
## GDP -0.0013199 0.0006033 -2.188 0.0803 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.84 on 5 degrees of freedom
## Multiple R-squared: 0.602, Adjusted R-squared: 0.4429
## F-statistic: 3.782 on 2 and 5 DF, p-value: 0.09991
Interpretations:
We have an intercept of 28.98, meaning this is the rate of expected murders if both internet penetration and GDP equal 0. This cannot happen in real life, but serves as a baseline.
Internet penetration: The slope equals 0.4629, meaning per one percent increase in internet penetration, expected murder rates are also expected to increase by this amount. The t value equals 1.056 and has an associated p value is at a 0.339, indicating a non significant relationship statistically (because 0.339>0.05) at a 5% significance level.
GDP: The slope equals -0.0013, meaning per one percent increase in GDP, expected murder rates decrease by this amount. The t value equals -2.18 and has an associated p value is at a 0.080, indicating a non significant relationship statistically (because 0.080>0.05) at a 5% significance level.
The residuals range from -29.231 to 14.174, indicating a lot of variability within the model. R square is at a 0.602 and the adjusted R square at a 0.44, suggesting that 60.2% of murder rates can be explained by internet penetration and GDP.
tab_model(model)
| Â | MurderRate | ||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 28.98 | -1.68 – 59.65 | 0.059 |
| Internet | 0.46 | -0.66 – 1.59 | 0.339 |
| GDP | -0.00 | -0.00 – 0.00 | 0.080 |
| Observations | 8 | ||
| R2 / R2 adjusted | 0.602 / 0.443 | ||
plot_model(model, show.values = TRUE, show.p = TRUE)
predictions <- predict(model)
Violence$PredictedMurderRate <- predictions
Violence$PredictedMurderRate <- predict(model)
ggplot(Violence, aes(x = PredictedMurderRate, y = MurderRate)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
labs(title = "Actual vs. Predicted Murder Rates",
x = "Predicted Murder Rate",
y = "Actual Murder Rate") +
theme_minimal()