# List of packages
packages <- c("tidyverse", "fst", "modelsummary", "broom", "sjPlot", "ggplot2", "car", "Lock5Data", "pandoc", "mosaic") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
## backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
##
## Revert to `kableExtra` for one session:
##
## options(modelsummary_factory_default = 'kableExtra')
## options(modelsummary_factory_latex = 'kableExtra')
## options(modelsummary_factory_html = 'kableExtra')
##
## Silence this message forever:
##
## config_modelsummary(startup_message = FALSE)
##
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
##
## Loading required package: carData
##
##
## Attaching package: 'car'
##
##
## The following object is masked from 'package:dplyr':
##
## recode
##
##
## The following object is masked from 'package:purrr':
##
## some
##
##
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
##
## Attaching package: 'mosaic'
##
##
## The following object is masked from 'package:Matrix':
##
## mean
##
##
## The following objects are masked from 'package:car':
##
## deltaMethod, logit
##
##
## The following object is masked from 'package:modelsummary':
##
## msummary
##
##
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
##
##
## The following object is masked from 'package:purrr':
##
## cross
##
##
## The following object is masked from 'package:ggplot2':
##
## stat
##
##
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
##
##
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "fst" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "broom" "modelsummary" "fst" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "sjPlot" "broom" "modelsummary" "fst" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
##
## [[6]]
## [1] "sjPlot" "broom" "modelsummary" "fst" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
##
## [[7]]
## [1] "car" "carData" "sjPlot" "broom" "modelsummary"
## [6] "fst" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[8]]
## [1] "Lock5Data" "car" "carData" "sjPlot" "broom"
## [6] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
##
## [[9]]
## [1] "pandoc" "Lock5Data" "car" "carData" "sjPlot"
## [6] "broom" "modelsummary" "fst" "lubridate" "forcats"
## [11] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [16] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [21] "grDevices" "utils" "datasets" "methods" "base"
##
## [[10]]
## [1] "mosaic" "mosaicData" "ggformula" "Matrix" "lattice"
## [6] "pandoc" "Lock5Data" "car" "carData" "sjPlot"
## [11] "broom" "modelsummary" "fst" "lubridate" "forcats"
## [16] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [21] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [26] "grDevices" "utils" "datasets" "methods" "base"
library(modelsummary)
library(tidyverse)
library(dplyr)
library(sjPlot)
library(tidyverse)
library(ggplot2)
library(pandoc)
library(readr)
sat_gpa <- read_csv("Desktop/sat_gpa.csv")
## Rows: 1000 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): sex
## dbl (5): sat_verbal, sat_math, sat_total, gpa_hs, gpa_fy
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cor(sat_gpa$gpa_fy, sat_gpa$sat_math)
## [1] 0.3871178
The output shows a value of 0.38, indicating a weak to moderate positive relationship between SAT Math results and Freshman GPA.
ggplot(sat_gpa, aes(x = sat_math, y = gpa_fy)) +
geom_point(size = 0.5) +
geom_smooth(method = "lm", se = FALSE) +
labs(x = "SAT Math Score", y = "Freshman GPA")
## `geom_smooth()` using formula = 'y ~ x'
m1 <- lm(gpa_fy ~ sat_total, data = sat_gpa)
m2 <- lm(gpa_fy ~ sat_math, data = sat_gpa)
models_sat_gpa <- list(m1, m2)
modelsummary(models_sat_gpa)
| (1) | (2) | |
|---|---|---|
| (Intercept) | 0.002 | 0.622 |
| (0.152) | (0.141) | |
| sat_total | 0.024 | |
| (0.001) | ||
| sat_math | 0.034 | |
| (0.003) | ||
| Num.Obs. | 1000 | 1000 |
| R2 | 0.212 | 0.150 |
| R2 Adj. | 0.211 | 0.149 |
| AIC | 2004.8 | 2080.5 |
| BIC | 2019.5 | 2095.2 |
| Log.Lik. | -999.382 | -1037.243 |
| RMSE | 0.66 | 0.68 |
tab_model(m1, m2)
| gpa_fy | gpa_fy | |||||
|---|---|---|---|---|---|---|
| Predictors | Estimates | CI | p | Estimates | CI | p |
| (Intercept) | 0.00 | -0.30 – 0.30 | 0.990 | 0.62 | 0.35 – 0.90 | <0.001 |
| sat total | 0.02 | 0.02 – 0.03 | <0.001 | |||
| sat math | 0.03 | 0.03 – 0.04 | <0.001 | |||
| Observations | 1000 | 1000 | ||||
| R2 / R2 adjusted | 0.212 / 0.211 | 0.150 / 0.149 | ||||
In Model 1, the intercept is 0.002 with a 95% confidence interval (CI) ranging from -0.30 to 0.30 and a p-value of 0.990, indicating that it is not statistically significant. The coefficient for the SAT total score is 0.024 with a 95% CI of 0.02 to 0.03 and a p-value of less than 0.001, indicating strong statistical significance. This suggests that for each one-point increase in the SAT total score, the freshman year GPA is expected to increase by 0.024 points, holding all other factors constant. The R² value is 0.212, indicating that approximately 21.2% of the variance in freshman year GPA can be explained by the SAT total score.
In Model 2, the intercept is 0.622 with a 95% CI of 0.35 to 0.90 and a p-value of less than 0.001, indicating strong statistical significance. The coefficient for the SAT math score is 0.034 with a 95% CI of 0.03 to 0.04 and a p-value of less than 0.001, indicating strong statistical significance. This suggests that for each one-point increase in the SAT math score, the freshman year GPA is expected to increase by 0.034 points, holding all other factors constant. The R² value is 0.150, indicating that approximately 15.0% of the variance in freshman year GPA can be explained by the SAT math score.
data("HappyPlanetIndex")
World <- HappyPlanetIndex
m3 <- lm(HLY ~ GDPperCapita, data = World)
models_world <- list(m3)
modelsummary(m3)
| (1) | |
|---|---|
| (Intercept) | 31.182 |
| (1.114) | |
| GDPperCapita | 0.001 |
| (0.000) | |
| Num.Obs. | 141 |
| R2 | 0.566 |
| R2 Adj. | 0.563 |
| AIC | 1043.2 |
| BIC | 1052.0 |
| Log.Lik. | -518.576 |
| RMSE | 9.57 |
tab_model(m3)
| HLY | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 31.18 | 28.98 – 33.38 | <0.001 |
| GDPperCapita | 0.00 | 0.00 – 0.00 | <0.001 |
| Observations | 141 | ||
| R2 / R2 adjusted | 0.566 / 0.563 | ||
modelplot(models_world, coef_omit = "Intercept") +
geom_point(aes(x = estimate, y = term, color = model, shape = model), size = 3) +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high, y = term, color = model), height = 0.2) +
scale_color_brewer(palette = "Dark2") +
labs(x = 'Coefficient Estimate',
y = 'Term',
title = 'ModelSummary Visual on Model 3',
caption = 'Comparison of HLY and GDP') +
theme_minimal() +
theme(legend.position = "bottom")
plot_model(m3,
type = "std2", # Gelman's standardization
show.values = TRUE,
show.p = FALSE,
ci.lvl = 0.95,
title = "SjPlot Visual on Model 3",
m.labels = c("Model 3: Happy Life Years ~ GDPperCapita")) +
theme_minimal() +
theme(legend.position = "bottom")
m4 <- lm(Happiness ~ LifeExpectancy, data = World)
m5 <- lm(Happiness ~ Footprint, data = World)
m6 <- lm(Happiness ~ GDPperCapita + HDI + Population, data = World)
models_world_2 <- list(m4, m5, m6)
modelsummary(models_world_2)
| (1) | (2) | (3) | |
|---|---|---|---|
| (Intercept) | -1.104 | 4.713 | 1.528 |
| (0.399) | (0.150) | (0.357) | |
| LifeExpectancy | 0.104 | ||
| (0.006) | |||
| Footprint | 0.419 | ||
| (0.042) | |||
| GDPperCapita | 0.000 | ||
| (0.000) | |||
| HDI | 5.778 | ||
| (0.577) | |||
| Population | 0.001 | ||
| (0.000) | |||
| Num.Obs. | 143 | 143 | 141 |
| R2 | 0.693 | 0.414 | 0.707 |
| R2 Adj. | 0.691 | 0.410 | 0.700 |
| AIC | 332.7 | 425.0 | 327.6 |
| BIC | 341.6 | 433.9 | 342.3 |
| Log.Lik. | -163.359 | -209.524 | -158.779 |
| RMSE | 0.76 | 1.05 | 0.75 |
modelplot(models_world_2, coef_omit = "Intercept") +
labs(x = 'Coefficient Estimate',
y = 'Term',
title = 'Model Coefficients with Confidence Intervals',
caption = 'Comparison of Models 4, 5, and 6') +
theme_minimal()
plot_model(m6,
type = "std2",
show.values = TRUE,
show.p = FALSE,
ci.lvl = 0.55,
title = "Standardized Coefficients using Gelman Approach",
m.labels = c("Model 4: Happiness ~ LifeExpectancy",
"Model 5: Happiness ~ Footprint",
"Model 6: Happiness ~ GDPperCapita + HDI + Population")) +
theme_minimal() +
theme(legend.position = "bottom")
load("~/Downloads/Violence.RData")
list(Violence)
## [[1]]
## Country Code LandArea Population Energy Rural
## Austria Austria AUT 82450 8.337 33246 32.8
## Belgium Belgium BEL 30280 10.708 58583 2.6
## Guatemala Guatemala GUA 107160 13.686 8072 51.4
## Jamaica Jamaica JAM 10830 2.687 4387 46.7
## Dominican Republic Dominican Republic DOM 48320 9.953 8162 31.0
## South Africa South Africa RSA 1214470 48.793 134489 39.3
## Ukraine Ukraine UKR 579320 46.258 136143 32.0
## United States United States USA 9147420 304.375 2283722 18.3
## Military Health HIV Internet Developed BirthRate ElderlyPop
## Austria 2.4 15.8 0.3 72.9 2 9.3 17.0
## Belgium 2.7 14.8 0.2 70.5 2 11.7 17.2
## Guatemala 3.6 15.9 0.8 14.3 1 33.0 4.4
## Jamaica 1.6 5.7 1.7 57.3 2 16.7 7.7
## Dominican Republic 3.8 10.4 0.9 20.8 1 22.5 5.9
## South Africa 4.3 10.4 17.9 8.6 2 22.0 4.4
## Ukraine 7.2 8.6 1.1 22.5 2 11.0 15.9
## United States 18.6 18.7 0.6 75.8 2 14.3 12.6
## LifeExpectancy CO2 GDP Cell Electricity
## Austria 80.2 8.1235965 45209.396 145.99132 7944.3892
## Belgium 80.4 9.7927294 43144.343 111.71857 7903.0293
## Guatemala 70.3 0.8702226 2862.367 125.56855 548.1122
## Jamaica 71.8 4.5414469 5274.037 114.83936 1901.6175
## Dominican Republic 72.6 2.2366354 5214.537 89.57889 1358.1914
## South Africa 51.5 8.9332027 7275.344 100.76153 4532.0219
## Ukraine 68.3 6.9940481 3006.915 117.56705 3200.4655
## United States 78.4 17.9417289 47198.504 90.24406 12903.8067
## MurderRate
## Austria 0.55
## Belgium 1.85
## Guatemala 46.00
## Jamaica 59.00
## Dominican Republic 24.80
## South Africa 36.70
## Ukraine 6.20
## United States 5.30
m7 <- lm(MurderRate ~ Internet + GDP, data = Violence)
summary(m7)
##
## Call:
## lm(formula = MurderRate ~ Internet + GDP, data = Violence)
##
## Residuals:
## Austria Belgium Guatemala Jamaica
## -2.507 -2.822 14.174 10.453
## Dominican Republic South Africa Ukraine United States
## -6.930 13.338 -29.231 3.526
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.9842977 11.9299808 2.430 0.0594 .
## Internet 0.4629043 0.4385000 1.056 0.3394
## GDP -0.0013199 0.0006033 -2.188 0.0803 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.84 on 5 degrees of freedom
## Multiple R-squared: 0.602, Adjusted R-squared: 0.4429
## F-statistic: 3.782 on 2 and 5 DF, p-value: 0.09991
Intercept: The intercept in our regression model is estimated to be 28.98 with a standard error of 11.93. This value represents the expected murder rate when both internet usage and GDP are zero. While this scenario is not practically meaningful (since it’s unlikely to have zero internet usage and zero GDP), it serves as a baseline for our model. The associated p-value is 0.0594, which is slightly above the conventional significance level of 0.05, indicating that the intercept is not statistically significant but is close to being so.
Internet: The coefficient for the Internet variable is 0.46 with a standard error of 0.44. This indicates that for each one-unit increase in internet usage, the murder rate is expected to increase by approximately 0.46 units, holding GDP constant. However, the p-value for this coefficient is 0.3394, which is well above 0.05, suggesting that there is no statistically significant relationship between internet usage and the murder rate at the 5% significance level.
GDP: The coefficient for the GDP variable is -0.0013 with a standard error of 0.0006. This indicates that for each one-unit increase in GDP, the murder rate is expected to decrease by approximately 0.0013 units, holding internet usage constant. The t-value for the GDP coefficient is -2.188, and the associated p-value is 0.0803. While this p-value is above the 0.05 threshold, it is close enough to suggest that GDP may have a nearly significant effect on the murder rate.
Residuals: The residuals, which are the differences between observed and predicted murder rates, range from -29.231 (Ukraine) to 14.174 (Guatemala). These residuals indicate some variability in the fit of the model. The residual standard error is 16.84, which provides a measure of the typical size of the residuals.
R-squared: The model’s R-squared value is 0.602, meaning that approximately 60.2% of the variability in murder rates can be explained by internet usage and GDP. The adjusted R-squared value is slightly lower at 0.4429, which adjusts for the number of predictors in the model. This indicates a reasonably good fit given the small sample size and simplicity of the model.
F-statistic: The F-statistic for the model is 3.782 with a p-value of 0.09991. This p-value is above the 0.05 threshold, indicating that the model as a whole is not statistically significant at the 5% level, but it is close. This suggests that while the individual predictors are not statistically significant, there may still be some explanatory power in the model when considering internet usage and GDP together.
Overall, the model indicates that neither internet usage nor GDP is a statistically significant predictor of the murder rate at the conventional 5% significance level, though GDP is close. The model explains a moderate proportion of the variance in murder rates, but there is still considerable variability that is not accounted for, as reflected in the residuals and R-squared values.
predict(m7)
## Austria Belgium Guatemala Jamaica
## 3.057338 4.672068 31.825741 48.547421
## Dominican Republic South Africa Ukraine United States
## 31.729948 23.362419 35.430764 1.774301
model_predictions <- predict(m7)
modelsummary(m7)
| (1) | |
|---|---|
| (Intercept) | 28.984 |
| (11.930) | |
| Internet | 0.463 |
| (0.438) | |
| GDP | -0.001 |
| (0.001) | |
| Num.Obs. | 8 |
| R2 | 0.602 |
| R2 Adj. | 0.443 |
| AIC | 72.1 |
| BIC | 72.4 |
| Log.Lik. | -32.059 |
| RMSE | 13.31 |
tab_model(m7)
| MurderRate | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 28.98 | -1.68 – 59.65 | 0.059 |
| Internet | 0.46 | -0.66 – 1.59 | 0.339 |
| GDP | -0.00 | -0.00 – 0.00 | 0.080 |
| Observations | 8 | ||
| R2 / R2 adjusted | 0.602 / 0.443 | ||
In this model, we examine the relationship between internet usage, GDP, and the murder rate. The intercept estimate is 28.98 with a 95% confidence interval (CI) ranging from -1.68 to 59.65 and a p-value of 0.059, indicating it is not statistically significant at the 0.05 level but is close to significance. The coefficient for internet usage is 0.46 with a 95% CI of -0.66 to 1.59 and a p-value of 0.339, indicating that it is not statistically significant. This suggests that changes in internet usage are not reliably associated with changes in the murder rate, holding GDP constant. The coefficient for GDP is -0.00 with a 95% CI of -0.00 to 0.00 and a p-value of 0.080, indicating it is not statistically significant at the 0.05 level but is close to significance. This suggests a negligible and non-significant effect of GDP on the murder rate, holding internet usage constant. The R² value is 0.602, indicating that approximately 60.2% of the variance in the murder rate can be explained by internet usage and GDP. The adjusted R² is 0.443, suggesting that when adjusting for the number of predictors, approximately 44.3% of the variance in the murder rate can be explained by the model. Overall, this model suggests that internet usage and GDP are not significant predictors of the murder rate, given the p-values and confidence intervals.
print('I did it!')
## [1] "I did it!"