library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(tidyverse)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(readxl)
# Preview the raw workbook: the tibble is auto-printed here, not stored.
read_excel(path = "Next Data2.xls")
## # A tibble: 254 × 13
## County `Total Population` `NH-White Population` `NH-Black Population`
## <chr> <dbl> <dbl> <dbl>
## 1 Anderson 57900 32356 11503
## 2 Andrews 18791 7172 350
## 3 Angelina 87634 49650 13152
## 4 Aransas 25181 16063 402
## 5 Archer 9159 7820 43
## 6 Armstrong 1830 1549 17
## 7 Atascosa 50942 16243 483
## 8 Austin 32095 19077 2569
## 9 Bailey 6777 2087 61
## 10 Bandera 22448 16239 275
## # ℹ 244 more rows
## # ℹ 9 more variables: `Hispanic Population` <dbl>, `NH-Asian Population` <dbl>,
## # `GDP 2023` <dbl>, `Percentage Total Poverty` <dbl>,
## # `Number of Families Poverty` <dbl>, `Median Family Income $` <chr>,
## # `Total Crime` <dbl>, `Unemployment Rate` <dbl>,
## # `Percentage With Bachelor's Degree` <dbl>
# Load the workbook into memory for the rest of the analysis.
capstone_data <- read_excel(path = "Next Data2.xls")
# Pearson correlation with cor()'s default NA handling; any missing value in
# either column makes the result NA.
with(capstone_data, cor(`GDP 2023`, `Total Crime`, method = "pearson"))
## [1] NA
# Same Pearson correlation, restricted to rows where both values are present.
cor(
  x = capstone_data[["GDP 2023"]],
  y = capstone_data[["Total Crime"]],
  use = "complete.obs",
  method = "pearson"
)
## [1] 0.6984111
# Keep only the rows that have both GDP and crime figures, so later steps do
# not need per-call NA handling for these two columns.
clean_data <- drop_na(capstone_data, `GDP 2023`, `Total Crime`)
# Default (Pearson) correlation on the cleaned rows.
with(clean_data, cor(`GDP 2023`, `Total Crime`))
## [1] 0.6984111
# Scatter plot of GDP (x-axis) against total crime (y-axis), one point per row
# of clean_data. Explicit labels replace the default axis text, which would
# otherwise show the raw `clean_data$...` expressions.
plot(
  clean_data$`GDP 2023`, clean_data$`Total Crime`,
  xlab = "GDP 2023",
  ylab = "Total Crime",
  main = "Total Crime vs. GDP 2023 by county"
)

# Simple linear regression of GDP on total crime.
# Columns are referenced by bare name so that `data = clean_data` actually
# supplies them. Writing `clean_data$...` inside the formula bypasses `data`,
# yields unwieldy coefficient labels, and breaks
# predict(model, newdata = ...). The fitted values are unchanged.
model <- lm(`GDP 2023` ~ `Total Crime`, data = clean_data)
summary(model)
##
## Call:
## lm(formula = `GDP 2023` ~ `Total Crime`, data = clean_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -127646866 -3819317 4594717 8793918 280353972
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10037888 2489942 -4.031 7.36e-05 ***
## `Total Crime` 37042 2396 15.460 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 33550000 on 251 degrees of freedom
## Multiple R-squared: 0.4878, Adjusted R-squared: 0.4857
## F-statistic: 239 on 1 and 251 DF, p-value: < 2.2e-16
# Multiple regression: GDP on total crime plus unemployment rate, share with a
# bachelor's degree, total population, and poverty percentage.
# NOTE(review): this reassigns `model`, replacing the single-predictor fit.
model <- lm(`GDP 2023` ~ `Total Crime` +
`Unemployment Rate` + `Percentage With Bachelor's Degree` + `Total Population` +
`Percentage Total Poverty`,
data = clean_data)
# Print the coefficient table and overall fit statistics.
summary(model)
##
## Call:
## lm(formula = `GDP 2023` ~ `Total Crime` + `Unemployment Rate` +
## `Percentage With Bachelor's Degree` + `Total Population` +
## `Percentage Total Poverty`, data = clean_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80313451 -1541603 872039 2944896 95514685
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.411e+07 4.684e+06 3.012 0.002867 **
## `Total Crime` 5.257e+02 1.403e+03 0.375 0.708203
## `Unemployment Rate` -1.914e+06 8.700e+05 -2.200 0.028754 *
## `Percentage With Bachelor's Degree` -3.986e+05 1.124e+05 -3.547 0.000466 ***
## `Total Population` 1.072e+02 3.023e+00 35.451 < 2e-16 ***
## `Percentage Total Poverty` -7.917e+04 1.589e+05 -0.498 0.618829
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13330000 on 247 degrees of freedom
## Multiple R-squared: 0.9204, Adjusted R-squared: 0.9188
## F-statistic: 571.1 on 5 and 247 DF, p-value: < 2.2e-16
# Columns that take part in the regression models above.
model_columns <- c(
  "GDP 2023",
  "Total Crime",
  "Unemployment Rate",
  "Percentage With Bachelor's Degree",
  "Total Population",
  "Percentage Total Poverty"
)
numeric_vars <- clean_data[model_columns]
# Pairwise correlations among those columns, using only rows that are complete
# across all of them.
cor_matrix <- numeric_vars %>%
  cor(use = "complete.obs")
print(cor_matrix)
## GDP 2023 Total Crime Unemployment Rate
## GDP 2023 1.00000000 0.69841106 -0.01263471
## Total Crime 0.69841106 1.00000000 0.11358282
## Unemployment Rate -0.01263471 0.11358282 1.00000000
## Percentage With Bachelor's Degree 0.31161015 0.22509753 -0.25709166
## Total Population 0.95679242 0.72656391 0.01508708
## Percentage Total Poverty -0.03792168 0.03380412 0.38576517
## Percentage With Bachelor's Degree
## GDP 2023 0.3116102
## Total Crime 0.2250975
## Unemployment Rate -0.2570917
## Percentage With Bachelor's Degree 1.0000000
## Total Population 0.3775500
## Percentage Total Poverty -0.3369075
## Total Population Percentage Total Poverty
## GDP 2023 0.95679242 -0.03792168
## Total Crime 0.72656391 0.03380412
## Unemployment Rate 0.01508708 0.38576517
## Percentage With Bachelor's Degree 0.37754998 -0.33690755
## Total Population 1.00000000 -0.03720875
## Percentage Total Poverty -0.03720875 1.00000000
# Refit the multiple regression with `Total Population` removed from the
# predictors; the other terms are the same as in the previous specification.
# NOTE(review): presumably dropped because of its strong correlation with GDP
# and Total Crime in the correlation matrix -- confirm the intent.
# This reassigns `model` again.
model <- lm(`GDP 2023` ~ `Total Crime` +
`Unemployment Rate` + `Percentage With Bachelor's Degree` +
`Percentage Total Poverty`,
data = clean_data)
# Print the coefficient table and overall fit statistics.
summary(model)
##
## Call:
## lm(formula = `GDP 2023` ~ `Total Crime` + `Unemployment Rate` +
## `Percentage With Bachelor's Degree` + `Percentage Total Poverty`,
## data = clean_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -124459986 -6450115 3009080 9819724 282220694
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -18123107 11314044 -1.602 0.11047
## `Total Crime` 35554 2452 14.497 < 2e-16 ***
## `Unemployment Rate` -2376097 2142070 -1.109 0.26840
## `Percentage With Bachelor's Degree` 804905 263816 3.051 0.00253 **
## `Percentage Total Poverty` 87054 391185 0.223 0.82408
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32830000 on 248 degrees of freedom
## Multiple R-squared: 0.5153, Adjusted R-squared: 0.5075
## F-statistic: 65.92 on 4 and 248 DF, p-value: < 2.2e-16
# Add the natural log of GDP as a new column for the log-scale model.
clean_data <- clean_data %>%
  mutate(log_GDP = log(`GDP 2023`))
# Inspect the distribution of the transformed response.
hist(clean_data$log_GDP)

# Final specification: log GDP regressed on log(total crime + 1), unemployment
# rate, share with a bachelor's degree, log NH-White population, and poverty
# percentage.
# The `+ 1` keeps the log argument positive when `Total Crime` is 0
# (presumably some rows report zero crimes -- confirm).
# NOTE(review): the population control here is `NH-White Population`, not
# `Total Population` as in the earlier model -- confirm this is intended.
lm_final <- lm(log_GDP ~ log(`Total Crime` +1) + `Unemployment Rate` +
`Percentage With Bachelor's Degree` +
log(`NH-White Population`) + `Percentage Total Poverty`,
data = clean_data)
# Print the coefficient table and overall fit statistics.
summary(lm_final)
##
## Call:
## lm(formula = log_GDP ~ log(`Total Crime` + 1) + `Unemployment Rate` +
## `Percentage With Bachelor's Degree` + log(`NH-White Population`) +
## `Percentage Total Poverty`, data = clean_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9780 -0.7056 -0.1407 0.4761 5.2078
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.358371 0.463640 18.028 < 2e-16 ***
## log(`Total Crime` + 1) 0.138068 0.063172 2.186 0.02979 *
## `Unemployment Rate` -0.201576 0.073168 -2.755 0.00631 **
## `Percentage With Bachelor's Degree` -0.003518 0.009768 -0.360 0.71903
## log(`NH-White Population`) 0.642563 0.057856 11.106 < 2e-16 ***
## `Percentage Total Poverty` 0.015787 0.013354 1.182 0.23826
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.102 on 247 degrees of freedom
## Multiple R-squared: 0.563, Adjusted R-squared: 0.5541
## F-statistic: 63.64 on 5 and 247 DF, p-value: < 2.2e-16