library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(tidyverse)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(readxl)
# Read the workbook a single time, keep it as `capstone_data`, and print the
# preview from that same object so the file is not parsed twice.
# NOTE: the preview below shows `Median Family Income $` imported as <chr>,
# so it needs converting before it can be used as a numeric variable.
capstone_data <- read_excel("Next Data2.xls")
capstone_data
## # A tibble: 254 × 13
##    County    `Total Population` `NH-White Population` `NH-Black Population`
##    <chr>                  <dbl>                 <dbl>                 <dbl>
##  1 Anderson               57900                 32356                 11503
##  2 Andrews                18791                  7172                   350
##  3 Angelina               87634                 49650                 13152
##  4 Aransas                25181                 16063                   402
##  5 Archer                  9159                  7820                    43
##  6 Armstrong               1830                  1549                    17
##  7 Atascosa               50942                 16243                   483
##  8 Austin                 32095                 19077                  2569
##  9 Bailey                  6777                  2087                    61
## 10 Bandera                22448                 16239                   275
## # ℹ 244 more rows
## # ℹ 9 more variables: `Hispanic Population` <dbl>, `NH-Asian Population` <dbl>,
## #   `GDP 2023` <dbl>, `Percentage Total Poverty` <dbl>,
## #   `Number of Families Poverty` <dbl>, `Median Family Income $` <chr>,
## #   `Total Crime` <dbl>, `Unemployment Rate` <dbl>,
## #   `Percentage With Bachelor's Degree` <dbl>
# Pearson correlation between GDP and total crime on the raw data.
# Without a `use` argument the result is NA (see the output just below).
with(capstone_data, cor(`GDP 2023`, `Total Crime`, method = "pearson"))
## [1] NA
# Same correlation, restricted to rows where both values are present.
with(
  capstone_data,
  cor(`GDP 2023`, `Total Crime`, method = "pearson", use = "complete.obs")
)
## [1] 0.6984111
# Keep only the rows where both GDP and total crime are recorded; the
# correlations, plots, and models that follow are computed on this subset.
clean_data <- drop_na(capstone_data, `GDP 2023`, `Total Crime`)

# Correlation of GDP with total crime on the cleaned rows (default method,
# i.e. Pearson).
with(clean_data, cor(`GDP 2023`, `Total Crime`))
## [1] 0.6984111
# Scatter plot of total crime against GDP for the cleaned rows.
# Explicit labels replace the default axis titles, which would otherwise be
# the deparsed argument expressions (e.g. "clean_data$`GDP 2023`").
plot(
  clean_data$`GDP 2023`, clean_data$`Total Crime`,
  xlab = "GDP 2023",
  ylab = "Total Crime",
  main = "Total Crime vs. GDP 2023"
)

# Simple linear regression of GDP on total crime.
# The formula uses bare column names resolved through `data =` rather than
# `clean_data$...` terms: the `$` form bypasses `data`, yields coefficient
# names such as "clean_data$`Total Crime`", and makes predict(newdata = ...)
# ignore the new data. The fitted values and estimates are unchanged.
model <- lm(`GDP 2023` ~ `Total Crime`, data = clean_data)
summary(model)
## 
## Call:
## lm(formula = `GDP 2023` ~ `Total Crime`, data = clean_data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -127646866   -3819317    4594717    8793918  280353972 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -10037888    2489942  -4.031 7.36e-05 ***
## `Total Crime`     37042       2396  15.460  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33550000 on 251 degrees of freedom
## Multiple R-squared:  0.4878, Adjusted R-squared:  0.4857 
## F-statistic:   239 on 1 and 251 DF,  p-value: < 2.2e-16
# Multiple regression of GDP on total crime plus unemployment, education,
# population, and poverty covariates, fitted on the cleaned rows.
# Overwrites the earlier `model` object.
model <- lm(
  formula = `GDP 2023` ~
    `Total Crime` +
    `Unemployment Rate` +
    `Percentage With Bachelor's Degree` +
    `Total Population` +
    `Percentage Total Poverty`,
  data = clean_data
)
summary(model)
## 
## Call:
## lm(formula = `GDP 2023` ~ `Total Crime` + `Unemployment Rate` + 
##     `Percentage With Bachelor's Degree` + `Total Population` + 
##     `Percentage Total Poverty`, data = clean_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -80313451  -1541603    872039   2944896  95514685 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          1.411e+07  4.684e+06   3.012 0.002867 ** 
## `Total Crime`                        5.257e+02  1.403e+03   0.375 0.708203    
## `Unemployment Rate`                 -1.914e+06  8.700e+05  -2.200 0.028754 *  
## `Percentage With Bachelor's Degree` -3.986e+05  1.124e+05  -3.547 0.000466 ***
## `Total Population`                   1.072e+02  3.023e+00  35.451  < 2e-16 ***
## `Percentage Total Poverty`          -7.917e+04  1.589e+05  -0.498 0.618829    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13330000 on 247 degrees of freedom
## Multiple R-squared:  0.9204, Adjusted R-squared:  0.9188 
## F-statistic: 571.1 on 5 and 247 DF,  p-value: < 2.2e-16
# Collect the response and the five regression covariates into one tibble.
# `dplyr::` is spelled out because MASS is attached after dplyr in this file
# and masks select().
numeric_vars <- dplyr::select(
  clean_data,
  dplyr::all_of(c(
    "GDP 2023",
    "Total Crime",
    "Unemployment Rate",
    "Percentage With Bachelor's Degree",
    "Total Population",
    "Percentage Total Poverty"
  ))
)

# Pairwise correlations (cor()'s default method) over complete rows only.
cor_matrix <- cor(x = numeric_vars, use = "complete.obs")
print(cor_matrix)
##                                      GDP 2023 Total Crime Unemployment Rate
## GDP 2023                           1.00000000  0.69841106       -0.01263471
## Total Crime                        0.69841106  1.00000000        0.11358282
## Unemployment Rate                 -0.01263471  0.11358282        1.00000000
## Percentage With Bachelor's Degree  0.31161015  0.22509753       -0.25709166
## Total Population                   0.95679242  0.72656391        0.01508708
## Percentage Total Poverty          -0.03792168  0.03380412        0.38576517
##                                   Percentage With Bachelor's Degree
## GDP 2023                                                  0.3116102
## Total Crime                                               0.2250975
## Unemployment Rate                                        -0.2570917
## Percentage With Bachelor's Degree                         1.0000000
## Total Population                                          0.3775500
## Percentage Total Poverty                                 -0.3369075
##                                   Total Population Percentage Total Poverty
## GDP 2023                                0.95679242              -0.03792168
## Total Crime                             0.72656391               0.03380412
## Unemployment Rate                       0.01508708               0.38576517
## Percentage With Bachelor's Degree       0.37754998              -0.33690755
## Total Population                        1.00000000              -0.03720875
## Percentage Total Poverty               -0.03720875               1.00000000
# Refit the GDP regression with `Total Population` left out of the
# right-hand side; the remaining four covariates are unchanged.
# Overwrites the earlier `model` object.
model <- lm(
  formula = `GDP 2023` ~
    `Total Crime` +
    `Unemployment Rate` +
    `Percentage With Bachelor's Degree` +
    `Percentage Total Poverty`,
  data = clean_data
)
summary(model)
## 
## Call:
## lm(formula = `GDP 2023` ~ `Total Crime` + `Unemployment Rate` + 
##     `Percentage With Bachelor's Degree` + `Percentage Total Poverty`, 
##     data = clean_data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -124459986   -6450115    3009080    9819724  282220694 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         -18123107   11314044  -1.602  0.11047    
## `Total Crime`                           35554       2452  14.497  < 2e-16 ***
## `Unemployment Rate`                  -2376097    2142070  -1.109  0.26840    
## `Percentage With Bachelor's Degree`    804905     263816   3.051  0.00253 ** 
## `Percentage Total Poverty`              87054     391185   0.223  0.82408    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32830000 on 248 degrees of freedom
## Multiple R-squared:  0.5153, Adjusted R-squared:  0.5075 
## F-statistic: 65.92 on 4 and 248 DF,  p-value: < 2.2e-16
# Append the natural log of GDP as a new column, then look at its
# distribution.
clean_data <- clean_data %>%
  dplyr::mutate(log_GDP = log(`GDP 2023`))
hist(clean_data$log_GDP)

# Log-scale model: log GDP regressed on log(total crime + 1), unemployment,
# education, log NH-White population, and poverty.
# The `+ 1` inside the crime term keeps log() finite when a crime count is 0.
lm_final <- lm(
  formula = log_GDP ~
    log(`Total Crime` + 1) +
    `Unemployment Rate` +
    `Percentage With Bachelor's Degree` +
    log(`NH-White Population`) +
    `Percentage Total Poverty`,
  data = clean_data
)
summary(lm_final)
## 
## Call:
## lm(formula = log_GDP ~ log(`Total Crime` + 1) + `Unemployment Rate` + 
##     `Percentage With Bachelor's Degree` + log(`NH-White Population`) + 
##     `Percentage Total Poverty`, data = clean_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9780 -0.7056 -0.1407  0.4761  5.2078 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          8.358371   0.463640  18.028  < 2e-16 ***
## log(`Total Crime` + 1)               0.138068   0.063172   2.186  0.02979 *  
## `Unemployment Rate`                 -0.201576   0.073168  -2.755  0.00631 ** 
## `Percentage With Bachelor's Degree` -0.003518   0.009768  -0.360  0.71903    
## log(`NH-White Population`)           0.642563   0.057856  11.106  < 2e-16 ***
## `Percentage Total Poverty`           0.015787   0.013354   1.182  0.23826    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.102 on 247 degrees of freedom
## Multiple R-squared:  0.563,  Adjusted R-squared:  0.5541 
## F-statistic: 63.64 on 5 and 247 DF,  p-value: < 2.2e-16