library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(tidyverse)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(readxl)
getwd()
## [1] "/Users/karinadominguez/Desktop/PAD - Applied Quanitative Methods"
read_excel("NewCapstone1.xlsx")
## # A tibble: 254 × 7
##    County    `Total Population` `GDP 2023` Percentage Total Pove…¹ `Total Crime`
##    <chr>                  <dbl>      <dbl>                   <dbl>         <dbl>
##  1 Anderson               57900    2709645                    12.4           399
##  2 Andrews                18791    5201771                    11.1           272
##  3 Angelina               87634    4310685                    11.3           573
##  4 Aransas                25181     942655                    14.4           297
##  5 Archer                  9159     409437                     6.6           168
##  6 Armstrong               1830      79368                     5.5            12
##  7 Atascosa               50942    4330189                    13.5           334
##  8 Austin                 32095    2213481                    12.8           453
##  9 Bailey                  6777     595427                     4.6          1725
## 10 Bandera                22448     520256                     9.5            85
## # ℹ 244 more rows
## # ℹ abbreviated name: ¹​`Percentage Total Poverty`
## # ℹ 2 more variables: `Unemployment Rate` <dbl>,
## #   `Percentage With Bachelor's Degree` <dbl>
capstone_data1 <- read_excel("NewCapstone1.xlsx")
cor(capstone_data1$`GDP 2023`,
    capstone_data1$`Total Crime`,
    method = "pearson",
    use = "complete.obs")
## [1] 0.6984111
summary(capstone_data1)
##     County          Total Population     GDP 2023        
##  Length:254         Min.   :     52   Min.   :    59177  
##  Class :character   1st Qu.:   6189   1st Qu.:   485518  
##  Mode  :character   Median :  18878   Median :  1428522  
##                     Mean   : 120965   Mean   : 10382825  
##                     3rd Qu.:  54594   3rd Qu.:  4083550  
##                     Max.   :4870713   Max.   :563988106  
##                                                          
##  Percentage Total Poverty  Total Crime     Unemployment Rate
##  Min.   : 0.00            Min.   :   0.0   Min.   :0.300    
##  1st Qu.: 7.60            1st Qu.: 108.0   1st Qu.:3.300    
##  Median :10.80            Median : 267.0   Median :3.700    
##  Mean   :11.32            Mean   : 552.4   Mean   :3.935    
##  3rd Qu.:13.97            3rd Qu.: 583.0   3rd Qu.:4.400    
##  Max.   :36.80            Max.   :7928.0   Max.   :9.300    
##                           NA's   :1                         
##  Percentage With Bachelor's Degree
##  Min.   : 0.00                    
##  1st Qu.:15.60                    
##  Median :19.70                    
##  Mean   :21.46                    
##  3rd Qu.:25.20                    
##  Max.   :55.50                    
## 
model <- lm(`GDP 2023` ~ `Total Crime` + `Unemployment Rate` + 
            `Percentage With Bachelor's Degree`  + 
            `Percentage Total Poverty`, 
            data = capstone_data1)
summary(model)
## 
## Call:
## lm(formula = `GDP 2023` ~ `Total Crime` + `Unemployment Rate` + 
##     `Percentage With Bachelor's Degree` + `Percentage Total Poverty`, 
##     data = capstone_data1)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -124459986   -6450115    3009080    9819724  282220694 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         -18123107   11314044  -1.602  0.11047    
## `Total Crime`                           35554       2452  14.497  < 2e-16 ***
## `Unemployment Rate`                  -2376097    2142070  -1.109  0.26840    
## `Percentage With Bachelor's Degree`    804905     263816   3.051  0.00253 ** 
## `Percentage Total Poverty`              87054     391185   0.223  0.82408    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32830000 on 248 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.5153, Adjusted R-squared:  0.5075 
## F-statistic: 65.92 on 4 and 248 DF,  p-value: < 2.2e-16
capstone_data1$GDP_per_capita <- capstone_data1$`GDP 2023` / capstone_data1$`Total Population`
capstone_data1$Crime_per_capita <- capstone_data1$`Total Crime` / capstone_data1$`Total Population`
capstone_data1$Crime_per_100k <- 
  (capstone_data1$`Total Crime` / capstone_data1$`Total Population`) * 100000
summary(capstone_data1$GDP_per_capita)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     10.52     45.87     62.78    979.50    108.74 206247.61
summary(capstone_data1$Crime_per_capita)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## 0.000000 0.004287 0.010790 0.030459 0.024666 0.530710        1
model_pc <- lm(GDP_per_capita ~ Crime_per_capita + `Unemployment Rate` +
               `Percentage With Bachelor's Degree` +
               `Percentage Total Poverty`,
               data = capstone_data1)
summary(model_pc)
## 
## Call:
## lm(formula = GDP_per_capita ~ Crime_per_capita + `Unemployment Rate` + 
##     `Percentage With Bachelor's Degree` + `Percentage Total Poverty`, 
##     data = capstone_data1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -10500  -3039  -1130   1282 183767 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         23235.70    4311.01   5.390 1.64e-07 ***
## Crime_per_capita                     1376.39   13000.43   0.106 0.915769    
## `Unemployment Rate`                 -2868.57     797.74  -3.596 0.000390 ***
## `Percentage With Bachelor's Degree`  -378.04      96.81  -3.905 0.000121 ***
## `Percentage Total Poverty`           -254.50     147.10  -1.730 0.084858 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12360 on 248 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1056, Adjusted R-squared:  0.09119 
## F-statistic: 7.322 on 4 and 248 DF,  p-value: 1.374e-05
summary(capstone_data1$GDP_per_capita)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     10.52     45.87     62.78    979.50    108.74 206247.61
summary(capstone_data1$Crime_per_capita)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## 0.000000 0.004287 0.010790 0.030459 0.024666 0.530710        1
sum(capstone_data1$Crime_per_capita == 0, na.rm = TRUE)
## [1] 1
clean_data <- capstone_data1[
  capstone_data1$GDP_per_capita > 0 &
  capstone_data1$Crime_per_capita > 0, ]

model_log <- lm(log(GDP_per_capita) ~ log(Crime_per_capita) +
                `Unemployment Rate` +
                `Percentage With Bachelor's Degree` +
                `Percentage Total Poverty`,
                data = clean_data)
clean_data <- capstone_data1[
  capstone_data1$GDP_per_capita > 0 &
  capstone_data1$Crime_per_capita > 0, ]

model_log <- lm(log(GDP_per_capita) ~ log(Crime_per_capita) +
                `Unemployment Rate` +
                `Percentage With Bachelor's Degree` +
                `Percentage Total Poverty`,
                data = clean_data)
summary(model_log)
## 
## Call:
## lm(formula = log(GDP_per_capita) ~ log(Crime_per_capita) + `Unemployment Rate` + 
##     `Percentage With Bachelor's Degree` + `Percentage Total Poverty`, 
##     data = clean_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6548 -0.5791 -0.1691  0.3590  5.5984 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          6.969824   0.349038  19.969  < 2e-16 ***
## log(Crime_per_capita)                0.089250   0.049220   1.813 0.071000 .  
## `Unemployment Rate`                 -0.341503   0.064021  -5.334 2.17e-07 ***
## `Percentage With Bachelor's Degree` -0.027671   0.008067  -3.430 0.000707 ***
## `Percentage Total Poverty`          -0.021210   0.011602  -1.828 0.068736 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9628 on 247 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1905, Adjusted R-squared:  0.1774 
## F-statistic: 14.53 on 4 and 247 DF,  p-value: 1.14e-10
library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
vif(model_log)
##               log(Crime_per_capita)                 `Unemployment Rate` 
##                            1.178246                            1.238191 
## `Percentage With Bachelor's Degree`          `Percentage Total Poverty` 
##                            1.337304                            1.294463
cor.test(log(clean_data$GDP_per_capita),
         log(clean_data$Crime_per_capita))
## 
##  Pearson's product-moment correlation
## 
## data:  log(clean_data$GDP_per_capita) and log(clean_data$Crime_per_capita)
## t = 3.269, df = 250, p-value = 0.001231
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.0809188 0.3180817
## sample estimates:
##       cor 
## 0.2024671
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(capstone_data1, type = "text", summary = TRUE)
## 
## =================================
## Statistic N Mean St. Dev. Min Max
## =================================
model_log <- lm(
  log(GDP_per_capita + 1) ~ log(Crime_per_capita + 1) +
    `Unemployment Rate` +
    `Percentage With Bachelor's Degree` +
    `Percentage Total Poverty`,
  data = capstone_data1
)
capstone_clean <- na.omit(capstone_data1)

model_log <- lm(
  log(GDP_per_capita + 1) ~ log(Crime_per_capita + 1) +
    `Unemployment Rate` +
    `Percentage With Bachelor's Degree` +
    `Percentage Total Poverty`,
  data = capstone_clean
)
library(lmtest)

bptest(model_log)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_log
## BP = 13.435, df = 4, p-value = 0.009334
library(sandwich)
library(lmtest)
coeftest(model_log, vcov = vcovHC(model_log, type = "HC1"))
## 
## t test of coefficients:
## 
##                                      Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)                          6.723783   0.721286  9.3219 < 2.2e-16 ***
## log(Crime_per_capita + 1)            0.702643   1.058800  0.6636  0.507549    
## `Unemployment Rate`                 -0.370138   0.128326 -2.8844  0.004267 ** 
## `Percentage With Bachelor's Degree` -0.031058   0.012645 -2.4563  0.014725 *  
## `Percentage Total Poverty`          -0.018067   0.015039 -1.2014  0.230760    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
raintest(model_log)
## 
##  Rainbow test
## 
## data:  model_log
## Rain = 0.482, df1 = 127, df2 = 121, p-value = 1