library(nortest)
library(car)
## Loading required package: carData
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(classInt)
library(sandwich)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
library(spdep)
## Loading required package: sp
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: sf
## Linking to GEOS 3.9.0, GDAL 3.2.1, PROJ 7.2.1
library(spatialreg)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'spatialreg'
## The following objects are masked from 'package:spdep':
##
## as.spam.listw, as_dgRMatrix_listw, as_dsCMatrix_I,
## as_dsCMatrix_IrW, as_dsTMatrix_listw, can.be.simmed, cheb_setup,
## create_WX, do_ldet, eigen_pre_setup, eigen_setup, eigenw,
## errorsarlm, get.ClusterOption, get.coresOption, get.mcOption,
## get.VerboseOption, get.ZeroPolicyOption, GMargminImage, GMerrorsar,
## griffith_sone, gstsls, Hausman.test, impacts, intImpacts,
## Jacobian_W, jacobianSetup, l_max, lagmess, lagsarlm, lextrB,
## lextrS, lextrW, lmSLX, LU_prepermutate_setup, LU_setup,
## Matrix_J_setup, Matrix_setup, mcdet_setup, MCMCsamp, ME, mom_calc,
## mom_calc_int2, moments_setup, powerWeights, sacsarlm,
## SE_classic_setup, SE_interp_setup, SE_whichMin_setup,
## set.ClusterOption, set.coresOption, set.mcOption,
## set.VerboseOption, set.ZeroPolicyOption, similar.listw, spam_setup,
## spam_update_setup, SpatialFiltering, spautolm, spBreg_err,
## spBreg_lag, spBreg_sac, stsls, subgraph_eigenw, trW
library(sf)
library(dplyr)
library(ggplot2)
For this analysis my main outcome variable is mortality rate in 2015 in the United States and my predictor variables are poverty rate in 2005, age.adjusted mortality rate, and the gini coefficient in 2010.
dataurl<-"https://github.com/coreysparks/data/blob/master/uscounty_data.Rdata?raw=true"
load(url(dataurl))
summary(usco)
## pblack phisp prural rucc
## Min. : 0.0000 Min. : 0.000 Min. : 0.00 Length:3101
## 1st Qu.: 0.4603 1st Qu.: 1.154 1st Qu.: 33.60 Class :character
## Median : 2.1182 Median : 2.353 Median : 59.50 Mode :character
## Mean : 9.0072 Mean : 7.114 Mean : 58.62
## 3rd Qu.:10.3887 3rd Qu.: 6.478 3rd Qu.: 87.20
## Max. :85.9945 Max. :97.571 Max. :100.00
##
## ppov pfemhh unemp medhouseval
## Min. : 2.50 Min. : 2.7 Min. : 1.800 Min. : 31500
## 1st Qu.:10.80 1st Qu.:12.7 1st Qu.: 4.100 1st Qu.: 84500
## Median :14.20 Median :15.5 Median : 5.100 Median :108900
## Mean :15.33 Mean :16.7 Mean : 5.408 Mean :130283
## 3rd Qu.:18.60 3rd Qu.:19.3 3rd Qu.: 6.200 3rd Qu.:152900
## Max. :51.00 Max. :47.8 Max. :16.000 Max. :912600
## NA's :1
## popdensity gini County Deaths
## Min. :9.269e+04 Min. :0.2070 Length:3101 Min. : 12
## 1st Qu.:1.715e+07 1st Qu.:0.4070 Class :character 1st Qu.: 646
## Median :4.405e+07 Median :0.4290 Mode :character Median : 1402
## Mean :2.279e+08 Mean :0.4313 Mean : 4182
## 3rd Qu.:1.058e+08 3rd Qu.:0.4530 3rd Qu.: 3368
## Max. :6.979e+10 Max. :0.6450 Max. :298108
## NA's :2
## Population Age.Adjusted.Rate state geoid
## Min. : 481 Min. : 300.9 Length:3101 Length:3101
## 1st Qu.: 56089 1st Qu.: 721.6 Class :character Class :character
## Median : 131260 Median : 806.4 Mode :character Mode :character
## Mean : 511096 Mean : 815.3
## 3rd Qu.: 344108 3rd Qu.: 906.4
## Max. :50036337 Max. :1435.7
## NA's :7
## geometry metro_cut
## MULTIPOLYGON :3101 Metropolitan :2049
## epsg:9311 : 0 Non-Metropolitan:1052
## +proj=laea...: 0
##
##
##
##
head(usco)
## Simple feature collection with 6 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 1259088 ymin: -1452583 xmax: 1390058 ymax: -1107765
## Projected CRS: NAD27 / US National Atlas Equal Area
## pblack phisp prural rucc ppov pfemhh unemp medhouseval popdensity
## 1 3.810512 1.8326418 100.0 08 15.5 14.5 3.5 101000 25816818
## 2 18.489258 3.2830777 47.2 04 15.1 18.5 3.5 137000 67111751
## 3 31.616198 1.5051066 100.0 08 18.7 20.5 4.6 73100 17147879
## 4 12.566549 0.9458693 69.7 06 18.4 19.2 4.0 90600 35909345
## 5 24.557442 0.7867706 100.0 08 23.5 22.7 4.3 68100 22546153
## 6 20.917781 3.3047510 50.9 04 16.0 21.4 4.0 103300 86871603
## gini County Deaths Population Age.Adjusted.Rate state geoid
## 1 0.424 Cleburne County, AL 926 75007 979.53 01 01029
## 2 0.452 Coffee County, AL 2484 254213 851.78 01 01031
## 3 0.443 Coosa County, AL 687 55109 954.81 01 01037
## 4 0.475 Covington County, AL 2578 189254 963.49 01 01039
## 5 0.466 Crenshaw County, AL 876 69471 989.55 01 01041
## 6 0.424 Dale County, AL 2392 248927 876.17 01 01045
## geometry metro_cut
## 1 MULTIPOLYGON (((1349148 -11... Non-Metropolitan
## 2 MULTIPOLYGON (((1327060 -13... Metropolitan
## 3 MULTIPOLYGON (((1305589 -12... Non-Metropolitan
## 4 MULTIPOLYGON (((1306630 -14... Metropolitan
## 5 MULTIPOLYGON (((1315155 -13... Non-Metropolitan
## 6 MULTIPOLYGON (((1354333 -14... Metropolitan
Data <- st_transform(usco, crs = 2163)
Data<-usco%>%
filter(state %in% c("06","48"))
Data$state<-Recode(Data$state, recodes="'06'='California'; '48'='Texas'")
Data %>%
ggplot()+
geom_sf(aes( fill=state, group=state))+
#coord_sf(crs = 2163)+
scale_colour_brewer()+
scale_fill_brewer()+
guides(title="County Type",fill = guide_legend(nrow = 4, byrow = TRUE ))+
guides(fill=guide_legend(title="County Type"))+
ggtitle(label = "Texas and California Counties")+theme(legend.position="bottom")
fit.0<-lm(Age.Adjusted.Rate~ state+scale(ppov)+scale(gini),
data=Data)
summary(fit.0)
##
## Call:
## lm(formula = Age.Adjusted.Rate ~ state + scale(ppov) + scale(gini),
## data = Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -448.97 -67.88 3.76 76.12 291.47
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 698.414 15.655 44.613 < 2e-16 ***
## stateTexas 129.693 17.499 7.412 1.24e-12 ***
## scale(ppov) 26.806 7.451 3.597 0.000375 ***
## scale(gini) -25.119 7.638 -3.288 0.001125 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 114.6 on 305 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.2353, Adjusted R-squared: 0.2278
## F-statistic: 31.28 on 3 and 305 DF, p-value: < 2.2e-16
The model indicates that Texas has a higher mortality rates than California based on its coefficients and the County characteristics associated with higher mortality rates is the proportion of the population in poverty.
County characteristics associated with lower mortality rates is the Gini coefficient. However, all the variable are significant.
fit.a<-lm(Age.Adjusted.Rate~state+(scale(ppov)+scale(gini)),
data=Data)
fit.b<-lm(Age.Adjusted.Rate~state/(scale(ppov)+scale(gini)),
data=Data)
anova(fit.a, fit.b, test="F")
## Analysis of Variance Table
##
## Model 1: Age.Adjusted.Rate ~ state + (scale(ppov) + scale(gini))
## Model 2: Age.Adjusted.Rate ~ state/(scale(ppov) + scale(gini))
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 305 4006849
## 2 303 3825013 2 181836 7.2021 0.0008795 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since the test is significant due to small F-value (7.2021), l can conclude that the coefficients of the Texas and California models are not the same as the pooled test meaning that the model is non-stationary. As a result, l can go ahead and split the data into regimes and examine the results.
fit.a<-lm(Age.Adjusted.Rate~scale(ppov)+scale(gini),
data=usco)
fit.t<-lm(Age.Adjusted.Rate~(scale(ppov)+scale(gini)),
data=Data, subset=state=="Texas")
fit.c<-lm(Age.Adjusted.Rate~(scale(ppov)+scale(gini)),
data=Data, subset=state=="California")
RSS2<-sum(resid(fit.t)^2) + sum(resid(fit.c)^2)
RSS0<-sum(resid(fit.a)^2)
k<-length(coef(fit.a))
n1<-as.numeric(table(Data$state)[1])
n2<-as.numeric(table(Data$state)[2])
ctest<-((RSS0 - RSS2)/k) / (RSS2 / (n1+n2- 2*k))
ctest
## [1] 943.7477
df(ctest, df1 = k, df2=n1+n2-2*k)
## [1] 4.288962e-155
The chow test is very significant with a p-value of 4.288962^(-55), which is extremely small.
summary(fit.t)
##
## Call:
## lm(formula = Age.Adjusted.Rate ~ (scale(ppov) + scale(gini)),
## data = Data, subset = state == "Texas")
##
## Residuals:
## Min 1Q Median 3Q Max
## -432.00 -71.95 4.91 78.48 304.10
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 829.271 7.515 110.347 <2e-16 ***
## scale(ppov) 16.263 8.206 1.982 0.0486 *
## scale(gini) -19.120 8.327 -2.296 0.0225 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 117.9 on 248 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.02541, Adjusted R-squared: 0.01755
## F-statistic: 3.233 on 2 and 248 DF, p-value: 0.04111
For the Texas counties, the overall interpretation is similar to what it was for the pooled data.
summary(fit.c)
##
## Call:
## lm(formula = Age.Adjusted.Rate ~ (scale(ppov) + scale(gini)),
## data = Data, subset = state == "California")
##
## Residuals:
## Min 1Q Median 3Q Max
## -232.73 -37.41 1.81 33.25 210.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 741.38 14.37 51.609 < 2e-16 ***
## scale(ppov) 100.83 15.61 6.459 2.89e-08 ***
## scale(gini) -49.41 17.71 -2.790 0.00724 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 82.58 on 55 degrees of freedom
## Multiple R-squared: 0.4562, Adjusted R-squared: 0.4364
## F-statistic: 23.07 on 2 and 55 DF, p-value: 5.313e-08
The California model is slightly different in effect compared to the Texas model in terms of their levels of impacts on the variables. However, both models show significant effects and did behave a little differently in different settings or regimes.
Contains a short (2 paragraph) description of the analysis results.
All of the two predictor effects on Texas and California and the pooled model were significant. The chow test was very significant with a p-value of 4.288962^(-55). The Texas model test was significant with a p-value of 0.04111. The California model test was also significant with a p-value of 5.313e-08. Though all the model tests were significant, i need to emphasize that there were slight variations in impacts of the predictors in different states or regimes. For example, the proportion in poverty fluctuates more with a coefficient of 100.8 and 16.3 between California and Texas.
The factor associated with a lower mortality rate is the Gini coefficient, and the proportion in poverty is associated with higher mortality rates.