library(nortest)
library(car)
## Loading required package: carData
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(classInt)
library(sandwich)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::recode() masks car::recode()
## x purrr::some()   masks car::some()
library(spdep)
## Loading required package: sp
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: sf
## Linking to GEOS 3.9.0, GDAL 3.2.1, PROJ 7.2.1
library(spatialreg)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'spatialreg'
## The following objects are masked from 'package:spdep':
## 
##     as.spam.listw, as_dgRMatrix_listw, as_dsCMatrix_I,
##     as_dsCMatrix_IrW, as_dsTMatrix_listw, can.be.simmed, cheb_setup,
##     create_WX, do_ldet, eigen_pre_setup, eigen_setup, eigenw,
##     errorsarlm, get.ClusterOption, get.coresOption, get.mcOption,
##     get.VerboseOption, get.ZeroPolicyOption, GMargminImage, GMerrorsar,
##     griffith_sone, gstsls, Hausman.test, impacts, intImpacts,
##     Jacobian_W, jacobianSetup, l_max, lagmess, lagsarlm, lextrB,
##     lextrS, lextrW, lmSLX, LU_prepermutate_setup, LU_setup,
##     Matrix_J_setup, Matrix_setup, mcdet_setup, MCMCsamp, ME, mom_calc,
##     mom_calc_int2, moments_setup, powerWeights, sacsarlm,
##     SE_classic_setup, SE_interp_setup, SE_whichMin_setup,
##     set.ClusterOption, set.coresOption, set.mcOption,
##     set.VerboseOption, set.ZeroPolicyOption, similar.listw, spam_setup,
##     spam_update_setup, SpatialFiltering, spautolm, spBreg_err,
##     spBreg_lag, spBreg_sac, stsls, subgraph_eigenw, trW
library(sf)
library(dplyr)
library(ggplot2)

A. Define your outcome variable and all predictors.

For this analysis my main outcome variable is mortality rate in 2015 in the United States and my predictor variables are poverty rate in 2005, age.adjusted mortality rate, and the gini coefficient in 2010.

Loading data

dataurl<-"https://github.com/coreysparks/data/blob/master/uscounty_data.Rdata?raw=true"
load(url(dataurl))
summary(usco)
##      pblack            phisp            prural           rucc          
##  Min.   : 0.0000   Min.   : 0.000   Min.   :  0.00   Length:3101       
##  1st Qu.: 0.4603   1st Qu.: 1.154   1st Qu.: 33.60   Class :character  
##  Median : 2.1182   Median : 2.353   Median : 59.50   Mode  :character  
##  Mean   : 9.0072   Mean   : 7.114   Mean   : 58.62                     
##  3rd Qu.:10.3887   3rd Qu.: 6.478   3rd Qu.: 87.20                     
##  Max.   :85.9945   Max.   :97.571   Max.   :100.00                     
##                                                                        
##       ppov           pfemhh         unemp         medhouseval    
##  Min.   : 2.50   Min.   : 2.7   Min.   : 1.800   Min.   : 31500  
##  1st Qu.:10.80   1st Qu.:12.7   1st Qu.: 4.100   1st Qu.: 84500  
##  Median :14.20   Median :15.5   Median : 5.100   Median :108900  
##  Mean   :15.33   Mean   :16.7   Mean   : 5.408   Mean   :130283  
##  3rd Qu.:18.60   3rd Qu.:19.3   3rd Qu.: 6.200   3rd Qu.:152900  
##  Max.   :51.00   Max.   :47.8   Max.   :16.000   Max.   :912600  
##                                                  NA's   :1       
##    popdensity             gini           County              Deaths      
##  Min.   :9.269e+04   Min.   :0.2070   Length:3101        Min.   :    12  
##  1st Qu.:1.715e+07   1st Qu.:0.4070   Class :character   1st Qu.:   646  
##  Median :4.405e+07   Median :0.4290   Mode  :character   Median :  1402  
##  Mean   :2.279e+08   Mean   :0.4313                      Mean   :  4182  
##  3rd Qu.:1.058e+08   3rd Qu.:0.4530                      3rd Qu.:  3368  
##  Max.   :6.979e+10   Max.   :0.6450                      Max.   :298108  
##                                                          NA's   :2       
##    Population       Age.Adjusted.Rate    state              geoid          
##  Min.   :     481   Min.   : 300.9    Length:3101        Length:3101       
##  1st Qu.:   56089   1st Qu.: 721.6    Class :character   Class :character  
##  Median :  131260   Median : 806.4    Mode  :character   Mode  :character  
##  Mean   :  511096   Mean   : 815.3                                         
##  3rd Qu.:  344108   3rd Qu.: 906.4                                         
##  Max.   :50036337   Max.   :1435.7                                         
##                     NA's   :7                                              
##           geometry               metro_cut   
##  MULTIPOLYGON :3101   Metropolitan    :2049  
##  epsg:9311    :   0   Non-Metropolitan:1052  
##  +proj=laea...:   0                          
##                                              
##                                              
##                                              
## 
head(usco)
## Simple feature collection with 6 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 1259088 ymin: -1452583 xmax: 1390058 ymax: -1107765
## Projected CRS: NAD27 / US National Atlas Equal Area
##      pblack     phisp prural rucc ppov pfemhh unemp medhouseval popdensity
## 1  3.810512 1.8326418  100.0   08 15.5   14.5   3.5      101000   25816818
## 2 18.489258 3.2830777   47.2   04 15.1   18.5   3.5      137000   67111751
## 3 31.616198 1.5051066  100.0   08 18.7   20.5   4.6       73100   17147879
## 4 12.566549 0.9458693   69.7   06 18.4   19.2   4.0       90600   35909345
## 5 24.557442 0.7867706  100.0   08 23.5   22.7   4.3       68100   22546153
## 6 20.917781 3.3047510   50.9   04 16.0   21.4   4.0      103300   86871603
##    gini               County Deaths Population Age.Adjusted.Rate state geoid
## 1 0.424  Cleburne County, AL    926      75007            979.53    01 01029
## 2 0.452    Coffee County, AL   2484     254213            851.78    01 01031
## 3 0.443     Coosa County, AL    687      55109            954.81    01 01037
## 4 0.475 Covington County, AL   2578     189254            963.49    01 01039
## 5 0.466  Crenshaw County, AL    876      69471            989.55    01 01041
## 6 0.424      Dale County, AL   2392     248927            876.17    01 01045
##                         geometry        metro_cut
## 1 MULTIPOLYGON (((1349148 -11... Non-Metropolitan
## 2 MULTIPOLYGON (((1327060 -13...     Metropolitan
## 3 MULTIPOLYGON (((1305589 -12... Non-Metropolitan
## 4 MULTIPOLYGON (((1306630 -14...     Metropolitan
## 5 MULTIPOLYGON (((1315155 -13... Non-Metropolitan
## 6 MULTIPOLYGON (((1354333 -14...     Metropolitan
Data <- st_transform(usco, crs = 2163)

B. Subset the data to have counties from only Texas and California (or generate spatial regimes in your own data)

Create Spatia0l Regimes

Data<-usco%>%
  filter(state %in% c("06","48"))

Data$state<-Recode(Data$state, recodes="'06'='California'; '48'='Texas'")
Data %>%
  ggplot()+
  geom_sf(aes( fill=state, group=state))+
  #coord_sf(crs = 2163)+
  scale_colour_brewer()+
  scale_fill_brewer()+
  guides(title="County Type",fill = guide_legend(nrow = 4, byrow = TRUE ))+
  guides(fill=guide_legend(title="County Type"))+
  ggtitle(label = "Texas and California Counties")+theme(legend.position="bottom")

C. Estimate a linear regression model for the Age adjusted mortality rates, use the Gini coefficient, the % persons in poverty.

fit.0<-lm(Age.Adjusted.Rate~ state+scale(ppov)+scale(gini),
          data=Data)

summary(fit.0)
## 
## Call:
## lm(formula = Age.Adjusted.Rate ~ state + scale(ppov) + scale(gini), 
##     data = Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -448.97  -67.88    3.76   76.12  291.47 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  698.414     15.655  44.613  < 2e-16 ***
## stateTexas   129.693     17.499   7.412 1.24e-12 ***
## scale(ppov)   26.806      7.451   3.597 0.000375 ***
## scale(gini)  -25.119      7.638  -3.288 0.001125 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 114.6 on 305 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.2353, Adjusted R-squared:  0.2278 
## F-statistic: 31.28 on 3 and 305 DF,  p-value: < 2.2e-16

The model indicates that Texas has a higher mortality rates than California based on its coefficients and the County characteristics associated with higher mortality rates is the proportion of the population in poverty.

County characteristics associated with lower mortality rates is the Gini coefficient. However, all the variable are significant.

D. Fit the OLS model for your outcome :

  1. Test for non-stationarity in the model by interacting the base mode from step C with the indicator variable for the state (TX vs CA) using the process described by Brazil from the lecture.

Spatial regime model building through interactions

fit.a<-lm(Age.Adjusted.Rate~state+(scale(ppov)+scale(gini)), 
          data=Data)

fit.b<-lm(Age.Adjusted.Rate~state/(scale(ppov)+scale(gini)), 
          data=Data)

anova(fit.a, fit.b, test="F")
## Analysis of Variance Table
## 
## Model 1: Age.Adjusted.Rate ~ state + (scale(ppov) + scale(gini))
## Model 2: Age.Adjusted.Rate ~ state/(scale(ppov) + scale(gini))
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1    305 4006849                                  
## 2    303 3825013  2    181836 7.2021 0.0008795 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

E.From part D, does it appear that the model is stationary with respect to the state?

Since the test is significant due to small F-value (7.2021), l can conclude that the coefficients of the Texas and California models are not the same as the pooled test meaning that the model is non-stationary. As a result, l can go ahead and split the data into regimes and examine the results.

F. Consider state-specific models and describe the results for each state separately.

fit.a<-lm(Age.Adjusted.Rate~scale(ppov)+scale(gini),
          data=usco)

fit.t<-lm(Age.Adjusted.Rate~(scale(ppov)+scale(gini)), 
          data=Data, subset=state=="Texas")

fit.c<-lm(Age.Adjusted.Rate~(scale(ppov)+scale(gini)), 
          data=Data, subset=state=="California")

Chow F test code

RSS2<-sum(resid(fit.t)^2) + sum(resid(fit.c)^2)
RSS0<-sum(resid(fit.a)^2)
k<-length(coef(fit.a))
n1<-as.numeric(table(Data$state)[1])
n2<-as.numeric(table(Data$state)[2])
ctest<-((RSS0 - RSS2)/k) / (RSS2 / (n1+n2- 2*k))
ctest
## [1] 943.7477
df(ctest, df1 = k, df2=n1+n2-2*k)
## [1] 4.288962e-155

The chow test is very significant with a p-value of 4.288962^(-55), which is extremely small.

Texas model

summary(fit.t)
## 
## Call:
## lm(formula = Age.Adjusted.Rate ~ (scale(ppov) + scale(gini)), 
##     data = Data, subset = state == "Texas")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -432.00  -71.95    4.91   78.48  304.10 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  829.271      7.515 110.347   <2e-16 ***
## scale(ppov)   16.263      8.206   1.982   0.0486 *  
## scale(gini)  -19.120      8.327  -2.296   0.0225 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 117.9 on 248 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.02541,    Adjusted R-squared:  0.01755 
## F-statistic: 3.233 on 2 and 248 DF,  p-value: 0.04111

For the Texas counties, the overall interpretation is similar to what it was for the pooled data.

California model

summary(fit.c)
## 
## Call:
## lm(formula = Age.Adjusted.Rate ~ (scale(ppov) + scale(gini)), 
##     data = Data, subset = state == "California")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -232.73  -37.41    1.81   33.25  210.50 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   741.38      14.37  51.609  < 2e-16 ***
## scale(ppov)   100.83      15.61   6.459 2.89e-08 ***
## scale(gini)   -49.41      17.71  -2.790  0.00724 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 82.58 on 55 degrees of freedom
## Multiple R-squared:  0.4562, Adjusted R-squared:  0.4364 
## F-statistic: 23.07 on 2 and 55 DF,  p-value: 5.313e-08

The California model is slightly different in effect compared to the Texas model in terms of their levels of impacts on the variables. However, both models show significant effects and did behave a little differently in different settings or regimes.

G. Presents the results of your analysis, including figures and output tables to adequately describe the analysis.

Contains a short (2 paragraph) description of the analysis results.

All of the two predictor effects on Texas and California and the pooled model were significant. The chow test was very significant with a p-value of 4.288962^(-55). The Texas model test was significant with a p-value of 0.04111. The California model test was also significant with a p-value of 5.313e-08. Though all the model tests were significant, i need to emphasize that there were slight variations in impacts of the predictors in different states or regimes. For example, the proportion in poverty fluctuates more with a coefficient of 100.8 and 16.3 between California and Texas.

The factor associated with a lower mortality rate is the Gini coefficient, and the proportion in poverty is associated with higher mortality rates.