Loading some necessary library for this analysis

library(nortest)
library(car)
library(lmtest)
library(classInt)
library(sandwich)
library(tidyverse)
library(spdep)
library(spatialreg)
library(sf)
daturl<-"https://github.com/coreysparks/data/blob/master/uscounty_data.Rdata?raw=true"
load(url(daturl))

summary(usco)
##      pblack            phisp            prural           rucc          
##  Min.   : 0.0000   Min.   : 0.000   Min.   :  0.00   Length:3101       
##  1st Qu.: 0.4603   1st Qu.: 1.154   1st Qu.: 33.60   Class :character  
##  Median : 2.1182   Median : 2.353   Median : 59.50   Mode  :character  
##  Mean   : 9.0072   Mean   : 7.114   Mean   : 58.62                     
##  3rd Qu.:10.3887   3rd Qu.: 6.478   3rd Qu.: 87.20                     
##  Max.   :85.9945   Max.   :97.571   Max.   :100.00                     
##                                                                        
##       ppov           pfemhh         unemp         medhouseval    
##  Min.   : 2.50   Min.   : 2.7   Min.   : 1.800   Min.   : 31500  
##  1st Qu.:10.80   1st Qu.:12.7   1st Qu.: 4.100   1st Qu.: 84500  
##  Median :14.20   Median :15.5   Median : 5.100   Median :108900  
##  Mean   :15.33   Mean   :16.7   Mean   : 5.408   Mean   :130283  
##  3rd Qu.:18.60   3rd Qu.:19.3   3rd Qu.: 6.200   3rd Qu.:152900  
##  Max.   :51.00   Max.   :47.8   Max.   :16.000   Max.   :912600  
##                                                  NA's   :1       
##    popdensity             gini           County              Deaths      
##  Min.   :9.269e+04   Min.   :0.2070   Length:3101        Min.   :    12  
##  1st Qu.:1.715e+07   1st Qu.:0.4070   Class :character   1st Qu.:   646  
##  Median :4.405e+07   Median :0.4290   Mode  :character   Median :  1402  
##  Mean   :2.279e+08   Mean   :0.4313                      Mean   :  4182  
##  3rd Qu.:1.058e+08   3rd Qu.:0.4530                      3rd Qu.:  3368  
##  Max.   :6.979e+10   Max.   :0.6450                      Max.   :298108  
##                                                          NA's   :2       
##    Population       Age.Adjusted.Rate    state              geoid          
##  Min.   :     481   Min.   : 300.9    Length:3101        Length:3101       
##  1st Qu.:   56089   1st Qu.: 721.6    Class :character   Class :character  
##  Median :  131260   Median : 806.4    Mode  :character   Mode  :character  
##  Mean   :  511096   Mean   : 815.3                                         
##  3rd Qu.:  344108   3rd Qu.: 906.4                                         
##  Max.   :50036337   Max.   :1435.7                                         
##                     NA's   :7                                              
##           geometry               metro_cut   
##  MULTIPOLYGON :3101   Metropolitan    :2049  
##  epsg:9311    :   0   Non-Metropolitan:1052  
##  +proj=laea...:   0                          
##                                              
##                                              
##                                              
## 
head(usco)
## Simple feature collection with 6 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 1259088 ymin: -1452583 xmax: 1390058 ymax: -1107765
## Projected CRS: NAD27 / US National Atlas Equal Area
##      pblack     phisp prural rucc ppov pfemhh unemp medhouseval popdensity
## 1  3.810512 1.8326418  100.0   08 15.5   14.5   3.5      101000   25816818
## 2 18.489258 3.2830777   47.2   04 15.1   18.5   3.5      137000   67111751
## 3 31.616198 1.5051066  100.0   08 18.7   20.5   4.6       73100   17147879
## 4 12.566549 0.9458693   69.7   06 18.4   19.2   4.0       90600   35909345
## 5 24.557442 0.7867706  100.0   08 23.5   22.7   4.3       68100   22546153
## 6 20.917781 3.3047510   50.9   04 16.0   21.4   4.0      103300   86871603
##    gini               County Deaths Population Age.Adjusted.Rate state geoid
## 1 0.424  Cleburne County, AL    926      75007            979.53    01 01029
## 2 0.452    Coffee County, AL   2484     254213            851.78    01 01031
## 3 0.443     Coosa County, AL    687      55109            954.81    01 01037
## 4 0.475 Covington County, AL   2578     189254            963.49    01 01039
## 5 0.466  Crenshaw County, AL    876      69471            989.55    01 01041
## 6 0.424      Dale County, AL   2392     248927            876.17    01 01045
##                         geometry        metro_cut
## 1 MULTIPOLYGON (((1349148 -11... Non-Metropolitan
## 2 MULTIPOLYGON (((1327060 -13...     Metropolitan
## 3 MULTIPOLYGON (((1305589 -12... Non-Metropolitan
## 4 MULTIPOLYGON (((1306630 -14...     Metropolitan
## 5 MULTIPOLYGON (((1315155 -13... Non-Metropolitan
## 6 MULTIPOLYGON (((1354333 -14...     Metropolitan
usco <- st_transform(usco, crs = 2163)

usco2 <- subset(usco, state== "06" | state=="48")

head(usco2)
## Simple feature collection with 6 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -2001856 ymin: -1081281 xmax: -1637152 ymax: -102816.7
## Projected CRS: NAD27 / US National Atlas Equal Area
##         pblack     phisp prural rucc ppov pfemhh unemp medhouseval  popdensity
## 158  9.4529654 15.009640   70.5   07 16.9   15.3   8.0      176500     7652410
## 159  0.8084838 11.959261   70.0   06 20.4   15.6   7.9      157300     2430975
## 160  7.3254119 13.744986    0.0   01 12.2   19.1   5.1      765700 15775094404
## 161  2.3093507 37.320654    5.0   02 12.7   17.0   4.3      446500   146526342
## 162  0.2620850  8.415842   99.7   08 10.7   11.4   8.4      195700     3602549
## 163 15.3362667 21.170914    3.7   02  9.2   20.5   5.4      263600   500864602
##      gini                   County Deaths Population Age.Adjusted.Rate state
## 158 0.422        Lassen County, CA   1142     163141            724.37    06
## 159 0.430         Modoc County, CA    558      45868            827.50    06
## 160 0.507 San Francisco County, CA  28128    4196196            553.17    06
## 161 0.475 Santa Barbara County, CA  14905    2176804            590.62    06
## 162 0.389        Sierra County, CA    178      15187            738.86    06
## 163 0.404        Solano County, CA  14981    2124261            685.45    06
##     geoid                       geometry        metro_cut
## 158 06035 MULTIPOLYGON (((-1668658 -2... Non-Metropolitan
## 159 06049 MULTIPOLYGON (((-1676174 -1...     Metropolitan
## 160 06075 MULTIPOLYGON (((-2001706 -5...     Metropolitan
## 161 06083 MULTIPOLYGON (((-1814667 -1...     Metropolitan
## 162 06091 MULTIPOLYGON (((-1703056 -4... Non-Metropolitan
## 163 06095 MULTIPOLYGON (((-1928795 -5...     Metropolitan

The map shows the spatial distribution of the counties in California and Texas.

library(scales)
usco %>%
  ggplot()+
  geom_sf(aes( fill=state, group=state))+
  #coord_sf(crs = 2163)+
  scale_colour_viridis_d()+
  scale_fill_viridis_d()+
  guides(title="State",fill = guide_legend(nrow = 4, byrow = TRUE ))+
  #guides(fill=guide_legend(title="County Type"))+
  ggtitle(label = "US-States")+theme(legend.position="bottom")

usco2 %>%
  ggplot()+
  geom_sf(aes( fill=state, group=state))+
  #coord_sf(crs = 2163)+
  scale_colour_viridis_d()+
  scale_fill_viridis_d()+
  guides(fill=guide_legend(title="Calfornia and Texas"))+
  ggtitle(label = "Counties of California and Texas")

  1. Estimate a linear regression model for the Age adjusted mortality rates, use the Gini coefficient, the % persons in poverty. Here’s the basic OLS model:
fit.0<-lm(Age.Adjusted.Rate~state+scale(ppov)+scale(gini),
          data=usco2)

summary(fit.0)
## 
## Call:
## lm(formula = Age.Adjusted.Rate ~ state + scale(ppov) + scale(gini), 
##     data = usco2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -448.97  -67.88    3.76   76.12  291.47 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  698.414     15.655  44.613  < 2e-16 ***
## state48      129.693     17.499   7.412 1.24e-12 ***
## scale(ppov)   26.806      7.451   3.597 0.000375 ***
## scale(gini)  -25.119      7.638  -3.288 0.001125 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 114.6 on 305 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.2353, Adjusted R-squared:  0.2278 
## F-statistic: 31.28 on 3 and 305 DF,  p-value: < 2.2e-16

Interpretation: Texas has higher mortality rate than California. And we also observe that county characteristics like poverty rate and gini score is associated with mortality rate. As they are all signigicantly associated with mortality I choose age.adjusted mortality as my dependent variable, and poverty rate(ppov), state, and gini as my predictor variable. Now, we will see how in these states our outcomes varies.

fit.2<-lm(Age.Adjusted.Rate~scale(ppov)+scale(gini),
          data=usco2)

fit.cl<-lm(Age.Adjusted.Rate~(scale(ppov)+scale(gini)),
          data=usco2, subset=usco2$state=="06")

fit.tx<-lm(Age.Adjusted.Rate~(scale(ppov)+scale(gini)),
          data=usco2, subset=usco2$state=="48")

Chow test to find out the whether the models differ or similar by their coefficient value?

RSS2<-sum(resid(fit.tx)^2) + sum(resid(fit.cl)^2)
RSS0<-sum(resid(fit.2)^2)
k<-length(coef(fit.2))
n1<-as.numeric(table(usco2$state)[1])
n2<-as.numeric(table(usco2$state)[2])
ctest<-((RSS0 - RSS2)/k) / (RSS2 / (n1+n2- 2*k))
ctest
## [1] 24.09278
df(ctest, df1 = k, df2=n1+n2-2*k)
## [1] 6.035142e-14

Interpretation: Smaller p-value indicates that we can interpret the regime model because there is no homogeneity by cofficient values. they are different than original model.

  1. Fit the OLS model for your outcome : Test for non-stationarity in the model by interacting the base mode from step C with the indicator variable for the state (TX vs CA) using the process described by Brazil from the lecture.
fit.2<-lm(Age.Adjusted.Rate~scale(ppov)+scale(gini),
          data=usco2)

fit.brazil<-lm(Age.Adjusted.Rate~state/(scale(ppov)+scale(gini)), 
          data=usco2)

anova(fit.2, fit.brazil, test="F")
## Analysis of Variance Table
## 
## Model 1: Age.Adjusted.Rate ~ scale(ppov) + scale(gini)
## Model 2: Age.Adjusted.Rate ~ state/(scale(ppov) + scale(gini))
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1    306 4728495                                  
## 2    303 3825013  3    903482 23.857 6.896e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We see similar result as we found from chowtest. Hence we can proceed to interpret each state’s age adjusted mortality rate by the model we developed. Also we have significant chow test result which shows that the coefficients of two regime and the main data are not same. And that allows us to interpret the data by different regime.

fit.2
## 
## Call:
## lm(formula = Age.Adjusted.Rate ~ scale(ppov) + scale(gini), data = usco2)
## 
## Coefficients:
## (Intercept)  scale(ppov)  scale(gini)  
##      803.89        43.07       -29.19
fit.cl
## 
## Call:
## lm(formula = Age.Adjusted.Rate ~ (scale(ppov) + scale(gini)), 
##     data = usco2, subset = usco2$state == "06")
## 
## Coefficients:
## (Intercept)  scale(ppov)  scale(gini)  
##      741.38       100.83       -49.41
fit.tx
## 
## Call:
## lm(formula = Age.Adjusted.Rate ~ (scale(ppov) + scale(gini)), 
##     data = usco2, subset = usco2$state == "48")
## 
## Coefficients:
## (Intercept)  scale(ppov)  scale(gini)  
##      829.27        16.26       -19.12

Interpretation: Age adjusted mortality rate is higher in Texas than California. However, poverty rate is highly associated with mortality rate in California than Texas. gini variability in California is less contributing to increase mortality than Texas.