Loading some necessary library for this analysis
library(nortest)
library(car)
library(lmtest)
library(classInt)
library(sandwich)
library(tidyverse)
library(spdep)
library(spatialreg)
library(sf)
daturl<-"https://github.com/coreysparks/data/blob/master/uscounty_data.Rdata?raw=true"
load(url(daturl))
summary(usco)
## pblack phisp prural rucc
## Min. : 0.0000 Min. : 0.000 Min. : 0.00 Length:3101
## 1st Qu.: 0.4603 1st Qu.: 1.154 1st Qu.: 33.60 Class :character
## Median : 2.1182 Median : 2.353 Median : 59.50 Mode :character
## Mean : 9.0072 Mean : 7.114 Mean : 58.62
## 3rd Qu.:10.3887 3rd Qu.: 6.478 3rd Qu.: 87.20
## Max. :85.9945 Max. :97.571 Max. :100.00
##
## ppov pfemhh unemp medhouseval
## Min. : 2.50 Min. : 2.7 Min. : 1.800 Min. : 31500
## 1st Qu.:10.80 1st Qu.:12.7 1st Qu.: 4.100 1st Qu.: 84500
## Median :14.20 Median :15.5 Median : 5.100 Median :108900
## Mean :15.33 Mean :16.7 Mean : 5.408 Mean :130283
## 3rd Qu.:18.60 3rd Qu.:19.3 3rd Qu.: 6.200 3rd Qu.:152900
## Max. :51.00 Max. :47.8 Max. :16.000 Max. :912600
## NA's :1
## popdensity gini County Deaths
## Min. :9.269e+04 Min. :0.2070 Length:3101 Min. : 12
## 1st Qu.:1.715e+07 1st Qu.:0.4070 Class :character 1st Qu.: 646
## Median :4.405e+07 Median :0.4290 Mode :character Median : 1402
## Mean :2.279e+08 Mean :0.4313 Mean : 4182
## 3rd Qu.:1.058e+08 3rd Qu.:0.4530 3rd Qu.: 3368
## Max. :6.979e+10 Max. :0.6450 Max. :298108
## NA's :2
## Population Age.Adjusted.Rate state geoid
## Min. : 481 Min. : 300.9 Length:3101 Length:3101
## 1st Qu.: 56089 1st Qu.: 721.6 Class :character Class :character
## Median : 131260 Median : 806.4 Mode :character Mode :character
## Mean : 511096 Mean : 815.3
## 3rd Qu.: 344108 3rd Qu.: 906.4
## Max. :50036337 Max. :1435.7
## NA's :7
## geometry metro_cut
## MULTIPOLYGON :3101 Metropolitan :2049
## epsg:9311 : 0 Non-Metropolitan:1052
## +proj=laea...: 0
##
##
##
##
head(usco)
## Simple feature collection with 6 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 1259088 ymin: -1452583 xmax: 1390058 ymax: -1107765
## Projected CRS: NAD27 / US National Atlas Equal Area
## pblack phisp prural rucc ppov pfemhh unemp medhouseval popdensity
## 1 3.810512 1.8326418 100.0 08 15.5 14.5 3.5 101000 25816818
## 2 18.489258 3.2830777 47.2 04 15.1 18.5 3.5 137000 67111751
## 3 31.616198 1.5051066 100.0 08 18.7 20.5 4.6 73100 17147879
## 4 12.566549 0.9458693 69.7 06 18.4 19.2 4.0 90600 35909345
## 5 24.557442 0.7867706 100.0 08 23.5 22.7 4.3 68100 22546153
## 6 20.917781 3.3047510 50.9 04 16.0 21.4 4.0 103300 86871603
## gini County Deaths Population Age.Adjusted.Rate state geoid
## 1 0.424 Cleburne County, AL 926 75007 979.53 01 01029
## 2 0.452 Coffee County, AL 2484 254213 851.78 01 01031
## 3 0.443 Coosa County, AL 687 55109 954.81 01 01037
## 4 0.475 Covington County, AL 2578 189254 963.49 01 01039
## 5 0.466 Crenshaw County, AL 876 69471 989.55 01 01041
## 6 0.424 Dale County, AL 2392 248927 876.17 01 01045
## geometry metro_cut
## 1 MULTIPOLYGON (((1349148 -11... Non-Metropolitan
## 2 MULTIPOLYGON (((1327060 -13... Metropolitan
## 3 MULTIPOLYGON (((1305589 -12... Non-Metropolitan
## 4 MULTIPOLYGON (((1306630 -14... Metropolitan
## 5 MULTIPOLYGON (((1315155 -13... Non-Metropolitan
## 6 MULTIPOLYGON (((1354333 -14... Metropolitan
usco <- st_transform(usco, crs = 2163)
usco2 <- subset(usco, state== "06" | state=="48")
head(usco2)
## Simple feature collection with 6 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -2001856 ymin: -1081281 xmax: -1637152 ymax: -102816.7
## Projected CRS: NAD27 / US National Atlas Equal Area
## pblack phisp prural rucc ppov pfemhh unemp medhouseval popdensity
## 158 9.4529654 15.009640 70.5 07 16.9 15.3 8.0 176500 7652410
## 159 0.8084838 11.959261 70.0 06 20.4 15.6 7.9 157300 2430975
## 160 7.3254119 13.744986 0.0 01 12.2 19.1 5.1 765700 15775094404
## 161 2.3093507 37.320654 5.0 02 12.7 17.0 4.3 446500 146526342
## 162 0.2620850 8.415842 99.7 08 10.7 11.4 8.4 195700 3602549
## 163 15.3362667 21.170914 3.7 02 9.2 20.5 5.4 263600 500864602
## gini County Deaths Population Age.Adjusted.Rate state
## 158 0.422 Lassen County, CA 1142 163141 724.37 06
## 159 0.430 Modoc County, CA 558 45868 827.50 06
## 160 0.507 San Francisco County, CA 28128 4196196 553.17 06
## 161 0.475 Santa Barbara County, CA 14905 2176804 590.62 06
## 162 0.389 Sierra County, CA 178 15187 738.86 06
## 163 0.404 Solano County, CA 14981 2124261 685.45 06
## geoid geometry metro_cut
## 158 06035 MULTIPOLYGON (((-1668658 -2... Non-Metropolitan
## 159 06049 MULTIPOLYGON (((-1676174 -1... Metropolitan
## 160 06075 MULTIPOLYGON (((-2001706 -5... Metropolitan
## 161 06083 MULTIPOLYGON (((-1814667 -1... Metropolitan
## 162 06091 MULTIPOLYGON (((-1703056 -4... Non-Metropolitan
## 163 06095 MULTIPOLYGON (((-1928795 -5... Metropolitan
The map shows the spatial distribution of the counties in California and Texas.
library(scales)
usco %>%
ggplot()+
geom_sf(aes( fill=state, group=state))+
#coord_sf(crs = 2163)+
scale_colour_viridis_d()+
scale_fill_viridis_d()+
guides(title="State",fill = guide_legend(nrow = 4, byrow = TRUE ))+
#guides(fill=guide_legend(title="County Type"))+
ggtitle(label = "US-States")+theme(legend.position="bottom")
usco2 %>%
ggplot()+
geom_sf(aes( fill=state, group=state))+
#coord_sf(crs = 2163)+
scale_colour_viridis_d()+
scale_fill_viridis_d()+
guides(fill=guide_legend(title="Calfornia and Texas"))+
ggtitle(label = "Counties of California and Texas")
fit.0<-lm(Age.Adjusted.Rate~state+scale(ppov)+scale(gini),
data=usco2)
summary(fit.0)
##
## Call:
## lm(formula = Age.Adjusted.Rate ~ state + scale(ppov) + scale(gini),
## data = usco2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -448.97 -67.88 3.76 76.12 291.47
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 698.414 15.655 44.613 < 2e-16 ***
## state48 129.693 17.499 7.412 1.24e-12 ***
## scale(ppov) 26.806 7.451 3.597 0.000375 ***
## scale(gini) -25.119 7.638 -3.288 0.001125 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 114.6 on 305 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.2353, Adjusted R-squared: 0.2278
## F-statistic: 31.28 on 3 and 305 DF, p-value: < 2.2e-16
Interpretation: Texas has higher mortality rate than California. And we also observe that county characteristics like poverty rate and gini score is associated with mortality rate. As they are all signigicantly associated with mortality I choose age.adjusted mortality as my dependent variable, and poverty rate(ppov), state, and gini as my predictor variable. Now, we will see how in these states our outcomes varies.
fit.2<-lm(Age.Adjusted.Rate~scale(ppov)+scale(gini),
data=usco2)
fit.cl<-lm(Age.Adjusted.Rate~(scale(ppov)+scale(gini)),
data=usco2, subset=usco2$state=="06")
fit.tx<-lm(Age.Adjusted.Rate~(scale(ppov)+scale(gini)),
data=usco2, subset=usco2$state=="48")
Chow test to find out the whether the models differ or similar by their coefficient value?
RSS2<-sum(resid(fit.tx)^2) + sum(resid(fit.cl)^2)
RSS0<-sum(resid(fit.2)^2)
k<-length(coef(fit.2))
n1<-as.numeric(table(usco2$state)[1])
n2<-as.numeric(table(usco2$state)[2])
ctest<-((RSS0 - RSS2)/k) / (RSS2 / (n1+n2- 2*k))
ctest
## [1] 24.09278
df(ctest, df1 = k, df2=n1+n2-2*k)
## [1] 6.035142e-14
Interpretation: Smaller p-value indicates that we can interpret the regime model because there is no homogeneity by cofficient values. they are different than original model.
fit.2<-lm(Age.Adjusted.Rate~scale(ppov)+scale(gini),
data=usco2)
fit.brazil<-lm(Age.Adjusted.Rate~state/(scale(ppov)+scale(gini)),
data=usco2)
anova(fit.2, fit.brazil, test="F")
## Analysis of Variance Table
##
## Model 1: Age.Adjusted.Rate ~ scale(ppov) + scale(gini)
## Model 2: Age.Adjusted.Rate ~ state/(scale(ppov) + scale(gini))
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 306 4728495
## 2 303 3825013 3 903482 23.857 6.896e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We see similar result as we found from chowtest. Hence we can proceed to interpret each state’s age adjusted mortality rate by the model we developed. Also we have significant chow test result which shows that the coefficients of two regime and the main data are not same. And that allows us to interpret the data by different regime.
fit.2
##
## Call:
## lm(formula = Age.Adjusted.Rate ~ scale(ppov) + scale(gini), data = usco2)
##
## Coefficients:
## (Intercept) scale(ppov) scale(gini)
## 803.89 43.07 -29.19
fit.cl
##
## Call:
## lm(formula = Age.Adjusted.Rate ~ (scale(ppov) + scale(gini)),
## data = usco2, subset = usco2$state == "06")
##
## Coefficients:
## (Intercept) scale(ppov) scale(gini)
## 741.38 100.83 -49.41
fit.tx
##
## Call:
## lm(formula = Age.Adjusted.Rate ~ (scale(ppov) + scale(gini)),
## data = usco2, subset = usco2$state == "48")
##
## Coefficients:
## (Intercept) scale(ppov) scale(gini)
## 829.27 16.26 -19.12
Interpretation: Age adjusted mortality rate is higher in Texas than California. However, poverty rate is highly associated with mortality rate in California than Texas. gini variability in California is less contributing to increase mortality than Texas.