This example will illustrate a way to combine individual survey data with aggregate data on counties to produce a county level estimate of basically any health indicator measured using the BRFSS. The framework I use below takes observed individual level survey responses from the BRFSS and merges these to county level variables from the ACS. This allows me to estimate the overall regression model for county-level prevalence, controlling for higher level variables. Then, I can use this equation for prediction for counties where I have not observed survey respondents, but I have observed the county level characteristics.
In this example, I estimate county level obesity rates and compare my estimates to those from the CDC.
The example merges data from the 2011 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data. Link and the 2010 American Community Survey 5-year estimates at the county level. More details on these data are found below.
library(lme4)
## Loading required package: Matrix
## Loading required package: Rcpp
library(arm)
## Loading required package: MASS
##
## arm (Version 1.7-07, built: 2014-8-27)
##
## Working directory is /Users/ozd504/Google Drive/dem7283/Rcode/Rcode15
library(car)
##
## Attaching package: 'car'
##
## The following object is masked from 'package:arm':
##
## logit
#load county map data
library(maptools)
## Loading required package: sp
## Checking rgeos availability: TRUE
uscos<-readShapePoly("~/Google Drive/dem7283/data/usdata_mort.shp")
#load brfss
load("~/Google Drive/dem7283/data/brfss_11.Rdata")
nams<-names(brfss_11)
newnames<-gsub("_", "", nams)
names(brfss_11)<-tolower(newnames)
brfss_11$statefip<-sprintf("%02d", brfss_11$state )
brfss_11$cofip<-sprintf("%03d", brfss_11$cnty )
brfss_11$cofips<-as.factor(paste(brfss_11$statefip, brfss_11$cofip, sep=""))
#just for brevity, I just select respondents with non missing weights
brfss_11<-brfss_11[is.na(brfss_11$cntywt)==F,]
#insurance
brfss_11$ins<-ifelse(brfss_11$hlthpln1==1,1,0)
#smoking currently
brfss_11$smoke<-recode(brfss_11$smoker3, recodes="1:2=1; 3:4=0; else=NA")
#low physical activity
brfss_11$lowact<-recode(brfss_11$pacat, recodes="1:2=0; 3:4=1; else=NA")
#high blood pressure
brfss_11$hbp<-recode(brfss_11$bphigh4, recodes="1=1; 2:4=0; else=NA")
#high cholesterol
brfss_11$hc<-recode(brfss_11$toldhi2, recodes="1=1; 2=0; else=NA")
#bmi
brfss_11$bmi<-ifelse(is.na(brfss_11$bmi5)==T, NA, brfss_11$bmi5/100)
#poor or fair health
brfss_11$badhealth<-ifelse(brfss_11$genhlth %in% c(4,5),1,0)
#race
brfss_11$black<-recode(brfss_11$racegr2, recodes="2=1; 9=NA; else=0", as.factor.result=T)
brfss_11$white<-recode(brfss_11$racegr2, recodes="1=1; 9=NA; else=0", as.factor.result=T)
brfss_11$other<-recode(brfss_11$racegr2, recodes="3:4=1; 9=NA; else=0", as.factor.result=T)
brfss_11$hispanic<-recode(brfss_11$racegr2, recodes="5=1; 9=NA; else=0", as.factor.result=T)
#have a personal doctor?
brfss_11$doc<-recode(brfss_11$persdoc2, recodes="1:2=1; 3=0; else=NA", as.factor.result=F)
#needed care in last year but couldn't get it because of cost
brfss_11$medacc<-recode(brfss_11$medcost, recodes="1=1;2=0;else=NA")
#education level
brfss_11$lths<-recode(brfss_11$educa, recodes="1:3=1;9=NA; else=0", as.factor.result=F)
brfss_11$coll<-recode(brfss_11$educa, recodes="5:6=1;9=NA; else=0", as.factor.result=F)
#employment
brfss_11$employ<-recode(brfss_11$employ, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor.result=T)
brfss_11$employ<-relevel(brfss_11$employ, ref='Employed')
#marital status
brfss_11$marst<-recode(brfss_11$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T)
brfss_11$marst<-relevel(brfss_11$marst, ref='married')
#income
brfss_11$inc<-as.factor(ifelse(brfss_11$incomg==9, NA, brfss_11$incomg))
#Age cut into intervals
brfss_11$agec<-cut(brfss_11$age, breaks=c(0,24,39,59,79,99))
brfss_11$obese<-ifelse(brfss_11$bmi>=30,1,0)
tab<-data.frame(xtabs(I(obese==1)~agec+sex+racegr2+cofips, brfss_11 ))
head(tab, n=20)
## agec sex racegr2 cofips Freq
## 1 (0,24] 1 1 01073 1
## 2 (24,39] 1 1 01073 9
## 3 (39,59] 1 1 01073 15
## 4 (59,79] 1 1 01073 17
## 5 (79,99] 1 1 01073 5
## 6 (0,24] 2 1 01073 1
## 7 (24,39] 2 1 01073 5
## 8 (39,59] 2 1 01073 15
## 9 (59,79] 2 1 01073 33
## 10 (79,99] 2 1 01073 5
## 11 (0,24] 1 2 01073 1
## 12 (24,39] 1 2 01073 2
## 13 (39,59] 1 2 01073 18
## 14 (59,79] 1 2 01073 7
## 15 (79,99] 1 2 01073 0
## 16 (0,24] 2 2 01073 7
## 17 (24,39] 2 2 01073 17
## 18 (39,59] 2 2 01073 33
## 19 (59,79] 2 2 01073 29
## 20 (79,99] 2 2 01073 3
Now we will begin fitting the multilevel regression model with the county, that the person lives in being the higher level I will also add in some Census variables from the ACS 2010 5 Year estimates,
#load in ACS data from Factfinder
#economic data
acsecon<-read.csv("~/Google Drive/dem7283/data/aff_download/ACS_10_5YR_DP03_with_ann.csv")
acsecon$povrate<-acsecon[, "HC03_VC156"]
acsecon$unemployed<-acsecon[, "HC03_VC13"]
acsecon$pinsurance<-acsecon[,"HC03_VC127"]
acsecon$cofips<-substr(acsecon$GEO.id, 10,14)
acsecon<-acsecon[, c("cofips", "povrate", "unemployed")]
head(acsecon)
## cofips povrate unemployed
## 1 01001 7.5 6.2
## 2 01003 9.1 6.6
## 3 01005 19.9 9.6
## 4 01007 9.4 9.1
## 5 01009 10.0 7.5
## 6 01011 22.6 11.2
#housing data
acshous<-read.csv("~/Google Drive/dem7283/data/aff_download/ACS_10_5YR_DP04_with_ann.csv")
acshous$cofips<-substr(acshous$GEO.id, 10,14)
acshous$pvacanthouse<-acshous[, "HC03_VC05"]
acshous$houseval<-as.numeric(acshous[,"HC01_VC116"])/10000
acshous<-acshous[, c("cofips", "pvacanthouse", "houseval")]
head(acshous)
## cofips pvacanthouse houseval
## 1 01001 8.4 1.5285
## 2 01003 31.3 5.3309
## 3 01005 18.4 0.6665
## 4 01007 16.3 0.6170
## 5 01009 12.3 1.6897
## 6 01011 17.9 0.2871
#education data
acssoc<-read.csv("~/Google Drive/dem7283/data/aff_download/ACS_10_5YR_S1501_with_ann.csv")
acssoc$cofips<-substr(acssoc$GEO.id, 10,14)
acssoc$pnohs<-acssoc[, "HC01_EST_VC02"]
acssoc$pbacc<-acssoc[,"HC01_EST_VC05"]
acssoc<-acssoc[, c("cofips", "pnohs", "pbacc")]
#race data
acsrace<-read.csv("~/Google Drive/dem7283/data/aff_download/ACS_10_5YR_B03002_with_ann.csv")
acsrace$pblack<-acsrace[,"HD01_VD04"]/acsrace[,"HD01_VD01"]
acsrace$phispanic<-acsrace[,"HD01_VD12"]/acsrace[,"HD01_VD01"]
acsrace<-acsrace[, c("pblack", "phispanic")]
#put these together
joindata<-cbind(acsecon, acshous[,-1], acssoc[,-1], acsrace)
head(joindata)
## cofips povrate unemployed pvacanthouse houseval pnohs pbacc pblack
## 1 01001 7.5 6.2 8.4 1.5285 23.3 6.4 0.17821466
## 2 01003 9.1 6.6 31.3 5.3309 21.0 6.9 0.09379320
## 3 01005 19.9 9.6 18.4 0.6665 32.3 0.9 0.46467381
## 4 01007 9.4 9.1 16.3 0.6170 26.6 3.0 0.15258735
## 5 01009 10.0 7.5 12.3 1.6897 22.4 1.0 0.01155366
## 6 01011 22.6 11.2 17.9 0.2871 47.5 0.0 0.74164607
## phispanic
## 1 0.023196313
## 2 0.038955350
## 3 0.045958338
## 4 0.005926581
## 5 0.074895929
## 6 0.016478989
load("~/Google Drive/dem7283/data/segregation_county.Rdata")
rucc<-uscos@data[, c("COFIPS", "RUCC", "region")]
joindata<-merge(joindata, county_dat, by.x="cofips", by.y="cofips", all.x=T)
joindata<-merge(joindata, rucc, by.x="cofips", by.y="COFIPS", all.x=T)
head(joindata)
## cofips povrate unemployed pvacanthouse houseval pnohs pbacc pblack
## 1 01001 7.5 6.2 8.4 1.5285 23.3 6.4 0.17821466
## 2 01003 9.1 6.6 31.3 5.3309 21.0 6.9 0.09379320
## 3 01005 19.9 9.6 18.4 0.6665 32.3 0.9 0.46467381
## 4 01007 9.4 9.1 16.3 0.6170 26.6 3.0 0.15258735
## 5 01009 10.0 7.5 12.3 1.6897 22.4 1.0 0.01155366
## 6 01011 22.6 11.2 17.9 0.2871 47.5 0.0 0.74164607
## phispanic dissim_wb isolation_b interaction_bw TheileH RUCC region
## 1 0.023196313 0.2897522 0.27474922 0.6897242 0.08299523 2 3
## 2 0.038955350 0.3903035 0.18464112 0.7673910 0.09203860 2 3
## 3 0.045958338 0.1913727 0.50243412 0.4518021 0.04898714 6 3
## 4 0.005926581 0.4338891 0.33170220 0.6477188 0.11959518 6 3
## 5 0.074895929 0.3776626 0.02200736 0.9040929 0.05283605 2 3
## 6 0.016478989 0.2412244 0.71810200 0.2139202 0.05627216 6 3
summary(joindata)
## cofips povrate unemployed pvacanthouse
## Length:3143 Min. : 0.00 Min. : 0.00 Min. : 1.70
## Class :character 1st Qu.: 7.40 1st Qu.: 5.40 1st Qu.:10.20
## Mode :character Median :10.40 Median : 7.30 Median :14.50
## Mean :11.35 Mean : 7.53 Mean :17.26
## 3rd Qu.:14.10 3rd Qu.: 9.25 3rd Qu.:20.90
## Max. :44.90 Max. :30.90 Max. :83.30
##
## houseval pnohs pbacc pblack
## Min. : 0.0000 22.7 : 24 0.0 : 221 Min. :0.000000
## 1st Qu.: 0.3214 19.2 : 23 4.1 : 46 1st Qu.:0.004214
## Median : 0.7283 15.2 : 22 2.8 : 43 Median :0.019569
## Mean : 2.4209 15.3 : 21 2.7 : 40 Mean :0.088359
## 3rd Qu.: 1.8512 15.4 : 21 4.7 : 40 3rd Qu.:0.100550
## Max. :155.2091 15.7 : 21 3.3 : 39 Max. :0.861355
## (Other):3011 (Other):2714
## phispanic dissim_wb isolation_b interaction_bw
## Min. :0.00000 Min. :0.0000 Min. :0.000000 Min. :0.0000
## 1st Qu.:0.01440 1st Qu.:0.1938 1st Qu.:0.005815 1st Qu.:0.6519
## Median :0.03024 Median :0.2996 Median :0.037123 Median :0.8459
## Mean :0.07818 Mean :0.3077 Mean :0.131310 Mean :0.7718
## 3rd Qu.:0.07654 3rd Qu.:0.4222 3rd Qu.:0.197412 3rd Qu.:0.9427
## Max. :0.98328 Max. :0.9243 Max. :0.865954 Max. :0.9922
##
## TheileH RUCC region
## Min. :0.00000 Min. :0.00 1 : 217
## 1st Qu.:0.01463 1st Qu.:3.00 2 :1055
## Median :0.04631 Median :6.00 3 :1382
## Mean :0.07345 Mean :5.62 4 : 413
## 3rd Qu.:0.10166 3rd Qu.:8.00 NA's: 76
## Max. :0.97694 Max. :9.00
## NA's :76
#z-score everything!!
joindata$povz<-scale(joindata$povrate, center=T, scale=T)
joindata$unempz<-scale(joindata$unemployed, center=T, scale=T)
joindata$vachousz<-scale(joindata$pvacanthouse, center=T, scale=T)
joindata$housvalz<-scale(joindata$houseval, center=T, scale=T)
joindata$nohsz<-scale(as.numeric(as.character(joindata$pnohs)), center=T, scale=T)
## Warning in scale(as.numeric(as.character(joindata$pnohs)), center = T,
## scale = T): NAs introduced by coercion
joindata$baccz<-scale(as.numeric(as.character(joindata$pbacc)), center=T, scale=T)
## Warning in scale(as.numeric(as.character(joindata$pbacc)), center = T,
## scale = T): NAs introduced by coercion
joindata$blackz<-scale(joindata$pblack, center=T, scale=T)
joindata$hispanicz<-scale(joindata$phispanic, center=T, scale=T)
joindata$diss.z<-scale(joindata$dissim_wb, center=T, scale=T)
joindata$iso.z<-scale(joindata$isolation_b, center=T, scale=T)
joindata$theil.z<-scale(joindata$TheileH, center=T, scale=T)
joindata$noind<-ifelse(joindata$cofips%in%unique(brfss_11$cofips), 0,1)
joindata$cofips<-as.factor(joindata$cofips)
head(joindata, n=10)
## cofips povrate unemployed pvacanthouse houseval pnohs pbacc pblack
## 1 01001 7.5 6.2 8.4 1.5285 23.3 6.4 0.17821466
## 2 01003 9.1 6.6 31.3 5.3309 21.0 6.9 0.09379320
## 3 01005 19.9 9.6 18.4 0.6665 32.3 0.9 0.46467381
## 4 01007 9.4 9.1 16.3 0.6170 26.6 3.0 0.15258735
## 5 01009 10.0 7.5 12.3 1.6897 22.4 1.0 0.01155366
## 6 01011 22.6 11.2 17.9 0.2871 47.5 0.0 0.74164607
## 7 01013 19.7 9.5 19.9 0.5530 28.5 1.9 0.42924145
## 8 01015 15.2 11.7 12.6 3.2818 18.1 4.0 0.20218696
## 9 01017 16.4 14.2 18.5 0.9764 24.7 2.7 0.39119119
## 10 01019 13.7 12.4 28.7 0.8802 39.2 4.1 0.04677043
## phispanic dissim_wb isolation_b interaction_bw TheileH RUCC region
## 1 0.023196313 0.2897522 0.27474922 0.6897242 0.08299523 2 3
## 2 0.038955350 0.3903035 0.18464112 0.7673910 0.09203860 2 3
## 3 0.045958338 0.1913727 0.50243412 0.4518021 0.04898714 6 3
## 4 0.005926581 0.4338891 0.33170220 0.6477188 0.11959518 6 3
## 5 0.074895929 0.3776626 0.02200736 0.9040929 0.05283605 2 3
## 6 0.016478989 0.2412244 0.71810200 0.2139202 0.05627216 6 3
## 7 0.002775784 0.3008681 0.50934745 0.4691282 0.09515667 7 3
## 8 0.030345970 0.4482778 0.40948665 0.5476673 0.17938028 3 3
## 9 0.015153706 0.2804150 0.44779526 0.5277732 0.06880993 5 3
## 10 0.012490272 0.2618203 0.06212876 0.9108150 0.03493550 6 3
## povz unempz vachousz housvalz nohsz baccz
## 1 -0.6960015 -0.396270981 -0.86071350 -0.1392619 0.21498840 0.1788121
## 2 -0.4069846 -0.277120527 1.36296990 0.4540987 -0.01936268 0.2851512
## 3 1.5438794 0.616507876 0.11032729 -0.2737761 1.13201436 -0.9909173
## 4 -0.3527939 0.467569809 -0.09359128 -0.2815005 0.55123125 -0.5442933
## 5 -0.2444126 -0.009032006 -0.48200759 -0.1141068 0.12328580 -0.9696495
## 6 2.0315954 1.093109691 0.06177525 -0.3329811 2.68076932 -1.1823276
## 7 1.5077523 0.586720263 0.25598341 -0.2914876 0.74482562 -0.7782392
## 8 0.6948923 1.242047759 -0.45287637 0.1343388 -0.31484883 -0.3316153
## 9 0.9116549 1.986738095 0.12003770 -0.2254165 0.35763688 -0.6080968
## 10 0.4239389 1.450561053 1.11049930 -0.2404284 1.83506760 -0.3103474
## blackz hispanicz diss.z iso.z theil.z noind
## 1 0.61587480 -0.42757342 -0.10411204 0.7875219 0.10912857 1
## 2 0.03724395 -0.30502357 0.47778652 0.2928025 0.21251998 1
## 3 2.57928686 -0.25056498 -0.67344308 2.0375776 -0.27968056 1
## 4 0.44022338 -0.56187115 0.73002043 1.1002102 0.52756994 1
## 5 -0.52643187 -0.02553236 0.40463251 -0.6001036 -0.23567651 1
## 6 4.47767497 -0.47981056 -0.38494688 3.2216563 -0.19639204 1
## 7 2.33643081 -0.58637327 -0.03978381 2.0755338 0.24816838 1
## 8 0.78018273 -0.37197425 0.81328881 1.5272692 1.21108350 1
## 9 2.07563154 -0.49011660 -0.15814735 1.7375945 -0.05304978 1
## 10 -0.28505356 -0.51082874 -0.26575674 -0.3798256 -0.44033071 1
Now I merge the county data back to the survey:
merged<-merge(x=brfss_11, y=joindata, by.x="cofips", by.y="cofips", all.x=T, incomparables=NA)
## Warning in merge.data.frame(x = brfss_11, y = joindata, by.x = "cofips", :
## column names 'cholchk', 'mrace' are duplicated in the result
merged$bmiz<-scale(merged$bmi, center=T, scale=T)
merged$agez<-scale(merged$age, center=T, scale=T)
merged$obese<-ifelse(merged$bmi>=30,1,0)
#names(merged)
merged<-merged[, c(1:2, 190:239)]
#names(merged)
merged[,c(8:52)]<-apply(merged[,c(8:52)],2,as.numeric)
## Warning in apply(merged[, c(8:52)], 2, as.numeric): NAs introduced by
## coercion
## Warning in apply(merged[, c(8:52)], 2, as.numeric): NAs introduced by
## coercion
#summary(merged)
#save(merged, file="~/Google Drive/dem7263/data/brfss_sub.Rdata")
Here, I make a copy of the county data to use for prediction:
othercos<-joindata
othercos$state<-substr(othercos$cofips,1,2)
Next, I fit the multilevel model. Here I fit the model with only county-level predictors. Since I want to estimate obesity prevalence, I use a logistic regression for obese==1:
fit.1<-glmer(obese~povz+vachousz+baccz+blackz+hispanicz+factor(region)+(1|state)+(1|cofips), family="binomial", data=merged, weights=cntywt/mean(cntywt))
## Warning: non-integer #successes in a binomial glm!
display(fit.1, detail=T)
## glmer(formula = obese ~ povz + vachousz + baccz + blackz + hispanicz +
## factor(region) + (1 | state) + (1 | cofips), data = merged,
## family = "binomial", weights = cntywt/mean(cntywt))
## coef.est coef.se z value Pr(>|z|)
## (Intercept) -0.88 0.04 -23.65 0.00
## povz 0.09 0.03 3.58 0.00
## vachousz -0.03 0.03 -1.08 0.28
## baccz -0.15 0.01 -10.42 0.00
## blackz 0.03 0.02 1.48 0.14
## hispanicz -0.03 0.02 -1.77 0.08
## factor(region)2 0.08 0.04 2.07 0.04
## factor(region)3 0.05 0.04 1.13 0.26
## factor(region)4 -0.13 0.04 -3.16 0.00
##
## Error terms:
## Groups Name Std.Dev.
## cofips (Intercept) 0.13
## state (Intercept) 0.02
## Residual 1.00
## ---
## number of obs: 220537, groups: cofips, 217; state, 46
## AIC = 223975, DIC = 223952.8
## deviance = 223952.8
Here I find predicted values for the counties with no individual data, but with observed county variables:
fitted<-predict(fit.1, newdata=othercos[, c("state", "cofips","povz", "vachousz", "baccz", "blackz", "hispanicz", "region")], allow.new.levels=T,type="response" )
#form a data frame with the fitted values, remember, the estimates are prevalences
fitted<-data.frame(cofips=othercos$cofips, state=othercos$state, estobese=100*fitted)
head(fitted)
## cofips state estobese
## 1 01001 01 29.55263
## 2 01003 01 28.08815
## 3 01005 01 38.90602
## 4 01007 01 32.09266
## 5 01009 01 32.96915
## 6 01011 01 42.33288
#merge county data to my estimates
mdat<-uscos@data
mdat<-merge(mdat, fitted, by.x="COFIPS", by.y="cofips", all.x=T, sort=F)
uscos@data<-mdat
#Map my estimates
library(sp)
library(RColorBrewer)
spplot(uscos, "estobese", at=quantile(uscos$estobese, na.rm=T), col.regions=brewer.pal(n=5, "Blues"), main="My Estimates of %Obesity Prevalence using BRFSS")
In this type of situation, one should try to find a way to “check” to see how bad your estimates are, so Iread in CDC’s estimates of obesity prevalence, estimated using much fancier methods. These are NOT TRUTH, but just come other person’s estiamtes.
The county-level estimates for more than 3,200 counties or county equivalents (e.g., parish, borough, municipio) in the 50 U.S. states, Puerto Rico, and the District of Columbia were developed using modern small area estimation techniques. This approach employs a statistical model that “borrows strength” in making an estimate for one county from BRFSS data collected in other counties. Bayesian multilevel modeling techniques were used to obtain these estimates. Multilevel logistic regression models with random effects of demographic variables (age 20–44, 45–64, 65+; race; sex) at the county-level and at the state level were developed. Models were fit using a simulation method known as Markov Chain Monte Carlo. The model specification is given in Cadwell, et al.
For all years, rates were age adjusted by calculating age specific rates for the following three age groups, 20–44, 45–64, 65+. A weighted sum based on the distribution of these three age groups from the 2000 census was then used to adjust the rates by age. Cadwell BL, Thompson TJ, Boyle JP, Barker LE (2010). Bayesian small area estimates of diabetes prevalence by U.S. county, 2005. Journal of Data Science 8(1): 173-188. Barker LE, Thompson TJ, Kirtland KA, Boyle JP, Geiss LS, McCauley MM, Albright AL (2013). Bayesian Small Area Estimates of Diabetes Incidence by United States County, 2009. Journal of Data Science 11:249–269.
I got these estimates from here
obdat<-read.csv("~/Google Drive/dem7283/data/OB_PREV_ALL_STATES_2010.csv")
obdat$cofips<-sprintf("%05d", obdat$FIPS.Codes )
obdat$percent<-as.numeric(as.character(obdat$percent))
## Warning: NAs introduced by coercion
names(obdat)
## [1] "State"
## [2] "FIPS.Codes"
## [3] "County"
## [4] "number"
## [5] "percent"
## [6] "lower.confidence.limit"
## [7] "upper.confidence.limit"
## [8] "age.adjusted.percent"
## [9] "age.adjusted.lower.confidence.limit"
## [10] "age.adjusted.upper..confidence.limit"
## [11] "cofips"
head(obdat)
## State FIPS.Codes County number percent lower.confidence.limit
## 1 Alabama 1001 Autauga County 11761 30.5 25.5
## 2 Alabama 1003 Baldwin County 36348 26.6 23.7
## 3 Alabama 1005 Barbour County 7743 37.3 32.1
## 4 Alabama 1007 Bibb County 5884 34.3 28.8
## 5 Alabama 1009 Blount County 12690 30.4 25
## 6 Alabama 1011 Bullock County 3446 42.1 34.4
## upper.confidence.limit age.adjusted.percent
## 1 35.7 30.4
## 2 29.6 26.8
## 3 42.6 37.4
## 4 40.1 34.2
## 5 36.3 30.3
## 6 50 41.9
## age.adjusted.lower.confidence.limit age.adjusted.upper..confidence.limit
## 1 25.3 35.5
## 2 23.8 30.1
## 3 32 42.9
## 4 28.6 40.1
## 5 24.9 36.5
## 6 34.1 50.1
## cofips
## 1 01001
## 2 01003
## 3 01005
## 4 01007
## 5 01009
## 6 01011
#merge these data to the map's data
mdat2<-uscos@data
mdat2<-merge(mdat2, obdat, by.x="COFIPS", by.y="cofips", all.x=T, sort=F)
uscos@data<-mdat2
#compare the estimates
summary(uscos$estobese)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 7.406 27.640 30.410 30.270 33.010 48.650 1
summary(uscos$percent)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.10 28.30 30.70 30.58 33.20 47.90
Pretty close as judged by simple numerics! Now I use some other standard method to see how close my estimates are to the CDC’s:
uscos$pdiffest<-100*((uscos$estobese-uscos$percent)/uscos$percent)
hist(uscos$pdiffest, xlim=c(min(uscos$pdiffest, na.rm=T), max(uscos$pdiffest, na.rm=T)))
#so on average, I was pretty close to the CDC's estimates, I should be I'm using the same data as they do!
#Here are various measures of how bad my estimate was compared to CDC's
#MAPE
mean(abs(uscos$pdiffest),na.rm=T)
## [1] 9.227811
#9.2% off on average
#MAE
mean(abs(uscos$estobese- uscos$percent), na.rm=T)
## [1] 2.752269
#on avereage, I mis-estimated bmi by 2.8 percentage points
#MPE
mean(uscos$pdiffest,na.rm=T)
## [1] -0.4030874
#on average for counties, I underestimated the % obese by .4%
#My estimates vs CDC's
library(ggplot2)
ggplot(uscos@data, aes(x=percent, y=estobese))+geom_point(alpha=.5)+theme_gray()+xlab("CDC Estimate")+ylab("My Estimate")
## Warning: Removed 1 rows containing missing values (geom_point).
ggplot(uscos@data, aes(x=percent, y=estobese)) + stat_binhex()+xlab("CDC Estimate")+ylab("My Estimate")
## Warning: Removed 1 rows containing missing values (stat_hexbin).
plot(uscos$estobese, uscos$percent, pch=".", ylab="CDC Estimate", xlab="My Estimate")
abline(coef(lm(percent~estobese, uscos)), col=2, lwd=2)
#Map the differences
spplot(uscos, "pdiffest", at=quantile(uscos$pdiffest,na.rm=T), col.regions=brewer.pal(n=5, "RdBu"), main="% Difference in My Estimates vs CDC's Estimates")
#Red = I estimated too low, blue=I estimated to high
#Moreover, I don't see any big swaths of the country thate are all red or all blue
#Ok, but how did I do at the state level?
boxplot(pdiffest~state, uscos)
abline(h=0, col=2)
#Not bad!!
plot(tapply(uscos$estobese, uscos$state, mean), tapply(uscos$percent, uscos$state, mean), xlab="My Estimate", ylab="CDC Estimate", pch="0")
abline(coef(lm(tapply(uscos$percent, uscos$state, mean)~ tapply(uscos$estobese, uscos$state, mean))), lwd=2, col=2)