This example will illustrate a way to combine individual survey data with aggregate data on counties to produce a county level estimate of basically any health indicator measured using the BRFSS. The framework I use below takes observed individual level survey responses from the BRFSS and merges these to county level variables from the ACS. This allows me to estimate the overall regression model for county-level prevalence, controlling for higher level variables. Then, I can use this equation for prediction for counties where I have not observed survey respondents, but I have observed the county level characteristics.

In this example, I estimate county level obesity rates and compare my estimates to those from the CDC.

The example merges data from the 2011 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data. Link and the 2010 American Community Survey 5-year estimates at the county level. More details on these data are found below.

library(lme4)
## Loading required package: Matrix
## Loading required package: Rcpp
library(arm)
## Loading required package: MASS
## 
## arm (Version 1.7-07, built: 2014-8-27)
## 
## Working directory is /Users/ozd504/Google Drive/dem7283/Rcode/Rcode15
library(car)
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:arm':
## 
##     logit
#load county map data
library(maptools)
## Loading required package: sp
## Checking rgeos availability: TRUE
uscos<-readShapePoly("~/Google Drive/dem7283/data/usdata_mort.shp")

#load brfss
load("~/Google Drive/dem7283/data/brfss_11.Rdata")
nams<-names(brfss_11)
newnames<-gsub("_", "", nams)
names(brfss_11)<-tolower(newnames)

brfss_11$statefip<-sprintf("%02d", brfss_11$state )
brfss_11$cofip<-sprintf("%03d", brfss_11$cnty )
brfss_11$cofips<-as.factor(paste(brfss_11$statefip, brfss_11$cofip, sep=""))

#just for brevity, I just select respondents with non missing weights
brfss_11<-brfss_11[is.na(brfss_11$cntywt)==F,]
#insurance
brfss_11$ins<-ifelse(brfss_11$hlthpln1==1,1,0)
#smoking currently
brfss_11$smoke<-recode(brfss_11$smoker3, recodes="1:2=1; 3:4=0; else=NA")
#low physical activity
brfss_11$lowact<-recode(brfss_11$pacat, recodes="1:2=0; 3:4=1; else=NA")
#high blood pressure
brfss_11$hbp<-recode(brfss_11$bphigh4, recodes="1=1; 2:4=0; else=NA")
#high cholesterol
brfss_11$hc<-recode(brfss_11$toldhi2, recodes="1=1; 2=0; else=NA")
#bmi
brfss_11$bmi<-ifelse(is.na(brfss_11$bmi5)==T, NA, brfss_11$bmi5/100)
#poor or fair health
brfss_11$badhealth<-ifelse(brfss_11$genhlth %in% c(4,5),1,0)

#race
brfss_11$black<-recode(brfss_11$racegr2, recodes="2=1; 9=NA; else=0", as.factor.result=T)
brfss_11$white<-recode(brfss_11$racegr2, recodes="1=1; 9=NA; else=0", as.factor.result=T)
brfss_11$other<-recode(brfss_11$racegr2, recodes="3:4=1; 9=NA; else=0", as.factor.result=T)
brfss_11$hispanic<-recode(brfss_11$racegr2, recodes="5=1; 9=NA; else=0", as.factor.result=T)

#have a personal doctor?
brfss_11$doc<-recode(brfss_11$persdoc2, recodes="1:2=1; 3=0; else=NA", as.factor.result=F)

#needed care in last year but couldn't get it because of cost
brfss_11$medacc<-recode(brfss_11$medcost, recodes="1=1;2=0;else=NA")

#education level
brfss_11$lths<-recode(brfss_11$educa, recodes="1:3=1;9=NA; else=0", as.factor.result=F)
brfss_11$coll<-recode(brfss_11$educa, recodes="5:6=1;9=NA; else=0", as.factor.result=F)

#employment
brfss_11$employ<-recode(brfss_11$employ, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor.result=T)
brfss_11$employ<-relevel(brfss_11$employ, ref='Employed')

#marital status
brfss_11$marst<-recode(brfss_11$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T)
brfss_11$marst<-relevel(brfss_11$marst, ref='married')

#income
brfss_11$inc<-as.factor(ifelse(brfss_11$incomg==9, NA, brfss_11$incomg))

#Age cut into intervals
brfss_11$agec<-cut(brfss_11$age, breaks=c(0,24,39,59,79,99))
brfss_11$obese<-ifelse(brfss_11$bmi>=30,1,0)
tab<-data.frame(xtabs(I(obese==1)~agec+sex+racegr2+cofips, brfss_11  ))
head(tab, n=20)
##       agec sex racegr2 cofips Freq
## 1   (0,24]   1       1  01073    1
## 2  (24,39]   1       1  01073    9
## 3  (39,59]   1       1  01073   15
## 4  (59,79]   1       1  01073   17
## 5  (79,99]   1       1  01073    5
## 6   (0,24]   2       1  01073    1
## 7  (24,39]   2       1  01073    5
## 8  (39,59]   2       1  01073   15
## 9  (59,79]   2       1  01073   33
## 10 (79,99]   2       1  01073    5
## 11  (0,24]   1       2  01073    1
## 12 (24,39]   1       2  01073    2
## 13 (39,59]   1       2  01073   18
## 14 (59,79]   1       2  01073    7
## 15 (79,99]   1       2  01073    0
## 16  (0,24]   2       2  01073    7
## 17 (24,39]   2       2  01073   17
## 18 (39,59]   2       2  01073   33
## 19 (59,79]   2       2  01073   29
## 20 (79,99]   2       2  01073    3

Now we will begin fitting the multilevel regression model with the county, that the person lives in being the higher level I will also add in some Census variables from the ACS 2010 5 Year estimates,

#load in ACS data from Factfinder
#economic data
acsecon<-read.csv("~/Google Drive/dem7283/data/aff_download/ACS_10_5YR_DP03_with_ann.csv")
acsecon$povrate<-acsecon[, "HC03_VC156"]
acsecon$unemployed<-acsecon[, "HC03_VC13"]
acsecon$pinsurance<-acsecon[,"HC03_VC127"]
acsecon$cofips<-substr(acsecon$GEO.id, 10,14)
acsecon<-acsecon[, c("cofips", "povrate", "unemployed")]
head(acsecon)
##   cofips povrate unemployed
## 1  01001     7.5        6.2
## 2  01003     9.1        6.6
## 3  01005    19.9        9.6
## 4  01007     9.4        9.1
## 5  01009    10.0        7.5
## 6  01011    22.6       11.2
#housing data
acshous<-read.csv("~/Google Drive/dem7283/data/aff_download/ACS_10_5YR_DP04_with_ann.csv")
acshous$cofips<-substr(acshous$GEO.id, 10,14)
acshous$pvacanthouse<-acshous[, "HC03_VC05"]
acshous$houseval<-as.numeric(acshous[,"HC01_VC116"])/10000
acshous<-acshous[, c("cofips", "pvacanthouse", "houseval")]
head(acshous)
##   cofips pvacanthouse houseval
## 1  01001          8.4   1.5285
## 2  01003         31.3   5.3309
## 3  01005         18.4   0.6665
## 4  01007         16.3   0.6170
## 5  01009         12.3   1.6897
## 6  01011         17.9   0.2871
#education data
acssoc<-read.csv("~/Google Drive/dem7283/data/aff_download/ACS_10_5YR_S1501_with_ann.csv")
acssoc$cofips<-substr(acssoc$GEO.id, 10,14)
acssoc$pnohs<-acssoc[, "HC01_EST_VC02"]
acssoc$pbacc<-acssoc[,"HC01_EST_VC05"]
acssoc<-acssoc[, c("cofips", "pnohs", "pbacc")]

#race data
acsrace<-read.csv("~/Google Drive/dem7283/data/aff_download/ACS_10_5YR_B03002_with_ann.csv")
acsrace$pblack<-acsrace[,"HD01_VD04"]/acsrace[,"HD01_VD01"]
acsrace$phispanic<-acsrace[,"HD01_VD12"]/acsrace[,"HD01_VD01"]
acsrace<-acsrace[, c("pblack", "phispanic")]

#put these together
joindata<-cbind(acsecon, acshous[,-1], acssoc[,-1], acsrace)
head(joindata)
##   cofips povrate unemployed pvacanthouse houseval pnohs pbacc     pblack
## 1  01001     7.5        6.2          8.4   1.5285  23.3   6.4 0.17821466
## 2  01003     9.1        6.6         31.3   5.3309  21.0   6.9 0.09379320
## 3  01005    19.9        9.6         18.4   0.6665  32.3   0.9 0.46467381
## 4  01007     9.4        9.1         16.3   0.6170  26.6   3.0 0.15258735
## 5  01009    10.0        7.5         12.3   1.6897  22.4   1.0 0.01155366
## 6  01011    22.6       11.2         17.9   0.2871  47.5   0.0 0.74164607
##     phispanic
## 1 0.023196313
## 2 0.038955350
## 3 0.045958338
## 4 0.005926581
## 5 0.074895929
## 6 0.016478989
load("~/Google Drive/dem7283/data/segregation_county.Rdata")
rucc<-uscos@data[, c("COFIPS", "RUCC", "region")]

joindata<-merge(joindata, county_dat, by.x="cofips", by.y="cofips", all.x=T)
joindata<-merge(joindata, rucc, by.x="cofips", by.y="COFIPS", all.x=T)
head(joindata)
##   cofips povrate unemployed pvacanthouse houseval pnohs pbacc     pblack
## 1  01001     7.5        6.2          8.4   1.5285  23.3   6.4 0.17821466
## 2  01003     9.1        6.6         31.3   5.3309  21.0   6.9 0.09379320
## 3  01005    19.9        9.6         18.4   0.6665  32.3   0.9 0.46467381
## 4  01007     9.4        9.1         16.3   0.6170  26.6   3.0 0.15258735
## 5  01009    10.0        7.5         12.3   1.6897  22.4   1.0 0.01155366
## 6  01011    22.6       11.2         17.9   0.2871  47.5   0.0 0.74164607
##     phispanic dissim_wb isolation_b interaction_bw    TheileH RUCC region
## 1 0.023196313 0.2897522  0.27474922      0.6897242 0.08299523    2      3
## 2 0.038955350 0.3903035  0.18464112      0.7673910 0.09203860    2      3
## 3 0.045958338 0.1913727  0.50243412      0.4518021 0.04898714    6      3
## 4 0.005926581 0.4338891  0.33170220      0.6477188 0.11959518    6      3
## 5 0.074895929 0.3776626  0.02200736      0.9040929 0.05283605    2      3
## 6 0.016478989 0.2412244  0.71810200      0.2139202 0.05627216    6      3
summary(joindata)
##     cofips             povrate        unemployed     pvacanthouse  
##  Length:3143        Min.   : 0.00   Min.   : 0.00   Min.   : 1.70  
##  Class :character   1st Qu.: 7.40   1st Qu.: 5.40   1st Qu.:10.20  
##  Mode  :character   Median :10.40   Median : 7.30   Median :14.50  
##                     Mean   :11.35   Mean   : 7.53   Mean   :17.26  
##                     3rd Qu.:14.10   3rd Qu.: 9.25   3rd Qu.:20.90  
##                     Max.   :44.90   Max.   :30.90   Max.   :83.30  
##                                                                    
##     houseval            pnohs          pbacc          pblack        
##  Min.   :  0.0000   22.7   :  24   0.0    : 221   Min.   :0.000000  
##  1st Qu.:  0.3214   19.2   :  23   4.1    :  46   1st Qu.:0.004214  
##  Median :  0.7283   15.2   :  22   2.8    :  43   Median :0.019569  
##  Mean   :  2.4209   15.3   :  21   2.7    :  40   Mean   :0.088359  
##  3rd Qu.:  1.8512   15.4   :  21   4.7    :  40   3rd Qu.:0.100550  
##  Max.   :155.2091   15.7   :  21   3.3    :  39   Max.   :0.861355  
##                     (Other):3011   (Other):2714                     
##    phispanic         dissim_wb       isolation_b       interaction_bw  
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.000000   Min.   :0.0000  
##  1st Qu.:0.01440   1st Qu.:0.1938   1st Qu.:0.005815   1st Qu.:0.6519  
##  Median :0.03024   Median :0.2996   Median :0.037123   Median :0.8459  
##  Mean   :0.07818   Mean   :0.3077   Mean   :0.131310   Mean   :0.7718  
##  3rd Qu.:0.07654   3rd Qu.:0.4222   3rd Qu.:0.197412   3rd Qu.:0.9427  
##  Max.   :0.98328   Max.   :0.9243   Max.   :0.865954   Max.   :0.9922  
##                                                                        
##     TheileH             RUCC       region    
##  Min.   :0.00000   Min.   :0.00   1   : 217  
##  1st Qu.:0.01463   1st Qu.:3.00   2   :1055  
##  Median :0.04631   Median :6.00   3   :1382  
##  Mean   :0.07345   Mean   :5.62   4   : 413  
##  3rd Qu.:0.10166   3rd Qu.:8.00   NA's:  76  
##  Max.   :0.97694   Max.   :9.00              
##                    NA's   :76
#z-score everything!!
joindata$povz<-scale(joindata$povrate, center=T, scale=T)
joindata$unempz<-scale(joindata$unemployed, center=T, scale=T)
joindata$vachousz<-scale(joindata$pvacanthouse, center=T, scale=T)
joindata$housvalz<-scale(joindata$houseval, center=T, scale=T)
joindata$nohsz<-scale(as.numeric(as.character(joindata$pnohs)), center=T, scale=T)
## Warning in scale(as.numeric(as.character(joindata$pnohs)), center = T,
## scale = T): NAs introduced by coercion
joindata$baccz<-scale(as.numeric(as.character(joindata$pbacc)), center=T, scale=T)
## Warning in scale(as.numeric(as.character(joindata$pbacc)), center = T,
## scale = T): NAs introduced by coercion
joindata$blackz<-scale(joindata$pblack, center=T, scale=T)
joindata$hispanicz<-scale(joindata$phispanic, center=T, scale=T)
joindata$diss.z<-scale(joindata$dissim_wb, center=T, scale=T)
joindata$iso.z<-scale(joindata$isolation_b, center=T, scale=T)
joindata$theil.z<-scale(joindata$TheileH, center=T, scale=T)
joindata$noind<-ifelse(joindata$cofips%in%unique(brfss_11$cofips), 0,1)
joindata$cofips<-as.factor(joindata$cofips)

head(joindata, n=10)
##    cofips povrate unemployed pvacanthouse houseval pnohs pbacc     pblack
## 1   01001     7.5        6.2          8.4   1.5285  23.3   6.4 0.17821466
## 2   01003     9.1        6.6         31.3   5.3309  21.0   6.9 0.09379320
## 3   01005    19.9        9.6         18.4   0.6665  32.3   0.9 0.46467381
## 4   01007     9.4        9.1         16.3   0.6170  26.6   3.0 0.15258735
## 5   01009    10.0        7.5         12.3   1.6897  22.4   1.0 0.01155366
## 6   01011    22.6       11.2         17.9   0.2871  47.5   0.0 0.74164607
## 7   01013    19.7        9.5         19.9   0.5530  28.5   1.9 0.42924145
## 8   01015    15.2       11.7         12.6   3.2818  18.1   4.0 0.20218696
## 9   01017    16.4       14.2         18.5   0.9764  24.7   2.7 0.39119119
## 10  01019    13.7       12.4         28.7   0.8802  39.2   4.1 0.04677043
##      phispanic dissim_wb isolation_b interaction_bw    TheileH RUCC region
## 1  0.023196313 0.2897522  0.27474922      0.6897242 0.08299523    2      3
## 2  0.038955350 0.3903035  0.18464112      0.7673910 0.09203860    2      3
## 3  0.045958338 0.1913727  0.50243412      0.4518021 0.04898714    6      3
## 4  0.005926581 0.4338891  0.33170220      0.6477188 0.11959518    6      3
## 5  0.074895929 0.3776626  0.02200736      0.9040929 0.05283605    2      3
## 6  0.016478989 0.2412244  0.71810200      0.2139202 0.05627216    6      3
## 7  0.002775784 0.3008681  0.50934745      0.4691282 0.09515667    7      3
## 8  0.030345970 0.4482778  0.40948665      0.5476673 0.17938028    3      3
## 9  0.015153706 0.2804150  0.44779526      0.5277732 0.06880993    5      3
## 10 0.012490272 0.2618203  0.06212876      0.9108150 0.03493550    6      3
##          povz       unempz    vachousz   housvalz       nohsz      baccz
## 1  -0.6960015 -0.396270981 -0.86071350 -0.1392619  0.21498840  0.1788121
## 2  -0.4069846 -0.277120527  1.36296990  0.4540987 -0.01936268  0.2851512
## 3   1.5438794  0.616507876  0.11032729 -0.2737761  1.13201436 -0.9909173
## 4  -0.3527939  0.467569809 -0.09359128 -0.2815005  0.55123125 -0.5442933
## 5  -0.2444126 -0.009032006 -0.48200759 -0.1141068  0.12328580 -0.9696495
## 6   2.0315954  1.093109691  0.06177525 -0.3329811  2.68076932 -1.1823276
## 7   1.5077523  0.586720263  0.25598341 -0.2914876  0.74482562 -0.7782392
## 8   0.6948923  1.242047759 -0.45287637  0.1343388 -0.31484883 -0.3316153
## 9   0.9116549  1.986738095  0.12003770 -0.2254165  0.35763688 -0.6080968
## 10  0.4239389  1.450561053  1.11049930 -0.2404284  1.83506760 -0.3103474
##         blackz   hispanicz      diss.z      iso.z     theil.z noind
## 1   0.61587480 -0.42757342 -0.10411204  0.7875219  0.10912857     1
## 2   0.03724395 -0.30502357  0.47778652  0.2928025  0.21251998     1
## 3   2.57928686 -0.25056498 -0.67344308  2.0375776 -0.27968056     1
## 4   0.44022338 -0.56187115  0.73002043  1.1002102  0.52756994     1
## 5  -0.52643187 -0.02553236  0.40463251 -0.6001036 -0.23567651     1
## 6   4.47767497 -0.47981056 -0.38494688  3.2216563 -0.19639204     1
## 7   2.33643081 -0.58637327 -0.03978381  2.0755338  0.24816838     1
## 8   0.78018273 -0.37197425  0.81328881  1.5272692  1.21108350     1
## 9   2.07563154 -0.49011660 -0.15814735  1.7375945 -0.05304978     1
## 10 -0.28505356 -0.51082874 -0.26575674 -0.3798256 -0.44033071     1

Now I merge the county data back to the survey:

merged<-merge(x=brfss_11, y=joindata, by.x="cofips", by.y="cofips", all.x=T, incomparables=NA)
## Warning in merge.data.frame(x = brfss_11, y = joindata, by.x = "cofips", :
## column names 'cholchk', 'mrace' are duplicated in the result
merged$bmiz<-scale(merged$bmi, center=T, scale=T)
merged$agez<-scale(merged$age, center=T, scale=T)
merged$obese<-ifelse(merged$bmi>=30,1,0)
#names(merged)
merged<-merged[, c(1:2, 190:239)]
#names(merged)
merged[,c(8:52)]<-apply(merged[,c(8:52)],2,as.numeric)
## Warning in apply(merged[, c(8:52)], 2, as.numeric): NAs introduced by
## coercion
## Warning in apply(merged[, c(8:52)], 2, as.numeric): NAs introduced by
## coercion
#summary(merged)
#save(merged, file="~/Google Drive/dem7263/data/brfss_sub.Rdata")

Here, I make a copy of the county data to use for prediction:

othercos<-joindata
othercos$state<-substr(othercos$cofips,1,2)

Next, I fit the multilevel model. Here I fit the model with only county-level predictors. Since I want to estimate obesity prevalence, I use a logistic regression for obese==1:

fit.1<-glmer(obese~povz+vachousz+baccz+blackz+hispanicz+factor(region)+(1|state)+(1|cofips), family="binomial", data=merged, weights=cntywt/mean(cntywt))
## Warning: non-integer #successes in a binomial glm!
display(fit.1, detail=T)
## glmer(formula = obese ~ povz + vachousz + baccz + blackz + hispanicz + 
##     factor(region) + (1 | state) + (1 | cofips), data = merged, 
##     family = "binomial", weights = cntywt/mean(cntywt))
##                 coef.est coef.se z value Pr(>|z|)
## (Intercept)      -0.88     0.04  -23.65    0.00  
## povz              0.09     0.03    3.58    0.00  
## vachousz         -0.03     0.03   -1.08    0.28  
## baccz            -0.15     0.01  -10.42    0.00  
## blackz            0.03     0.02    1.48    0.14  
## hispanicz        -0.03     0.02   -1.77    0.08  
## factor(region)2   0.08     0.04    2.07    0.04  
## factor(region)3   0.05     0.04    1.13    0.26  
## factor(region)4  -0.13     0.04   -3.16    0.00  
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  cofips   (Intercept) 0.13    
##  state    (Intercept) 0.02    
##  Residual             1.00    
## ---
## number of obs: 220537, groups: cofips, 217; state, 46
## AIC = 223975, DIC = 223952.8
## deviance = 223952.8

Here I find predicted values for the counties with no individual data, but with observed county variables:

fitted<-predict(fit.1, newdata=othercos[, c("state", "cofips","povz", "vachousz", "baccz", "blackz", "hispanicz", "region")], allow.new.levels=T,type="response" )

#form a data frame with the fitted values, remember, the estimates are prevalences
fitted<-data.frame(cofips=othercos$cofips, state=othercos$state, estobese=100*fitted)
head(fitted)
##   cofips state estobese
## 1  01001    01 29.55263
## 2  01003    01 28.08815
## 3  01005    01 38.90602
## 4  01007    01 32.09266
## 5  01009    01 32.96915
## 6  01011    01 42.33288
#merge county data to my estimates
mdat<-uscos@data
mdat<-merge(mdat, fitted, by.x="COFIPS", by.y="cofips", all.x=T, sort=F)
uscos@data<-mdat

#Map my estimates
library(sp)
library(RColorBrewer)
spplot(uscos, "estobese", at=quantile(uscos$estobese, na.rm=T), col.regions=brewer.pal(n=5, "Blues"), main="My Estimates of %Obesity Prevalence using BRFSS")

In this type of situation, one should try to find a way to “check” to see how bad your estimates are, so Iread in CDC’s estimates of obesity prevalence, estimated using much fancier methods. These are NOT TRUTH, but just come other person’s estiamtes.

The county-level estimates for more than 3,200 counties or county equivalents (e.g., parish, borough, municipio) in the 50 U.S. states, Puerto Rico, and the District of Columbia were developed using modern small area estimation techniques. This approach employs a statistical model that “borrows strength” in making an estimate for one county from BRFSS data collected in other counties. Bayesian multilevel modeling techniques were used to obtain these estimates. Multilevel logistic regression models with random effects of demographic variables (age 20–44, 45–64, 65+; race; sex) at the county-level and at the state level were developed. Models were fit using a simulation method known as Markov Chain Monte Carlo. The model specification is given in Cadwell, et al.

For all years, rates were age adjusted by calculating age specific rates for the following three age groups, 20–44, 45–64, 65+. A weighted sum based on the distribution of these three age groups from the 2000 census was then used to adjust the rates by age. Cadwell BL, Thompson TJ, Boyle JP, Barker LE (2010). Bayesian small area estimates of diabetes prevalence by U.S. county, 2005. Journal of Data Science 8(1): 173-188. Barker LE, Thompson TJ, Kirtland KA, Boyle JP, Geiss LS, McCauley MM, Albright AL (2013). Bayesian Small Area Estimates of Diabetes Incidence by United States County, 2009. Journal of Data Science 11:249–269.

I got these estimates from here

obdat<-read.csv("~/Google Drive/dem7283/data/OB_PREV_ALL_STATES_2010.csv")
obdat$cofips<-sprintf("%05d", obdat$FIPS.Codes )
obdat$percent<-as.numeric(as.character(obdat$percent))
## Warning: NAs introduced by coercion
names(obdat)
##  [1] "State"                               
##  [2] "FIPS.Codes"                          
##  [3] "County"                              
##  [4] "number"                              
##  [5] "percent"                             
##  [6] "lower.confidence.limit"              
##  [7] "upper.confidence.limit"              
##  [8] "age.adjusted.percent"                
##  [9] "age.adjusted.lower.confidence.limit" 
## [10] "age.adjusted.upper..confidence.limit"
## [11] "cofips"
head(obdat)
##     State FIPS.Codes         County number percent lower.confidence.limit
## 1 Alabama       1001 Autauga County  11761    30.5                   25.5
## 2 Alabama       1003 Baldwin County  36348    26.6                   23.7
## 3 Alabama       1005 Barbour County   7743    37.3                   32.1
## 4 Alabama       1007    Bibb County   5884    34.3                   28.8
## 5 Alabama       1009  Blount County  12690    30.4                     25
## 6 Alabama       1011 Bullock County   3446    42.1                   34.4
##   upper.confidence.limit age.adjusted.percent
## 1                   35.7                 30.4
## 2                   29.6                 26.8
## 3                   42.6                 37.4
## 4                   40.1                 34.2
## 5                   36.3                 30.3
## 6                     50                 41.9
##   age.adjusted.lower.confidence.limit age.adjusted.upper..confidence.limit
## 1                                25.3                                 35.5
## 2                                23.8                                 30.1
## 3                                  32                                 42.9
## 4                                28.6                                 40.1
## 5                                24.9                                 36.5
## 6                                34.1                                 50.1
##   cofips
## 1  01001
## 2  01003
## 3  01005
## 4  01007
## 5  01009
## 6  01011
#merge these data to the map's data
mdat2<-uscos@data
mdat2<-merge(mdat2, obdat, by.x="COFIPS", by.y="cofips", all.x=T, sort=F)
uscos@data<-mdat2

#compare the estimates
summary(uscos$estobese)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   7.406  27.640  30.410  30.270  33.010  48.650       1
summary(uscos$percent)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.10   28.30   30.70   30.58   33.20   47.90

Pretty close as judged by simple numerics! Now I use some other standard method to see how close my estimates are to the CDC’s:

uscos$pdiffest<-100*((uscos$estobese-uscos$percent)/uscos$percent)
hist(uscos$pdiffest, xlim=c(min(uscos$pdiffest, na.rm=T), max(uscos$pdiffest, na.rm=T)))

#so on average, I was pretty close to the CDC's estimates, I should be I'm using the same data as they do!

#Here are various measures of how bad my estimate was compared to CDC's
#MAPE
mean(abs(uscos$pdiffest),na.rm=T)
## [1] 9.227811
#9.2% off on average

#MAE
mean(abs(uscos$estobese- uscos$percent), na.rm=T)
## [1] 2.752269
#on avereage, I mis-estimated bmi by 2.8 percentage points

#MPE
mean(uscos$pdiffest,na.rm=T)
## [1] -0.4030874
#on average for counties, I underestimated the % obese by .4%

#My estimates vs CDC's
library(ggplot2)
ggplot(uscos@data, aes(x=percent, y=estobese))+geom_point(alpha=.5)+theme_gray()+xlab("CDC Estimate")+ylab("My Estimate")
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(uscos@data, aes(x=percent, y=estobese)) + stat_binhex()+xlab("CDC Estimate")+ylab("My Estimate")
## Warning: Removed 1 rows containing missing values (stat_hexbin).

plot(uscos$estobese, uscos$percent, pch=".", ylab="CDC Estimate", xlab="My Estimate")
abline(coef(lm(percent~estobese, uscos)), col=2, lwd=2)

#Map the differences
spplot(uscos, "pdiffest", at=quantile(uscos$pdiffest,na.rm=T), col.regions=brewer.pal(n=5, "RdBu"), main="% Difference in My Estimates vs CDC's Estimates")

#Red = I estimated too low, blue=I estimated to high
#Moreover, I don't see any big swaths of the country thate are all red or all blue

#Ok, but how did I do at the state level?
boxplot(pdiffest~state, uscos)
abline(h=0, col=2)

#Not bad!!

plot(tapply(uscos$estobese, uscos$state, mean), tapply(uscos$percent, uscos$state, mean), xlab="My Estimate", ylab="CDC Estimate", pch="0")
abline(coef(lm(tapply(uscos$percent, uscos$state, mean)~ tapply(uscos$estobese, uscos$state, mean))), lwd=2, col=2)