In this assignment, I apply Bayesian Methods from a sample that includes data from 4,871 individuals born between 1978 and 1988 (ages 26-38) in the State of Texas. Data was gathered from ACS-IPUMS 2014-2016.

The outcome is set as adjusted income and the covariates are sex, being born in the US, number of years in the US, race and education level.

Libraries

library (car)

## Loading required package: carData

library(brms)

## Loading required package: Rcpp

## Loading required package: ggplot2

## Loading 'brms' package (version 2.6.0). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
## Run theme_set(theme_default()) to use the default bayesplot theme.

library(pscl)

## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

library(rstan)

## Loading required package: StanHeaders

## rstan (Version 2.18.1, GitRev: 2e1f913d3ca3)

## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)

library(haven)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

load("C:/Users/PCMcC/Documents/Hierarchical Models/hierarchichal methods class/ipums_subset2.Rdata")
sub3 <- filter(sub2, birthyr %in% 1978:1988, year %in% 2014:2016,  gq != 3, gq != 4, gq!=6, statefip==48)
save(sub3, file="C:/Users/PCMcC/Documents/Hierarchical Models/hierarchichal methods class/ipums_subset3.Rdata")
rm(sub2)
#gc()

#set.seed(1115)

Filtering Data

names(sub3) #print the column names

##  [1] "year"       "datanum"    "serial"     "hhwt"       "hhtype"    
##  [6] "stateicp"   "statefip"   "county"     "countyfips" "metro"     
## [11] "city"       "puma"       "gq"         "hhincome"   "foodstmp"  
## [16] "pernum"     "perwt"      "sex"        "age"        "birthyr"   
## [21] "fertyr"     "race"       "raced"      "hispan"     "hispand"   
## [26] "bpl"        "bpld"       "citizen"    "yrsusa1"    "language"  
## [31] "languaged"  "school"     "educ"       "educd"      "gradeatt"  
## [36] "gradeattd"  "degfield"   "degfieldd"  "empstat"    "empstatd"  
## [41] "labforce"   "incwage"    "poverty"    "trantime"

myvars<-c("statefip","county","countyfips","metro","city","puma","hhincome","incwage","educd","empstatd","gradeatt","race", "hispan", "hispand","raced","sex","empstat","age","trantime","poverty","birthyr","year","gq","hhtype","yrsusa1","bpl")
sub3<-sub3[,myvars]
#rm(sub3); gc()

#RACE/ETHNICITY
sub3$hispanic <- Recode(sub3$hispan, recodes = "9=NA; 1:4='Hispanic'; 0='NonHispanic'")
sub3$race_rec <- Recode(sub3$race, recodes = "1='White'; 2='Black'; 3='Other'; 4:6='Asian'; 7:9='Other'")
sub3$race_eth <- interaction(sub3$hispanic, sub3$race_rec, sep = "_")
sub3$race_eth  <- as.factor(ifelse(substr(as.character(sub3$race_eth),1,8) == "Hispanic", "Hispanic", as.character(sub3$race_eth)))
sub3$race_eth <- relevel(sub3$race_eth, ref = "NonHispanic_White")

#SEX
sub3$sex2<- Recode (sub3$sex, recodes= "1=1; 2=2; else=NA")
sub3$male<- Recode (sub3$sex, recodes= "1=1; 2=0; else=NA")
sub3$female<- Recode (sub3$sex, recodes= "2=1; 1=0; else=NA")

#CITIZEN STATUS (1=born in the US or to US citizens, 0=not US citizen)
#sub3$citizenstatus <- Recode(sub3$citizen, recodes = "1:2=1; 3:5=0; else=NA")
#sub3$usborn <-Recode(sub3$citizenstatus, recodes="1=1; 0=0")
#sub3$notusborn <-Recode(sub3$citizenstatus, recodes="0=1; 1=0")
#table(sub3$usborn)


#EDUCATION LEVEL 
sub3$educ_level<- Recode(sub3$educd, recodes = "2:61='0Less_than_HS';62:64='1_HSD/GED';65:80='2_somecollege';90:100='2_somecollege'; 81:83='3_AssocDegree';101='4_bachelordegree'; 110:116='4_BAplus_GradDegree'; else=NA")
sub3$educ_level<-as.factor(sub3$educ_level)

#College or more(1=college degree or higher and 0=Associates degree or lower)
sub3$collegeormore<- Recode(sub3$educd, recodes = "84:116=1; 2:83=0")


#USBORN
sub3$usborn <- Recode(sub3$bpl, recodes = "1:120=1; 121:900=0; else=NA")

#EMPLOYMENT
sub3$employedetailed <- Recode(sub3$empstatd, recodes = "10:12=1; 20:22=0; else=NA")

sub3$employed<-Recode(sub3$empstat, recodes = "1=1; else=NA")
sub3$unemployed<-Recode(sub3$empstat, recodes = "2=1; else=NA")
sub3$not_in_laborforce<-Recode(sub3$empstat, recodes = "3=1; else=NA")


#INCOME ADJUSTED FOR INFLATION
#sub$edu_scal_inc <- ave(sub$incwage, sub$male, FUN = scale)
sub3$inc_adj <- ifelse(sub3$year == 2005, sub3$incwage*1.23,
                            ifelse(sub3$year == 2006, sub3$incwage*1.18,
                                   ifelse(sub3$year == 2007, sub3$incwage*1.16,
                                          ifelse(sub3$year %in% c(2008,2009), sub3$incwage*1.1, 
                                                 ifelse(sub3$year == 2010, sub3$incwage*1.1,
                                                        ifelse(sub3$year == 2011, sub3$incwage*1.07,
                                                               ifelse(sub3$year == 2012,sub3$incwage*1.05,
                                                                      ifelse(sub3$year == 2013, sub3$incwage*1.03, 
                                                                             ifelse(sub3$year == 2014, sub3$incwage*1.01,
                                                                                    ifelse(sub3$year == 2015, sub3$incwage*1.01, sub3$incwage))))))))))


#TRAVEL TIME TO WORK (does not include zero time commute)
sub3$traveltime<-ifelse(sub3$trantime==000, NA, sub3$trantime)

#IN HS/COLLEGE OR NOT IN HS/COLLEGE
#sub3$inhs<- Recode(sub3$gradeatt, recodes = "5=1; else=0")
#sub3$incollege <- Recode(sub3$gradeatt, recodes = "6:7=1; else=0")

#YEARS IN THE US (0 indicates less than 1 year or NA - I eliminated all NAs)
sub3$yrsusa2<-Recode(sub3$yrsusa1, recodes="0=NA")

Household level variables

#HOUSEHOLD TYPE (1=married family, 2=one male or female living alone, 3=male or female living with someone)
sub3$housetype<-Recode(sub3$hhtype, recodes = "1=1; c(2,3,4,6)=2; c(5,7)=3; 9=NA" )

#METRO AREA (1=not in metro area, 2=metro central city, 3=suburban, 4=central/principal city unknown)
sub3$metroarea<-Recode(sub3$metro,recodes="1=1; 2=2; 3=3; else='NA'")
sub3$urban<-Recode(sub3$metroarea, recodes="2:3=1; 1=0")
sub3$rural<-Recode(sub3$metroarea, recodes="1=1; 2:3=0")

Higher level (Macro) variables

library(tidycensus)
usacs<-get_acs(geography = "county", year = 2015,
                variables=c( "DP05_0001E", "DP03_0009P", "DP03_0062E", "DP03_0119PE",
                             "DP05_0001E","DP02_0009PE","DP02_0008PE", "DP02_0040E","DP02_0038E",
                             "DP02_0066PE","DP02_0067PE","DP02_0080PE","DP02_0092PE",
                             "DP03_0005PE","DP03_0028PE","DP03_0062E","DP03_0099PE","DP03_0101PE",
                             "DP03_0119PE","DP04_0046PE","DP05_0072PE","DP05_0073PE",
                             "DP05_0066PE","DP05_0033","DP05_0032", "DP05_0072PE", "DP02_0113PE",  "DP04_0003PE") ,
                summary_var = "B01001_001",
                geometry = F, output = "wide")

## Getting data from the 2011-2015 5-year ACS

## Using the ACS Data Profile
## Using the ACS Data Profile

usacs2<-mutate(usacs,totpop= DP05_0001E, fertrate = DP02_0040E,pwhite=DP05_0072PE/100, nwhite=DP05_0032E,nblack=DP05_0033E, pblack=DP05_0073PE/100 , phisp=DP05_0066PE/100, pfemhh=DP02_0008PE/100, phsormore=DP02_0066PE/100,punemp=DP03_0009PE/100, medhhinc=DP03_0062E,ppov=DP03_0119PE/100, pforn=DP02_0092PE/100,plep=DP02_0113PE/100,  pvacant=DP04_0003PE/100, GEOID=GEOID)


myv <- c("GEOID","NAME", "totpop", "pwhite", "pblack", "phisp", "pfemhh", "phsormore", "punemp", "medhhinc", "ppov", "pforn","plep")
acsvars <- usacs2[myv]
acsvars <-filter(acsvars,GEOID %in% 48001:48507)
acsvars$GEOID<- substring(acsvars$GEOID,3)
acsvars$GEOID<-as.numeric(acsvars$GEOID)
#head(acsvars)

Merge files by county FIPS codes and filter for complete cases

#I fix the error:: `x` and `labels` must be same type with the following:
sub3[] <- lapply(sub3, unclass)
joindata<-merge(acsvars, sub3, by.x="GEOID", by.y="countyfips", all.x=T)
#names(joindata)
jdvars<-c("GEOID","NAME","metro","city","puma","hhincome","incwage","educd","empstatd","race", "hispan", "usborn","raced","sex2","age","birthyr","year","gq","hhtype","yrsusa2","totpop", "pwhite", "pblack", "phisp", "pfemhh", "phsormore", "punemp", "medhhinc", "ppov","male", "inc_adj", "pforn","plep","educ_level","race_eth","collegeormore")

joindata2<-joindata[,jdvars]
joindata2<-joindata2%>%
filter(complete.cases(.))
#rm(joindata); gc()

Scaling Variables

joindata2$yearsintheUSz<-scale(joindata2$yrsusa2, center = T, scale=T)
joindata2$agez<-scale(joindata2$age, center = T, scale=T)
joindata2$inc_adjz<-scale(joindata2$inc_adj, center= T, scale=T)

Simple Regression

The simple regression shows that all of the covariates with the exeption of being born in the US have a significant relationship to income.

fit.lm<- glm(inc_adjz~male+agez+usborn+race_eth+educ_level+yearsintheUSz, data=joindata2)
summary(fit.lm)

## 
## Call:
## glm(formula = inc_adjz ~ male + agez + usborn + race_eth + educ_level + 
##     yearsintheUSz, data = joindata2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9286  -0.4095  -0.0586   0.2511  10.6956  
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.751235   0.046251 -16.242  < 2e-16 ***
## male           0.559351   0.024735  22.614  < 2e-16 ***
## agez           0.092449   0.012610   7.331 2.65e-13 ***
## usborn        -0.921420   0.862671  -1.068 0.285528    
## race_eth      -0.087959   0.019733  -4.457 8.48e-06 ***
## educ_level     0.239952   0.007251  33.092  < 2e-16 ***
## yearsintheUSz  0.046061   0.013425   3.431 0.000606 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.7429637)
## 
##     Null deviance: 4870.0  on 4870  degrees of freedom
## Residual deviance: 3613.8  on 4864  degrees of freedom
## AIC: 12385
## 
## Number of Fisher Scoring iterations: 2

confint(fit.lm)

## Waiting for profiling to be done...

##                     2.5 %     97.5 %
## (Intercept)   -0.84188619 -0.6605847
## male           0.51087195  0.6078298
## agez           0.06773355  0.1171636
## usborn        -2.61222452  0.7693852
## race_eth      -0.12663538 -0.0492835
## educ_level     0.22574006  0.2541635
## yearsintheUSz  0.01974911  0.0723728

#removing files from global environment
rm("joindata")
rm("sub3")

Preparing for Bayesian Models

Installing Packages to Run BRM Function, Part 1

#To find the number of cores on my system
library(parallel)
detectCores()

## [1] 4

options(mc.cores = parallel::detectCores())


# # Set path of Rtools
# Sys.setenv(PATH = paste(Sys.getenv("PATH"), "*InstallDirectory*/Rtools/bin/",
#                        "*InstallDirectory*/Rtools/mingw_64/bin", sep = ";")) #for 64 bit version
# Sys.setenv(BINPREF = "*InstallDirectory*/Rtools/mingw_64/bin")
# library(devtools)

# Now you can install transformr then gganimate#
#devtools::install_github("thomasp85/transformr")
#devtools::install_github("dgrtwo/gganimate")

Installing Packages to Run BRM Function, Part 2

# #install.packages("StanHeaders", repos = "http://cran.us.r-project.org")
# library("StanHeaders")
# usethis::edit_r_environ()
# #devtools::install_github("stan-dev/rstan", ref = "develop", subdir = "rstan/rstan")
# 
# dotR <- file.path(Sys.getenv("HOME"), ".R")
# if (!file.exists(dotR)) 
#   dir.create(dotR)
# M <- file.path(dotR, "Makevars.win")
# if (!file.exists(M)) 
#   file.create(M)
# cat("\nCXX14FLAGS=-O3 -Wno-unused-variable -Wno-unused-function",
#     "CXX14 = $(BINPREF)g++ -m$(WIN) -std=c++1y",
#     "CXX11FLAGS=-O3 -Wno-unused-variable -Wno-unused-function",
#     file = M, sep = "\n", append = TRUE)
#

BAYESIAN MODEL

The first bayesian model, shows an RHAT=1. The results of this model are similar to those obtained by the simple regression model. Being a male, longer years living in the US and older age groups provide an advantage of higher incomes. However, race seems to be negatively related to income. Finally, education seems to have a positive relationship with income however, not as large as the one presented by males vs females.

Plots show a unimodal distribution resembling the “fuzzy caterpillar” expected.

#install.packages("brm", repos = "http://cran.us.r-project.org")
#install.packages("brms", repos = "http://cran.us.r-project.org")
#install.packages("rstan", repos = "http://cran.us.r-project.org")
#install.packages("rstanarm",  repos = "http://cran.us.r-project.org")

#library(rstanarm)
#library(rstan)
#library(brm)
library(brms)

#devtools::install_github('stan-dev/rstan', ref = 'develop', subdir = 'rstan/rstan')
options(mc.cores = parallel::detectCores())


fit.1<-brm(inc_adjz~male+race_eth+educ_level+usborn, data=joindata2, family=gaussian(),
              warmup=1000, #burnin for 500 interations for each chain = 1000 burnin
              iter=5000, chains=4, #2*5000 = 10000 - 1000 burnin = 9000 total iterations
              cores=4,seed = 1100, #number of computer cores, 1 per chain is good.
            refresh = 0)

## Compiling the C++ model

## Start sampling

summary(fit.lm)

## 
## Call:
## glm(formula = inc_adjz ~ male + agez + usborn + race_eth + educ_level + 
##     yearsintheUSz, data = joindata2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9286  -0.4095  -0.0586   0.2511  10.6956  
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.751235   0.046251 -16.242  < 2e-16 ***
## male           0.559351   0.024735  22.614  < 2e-16 ***
## agez           0.092449   0.012610   7.331 2.65e-13 ***
## usborn        -0.921420   0.862671  -1.068 0.285528    
## race_eth      -0.087959   0.019733  -4.457 8.48e-06 ***
## educ_level     0.239952   0.007251  33.092  < 2e-16 ***
## yearsintheUSz  0.046061   0.013425   3.431 0.000606 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.7429637)
## 
##     Null deviance: 4870.0  on 4870  degrees of freedom
## Residual deviance: 3613.8  on 4864  degrees of freedom
## AIC: 12385
## 
## Number of Fisher Scoring iterations: 2

print(summary(fit.1), digits=3)

##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: inc_adjz ~ male + race_eth + educ_level + usborn 
##    Data: joindata2 (Number of observations: 4871) 
## Samples: 4 chains, each with iter = 5000; warmup = 1000; thin = 1;
##          total post-warmup samples = 16000
## 
## Population-Level Effects: 
##            Estimate Est.Error l-95% CI u-95% CI Eff.Sample  Rhat
## Intercept    -0.709     0.045   -0.798   -0.620      23394 1.000
## male          0.563     0.025    0.514    0.611      20583 1.000
## race_eth     -0.098     0.020   -0.137   -0.059      20625 1.000
## educ_level    0.232     0.007    0.218    0.246      20782 1.000
## usborn       -0.819     0.856   -2.506    0.859      20047 1.000
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Eff.Sample  Rhat
## sigma    0.869     0.009    0.852    0.887      20771 1.000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).

plot(fit.1, N=2)

Alternative Priors

Here, we change the default priors for the model parameters to be Gaussian for the \(\beta\)’s and Inverse Gamma for the residual variances

prior1<-c(set_prior("normal(0,1)", class="b"),#prior for the beta's
         set_prior("normal(0,1)", class="Intercept"), #prior for the intercept
         set_prior("inv_gamma(.5,.5)", class="sigma"))#prior for the residual std. deviation

fit.1b<-brm(inc_adjz~male+race_eth+educ_level+usborn, data=joindata2, family=gaussian(),
              warmup=1000, #burnin for 500 interations for each chain = 1000 burnin
              iter=5000, chains=4, #2*5000 = 10000 - 1000 burnin = 9000 total iterations
              cores=4,seed = 1100, #number of computer cores, 1 per chain is good.
              save_model="fit_lm_gamma_stan.txt", refresh = 0
            )

## Compiling the C++ model

## recompiling to avoid crashing R session

## Start sampling

Alternative Lilkelihoods

library(brms)
prior<-c(set_prior("normal(0,1)", class="b"),#prior for the beta's
         set_prior("cauchy(0,5)", class="sigma"),#prior for the residual std. deviation
         set_prior("gamma(1,1.5)", class="nu")) #prior for the t distribution d.f.

fit.lm.b.t<-brm(inc_adjz~male+race_eth+educ_level+usborn, data=joindata2, family=student(),
              prior = prior, 
              warmup=1000, 
              iter=5000, chains=4, 
              cores=4,seed = 1100,
              save_model="fit_lm_b_t_stan.txt", refresh=0)

## Compiling the C++ model

## Start sampling

print(summary(fit.lm.b.t), digits=3)

##  Family: student 
##   Links: mu = identity; sigma = identity; nu = identity 
## Formula: inc_adjz ~ male + race_eth + educ_level + usborn 
##    Data: joindata2 (Number of observations: 4871) 
## Samples: 4 chains, each with iter = 5000; warmup = 1000; thin = 1;
##          total post-warmup samples = 16000
## 
## Population-Level Effects: 
##            Estimate Est.Error l-95% CI u-95% CI Eff.Sample  Rhat
## Intercept    -0.605     0.024   -0.652   -0.557      16520 1.000
## male          0.494     0.012    0.471    0.517      13774 1.000
## race_eth     -0.033     0.011   -0.056   -0.011      15361 1.000
## educ_level    0.061     0.005    0.051    0.072       9675 1.000
## usborn       -0.225     0.420   -1.058    0.721      10863 1.000
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Eff.Sample  Rhat
## sigma    0.293     0.008    0.276    0.309       7971 1.000
## nu       1.521     0.061    1.405    1.642       8184 1.000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).

plot(fit.lm.b.t, N=2)

#Custom Summary Function A comparison between modified and default priors show that they are very similar confirming the robusticity of the analysis.

out1<-as.data.frame(fit.1)
out2<-as.data.frame(fit.1b)

mysum<-function(x){
  mu=mean(x,na.rm=T)
  sdev=sd(x,na.rm=T)
  lci=quantile(x, p=.025,na.rm=T)
  uci=quantile(x, p=.975,na.rm=T)
list(mu=mu, sd=sdev, lci=lci, uci=uci)
  }

sapply(out1, mysum )

##     b_Intercept b_male    b_race_eth  b_educ_level b_usborn   sigma      
## mu  -0.7091118  0.5625697 -0.09809845 0.2322387    -0.8185917 0.8688939  
## sd  0.0453085   0.0246808 0.01979048  0.007106638  0.8557206  0.008854295
## lci -0.7978184  0.5139353 -0.1366965  0.2182395    -2.506197  0.8516538  
## uci -0.6200543  0.6112396 -0.05942645 0.2460771    0.8590223  0.8865665  
##     lp__     
## mu  -6232.665
## sd  1.757167 
## lci -6237.09 
## uci -6230.286

sapply(out2, mysum)

##     b_Intercept b_male    b_race_eth  b_educ_level b_usborn   sigma      
## mu  -0.7091118  0.5625697 -0.09809845 0.2322387    -0.8185917 0.8688939  
## sd  0.0453085   0.0246808 0.01979048  0.007106638  0.8557206  0.008854295
## lci -0.7978184  0.5139353 -0.1366965  0.2182395    -2.506197  0.8516538  
## uci -0.6200543  0.6112396 -0.05942645 0.2460771    0.8590223  0.8865665  
##     lp__     
## mu  -6232.665
## sd  1.757167 
## lci -6237.09 
## uci -6230.286

Summary of Results Using 95% and 90% Credible Interval for Each Parameter

A comparison of each of the three models after modifying the posterior shows similar patterns. The USborn variable includes zero on all 3 models while the remaining variables, particularly the male variable (0.5-6.0) shows siginificant findings.

posterior_interval(fit.1, prob=0.9, type="central")

##                         5%           95%
## b_Intercept     -0.7835461 -6.343480e-01
## b_male           0.5219836  6.033911e-01
## b_race_eth      -0.1309635 -6.561054e-02
## b_educ_level     0.2205722  2.439621e-01
## b_usborn        -2.2219833  5.890871e-01
## sigma            0.8543759  8.836138e-01
## lp__         -6236.0218471 -6.230491e+03

posterior_interval(fit.1b, prob=0.9, type="central")

##                         5%           95%
## b_Intercept     -0.7835461 -6.343480e-01
## b_male           0.5219836  6.033911e-01
## b_race_eth      -0.1309635 -6.561054e-02
## b_educ_level     0.2205722  2.439621e-01
## b_usborn        -2.2219833  5.890871e-01
## sigma            0.8543759  8.836138e-01
## lp__         -6236.0218471 -6.230491e+03

posterior_interval(fit.lm.b.t, prob=0.9, type="central")

##                         5%           95%
## b_Intercept  -6.447860e-01 -5.643874e-01
## b_male        4.751248e-01  5.131386e-01
## b_race_eth   -5.200887e-02 -1.500036e-02
## b_educ_level  5.213940e-02  6.978204e-02
## b_usborn     -8.700491e-01  4.753107e-01
## sigma         2.790624e-01  3.066498e-01
## nu            1.423450e+00  1.623173e+00
## lp__         -4.452393e+03 -4.446122e+03

Densities for parameter in each model

If the models are invariant to the prior, the two lines should be very close to each other which in this case, they are.

ggplot()+geom_density(data=out1, aes(b_educ_level), col="blue", alpha=.5, lwd=1.5)+geom_density(data=out2, aes(b_educ_level), col="green", alpha=.5, lwd=1.5)

ggplot()+geom_density(data=out1, aes(sigma), col="blue", alpha=.5, lwd=1.5)+geom_density(data=out2, aes(sigma), col="green", alpha=.5, lwd=1.5)

#Calculation of Bayesian P-Values

Below is the function to calculate Bayesian p-values based on if the posterior mean of the parameter is positive or negative.

bayespval<-function(x){
  if(mean(x)<0) {
    pval=mean(x<0)
  }
  else{
    pval=mean(x>0)
  }
list(pval=pval)
}

sapply(out1, bayespval)

## $b_Intercept.pval
## [1] 1
## 
## $b_male.pval
## [1] 1
## 
## $b_race_eth.pval
## [1] 1
## 
## $b_educ_level.pval
## [1] 1
## 
## $b_usborn.pval
## [1] 0.830875
## 
## $sigma.pval
## [1] 1
## 
## $lp__.pval
## [1] 1

Comparison of Bayesian Models

library("brms")
WAIC1<-WAIC(fit.1)
WAIC2<-WAIC(fit.lm.b.t)
compare_ic(WAIC1, WAIC2, ic=c("waic"))

##                        WAIC     SE
## fit.1              12484.27 482.84
## fit.lm.b.t          8885.98 190.87
## fit.1 - fit.lm.b.t  3598.29 436.39

Which certainly looks like the Normal model is preferred here, because of the lower WAIC score. We can also evaluate the model fit to the sensitivity of the prior from the normal model:

WAIC(fit.1, fit.1b, compare = T)

##                    WAIC     SE
## fit.1          12484.27 482.84
## fit.1b         12484.27 482.84
## fit.1 - fit.1b     0.00   0.00

The WAIC shows that there is no difference between the two models.

#install.packages("loo")
#install.packages("blmeco")
library ("loo")

## This is loo version 2.0.0.
## **NOTE: As of version 2.0.0 loo defaults to 1 core but we recommend using as many as possible. Use the 'cores' argument or set options(mc.cores = NUM_CORES) for an entire session. Visit mc-stan.org/loo/news for details on other changes.

## 
## Attaching package: 'loo'

## The following object is masked from 'package:rstan':
## 
##     loo

library ("blmeco")

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## 
## Attaching package: 'blmeco'

## The following object is masked from 'package:brms':
## 
##     WAIC

Cross-Validation - LOO Method

This technique is used for judging relative model fit as the predictive capacity of the model. The LOO method uses the posterior predictive distribution for the model. The posterior predictive distribution is the posterior distribution of new data, given the parameters we have obtained from our model.

Results of the LOOIC, shows that the model with the alternative likelihoods is a better fit.

library("loo")
loo(fit.1 , fit.lm.b.t)

## Warning: Found 1 observations with a pareto_k > 0.7 in model 'fit.1'. It
## is recommended to set 'reloo = TRUE' in order to calculate the ELPD without
## the assumption that these observations are negligible. This will refit
## the model 1 times to compute the ELPDs for the problematic observations
## directly.

##                       LOOIC     SE
## fit.1              12485.26 482.83
## fit.lm.b.t          8886.12 190.87
## fit.1 - fit.lm.b.t  3599.14 436.37

Assigment #4, DEM 7473 Hierarchical Models

Paulina Cano

October 8, 2018