library(haven)
library(SASxport)
## Warning: package 'SASxport' was built under R version 4.0.5
library(car)
## Loading required package: carData
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(ggplot2)
library(pander)
library(knitr)
library(tidycensus)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#Use the BRFSS for this assignment, UNLESS your data set has a higher level variable like school, county, metro area, etc. You can contact me if you are uncertain.Use a continuous outcome Conduct a multi-level analysis where you:

#avedrnk2 is my continuous outcome, or average alcoholic drinks consumed in the last 30 days.

brfss_17<-read_xpt("C:/Users/josha/Desktop/Fall 2020 Stats 1 Dem/WorkingDirectoryFall2020StatsDem1/MMSA2017.XPT")
nams<-names(brfss_17)
newnames<-gsub(pattern = "_",
               replacement =  "",
               x =  nams)
names(brfss_17)<-tolower(newnames)
#sex
brfss_17$male<-ifelse(brfss_17$sex==1, 1, 0)

#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3, 
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")

#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='employloyed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='employloyed')

#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst,
                        ref='married')
#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')
head(data.frame(name=table(brfss_17$mmsaname), 
                id=unique(brfss_17$mmsa)))
##                                                          name.Var1 name.Freq
## 1                      Aberdeen, SD, Micropolitan Statistical Area       512
## 2       Albany-Schenectady-Troy, NY, Metropolitan Statistical Area       632
## 3                   Albuquerque, NM, Metropolitan Statistical Area      1654
## 4 Allentown-Bethlehem-Easton, PA-NJ, Metropolitan Statistical Area       866
## 5                     Anchorage, AK, Metropolitan Statistical Area      1029
## 6 Atlanta-Sandy Springs-Roswell, GA, Metropolitan Statistical Area      2944
##      id
## 1 10100
## 2 10580
## 3 10740
## 4 10900
## 5 11260
## 6 12060

#Estimate the random intercept model with just the higher level units as your predictor (NULL model)report the variance components and ICC

merged<-brfss_17%>%
  dplyr::select(avedrnk2, marst, mmsa, educ, race_eth, menthlth, male, employ, mmsawt)%>%
  filter(complete.cases(.))
fit.an<-lm(avedrnk2~as.factor(mmsa),
           merged)
anova(fit.an)
## Analysis of Variance Table
## 
## Response: avedrnk2
##                     Df   Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(mmsa)    135    79895  591.81  5.7771 < 2.2e-16 ***
## Residuals       116016 11884884  102.44                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(lme4)
## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car
library(lmerTest)
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
library(arm)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## arm (Version 1.11-2, built: 2020-7-27)
## Working directory is C:/Users/josha/Desktop/Fall 2020 Stats 1 Dem/WorkingDirectoryFall2020StatsDem1
## 
## Attaching package: 'arm'
## The following object is masked from 'package:car':
## 
##     logit
fit<-lmer(avedrnk2~(1|mmsa),
           data=merged)
arm::display(fit, detail=T)
## lmer(formula = avedrnk2 ~ (1 | mmsa), data = merged)
##             coef.est coef.se t value
## (Intercept)  3.52     0.08   44.47  
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  mmsa     (Intercept)  0.81   
##  Residual             10.12   
## ---
## number of obs: 116152, groups: mmsa, 136
## AIC = 867552, DIC = 867539.2
## deviance = 867542.5
(.81^2)/(10.12^2+.81^2)
## [1] 0.006365546

Preleminary modeling results indicate there is a significant variation in average alcohol consumed within the last 30 days between cities. Thus a multileveling model can now be used to estimate the differences.With variance components at level 2 being .81 with an individual item of 10.12. Thus both cities and individuals differ in their average alcohol consumption in a month. The icc of which appears to be .006 meaning most likely no two people randomly selected will be similar to each other based on their average consumption of alcohol.

#Now fit the multilevel model for your outcome and include both the higher level term and appropriate individual - level predictors.#Report the variance components of the model and describe the results of the model Report the results for the fixed effects

fit2<-lmer(avedrnk2~male+race_eth+employ+marst+menthlth+educ+(1|mmsa),
           data=merged)
arm::display(fit2, detail=T)
## lmer(formula = avedrnk2 ~ male + race_eth + employ + marst + 
##     menthlth + educ + (1 | mmsa), data = merged)
##                      coef.est coef.se t value
## (Intercept)            3.40     0.12   29.12 
## male                   1.00     0.06   16.44 
## race_ethhispanic       1.77     0.12   14.52 
## race_ethnh black       0.92     0.12    8.00 
## race_ethnh multirace   0.56     0.23    2.45 
## race_ethnh other       0.89     0.17    5.35 
## employnilf             0.23     0.10    2.29 
## employretired         -0.33     0.07   -4.44 
## employunable           0.98     0.17    5.67 
## marstcohab             0.63     0.15    4.10 
## marstdivorced          0.56     0.09    6.04 
## marstnm                1.21     0.08   14.45 
## marstseparated         0.55     0.23    2.41 
## marstwidowed           0.40     0.11    3.45 
## menthlth               0.00     0.00    2.35 
## educ0Prim              2.22     0.31    7.08 
## educ1somehs            2.34     0.20   11.63 
## educ3somecol          -1.05     0.09  -11.75 
## educ4colgrad          -1.79     0.08  -22.00 
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  mmsa     (Intercept)  0.62   
##  Residual             10.02   
## ---
## number of obs: 116152, groups: mmsa, 136
## AIC = 865150, DIC = 864995
## deviance = 865051.2
(.62^2)/(10.02^2+.62^2)
## [1] 0.003814067

The second model indicates both city level items, with individual level predicots. After controlling for other variables, the level one residuals and individuals both have dropped to .62 and 10.02 respectively. Meaning there are differences between cities, and explaining individual level alcohol consumption on average by controling for different characteristics of the population. Additionally, in this model the icc is now .003 half as low as the previous model. In this model at least, two randomly selected people from two different cities have an even lower chance of being similar in their average alcohol consumption, after for controlling all other variables. In terms of inter group differences in alcohol consumption in days, males have higher on average than females. All races have higher amounts of consumption than whites, with Hispanics on average totalling 1.77 days more. Those not actively in the work force and unable to work consume more alcohol on average within the last thrity days than employed individuals, with unable to work almost consuming alcohol one more days on average. Married individuals consume less alchol on average than all other types of partnered individuals, not married individuals drink about 1.22 more days on average than married persons. Although days of negative mental health is associated with more drinking, weirdly the average increase is .00. Drinking more days on average is significantly associated only with educations totalling less than highschool. Educational differences also accounted for the largest on average increases in drinking with just primary and just some with high school drinking on average 2.22 to 2.34 days than people who completed alcohol.

##Compare the models from 1 and 2 using a likelihood ratio test (anova(fit1, fit2, test=”Chisq”)) .

anova(fit, fit2)
## refitting model(s) with ML (instead of REML)
## Data: merged
## Models:
## fit: avedrnk2 ~ (1 | mmsa)
## fit2: avedrnk2 ~ male + race_eth + employ + marst + menthlth + educ + 
## fit2:     (1 | mmsa)
##      npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## fit     3 867548 867577 -433771   867542                         
## fit2   21 865093 865296 -432526   865051 2491.2 18  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
rand(fit2)
## ANOVA-like table for random-effects: Single term deletions
## 
## Model:
## avedrnk2 ~ male + race_eth + employ + marst + menthlth + educ + (1 | mmsa)
##            npar  logLik    AIC    LRT Df Pr(>Chisq)    
## <none>       21 -432554 865149                         
## (1 | mmsa)   20 -432661 865361 213.86  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Does the inclusion of the individual level variables increase the overall model fit? An anova was run to compare the two models, with fit1 just being city level indicators, and fit2 being the more complicated model. The aic of fit is lower by about 2000, with the chisq coefficient being pretty high at 2491.2 meaning there is a small chance this at random. The second more comprehensive model fits the better than the previous one. Additionally to test this, a random effect test was run, indicating that not all cities will have the same on average consumption of alcohol during the month.