All the variables in this dataset are from the dataset:Add Health.
For this assignment, I will focus on suicide as my binary variable. I examine the relationship between grade level, Hispanic and non-Hispanic and Health care exam.
I predict that the higher level of health care exams will be associated with lower levels of suicide thoughts, particularly for Hispanics.
library(haven)
d1<-read_dta("C:/Users/tim1203/Desktop/utsa/C. Sparks/Dem 7283/Data sets/wholeaddhealthstata.dta")
w1<-read_dta("C:/Users/tim1203/Desktop/utsa/C. Sparks/Dem 7283/Data sets/21600-0004-Data.dta")
d2<-merge(d1, w1, by="AID")
library(car)
## Loading required package: carData
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.6.2
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr)
## Warning: package 'questionr' was built under R version 3.6.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
##Recoding variables #Suicide thoughts
d2$suicide<-Recode(d2$H1SU1, recodes="0=0; 1=1; else=NA")
table(d2$suicide)
##
## 0 1
## 5614 821
table(d2$H1SU1)
##
## 0 1 6 8 9
## 5614 821 46 22 1
is.numeric(d2$suicide)
## [1] TRUE
#Received Health Services in Last Year
d2$hlthexam<-Recode(d2$H1HS1, recodes="0=0; 1=1; else=NA")
table(d2$hlthexam)
##
## 0 1
## 2137 4340
table(d2$H1HS1)
##
## 0 1 6 8
## 2137 4340 5 22
is.numeric(d2$hlthexam)
## [1] TRUE
#Received Counseling in Last Year
d2$counsel<-Recode(d2$H1HS3, recodes="0=0; 1=1; else=NA")
table(d2$counsel)
##
## 0 1
## 5672 813
table(d2$H1HS3)
##
## 0 1 6 8
## 5672 813 7 12
is.numeric(d2$counsel)
## [1] TRUE
#Current Grade
d2$grade<-Recode(d2$H1GI20, recodes="7=7; 8=8; 9=9; 10=10; 11=11; 12=12; else=NA")
table(d2$grade)
##
## 7 8 9 10 11 12
## 979 992 1107 1144 1122 993
table(d2$H1GI20)
##
## 7 8 9 10 11 12 96 97 98 99
## 979 992 1107 1144 1122 993 1 128 3 35
is.numeric(d2$grade)
## [1] TRUE
#Recode Hispanic
table(d2$H1GI4)
##
## 0 1 6 8
## 5738 743 3 20
d2$Hispanic<-Recode(d2$H1GI4, recodes="0=0; 1=1; else=NA")
table(d2$Hispanic)
##
## 0 1
## 5738 743
is.numeric(d2$Hispanic)
## [1] TRUE
is.character(d2$Hispanic)
## [1] FALSE
class(d2$Hispanic)
## [1] "haven_labelled"
d2 <- d2 %>%
mutate(
grade = as.factor(grade),
Hispanic = as.factor(Hispanic),
hlthexam = as.factor(hlthexam)
)
options(survey.lonely.psu = "PSUSCID")
des<-svydesign(ids=~CLUSTER2, weights=~GSWGT1, data=d2)
##Analysis The following two anlysis: logit/probit analysis confirms the observed patterns with Hispanics being less likely to having suicide thoughts but it is not significant. A significant variable, having a health exam in the last year is associated with being less likely to have suicide thoughts. However, for all grade levels, there is positive assoication between individuals in grades 7th through 12th and suicide thoughts. Grades 9th through 11th are significant where 11th graders have the highest significance.
fit.logit<-svyglm(suicide~Hispanic+grade+hlthexam, design=des, family=binomial, data=d2)
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
summary(fit.logit)
##
## Call:
## svyglm(formula = suicide ~ Hispanic + grade + hlthexam, design = des,
## family = binomial, data = d2)
##
## Survey design:
## svydesign(ids = ~CLUSTER2, weights = ~GSWGT1, data = d2)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.05651 0.12538 -16.402 < 2e-16 ***
## Hispanic1 -0.04712 0.12921 -0.365 0.715993
## grade8 0.17359 0.19853 0.874 0.383595
## grade9 0.46587 0.16021 2.908 0.004312 **
## grade10 0.35551 0.16246 2.188 0.030524 *
## grade11 0.53968 0.15256 3.538 0.000569 ***
## grade12 0.23278 0.16124 1.444 0.151361
## hlthexam1 -0.24478 0.08383 -2.920 0.004160 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9982151)
##
## Number of Fisher Scoring iterations: 4
knitr::kable(data.frame(OR=exp(coef(fit.logit)), ci=exp(confint(fit.logit))))
| OR | ci.2.5.. | ci.97.5.. | |
|---|---|---|---|
| (Intercept) | 0.1278996 | 0.1000338 | 0.1635277 |
| Hispanic1 | 0.9539764 | 0.7405497 | 1.2289126 |
| grade8 | 1.1895716 | 0.8061230 | 1.7554152 |
| grade9 | 1.5934032 | 1.1640074 | 2.1812007 |
| grade10 | 1.4269056 | 1.0377872 | 1.9619241 |
| grade11 | 1.7154589 | 1.2721076 | 2.3133256 |
| grade12 | 1.2620975 | 0.9201193 | 1.7311779 |
| hlthexam1 | 0.7828734 | 0.6642523 | 0.9226776 |
fit.probit<-svyglm(suicide~Hispanic+grade+hlthexam, design=des, data=d2, family=binomial(link = "probit"))
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
summary(fit.probit)
##
## Call:
## svyglm(formula = suicide ~ Hispanic + grade + hlthexam, design = des,
## family = binomial(link = "probit"), data = d2)
##
## Survey design:
## svydesign(ids = ~CLUSTER2, weights = ~GSWGT1, data = d2)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.20458 0.06495 -18.546 < 2e-16 ***
## Hispanic1 -0.02402 0.06943 -0.346 0.729932
## grade8 0.08875 0.10333 0.859 0.392030
## grade9 0.24443 0.08447 2.894 0.004498 **
## grade10 0.18468 0.08537 2.163 0.032438 *
## grade11 0.28516 0.08072 3.533 0.000579 ***
## grade12 0.11801 0.08396 1.406 0.162369
## hlthexam1 -0.12990 0.04557 -2.850 0.005115 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9981152)
##
## Number of Fisher Scoring iterations: 4
myexp<-function(x){exp(x)}
stargazer(fit.logit, fit.probit,
type="html",
style="demography",
covariate.labels=c("Hispanic", "Non-Hispanic", "7th grade", "8th grade", "9th grade", "10th grade", "11th grade", "12th grade", "Health exam=no", "Health exam=yes"), ci=T, column.sep.width="10pt")
| suicide | ||
| survey-weighted | survey-weighted | |
| logistic | probit | |
| Model 1 | Model 2 | |
| Hispanic | -0.047 | -0.024 |
| (-0.300, 0.206) | (-0.160, 0.112) | |
| Non-Hispanic | 0.174 | 0.089 |
| (-0.216, 0.563) | (-0.114, 0.291) | |
| 7th grade | 0.466** | 0.244** |
| (0.152, 0.780) | (0.079, 0.410) | |
| 8th grade | 0.356* | 0.185* |
| (0.037, 0.674) | (0.017, 0.352) | |
| 9th grade | 0.540*** | 0.285*** |
| (0.241, 0.839) | (0.127, 0.443) | |
| 10th grade | 0.233 | 0.118 |
| (-0.083, 0.549) | (-0.047, 0.283) | |
| 11th grade | -0.245** | -0.130** |
| (-0.409, -0.080) | (-0.219, -0.041) | |
| 12th grade | -2.057*** | -1.205*** |
| (-2.302, -1.811) | (-1.332, -1.077) | |
| N | 6,253 | 6,253 |
| Log Likelihood | -2,352.387 | -2,352.527 |
| AIC | 4,720.774 | 4,721.055 |
| p < .05; p < .01; p < .001 | ||
#Logit Marginal effects A comparison between the marginal effects from the logit and probit models appears to be very similar.
log.marg<-coef(fit.logit)*mean(dlogis(predict(fit.logit)), na.rm=T)
#Probit Marginal effects
prob.marg<-coef(fit.probit)*mean(dnorm(predict(fit.probit)), na.rm=T)
effects<-data.frame(effect=c(log.marg, prob.marg), term=rep(names(log.marg),2), model=c(rep("logit", length(log.marg)), rep("probit", length(log.marg))))
effects%>%
ggplot(aes(x=term, y=effect))+geom_point(aes(color=model, group=model, shape=model), position = position_jitterdodge(jitter.width = 1), size=2)+ylab("Marginal Effect")+xlab("Model Term")+geom_abline(intercept=0, slope = 0)+theme(axis.text.x = element_text(angle = 45, hjust = 1))+ggtitle(label = "Comparison of marginal effects in Logit and Probitmodels")
##Fitted values In calculating probabilities, we find no difference betwen grade levels, being Hispanic, or having a health exam.
dat<-expand.grid(grade=levels(d2$grade), Hispanic=levels(d2$Hispanic), hlthexam=levels(d2$hlthexam))
#str(d3)
summary(dat)
## grade Hispanic hlthexam
## 7 :4 0:12 0:12
## 8 :4 1:12 1:12
## 9 :4
## 10:4
## 11:4
## 12:4
fit<-predict(fit.logit, newdata=dat, type="response")
fit2<-predict(fit.probit, newdata=dat, type="response")
dat$fitted.prob.lrm<-round(fit, 3)
dat$fitted.prob.pro<-round(fit2, 3)
knitr::kable(head(dat, n=20))
| grade | Hispanic | hlthexam | fitted.prob.lrm | fitted.prob.pro |
|---|---|---|---|---|
| 7 | 0 | 0 | 0.113 | 0.114 |
| 8 | 0 | 0 | 0.132 | 0.132 |
| 9 | 0 | 0 | 0.169 | 0.168 |
| 10 | 0 | 0 | 0.154 | 0.154 |
| 11 | 0 | 0 | 0.180 | 0.179 |
| 12 | 0 | 0 | 0.139 | 0.139 |
| 7 | 1 | 0 | 0.109 | 0.110 |
| 8 | 1 | 0 | 0.127 | 0.127 |
| 9 | 1 | 0 | 0.163 | 0.163 |
| 10 | 1 | 0 | 0.148 | 0.148 |
| 11 | 1 | 0 | 0.173 | 0.173 |
| 12 | 1 | 0 | 0.133 | 0.133 |
| 7 | 0 | 1 | 0.091 | 0.091 |
| 8 | 0 | 1 | 0.106 | 0.106 |
| 9 | 0 | 1 | 0.138 | 0.138 |
| 10 | 0 | 1 | 0.125 | 0.125 |
| 11 | 0 | 1 | 0.147 | 0.147 |
| 12 | 0 | 1 | 0.112 | 0.112 |
| 7 | 1 | 1 | 0.087 | 0.087 |
| 8 | 1 | 1 | 0.102 | 0.102 |