All the variables in this dataset are from the dataset:Add Health.

For this assignment, I will focus on suicide as my binary variable. I examine the relationship between grade level, Hispanic and non-Hispanic and Health care exam.

I predict that the higher level of health care exams will be associated with lower levels of suicide thoughts, particularly for Hispanics.

library(haven)
d1<-read_dta("C:/Users/tim1203/Desktop/utsa/C. Sparks/Dem 7283/Data sets/wholeaddhealthstata.dta") 
w1<-read_dta("C:/Users/tim1203/Desktop/utsa/C. Sparks/Dem 7283/Data sets/21600-0004-Data.dta")

d2<-merge(d1, w1, by="AID")
library(car)
## Loading required package: carData
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.6.2
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr)
## Warning: package 'questionr' was built under R version 3.6.2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

##Recoding variables #Suicide thoughts

d2$suicide<-Recode(d2$H1SU1, recodes="0=0; 1=1; else=NA")
table(d2$suicide)
## 
##    0    1 
## 5614  821
table(d2$H1SU1)
## 
##    0    1    6    8    9 
## 5614  821   46   22    1
is.numeric(d2$suicide)
## [1] TRUE

#Received Health Services in Last Year

d2$hlthexam<-Recode(d2$H1HS1, recodes="0=0; 1=1; else=NA")
table(d2$hlthexam)
## 
##    0    1 
## 2137 4340
table(d2$H1HS1)
## 
##    0    1    6    8 
## 2137 4340    5   22
is.numeric(d2$hlthexam)
## [1] TRUE

#Received Counseling in Last Year

d2$counsel<-Recode(d2$H1HS3, recodes="0=0; 1=1; else=NA")
table(d2$counsel)
## 
##    0    1 
## 5672  813
table(d2$H1HS3)
## 
##    0    1    6    8 
## 5672  813    7   12
is.numeric(d2$counsel)
## [1] TRUE

#Current Grade

d2$grade<-Recode(d2$H1GI20, recodes="7=7; 8=8; 9=9; 10=10; 11=11; 12=12; else=NA")
table(d2$grade)
## 
##    7    8    9   10   11   12 
##  979  992 1107 1144 1122  993
table(d2$H1GI20)
## 
##    7    8    9   10   11   12   96   97   98   99 
##  979  992 1107 1144 1122  993    1  128    3   35
is.numeric(d2$grade)
## [1] TRUE

#Recode Hispanic

table(d2$H1GI4)
## 
##    0    1    6    8 
## 5738  743    3   20
d2$Hispanic<-Recode(d2$H1GI4, recodes="0=0; 1=1; else=NA")
table(d2$Hispanic)
## 
##    0    1 
## 5738  743
is.numeric(d2$Hispanic)
## [1] TRUE
is.character(d2$Hispanic)
## [1] FALSE
class(d2$Hispanic)
## [1] "haven_labelled"
d2 <- d2 %>% 
  mutate(
    grade = as.factor(grade),
    Hispanic = as.factor(Hispanic),
    hlthexam = as.factor(hlthexam)
  )
options(survey.lonely.psu = "PSUSCID")
des<-svydesign(ids=~CLUSTER2, weights=~GSWGT1, data=d2)

##Analysis The following two anlysis: logit/probit analysis confirms the observed patterns with Hispanics being less likely to having suicide thoughts but it is not significant. A significant variable, having a health exam in the last year is associated with being less likely to have suicide thoughts. However, for all grade levels, there is positive assoication between individuals in grades 7th through 12th and suicide thoughts. Grades 9th through 11th are significant where 11th graders have the highest significance.

fit.logit<-svyglm(suicide~Hispanic+grade+hlthexam, design=des, family=binomial, data=d2)
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
summary(fit.logit)
## 
## Call:
## svyglm(formula = suicide ~ Hispanic + grade + hlthexam, design = des, 
##     family = binomial, data = d2)
## 
## Survey design:
## svydesign(ids = ~CLUSTER2, weights = ~GSWGT1, data = d2)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.05651    0.12538 -16.402  < 2e-16 ***
## Hispanic1   -0.04712    0.12921  -0.365 0.715993    
## grade8       0.17359    0.19853   0.874 0.383595    
## grade9       0.46587    0.16021   2.908 0.004312 ** 
## grade10      0.35551    0.16246   2.188 0.030524 *  
## grade11      0.53968    0.15256   3.538 0.000569 ***
## grade12      0.23278    0.16124   1.444 0.151361    
## hlthexam1   -0.24478    0.08383  -2.920 0.004160 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9982151)
## 
## Number of Fisher Scoring iterations: 4
knitr::kable(data.frame(OR=exp(coef(fit.logit)), ci=exp(confint(fit.logit))))
OR ci.2.5.. ci.97.5..
(Intercept) 0.1278996 0.1000338 0.1635277
Hispanic1 0.9539764 0.7405497 1.2289126
grade8 1.1895716 0.8061230 1.7554152
grade9 1.5934032 1.1640074 2.1812007
grade10 1.4269056 1.0377872 1.9619241
grade11 1.7154589 1.2721076 2.3133256
grade12 1.2620975 0.9201193 1.7311779
hlthexam1 0.7828734 0.6642523 0.9226776
fit.probit<-svyglm(suicide~Hispanic+grade+hlthexam, design=des, data=d2, family=binomial(link = "probit"))
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
summary(fit.probit)
## 
## Call:
## svyglm(formula = suicide ~ Hispanic + grade + hlthexam, design = des, 
##     family = binomial(link = "probit"), data = d2)
## 
## Survey design:
## svydesign(ids = ~CLUSTER2, weights = ~GSWGT1, data = d2)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.20458    0.06495 -18.546  < 2e-16 ***
## Hispanic1   -0.02402    0.06943  -0.346 0.729932    
## grade8       0.08875    0.10333   0.859 0.392030    
## grade9       0.24443    0.08447   2.894 0.004498 ** 
## grade10      0.18468    0.08537   2.163 0.032438 *  
## grade11      0.28516    0.08072   3.533 0.000579 ***
## grade12      0.11801    0.08396   1.406 0.162369    
## hlthexam1   -0.12990    0.04557  -2.850 0.005115 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9981152)
## 
## Number of Fisher Scoring iterations: 4
myexp<-function(x){exp(x)}
stargazer(fit.logit, fit.probit,
type="html",
style="demography",
covariate.labels=c("Hispanic", "Non-Hispanic", "7th grade", "8th grade", "9th grade", "10th grade", "11th grade", "12th grade", "Health exam=no", "Health exam=yes"), ci=T, column.sep.width="10pt")
suicide
survey-weighted survey-weighted
logistic probit
Model 1 Model 2
Hispanic -0.047 -0.024
(-0.300, 0.206) (-0.160, 0.112)
Non-Hispanic 0.174 0.089
(-0.216, 0.563) (-0.114, 0.291)
7th grade 0.466** 0.244**
(0.152, 0.780) (0.079, 0.410)
8th grade 0.356* 0.185*
(0.037, 0.674) (0.017, 0.352)
9th grade 0.540*** 0.285***
(0.241, 0.839) (0.127, 0.443)
10th grade 0.233 0.118
(-0.083, 0.549) (-0.047, 0.283)
11th grade -0.245** -0.130**
(-0.409, -0.080) (-0.219, -0.041)
12th grade -2.057*** -1.205***
(-2.302, -1.811) (-1.332, -1.077)
N 6,253 6,253
Log Likelihood -2,352.387 -2,352.527
AIC 4,720.774 4,721.055
p < .05; p < .01; p < .001

#Logit Marginal effects A comparison between the marginal effects from the logit and probit models appears to be very similar.

log.marg<-coef(fit.logit)*mean(dlogis(predict(fit.logit)), na.rm=T)

#Probit Marginal effects

prob.marg<-coef(fit.probit)*mean(dnorm(predict(fit.probit)), na.rm=T)
effects<-data.frame(effect=c(log.marg, prob.marg), term=rep(names(log.marg),2), model=c(rep("logit",  length(log.marg)), rep("probit", length(log.marg))))
effects%>%
  ggplot(aes(x=term, y=effect))+geom_point(aes(color=model, group=model, shape=model), position = position_jitterdodge(jitter.width = 1), size=2)+ylab("Marginal Effect")+xlab("Model Term")+geom_abline(intercept=0, slope = 0)+theme(axis.text.x = element_text(angle = 45, hjust = 1))+ggtitle(label = "Comparison of marginal effects in Logit and Probitmodels")

##Fitted values In calculating probabilities, we find no difference betwen grade levels, being Hispanic, or having a health exam.

dat<-expand.grid(grade=levels(d2$grade), Hispanic=levels(d2$Hispanic), hlthexam=levels(d2$hlthexam))
#str(d3)
summary(dat)
##  grade  Hispanic hlthexam
##  7 :4   0:12     0:12    
##  8 :4   1:12     1:12    
##  9 :4                    
##  10:4                    
##  11:4                    
##  12:4
fit<-predict(fit.logit, newdata=dat, type="response")
fit2<-predict(fit.probit, newdata=dat, type="response")
dat$fitted.prob.lrm<-round(fit, 3)
dat$fitted.prob.pro<-round(fit2, 3)

knitr::kable(head(dat, n=20))
grade Hispanic hlthexam fitted.prob.lrm fitted.prob.pro
7 0 0 0.113 0.114
8 0 0 0.132 0.132
9 0 0 0.169 0.168
10 0 0 0.154 0.154
11 0 0 0.180 0.179
12 0 0 0.139 0.139
7 1 0 0.109 0.110
8 1 0 0.127 0.127
9 1 0 0.163 0.163
10 1 0 0.148 0.148
11 1 0 0.173 0.173
12 1 0 0.133 0.133
7 0 1 0.091 0.091
8 0 1 0.106 0.106
9 0 1 0.138 0.138
10 0 1 0.125 0.125
11 0 1 0.147 0.147
12 0 1 0.112 0.112
7 1 1 0.087 0.087
8 1 1 0.102 0.102