library(readr)
library(expss)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:expss':
##
## vars
Dissertation_Dataset <- read_csv("Dissertation_Dataset.csv",
col_types = cols(
  FED_STATE = col_factor(levels = c("Indian Affairs", "Federal", "Local", "State", "Private", "Regional")),
  HEALTH_CJ = col_factor(levels = c("Law Enforcement", "Health", "Generalist", "Other")),
  SENATE = col_factor(levels = c("0", "1", "2"))
))
The interest groups identified can be categorized using two categorical variables: HEALTH_CJ (health vs. criminal justice) and FED_STATE (the level of the actor).
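A quick cross-tabulation of the two classifications (a sketch added for orientation, not part of the original analysis) shows how they overlap:
with(Dissertation_Dataset, table(HEALTH_CJ, FED_STATE, useNA = "ifany"))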
ggplot(data = Dissertation_Dataset, aes(x = HEALTH_CJ, y = Problem_Demand, fill = FED_STATE)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Number of Words Describing the Problem as Demand",
    subtitle = "Comparing Actor Types' Problem Definitions",
    caption = "Measured in word counts",
    x = "Health vs. Criminal Justice",
    y = "Word Count",
    fill = "Level"
  )
Local actors appear most likely to define the problem as a demand problem. [I tried dividing Problem_Demand by TOTAL WORDS to account for the varying lengths of testimony, but …]
Dissertation_Dataset$P_D_T_W = Dissertation_Dataset$Problem_Demand/Dissertation_Dataset$`TOTAL WORDS`
p <- ggplot(data = Dissertation_Dataset, aes(x = FED_STATE, y = P_D_T_W)) +
  geom_bar(stat = "identity", fill = "blue")
p + scale_x_discrete(limits = c("Regional", "Indian Affairs", "Federal", "State", "Local"))
## Warning: Removed 39 rows containing missing values (position_stack).
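The 39 removed rows are most likely the Private cases, which the x-axis limits above leave out, plus any rows where the proportion is missing; a quick check (sketch):
sum(is.na(Dissertation_Dataset$P_D_T_W))   # rows with a missing proportion
table(Dissertation_Dataset$FED_STATE)      # cases per level, including Private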
Comparing groups on the other dimension, health vs. criminal justice, health groups appear most likely to define the problem as a demand issue after controlling for the length of the testimony.
p <- ggplot(data = Dissertation_Dataset, aes(x = HEALTH_CJ, y = Problem_Demand / `TOTAL WORDS`)) +
  geom_bar(stat = "identity", fill = "green")
p + scale_x_discrete(limits = c("Generalist", "Other", "Law Enforcement", "Health"))
library(sandwich)
library(msm)
mpd <- glm(Dissertation_Dataset$Problem_Demand~Dissertation_Dataset$HEALTH_CJ+offset(log(Dissertation_Dataset$`TOTAL WORDS`)), family=poisson)
summary(mpd)
##
## Call:
## glm(formula = Dissertation_Dataset$Problem_Demand ~ Dissertation_Dataset$HEALTH_CJ +
## offset(log(Dissertation_Dataset$`TOTAL WORDS`)), family = poisson)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -29.758 -15.253 -9.911 2.102 74.405
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -3.42681 0.01648 -207.97
## Dissertation_Dataset$HEALTH_CJHealth 0.77137 0.01853 41.63
## Dissertation_Dataset$HEALTH_CJGeneralist 1.17659 0.03829 30.73
## Dissertation_Dataset$HEALTH_CJOther 1.46261 0.02217 65.96
## Pr(>|z|)
## (Intercept) <2e-16 ***
## Dissertation_Dataset$HEALTH_CJHealth <2e-16 ***
## Dissertation_Dataset$HEALTH_CJGeneralist <2e-16 ***
## Dissertation_Dataset$HEALTH_CJOther <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 56745 on 143 degrees of freedom
## Residual deviance: 52121 on 140 degrees of freedom
## AIC: 52683
##
## Number of Fisher Scoring iterations: 7
#robust standard errors
cov.mpd <- vcovHC(mpd, type="HC0")
std.err <- sqrt(diag(cov.mpd))
r.est <- cbind(Estimate= coef(mpd), "Robust SE" = std.err,
"Pr(>|z|)" = 2 * pnorm(abs(coef(mpd)/std.err), lower.tail=FALSE),
LL = coef(mpd) - 1.96 * std.err,
UL = coef(mpd) + 1.96 * std.err)
#chisquare
with(mpd, cbind(res.deviance = deviance, df = df.residual,
p = pchisq(deviance, df.residual, lower.tail=FALSE)))
## res.deviance df p
## [1,] 52120.72 140 0
#IRR
spd <- deltamethod(list(~ exp(x1), ~ exp(x2), ~ exp(x3), ~ exp(x4)),
coef(mpd), cov.mpd)
## exponentiate old estimates dropping the p values
rexp.est <- exp(r.est[, -3])
## replace SEs with estimates for exponentiated coefficients
rexp.est[, "Robust SE"] <- spd
rexp.est
## Estimate Robust SE LL
## (Intercept) 0.03249027 0.01564277 0.01264521
## Dissertation_Dataset$HEALTH_CJHealth 2.16272468 1.11489046 0.78740190
## Dissertation_Dataset$HEALTH_CJGeneralist 3.24330297 2.40451341 0.75842710
## Dissertation_Dataset$HEALTH_CJOther 4.31721321 2.95658663 1.12786888
## UL
## (Intercept) 0.08347964
## Dissertation_Dataset$HEALTH_CJHealth 5.94026762
## Dissertation_Dataset$HEALTH_CJGeneralist 13.86951254
## Dissertation_Dataset$HEALTH_CJOther 16.52526299
The model is correctly specified if the deviance residuals are approximately normally distributed with a median of 0 (https://stats.idre.ucla.edu/r/dae/poisson-regression/). In this case, the residuals appear skewed. The reference group for HEALTH_CJ is Law Enforcement. [Do we need to make any adjustments because the residuals are skewed?]
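A histogram of the deviance residuals makes the skew easy to see (a quick sketch using the fitted mpd object; dev_res is just a convenience name):
dev_res <- residuals(mpd, type = "deviance")
hist(dev_res, breaks = 30,
     main = "Deviance residuals, Poisson model (HEALTH_CJ)",
     xlab = "Deviance residual")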
All coefficients are significant, indicating a statistically significant difference between health and criminal justice groups in how they define the problem as a demand problem, even after accounting for differences in testimony length. The results show that health groups defined the problem as demand at a rate 2.16 times that of law enforcement groups. Generalists defined the problem as demand at a rate 3.24 times that of law enforcement groups. Lastly, “other” groups, which consisted of parents of overdose victims as well as ______, defined the problem as demand at a rate 4.32 times that of law enforcement groups; these groups were the most likely to define the problem as a demand problem. [Given that the unit is a word, what does this rate tell us precisely?]
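Because log(`TOTAL WORDS`) enters the model as an offset, the rate being modeled is demand-framing words per word of testimony (roughly, the share of a witness's testimony devoted to demand framing), and each IRR is a ratio of those rates. The raw per-witness proportions by group give a feel for the scale (a quick sketch):
with(Dissertation_Dataset, tapply(Problem_Demand / `TOTAL WORDS`, HEALTH_CJ, mean, na.rm = TRUE))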
However, the goodness-of-fit chi-squared test is statistically significant, indicating that the data do not fit the model well (residual deviance = 52,121, p ≈ 0). [NOTE: It is odd that p = 0 here and in the later analyses… did I do something wrong? Also, is this truly necessary for our findings, given that we are not looking to predict?]
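The p-value prints as 0 because it is smaller than the smallest value a double can represent, not because of an error; asking pchisq for the log of the p-value shows it is simply astronomically small (a quick sketch):
pchisq(deviance(mpd), df.residual(mpd), lower.tail = FALSE, log.p = TRUE)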
with(Dissertation_Dataset, tapply(Problem_Demand, HEALTH_CJ, function(x) {
sprintf("M (SD) = %1.2f (%1.2f)", mean(x), sd(x))
}))
## Law Enforcement Health
## "M (SD) = 85.65 (263.77)" "M (SD) = 178.68 (286.84)"
## Generalist Other
## "M (SD) = 119.57 (236.58)" "M (SD) = 283.88 (598.02)"
Since the variance at each level of HEALTH_CJ is higher than the mean, the data suggest overdispersion. Therefore, a negative binomial model may better suit the data.
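A model-based check points the same way (a quick sketch): the Pearson dispersion statistic for the Poisson fit should be near 1 if the Poisson variance assumption holds, and values far above 1 indicate overdispersion.
sum(residuals(mpd, type = "pearson")^2) / df.residual(mpd)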
[Not sure if I should use the log of the offset for the negative binomial model as well…]
library(MASS)
summary(m1 <- glm.nb(Dissertation_Dataset$Problem_Demand ~ Dissertation_Dataset$HEALTH_CJ + offset(log(Dissertation_Dataset$`TOTAL WORDS`))))
##
## Call:
## glm.nb(formula = Dissertation_Dataset$Problem_Demand ~ Dissertation_Dataset$HEALTH_CJ +
## offset(log(Dissertation_Dataset$`TOTAL WORDS`)), init.theta = 0.1462694202,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.54285 -1.40181 -0.46166 0.01546 1.78527
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -3.1592 0.3991 -7.916
## Dissertation_Dataset$HEALTH_CJHealth 0.6252 0.4970 1.258
## Dissertation_Dataset$HEALTH_CJGeneralist 0.5395 1.0667 0.506
## Dissertation_Dataset$HEALTH_CJOther 1.0761 0.7661 1.405
## Pr(>|z|)
## (Intercept) 2.45e-15 ***
## Dissertation_Dataset$HEALTH_CJHealth 0.208
## Dissertation_Dataset$HEALTH_CJGeneralist 0.613
## Dissertation_Dataset$HEALTH_CJOther 0.160
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.1463) family taken to be 1)
##
## Null deviance: 146.92 on 143 degrees of freedom
## Residual deviance: 144.48 on 140 degrees of freedom
## AIC: 1336.3
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.1463
## Std. Err.: 0.0184
##
## 2 x log-likelihood: -1326.3280
Since the median of the deviance residuals (-0.46) is closer to 0 than in the Poisson model, the negative binomial model appears to suit these data better. Unfortunately, the coefficients are no longer significant… :-(
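A side-by-side information-criterion comparison makes the same point about fit (a quick sketch):
AIC(mpd, m1)   # the negative binomial AIC is far lower than the Poisson AIC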
Negative binomial models do not assume that the conditional variance equals the conditional mean; they allow for overdispersion. To check whether that extra flexibility is needed, it is suggested we use a likelihood ratio test comparing the Poisson model above with the negative binomial model (https://stats.idre.ucla.edu/r/dae/negative-binomial-regression/). [I'm not sure how to interpret these results.]
pchisq(2 * (logLik(m1) - logLik(mpd)), df = 1, lower.tail = FALSE)
## 'log Lik.' 0 (df=5)
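The test statistic is twice the difference in log-likelihoods between the negative binomial and Poisson fits, compared against a chi-square with one degree of freedom for the extra dispersion parameter. A p-value this small (it prints as 0 for the same underflow reason noted above) indicates the dispersion parameter significantly improves the fit, again favoring the negative binomial. The estimated dispersion parameter itself tells the same story (a quick sketch; the negative binomial approaches the Poisson only as theta grows large):
c(theta = m1$theta, SE.theta = m1$SE.theta)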
Dissertation_Dataset$FED_STATE <- relevel(Dissertation_Dataset$FED_STATE, ref="Federal") #set federal as reference level
mpd2 <- glm(Problem_Demand~FED_STATE+offset(log(`TOTAL WORDS`)), family=poisson, data = Dissertation_Dataset)
summary(mpd2)
##
## Call:
## glm(formula = Problem_Demand ~ FED_STATE + offset(log(`TOTAL WORDS`)),
## family = poisson, data = Dissertation_Dataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -32.747 -14.310 -7.991 1.736 71.595
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.83116 0.01765 -217.061 <2e-16 ***
## FED_STATEIndian Affairs 1.14512 0.04062 28.189 <2e-16 ***
## FED_STATELocal 1.68687 0.02236 75.456 <2e-16 ***
## FED_STATEState 1.01045 0.02416 41.818 <2e-16 ***
## FED_STATEPrivate 1.71105 0.02027 84.405 <2e-16 ***
## FED_STATERegional -13.18342 63.51605 -0.208 0.836
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 56745 on 143 degrees of freedom
## Residual deviance: 46005 on 138 degrees of freedom
## AIC: 46571
##
## Number of Fisher Scoring iterations: 7
#robust standard errors
cov.mpd2 <- vcovHC(mpd2, type="HC0")
std.err2 <- sqrt(diag(cov.mpd2))
r.est2 <- cbind(Estimate= coef(mpd2), "Robust SE" = std.err2,
"Pr(>|z|)" = 2 * pnorm(abs(coef(mpd2)/std.err2), lower.tail=FALSE),
LL = coef(mpd2) - 1.96 * std.err2,
UL = coef(mpd2) + 1.96 * std.err2)
#chisquare
with(mpd2, cbind(res.deviance = deviance, df = df.residual,
p = pchisq(deviance, df.residual, lower.tail=FALSE)))
## res.deviance df p
## [1,] 46004.52 138 0
#IRR
spd2 <- deltamethod(list(~ exp(x1), ~ exp(x2), ~ exp(x3), ~ exp(x4), ~ exp(x5), ~ exp(x6)),
coef(mpd2), cov.mpd2)
## exponentiate old estimates dropping the p values
rexp.est2 <- exp(r.est2[, -3])
## replace SEs with estimates for exponentiated coefficients
rexp.est2[, "Robust SE"] <- spd2
rexp.est2
## Estimate Robust SE LL
## (Intercept) 2.168450e-02 6.245426e-03 8.439607e-03
## FED_STATEIndian Affairs 3.142831e+00 1.777929e+00 1.144238e+00
## FED_STATELocal 5.402545e+00 2.505874e+00 1.263353e+00
## FED_STATEState 2.746846e+00 1.231417e+00 7.176115e-01
## FED_STATEPrivate 5.534743e+00 2.213419e+00 2.154122e+00
## FED_STATERegional 1.881532e-06 1.958016e-06 6.850258e-07
## UL
## (Intercept) 5.571557e-02
## FED_STATEIndian Affairs 8.632285e+00
## FED_STATELocal 2.310320e+01
## FED_STATEState 1.051427e+01
## FED_STATEPrivate 1.422082e+01
## FED_STATERegional 5.167928e-06
Again, the deviance residuals are skewed(?). The reference group is set to Federal. The goodness-of-fit chi-square test again gives p = 0(?).
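The same quick residual check as before, now for the FED_STATE model (sketch):
hist(residuals(mpd2, type = "deviance"), breaks = 30,
     main = "Deviance residuals, Poisson model (FED_STATE)",
     xlab = "Deviance residual")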
All coefficients are significant except for Regional, which isn't surprising since it has only one case. Noteworthy: private actors defined the problem as demand at a rate 5.53 times that of the federal government. Both state and local actors were also more likely to define the problem as a demand problem, local groups at a rate 5.40 times greater and state groups 2.75 times greater.
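As with HEALTH_CJ, these rates are demand-framing words per word of testimony; the raw per-witness proportions by level (a quick sketch):
with(Dissertation_Dataset, tapply(Problem_Demand / `TOTAL WORDS`, FED_STATE, mean, na.rm = TRUE))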
with(Dissertation_Dataset, tapply(Problem_Demand, FED_STATE, function(x) {
sprintf("M (SD) = %1.2f (%1.2f)", mean(x), sd(x))
}))
## Federal Indian Affairs
## "M (SD) = 68.30 (130.40)" "M (SD) = 186.75 (187.82)"
## Local State
## "M (SD) = 241.45 (452.89)" "M (SD) = 118.45 (223.26)"
## Private Regional
## "M (SD) = 257.90 (452.83)" "M (SD) = 0.00 (NA)"
Since the variance at each level of FED_STATE is higher than the mean, the data again suggest overdispersion. Therefore, a negative binomial model may better suit the data here as well.
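The same Pearson dispersion check as above, for the FED_STATE Poisson model (sketch):
sum(residuals(mpd2, type = "pearson")^2) / df.residual(mpd2)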
summary(m2 <- glm.nb(Problem_Demand ~ FED_STATE + offset(log(`TOTAL WORDS`)), data = Dissertation_Dataset))
Here again the median of the deviance residuals is closer to 0 than in the Poisson model, but the coefficients are not significant…