In the needle_sharing dataset, I kept the outcome of # of times a drug user shared a syringe in the past month (‘shared_syr’) and changed the predictors to sex, age, and depression diagnosis. Depression (dprsn_dx) was recoded with 2 levels, 1 = “No” for no depression diagnosis, 5 = “Yes” for confirmed depression diagnosis. Sex (sex)was recoded with 2 levels, M = “Male” and F = “Female”. Age was included as a predictor and not recoded. This recoded data was saved as ‘needle_cleaned’ to differentiate between the cleaned and original datasets. A table was created of these predictors and their percentage of the dataset. From here, summary statistics and diagnostic plots were completed for the predictors and syringe usage. Although it was mentioned that the poisson distribution was not a good model for this data, I would like to get a recommendation of what to use instead.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.5 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
set.seed(1)
N <- 10000
simdat <- data.frame(race = sample(c("white", "non-white"), N, replace = TRUE)) %>%
mutate(race = factor(race, levels = c("white", "non-white"))) %>%
mutate(y = rpois(N, lambda = ifelse(race == "white", 3.5, 3.0)))
fit <- glm(y ~ race, data = simdat, family = poisson(link = "log"))
summary(fit)
##
## Call:
## glm(formula = y ~ race, family = poisson(link = "log"), data = simdat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6483 -0.8760 0.0092 0.5588 3.5410
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.254710 0.007564 165.88 <2e-16 ***
## racenon-white -0.161428 0.011137 -14.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 11041 on 9999 degrees of freedom
## Residual deviance: 10831 on 9998 degrees of freedom
## AIC: 39419
##
## Number of Fisher Scoring iterations: 5
The critical threshold for rejection at p=0.05 is:
qchisq(0.95, df=1)
## [1] 3.841459
So we reject \(H_0\)
BEWARE OF MISSING DATA: THIS IS SAFER
fit0 <- glm(y ~ -1, data = simdat, family = poisson(link = "log"))
anova(fit0, fit, test = "LRT")
shared_syr)library(readr)
needle_sharing <- read_csv("needle_sharing.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## id = col_double(),
## sex = col_character(),
## ethn = col_character(),
## age = col_double(),
## dprsn_dx = col_double(),
## sexabuse = col_double(),
## shared_syr = col_double(),
## hivstat = col_double(),
## hplsns = col_double(),
## nivdu = col_double(),
## shsyryn = col_double(),
## sqrtnivd = col_double(),
## logshsyr = col_double(),
## polydrug = col_double(),
## sqrtninj = col_double(),
## homeless = col_double(),
## shsyr = col_double()
## )
View(needle_sharing)
summary(needle_sharing)
## id sex ethn age
## Min. :2001 Length:128 Length:128 Min. :19.00
## 1st Qu.:2034 Class :character Class :character 1st Qu.:35.00
## Median :2066 Mode :character Mode :character Median :41.00
## Mean :2065 Mean :40.73
## 3rd Qu.:2097 3rd Qu.:47.25
## Max. :2129 Max. :54.00
##
## dprsn_dx sexabuse shared_syr hivstat
## Min. :1.000 Min. :0.0000 Min. : 0.000 Min. :0.000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.: 0.000 1st Qu.:0.000
## Median :1.000 Median :0.0000 Median : 0.000 Median :0.000
## Mean :2.206 Mean :0.1371 Mean : 2.976 Mean :0.114
## 3rd Qu.:5.000 3rd Qu.:0.0000 3rd Qu.: 0.000 3rd Qu.:0.000
## Max. :5.000 Max. :1.0000 Max. :60.000 Max. :2.000
## NA's :2 NA's :4 NA's :5 NA's :14
## hplsns nivdu shsyryn sqrtnivd
## Min. : 0.000 Min. : 0.0 Min. :0.00 Min. : 0.000
## 1st Qu.: 2.000 1st Qu.: 42.5 1st Qu.:0.00 1st Qu.: 6.516
## Median : 5.000 Median : 90.0 Median :0.00 Median : 9.487
## Mean : 6.609 Mean :101.9 Mean :0.25 Mean : 8.982
## 3rd Qu.:10.000 3rd Qu.:120.0 3rd Qu.:0.25 3rd Qu.:10.954
## Max. :20.000 Max. :900.0 Max. :1.00 Max. :30.000
## NA's :1 NA's :1
## logshsyr polydrug sqrtninj homeless
## Min. :0.0000 Min. :0.0000 Min. : 0.000 Min. :0.0000
## 1st Qu.:0.6932 1st Qu.:0.0000 1st Qu.: 6.516 1st Qu.:0.0000
## Median :1.6094 Median :0.0000 Median : 9.487 Median :0.0000
## Mean :1.7765 Mean :0.1484 Mean : 8.982 Mean :0.4919
## 3rd Qu.:2.3026 3rd Qu.:0.0000 3rd Qu.:10.954 3rd Qu.:1.0000
## Max. :4.0943 Max. :1.0000 Max. :30.000 Max. :1.0000
## NA's :101 NA's :1 NA's :4
## shsyr
## Min. : 1.00
## 1st Qu.: 2.00
## Median : 5.00
## Mean :13.56
## 3rd Qu.:10.00
## Max. :60.00
## NA's :101
Some recoding:
suppressPackageStartupMessages(library(dplyr))
needle_cleaned <-
mutate(needle_sharing,
depression = factor(dprsn_dx, levels = c("1", "5"), labels = c("No", "Yes")),
sex = factor(sex, levels = c("M", "F"), labels = c("Male", "Female")),
age = factor(age)
) %>%
select(all_of(c("shared_syr", "depression", "sex", "age")))
view(needle_cleaned)
library(tableone)
t2 <- CreateTableOne(data=needle_cleaned, includeNA = TRUE)
print(t2, missing=TRUE, showAllLevels = TRUE)
##
## level Overall Missing
## n 128
## shared_syr (mean (SD)) 2.98 (10.32) 3.9
## depression (%) No 88 (68.8) 1.6
## Yes 38 (29.7)
## <NA> 2 ( 1.6)
## sex (%) Male 97 (75.8) 0.8
## Female 30 (23.4)
## <NA> 1 ( 0.8)
## age (%) 19 1 ( 0.8) 0.0
## 21 1 ( 0.8)
## 22 3 ( 2.3)
## 23 1 ( 0.8)
## 24 1 ( 0.8)
## 25 2 ( 1.6)
## 26 2 ( 1.6)
## 27 1 ( 0.8)
## 28 1 ( 0.8)
## 29 4 ( 3.1)
## 30 2 ( 1.6)
## 31 2 ( 1.6)
## 32 3 ( 2.3)
## 33 4 ( 3.1)
## 34 1 ( 0.8)
## 35 4 ( 3.1)
## 36 7 ( 5.5)
## 37 4 ( 3.1)
## 38 5 ( 3.9)
## 39 6 ( 4.7)
## 40 3 ( 2.3)
## 41 10 ( 7.8)
## 42 2 ( 1.6)
## 43 4 ( 3.1)
## 44 5 ( 3.9)
## 45 5 ( 3.9)
## 46 3 ( 2.3)
## 47 9 ( 7.0)
## 48 2 ( 1.6)
## 49 8 ( 6.2)
## 51 4 ( 3.1)
## 52 5 ( 3.9)
## 53 6 ( 4.7)
## 54 7 ( 5.5)
Create a histogram number of syringe uses
Create a scatter plot of number of syringe uses versus rank of number of syringe uses
library(dplyr)
mutate(needle_sharing, rnk = rank(shared_syr, ties.method = "first")) %>%
ggplot(aes(x = rnk, y = shared_syr)) +
geom_point() +
labs(title = "Count vs Rank Count of Syringe Sharing Incidents") +
xlab("rank of count") +
ylab("count of syringe sharing")
## Warning: Removed 5 rows containing missing values (geom_point).
fit.pois <- glm(shared_syr ~ depression + sex + age,
data = needle_cleaned,
family = poisson(link = "log"))
summary(fit.pois)
##
## Call:
## glm(formula = shared_syr ~ depression + sex + age, family = poisson(link = "log"),
## data = needle_cleaned)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.7171 -1.8524 -0.0003 -0.0002 12.4247
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.82694 0.60019 1.378 0.168266
## depressionYes -0.08759 0.15420 -0.568 0.570014
## sexFemale 0.35926 0.13952 2.575 0.010027 *
## age21 0.96482 0.72587 1.329 0.183788
## age22 1.50182 0.61037 2.461 0.013874 *
## age23 -19.12952 5717.53217 -0.003 0.997330
## age24 -19.12952 5717.53217 -0.003 0.997330
## age25 -19.12952 4042.90579 -0.005 0.996225
## age26 -0.78410 0.92236 -0.850 0.395265
## age27 -19.12952 5717.53217 -0.003 0.997330
## age28 -19.12952 5717.53217 -0.003 0.997330
## age29 -0.34247 0.69943 -0.490 0.624391
## age30 -19.14867 3796.26623 -0.005 0.995975
## age31 0.45127 0.67943 0.664 0.506573
## age32 -0.63825 0.77016 -0.829 0.407261
## age33 1.96684 0.60816 3.234 0.001220 **
## age34 -19.04194 5717.53217 -0.003 0.997343
## age35 -19.12952 2858.76613 -0.007 0.994661
## age36 1.13725 0.61027 1.864 0.062391 .
## age37 -18.93911 3044.07921 -0.006 0.995036
## age38 -18.80010 2226.29808 -0.008 0.993262
## age39 -0.02519 0.63994 -0.039 0.968600
## age40 2.30959 0.60954 3.789 0.000151 ***
## age41 -18.42726 1238.38004 -0.015 0.988128
## age42 -19.12952 4042.90579 -0.005 0.996225
## age43 -18.44754 1859.40148 -0.010 0.992084
## age44 -0.40158 0.68561 -0.586 0.558054
## age45 -0.79282 0.74284 -1.067 0.285846
## age46 -19.04194 4042.90579 -0.005 0.996242
## age47 0.29651 0.61170 0.485 0.627869
## age48 -19.12952 4042.90579 -0.005 0.996225
## age49 1.09485 0.60667 1.805 0.071122 .
## age51 -18.01157 1583.11477 -0.011 0.990922
## age52 -2.40226 1.16268 -2.066 0.038815 *
## age53 -18.38038 1699.89512 -0.011 0.991373
## age54 -0.99519 0.74563 -1.335 0.181978
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1611.81 on 119 degrees of freedom
## Residual deviance: 887.56 on 84 degrees of freedom
## (8 observations deleted due to missingness)
## AIC: 1054.4
##
## Number of Fisher Scoring iterations: 16
par(mfrow = c(2, 2))
plot(fit.pois)
## Warning: not plotting observations with leverage one:
## 10, 16, 43, 46, 54, 67, 107
## Warning: not plotting observations with leverage one:
## 10, 16, 43, 46, 54, 67, 107
qqnorm(resid(fit.pois))
qqline(resid(fit.pois))