## year cocyr heryr oxyyr
## Length:949285 Min. :0.00000 Min. :0.00000 Min. :-9.000
## Class :character 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:-9.000
## Mode :character Median :0.00000 Median :0.00000 Median : 0.000
## Mean :0.02787 Mean :0.00335 Mean :-3.178
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.: 0.000
## Max. :1.00000 Max. :1.00000 Max. : 1.000
##
## mthyr CATAG7 irsex income
## Min. :-9.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 0.000 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:2.000
## Median : 0.000 Median :5.000 Median :2.000 Median :2.000
## Mean :-2.144 Mean :4.657 Mean :1.521 Mean :2.494
## 3rd Qu.: 0.000 3rd Qu.:7.000 3rd Qu.:2.000 3rd Qu.:4.000
## Max. : 1.000 Max. :7.000 Max. :2.000 Max. :4.000
##
## EDUCCAT2 HEALTH2 cocever herever
## Min. :-9.0000 Min. :-9.0000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 1.0000 1st Qu.: 1.0000 1st Qu.: 2.000 1st Qu.: 2.000
## Median : 3.0000 Median : 2.0000 Median : 2.000 Median : 2.000
## Mean : 0.3869 Mean : 0.8601 Mean : 1.921 Mean : 2.039
## 3rd Qu.: 4.0000 3rd Qu.: 3.0000 3rd Qu.: 2.000 3rd Qu.: 2.000
## Max. : 5.0000 Max. : 4.0000 Max. :97.000 Max. :97.000
## NA's :172
## methamevr ircocfy irherfy iroxyfy irmthfy
## Min. :-9.000 Min. : 1.0 Min. : 1 Min. : -9.0 Min. : -9.0
## 1st Qu.:-9.000 1st Qu.:991.0 1st Qu.:991 1st Qu.: -9.0 1st Qu.:991.0
## Median :-9.000 Median :991.0 Median :991 Median :991.0 Median :991.0
## Mean :-6.356 Mean :964.5 Mean :988 Mean :630.8 Mean :747.8
## 3rd Qu.:-9.000 3rd Qu.:991.0 3rd Qu.:991 3rd Qu.:991.0 3rd Qu.:991.0
## Max. :97.000 Max. :993.0 Max. :993 Max. :993.0 Max. :993.0
##
## fentpdapyu
## Min. :-9.00
## 1st Qu.:-9.00
## Median :-9.00
## Mean :-6.85
## 3rd Qu.:-9.00
## Max. : 1.00
##
The summary above shows 949,285 observations of 18 variables covering drug use, sex, income, health and education. The variables describe the participants in the study: whether they used each drug, their age, sex and education.
# Keep only the demographic columns plus the cocaine and heroin measures.
coc_her <- drugs02_18 %>%
  select(
    year, irsex, CATAG7, income, EDUCCAT2,
    cocyr, ircocfy, cocever,
    heryr, irherfy, herever
  )
min(drugs02_18$irsex)
## [1] 1
max(drugs02_18$irsex)
## [1] 2
This shows that the minimum code (1) represents men and the maximum code (2) represents women in the dataset.
min(drugs02_18$CATAG7)
## [1] 1
max(drugs02_18$CATAG7)
## [1] 7
This represents that the minimum is the youngest age group and the maximum is the oldest age group.
min(drugs02_18$income)
## [1] 1
max(drugs02_18$income)
## [1] 4
This represents that category 1 is the lowest income group and category 4 is the highest income group.
min(drugs02_18$EDUCCAT2)
## [1] -9
max(drugs02_18$EDUCCAT2)
## [1] 5
The -9 represents missing (NA) data and 5 represents respondents aged 12-17, who have not completed any degree of college yet.
min(drugs02_18$cocyr)
## [1] 0
max(drugs02_18$cocyr)
## [1] 1
Category 0 represents not using cocaine in the past year and category 1 represents using cocaine in the past year.
min(drugs02_18$ircocfy)
## [1] 1
max(drugs02_18$ircocfy)
## [1] 993
The value 1 represents using cocaine on only one day in the past year and 993 represents not using cocaine in the past year.
min(drugs02_18$cocever)
## [1] 1
max(drugs02_18$cocever)
## [1] 97
Category 1 represents that they have used cocaine before and 97 represents that they refused to answer the question.
min(drugs02_18$heryr)
## [1] 0
max(drugs02_18$heryr)
## [1] 1
Category 0 represents not using heroin in the past year and category 1 represents using heroin in the past year.
min(drugs02_18$irherfy)
## [1] 1
max(drugs02_18$irherfy)
## [1] 993
The value 1 represents using heroin on only one day in the past year and 993 represents not using heroin in the past year.
min(drugs02_18$herever)
## [1] 1
max(drugs02_18$herever)
## [1] 97
Category 1 represents that they have used heroin before and 97 represents that they refused to answer the question.
##Labeling factors(names only) 2.
# Give the survey columns readable names in a single rename() call
# (new_name = old_name); income, cocever and herever keep their names.
coc_her <- coc_her %>%
  rename(
    sex = irsex,
    Age = CATAG7,
    EDUCATION = EDUCCAT2,
    CocPastYear = cocyr,
    CocFreqPastYear = ircocfy,
    HerPastYear = heryr,
    HerFreqPastYear = irherfy
  )
###PART B, section 3
# Turn the survey's special codes into NA so they do not distort the analysis:
#   * days-used columns: any value >= 991
#   * ever-used columns: any value >= 84
#   * EDUCATION: the value -9
coc_her <- coc_her %>%
  mutate(
    across(
      c(CocFreqPastYear, HerFreqPastYear),
      function(v) replace(v, v >= 991, NA)
    ),
    across(
      c(cocever, herever),
      function(v) replace(v, v >= 84, NA)
    ),
    EDUCATION = replace(EDUCATION, EDUCATION == -9, NA)
  )
##Labeling factor levels 3.
# Convert the numerically coded survey columns into labelled factors.
# Labels are meant to follow the NSDUH codebook; confirm them against the
# codebook for the survey years in use before relying on them.

# Two-level yes/no style variables.
coc_her$sex <- factor(coc_her$sex,
  levels = c(1, 2),
  labels = c("male", "female")
)
coc_her$CocPastYear <- factor(coc_her$CocPastYear,
  levels = c(0, 1),
  labels = c("no", "yes")
)
coc_her$HerPastYear <- factor(coc_her$HerPastYear,
  levels = c(0, 1),
  labels = c("no", "yes")
)
# For the "ever used" items 1 means yes and 2 means no.
coc_her$cocever <- factor(coc_her$cocever,
  levels = c(1, 2),
  labels = c("yes", "no")
)
coc_her$herever <- factor(coc_her$herever,
  levels = c(1, 2),
  labels = c("yes", "no")
)

# Seven age bands. Level 1 is ages 12-13 (it was mislabelled "12-17", which
# overlapped with the 14-15 and 16-17 bands).
coc_her$Age <- factor(coc_her$Age,
  levels = 1:7,
  labels = c("12-13", "14-15", "16-17", "18-20", "21-25", "26-34", "35+")
)

# Income runs from lowest (1) to highest (4); the inequality signs on the
# first and last labels were inverted.
coc_her$income <- factor(coc_her$income,
  levels = 1:4,
  labels = c("<20,000", "20,000-49,999", "50,000-74,999", "75,000+")
)

# EDUCCAT2 has five valid codes (the raw data has a maximum of 5). Mapping only
# levels 1-4 turned code 5 into NA and attached "CollegeGrad" and
# "12-17YearOld" to the wrong codes.
coc_her$EDUCATION <- factor(coc_her$EDUCATION,
  levels = 1:5,
  labels = c(
    "LessThanHighSch", "HighSchGrad", "SomeCollege",
    "CollegeGrad", "12-17YearOld"
  )
)

# year was read as character; make it numeric for plotting and filtering.
coc_her$year <- as.numeric(coc_her$year)
HISTOGRAMS 1.
# Distribution of days of cocaine use in the past year, in 10-day bins.
coc_her %>%
  ggplot(mapping = aes(x = CocFreqPastYear)) +
  geom_histogram(binwidth = 10, fill = "blue", color = "black") +
  ggtitle("Histogram of Cocaine Use Days") +
  xlab("Days") +
  ylab("Frequency")
## Warning: Removed 922827 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Distribution of days of heroin use in the past year, in 10-day bins.
coc_her %>%
  ggplot(mapping = aes(x = HerFreqPastYear)) +
  geom_histogram(binwidth = 10, fill = "blue", color = "black") +
  ggtitle("Histogram of Heroin Use Days") +
  xlab("Days") +
  ylab("Frequency")
## Warning: Removed 946105 rows containing non-finite outside the scale range
## (`stat_bin()`).
FREQUENCY POLYGONS 3.
# Days of cocaine use as one frequency polygon per sex (10-day bins).
coc_her %>%
  ggplot(mapping = aes(x = CocFreqPastYear, color = sex)) +
  geom_freqpoly(binwidth = 10) +
  ggtitle("Frequency Polygon of Cocaine Use by Gender") +
  xlab("Days") +
  ylab("Frequency")
## Warning: Removed 922827 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Days of heroin use as one frequency polygon per sex. This plot uses a much
# wider bin (100 days) than the cocaine polygon.
coc_her %>%
  ggplot(mapping = aes(x = HerFreqPastYear, color = sex)) +
  geom_freqpoly(binwidth = 100) +
  ggtitle("Frequency Polygon of Heroin Use by Gender") +
  xlab("Days") +
  ylab("Frequency")
## Warning: Removed 946105 rows containing non-finite outside the scale range
## (`stat_bin()`).
I think that in both cocaine use and heroin use, there is more usage of the drug by males. For cocaine, many more men than women used it on only a few days, but past about 100 days of use a year it is about even between men and women. For heroin, I think that overall there is more usage across the year by men according to the visual.
I think that the larger you make the binwidth, the harder it is to interpret the graph, because you can no longer really see the up-and-down patterns and the discrepancy between male and female usage. I think a larger binwidth would harm people’s ability to fully understand the graph.
Looking at two groups might make us want to add a quantitative value to our intuition. Run 2 t-tests, one each for cocaine and heroin to compare the number of days use between males and females. I’ll help out with this first one by providing some code to get started, see below. Interpret your results completely and accurately. (4 points) t.test(HerFreqLastYear ~ sex, coc_her)
## Warning: In subset.data.frame(coc_her, sex = "Male") :
## extra argument 'sex' will be disregarded
## Warning: In subset.data.frame(coc_her, sex = "Female") :
## extra argument 'sex' will be disregarded
##
## Welch Two Sample t-test
##
## data: MaleCocFreqLastYear and FemCocFreqLastYear
## t = 0, df = 52914, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.048299 1.048299
## sample estimates:
## mean of x mean of y
## 32.41428 32.41428
# Welch t-test of days of heroin use in the past year, males vs females.
# subset() needs a logical condition written with `==`. The earlier
# `sex = "Male"` was taken as a stray named argument and ignored (see the
# "extra argument 'sex' will be disregarded" warnings), so both vectors held
# every respondent and the test compared the sample with itself (t = 0, p = 1).
# The factor labels are lowercase "male" / "female".
# NOTE: the printed output that follows was produced before this fix and must
# be regenerated by re-knitting the document.
MaleHerFreqLastYear <- subset(coc_her, sex == "male")$HerFreqPastYear
FemHerFreqLastYear <- subset(coc_her, sex == "female")$HerFreqPastYear
t.test(MaleHerFreqLastYear, FemHerFreqLastYear)
##
## Welch Two Sample t-test
##
## data: MaleHerFreqLastYear and FemHerFreqLastYear
## t = 0, df = 6358, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.861163 5.861163
## sample estimates:
## mean of x mean of y
## 95.33899 95.33899
Boxplots with confidence intervals 7.
## Warning: Removed 922827 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 946105 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 922827 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: In subset.data.frame(coc_her, Age = 18 - 20) :
## extra argument 'Age' will be disregarded
## Warning: In subset.data.frame(coc_her, Age = 21 - 25) :
## extra argument 'Age' will be disregarded
##
## Welch Two Sample t-test
##
## data: group1 and group2
## t = 0, df = 52914, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.048299 1.048299
## sample estimates:
## mean of x mean of y
## 32.41428 32.41428
## [1] 6
## [1] 6
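Since the p-value was greater than .05, I think that there was not a significant difference in the means of cocaine use between the two age groups. Since the means are also the same, it indicates that the age groups show a very similar trend in cocaine use. The results were as expected. The boxplots showed overlapping notches, and the t-test confirmed no significant difference. The medians were also similar, supporting the conclusion that the age groups experienced around the same usage of cocaine. Caveat: the warnings printed with this test report that the `Age` argument to subset() was disregarded, so both groups contain every respondent; the identical means (32.41), t = 0 and p = 1 reflect that rather than the two age groups, and this conclusion should be rechecked once the groups are filtered with a logical condition (`Age == "18-20"`, `Age == "21-25"`).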
## Warning: In subset.data.frame(coc_her, Age = 18 - 20) :
## extra argument 'Age' will be disregarded
## Warning: In subset.data.frame(coc_her, Age = 21 - 25) :
## extra argument 'Age' will be disregarded
##
## Welch Two Sample t-test
##
## data: group1 and group2
## t = 0, df = 6358, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.861163 5.861163
## sample estimates:
## mean of x mean of y
## 95.33899 95.33899
## [1] 31
## [1] 31
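Since the p-value was greater than .05, I think that there was not a significant difference in the means of heroin use between the two age groups. Since the means are also the same, it indicates that the age groups show a very similar trend in heroin use. The results were as expected. The boxplots showed overlapping notches, and the t-test confirmed no significant difference. The medians were also similar, supporting the conclusion that the age groups experienced around the same usage of heroin. Caveat: the warnings printed with this test report that the `Age` argument to subset() was disregarded, so both groups contain every respondent; the identical means (95.34), t = 0 and p = 1 reflect that rather than the two age groups, and this conclusion should be rechecked once the groups are filtered with a logical condition (`Age == "18-20"`, `Age == "21-25"`).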
# Mean days of cocaine use for each income group (bars) with 95% confidence
# intervals from mean_cl_normal (error bars). The legend is hidden because the
# fill colour only repeats the x axis.
coc_her %>%
  ggplot(mapping = aes(x = income, y = CocFreqPastYear, fill = income)) +
  stat_summary(geom = "bar", fun = "mean") +
  stat_summary(
    geom = "errorbar",
    fun.data = "mean_cl_normal",
    fun.args = list(conf.int = .95),
    width = 0.35
  ) +
  labs(
    title = "Average Days of Cocaine Use per Year by Income Group",
    x = "Income Group",
    y = "Avg Days of Use (Cocaine)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
## Warning: Removed 922827 rows containing non-finite outside the scale range
## (`stat_summary()`).
## Removed 922827 rows containing non-finite outside the scale range
## (`stat_summary()`).
The bar plots show how average days of drug use vary by income level.
For cocaine, the <$20k group appears to have a higher average use
than the $75k+ group. For heroin, this pattern also appears, though the
differences may be more visually pronounced. The error bars (95%
confidence intervals) help us understand the uncertainty in these
averages. When the bars and intervals do not overlap much, it suggests
the differences may be statistically meaningful. We will now confirm
this with t-tests.
# Mean days of heroin use for each income group (bars) with 95% confidence
# intervals from mean_cl_normal (error bars). The legend is hidden because the
# fill colour only repeats the x axis.
coc_her %>%
  ggplot(mapping = aes(x = income, y = HerFreqPastYear, fill = income)) +
  stat_summary(geom = "bar", fun = "mean") +
  stat_summary(
    geom = "errorbar",
    fun.data = "mean_cl_normal",
    fun.args = list(conf.int = .95),
    width = 0.35
  ) +
  labs(
    title = "Average Days of Heroin Use per Year by Income Group",
    x = "Income Group",
    y = "Avg Days of Use (Heroin)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
## Warning: Removed 946105 rows containing non-finite outside the scale range
## (`stat_summary()`).
## Removed 946105 rows containing non-finite outside the scale range
## (`stat_summary()`).
# 13. T-test comparing days of cocaine use between the lowest and highest
# income groups.
# The two groups come from the first and last levels of the `income` factor.
# Splitting respondents by quartiles of CocFreqPastYear and then testing
# CocFreqPastYear itself is circular (the groups differ by construction) and
# says nothing about income, which is what the result name and the write-up
# describe.
# NOTE: the printed output that follows predates this change; re-knit the
# document to refresh it.
income_groups <- levels(coc_her$income)
lowest_income <- income_groups[1]
highest_income <- income_groups[length(income_groups)]

# Days of cocaine use among past-year users in the lowest income group.
low_coc <- coc_her %>%
  filter(income == lowest_income, !is.na(CocFreqPastYear)) %>%
  pull(CocFreqPastYear)

# Days of cocaine use among past-year users in the highest income group.
high_coc <- coc_her %>%
  filter(income == highest_income, !is.na(CocFreqPastYear)) %>%
  pull(CocFreqPastYear)

# Run cocaine t-test (Welch, two-sided)
t_cocaine_income <- t.test(low_coc, high_coc)
t_cocaine_income
##
## Welch Two Sample t-test
##
## data: low_coc and high_coc
## t = -104.82, df = 6618.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -110.1243 -106.0810
## sample estimates:
## mean of x mean of y
## 1.428379 109.531047
# T-test comparing days of heroin use between the lowest and highest income
# groups.
# The two groups come from the first and last levels of the `income` factor.
# Splitting respondents by quartiles of HerFreqPastYear and then testing
# HerFreqPastYear itself is circular (the groups differ by construction) and
# says nothing about income, which is what the result name and the write-up
# describe.
# NOTE: the printed output that follows predates this change; re-knit the
# document to refresh it.
income_groups <- levels(coc_her$income)
lowest_income <- income_groups[1]
highest_income <- income_groups[length(income_groups)]

# Days of heroin use among past-year users in the lowest income group.
low_her <- coc_her %>%
  filter(income == lowest_income, !is.na(HerFreqPastYear)) %>%
  pull(HerFreqPastYear)

# Days of heroin use among past-year users in the highest income group.
high_her <- coc_her %>%
  filter(income == highest_income, !is.na(HerFreqPastYear)) %>%
  pull(HerFreqPastYear)

# Run heroin t-test (Welch, two-sided)
t_heroin_income <- t.test(low_her, high_her)
t_heroin_income
##
## Welch Two Sample t-test
##
## data: low_her and high_her
## t = -106.85, df = 860.37, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -275.6156 -265.6728
## sample estimates:
## mean of x mean of y
## 1.949292 272.593496
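The t-tests comparing average drug use between the lowest and highest income groups show statistically significant differences for both cocaine and heroin. For cocaine, individuals in the <$20k group used the drug more frequently on average than those in the $75k+ group, with a low p-value indicating this difference is unlikely due to chance. A similar pattern was observed for heroin, where the lowest income group again reported higher average use. These results suggest that income level may be an important factor associated with frequency of drug use. Note: the t-test output printed above has group means of 1.43 vs 109.53 days (cocaine) and 1.95 vs 272.59 days (heroin); those groups were formed from the bottom and top quartiles of use frequency, not from income, so the income comparison described here still needs to be confirmed with a test that splits respondents by income group.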
Because the variables ‘CocPastYear’ and ‘HerPastYear’ are coded as 0 (no use) and 1 (used), the mean of the variable represents the proportion of respondents who used the drug. For example, if 10 out of 100 people used cocaine in a year, the mean of that binary variable would be 0.10 = 10%.
# Per-year share (in percent) of respondents who reported past-year use of
# each drug; missing answers are excluded from the mean.
coc_her_summary <- dplyr::summarize(
  group_by(coc_her, year),
  percent_cocaine_use = 100 * mean(CocPastYear == "yes", na.rm = TRUE),
  percent_heroin_use = 100 * mean(HerPastYear == "yes", na.rm = TRUE)
)
head(coc_her_summary)
## # A tibble: 6 × 3
## year percent_cocaine_use percent_heroin_use
## <dbl> <dbl> <dbl>
## 1 2002 3.63 0.240
## 2 2003 3.57 0.239
## 3 2004 3.58 0.272
## 4 2005 3.44 0.286
## 5 2006 3.42 0.276
## 6 2007 3.36 0.236
# Trend in the percentage of respondents using cocaine, one point per year.
coc_her_summary %>%
  ggplot(mapping = aes(x = as.numeric(year), y = percent_cocaine_use)) +
  geom_line(linewidth = 1.2, color = "darkred") +
  ggtitle("Cocaine Use (% of Users) by Year") +
  xlab("Year") +
  ylab("Percent of Respondents Using Cocaine") +
  theme_minimal()

# Same trend plot for heroin.
coc_her_summary %>%
  ggplot(mapping = aes(x = as.numeric(year), y = percent_heroin_use)) +
  geom_line(linewidth = 1.2, color = "steelblue") +
  ggtitle("Heroin Use (% of Users) by Year") +
  xlab("Year") +
  ylab("Percent of Respondents Using Heroin") +
  theme_minimal()
Cocaine use started at about 3.6% of respondents in 2002 and was lower, about 2.5%, by 2018. In contrast, heroin use started lower, around 0.24%, and increased modestly over time, peaking at about 0.43% around 2016 before dipping slightly again. While the overall use remains low compared to cocaine, the trend suggests a gradual increase in heroin use until the last couple of years of the data.
# Compare the share of past-year cocaine users in the first (2002) and last
# (2018) survey years. The TRUE/FALSE vectors are averaged by t.test(), so the
# reported means are proportions.
first_year_coc <- coc_her %>% filter(year == 2002)
last_year_coc <- coc_her %>% filter(year == 2018)
x <- pull(first_year_coc, CocPastYear) == "yes"
y <- pull(last_year_coc, CocPastYear) == "yes"
t.test(x, y)
##
## Welch Two Sample t-test
##
## data: x and y
## t = 11.099, df = 105177, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.009475164 0.013539372
## sample estimates:
## mean of x mean of y
## 0.03626177 0.02475450
# Compare the share of past-year heroin users in the first (2002) and last
# (2018) survey years. The TRUE/FALSE vectors are averaged by t.test(), so the
# reported means are proportions.
first_year_her <- coc_her %>% filter(year == 2002)
last_year_her <- coc_her %>% filter(year == 2018)
x <- pull(first_year_her, HerPastYear) == "yes"
y <- pull(last_year_her, HerPastYear) == "yes"
t.test(x, y)
##
## Welch Two Sample t-test
##
## data: x and y
## t = -3.6034, df = 107728, p-value = 0.0003142
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.001826775 -0.000539631
## sample estimates:
## mean of x mean of y
## 0.002403891 0.003587094
# Per-year, per-sex share (in percent) of respondents who reported past-year
# use of each drug; missing answers are excluded from the mean.
# `.groups = "drop"` returns an ungrouped tibble, so later verbs are not
# silently applied per year, and it silences the "grouped output" message.
coc_her_bysex <- coc_her %>%
  group_by(year, sex) %>%
  dplyr::summarize(
    percent_cocaine_use = mean(CocPastYear == "yes", na.rm = TRUE) * 100,
    percent_heroin_use = mean(HerPastYear == "yes", na.rm = TRUE) * 100,
    .groups = "drop"
  )
# Line plot of cocaine use by year and sex
# `linewidth` replaces `size` for line thickness (deprecated for lines since
# ggplot2 3.4.0) and matches the other line plots in this report.
ggplot(coc_her_bysex, aes(x = year, y = percent_cocaine_use, color = sex)) +
  geom_line(linewidth = 1.2) +
  labs(
    title = "Cocaine Use by Year and Sex",
    x = "Year", y = "Percent Using Cocaine",
    color = "Sex"
  ) +
  theme_minimal()
# Line plot of heroin use by year and sex
# `linewidth` replaces `size` for line thickness (deprecated for lines since
# ggplot2 3.4.0) and matches the other line plots in this report.
ggplot(coc_her_bysex, aes(x = year, y = percent_heroin_use, color = sex)) +
  geom_line(linewidth = 1.2) +
  labs(
    title = "Heroin Use by Year and Sex",
    x = "Year", y = "Percent Using Heroin",
    color = "Sex"
  ) +
  theme_minimal()
Based on the 21 analytics questions that we have just done, we think that the visualizations like histograms, box plots, line plots and bar plots suggest that younger adults in the age range of 18-25 show a higher rate for both cocaine and heroin usage. Additionally, it seems that males are both at higher risk and use the substances more than females. There are also some socioeconomic factors that play a role, as individuals with lower incomes and lower educational levels are more likely to use these reported drugs. The data highlights the need for targeted intervention programs, especially for young adults and those from disadvantaged backgrounds. The most important thing is the age range of those using the drugs, as it is very harmful to the young demographic of people using the drug, and people who don’t have access to higher education can also fall into this trap. They also face additional factors, such as job insecurity, lack of healthcare access, and social instability, which could also contribute to higher drug use. Data often also understates how bad problems really are because certain people are left out of data collection, so we may never fully understand how bad the problem really is.
We think that based on the statistical results and visualizations, heroin appears to be the drug of greatest concern. The frequency of heroin use among those who have used it seems to be significantly high, which indicates a greater risk of addiction compared to cocaine. The data also suggest that ages 18-34, particularly males, are at the highest risk for heroin abuse. The statistical tests and visual trends also tell us that heroin use tends to be more frequent and also more sustained over time, while cocaine use is more sporadic. As well, given the dangers of heroin like overdosing, this demographic of people should be helped, as so many lives can be ruined by the overuse of terrible drugs like heroin. Efforts to mitigate heroin use should focus on increased access to treatment and rehab programs; easier access to better education, job training and healthcare will also help reduce addiction to these drugs. Addressing this issue is important, as the long-term effects of this drug lead to irreversible harm.
This analysis could help public health officials, educators, and policy makers better understand drug use trends across time and across demographic groups. If communicated carefully, it could guide the development of targeted interventions and inform funding for substance abuse prevention. However, ethical concerns include data privacy, potential misuse of findings to stigmatize vulnerable populations, and the exclusion of people not represented in the dataset (e.g., unhoused individuals or those without access to surveys). It’s also important that results be interpreted in context—percentages alone don’t tell the full story of why people use drugs or how policies impact behavior. Ethical communication means emphasizing structural and public health factors rather than blaming individuals or communities.
The Columbus Dispatch article reports a surge in illegal drug use in Ohio during the COVID-19 pandemic, highlighting increased overdose deaths and signs of relapse among people in recovery. While the NSDUH data used in our analysis ends in 2018, our findings suggest that heroin use was gradually increasing even before the pandemic. This is consistent with the article’s narrative that opioid use and overdose risk were already growing, especially among vulnerable populations. The article’s emphasis on the impact of isolation and economic strain are factors not captured in our data. Future research should examine post-2019 trends and incorporate variables like mental health, housing instability, and access to care, which may help explain and address pandemic-era drug use surges.
On Lab 7, Caleb and Ashwath worked together to complete this Lab. While Caleb worked on questions 1-10 and the summary questions 1-2, Ashwath worked on questions 11-21 and the summary questions 3-4. We were able to split up the work evenly, work together well and complete the assignment. Although we did not need to meet up in person to do the lab, we communicated all the time over email and had good communication so that this lab was done in a way that both of us felt comfortable in. In these ways we believe that we did a good job of being partners and worked well together.