Purpose: This report demonstrates an end-to-end workflow for analyzing the impact of drug use among U.S. teenagers (ages 15–20), using the provided Excel template. It follows the scientific method and includes descriptive statistics, correlations, visualizations, and simple models.
Note: If the Excel file is not found, the code will generate a small synthetic dataset so the report still knits (for instructional use only).
library(tidyverse)
library(readxl)
library(janitor)
library(ggpubr)
library(broom)
library(scales)
library(tinytex)
# Path to preloaded workbook (adjust as needed)
xlsx_path <- "Teen_Drug_Use_Analysis_Template_Preload.xlsx"
if (file.exists(xlsx_path)) {
raw <- read_excel(xlsx_path, sheet = "Raw Data") %>%
clean_names()
data_source <- "Excel template (preloaded)"
} else {
set.seed(42)
# --- Synthetic data generation (100 rows) ---
genders <- c("Male","Female","Nonbinary","Prefer not to say")
freqs <- c("Never","Monthly","Weekly","Daily")
substances <- c("Cannabis","Opioids","Stimulants","Sedatives","Hallucinogens","Other","Prefer not to say")
choose_school <- function(age) ifelse(age <= 18, ifelse(runif(1) < .85, "High School", "College"), ifelse(runif(1) < .8, "College", "High School"))
choose_freq <- function(age) {
base <- if (age >= 18) c(.52,.25,.15,.08) else c(.58,.22,.13,.07)
freqs[which.max(runif(1) < cumsum(base))]
}
choose_sub <- function(freq) ifelse(freq == "Never", "Prefer not to say",
sample(substances, 1, prob = c(.65,.025,.10,.10,.08,.045,.10)))
gen_scores <- function(freq, age) {
bump <- c(Never=0, Monthly=2, Weekly=3, Daily=4)[freq]
base_peer <- ifelse(age >= 18, 3, 4)
peer <- pmin(10, pmax(0, round(rnorm(1, base_peer + bump, 1.6))))
base_parent <- ifelse(age <= 17, 7, 6)
drop <- c(Never=0, Monthly=1.5, Weekly=2.5, Daily=3.5)[freq]
parent <- pmin(10, pmax(0, round(rnorm(1, base_parent - drop, 1.8))))
dep_mu <- 10 + 1.4*peer + c(Never=0, Monthly=5, Weekly=10, Daily=15)[freq]
anx_mu <- 8 + 1.2*peer + c(Never=0, Monthly=4, Weekly=8, Daily=12)[freq]
dep <- pmin(60, pmax(0, round(rnorm(1, dep_mu, 6))))
anx <- pmin(60, pmax(0, round(rnorm(1, anx_mu, 6))))
gpa_base <- ifelse(age <= 18, 3.2, 3.0)
gpa_pen <- c(Never=0.0, Monthly=0.25, Weekly=0.45, Daily=0.7)[freq] + (dep+anx)/120
gpa <- pmin(4.0, pmax(0.0, round(rnorm(1, gpa_base - gpa_pen, 0.35), 2)))
abs_mu <- 2 + c(Never=0, Monthly=1, Weekly=2, Daily=3)[freq] + dep/30
absences <- pmin(30, pmax(0, round(rnorm(1, abs_mu, 2))))
c(peer, parent, dep, anx, gpa, absences)
}
n <- 100
df <- purrr::map_dfr(seq_len(n), function(i) {
age <- sample(15:20, 1)
freq <- choose_freq(age)
sc <- gen_scores(freq, age)
tibble(
respondent_id = sprintf("T%03d", i),
age = age,
gender = sample(genders, 1),
school_level = choose_school(age),
drug_use_frequency = freq,
primary_substance = choose_sub(freq),
peer_pressure_score = sc[1],
parental_supervision_score = sc[2],
depression_score = sc[3],
anxiety_score = sc[4],
gpa = sc[5],
absences_last30_days = sc[6],
notes = ""
)
})
raw <- df
data_source <- "Synthetic data (generated in R)"
}
# Preview
paste("Data source:", data_source)
## [1] "Data source: Synthetic data (generated in R)"
raw %>% dplyr::slice_head(n = 6)
## # A tibble: 6 × 13
## respondent_id age gender school_level drug_use_frequency primary_substance
## <chr> <int> <chr> <chr> <chr> <chr>
## 1 T001 15 Male College Daily Opioids
## 2 T002 18 Female High School Never Prefer not to say
## 3 T003 17 Female High School Weekly Cannabis
## 4 T004 19 Female College Never Prefer not to say
## 5 T005 17 Male High School Monthly Prefer not to say
## 6 T006 19 Female College Monthly Prefer not to say
## # ℹ 7 more variables: peer_pressure_score <dbl>,
## # parental_supervision_score <dbl>, depression_score <dbl>,
## # anxiety_score <dbl>, gpa <dbl>, absences_last30_days <dbl>, notes <chr>
freq_levels <- c("Never","Monthly","Weekly","Daily")
clean <- raw %>%
mutate(
drug_use_frequency = factor(drug_use_frequency, levels = freq_levels, ordered = TRUE),
gender = factor(gender),
school_level = factor(school_level, levels = c("High School","College"), ordered = TRUE),
drug_use_frequency_code = case_when(
drug_use_frequency == "Never" ~ 0L,
drug_use_frequency == "Monthly" ~ 1L,
drug_use_frequency == "Weekly" ~ 2L,
drug_use_frequency == "Daily" ~ 3L,
TRUE ~ NA_integer_
),
gender_code = case_when(
gender == "Male" ~ 1L,
gender == "Female" ~ 2L,
gender == "Nonbinary" ~ 3L,
gender == "Prefer not to say" ~ 9L,
TRUE ~ NA_integer_
),
school_level_code = case_when(
school_level == "High School" ~ 1L,
school_level == "College" ~ 2L,
TRUE ~ NA_integer_
),
risk_index = absences_last30_days*0.35 +
peer_pressure_score*0.25 +
(10 - parental_supervision_score)*0.20 +
depression_score*0.10 +
anxiety_score*0.10
)
summary(clean)
## respondent_id age gender school_level
## Length:100 Min. :15.00 Female :28 High School:66
## Class :character 1st Qu.:16.00 Male :28 College :34
## Mode :character Median :17.00 Nonbinary :22
## Mean :17.22 Prefer not to say:22
## 3rd Qu.:19.00
## Max. :20.00
## drug_use_frequency primary_substance peer_pressure_score
## Never :50 Length:100 Min. :0.00
## Monthly:23 Class :character 1st Qu.:3.00
## Weekly :17 Mode :character Median :5.00
## Daily :10 Mean :4.55
## 3rd Qu.:5.25
## Max. :9.00
## parental_supervision_score depression_score anxiety_score gpa
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. :1.000
## 1st Qu.: 4.00 1st Qu.:13.75 1st Qu.:10.00 1st Qu.:2.288
## Median : 5.00 Median :20.00 Median :16.00 Median :2.750
## Mean : 5.13 Mean :20.52 Mean :16.12 Mean :2.635
## 3rd Qu.: 7.00 3rd Qu.:28.00 3rd Qu.:22.00 3rd Qu.:3.002
## Max. :10.00 Max. :42.00 Max. :34.00 Max. :3.550
## absences_last30_days notes drug_use_frequency_code gender_code
## Min. :0.0 Length:100 Min. :0.00 Min. :1.00
## 1st Qu.:2.0 Class :character 1st Qu.:0.00 1st Qu.:1.00
## Median :3.0 Mode :character Median :0.50 Median :2.00
## Mean :3.5 Mean :0.87 Mean :3.48
## 3rd Qu.:5.0 3rd Qu.:2.00 3rd Qu.:3.00
## Max. :9.0 Max. :3.00 Max. :9.00
## school_level_code risk_index
## Min. :1.00 Min. : 0.550
## 1st Qu.:1.00 1st Qu.: 5.088
## Median :1.00 Median : 6.850
## Mean :1.34 Mean : 7.000
## 3rd Qu.:2.00 3rd Qu.: 8.713
## Max. :2.00 Max. :13.900
# Metrics
metrics <- list(
total_respondents = clean %>% dplyr::filter(!is.na(respondent_id)) %>% nrow(),
average_age = mean(clean$age, na.rm = TRUE),
pct_any_drug_use = mean(clean$drug_use_frequency != "Never", na.rm = TRUE),
avg_gpa_all = mean(clean$gpa, na.rm = TRUE),
avg_gpa_no_use = mean(clean$gpa[clean$drug_use_frequency == "Never"], na.rm = TRUE),
avg_gpa_any_use = mean(clean$gpa[clean$drug_use_frequency != "Never"], na.rm = TRUE),
cor_drugcode_gpa = cor(clean$drug_use_frequency_code, clean$gpa, use = "pairwise.complete.obs"),
cor_peer_drugcode = cor(clean$peer_pressure_score, clean$drug_use_frequency_code, use = "pairwise.complete.obs"),
avg_depression_users = mean(clean$depression_score[clean$drug_use_frequency != "Never"], na.rm = TRUE)
)
metrics_tbl <- tibble(
Metric = c("Total Respondents","Average Age","% Any Drug Use",
"Avg GPA (All)","Avg GPA (No Use)","Avg GPA (Any Use)",
"Correlation: DrugUseCode vs GPA","Correlation: PeerPressure vs DrugUseCode",
"Avg Depression Score (Users)"),
Value = c(
metrics$total_respondents,
round(metrics$average_age, 2),
scales::percent(metrics$pct_any_drug_use, accuracy = 0.1),
round(metrics$avg_gpa_all, 2),
round(metrics$avg_gpa_no_use, 2),
round(metrics$avg_gpa_any_use, 2),
round(metrics$cor_drugcode_gpa, 3),
round(metrics$cor_peer_drugcode, 3),
round(metrics$avg_depression_users, 2)
)
)
knitr::kable(metrics_tbl, caption = "Summary metrics mirroring the Excel Analysis sheet")
| Metric | Value |
|---|---|
| Total Respondents | 100 |
| Average Age | 17.22 |
| % Any Drug Use | 50.0% |
| Avg GPA (All) | 2.63 |
| Avg GPA (No Use) | 2.97 |
| Avg GPA (Any Use) | 2.3 |
| Correlation: DrugUseCode vs GPA | -0.786 |
| Correlation: PeerPressure vs DrugUseCode | 0.585 |
| Avg Depression Score (Users) | 27.22 |
# A) Distribution of Drug Use Frequency
freq_dist <- clean %>% count(drug_use_frequency) %>%
mutate(drug_use_frequency = factor(drug_use_frequency, levels = freq_levels))
p1 <- ggplot(freq_dist, aes(x = drug_use_frequency, y = n, fill = drug_use_frequency)) +
geom_col(width = 0.65, show.legend = FALSE) +
labs(title = "Distribution of Drug Use Frequency", x = "Frequency", y = "Count") +
theme_minimal(base_size = 13)
# B) Average Depression by Frequency
dep_by_freq <- clean %>% group_by(drug_use_frequency) %>%
summarize(avg_dep = mean(depression_score, na.rm = TRUE), .groups = "drop") %>%
mutate(drug_use_frequency = factor(drug_use_frequency, levels = freq_levels))
p2 <- ggplot(dep_by_freq, aes(x = drug_use_frequency, y = avg_dep, fill = drug_use_frequency)) +
geom_col(width = 0.65, show.legend = FALSE) +
labs(title = "Average Depression Score by Drug Use Frequency", x = "Frequency", y = "Avg Depression Score") +
theme_minimal(base_size = 13)
# C) Scatter: Drug Use Code vs GPA
p3 <- ggplot(clean, aes(x = drug_use_frequency_code, y = gpa)) +
geom_jitter(width = 0.15, height = 0.02, alpha = 0.7, color = "#3366CC") +
geom_smooth(method = "lm", se = TRUE, color = "#CC3333") +
scale_x_continuous(breaks = 0:3, labels = freq_levels) +
labs(title = "Drug Use Frequency Code vs GPA", x = "Drug Use Frequency (Never→Daily)", y = "GPA") +
theme_minimal(base_size = 13)
# Arrange
ggpubr::ggarrange(p1, p2, p3, ncol = 2, nrow = 2)
# Linear regression: GPA ~ key predictors
model1 <- lm(gpa ~ drug_use_frequency_code + age + gender + school_level + depression_score + anxiety_score, data = clean)
summary(model1)
##
## Call:
## lm(formula = gpa ~ drug_use_frequency_code + age + gender + school_level +
## depression_score + anxiety_score, data = clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.73863 -0.23338 0.01072 0.25120 0.75415
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.884488 0.434470 6.639 2.24e-09 ***
## drug_use_frequency_code -0.398482 0.056248 -7.084 2.87e-10 ***
## age 0.003199 0.024339 0.131 0.8957
## genderMale 0.094429 0.092817 1.017 0.3117
## genderNonbinary 0.040226 0.097077 0.414 0.6796
## genderPrefer not to say 0.070058 0.096954 0.723 0.4718
## school_level.L -0.119496 0.062697 -1.906 0.0598 .
## depression_score -0.002804 0.005628 -0.498 0.6195
## anxiety_score 0.001333 0.005594 0.238 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3357 on 91 degrees of freedom
## Multiple R-squared: 0.6458, Adjusted R-squared: 0.6146
## F-statistic: 20.74 on 8 and 91 DF, p-value: < 2.2e-16
broom::tidy(model1)
## # A tibble: 9 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 2.88 0.434 6.64 2.24e- 9
## 2 drug_use_frequency_code -0.398 0.0562 -7.08 2.87e-10
## 3 age 0.00320 0.0243 0.131 8.96e- 1
## 4 genderMale 0.0944 0.0928 1.02 3.12e- 1
## 5 genderNonbinary 0.0402 0.0971 0.414 6.80e- 1
## 6 genderPrefer not to say 0.0701 0.0970 0.723 4.72e- 1
## 7 school_level.L -0.119 0.0627 -1.91 5.98e- 2
## 8 depression_score -0.00280 0.00563 -0.498 6.20e- 1
## 9 anxiety_score 0.00133 0.00559 0.238 8.12e- 1
# ANOVA: Depression by drug-use category
anova_dep <- aov(depression_score ~ drug_use_frequency, data = clean)
summary(anova_dep)
## Df Sum Sq Mean Sq F value Pr(>F)
## drug_use_frequency 3 5332 1777.5 45.67 <2e-16 ***
## Residuals 96 3737 38.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(anova_dep)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = depression_score ~ drug_use_frequency, data = clean)
##
## $drug_use_frequency
## diff lwr upr p adj
## Monthly-Never 9.353913 5.2441327 13.46369 0.0000003
## Weekly-Never 15.062353 10.4826844 19.64202 0.0000000
## Daily-Never 19.880000 14.2293718 25.53063 0.0000000
## Weekly-Monthly 5.708440 0.4911169 10.92576 0.0262286
## Daily-Monthly 10.526087 4.3473578 16.70482 0.0001323
## Daily-Weekly 4.817647 -1.6831039 11.31840 0.2193649
References (links):
- Steele, D.W., et al. (2020). Brief Behavioral Interventions for
Substance Use in Adolescents. Pediatrics. https://doi.org/10.1542/peds.2020-0351
- AHRQ (2017). Interventions for Substance Use Disorders in
Adolescents: Evidence Summary. https://effectivehealthcare.ahrq.gov/sites/default/files/cer-225-substance-abuse-adolescents-evidence-summary%20225%20Substance%20Use%20Adolescents.pdf
- O’Connor, E., et al. (2020). USPSTF Evidence Report: Interventions
to Prevent Illicit and Nonmedical Drug Use. JAMA. https://jamanetwork.com/journals/jama/fullarticle/2766429
- Nawi, A.M., et al. (2021). Risk and protective factors of drug
abuse among adolescents: a systematic review. BMC Public Health. https://doi.org/10.1186/s12889-021-11906-2
Ethics note: If you replace the synthetic data with real student data, ensure appropriate consent, privacy safeguards, and IRB/administrative approval.
# Save plots and metrics to files (optional)
if (!dir.exists("outputs")) dir.create("outputs")
# Export plots
ggplot2::ggsave("outputs/chart_freq_distribution.png", p1, width = 8, height = 5, dpi = 300)
ggplot2::ggsave("outputs/chart_avg_depression_by_freq.png", p2, width = 8, height = 5, dpi = 300)
ggplot2::ggsave("outputs/chart_drugcode_vs_gpa.png", p3, width = 8, height = 5, dpi = 300)
# Export metrics
readr::write_csv(metrics_tbl, "outputs/metrics_summary.csv")
# Session info
sessionInfo()
## R version 4.3.1 (2023-06-16 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 11 x64 (build 22621)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tinytex_0.58 scales_1.4.0 broom_1.0.11 ggpubr_0.6.2
## [5] janitor_2.2.1 readxl_1.4.5 lubridate_1.9.4 forcats_1.0.1
## [9] stringr_1.6.0 dplyr_1.1.4 purrr_1.0.4 readr_2.1.5
## [13] tidyr_1.3.1 tibble_3.2.1 ggplot2_4.0.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.52 bslib_0.9.0 rstatix_0.7.3
## [5] lattice_0.22-7 tzdb_0.5.0 vctrs_0.6.5 tools_4.3.1
## [9] generics_0.1.4 parallel_4.3.1 pkgconfig_2.0.3 Matrix_1.6-2
## [13] RColorBrewer_1.1-3 S7_0.2.0 lifecycle_1.0.4 compiler_4.3.1
## [17] farver_2.1.2 textshaping_1.0.0 carData_3.0-5 snakecase_0.11.1
## [21] htmltools_0.5.8.1 sass_0.4.9 yaml_2.3.10 Formula_1.2-5
## [25] crayon_1.5.3 pillar_1.11.1 car_3.1-3 jquerylib_0.1.4
## [29] cachem_1.1.0 abind_1.4-8 nlme_3.1-168 tidyselect_1.2.1
## [33] digest_0.6.37 stringi_1.8.7 labeling_0.4.3 splines_4.3.1
## [37] cowplot_1.2.0 fastmap_1.2.0 grid_4.3.1 cli_3.6.1
## [41] magrittr_2.0.3 utf8_1.2.4 withr_3.0.2 backports_1.5.0
## [45] bit64_4.6.0-1 timechange_0.3.0 rmarkdown_2.30 bit_4.6.0
## [49] ggsignif_0.6.4 cellranger_1.1.0 ragg_1.3.3 hms_1.1.4
## [53] evaluate_1.0.5 knitr_1.50 viridisLite_0.4.2 mgcv_1.9-1
## [57] rlang_1.1.4 glue_1.7.0 vroom_1.6.5 rstudioapi_0.17.1
## [61] jsonlite_1.8.8 R6_2.6.1 systemfonts_1.2.2