HOW TO ANALYZE YOUR DATA: Teen Drug Use (15–20)

Purpose: This report demonstrates an end-to-end workflow for analyzing the impact of drug use among U.S. teenagers (ages 15–20), using the provided Excel template. It follows the scientific method and includes descriptive statistics, correlations, visualizations, and simple models.

Note: If the Excel file is not found, the code will generate a small synthetic dataset so the report still knits (for instructional use only).

0.1 1. Load Packages

library(tidyverse)
library(readxl)
library(janitor)
library(ggpubr)
library(broom)
library(scales)
library(tinytex)

0.2 2. Import Data

# Path to preloaded workbook (adjust as needed)
xlsx_path <- "Teen_Drug_Use_Analysis_Template_Preload.xlsx"

if (file.exists(xlsx_path)) {
  raw <- read_excel(xlsx_path, sheet = "Raw Data") %>%
    clean_names()
  data_source <- "Excel template (preloaded)"
} else {
  set.seed(42)
  # --- Synthetic data generation (100 rows) ---
  genders <- c("Male","Female","Nonbinary","Prefer not to say")
  freqs <- c("Never","Monthly","Weekly","Daily")
  substances <- c("Cannabis","Opioids","Stimulants","Sedatives","Hallucinogens","Other","Prefer not to say")
  choose_school <- function(age) ifelse(age <= 18, ifelse(runif(1) < .85, "High School", "College"), ifelse(runif(1) < .8, "College", "High School"))
  choose_freq <- function(age) {
    base <- if (age >= 18) c(.52,.25,.15,.08) else c(.58,.22,.13,.07)
    freqs[which.max(runif(1) < cumsum(base))]
  }
  choose_sub <- function(freq) ifelse(freq == "Never", "Prefer not to say",
                                      sample(substances, 1, prob = c(.65,.025,.10,.10,.08,.045,.10)))
  gen_scores <- function(freq, age) {
    bump <- c(Never=0, Monthly=2, Weekly=3, Daily=4)[freq]
    base_peer <- ifelse(age >= 18, 3, 4)
    peer <- pmin(10, pmax(0, round(rnorm(1, base_peer + bump, 1.6))))
    base_parent <- ifelse(age <= 17, 7, 6)
    drop <- c(Never=0, Monthly=1.5, Weekly=2.5, Daily=3.5)[freq]
    parent <- pmin(10, pmax(0, round(rnorm(1, base_parent - drop, 1.8))))
    dep_mu <- 10 + 1.4*peer + c(Never=0, Monthly=5, Weekly=10, Daily=15)[freq]
    anx_mu <- 8 + 1.2*peer + c(Never=0, Monthly=4, Weekly=8, Daily=12)[freq]
    dep <- pmin(60, pmax(0, round(rnorm(1, dep_mu, 6))))
    anx <- pmin(60, pmax(0, round(rnorm(1, anx_mu, 6))))
    gpa_base <- ifelse(age <= 18, 3.2, 3.0)
    gpa_pen <- c(Never=0.0, Monthly=0.25, Weekly=0.45, Daily=0.7)[freq] + (dep+anx)/120
    gpa <- pmin(4.0, pmax(0.0, round(rnorm(1, gpa_base - gpa_pen, 0.35), 2)))
    abs_mu <- 2 + c(Never=0, Monthly=1, Weekly=2, Daily=3)[freq] + dep/30
    absences <- pmin(30, pmax(0, round(rnorm(1, abs_mu, 2))))
    c(peer, parent, dep, anx, gpa, absences)
  }
  n <- 100
  df <- purrr::map_dfr(seq_len(n), function(i) {
    age <- sample(15:20, 1)
    freq <- choose_freq(age)
    sc <- gen_scores(freq, age)
    tibble(
      respondent_id = sprintf("T%03d", i),
      age = age,
      gender = sample(genders, 1),
      school_level = choose_school(age),
      drug_use_frequency = freq,
      primary_substance = choose_sub(freq),
      peer_pressure_score = sc[1],
      parental_supervision_score = sc[2],
      depression_score = sc[3],
      anxiety_score = sc[4],
      gpa = sc[5],
      absences_last30_days = sc[6],
      notes = ""
    )
  })
  raw <- df
  data_source <- "Synthetic data (generated in R)"
}

# Preview
paste("Data source:", data_source)

## [1] "Data source: Synthetic data (generated in R)"

raw %>% dplyr::slice_head(n = 6)

## # A tibble: 6 × 13
##   respondent_id   age gender school_level drug_use_frequency primary_substance
##   <chr>         <int> <chr>  <chr>        <chr>              <chr>            
## 1 T001             15 Male   College      Daily              Opioids          
## 2 T002             18 Female High School  Never              Prefer not to say
## 3 T003             17 Female High School  Weekly             Cannabis         
## 4 T004             19 Female College      Never              Prefer not to say
## 5 T005             17 Male   High School  Monthly            Prefer not to say
## 6 T006             19 Female College      Monthly            Prefer not to say
## # ℹ 7 more variables: peer_pressure_score <dbl>,
## #   parental_supervision_score <dbl>, depression_score <dbl>,
## #   anxiety_score <dbl>, gpa <dbl>, absences_last30_days <dbl>, notes <chr>

0.3 3. Clean & Recode

freq_levels <- c("Never","Monthly","Weekly","Daily")
clean <- raw %>%
  mutate(
    drug_use_frequency = factor(drug_use_frequency, levels = freq_levels, ordered = TRUE),
    gender = factor(gender),
    school_level = factor(school_level, levels = c("High School","College"), ordered = TRUE),
    drug_use_frequency_code = case_when(
      drug_use_frequency == "Never"   ~ 0L,
      drug_use_frequency == "Monthly" ~ 1L,
      drug_use_frequency == "Weekly"  ~ 2L,
      drug_use_frequency == "Daily"   ~ 3L,
      TRUE ~ NA_integer_
    ),
    gender_code = case_when(
      gender == "Male" ~ 1L,
      gender == "Female" ~ 2L,
      gender == "Nonbinary" ~ 3L,
      gender == "Prefer not to say" ~ 9L,
      TRUE ~ NA_integer_
    ),
    school_level_code = case_when(
      school_level == "High School" ~ 1L,
      school_level == "College" ~ 2L,
      TRUE ~ NA_integer_
    ),
    risk_index = absences_last30_days*0.35 +
                 peer_pressure_score*0.25 +
                 (10 - parental_supervision_score)*0.20 +
                 depression_score*0.10 +
                 anxiety_score*0.10
  )

summary(clean)

##  respondent_id           age                      gender        school_level
##  Length:100         Min.   :15.00   Female           :28   High School:66   
##  Class :character   1st Qu.:16.00   Male             :28   College    :34   
##  Mode  :character   Median :17.00   Nonbinary        :22                    
##                     Mean   :17.22   Prefer not to say:22                    
##                     3rd Qu.:19.00                                           
##                     Max.   :20.00                                           
##  drug_use_frequency primary_substance  peer_pressure_score
##  Never  :50         Length:100         Min.   :0.00       
##  Monthly:23         Class :character   1st Qu.:3.00       
##  Weekly :17         Mode  :character   Median :5.00       
##  Daily  :10                            Mean   :4.55       
##                                        3rd Qu.:5.25       
##                                        Max.   :9.00       
##  parental_supervision_score depression_score anxiety_score        gpa       
##  Min.   : 0.00              Min.   : 0.00    Min.   : 0.00   Min.   :1.000  
##  1st Qu.: 4.00              1st Qu.:13.75    1st Qu.:10.00   1st Qu.:2.288  
##  Median : 5.00              Median :20.00    Median :16.00   Median :2.750  
##  Mean   : 5.13              Mean   :20.52    Mean   :16.12   Mean   :2.635  
##  3rd Qu.: 7.00              3rd Qu.:28.00    3rd Qu.:22.00   3rd Qu.:3.002  
##  Max.   :10.00              Max.   :42.00    Max.   :34.00   Max.   :3.550  
##  absences_last30_days    notes           drug_use_frequency_code  gender_code  
##  Min.   :0.0          Length:100         Min.   :0.00            Min.   :1.00  
##  1st Qu.:2.0          Class :character   1st Qu.:0.00            1st Qu.:1.00  
##  Median :3.0          Mode  :character   Median :0.50            Median :2.00  
##  Mean   :3.5                             Mean   :0.87            Mean   :3.48  
##  3rd Qu.:5.0                             3rd Qu.:2.00            3rd Qu.:3.00  
##  Max.   :9.0                             Max.   :3.00            Max.   :9.00  
##  school_level_code   risk_index    
##  Min.   :1.00      Min.   : 0.550  
##  1st Qu.:1.00      1st Qu.: 5.088  
##  Median :1.00      Median : 6.850  
##  Mean   :1.34      Mean   : 7.000  
##  3rd Qu.:2.00      3rd Qu.: 8.713  
##  Max.   :2.00      Max.   :13.900

0.4 4. Key Metrics

# Metrics
metrics <- list(
  total_respondents = clean %>% dplyr::filter(!is.na(respondent_id)) %>% nrow(),
  average_age = mean(clean$age, na.rm = TRUE),
  pct_any_drug_use = mean(clean$drug_use_frequency != "Never", na.rm = TRUE),
  avg_gpa_all = mean(clean$gpa, na.rm = TRUE),
  avg_gpa_no_use = mean(clean$gpa[clean$drug_use_frequency == "Never"], na.rm = TRUE),
  avg_gpa_any_use = mean(clean$gpa[clean$drug_use_frequency != "Never"], na.rm = TRUE),
  cor_drugcode_gpa = cor(clean$drug_use_frequency_code, clean$gpa, use = "pairwise.complete.obs"),
  cor_peer_drugcode = cor(clean$peer_pressure_score, clean$drug_use_frequency_code, use = "pairwise.complete.obs"),
  avg_depression_users = mean(clean$depression_score[clean$drug_use_frequency != "Never"], na.rm = TRUE)
)

metrics_tbl <- tibble(
  Metric = c("Total Respondents","Average Age","% Any Drug Use",
             "Avg GPA (All)","Avg GPA (No Use)","Avg GPA (Any Use)",
             "Correlation: DrugUseCode vs GPA","Correlation: PeerPressure vs DrugUseCode",
             "Avg Depression Score (Users)"),
  Value = c(
    metrics$total_respondents,
    round(metrics$average_age, 2),
    scales::percent(metrics$pct_any_drug_use, accuracy = 0.1),
    round(metrics$avg_gpa_all, 2),
    round(metrics$avg_gpa_no_use, 2),
    round(metrics$avg_gpa_any_use, 2),
    round(metrics$cor_drugcode_gpa, 3),
    round(metrics$cor_peer_drugcode, 3),
    round(metrics$avg_depression_users, 2)
  )
)

knitr::kable(metrics_tbl, caption = "Summary metrics mirroring the Excel Analysis sheet")

Summary metrics mirroring the Excel Analysis sheet
Metric	Value
Total Respondents	100
Average Age	17.22
% Any Drug Use	50.0%
Avg GPA (All)	2.63
Avg GPA (No Use)	2.97
Avg GPA (Any Use)	2.3
Correlation: DrugUseCode vs GPA	-0.786
Correlation: PeerPressure vs DrugUseCode	0.585
Avg Depression Score (Users)	27.22

0.5 5. Visualizations

# A) Distribution of Drug Use Frequency
freq_dist <- clean %>% count(drug_use_frequency) %>%
  mutate(drug_use_frequency = factor(drug_use_frequency, levels = freq_levels))

p1 <- ggplot(freq_dist, aes(x = drug_use_frequency, y = n, fill = drug_use_frequency)) +
  geom_col(width = 0.65, show.legend = FALSE) +
  labs(title = "Distribution of Drug Use Frequency", x = "Frequency", y = "Count") +
  theme_minimal(base_size = 13)

# B) Average Depression by Frequency
dep_by_freq <- clean %>% group_by(drug_use_frequency) %>%
  summarize(avg_dep = mean(depression_score, na.rm = TRUE), .groups = "drop") %>%
  mutate(drug_use_frequency = factor(drug_use_frequency, levels = freq_levels))

p2 <- ggplot(dep_by_freq, aes(x = drug_use_frequency, y = avg_dep, fill = drug_use_frequency)) +
  geom_col(width = 0.65, show.legend = FALSE) +
  labs(title = "Average Depression Score by Drug Use Frequency", x = "Frequency", y = "Avg Depression Score") +
  theme_minimal(base_size = 13)

# C) Scatter: Drug Use Code vs GPA
p3 <- ggplot(clean, aes(x = drug_use_frequency_code, y = gpa)) +
  geom_jitter(width = 0.15, height = 0.02, alpha = 0.7, color = "#3366CC") +
  geom_smooth(method = "lm", se = TRUE, color = "#CC3333") +
  scale_x_continuous(breaks = 0:3, labels = freq_levels) +
  labs(title = "Drug Use Frequency Code vs GPA", x = "Drug Use Frequency (Never→Daily)", y = "GPA") +
  theme_minimal(base_size = 13)

# Arrange
ggpubr::ggarrange(p1, p2, p3, ncol = 2, nrow = 2)

0.6 6. Simple Models

# Linear regression: GPA ~ key predictors
model1 <- lm(gpa ~ drug_use_frequency_code + age + gender + school_level + depression_score + anxiety_score, data = clean)
summary(model1)

## 
## Call:
## lm(formula = gpa ~ drug_use_frequency_code + age + gender + school_level + 
##     depression_score + anxiety_score, data = clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.73863 -0.23338  0.01072  0.25120  0.75415 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.884488   0.434470   6.639 2.24e-09 ***
## drug_use_frequency_code -0.398482   0.056248  -7.084 2.87e-10 ***
## age                      0.003199   0.024339   0.131   0.8957    
## genderMale               0.094429   0.092817   1.017   0.3117    
## genderNonbinary          0.040226   0.097077   0.414   0.6796    
## genderPrefer not to say  0.070058   0.096954   0.723   0.4718    
## school_level.L          -0.119496   0.062697  -1.906   0.0598 .  
## depression_score        -0.002804   0.005628  -0.498   0.6195    
## anxiety_score            0.001333   0.005594   0.238   0.8122    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3357 on 91 degrees of freedom
## Multiple R-squared:  0.6458, Adjusted R-squared:  0.6146 
## F-statistic: 20.74 on 8 and 91 DF,  p-value: < 2.2e-16

broom::tidy(model1)

## # A tibble: 9 × 5
##   term                    estimate std.error statistic  p.value
##   <chr>                      <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)              2.88      0.434       6.64  2.24e- 9
## 2 drug_use_frequency_code -0.398     0.0562     -7.08  2.87e-10
## 3 age                      0.00320   0.0243      0.131 8.96e- 1
## 4 genderMale               0.0944    0.0928      1.02  3.12e- 1
## 5 genderNonbinary          0.0402    0.0971      0.414 6.80e- 1
## 6 genderPrefer not to say  0.0701    0.0970      0.723 4.72e- 1
## 7 school_level.L          -0.119     0.0627     -1.91  5.98e- 2
## 8 depression_score        -0.00280   0.00563    -0.498 6.20e- 1
## 9 anxiety_score            0.00133   0.00559     0.238 8.12e- 1

# ANOVA: Depression by drug-use category
anova_dep <- aov(depression_score ~ drug_use_frequency, data = clean)
summary(anova_dep)

##                    Df Sum Sq Mean Sq F value Pr(>F)    
## drug_use_frequency  3   5332  1777.5   45.67 <2e-16 ***
## Residuals          96   3737    38.9                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

TukeyHSD(anova_dep)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = depression_score ~ drug_use_frequency, data = clean)
## 
## $drug_use_frequency
##                     diff        lwr      upr     p adj
## Monthly-Never   9.353913  5.2441327 13.46369 0.0000003
## Weekly-Never   15.062353 10.4826844 19.64202 0.0000000
## Daily-Never    19.880000 14.2293718 25.53063 0.0000000
## Weekly-Monthly  5.708440  0.4911169 10.92576 0.0262286
## Daily-Monthly  10.526087  4.3473578 16.70482 0.0001323
## Daily-Weekly    4.817647 -1.6831039 11.31840 0.2193649

0.7 7. Interpretation & Scientific Context

Findings often show that brief behavioral interventions (e.g., motivational interviewing) reduce alcohol use days among adolescents, while cannabis reductions are less consistent; interpret your alcohol vs. cannabis patterns accordingly (see Steele et al., 2020; AHRQ Evidence Summary, 2017).
Prevention efforts targeting family monitoring and school-based programs can influence risk/protective factors (peer norms, parental supervision), which you can explore via the Risk Index and subgroup analyses (see O’Connor et al., 2020; Nawi et al., 2021).
Consider how peer pressure and mental health are related to substance use and academic outcomes (e.g., GPA), and compare your correlations to published evidence.

References (links):
- Steele, D.W., et al. (2020). Brief Behavioral Interventions for Substance Use in Adolescents. Pediatrics. https://doi.org/10.1542/peds.2020-0351
- AHRQ (2017). Interventions for Substance Use Disorders in Adolescents: Evidence Summary. https://effectivehealthcare.ahrq.gov/sites/default/files/cer-225-substance-abuse-adolescents-evidence-summary%20225%20Substance%20Use%20Adolescents.pdf
- O’Connor, E., et al. (2020). USPSTF Evidence Report: Interventions to Prevent Illicit and Nonmedical Drug Use. JAMA. https://jamanetwork.com/journals/jama/fullarticle/2766429
- Nawi, A.M., et al. (2021). Risk and protective factors of drug abuse among adolescents: a systematic review. BMC Public Health. https://doi.org/10.1186/s12889-021-11906-2

Ethics note: If you replace the synthetic data with real student data, ensure appropriate consent, privacy safeguards, and IRB/administrative approval.

0.8 8. Reproducibility & Outputs

# Save plots and metrics to files (optional)
if (!dir.exists("outputs")) dir.create("outputs")

# Export plots
ggplot2::ggsave("outputs/chart_freq_distribution.png", p1, width = 8, height = 5, dpi = 300)
ggplot2::ggsave("outputs/chart_avg_depression_by_freq.png", p2, width = 8, height = 5, dpi = 300)

ggplot2::ggsave("outputs/chart_drugcode_vs_gpa.png", p3, width = 8, height = 5, dpi = 300)

# Export metrics
readr::write_csv(metrics_tbl, "outputs/metrics_summary.csv")

# Session info
sessionInfo()

## R version 4.3.1 (2023-06-16 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 11 x64 (build 22621)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tinytex_0.58    scales_1.4.0    broom_1.0.11    ggpubr_0.6.2   
##  [5] janitor_2.2.1   readxl_1.4.5    lubridate_1.9.4 forcats_1.0.1  
##  [9] stringr_1.6.0   dplyr_1.1.4     purrr_1.0.4     readr_2.1.5    
## [13] tidyr_1.3.1     tibble_3.2.1    ggplot2_4.0.1   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       xfun_0.52          bslib_0.9.0        rstatix_0.7.3     
##  [5] lattice_0.22-7     tzdb_0.5.0         vctrs_0.6.5        tools_4.3.1       
##  [9] generics_0.1.4     parallel_4.3.1     pkgconfig_2.0.3    Matrix_1.6-2      
## [13] RColorBrewer_1.1-3 S7_0.2.0           lifecycle_1.0.4    compiler_4.3.1    
## [17] farver_2.1.2       textshaping_1.0.0  carData_3.0-5      snakecase_0.11.1  
## [21] htmltools_0.5.8.1  sass_0.4.9         yaml_2.3.10        Formula_1.2-5     
## [25] crayon_1.5.3       pillar_1.11.1      car_3.1-3          jquerylib_0.1.4   
## [29] cachem_1.1.0       abind_1.4-8        nlme_3.1-168       tidyselect_1.2.1  
## [33] digest_0.6.37      stringi_1.8.7      labeling_0.4.3     splines_4.3.1     
## [37] cowplot_1.2.0      fastmap_1.2.0      grid_4.3.1         cli_3.6.1         
## [41] magrittr_2.0.3     utf8_1.2.4         withr_3.0.2        backports_1.5.0   
## [45] bit64_4.6.0-1      timechange_0.3.0   rmarkdown_2.30     bit_4.6.0         
## [49] ggsignif_0.6.4     cellranger_1.1.0   ragg_1.3.3         hms_1.1.4         
## [53] evaluate_1.0.5     knitr_1.50         viridisLite_0.4.2  mgcv_1.9-1        
## [57] rlang_1.1.4        glue_1.7.0         vroom_1.6.5        rstudioapi_0.17.1 
## [61] jsonlite_1.8.8     R6_2.6.1           systemfonts_1.2.2

HOW TO ANALYZE YOUR DATA: Teen Drug Use (15–20) — R Markdown Report

Gredy Garrido PhD

2025-12-16