## Load data
setwd(
"/Users/jasonhoskin/Library/CloudStorage/OneDrive-IndianaUniversity/spring-2026/research-in-health-behavior"
)
## Load libraries
library(haven)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(survey)
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loading required package: survival
##
## Attaching package: 'survey'
##
## The following object is masked from 'package:graphics':
##
## dotchart
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(purrr)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
## Import dataset
gambling <- read_sav("Gambling2024_Weighted.sav")
gambling <- gambling %>%
distinct(uid, .keep_all = TRUE)
### Create demographic categories
location <- gambling %>%
mutate(
location = case_when(
Q_pers2 == 1 ~ "On a farm",
Q_pers2 == 2 ~ "In a rural setting, not on a farm",
Q_pers2 == 3 ~ "In a rural subdivision outside of city limits",
Q_pers2 == 4 ~ "In a small town of less than 5,000 people",
Q_pers2 == 5 ~ "In a larger town of 5,000 to less than 25,000 people",
Q_pers2 == 6 ~ "In a city of 25,000 to less than 50,000 people",
Q_pers2 == 7 ~ "In a city of 50,000 to less than 150,000 people",
Q_pers2 == 8 ~ "In a larger city of 150,000 or more people",
)
) %>%
select(uid, location)
age <- gambling %>%
mutate(
age = case_when(
Q_pers3 == 1 ~ "18-24",
Q_pers3 == 2 ~ "25-34",
Q_pers3 == 3 ~ "35-44",
Q_pers3 == 4 ~ "45-54",
Q_pers3 == 5 ~ "55-64",
Q_pers3 == 6 ~ "65-74",
Q_pers3 == 7 ~ "75+"
)
) %>%
select(uid, age)
marital_status <- gambling %>%
mutate(
marital_status = case_when(
Q_pers4 == 1 ~ "Married",
Q_pers4 == 2 ~ "A member of an unmarried couple",
Q_pers4 == 3 ~ "Divorced",
Q_pers4 == 4 ~ "Separated",
Q_pers4 == 5 ~ "Widowed",
Q_pers4 == 6 ~ "Never been married"
)
) %>%
select(uid, marital_status)
education <- gambling %>%
mutate(
education = case_when(
Q_pers5 == 1 ~ "Less than high school (no diploma, GED, or alternative credential)",
Q_pers5 == 2 ~ "High school graduate (diploma, GED, or alternative credential)",
Q_pers5 == 3 ~ "Some college credit, but no degree",
Q_pers5 == 4 ~ "Associate’s degree",
Q_pers5 == 5 ~ "Bachelor’s degree",
Q_pers5 == 6 ~ "Master’s, doctoral, or professional degree"
)
) %>%
select(uid, education)
employment <- gambling %>%
mutate(
employment = case_when(
Q_pers6 == 1 ~ "Employed 35 or more hours a week",
Q_pers6 == 2 ~ "Employed fewer than 35 hours a week",
Q_pers6 == 3 ~ "Not employed, and looking for work",
Q_pers6 == 4 ~ "Not employed, but not looking for work",
Q_pers6 == 5 ~ "Retired"
)
) %>%
select(uid, employment)
income <- gambling %>%
mutate(
income = case_when(
Q_pers7 == 1 ~ "Less than $15,000",
Q_pers7 == 2 ~ "$15,000 to $34,999",
Q_pers7 == 3 ~ "$35,000 to $49,999",
Q_pers7 == 4 ~ "$50,000 to $74,999",
Q_pers7 == 5 ~ "$75,000 to $99,999",
Q_pers7 == 6 ~ "$100,000 to $149,999",
Q_pers7 == 7 ~ "$150,000 or more"
)
) %>%
select(uid, income)
sex <- gambling %>%
mutate(sex = case_when(Q_pers8 == 1 ~ "Male", Q_pers8 == 2 ~ "Female")) %>%
select(uid, sex)
ethnicity <- gambling %>%
mutate(ethnicity = case_when(Q_pers10 == 1 ~ "Yes", Q_pers10 == 2 ~ "No")) %>%
select(uid, ethnicity)
race <- gambling %>%
mutate(
n_races = rowSums(across(starts_with("Q_pers11")) == 1, na.rm = TRUE),
race = case_when(
n_races > 1 ~ "More than one race",
Q_pers11_1 == 1 ~ "White",
Q_pers11_2 == 1 ~ "Black or African American",
Q_pers11_3 == 1 ~ "Asian",
Q_pers11_4 == 1 ~ "Native Hawaiian/Other Pacific Islander",
Q_pers11_5 == 1 ~ "American Indian or Alaska Native",
Q_pers11_6 == 1 ~ "Other",
n_races == 0 ~ NA_character_,
TRUE ~ "Other"
)
) %>%
select(uid, race)
# Create a list of the specific dataframes with 2 columns
dfs_to_join <- list(age,
education,
employment,
ethnicity,
income,
location,
marital_status,
race,
sex)
# Use reduce to join them all by "uid"
combined_demographics <- dfs_to_join %>%
reduce(full_join, by = "uid")
dems_to_join <- list(combined_demographics, gambling)
gambling_dems <- dems_to_join %>%
reduce(full_join, by = "uid")
# Remove unnecessary variables
rm(
race,
sex,
marital_status,
location,
income,
ethnicity,
employment,
education,
age,
dfs_to_join,
dems_to_join
)
# Ascribe categories to gambling types
gambling_grouped <- gambling_dems %>%
mutate(
age_cat = factor(
as_factor(Q_pers3),
levels = as_factor(1:7)
),
any_gambling = if_any(starts_with("Q_activ"), ~ . > 1),
any_lottery = if_any(c(Q_activ_9, Q_activ_10, Q_activ_11), ~ . > 1),
any_casino = if_any(
c(
Q_activ_6,
Q_activ_7,
Q_activ_8,
Q_activ_13,
Q_activ_14,
Q_activ_17
),
~ . > 1
),
any_sports = if_any(c(Q_activ_1, Q_activ_3), ~ . > 1),
any_online = if_any(c(
Q_activ_2, Q_activ_4, Q_activ_5, Q_activ_12
), ~ . > 1),
other_gamb = if_any(
c(
Q_activ_15,
Q_activ_16,
Q_activ_18,
Q_activ_19,
Q_activ_20,
Q_activ_21
),
~ . > 1
)
)
# Count of gambling activities with any participation
gambling_grouped <- gambling_grouped %>%
mutate(gambling_count = rowSums(across(starts_with("Q_activ"), ~ . > 1), na.rm = TRUE),
any_gambling_count = gambling_count > 0)
gambling_grouped <- gambling_grouped %>%
mutate(
alcohol_use = Q_health2_1 == 1,
tobacco_use = Q_health3_1 == 1,
vape_use = Q_health4_1 == 1,
marijuana_use = Q_health5_1 == 1,
rx_misuse = Q_health6_1 == 1,
any_substance = alcohol_use | tobacco_use | vape_use |
marijuana_use | rx_misuse
)
Visualize age distribution.
age_freq_labeled <- gambling_grouped %>%
mutate(
age_cat = as_factor(Q_pers3)
) %>%
count(age_cat) %>%
mutate(
percent = round(100 * n / sum(n), 1)
)
age_freq_labeled
gambling_grouped <- gambling_grouped %>%
mutate(
age_cat = factor(
as_factor(Q_pers3),
levels = c(
"18-24", "25-34", "35-44",
"45-54", "55-64", "65-74", "75+"
)
)
)
ggplot(
gambling_grouped %>% filter(!is.na(age)),
aes(x = age)
) +
geom_bar(fill = "grey70", color = "black") +
labs(
x = "Age category",
y = "Number of participants",
title = "Age Distribution of Survey Respondents"
)
Visualize substance use frequency. Substance use frequency was assessed
using the number of days each substance was used in the past 30 days,
and polysubstance use was defined as use of two or more substances for
at least one day. Heavy polysubstance use is defined as having used at
least two substances for at least ten days.
gambling_grouped <- gambling_grouped %>%
mutate(
alcohol_days = Q_health2_1,
tobacco_days = Q_health3_1,
vape_days = Q_health4_1,
marijuana_days = Q_health5_1,
rx_misuse_days = Q_health6_1
)
substance_days_long <- gambling_grouped %>%
select(
alcohol_days,
tobacco_days,
vape_days,
marijuana_days,
rx_misuse_days
) %>%
pivot_longer(
cols = everything(),
names_to = "substance",
values_to = "days_used"
) %>%
filter(!is.na(days_used))
ggplot(substance_days_long, aes(x = days_used)) +
geom_histogram(
binwidth = 1,
boundary = -0.5,
fill = "grey70",
color = "black"
) +
facet_wrap(~ substance, scales = "free_y") +
scale_x_continuous(breaks = c(0, 5, 10, 15, 20, 25, 30)) +
labs(
x = "Number of days used (past 30 days)",
y = "Number of participants",
title = "Distribution of Substance Use Frequency by Substance"
)
gambling_grouped <- gambling_grouped %>%
mutate(
substance_count_days = rowSums(
cbind(
alcohol_days > 0,
tobacco_days > 0,
vape_days > 0,
marijuana_days > 0,
rx_misuse_days > 0
),
na.rm = TRUE
)
)
ggplot(gambling_grouped, aes(x = substance_count_days)) +
geom_histogram(
binwidth = 1,
boundary = -0.5,
fill = "grey70",
color = "black"
) +
scale_x_continuous(breaks = 0:5) +
labs(
x = "Number of substances used (≥1 day in past month)",
y = "Number of participants",
title = "Distribution of Number of Substances Used"
)
gambling_grouped %>%
summarise(
any_use = sum(substance_count_days >= 1, na.rm = TRUE),
polysubstance_use = sum(substance_count_days >= 2, na.rm = TRUE),
heavy_polysubstance = sum(
(alcohol_days >= 10) +
(tobacco_days >= 10) +
(vape_days >= 10) +
(marijuana_days >= 10) +
(rx_misuse_days >= 10) >= 2,
na.rm = TRUE
)
)
ggplot(gambling_grouped, aes(x = gambling_count)) +
geom_histogram(binwidth = 1,
color = "black",
fill = "grey70") +
labs(x = "Number of gambling activities (past 12 months)", y = "Count", title = "Distribution of gambling activity count")
mean(gambling_grouped$gambling_count == 0, na.rm = TRUE)
## [1] 0.1354037
gambling_grouped %>%
group_by(any_substance) %>%
summarise(prop_any_gambling = mean(any_gambling_count, na.rm = TRUE)) %>%
ggplot(aes(x = any_substance, y = prop_any_gambling)) +
geom_col(fill = "grey70") +
labs(x = "Any past-month substance use", y = "P(Any gambling activity)", title = "Crossing the zero threshold by substance use")
ggplot(
gambling_grouped %>% filter(gambling_count > 0),
aes(x = any_substance, y = gambling_count)
) +
geom_boxplot(outlier.shape = NA) +
labs(x = "Any past-month substance use", y = "Number of gambling activities", title = "Gambling intensity among gamblers only")
gambling_grouped %>%
summarise(
mean_count = mean(gambling_count, na.rm = TRUE),
var_count = var(gambling_count, na.rm = TRUE)
)
Create long form table for plotting purposes.
dsm_items <- c(
"Q_exp_1",
"Q_exp_2",
"Q_exp_3",
"Q_exp_4",
"Q_exp_5",
"Q_exp_6",
"Q_exp_7",
"Q_exp_8",
"Q_exp_9"
)
pgsi_items <- c(
"Q_consq_1",
"Q_consq_2",
"Q_consq_3",
"Q_consq_4",
"Q_consq_5",
"Q_consq_6",
"Q_consq_7",
"Q_consq_8",
"Q_consq_9"
)
gambling_grouped <- gambling_grouped %>%
mutate(across(
all_of(pgsi_items),
~ case_when(
. == 1 ~ 0,
# Never
. == 2 ~ 1,
# Sometimes
. == 3 ~ 2,
# Most of the time
. == 4 ~ 3,
# Almost always
TRUE ~ NA_real_
)
))
nods_items <- paste0("Q", 1:17)
gambling_grouped <- gambling_grouped %>%
mutate(
nods_total = rowSums(across(all_of(nods_items)) == 1, na.rm = TRUE),
nods_cat = case_when(
nods_total == 0 ~ "No risk",
nods_total %in% 1:2 ~ "Mild risk",
nods_total %in% 3:4 ~ "Moderate risk",
nods_total >= 5 ~ "Pathological gambling",
TRUE ~ NA_character_
),
dsm_total = rowSums(across(all_of(dsm_items)) == 1, na.rm = TRUE),
dsm_dx = dsm_total >= 4,
pgsi_total = rowSums(across(all_of(pgsi_items)), na.rm = TRUE),
pgsi_cat = case_when(
pgsi_total == 0 ~ "Non-problem",
pgsi_total %in% 1:2 ~ "Low severity",
pgsi_total %in% 3:7 ~ "Moderate severity",
pgsi_total >= 8 ~ "Problem gambling",
TRUE ~ NA_character_
),
mental_unhealthy_days = Q_health1_1
)
substance_long <- gambling_grouped %>%
select(
dsm_dx,
pgsi_cat,
nods_cat,
alcohol_use,
tobacco_use,
vape_use,
marijuana_use,
rx_misuse
) %>%
pivot_longer(
cols = c(alcohol_use, tobacco_use, vape_use, marijuana_use, rx_misuse),
names_to = "substance",
values_to = "use"
)
gambling_grouped <- gambling_grouped %>%
mutate(
dsm_cat = factor(
if_else(dsm_dx, "Gambling disorder", "No gambling disorder"),
levels = c("No gambling disorder", "Gambling disorder")
),
nods_cat = factor(
nods_cat,
levels = c("No risk", "Mild risk", "Moderate risk", "Pathological gambling")
),
pgsi_cat = factor(
pgsi_cat,
levels = c(
"Non-problem",
"Low severity",
"Moderate severity",
"Problem gambling"
)
)
)
Assess substance use by DSM-V diagnosis.
dsm_substance <- substance_long %>%
group_by(substance, dsm_dx) %>%
summarise(prop_use = mean(use, na.rm = TRUE), .groups = "drop")
dsm_summary <- gambling_grouped %>%
filter(!is.na(dsm_dx)) %>%
count(dsm_dx) %>%
mutate(prop = n / sum(n))
dsm_summary %>%
mutate(prop = round(prop * 100, 1))
## Create frequency bar graph
ggplot(gambling_grouped %>% filter(!is.na(dsm_cat)), aes(x = dsm_cat)) +
geom_bar(fill = "grey70", color = "black") +
labs(x = "DSM-V gambling diagnosis", y = "Number of participants", title = "DSM-V Gambling Disorder Classification")
## Create overall ggplot
ggplot(dsm_substance, aes(x = substance, y = prop_use, fill = dsm_dx)) +
geom_col(position = "dodge") +
scale_y_continuous(labels = percent) +
labs(x = "Substance",
y = "Proportion using (past month)",
fill = "DSM-V gambling disorder",
title = "Substance use by DSM-V gambling diagnosis")
ggplot(dsm_substance, aes(x = substance, y = prop_use)) +
geom_col(fill = "grey70") +
facet_wrap( ~ dsm_dx) +
scale_y_continuous(labels = percent) +
labs(x = "Substance", y = "Proportion using (past month)", title = "Substance use by DSM-V gambling diagnosis")
## Plot gambling disorder by number of poor mental health days.
mental_by_dsm <- gambling_grouped %>%
filter(!is.na(dsm_dx), !is.na(mental_unhealthy_days)) %>%
group_by(dsm_dx) %>%
summarise(
mean_days = mean(mental_unhealthy_days),
n = n(),
.groups = "drop"
)
ggplot(mental_by_dsm, aes(x = dsm_dx, y = mean_days)) +
geom_col(fill = "grey70", color = "black") +
labs(x = "DSM-V diagnosis", y = "Mean mentally unhealthy days", title = "Mental Health by DSM-V diagnosis")
Assess substance use by PGSI score.
pgsi_substance <- substance_long %>%
group_by(pgsi_cat, substance) %>%
summarise(
prop_use = mean(use, na.rm = TRUE),
n = n(),
.groups = "drop"
)
pgsi_summary <- gambling_grouped %>%
filter(!is.na(pgsi_cat)) %>%
count(pgsi_cat) %>%
mutate(
prop = n / sum(n)
)
pgsi_summary %>%
mutate(prop = round(prop * 100, 1))
ggplot(
gambling_grouped %>% filter(!is.na(pgsi_cat)),
aes(x = pgsi_cat)
) +
geom_bar(fill = "grey70", color = "black") +
labs(
x = "PGSI severity category",
y = "Number of participants",
title = "PGSI Gambling Severity Categories"
)
ggplot(pgsi_substance,
aes(x = pgsi_cat, y = prop_use, fill = substance)) +
geom_col(position = "dodge") +
scale_y_continuous(labels = scales::percent) +
labs(
x = "PGSI severity category",
y = "Past-month substance use",
fill = "Substance",
title = "Substance use by PGSI severity"
)
## Plot days of poor mental health
mental_by_pgsi <- gambling_grouped %>%
filter(!is.na(pgsi_cat), !is.na(mental_unhealthy_days)) %>%
group_by(pgsi_cat) %>%
summarise(
mean_days = mean(mental_unhealthy_days),
n = n(),
.groups = "drop"
)
ggplot(mental_by_pgsi,
aes(x = pgsi_cat, y = mean_days)) +
geom_col(fill = "grey70", color = "black") +
labs(
x = "PGSI severity category",
y = "Mean mentally unhealthy days",
title = "Mental Health by PGSI Severity"
)
Assess substance use by NODS score.
nods_substance <- substance_long %>%
group_by(nods_cat, substance) %>%
summarise(
prop_use = mean(use, na.rm = TRUE),
n = n(),
.groups = "drop"
)
nods_summary <- gambling_grouped %>%
filter(!is.na(nods_cat)) %>%
count(nods_cat) %>%
mutate(
prop = n / sum(n)
)
nods_summary %>%
mutate(prop = round(prop * 100, 1))
ggplot(
gambling_grouped %>% filter(!is.na(nods_cat)),
aes(x = nods_cat)
) +
geom_bar(fill = "grey70", color = "black") +
labs(
x = "NODS risk category",
y = "Number of participants",
title = "NODS Gambling Risk Categories"
)
ggplot(nods_substance,
aes(x = nods_cat, y = prop_use, fill = substance)) +
geom_col(position = "dodge") +
scale_y_continuous(labels = scales::percent) +
labs(
x = "NODS risk category",
y = "Past-month substance use",
fill = "Substance",
title = "Substance use by NODS risk level"
)
mental_by_nods <- gambling_grouped %>%
filter(!is.na(nods_cat), !is.na(mental_unhealthy_days)) %>%
group_by(nods_cat) %>%
summarise(
mean_days = mean(mental_unhealthy_days),
n = n(),
.groups = "drop"
)
ggplot(mental_by_nods,
aes(x = nods_cat, y = mean_days)) +
geom_col(fill = "grey70", color = "black") +
labs(
x = "NODS severity category",
y = "Mean mentally unhealthy days",
title = "Mental Health by NODS severity"
)
Now report on the number of NAs in the dataset.
item_na_summary <- gambling_grouped %>%
summarise(across(c(
all_of(dsm_items), all_of(nods_items), all_of(pgsi_items)
), ~ sum(is.na(.)), .names = "na_{.col}")) %>%
pivot_longer(cols = everything(),
names_to = "item",
values_to = "na_count") %>%
arrange(desc(na_count))
item_na_summary %>%
filter(na_count > 0)
###Calculate how many respondents excluded from gambling measures.
scale_na_summary <- tibble(
scale = c("DSM-V", "NODS", "PGSI"),
total_sample = nrow(gambling_grouped),
complete_cases = c(sum(!is.na(
gambling_grouped$dsm_cat
)), sum(!is.na(
gambling_grouped$nods_cat
)), sum(!is.na(
gambling_grouped$pgsi_cat
)))
) %>%
mutate(
na_count = total_sample - complete_cases,
na_percent = round(100 * na_count / total_sample, 1)
)
scale_na_summary
### Calculate NAs for substance use variables
substance_na_summary <- gambling_grouped %>%
summarise(across(
c(
mental_unhealthy_days,
alcohol_use,
tobacco_use,
vape_use,
marijuana_use,
rx_misuse,
any_substance
),
~ sum(is.na(.)),
.names = "na_{.col}"
)) %>%
pivot_longer(cols = everything(),
names_to = "variable",
values_to = "na_count") %>%
mutate(
variable = str_remove(variable, "^na_"),
total_n = nrow(gambling_grouped),
na_percent = round(100 * na_count / total_n, 1)
) %>%
arrange(desc(na_count))
substance_na_summary
Report age with gambling severity.
### PGSI by age
pgsi_by_age <- gambling_grouped %>%
filter(!is.na(pgsi_cat), !is.na(age_cat)) %>%
count(age_cat, pgsi_cat) %>%
group_by(age_cat) %>%
mutate(
percent = round(100 * n / sum(n), 1)
) %>%
ungroup()
ggplot(pgsi_by_age, aes(x = age_cat, y = percent, fill = pgsi_cat)) +
geom_col(position = "stack") +
labs(
x = "Age category",
y = "Percent of age group",
fill = "PGSI category",
title = "PGSI Gambling Severity by Age"
) +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
### DSM-V diagnosis by age
dsm_by_age <- gambling_grouped %>%
filter(!is.na(dsm_cat), !is.na(age_cat)) %>%
count(age_cat, dsm_cat) %>%
group_by(age_cat) %>%
mutate(percent = round(100 * n / sum(n), 1)) %>%
ungroup()
ggplot(dsm_by_age,
aes(x = age_cat, y = percent, fill = dsm_cat)) +
geom_col(position = "stack") +
labs(
x = "Age category",
y = "Percent",
fill = "DSM-V diagnosis",
title = "DSM-V Gambling Disorder by Age"
) +
theme(axis.text.x = element_text(angle = 30, hjust = 1))