This is an R Markdown
Notebook. When you execute code within the notebook, the results appear
beneath the code.
library(dplyr)
library(ggplot2)
install.packages("survey")
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Installing package into ‘C:/Users/seise/AppData/Local/R/win-library/4.2’
(as ‘lib’ is unspecified)
also installing the dependencies ‘DBI’, ‘mitools’
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.2/DBI_1.1.3.zip'
Content type 'application/zip' length 767082 bytes (749 KB)
downloaded 749 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.2/mitools_2.4.zip'
Content type 'application/zip' length 298587 bytes (291 KB)
downloaded 291 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.2/survey_4.2-1.zip'
Content type 'application/zip' length 3198784 bytes (3.1 MB)
downloaded 3.1 MB
package ‘DBI’ successfully unpacked and MD5 sums checked
package ‘mitools’ successfully unpacked and MD5 sums checked
package ‘survey’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\seise\AppData\Local\Temp\RtmpIrH1dg\downloaded_packages
library(survey)
Warning: package ‘survey’ was built under R version 4.2.3Loading required package: grid
Loading required package: Matrix
Loading required package: survival
Attaching package: ‘survey’
The following object is masked from ‘package:graphics’:
dotchart
#import dataset csv
dataset<-use_this_July26_2023
dataset
NA
#number of research participants = 19,997
#name column variables for ease of use and understanding
dataset$Year -> year
dataset$RIAGENDR -> gender
dataset$RIDAGEYR -> age
dataset$INDFMPIR -> ratio_income_to_poverty_family
dataset$nhanes_drug_use_csv.DUQ200 -> mj_hashish_ever
dataset$nhanes_drug_use_csv.DUQ210 -> age_1st_mj
dataset$nhanes_drug_use_csv.DUQ213 -> age_start_reg_use_mj
dataset$nhanes_drug_use_csv.DUQ230 -> no_days_used_mj_in_last_month
dataset$nhanes_drug_use_csv.DUQ240 -> coc_her_meth_ever
dataset$nhanes_drug_use_csv.DUQ250 -> coc_ever
dataset$nhanes_drug_use_csv.DUQ260 -> age_1st_coc
dataset$nhanes_drug_use_csv.DUQ280 -> no_days_used_coc_in_last_month
dataset$nhanes_drug_use_csv.DUQ290 -> heroin_ever
dataset$nhanes_drug_use_csv.DUQ300 -> age_1st_her
dataset$nhanes_drug_use_csv.DUQ320 -> no_days_used_her_in_last_month
dataset$nhanes_drug_use_csv.DUQ330 -> meth_ever
dataset$nhanes_drug_use_csv.DUQ340 -> age_1st_meth
dataset$nhanes_drug_use_csv.DUQ360 -> no_days_used_meth_in_last_month
dataset$nhanes_drug_use_csv.DUQ370 -> needle_use_ever
#Survey weights
#Use "svydesign" to assign NHANES-recommended weights. We will use this new design variable "dataset_weighted" when obtaining descriptive statistics.
dataset_weighted <- svydesign(id = dataset$SDMVPSU,
strata = dataset$SDMVSTRA,
weights = dataset$WTINT2YR,
nest = TRUE,
data = dataset)
dataset_weighted
Stratified 1 - level Cluster Sampling design (with replacement)
With (122) clusters.
svydesign(id = dataset$SDMVPSU, strata = dataset$SDMVSTRA, weights = dataset$WTINT2YR,
nest = TRUE, data = dataset)
#Scatterplots_____________________________________________________________________________________
##Marijuana:
###Number of days participants reported using marijuana in last month plotted against their current age
#how many research participants reported using marijuana at least one day in the last 30 days?
19997-17836
[1] 2161
#2161 participants
ggplot(dataset, aes(x = age, y = no_days_used_mj_in_last_month)) +
geom_point()

Conclusion: There is not a clear relationship between current age and
number of days marijuana was used during the past month. Some 50-60 year
olds used marijuana nearly every day while others used marijuana less
frequently. This is true of younger participants as well. But what about
by gender?
ggplot(dataset, aes(x = age, y = no_days_used_mj_in_last_month, color = gender)) +
geom_point()

Hmm, there is still no evidence of a relationship between age and
number of days marijuana was used that varies by gender (1, red = male;
2, blue = female).
Maybe, instead of age, income to family ratio may be more interesting
to explore. I will substitute income to family ratio for age as the
x-axis.
ggplot(dataset, aes(x = ratio_income_to_poverty_family, y = no_days_used_mj_in_last_month)) +
geom_point()

n = 19997-17997 = 2000 participants who had these data. Conclusion:
In this scatter plot, one can see that individuals with a smaller ratio
of income to family size tended to report greater number of days that
marijuana was used during the last month. This figure presents more
interesting information compared to the previous scatterplot (age vs #
days marijuana used during last month).
Does the relationship between income:family size and #days marijuana
used during last month vary by gender?
ggplot(dataset, aes(x = ratio_income_to_poverty_family, y = no_days_used_mj_in_last_month, color = gender)) +
geom_point()

It looks like the relationship between the income-to-poverty ratio and
#days marijuana used during last month is similar across males (1, red)
and females (2, blue). Let’s make sure these patterns are consistent
across cross-sectional time points.
ggplot(dataset, aes(x = ratio_income_to_poverty_family, y = no_days_used_mj_in_last_month, color = gender)) +
geom_point() +
facet_wrap(year)

Yep, it looks like at each time point at which this data was
collected, similar patterns are seen: lower income:family size ratio is
related to self-report of more frequent use of marijuana during the past
month in males and females.
How about frequency of use? Let’s get some simple descriptives
grouped by gender.
dataset_by_gender <- group_by(dataset, RIAGENDR)
mean_mj_use_past_month_days <- summarise(dataset_by_gender, mean_mj_use_past_month_days = mean(nhanes_drug_use_csv.DUQ230, na.rm = TRUE))
mean_mj_use_past_month_days
dataset_by_gender <- group_by(dataset, RIAGENDR)
mean_age_1st_mj_use <- summarise(dataset_by_gender, mean_age_1stmj_use = mean(nhanes_drug_use_csv.DUQ210, na.rm = TRUE))
mean_age_1st_mj_use
dataset_by_gender <- group_by(dataset, RIAGENDR)
mean_age_reg_mj_use <- summarise(dataset_by_gender, mean_age_reg_mj_use = mean(nhanes_drug_use_csv.DUQ213, na.rm = TRUE))
mean_age_reg_mj_use
NA
Conclusion: On average, males self-reported they first used marijuana
at a younger age, started regular use at a younger age, and had higher
self-reported # days marijuana use during last month than that
self-reported by females.
dataset_by_gender <- group_by(dataset, RIAGENDR)
mean_mj_use_past_month_days <- summarise(dataset_by_gender, mean_mj_use_past_month_days = mean(dataset$nhanes_drug_use_csv.DUQ230, na.rm = TRUE))
mean_mj_use_past_month_days
svymean(~nhanes_drug_use_csv.DUQ230, na.rm = TRUE)
Error in .svycheck(design) :
argument "design" is missing, with no default
##Cocaine
###Number of days participants reported using cocaine in last month plotted against their current age
#how many research participants reported using cocaine at least one day in the last 30 days?
19997-19760
[1] 237
#n = 237 participants
ggplot(dataset, aes(x = age, y = no_days_used_coc_in_last_month)) +
geom_point()

Conclusion: Only 237 men and women reported using cocaine during the
past 30 days. The number of days cocaine was used is similar across all
ages.
Are there differences by gender?
ggplot(dataset, aes(x = age, y = no_days_used_coc_in_last_month, color = gender)) +
geom_point()

Conclusion: it looks like both males (1, red) and females (2, blue)
mostly tended to self-report using cocaine <15 days. Possibly males
self-reported using cocaine on more days than females.
Let’s look at the relationship between income:family size and cocaine
use self-reported during the last 30 days.
ggplot(dataset, aes(x = ratio_income_to_poverty_family, y = no_days_used_coc_in_last_month)) +
geom_point()

ggplot(dataset, aes(x = ratio_income_to_poverty_family, y = no_days_used_coc_in_last_month, color = gender)) +
geom_point()

ggplot(dataset, aes(x = ratio_income_to_poverty_family, y = no_days_used_coc_in_last_month, color = gender)) +
geom_point() +
facet_wrap(year)

The pattern is similar at each time point, although the sample is
small.
dataset_by_gender <- group_by(dataset, RIAGENDR)
mean_coc_use_past_month_days <- summarise(dataset_by_gender, mean_coc_use_past_month_days = mean(nhanes_drug_use_csv.DUQ280, na.rm = TRUE))
mean_coc_use_past_month_days
dataset_by_gender <- group_by(dataset, RIAGENDR)
mean_age_1st_coc_use <- summarise(dataset_by_gender, mean_age_1st_coc_use = mean(nhanes_drug_use_csv.DUQ260, na.rm = TRUE))
mean_age_1st_coc_use
NA
NA
##Heroin
###Number of days participants reported using heroin in last month plotted against their current age
#how many research participants reported using heroin at least one day in the last 30 days?
19997-19959
[1] 38
#n = 38 participants
ggplot(dataset, aes(x = age, y = no_days_used_her_in_last_month)) +
geom_point()

ggplot(dataset, aes(x = age, y = no_days_used_her_in_last_month, color = gender)) +
geom_point()

NA
NA
NA
ggplot(dataset, aes(x = ratio_income_to_poverty_family, y = no_days_used_her_in_last_month)) +
geom_point()

ggplot(dataset, aes(x = ratio_income_to_poverty_family, y = no_days_used_her_in_last_month, color = gender)) +
geom_point()

ggplot(dataset, aes(x = ratio_income_to_poverty_family, y = no_days_used_her_in_last_month, color = gender)) +
geom_point() +
facet_wrap(year)

dataset_by_gender <- group_by(dataset, RIAGENDR)
mean_her_use_past_month_days <- summarise(dataset_by_gender, mean_her_use_past_month_days = mean(nhanes_drug_use_csv.DUQ320, na.rm = TRUE))
mean_her_use_past_month_days
dataset_by_gender <- group_by(dataset, RIAGENDR)
mean_age_1st_her_use <- summarise(dataset_by_gender, mean_age_1st_her_use = mean(nhanes_drug_use_csv.DUQ300, na.rm = TRUE))
mean_age_1st_her_use
##Methamphetamine
###Number of days participants reported using methamphetamine in last month plotted against their current age
#how many research participants reported using methamphetamine at least one day in the last 30 days?
19997-19915
[1] 82
#n = 82 participants
ggplot(dataset, aes(x = age, y = no_days_used_meth_in_last_month)) +
geom_point()

ggplot(dataset, aes(x = age, y = no_days_used_meth_in_last_month, color = gender)) +
geom_point()

ggplot(dataset, aes(x = ratio_income_to_poverty_family, y = no_days_used_meth_in_last_month)) +
geom_point()

ggplot(dataset, aes(x = ratio_income_to_poverty_family, y = no_days_used_meth_in_last_month, color = gender)) +
geom_point()

ggplot(dataset, aes(x = ratio_income_to_poverty_family, y = no_days_used_meth_in_last_month, color = gender)) +
geom_point() +
facet_wrap(year)

dataset_by_gender <- group_by(dataset, RIAGENDR)
mean_meth_use_past_month_days <- summarise(dataset_by_gender, mean_meth_use_past_month_days = mean(nhanes_drug_use_csv.DUQ360, na.rm = TRUE))
mean_meth_use_past_month_days
dataset_by_gender <- group_by(dataset, RIAGENDR)
mean_age_1st_meth_use <- summarise(dataset_by_gender, mean_age_1st_meth_use = mean(nhanes_drug_use_csv.DUQ340, na.rm = TRUE))
mean_age_1st_meth_use
---
title: "R Notebook"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

```{r}
# for data wrangling; dplyr::select, mutate, select, recode
library(dplyr)

# for data visualization with various plots
library(ggplot2)

# for using survey weights; survey::svydesign, svymean, svyglm
# Install only when the package is missing: an unconditional
# install.packages() call re-downloads the package every time the notebook
# runs and fails when there is no network connection.
if (!requireNamespace("survey", quietly = TRUE)) {
  install.packages("survey")
}
library(survey)
```
```{r}
#import dataset csv
```

```{r}
# `use_this_July26_2023` is not created anywhere in this notebook; it has to
# exist in the session already (e.g. imported from the CSV beforehand).
# Check for it up front so a fresh session fails with an actionable message
# instead of a bare "object not found".
if (!exists("use_this_July26_2023")) {
  stop(
    "`use_this_July26_2023` is not loaded; import the dataset CSV before ",
    "running this notebook.",
    call. = FALSE
  )
}

# Work on the data under a shorter name.
dataset <- use_this_July26_2023

# Display the data for a quick visual check.
dataset

```
Number of research participants = 19,997

```{r}
# Name column variables for ease of use and understanding.
# Each alias is a standalone copy of one column of `dataset`; the aliases are
# not linked to the data frame, so they must be re-created if `dataset` is
# filtered or reordered later.

# Survey year, demographics and income
year                            <- dataset$Year
gender                          <- dataset$RIAGENDR
age                             <- dataset$RIDAGEYR
ratio_income_to_poverty_family  <- dataset$INDFMPIR

# Marijuana / hashish
mj_hashish_ever                 <- dataset$nhanes_drug_use_csv.DUQ200
age_1st_mj                      <- dataset$nhanes_drug_use_csv.DUQ210
age_start_reg_use_mj            <- dataset$nhanes_drug_use_csv.DUQ213
no_days_used_mj_in_last_month   <- dataset$nhanes_drug_use_csv.DUQ230

# Cocaine / heroin / methamphetamine (any), then cocaine
coc_her_meth_ever               <- dataset$nhanes_drug_use_csv.DUQ240
coc_ever                        <- dataset$nhanes_drug_use_csv.DUQ250
age_1st_coc                     <- dataset$nhanes_drug_use_csv.DUQ260
no_days_used_coc_in_last_month  <- dataset$nhanes_drug_use_csv.DUQ280

# Heroin
heroin_ever                     <- dataset$nhanes_drug_use_csv.DUQ290
age_1st_her                     <- dataset$nhanes_drug_use_csv.DUQ300
no_days_used_her_in_last_month  <- dataset$nhanes_drug_use_csv.DUQ320

# Methamphetamine
meth_ever                       <- dataset$nhanes_drug_use_csv.DUQ330
age_1st_meth                    <- dataset$nhanes_drug_use_csv.DUQ340
no_days_used_meth_in_last_month <- dataset$nhanes_drug_use_csv.DUQ360

# Needle use
needle_use_ever                 <- dataset$nhanes_drug_use_csv.DUQ370
```

```{r}
# Survey weights

# Use "svydesign" to assign NHANES-recommended weights. We will use this new
# design variable "dataset_weighted" when obtaining descriptive statistics.
#
# The cluster, stratum and weight variables are given as one-sided formulas so
# they are looked up in `data` and recorded by name in the design object.
# `ids` is spelled out in full instead of relying on partial matching of `id`.
# nest = TRUE is kept from the original call.
# NOTE(review): WTINT2YR is the interview weight -- confirm it is the intended
# weight for the drug-use (DUQ) items, and whether the weights need rescaling
# when several survey cycles are pooled.
dataset_weighted <- svydesign(
  ids = ~SDMVPSU,
  strata = ~SDMVSTRA,
  weights = ~WTINT2YR,
  nest = TRUE,
  data = dataset
)

# Print a short summary of the design.
dataset_weighted
```


```{r}
# Scatterplots ----

## Marijuana

### Days of marijuana use in the last month, plotted against current age

# How many research participants reported using marijuana on at least one day
# in the last 30 days?
19997 - 17836
# 2161 participants

# Each point is one participant: age on x, days of use on y.
ggplot(
  data = dataset,
  mapping = aes(x = age, y = no_days_used_mj_in_last_month)
) +
  geom_point()
```
Conclusion: There is not a clear relationship between current age and number of days marijuana was used during the past month. Some 50-60 year olds used marijuana nearly every day while others used marijuana less frequently. This is true of younger participants as well. But what about by gender?

```{r}
# Same age-vs-days scatterplot, with points coloured by gender.
ggplot(
  data = dataset,
  mapping = aes(
    x = age,
    y = no_days_used_mj_in_last_month,
    color = gender
  )
) +
  geom_point()
```
Hmm, there is still no evidence of a relationship between age and number of days marijuana was used that varies by gender (1, red = male; 2, blue = female).

Maybe, instead of age, the ratio of family income to poverty may be more interesting to explore. I will substitute this ratio for age as the x-axis.
```{r}
# Days of marijuana use in the last month against the family
# income-to-poverty ratio.
ggplot(
  data = dataset,
  mapping = aes(
    x = ratio_income_to_poverty_family,
    y = no_days_used_mj_in_last_month
  )
) +
  geom_point()
```
n = 19997-17997 = 2000 participants who had these data.
Conclusion: In this scatter plot, one can see that individuals with a smaller ratio of family income to poverty tended to report a greater number of days on which marijuana was used during the last month. This figure presents more interesting information compared to the previous scatterplot (age vs. # days marijuana used during last month).

Does the relationship between the income-to-poverty ratio and # days marijuana used during last month vary by gender?
```{r}
# Income-to-poverty ratio vs. days of marijuana use, coloured by gender.
ggplot(
  data = dataset,
  mapping = aes(
    x = ratio_income_to_poverty_family,
    y = no_days_used_mj_in_last_month,
    color = gender
  )
) +
  geom_point()
```
It looks like the relationship between the income-to-poverty ratio and # days marijuana used during last month is similar across males (1, red) and females (2, blue). Let's make sure these patterns are consistent across cross-sectional time points.
```{r}
# Income-to-poverty ratio vs. days of marijuana use, coloured by gender, with
# one panel per value of `Year`.
# facet_wrap() expects a faceting specification (vars(), a formula, or column
# names), not a vector of data values, so facet on the `Year` column of
# `dataset` instead of the extracted `year` vector.
ggplot(
  dataset,
  aes(
    x = ratio_income_to_poverty_family,
    y = no_days_used_mj_in_last_month,
    color = gender
  )
) +
  geom_point() +
  facet_wrap(vars(Year))
```
Yep, it looks like at each time point at which these data were collected, similar patterns are seen: a lower income-to-poverty ratio is related to self-report of more frequent use of marijuana during the past month in males and females.

How about frequency of use? Let's get some simple descriptives grouped by gender.

```{r}


# Unweighted per-gender means of the marijuana items (missing values dropped).
# The data are grouped once and the grouped frame is reused for all three
# summaries; regrouping before each summary was redundant.
dataset_by_gender <- group_by(dataset, RIAGENDR)

# Mean number of days marijuana was used in the past month (DUQ230).
mean_mj_use_past_month_days <- summarise(
  dataset_by_gender,
  mean_mj_use_past_month_days = mean(nhanes_drug_use_csv.DUQ230, na.rm = TRUE)
)
mean_mj_use_past_month_days

# Mean age at first marijuana use (DUQ210). The result column is now named
# `mean_age_1st_mj_use`, matching the object name and the sibling summaries
# (it was previously `mean_age_1stmj_use`).
mean_age_1st_mj_use <- summarise(
  dataset_by_gender,
  mean_age_1st_mj_use = mean(nhanes_drug_use_csv.DUQ210, na.rm = TRUE)
)
mean_age_1st_mj_use

# Mean age at which regular marijuana use started (DUQ213).
mean_age_reg_mj_use <- summarise(
  dataset_by_gender,
  mean_age_reg_mj_use = mean(nhanes_drug_use_csv.DUQ213, na.rm = TRUE)
)
mean_age_reg_mj_use

```
Conclusion: On average, males self-reported they first used marijuana at a younger age, started regular use at a younger age, and had higher self-reported # days marijuana use during last month than that self-reported by females.
```{r}
# Mean number of days marijuana was used in the past month, by gender.
# Inside summarise() the column is referenced by its bare name so the mean is
# computed within each gender group. Writing `dataset$...` here bypasses the
# grouping and returns the same whole-sample mean for every group.
dataset_by_gender <- group_by(dataset, RIAGENDR)
mean_mj_use_past_month_days <- summarise(
  dataset_by_gender,
  mean_mj_use_past_month_days = mean(nhanes_drug_use_csv.DUQ230, na.rm = TRUE)
)
mean_mj_use_past_month_days
```
```{r}
# Survey-weighted mean of the number of days marijuana was used in the last
# month. svymean() needs the survey design object; without it the call stops
# with 'argument "design" is missing, with no default'.
svymean(~nhanes_drug_use_csv.DUQ230, design = dataset_weighted, na.rm = TRUE)


```

```{r}
## Cocaine

### Days of cocaine use in the last month, plotted against current age

# How many research participants reported using cocaine on at least one day
# in the last 30 days?
19997 - 19760
# n = 237 participants

# Each point is one participant: age on x, days of use on y.
ggplot(
  data = dataset,
  mapping = aes(x = age, y = no_days_used_coc_in_last_month)
) +
  geom_point()
```
Conclusion: Only 237 men and women reported using cocaine during the past 30 days. The number of days cocaine was used is similar across all ages.

Are there differences by gender?

```{r}

# Age vs. days of cocaine use in the last month, coloured by gender.
ggplot(
  data = dataset,
  mapping = aes(
    x = age,
    y = no_days_used_coc_in_last_month,
    color = gender
  )
) +
  geom_point()
```
Conclusion: it looks like both males (1, red) and females (2, blue) mostly tended to self-report using cocaine <15 days. Possibly males self-reported using cocaine on more days than females. 

Let's look at the relationship between the income-to-poverty ratio and cocaine use self-reported during the last 30 days.

```{r}
# Income-to-poverty ratio vs. days of cocaine use in the last month.
ggplot(
  data = dataset,
  mapping = aes(
    x = ratio_income_to_poverty_family,
    y = no_days_used_coc_in_last_month
  )
) +
  geom_point()

# The same plot with points coloured by gender.
ggplot(
  data = dataset,
  mapping = aes(
    x = ratio_income_to_poverty_family,
    y = no_days_used_coc_in_last_month,
    color = gender
  )
) +
  geom_point()
```
```{r}
# Income-to-poverty ratio vs. days of cocaine use, coloured by gender, with
# one panel per value of `Year`.
# facet_wrap() expects a faceting specification (vars(), a formula, or column
# names), not a vector of data values, so facet on the `Year` column of
# `dataset` instead of the extracted `year` vector.
ggplot(
  dataset,
  aes(
    x = ratio_income_to_poverty_family,
    y = no_days_used_coc_in_last_month,
    color = gender
  )
) +
  geom_point() +
  facet_wrap(vars(Year))
```
The pattern is similar at each time point, although the sample is small.

```{r}
# Unweighted cocaine descriptives by gender (missing values dropped).
dataset_by_gender <- dataset %>% group_by(RIAGENDR)

# Mean number of days cocaine was used in the past month (DUQ280).
mean_coc_use_past_month_days <- dataset_by_gender %>%
  summarise(
    mean_coc_use_past_month_days = mean(nhanes_drug_use_csv.DUQ280, na.rm = TRUE)
  )
mean_coc_use_past_month_days

# Mean age at first cocaine use (DUQ260).
mean_age_1st_coc_use <- dataset_by_gender %>%
  summarise(
    mean_age_1st_coc_use = mean(nhanes_drug_use_csv.DUQ260, na.rm = TRUE)
  )
mean_age_1st_coc_use


```

```{r}
## Heroin

### Days of heroin use in the last month, plotted against current age

# How many research participants reported using heroin on at least one day
# in the last 30 days?
19997 - 19959
# n = 38 participants

# Each point is one participant: age on x, days of use on y.
ggplot(
  data = dataset,
  mapping = aes(x = age, y = no_days_used_her_in_last_month)
) +
  geom_point()

# The same plot with points coloured by gender.
ggplot(
  data = dataset,
  mapping = aes(
    x = age,
    y = no_days_used_her_in_last_month,
    color = gender
  )
) +
  geom_point()



```
```{r}
# Income-to-poverty ratio vs. days of heroin use in the last month.
ggplot(
  dataset,
  aes(x = ratio_income_to_poverty_family, y = no_days_used_her_in_last_month)
) +
  geom_point()

# The same plot with points coloured by gender.
ggplot(
  dataset,
  aes(
    x = ratio_income_to_poverty_family,
    y = no_days_used_her_in_last_month,
    color = gender
  )
) +
  geom_point()

# Coloured by gender, one panel per value of `Year`.
# facet_wrap() expects a faceting specification (vars(), a formula, or column
# names), not a vector of data values, so facet on the `Year` column of
# `dataset` instead of the extracted `year` vector.
ggplot(
  dataset,
  aes(
    x = ratio_income_to_poverty_family,
    y = no_days_used_her_in_last_month,
    color = gender
  )
) +
  geom_point() +
  facet_wrap(vars(Year))

```

```{r}
# Unweighted heroin descriptives by gender (missing values dropped).
dataset_by_gender <- dataset %>% group_by(RIAGENDR)

# Mean number of days heroin was used in the past month (DUQ320).
mean_her_use_past_month_days <- dataset_by_gender %>%
  summarise(
    mean_her_use_past_month_days = mean(nhanes_drug_use_csv.DUQ320, na.rm = TRUE)
  )
mean_her_use_past_month_days

# Mean age at first heroin use (DUQ300).
mean_age_1st_her_use <- dataset_by_gender %>%
  summarise(
    mean_age_1st_her_use = mean(nhanes_drug_use_csv.DUQ300, na.rm = TRUE)
  )
mean_age_1st_her_use
```
```{r}

## Methamphetamine

### Days of methamphetamine use in the last month, plotted against current age

# How many research participants reported using methamphetamine on at least
# one day in the last 30 days?
19997 - 19915
# n = 82 participants

# Each point is one participant: age on x, days of use on y.
ggplot(
  data = dataset,
  mapping = aes(x = age, y = no_days_used_meth_in_last_month)
) +
  geom_point()

# The same plot with points coloured by gender.
ggplot(
  data = dataset,
  mapping = aes(
    x = age,
    y = no_days_used_meth_in_last_month,
    color = gender
  )
) +
  geom_point()
```
```{r}

# Income-to-poverty ratio vs. days of methamphetamine use in the last month.
ggplot(
  dataset,
  aes(x = ratio_income_to_poverty_family, y = no_days_used_meth_in_last_month)
) +
  geom_point()

# The same plot with points coloured by gender.
ggplot(
  dataset,
  aes(
    x = ratio_income_to_poverty_family,
    y = no_days_used_meth_in_last_month,
    color = gender
  )
) +
  geom_point()

# Coloured by gender, one panel per value of `Year`.
# facet_wrap() expects a faceting specification (vars(), a formula, or column
# names), not a vector of data values, so facet on the `Year` column of
# `dataset` instead of the extracted `year` vector.
ggplot(
  dataset,
  aes(
    x = ratio_income_to_poverty_family,
    y = no_days_used_meth_in_last_month,
    color = gender
  )
) +
  geom_point() +
  facet_wrap(vars(Year))

```

```{r}
# Unweighted methamphetamine descriptives by gender (missing values dropped).
dataset_by_gender <- dataset %>% group_by(RIAGENDR)

# Mean number of days methamphetamine was used in the past month (DUQ360).
mean_meth_use_past_month_days <- dataset_by_gender %>%
  summarise(
    mean_meth_use_past_month_days = mean(nhanes_drug_use_csv.DUQ360, na.rm = TRUE)
  )
mean_meth_use_past_month_days

# Mean age at first methamphetamine use (DUQ340).
mean_age_1st_meth_use <- dataset_by_gender %>%
  summarise(
    mean_age_1st_meth_use = mean(nhanes_drug_use_csv.DUQ340, na.rm = TRUE)
  )
mean_age_1st_meth_use
```

