This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
You can also embed plots, for example:
## Title: How Baby Boomers Get High
### This dataset covers the story behind “How Baby Boomers Get High” and it covers 13 drugs across 17 age groups.
### It holds the percentage of people in each age group who used each of these 13 drugs in the last 12 months.
### It holds the median number of times a user in each age group used each of these drugs in the last 12 months.
### Link to access article: https://fivethirtyeight.com/features/how-baby-boomers-get-high/
# Load Libraries:
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
# Read the drug-use-by-age CSV directly from GitHub into a tibble.
drug_df <- read_csv("https://raw.githubusercontent.com/uzmabb182/CUNY-SPS-Assignments/main/data_607/week1/drug-use-by-age.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## age = col_character(),
## `cocaine-frequency` = col_character(),
## `crack-frequency` = col_character(),
## `heroin-frequency` = col_character(),
## `inhalant-frequency` = col_character(),
## `oxycontin-frequency` = col_character(),
## `meth-frequency` = col_character()
## )
## i Use `spec()` for the full column specifications.
# Print the freshly loaded tibble to inspect its rows and column types.
drug_df
## # A tibble: 17 x 28
## age n `alcohol-use` `alcohol-frequen~ `marijuana-use` `marijuana-frequ~
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 2798 3.9 3 1.1 4
## 2 13 2757 8.5 6 3.4 15
## 3 14 2792 18.1 5 8.7 24
## 4 15 2956 29.2 6 14.5 25
## 5 16 3058 40.1 10 22.5 30
## 6 17 3038 49.3 13 28 36
## 7 18 2469 58.7 24 33.7 52
## 8 19 2223 64.6 36 33.4 60
## 9 20 2271 69.7 48 34 60
## 10 21 2354 83.2 52 33 52
## 11 22-23 4707 84.2 52 28.4 52
## 12 24-25 4591 83.1 52 24.9 60
## 13 26-29 2628 80.7 52 20.8 52
## 14 30-34 2864 77.5 52 16.4 72
## 15 35-49 7391 75 52 10.4 48
## 16 50-64 3923 67.2 52 7.3 52
## 17 65+ 2448 49.3 52 1.2 36
## # ... with 22 more variables: cocaine-use <dbl>, cocaine-frequency <chr>,
## # crack-use <dbl>, crack-frequency <chr>, heroin-use <dbl>,
## # heroin-frequency <chr>, hallucinogen-use <dbl>,
## # hallucinogen-frequency <dbl>, inhalant-use <dbl>, inhalant-frequency <chr>,
## # pain-releiver-use <dbl>, pain-releiver-frequency <dbl>,
## # oxycontin-use <dbl>, oxycontin-frequency <chr>, tranquilizer-use <dbl>,
## # tranquilizer-frequency <dbl>, stimulant-use <dbl>,
## # stimulant-frequency <dbl>, meth-use <dbl>, meth-frequency <chr>,
## # sedative-use <dbl>, sedative-frequency <dbl>
# Swap every hyphen in the column names for an underscore so the columns can
# be referenced without backticks.
colnames(drug_df) <- gsub("-", "_", colnames(drug_df), fixed = TRUE)
drug_df
## # A tibble: 17 x 28
## age n alcohol_use alcohol_frequency marijuana_use marijuana_frequency
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 2798 3.9 3 1.1 4
## 2 13 2757 8.5 6 3.4 15
## 3 14 2792 18.1 5 8.7 24
## 4 15 2956 29.2 6 14.5 25
## 5 16 3058 40.1 10 22.5 30
## 6 17 3038 49.3 13 28 36
## 7 18 2469 58.7 24 33.7 52
## 8 19 2223 64.6 36 33.4 60
## 9 20 2271 69.7 48 34 60
## 10 21 2354 83.2 52 33 52
## 11 22-23 4707 84.2 52 28.4 52
## 12 24-25 4591 83.1 52 24.9 60
## 13 26-29 2628 80.7 52 20.8 52
## 14 30-34 2864 77.5 52 16.4 72
## 15 35-49 7391 75 52 10.4 48
## 16 50-64 3923 67.2 52 7.3 52
## 17 65+ 2448 49.3 52 1.2 36
## # ... with 22 more variables: cocaine_use <dbl>, cocaine_frequency <chr>,
## # crack_use <dbl>, crack_frequency <chr>, heroin_use <dbl>,
## # heroin_frequency <chr>, hallucinogen_use <dbl>,
## # hallucinogen_frequency <dbl>, inhalant_use <dbl>, inhalant_frequency <chr>,
## # pain_releiver_use <dbl>, pain_releiver_frequency <dbl>,
## # oxycontin_use <dbl>, oxycontin_frequency <chr>, tranquilizer_use <dbl>,
## # tranquilizer_frequency <dbl>, stimulant_use <dbl>,
## # stimulant_frequency <dbl>, meth_use <dbl>, meth_frequency <chr>,
## # sedative_use <dbl>, sedative_frequency <dbl>
# List the column names to confirm the hyphens were replaced.
colnames(drug_df)
## [1] "age" "n"
## [3] "alcohol_use" "alcohol_frequency"
## [5] "marijuana_use" "marijuana_frequency"
## [7] "cocaine_use" "cocaine_frequency"
## [9] "crack_use" "crack_frequency"
## [11] "heroin_use" "heroin_frequency"
## [13] "hallucinogen_use" "hallucinogen_frequency"
## [15] "inhalant_use" "inhalant_frequency"
## [17] "pain_releiver_use" "pain_releiver_frequency"
## [19] "oxycontin_use" "oxycontin_frequency"
## [21] "tranquilizer_use" "tranquilizer_frequency"
## [23] "stimulant_use" "stimulant_frequency"
## [25] "meth_use" "meth_frequency"
## [27] "sedative_use" "sedative_frequency"
# Give the respondent-count column a descriptive name. The column is matched
# by its current name ("n") rather than by position, so the rename cannot hit
# the wrong column if the column order of the CSV ever changes.
names(drug_df)[names(drug_df) == "n"] <- "sample_size"
# Show the structure (column names, types and first values) after renaming.
str(drug_df)
## spec_tbl_df [17 x 28] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : chr [1:17] "12" "13" "14" "15" ...
## $ sample_size : num [1:17] 2798 2757 2792 2956 3058 ...
## $ alcohol_use : num [1:17] 3.9 8.5 18.1 29.2 40.1 49.3 58.7 64.6 69.7 83.2 ...
## $ alcohol_frequency : num [1:17] 3 6 5 6 10 13 24 36 48 52 ...
## $ marijuana_use : num [1:17] 1.1 3.4 8.7 14.5 22.5 28 33.7 33.4 34 33 ...
## $ marijuana_frequency : num [1:17] 4 15 24 25 30 36 52 60 60 52 ...
## $ cocaine_use : num [1:17] 0.1 0.1 0.1 0.5 1 2 3.2 4.1 4.9 4.8 ...
## $ cocaine_frequency : chr [1:17] "5.0" "1.0" "5.5" "4.0" ...
## $ crack_use : num [1:17] 0 0 0 0.1 0 0.1 0.4 0.5 0.6 0.5 ...
## $ crack_frequency : chr [1:17] "-" "3.0" "-" "9.5" ...
## $ heroin_use : num [1:17] 0.1 0 0.1 0.2 0.1 0.1 0.4 0.5 0.9 0.6 ...
## $ heroin_frequency : chr [1:17] "35.5" "-" "2.0" "1.0" ...
## $ hallucinogen_use : num [1:17] 0.2 0.6 1.6 2.1 3.4 4.8 7 8.6 7.4 6.3 ...
## $ hallucinogen_frequency : num [1:17] 52 6 3 4 3 3 4 3 2 4 ...
## $ inhalant_use : num [1:17] 1.6 2.5 2.6 2.5 3 2 1.8 1.4 1.5 1.4 ...
## $ inhalant_frequency : chr [1:17] "19.0" "12.0" "5.0" "5.5" ...
## $ pain_releiver_use : num [1:17] 2 2.4 3.9 5.5 6.2 8.5 9.2 9.4 10 9 ...
## $ pain_releiver_frequency: num [1:17] 36 14 12 10 7 9 12 12 10 15 ...
## $ oxycontin_use : num [1:17] 0.1 0.1 0.4 0.8 1.1 1.4 1.7 1.5 1.7 1.3 ...
## $ oxycontin_frequency : chr [1:17] "24.5" "41.0" "4.5" "3.0" ...
## $ tranquilizer_use : num [1:17] 0.2 0.3 0.9 2 2.4 3.5 4.9 4.2 5.4 3.9 ...
## $ tranquilizer_frequency : num [1:17] 52 25.5 5 4.5 11 7 12 4.5 10 7 ...
## $ stimulant_use : num [1:17] 0.2 0.3 0.8 1.5 1.8 2.8 3 3.3 4 4.1 ...
## $ stimulant_frequency : num [1:17] 2 4 12 6 9.5 9 8 6 12 10 ...
## $ meth_use : num [1:17] 0 0.1 0.1 0.3 0.3 0.6 0.5 0.4 0.9 0.6 ...
## $ meth_frequency : chr [1:17] "-" "5.0" "24.0" "10.5" ...
## $ sedative_use : num [1:17] 0.2 0.1 0.2 0.4 0.2 0.5 0.4 0.3 0.5 0.3 ...
## $ sedative_frequency : num [1:17] 13 19 16.5 30 3 6.5 10 6 4 9 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_character(),
## .. n = col_double(),
## .. `alcohol-use` = col_double(),
## .. `alcohol-frequency` = col_double(),
## .. `marijuana-use` = col_double(),
## .. `marijuana-frequency` = col_double(),
## .. `cocaine-use` = col_double(),
## .. `cocaine-frequency` = col_character(),
## .. `crack-use` = col_double(),
## .. `crack-frequency` = col_character(),
## .. `heroin-use` = col_double(),
## .. `heroin-frequency` = col_character(),
## .. `hallucinogen-use` = col_double(),
## .. `hallucinogen-frequency` = col_double(),
## .. `inhalant-use` = col_double(),
## .. `inhalant-frequency` = col_character(),
## .. `pain-releiver-use` = col_double(),
## .. `pain-releiver-frequency` = col_double(),
## .. `oxycontin-use` = col_double(),
## .. `oxycontin-frequency` = col_character(),
## .. `tranquilizer-use` = col_double(),
## .. `tranquilizer-frequency` = col_double(),
## .. `stimulant-use` = col_double(),
## .. `stimulant-frequency` = col_double(),
## .. `meth-use` = col_double(),
## .. `meth-frequency` = col_character(),
## .. `sedative-use` = col_double(),
## .. `sedative-frequency` = col_double()
## .. )
# Clean the frequency columns that were read as text because they contain "-".
# In the rows shown above, "-" appears where the matching *_use value is 0, so
# there is no median frequency to report. Those cells become NA (not 0, which
# would be counted as a real value in summaries) and the columns are converted
# to numeric. Columns are selected by type instead of by the position range
# 7:27, so the step keeps working if the column order changes; `age` is
# excluded because it holds range labels such as "22-23".
drug_df <- drug_df %>%
  mutate(across(where(is.character) & !age, ~ as.numeric(na_if(.x, "-"))))
drug_df
## # A tibble: 17 x 28
## age sample_size alcohol_use alcohol_frequen~ marijuana_use marijuana_frequ~
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 2798 3.9 3 1.1 4
## 2 13 2757 8.5 6 3.4 15
## 3 14 2792 18.1 5 8.7 24
## 4 15 2956 29.2 6 14.5 25
## 5 16 3058 40.1 10 22.5 30
## 6 17 3038 49.3 13 28 36
## 7 18 2469 58.7 24 33.7 52
## 8 19 2223 64.6 36 33.4 60
## 9 20 2271 69.7 48 34 60
## 10 21 2354 83.2 52 33 52
## 11 22-23 4707 84.2 52 28.4 52
## 12 24-25 4591 83.1 52 24.9 60
## 13 26-29 2628 80.7 52 20.8 52
## 14 30-34 2864 77.5 52 16.4 72
## 15 35-49 7391 75 52 10.4 48
## 16 50-64 3923 67.2 52 7.3 52
## 17 65+ 2448 49.3 52 1.2 36
## # ... with 22 more variables: cocaine_use <dbl>, cocaine_frequency <chr>,
## # crack_use <dbl>, crack_frequency <chr>, heroin_use <dbl>,
## # heroin_frequency <chr>, hallucinogen_use <dbl>,
## # hallucinogen_frequency <dbl>, inhalant_use <dbl>, inhalant_frequency <chr>,
## # pain_releiver_use <dbl>, pain_releiver_frequency <dbl>,
## # oxycontin_use <dbl>, oxycontin_frequency <chr>, tranquilizer_use <dbl>,
## # tranquilizer_frequency <dbl>, stimulant_use <dbl>,
## # stimulant_frequency <dbl>, meth_use <dbl>, meth_frequency <chr>,
## # sedative_use <dbl>, sedative_frequency <dbl>
# Show the first six rows of the cleaned tibble.
head(drug_df)
## # A tibble: 6 x 28
## age sample_size alcohol_use alcohol_frequency marijuana_use marijuana_frequ~
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12 2798 3.9 3 1.1 4
## 2 13 2757 8.5 6 3.4 15
## 3 14 2792 18.1 5 8.7 24
## 4 15 2956 29.2 6 14.5 25
## 5 16 3058 40.1 10 22.5 30
## 6 17 3038 49.3 13 28 36
## # ... with 22 more variables: cocaine_use <dbl>, cocaine_frequency <chr>,
## # crack_use <dbl>, crack_frequency <chr>, heroin_use <dbl>,
## # heroin_frequency <chr>, hallucinogen_use <dbl>,
## # hallucinogen_frequency <dbl>, inhalant_use <dbl>, inhalant_frequency <chr>,
## # pain_releiver_use <dbl>, pain_releiver_frequency <dbl>,
## # oxycontin_use <dbl>, oxycontin_frequency <chr>, tranquilizer_use <dbl>,
## # tranquilizer_frequency <dbl>, stimulant_use <dbl>,
## # stimulant_frequency <dbl>, meth_use <dbl>, meth_frequency <chr>,
## # sedative_use <dbl>, sedative_frequency <dbl>
# Age groups in which at least 50 percent of respondents used alcohol,
# reduced to the age, sample size and alcohol columns (dplyr verbs).
high_alcohol_use <- drug_df %>%
  filter(alcohol_use >= 50) %>%
  select(age, sample_size, alcohol_use, alcohol_frequency)
high_alcohol_use
## # A tibble: 10 x 4
## age sample_size alcohol_use alcohol_frequency
## <chr> <dbl> <dbl> <dbl>
## 1 18 2469 58.7 24
## 2 19 2223 64.6 36
## 3 20 2271 69.7 48
## 4 21 2354 83.2 52
## 5 22-23 4707 84.2 52
## 6 24-25 4591 83.1 52
## 7 26-29 2628 80.7 52
## 8 30-34 2864 77.5 52
## 9 35-49 7391 75 52
## 10 50-64 3923 67.2 52
# Summary statistics for each column of the high-alcohol-use subset.
summary(high_alcohol_use)
## age sample_size alcohol_use alcohol_frequency
## Length:10 Min. :2223 Min. :58.70 Min. :24.0
## Class :character 1st Qu.:2383 1st Qu.:67.83 1st Qu.:49.0
## Mode :character Median :2746 Median :76.25 Median :52.0
## Mean :3542 Mean :74.39 Mean :47.2
## 3rd Qu.:4424 3rd Qu.:82.50 3rd Qu.:52.0
## Max. :7391 Max. :84.20 Max. :52.0
# Horizontal bar chart of alcohol use per age group for the high-use subset;
# bar length and fill shade both encode alcohol_use.
ggplot(high_alcohol_use, aes(age, alcohol_use, fill = alcohol_use)) +
  geom_col() +
  coord_flip() +
  theme_classic()
# Scatter plot of marijuana use by age group with a linear trend line.
# Every layer is chained with `+`; without it, geom_smooth() is evaluated as a
# separate statement and printed to the console instead of being drawn.
# `age` is a character column, so ggplot2 would otherwise put each age group in
# its own group and have nothing to fit; group = 1 fits one line across all
# age groups. The colour mapping is applied to the points only, since a single
# trend line cannot carry a per-point colour.
ggplot(data = drug_df, aes(x = age, y = marijuana_use)) +
  geom_point(aes(color = marijuana_use)) +
  geom_smooth(aes(group = 1), method = lm) + # add linear trend line
  labs(title = "Percentage of Marijuana Use by Age in past 12 Months")
## geom_smooth: na.rm = FALSE, orientation = NA, se = TRUE
## stat_smooth: na.rm = FALSE, orientation = NA, se = TRUE, method = function (formula, data, subset, weights, na.action, method = "qr", model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, contrasts = NULL, offset, ...)
## {
## ret.x <- x
## ret.y <- y
## cl <- match.call()
## mf <- match.call(expand.dots = FALSE)
## m <- match(c("formula", "data", "subset", "weights", "na.action", "offset"), names(mf), 0)
## mf <- mf[c(1, m)]
## mf$drop.unused.levels <- TRUE
## mf[[1]] <- quote(stats::model.frame)
## mf <- eval(mf, parent.frame())
## if (method == "model.frame")
## return(mf)
## else if (method != "qr")
## warning(gettextf("method = '%s' is not supported. Using 'qr'", method), domain = NA)
## mt <- attr(mf, "terms")
## y <- model.response(mf, "numeric")
## w <- as.vector(model.weights(mf))
## if (!is.null(w) && !is.numeric(w))
## stop("'weights' must be a numeric vector")
## offset <- model.offset(mf)
## mlm <- is.matrix(y)
## ny <- if (mlm)
## nrow(y)
## else length(y)
## if (!is.null(offset)) {
## if (!mlm)
## offset <- as.vector(offset)
## if (NROW(offset) != ny)
## stop(gettextf("number of offsets is %d, should equal %d (number of observations)", NROW(offset), ny), domain = NA)
## }
## if (is.empty.model(mt)) {
## x <- NULL
## z <- list(coefficients = if (mlm) matrix(NA, 0, ncol(y)) else numeric(), residuals = y, fitted.values = 0 * y, weights = w, rank = 0, df.residual = if (!is.null(w)) sum(w != 0) else ny)
## if (!is.null(offset)) {
## z$fitted.values <- offset
## z$residuals <- y - offset
## }
## }
## else {
## x <- model.matrix(mt, mf, contrasts)
## z <- if (is.null(w))
## lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...)
## else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok, ...)
## }
## class(z) <- c(if (mlm) "mlm", "lm")
## z$na.action <- attr(mf, "na.action")
## z$offset <- offset
## z$contrasts <- attr(x, "contrasts")
## z$xlevels <- .getXlevels(mt, mf)
## z$call <- cl
## z$terms <- mt
## if (model)
## z$model <- mf
## if (ret.x)
## z$x <- x
## if (ret.y)
## z$y <- y
## if (!qr)
## z$qr <- NULL
## z
## }
## position_identity
# Using the subset function: keep age groups starting at 20 and only the
# heroin columns.
# `age` is text (e.g. "22-23", "65+"), so `age >= 20` would compare strings
# character by character and only works by accident for this data. Strip
# everything from the first non-digit onward and compare the leading number.
heroin_data <- subset(drug_df, as.numeric(sub("[^0-9].*$", "", age)) >= 20,
                      select = c(age, heroin_use, heroin_frequency))
heroin_data
## # A tibble: 9 x 3
## age heroin_use heroin_frequency
## <chr> <dbl> <chr>
## 1 20 0.9 45.0
## 2 21 0.6 30.0
## 3 22-23 1.1 57.5
## 4 24-25 0.7 88.0
## 5 26-29 0.6 50.0
## 6 30-34 0.4 66.0
## 7 35-49 0.1 280.0
## 8 50-64 0.1 41.0
## 9 65+ 0 120.0
# Scatter plot of heroin use by age group with a linear trend line.
# `age` is a character column, so each age group would form its own one-point
# group and geom_smooth() would draw nothing; group = 1 fits a single line
# across all age groups. Colour is mapped on the points only.
ggplot(data = heroin_data, aes(x = age, y = heroin_use)) +
  geom_point(aes(color = heroin_use)) +
  geom_smooth(aes(group = 1), method = lm) # add linear trend line
## `geom_smooth()` using formula 'y ~ x'
# Horizontal bar chart of cocaine use for every age group; bar length and
# fill shade both encode cocaine_use.
ggplot(drug_df, aes(age, cocaine_use, fill = cocaine_use)) +
  geom_col() +
  coord_flip() +
  theme_classic()
# Conclusion Analysis:
# Per-column summary of the full data set (quantiles and mean for numeric
# columns, length/class/mode for character columns).
summary(drug_df)
## age sample_size alcohol_use alcohol_frequency
## Length:17 Min. :2223 Min. : 3.90 Min. : 3.00
## Class :character 1st Qu.:2469 1st Qu.:40.10 1st Qu.:10.00
## Mode :character Median :2798 Median :64.60 Median :48.00
## Mean :3251 Mean :55.43 Mean :33.35
## 3rd Qu.:3058 3rd Qu.:77.50 3rd Qu.:52.00
## Max. :7391 Max. :84.20 Max. :52.00
## marijuana_use marijuana_frequency cocaine_use cocaine_frequency
## Min. : 1.10 Min. : 4.00 Min. :0.000 Length:17
## 1st Qu.: 8.70 1st Qu.:30.00 1st Qu.:0.500 Class :character
## Median :20.80 Median :52.00 Median :2.000 Mode :character
## Mean :18.92 Mean :42.94 Mean :2.176
## 3rd Qu.:28.40 3rd Qu.:52.00 3rd Qu.:4.000
## Max. :34.00 Max. :72.00 Max. :4.900
## crack_use crack_frequency heroin_use heroin_frequency
## Min. :0.0000 Length:17 Min. :0.0000 Length:17
## 1st Qu.:0.0000 Class :character 1st Qu.:0.1000 Class :character
## Median :0.4000 Mode :character Median :0.2000 Mode :character
## Mean :0.2941 Mean :0.3529
## 3rd Qu.:0.5000 3rd Qu.:0.6000
## Max. :0.6000 Max. :1.1000
## hallucinogen_use hallucinogen_frequency inhalant_use inhalant_frequency
## Min. :0.100 Min. : 2.000 Min. :0.000 Length:17
## 1st Qu.:0.600 1st Qu.: 3.000 1st Qu.:0.600 Class :character
## Median :3.200 Median : 3.000 Median :1.400 Mode :character
## Mean :3.394 Mean : 8.412 Mean :1.388
## 3rd Qu.:5.200 3rd Qu.: 4.000 3rd Qu.:2.000
## Max. :8.600 Max. :52.000 Max. :3.000
## pain_releiver_use pain_releiver_frequency oxycontin_use oxycontin_frequency
## Min. : 0.600 Min. : 7.00 Min. :0.0000 Length:17
## 1st Qu.: 3.900 1st Qu.:12.00 1st Qu.:0.4000 Class :character
## Median : 6.200 Median :12.00 Median :1.1000 Mode :character
## Mean : 6.271 Mean :14.71 Mean :0.9353
## 3rd Qu.: 9.000 3rd Qu.:15.00 3rd Qu.:1.4000
## Max. :10.000 Max. :36.00 Max. :1.7000
## tranquilizer_use tranquilizer_frequency stimulant_use stimulant_frequency
## Min. :0.200 Min. : 4.50 Min. :0.000 Min. : 2.00
## 1st Qu.:1.400 1st Qu.: 6.00 1st Qu.:0.600 1st Qu.: 7.00
## Median :3.500 Median :10.00 Median :1.800 Median : 10.00
## Mean :2.806 Mean :11.74 Mean :1.918 Mean : 31.15
## 3rd Qu.:4.200 3rd Qu.:11.00 3rd Qu.:3.000 3rd Qu.: 12.00
## Max. :5.400 Max. :52.00 Max. :4.100 Max. :364.00
## meth_use meth_frequency sedative_use sedative_frequency
## Min. :0.0000 Length:17 Min. :0.0000 Min. : 3.00
## 1st Qu.:0.2000 Class :character 1st Qu.:0.2000 1st Qu.: 6.50
## Median :0.4000 Mode :character Median :0.3000 Median : 10.00
## Mean :0.3824 Mean :0.2824 Mean : 19.38
## 3rd Qu.:0.6000 3rd Qu.:0.4000 3rd Qu.: 17.50
## Max. :0.9000 Max. :0.5000 Max. :104.00
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.