Data 607 Project 2

library(tidyr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths

First, I load the data into RStudio from GitHub. I chose the drug-use-by-age data set. The ask for this data set is: “many of the columns are the drug names and they could become row objects. The table is based on the age of the user and it would be better to read if the ages read across the table as column headers.” What I will attempt to do is reshape the table so that the drugs become rows and the age groups become column headers.

# Read the FiveThirtyEight drug-use-by-age table straight from GitHub.
# The URL is assembled with paste0() only to keep lines under 80 characters.
drugdata <- read.csv(
  file = paste0(
    "https://raw.githubusercontent.com/fivethirtyeight/data/master/",
    "drug-use-by-age/drug-use-by-age.csv"
  )
)
# as_tibble() replaces the deprecated tbl_df(); the tibble class provides the
# compact print method used when the object is displayed.
drugdata <- as_tibble(drugdata)
drugdata
## # A tibble: 17 x 28
##       age     n alcohol.use alcohol.frequency marijuana.use
##    <fctr> <int>       <dbl>             <dbl>         <dbl>
##  1     12  2798         3.9                 3           1.1
##  2     13  2757         8.5                 6           3.4
##  3     14  2792        18.1                 5           8.7
##  4     15  2956        29.2                 6          14.5
##  5     16  3058        40.1                10          22.5
##  6     17  3038        49.3                13          28.0
##  7     18  2469        58.7                24          33.7
##  8     19  2223        64.6                36          33.4
##  9     20  2271        69.7                48          34.0
## 10     21  2354        83.2                52          33.0
## 11  22-23  4707        84.2                52          28.4
## 12  24-25  4591        83.1                52          24.9
## 13  26-29  2628        80.7                52          20.8
## 14  30-34  2864        77.5                52          16.4
## 15  35-49  7391        75.0                52          10.4
## 16  50-64  3923        67.2                52           7.3
## 17    65+  2448        49.3                52           1.2
## # ... with 23 more variables: marijuana.frequency <dbl>,
## #   cocaine.use <dbl>, cocaine.frequency <fctr>, crack.use <dbl>,
## #   crack.frequency <fctr>, heroin.use <dbl>, heroin.frequency <fctr>,
## #   hallucinogen.use <dbl>, hallucinogen.frequency <dbl>,
## #   inhalant.use <dbl>, inhalant.frequency <fctr>,
## #   pain.releiver.use <dbl>, pain.releiver.frequency <dbl>,
## #   oxycontin.use <dbl>, oxycontin.frequency <fctr>,
## #   tranquilizer.use <dbl>, tranquilizer.frequency <dbl>,
## #   stimulant.use <dbl>, stimulant.frequency <dbl>, meth.use <dbl>,
## #   meth.frequency <fctr>, sedative.use <dbl>, sedative.frequency <dbl>

Next, I’d like to select just the use columns for the different drugs and put them in a data frame.

# Keep the age column plus every "*.use" column; the matching "*.frequency"
# columns and the sample size `n` are dropped. ends_with() selects the
# thirteen use columns, in table order, without spelling each one out.
# The result is converted to a plain data frame, which is what the melt()
# call further down receives.
dd1 <- drugdata %>%
  select(age, ends_with(".use")) %>%
  as.data.frame()

Now, I’d like to transform the data so that the ages are the columns

# Stack the use columns into long form: one row per age group and drug, with
# the original column name in `variable` and its reading in `value`.
# (Had some help from Stack Overflow with figuring out the melt function.)
dd2 <- melt(dd1, id.vars = c("age"))

# Wide view for display: one row per drug, one column per age group.
# pivot_wider() replaces the superseded spread(); as_tibble() gives the
# compact print without leaving a stray grouping on the result.
dd2 %>%
  pivot_wider(names_from = age, values_from = value) %>%
  as_tibble()
## # A tibble: 13 x 18
## # Groups:   variable [13]
##             variable  `12`  `13`  `14`  `15`  `16`  `17`  `18`  `19`  `20`
##  *            <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1       alcohol.use   3.9   8.5  18.1  29.2  40.1  49.3  58.7  64.6  69.7
##  2     marijuana.use   1.1   3.4   8.7  14.5  22.5  28.0  33.7  33.4  34.0
##  3       cocaine.use   0.1   0.1   0.1   0.5   1.0   2.0   3.2   4.1   4.9
##  4         crack.use   0.0   0.0   0.0   0.1   0.0   0.1   0.4   0.5   0.6
##  5        heroin.use   0.1   0.0   0.1   0.2   0.1   0.1   0.4   0.5   0.9
##  6  hallucinogen.use   0.2   0.6   1.6   2.1   3.4   4.8   7.0   8.6   7.4
##  7      inhalant.use   1.6   2.5   2.6   2.5   3.0   2.0   1.8   1.4   1.5
##  8 pain.releiver.use   2.0   2.4   3.9   5.5   6.2   8.5   9.2   9.4  10.0
##  9     oxycontin.use   0.1   0.1   0.4   0.8   1.1   1.4   1.7   1.5   1.7
## 10  tranquilizer.use   0.2   0.3   0.9   2.0   2.4   3.5   4.9   4.2   5.4
## 11     stimulant.use   0.2   0.3   0.8   1.5   1.8   2.8   3.0   3.3   4.0
## 12          meth.use   0.0   0.1   0.1   0.3   0.3   0.6   0.5   0.4   0.9
## 13      sedative.use   0.2   0.1   0.2   0.4   0.2   0.5   0.4   0.3   0.5
## # ... with 8 more variables: `21` <dbl>, `22-23` <dbl>, `24-25` <dbl>,
## #   `26-29` <dbl>, `30-34` <dbl>, `35-49` <dbl>, `50-64` <dbl>,
## #   `65+` <dbl>

Now, I’d like to plot the data to find out which age group is using which drug the most.

# Dodged bar chart of `value` by age group, one fill colour per drug.
dd3 <- ggplot(dd2, aes(x = age, y = value, fill = variable)) +
  geom_col(position = position_dodge()) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    # Age labels are drawn vertically.
    axis.text.x = element_text(angle = 90, hjust = 1)
  )
dd3

# Same chart as dd3, zoomed to the 0-2 range on the y axis so the bars with
# small values can be compared. coord_cartesian() only changes the visible
# window (taller bars are clipped, not removed), and reusing dd3 avoids
# repeating the whole plot specification.
dd3 + coord_cartesian(ylim = c(0, 2))

I could use lapply to divide the different variables out and make several plots to more clearly show the values of each variable, but I’m not totally sure how to do that.

A general analysis of this data is that alcohol remains the most popular drug amongst all age groups, followed by marijuana. There is a spike in inhalant use and pain reliever use in the late teens through the 35-49 age range.