Project2 - Data Set2

Import Drug Use Data

# Read the FiveThirtyEight drug-use-by-age table straight from GitHub.
drug_use_data <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/drug-use-by-age/drug-use-by-age.csv"
# Keep text columns as character vectors (spelled out as FALSE because `F` is
# an ordinary, reassignable variable). Some *.frequency columns contain "-"
# placeholders and therefore load as text rather than numbers.
du <- read.csv(drug_use_data, stringsAsFactors = FALSE)
head(du)

##   age    n alcohol.use alcohol.frequency marijuana.use marijuana.frequency
## 1  12 2798         3.9                 3           1.1                   4
## 2  13 2757         8.5                 6           3.4                  15
## 3  14 2792        18.1                 5           8.7                  24
## 4  15 2956        29.2                 6          14.5                  25
## 5  16 3058        40.1                10          22.5                  30
## 6  17 3038        49.3                13          28.0                  36
##   cocaine.use cocaine.frequency crack.use crack.frequency heroin.use
## 1         0.1               5.0       0.0               -        0.1
## 2         0.1               1.0       0.0             3.0        0.0
## 3         0.1               5.5       0.0               -        0.1
## 4         0.5               4.0       0.1             9.5        0.2
## 5         1.0               7.0       0.0             1.0        0.1
## 6         2.0               5.0       0.1            21.0        0.1
##   heroin.frequency hallucinogen.use hallucinogen.frequency inhalant.use
## 1             35.5              0.2                     52          1.6
## 2                -              0.6                      6          2.5
## 3              2.0              1.6                      3          2.6
## 4              1.0              2.1                      4          2.5
## 5             66.5              3.4                      3          3.0
## 6             64.0              4.8                      3          2.0
##   inhalant.frequency pain.releiver.use pain.releiver.frequency
## 1               19.0               2.0                      36
## 2               12.0               2.4                      14
## 3                5.0               3.9                      12
## 4                5.5               5.5                      10
## 5                3.0               6.2                       7
## 6                4.0               8.5                       9
##   oxycontin.use oxycontin.frequency tranquilizer.use
## 1           0.1                24.5              0.2
## 2           0.1                41.0              0.3
## 3           0.4                 4.5              0.9
## 4           0.8                 3.0              2.0
## 5           1.1                 4.0              2.4
## 6           1.4                 6.0              3.5
##   tranquilizer.frequency stimulant.use stimulant.frequency meth.use
## 1                   52.0           0.2                 2.0      0.0
## 2                   25.5           0.3                 4.0      0.1
## 3                    5.0           0.8                12.0      0.1
## 4                    4.5           1.5                 6.0      0.3
## 5                   11.0           1.8                 9.5      0.3
## 6                    7.0           2.8                 9.0      0.6
##   meth.frequency sedative.use sedative.frequency
## 1              -          0.2               13.0
## 2            5.0          0.1               19.0
## 3           24.0          0.2               16.5
## 4           10.5          0.4               30.0
## 5           36.0          0.2                3.0
## 6           48.0          0.5                6.5

Tidy Drug Use Data

Looking at the column names, we can see that ‘age’ and ‘n’ (number) are columns that can stay as they are. The remaining columns come in pairs for each drug: drug.use (the percentage of the age group that used the drug) and drug.frequency (the median number of times a user in the age group used the drug in the past 12 months).

I would like to change this data set to a ‘long’ format by keeping the columns ‘age’ and ‘n’, and then creating the columns ‘drug’, ‘percent_used’, and ‘frequency’.

Display Column Names

# List every column name in the imported table
names(du)

##  [1] "age"                     "n"                      
##  [3] "alcohol.use"             "alcohol.frequency"      
##  [5] "marijuana.use"           "marijuana.frequency"    
##  [7] "cocaine.use"             "cocaine.frequency"      
##  [9] "crack.use"               "crack.frequency"        
## [11] "heroin.use"              "heroin.frequency"       
## [13] "hallucinogen.use"        "hallucinogen.frequency" 
## [15] "inhalant.use"            "inhalant.frequency"     
## [17] "pain.releiver.use"       "pain.releiver.frequency"
## [19] "oxycontin.use"           "oxycontin.frequency"    
## [21] "tranquilizer.use"        "tranquilizer.frequency" 
## [23] "stimulant.use"           "stimulant.frequency"    
## [25] "meth.use"                "meth.frequency"         
## [27] "sedative.use"            "sedative.frequency"

Create subset for drug.use and drug.frequency

# Long-format table of usage percentages: keep 'age' and 'n', then stack
# every column whose name ends in 'use' into drug / percent_used pairs.
use <- du %>%
  select(age, n, ends_with("use")) %>%
  gather(key = "drug", value = "percent_used", ends_with("use"))

head(use)

##   age    n        drug percent_used
## 1  12 2798 alcohol.use          3.9
## 2  13 2757 alcohol.use          8.5
## 3  14 2792 alcohol.use         18.1
## 4  15 2956 alcohol.use         29.2
## 5  16 3058 alcohol.use         40.1
## 6  17 3038 alcohol.use         49.3

# Long-format table of usage frequencies: keep 'age' and 'n', then stack
# every column whose name ends in 'frequency' into drug / frequency pairs.
# The same suffix is used for selecting and gathering so the two steps cannot
# drift apart.
# Some frequency columns contain "-" placeholders, so the stacked 'frequency'
# column is still text at this point; it is converted to numeric later.
freq <- du %>%
  select(age, n, ends_with("frequency")) %>%
  gather(key = "drug", value = "frequency", ends_with("frequency"))

head(freq)

##   age    n              drug frequency
## 1  12 2798 alcohol.frequency         3
## 2  13 2757 alcohol.frequency         6
## 3  14 2792 alcohol.frequency         5
## 4  15 2956 alcohol.frequency         6
## 5  16 3058 alcohol.frequency        10
## 6  17 3038 alcohol.frequency        13

Strip the ‘.use’ and ‘.frequency’ suffixes from the end of each drug value so that both tables use the same drug names.

# Remove the trailing ".use" from each drug name. The dot is escaped so it
# matches a literal "." only; unescaped, "." would match any character.
use$drug <- sub("\\.use$", "", use$drug)
head(use)

##   age    n    drug percent_used
## 1  12 2798 alcohol          3.9
## 2  13 2757 alcohol          8.5
## 3  14 2792 alcohol         18.1
## 4  15 2956 alcohol         29.2
## 5  16 3058 alcohol         40.1
## 6  17 3038 alcohol         49.3

# Remove the trailing ".frequency" from each drug name. The dot is escaped so
# it matches a literal "." only; unescaped, "." would match any character.
freq$drug <- sub("\\.frequency$", "", freq$drug)
head(freq)

##   age    n    drug frequency
## 1  12 2798 alcohol         3
## 2  13 2757 alcohol         6
## 3  14 2792 alcohol         5
## 4  15 2956 alcohol         6
## 5  16 3058 alcohol        10
## 6  17 3038 alcohol        13

Merge the ‘use’ and ‘freq’ data frames.

# Combine the two long tables column-wise. cbind() pairs rows purely by
# position, so first confirm that both tables list the same age group and
# drug in the same order.
stopifnot(
  identical(use$age, freq$age),
  identical(use$drug, freq$drug)
)
# "-" marks a missing frequency in the raw data. Turn it into NA explicitly so
# the numeric conversion is silent for that known placeholder, while any other
# malformed value would still raise a coercion warning.
drug_use <- cbind(
  use,
  frequency = as.double(replace(freq$frequency, freq$frequency == "-", NA))
)

## Warning in cbind(use, frequency = as.double(freq$frequency)): NAs
## introduced by coercion

head(drug_use)

##   age    n    drug percent_used frequency
## 1  12 2798 alcohol          3.9         3
## 2  13 2757 alcohol          8.5         6
## 3  14 2792 alcohol         18.1         5
## 4  15 2956 alcohol         29.2         6
## 5  16 3058 alcohol         40.1        10
## 6  17 3038 alcohol         49.3        13

Analyze Drug Use Data

# One data frame per drug, used by the charts below.
al <- drug_use %>% filter(drug == "alcohol")
mar <- drug_use %>% filter(drug == "marijuana")
co <- drug_use %>% filter(drug == "cocaine")
cr <- drug_use %>% filter(drug == "crack")
her <- drug_use %>% filter(drug == "heroin")
hal <- drug_use %>% filter(drug == "hallucinogen")
inh <- drug_use %>% filter(drug == "inhalant")
# "pain.releiver" is misspelled in the source column names; it must match them.
pr <- drug_use %>% filter(drug == "pain.releiver")
ox <- drug_use %>% filter(drug == "oxycontin")
tr <- drug_use %>% filter(drug == "tranquilizer")
st <- drug_use %>% filter(drug == "stimulant")
meth <- drug_use %>% filter(drug == "meth")
sed <- drug_use %>% filter(drug == "sedative")

# Largest frequency and largest percentage across all drugs, ignoring missing
# values. They are used as shared y-axis upper limits so every chart of the
# same kind has the same scale.
maxf <- max(drug_use$frequency, na.rm = TRUE)
maxp <- max(drug_use$percent_used, na.rm = TRUE)

 # Alcohol: percentage of each age group that used it, one labelled bar per group.
 ggplot(data = al, aes(x = age, y = percent_used, fill = age)) +
   xlab("age group") +
   ylab("Percentage of Alcohol Use") +
   ggtitle("Alcohol") +
   ylim(0, maxp) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = percent_used),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none") # hide the fill legend; the logical form is deprecated

 # Alcohol: median number of times used in the past year, by age group.
 ggplot(data = al, aes(x = age, y = frequency, fill = age)) +
   xlab("age group") +
   ylab("Times Used in Past Year") +
   ggtitle("Alcohol") +
   ylim(0, maxf) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = frequency),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none")

 # Marijuana: percentage of each age group that used it, one labelled bar per group.
 ggplot(data = mar, aes(x = age, y = percent_used, fill = age)) +
   xlab("age group") +
   ylab("Percentage of Marijuana Use") +
   ggtitle("Marijuana") +
   ylim(0, maxp) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = percent_used),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none") # hide the fill legend; the logical form is deprecated

 # Marijuana: median number of times used in the past year, by age group.
 ggplot(data = mar, aes(x = age, y = frequency, fill = age)) +
   xlab("age group") +
   ylab("Times Used in Past Year") +
   ggtitle("Marijuana") +
   ylim(0, maxf) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = frequency),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none")

 # Cocaine: percentage of each age group that used it, one labelled bar per group.
 ggplot(data = co, aes(x = age, y = percent_used, fill = age)) +
   xlab("age group") +
   ylab("Percentage of Cocaine Use") +
   ggtitle("Cocaine") +
   ylim(0, maxp) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = percent_used),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none") # hide the fill legend; the logical form is deprecated

 # Cocaine: median number of times used in the past year, by age group.
 ggplot(data = co, aes(x = age, y = frequency, fill = age)) +
   xlab("age group") +
   ylab("Times Used in Past Year") +
   ggtitle("Cocaine") +
   ylim(0, maxf) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = frequency),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none")

## Warning: Removed 1 rows containing missing values (position_stack).

## Warning: Removed 1 rows containing missing values (geom_text).

# Crack: percentage of each age group that used it, one labelled bar per group.
ggplot(data = cr, aes(x = age, y = percent_used, fill = age)) +
  xlab("age group") +
  ylab("Percentage of Crack Use") +
  ggtitle("Crack") +
  ylim(0, maxp) +
  geom_bar(colour = "black", stat = "identity") +
  geom_text(
    aes(label = percent_used),
    position = position_dodge(width = .9),
    vjust = -.25
  ) +
  guides(fill = "none") # hide the fill legend; the logical form is deprecated

# Crack: median number of times used in the past year, by age group.
ggplot(data = cr, aes(x = age, y = frequency, fill = age)) +
  xlab("age group") +
  ylab("Times Used in Past Year") +
  ggtitle("Crack") +
  ylim(0, maxf) +
  geom_bar(colour = "black", stat = "identity") +
  geom_text(
    aes(label = frequency),
    position = position_dodge(width = .9),
    vjust = -.25
  ) +
  guides(fill = "none")

## Warning: Removed 3 rows containing missing values (position_stack).

## Warning: Removed 3 rows containing missing values (geom_text).

 # Heroin: percentage of each age group that used it, one labelled bar per group.
 ggplot(data = her, aes(x = age, y = percent_used, fill = age)) +
   xlab("age group") +
   ylab("Percentage of Heroin Use") +
   ggtitle("Heroin") +
   ylim(0, maxp) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = percent_used),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none") # hide the fill legend; the logical form is deprecated

 # Heroin: median number of times used in the past year, by age group.
 # The y-axis limit must be the frequency maximum (maxf), like every other
 # frequency chart. Using the percentage maximum (maxp) here would drop every
 # bar whose frequency exceeds it.
 ggplot(data = her, aes(x = age, y = frequency, fill = age)) +
   xlab("age group") +
   ylab("Times Used in Past Year") +
   ggtitle("Heroin") +
   ylim(0, maxf) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = frequency),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none")

## Warning: Removed 5 rows containing missing values (position_stack).

## Warning: Removed 5 rows containing missing values (geom_text).

 # Hallucinogen: percentage of each age group that used it, one labelled bar per group.
 ggplot(data = hal, aes(x = age, y = percent_used, fill = age)) +
   xlab("age group") +
   ylab("Percentage of Hallucinogen Use") +
   ggtitle("Hallucinogen") +
   ylim(0, maxp) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = percent_used),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none") # hide the fill legend; the logical form is deprecated

 # Hallucinogen: median number of times used in the past year, by age group.
 ggplot(data = hal, aes(x = age, y = frequency, fill = age)) +
   xlab("age group") +
   ylab("Times Used in Past Year") +
   ggtitle("Hallucinogen") +
   ylim(0, maxf) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = frequency),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none")

 # Inhalant: percentage of each age group that used it, one labelled bar per group.
 ggplot(data = inh, aes(x = age, y = percent_used, fill = age)) +
   xlab("age group") +
   ylab("Percentage of Inhalant Use") +
   ggtitle("Inhalant") +
   ylim(0, maxp) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = percent_used),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none") # hide the fill legend; the logical form is deprecated

 # Inhalant: median number of times used in the past year, by age group.
 ggplot(data = inh, aes(x = age, y = frequency, fill = age)) +
   xlab("age group") +
   ylab("Times Used in Past Year") +
   ggtitle("Inhalant") +
   ylim(0, maxf) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = frequency),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none")

## Warning: Removed 1 rows containing missing values (position_stack).

## Warning: Removed 1 rows containing missing values (geom_text).

 # Pain reliever: percentage of each age group that used it, one labelled bar per group.
 ggplot(data = pr, aes(x = age, y = percent_used, fill = age)) +
   xlab("age group") +
   ylab("Percentage of Pain Reliever Use") +
   ggtitle("Pain Reliever") +
   ylim(0, maxp) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = percent_used),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none") # hide the fill legend; the logical form is deprecated

 # Pain reliever: median number of times used in the past year, by age group.
 ggplot(data = pr, aes(x = age, y = frequency, fill = age)) +
   xlab("age group") +
   ylab("Times Used in Past Year") +
   ggtitle("Pain Reliever") +
   ylim(0, maxf) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = frequency),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none")

 # Oxycontin: percentage of each age group that used it, one labelled bar per group.
 ggplot(data = ox, aes(x = age, y = percent_used, fill = age)) +
   xlab("age group") +
   ylab("Percentage of Oxycontin Use") +
   ggtitle("Oxycontin") +
   ylim(0, maxp) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = percent_used),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none") # hide the fill legend; the logical form is deprecated

 # Oxycontin: median number of times used in the past year, by age group.
 ggplot(data = ox, aes(x = age, y = frequency, fill = age)) +
   xlab("age group") +
   ylab("Times Used in Past Year") +
   ggtitle("Oxycontin") +
   ylim(0, maxf) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = frequency),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none")

## Warning: Removed 1 rows containing missing values (position_stack).

## Warning: Removed 1 rows containing missing values (geom_text).

 # Tranquilizer: percentage of each age group that used it, one labelled bar per group.
 ggplot(data = tr, aes(x = age, y = percent_used, fill = age)) +
   xlab("age group") +
   ylab("Percentage of Tranquilizer Use") +
   ggtitle("Tranquilizer") +
   ylim(0, maxp) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = percent_used),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none") # hide the fill legend; the logical form is deprecated

 # Tranquilizer: median number of times used in the past year, by age group.
 ggplot(data = tr, aes(x = age, y = frequency, fill = age)) +
   xlab("age group") +
   ylab("Times Used in Past Year") +
   ggtitle("Tranquilizer") +
   ylim(0, maxf) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = frequency),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none")

# Stimulant: percentage of each age group that used it, one labelled bar per group.
ggplot(data = st, aes(x = age, y = percent_used, fill = age)) +
  xlab("age group") +
  ylab("Percentage of Stimulant Use") +
  ggtitle("Stimulant") +
  ylim(0, maxp) +
  geom_bar(colour = "black", stat = "identity") +
  geom_text(
    aes(label = percent_used),
    position = position_dodge(width = .9),
    vjust = -.25
  ) +
  guides(fill = "none") # hide the fill legend; the logical form is deprecated

# Stimulant: median number of times used in the past year, by age group.
ggplot(data = st, aes(x = age, y = frequency, fill = age)) +
  xlab("age group") +
  ylab("Times Used in Past Year") +
  ggtitle("Stimulant") +
  ylim(0, maxf) +
  geom_bar(colour = "black", stat = "identity") +
  geom_text(
    aes(label = frequency),
    position = position_dodge(width = .9),
    vjust = -.25
  ) +
  guides(fill = "none")

 # Meth: percentage of each age group that used it, one labelled bar per group.
 ggplot(data = meth, aes(x = age, y = percent_used, fill = age)) +
   xlab("age group") +
   ylab("Percentage of Meth Use") +
   ggtitle("Meth") +
   ylim(0, maxp) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = percent_used),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none") # hide the fill legend; the logical form is deprecated

 # Meth: median number of times used in the past year, by age group.
 ggplot(data = meth, aes(x = age, y = frequency, fill = age)) +
   xlab("age group") +
   ylab("Times Used in Past Year") +
   ggtitle("Meth") +
   ylim(0, maxf) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = frequency),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none")

## Warning: Removed 2 rows containing missing values (position_stack).

## Warning: Removed 2 rows containing missing values (geom_text).

 # Sedative: percentage of each age group that used it, one labelled bar per group.
 ggplot(data = sed, aes(x = age, y = percent_used, fill = age)) +
   xlab("age group") +
   ylab("Percentage of Sedative Use") +
   ggtitle("Sedative") +
   ylim(0, maxp) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = percent_used),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none") # hide the fill legend; the logical form is deprecated

 # Sedative: median number of times used in the past year, by age group.
 ggplot(data = sed, aes(x = age, y = frequency, fill = age)) +
   xlab("age group") +
   ylab("Times Used in Past Year") +
   ggtitle("Sedative") +
   ylim(0, maxf) +
   geom_bar(colour = "black", stat = "identity") +
   geom_text(
     aes(label = frequency),
     position = position_dodge(width = .9),
     vjust = -.25
   ) +
   guides(fill = "none")

Project2 - Data Set2

Chad Smith

October 7, 2017

Import Drug Use Data

Tidy Drug Use Data

Analyze Drug Use Data