Extract and prepare data

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(ggplot2)
# Location of the raw "drug use by age" CSV on GitHub.
url <- "https://raw.githubusercontent.com/uplotnik/Data607/master/drug%20use%20by%20age.csv"

# Parse the CSV; cells that contain "-" or "NA" are read as missing values.
dataset <- read.csv(
  file = url,
  header = TRUE,
  sep = ",",
  na.strings = c("-", "NA")
)
head(dataset, n = 10)
##    age    n alcohol.use alcohol.frequency marijuana.use
## 1   12 2798         3.9                 3           1.1
## 2   13 2757         8.5                 6           3.4
## 3   14 2792        18.1                 5           8.7
## 4   15 2956        29.2                 6          14.5
## 5   16 3058        40.1                10          22.5
## 6   17 3038        49.3                13          28.0
## 7   18 2469        58.7                24          33.7
## 8   19 2223        64.6                36          33.4
## 9   20 2271        69.7                48          34.0
## 10  21 2354        83.2                52          33.0
##    marijuana.frequency cocaine.use cocaine.frequency crack.use
## 1                    4         0.1               5.0       0.0
## 2                   15         0.1               1.0       0.0
## 3                   24         0.1               5.5       0.0
## 4                   25         0.5               4.0       0.1
## 5                   30         1.0               7.0       0.0
## 6                   36         2.0               5.0       0.1
## 7                   52         3.2               5.0       0.4
## 8                   60         4.1               5.5       0.5
## 9                   60         4.9               8.0       0.6
## 10                  52         4.8               5.0       0.5
##    crack.frequency heroin.use heroin.frequency hallucinogen.use
## 1               NA        0.1             35.5              0.2
## 2              3.0        0.0               NA              0.6
## 3               NA        0.1              2.0              1.6
## 4              9.5        0.2              1.0              2.1
## 5              1.0        0.1             66.5              3.4
## 6             21.0        0.1             64.0              4.8
## 7             10.0        0.4             46.0              7.0
## 8              2.0        0.5            180.0              8.6
## 9              5.0        0.9             45.0              7.4
## 10            17.0        0.6             30.0              6.3
##    hallucinogen.frequency inhalant.use inhalant.frequency
## 1                      52          1.6               19.0
## 2                       6          2.5               12.0
## 3                       3          2.6                5.0
## 4                       4          2.5                5.5
## 5                       3          3.0                3.0
## 6                       3          2.0                4.0
## 7                       4          1.8                4.0
## 8                       3          1.4                3.0
## 9                       2          1.5                4.0
## 10                      4          1.4                2.0
##    pain.releiver.use pain.releiver.frequency oxycontin.use
## 1                2.0                      36           0.1
## 2                2.4                      14           0.1
## 3                3.9                      12           0.4
## 4                5.5                      10           0.8
## 5                6.2                       7           1.1
## 6                8.5                       9           1.4
## 7                9.2                      12           1.7
## 8                9.4                      12           1.5
## 9               10.0                      10           1.7
## 10               9.0                      15           1.3
##    oxycontin.frequency tranquilizer.use tranquilizer.frequency
## 1                 24.5              0.2                   52.0
## 2                 41.0              0.3                   25.5
## 3                  4.5              0.9                    5.0
## 4                  3.0              2.0                    4.5
## 5                  4.0              2.4                   11.0
## 6                  6.0              3.5                    7.0
## 7                  7.0              4.9                   12.0
## 8                  7.5              4.2                    4.5
## 9                 12.0              5.4                   10.0
## 10                13.5              3.9                    7.0
##    stimulant.use stimulant.frequency meth.use meth.frequency sedative.use
## 1            0.2                 2.0      0.0             NA          0.2
## 2            0.3                 4.0      0.1            5.0          0.1
## 3            0.8                12.0      0.1           24.0          0.2
## 4            1.5                 6.0      0.3           10.5          0.4
## 5            1.8                 9.5      0.3           36.0          0.2
## 6            2.8                 9.0      0.6           48.0          0.5
## 7            3.0                 8.0      0.5           12.0          0.4
## 8            3.3                 6.0      0.4          105.0          0.3
## 9            4.0                12.0      0.9           12.0          0.5
## 10           4.1                10.0      0.6            2.0          0.3
##    sedative.frequency
## 1                13.0
## 2                19.0
## 3                16.5
## 4                30.0
## 5                 3.0
## 6                 6.5
## 7                10.0
## 8                 6.0
## 9                 4.0
## 10                9.0

Rename columns

# The two "pain.releiver.*" columns carry an extra dot in their names.
# Replace the first dot with a space so that splitting a column name on "."
# yields exactly two pieces.
dataset <- rename(
  dataset,
  `pain releiver.use` = pain.releiver.use,
  `pain releiver.frequency` = pain.releiver.frequency
)

Gather columns

# Reshape to long format: every column from alcohol.use through
# sedative.frequency becomes a (drug, value) pair, then sort rows by age.
data2 <- arrange(
  gather(
    dataset,
    key = "drug",
    value = "value",
    alcohol.use:sedative.frequency
  ),
  age
)
head(data2, n = 5)
##   age    n                drug value
## 1  12 2798         alcohol.use   3.9
## 2  12 2798   alcohol.frequency   3.0
## 3  12 2798       marijuana.use   1.1
## 4  12 2798 marijuana.frequency   4.0
## 5  12 2798         cocaine.use   0.1

Separate column

# Split each label in `drug` at the literal dot: the part before it stays in
# `drug`, the part after it goes to `var`.
data3 <- separate(
  data2,
  col = drug,
  into = c("drug", "var"),
  sep = "\\."
)
head(data3, n = 5)
##   age    n      drug       var value
## 1  12 2798   alcohol       use   3.9
## 2  12 2798   alcohol frequency   3.0
## 3  12 2798 marijuana       use   1.1
## 4  12 2798 marijuana frequency   4.0
## 5  12 2798   cocaine       use   0.1

Filter and arrange

# Keep only the rows whose measure is "use", ordered by drug name.
use_rows <- filter(data3, var == "use")
newdata <- arrange(use_rows, drug)
rm(use_rows)
head(newdata, n = 5)
##   age    n    drug var value
## 1  12 2798 alcohol use   3.9
## 2  13 2757 alcohol use   8.5
## 3  14 2792 alcohol use  18.1
## 4  15 2956 alcohol use  29.2
## 5  16 3058 alcohol use  40.1

Mean, Median and Standard deviation of overall drug usage

# Mean, median and standard deviation of the "use" values within each age
# group, taken across all drugs.
newdata %>%
  group_by(age) %>%
  summarise(
    mean_usage = mean(value),
    median_usage = median(value),
    sd_usage = sd(value)
  )
## # A tibble: 17 x 4
##    age   mean_usage median_usage sd_usage
##    <fct>      <dbl>        <dbl>    <dbl>
##  1 12         0.746          0.2     1.15
##  2 13         1.42           0.3     2.42
##  3 14         2.88           0.8     5.17
##  4 15         4.58           1.5     8.36
##  5 16         6.32           1.8    11.8 
##  6 17         7.97           2      14.5 
##  7 18         9.61           3      17.2 
##  8 19        10.2            3.3    18.6 
##  9 20        10.9            4      19.8 
## 10 21        11.5            3.9    23.2 
## 11 22-23     11.2            3.6    23.2 
## 12 24-25     10.5            2.6    22.8 
## 13 26-29      9.73           2.3    22.0 
## 14 30-34      8.59           1.4    21.2 
## 15 35-49      7.38           0.6    20.5 
## 16 50-64      6.26           0.4    18.4 
## 17 65+        3.95           0      13.6

Frequency of drug consumption

# Keep only the rows whose measure is "frequency", ordered by drug name.
newdata1 <- arrange(
  filter(data3, var == "frequency"),
  drug
)
head(newdata1)
##   age    n    drug       var value
## 1  12 2798 alcohol frequency     3
## 2  13 2757 alcohol frequency     6
## 3  14 2792 alcohol frequency     5
## 4  15 2956 alcohol frequency     6
## 5  16 3058 alcohol frequency    10
## 6  17 3038 alcohol frequency    13
# Mean, median and standard deviation of the "frequency" values within each
# age group, taken across all drugs.
# Some frequencies are missing (read in as NA). Without na.rm = TRUE a single
# NA turns the whole age group's statistics into NA/NaN, so missing values
# are skipped here.
# NOTE: any previously printed summary must be regenerated after this change.
newdata1 %>%
  group_by(age) %>%
  summarise(
    mean_frequency = mean(value, na.rm = TRUE),
    median_frequency = median(value, na.rm = TRUE),
    sd_frequency = sd(value, na.rm = TRUE)
  )
## # A tibble: 17 x 4
##    age   mean_frequency median_frequency sd_frequency
##    <fct>          <dbl>            <dbl>        <dbl>
##  1 12             NA                  NA       NaN   
##  2 13             NA                  NA       NaN   
##  3 14             NA                  NA       NaN   
##  4 15              9.15                6         8.66
##  5 16             14.7                 7        18.9 
##  6 17             17.8                 9        19.4 
##  7 18             15.8                10        15.6 
##  8 19             33.1                 6        53.5 
##  9 20             17.8                10        19.5 
## 10 21             16.8                10        17.3 
## 11 22-23          25.5                15        22.3 
## 12 24-25          23.8                15        26.3 
## 13 26-29          19.2                10        19.6 
## 14 30-34          28.5                15        25.5 
## 15 35-49          48                  15        75.2 
## 16 50-64          37.3                36        27.0 
## 17 65+            NA                  NA       NaN
# Total frequency per drug, summed over all age groups.
# na.rm = TRUE is forwarded to sum(): without it, one missing frequency makes
# a drug's whole total NA. The summed column is named "Total" directly, so no
# separate rename step is needed; the result has columns DRUG and Total.
# NOTE: any previously printed totals must be regenerated after this change.
total_frequency <- aggregate(
  list(Total = newdata1$value),
  by = list(DRUG = newdata1$drug),
  FUN = sum,
  na.rm = TRUE
)
total_frequency
##             DRUG Total
## 1        alcohol 567.0
## 2        cocaine    NA
## 3          crack    NA
## 4   hallucinogen 143.0
## 5         heroin    NA
## 6       inhalant    NA
## 7      marijuana 730.0
## 8           meth    NA
## 9      oxycontin    NA
## 10 pain releiver 250.0
## 11      sedative 329.5
## 12     stimulant 529.5
## 13  tranquilizer 199.5
# Bar chart of the total frequency per drug, one coloured bar per drug.
ggplot(total_frequency, aes(x = DRUG, y = Total, fill = DRUG)) +
  # geom_col() draws bar heights straight from the data; it is the direct
  # equivalent of geom_bar(stat = "identity").
  geom_col() +
  ggtitle("DRUG CONSUMPTION FREQUENCY") +
  # Rotate only the drug names on the x-axis (the numeric y-axis stays
  # upright); hjust = 1 anchors each rotated label at its end.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "DRUG", y = "Frequency")
## Warning: Removed 6 rows containing missing values (position_stack).

Let’s further investigate alcohol and marijuana consumption.

Alcohol

# Subset of the long table holding only the alcohol rows (both measures).
alcohol <- filter(data3, drug == "alcohol")

head(alcohol, n = 5)
##   age    n    drug       var value
## 1  12 2798 alcohol       use   3.9
## 2  12 2798 alcohol frequency   3.0
## 3  13 2757 alcohol       use   8.5
## 4  13 2757 alcohol frequency   6.0
## 5  14 2792 alcohol       use  18.1
# Alcohol values by age group, one point per measure (coloured by `var`).
ggplot(alcohol, aes(x = factor(age), y = value, colour = var)) +
  # A constant point size is set on the geom, not mapped inside aes();
  # mapping it adds a meaningless "size" legend to the plot.
  geom_point(size = 4) +
  xlab("age") +
  ggtitle("Alcohol") +
  # Rotate only the age-group labels on the x-axis.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Based on the visualisation, alcohol use is highest between ages 21 and 25 and then declines slightly with age. Interestingly, the frequency of alcohol use reaches its peak at age 21 and remains almost unchanged for the rest of life.

Marijuana

# Subset of the long table holding only the marijuana rows (both measures).
marijuana <- filter(data3, drug == "marijuana")
head(marijuana, n = 5)
##   age    n      drug       var value
## 1  12 2798 marijuana       use   1.1
## 2  12 2798 marijuana frequency   4.0
## 3  13 2757 marijuana       use   3.4
## 4  13 2757 marijuana frequency  15.0
## 5  14 2792 marijuana       use   8.7
# Marijuana values by age group, one point per measure (coloured by `var`).
ggplot(marijuana, aes(x = factor(age), y = value, colour = var)) +
  # A constant point size is set on the geom, not mapped inside aes();
  # mapping it adds a meaningless "size" legend to the plot.
  geom_point(size = 4) +
  xlab("age") +
  ggtitle("Marijuana") +
  # Rotate only the age-group labels on the x-axis.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Marijuana use peaks between ages 18 and 21 and then declines slightly. Interestingly, the highest frequency is in the 30-34 age group.

I was surprised that 12-year-olds were included in the data, so I decided to check which drugs are most commonly used at that age.

# All drug measures recorded for the 12-year-old group.
age_12 <- data3 %>%
  filter(age == "12")

# One point per drug and measure (coloured by `var`).
ggplot(age_12, aes(x = drug, y = value, colour = var)) +
  # A constant point size is set on the geom, not mapped inside aes();
  # mapping it adds a meaningless "size" legend to the plot.
  geom_point(size = 4) +
  # The x-axis shows drugs here, so it is labelled "drug" rather than "age".
  xlab("drug") +
  ggtitle("12 years old") +
  # Rotate only the drug names on the x-axis.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 2 rows containing missing values (geom_point).

Let’s now review the 65+ age group.

# All drug measures recorded for the "65+" age group.
age_65 <- data3 %>%
  filter(age == "65+")

# One point per drug and measure (coloured by `var`).
ggplot(age_65, aes(x = drug, y = value, colour = var)) +
  # A constant point size is set on the geom, not mapped inside aes();
  # mapping it adds a meaningless "size" legend to the plot.
  geom_point(size = 4) +
  # The x-axis shows drugs here, so it is labelled "drug" rather than "age".
  xlab("drug") +
  ggtitle("65+ years old") +
  # Rotate only the drug names on the x-axis.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 5 rows containing missing values (geom_point).