Data 607 Project 2a

Extract and prepare data

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.5.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

## Warning: package 'tidyr' was built under R version 3.5.3

library(ggplot2)

# Location of the raw "drug use by age" CSV on GitHub.
url <- "https://raw.githubusercontent.com/uplotnik/Data607/master/drug%20use%20by%20age.csv"

# Load the table; cells holding "-" or "NA" are read as missing values.
# read.csv() already defaults to a comma separator and a header row.
dataset <- read.csv(url, na.strings = c("-", "NA"))
head(dataset, 10)

##    age    n alcohol.use alcohol.frequency marijuana.use
## 1   12 2798         3.9                 3           1.1
## 2   13 2757         8.5                 6           3.4
## 3   14 2792        18.1                 5           8.7
## 4   15 2956        29.2                 6          14.5
## 5   16 3058        40.1                10          22.5
## 6   17 3038        49.3                13          28.0
## 7   18 2469        58.7                24          33.7
## 8   19 2223        64.6                36          33.4
## 9   20 2271        69.7                48          34.0
## 10  21 2354        83.2                52          33.0
##    marijuana.frequency cocaine.use cocaine.frequency crack.use
## 1                    4         0.1               5.0       0.0
## 2                   15         0.1               1.0       0.0
## 3                   24         0.1               5.5       0.0
## 4                   25         0.5               4.0       0.1
## 5                   30         1.0               7.0       0.0
## 6                   36         2.0               5.0       0.1
## 7                   52         3.2               5.0       0.4
## 8                   60         4.1               5.5       0.5
## 9                   60         4.9               8.0       0.6
## 10                  52         4.8               5.0       0.5
##    crack.frequency heroin.use heroin.frequency hallucinogen.use
## 1               NA        0.1             35.5              0.2
## 2              3.0        0.0               NA              0.6
## 3               NA        0.1              2.0              1.6
## 4              9.5        0.2              1.0              2.1
## 5              1.0        0.1             66.5              3.4
## 6             21.0        0.1             64.0              4.8
## 7             10.0        0.4             46.0              7.0
## 8              2.0        0.5            180.0              8.6
## 9              5.0        0.9             45.0              7.4
## 10            17.0        0.6             30.0              6.3
##    hallucinogen.frequency inhalant.use inhalant.frequency
## 1                      52          1.6               19.0
## 2                       6          2.5               12.0
## 3                       3          2.6                5.0
## 4                       4          2.5                5.5
## 5                       3          3.0                3.0
## 6                       3          2.0                4.0
## 7                       4          1.8                4.0
## 8                       3          1.4                3.0
## 9                       2          1.5                4.0
## 10                      4          1.4                2.0
##    pain.releiver.use pain.releiver.frequency oxycontin.use
## 1                2.0                      36           0.1
## 2                2.4                      14           0.1
## 3                3.9                      12           0.4
## 4                5.5                      10           0.8
## 5                6.2                       7           1.1
## 6                8.5                       9           1.4
## 7                9.2                      12           1.7
## 8                9.4                      12           1.5
## 9               10.0                      10           1.7
## 10               9.0                      15           1.3
##    oxycontin.frequency tranquilizer.use tranquilizer.frequency
## 1                 24.5              0.2                   52.0
## 2                 41.0              0.3                   25.5
## 3                  4.5              0.9                    5.0
## 4                  3.0              2.0                    4.5
## 5                  4.0              2.4                   11.0
## 6                  6.0              3.5                    7.0
## 7                  7.0              4.9                   12.0
## 8                  7.5              4.2                    4.5
## 9                 12.0              5.4                   10.0
## 10                13.5              3.9                    7.0
##    stimulant.use stimulant.frequency meth.use meth.frequency sedative.use
## 1            0.2                 2.0      0.0             NA          0.2
## 2            0.3                 4.0      0.1            5.0          0.1
## 3            0.8                12.0      0.1           24.0          0.2
## 4            1.5                 6.0      0.3           10.5          0.4
## 5            1.8                 9.5      0.3           36.0          0.2
## 6            2.8                 9.0      0.6           48.0          0.5
## 7            3.0                 8.0      0.5           12.0          0.4
## 8            3.3                 6.0      0.4          105.0          0.3
## 9            4.0                12.0      0.9           12.0          0.5
## 10           4.1                10.0      0.6            2.0          0.3
##    sedative.frequency
## 1                13.0
## 2                19.0
## 3                16.5
## 4                30.0
## 5                 3.0
## 6                 6.5
## 7                10.0
## 8                 6.0
## 9                 4.0
## 10                9.0

Rename columns

# Rename the two pain-reliever columns so their names contain a single dot:
# the drug/measure split further down separates names on ".", and the extra
# dot in "pain.releiver" would otherwise produce three pieces instead of two.
dataset <- rename(
  dataset,
  `pain releiver.use` = pain.releiver.use,
  `pain releiver.frequency` = pain.releiver.frequency
)

Gather columns

# Reshape from wide to long: every drug measure column becomes a
# (drug, value) row, keeping age and n as identifiers; then order by age.
data2 <- arrange(
  gather(dataset, drug, value, alcohol.use:sedative.frequency),
  age
)
head(data2, 5)

##   age    n                drug value
## 1  12 2798         alcohol.use   3.9
## 2  12 2798   alcohol.frequency   3.0
## 3  12 2798       marijuana.use   1.1
## 4  12 2798 marijuana.frequency   4.0
## 5  12 2798         cocaine.use   0.1

Separate column

# Split the combined "<drug>.<measure>" label on the literal dot into a
# drug name column and a measure column ("use" or "frequency").
data3 <- separate(data2, drug, into = c("drug", "var"), sep = "\\.")
head(data3, 5)

##   age    n      drug       var value
## 1  12 2798   alcohol       use   3.9
## 2  12 2798   alcohol frequency   3.0
## 3  12 2798 marijuana       use   1.1
## 4  12 2798 marijuana frequency   4.0
## 5  12 2798   cocaine       use   0.1

Filter and arrange

# Keep only the "use" measure and sort the rows by drug name.
newdata <- arrange(filter(data3, var == "use"), drug)
head(newdata, 5)

##   age    n    drug var value
## 1  12 2798 alcohol use   3.9
## 2  13 2757 alcohol use   8.5
## 3  14 2792 alcohol use  18.1
## 4  15 2956 alcohol use  29.2
## 5  16 3058 alcohol use  40.1

Mean, Median and Standard deviation of overall drug usage

# Mean, median and standard deviation of the "use" values across all drugs,
# computed separately for each age group.
newdata %>%
  group_by(age) %>%
  summarise(
    mean_usage = mean(value),
    median_usage = median(value),
    sd_usage = sd(value)
  )

## # A tibble: 17 x 4
##    age   mean_usage median_usage sd_usage
##    <fct>      <dbl>        <dbl>    <dbl>
##  1 12         0.746          0.2     1.15
##  2 13         1.42           0.3     2.42
##  3 14         2.88           0.8     5.17
##  4 15         4.58           1.5     8.36
##  5 16         6.32           1.8    11.8 
##  6 17         7.97           2      14.5 
##  7 18         9.61           3      17.2 
##  8 19        10.2            3.3    18.6 
##  9 20        10.9            4      19.8 
## 10 21        11.5            3.9    23.2 
## 11 22-23     11.2            3.6    23.2 
## 12 24-25     10.5            2.6    22.8 
## 13 26-29      9.73           2.3    22.0 
## 14 30-34      8.59           1.4    21.2 
## 15 35-49      7.38           0.6    20.5 
## 16 50-64      6.26           0.4    18.4 
## 17 65+        3.95           0      13.6

The most popular drug

# Sum the "use" values per drug over all age groups (Total) and express each
# drug's total as a share of the grand total (perc).
total_drug <- newdata %>%
  group_by(DRUG = drug) %>%
  summarise(Total = sum(value)) %>%
  mutate(perc = Total / sum(Total))
head(total_drug, 5)

## # A tibble: 5 x 3
##   DRUG         Total    perc
##   <chr>        <dbl>   <dbl>
## 1 alcohol      942.  0.586  
## 2 cocaine       37   0.0230 
## 3 crack          5   0.00311
## 4 hallucinogen  57.7 0.0359 
## 5 heroin         6   0.00373

    # Bar chart of each drug's share of the summed "use" values, with the
    # percentage printed just above every bar.
    ggplot(total_drug, aes(x = DRUG, y = perc, fill = DRUG)) +
      geom_col(position = "dodge") +
      geom_text(
        aes(label = scales::percent(perc)),
        position = position_dodge(width = .9), # centre the label on its bar
        vjust = -0.5,                          # lift the label above the bar
        size = 3
      ) +
      scale_y_continuous(labels = scales::percent) +
      theme(axis.text = element_text(angle = 90)) +
      ggtitle("DRUG CONSUMTION 12 y.o - 65+ y.o")

The most popular drug by age

# For each age group, keep the row(s) whose "use" value is the maximum.
newdata %>%
  group_by(age) %>%
  filter(value == max(value)) %>%
  arrange(age) %>%
  select(age, drug, value)

## # A tibble: 17 x 3
## # Groups:   age [17]
##    age   drug    value
##    <fct> <chr>   <dbl>
##  1 12    alcohol   3.9
##  2 13    alcohol   8.5
##  3 14    alcohol  18.1
##  4 15    alcohol  29.2
##  5 16    alcohol  40.1
##  6 17    alcohol  49.3
##  7 18    alcohol  58.7
##  8 19    alcohol  64.6
##  9 20    alcohol  69.7
## 10 21    alcohol  83.2
## 11 22-23 alcohol  84.2
## 12 24-25 alcohol  83.1
## 13 26-29 alcohol  80.7
## 14 30-34 alcohol  77.5
## 15 35-49 alcohol  75  
## 16 50-64 alcohol  67.2
## 17 65+   alcohol  49.3

The least popular drug by age

# For each age group, keep the row(s) whose "use" value is the minimum;
# ties are all retained, so an age group can contribute several rows.
newdata %>%
  group_by(age) %>%
  filter(value == min(value)) %>%
  arrange(age) %>%
  select(age, drug, value)

## # A tibble: 33 x 3
## # Groups:   age [17]
##    age   drug   value
##    <fct> <chr>  <dbl>
##  1 12    crack    0  
##  2 12    meth     0  
##  3 13    crack    0  
##  4 13    heroin   0  
##  5 14    crack    0  
##  6 15    crack    0.1
##  7 16    crack    0  
##  8 17    crack    0.1
##  9 17    heroin   0.1
## 10 18    crack    0.4
## # ... with 23 more rows

Frequency of drug consumption

# Keep only the "frequency" measure and sort the rows by drug name.
newdata1 <- arrange(filter(data3, var == "frequency"), drug)
head(newdata1)

##   age    n    drug       var value
## 1  12 2798 alcohol frequency     3
## 2  13 2757 alcohol frequency     6
## 3  14 2792 alcohol frequency     5
## 4  15 2956 alcohol frequency     6
## 5  16 3058 alcohol frequency    10
## 6  17 3038 alcohol frequency    13

# Mean, median and standard deviation of the "frequency" values across all
# drugs, per age group. Frequency cells that held "-" in the CSV were read as
# NA; na.rm = TRUE skips them so that a single missing drug does not turn the
# whole age group's statistics into NA/NaN.
newdata1 %>%
  group_by(age) %>%
  summarise(
    mean_frequency = mean(value, na.rm = TRUE),
    median_frequency = median(value, na.rm = TRUE),
    sd_frequency = sd(value, na.rm = TRUE)
  )

## # A tibble: 17 x 4
##    age   mean_frequency median_frequency sd_frequency
##    <fct>          <dbl>            <dbl>        <dbl>
##  1 12             NA                  NA       NaN   
##  2 13             NA                  NA       NaN   
##  3 14             NA                  NA       NaN   
##  4 15              9.15                6         8.66
##  5 16             14.7                 7        18.9 
##  6 17             17.8                 9        19.4 
##  7 18             15.8                10        15.6 
##  8 19             33.1                 6        53.5 
##  9 20             17.8                10        19.5 
## 10 21             16.8                10        17.3 
## 11 22-23          25.5                15        22.3 
## 12 24-25          23.8                15        26.3 
## 13 26-29          19.2                10        19.6 
## 14 30-34          28.5                15        25.5 
## 15 35-49          48                  15        75.2 
## 16 50-64          37.3                36        27.0 
## 17 65+            NA                  NA       NaN

# Sum the "frequency" values per drug over all age groups. na.rm = TRUE is
# forwarded to sum(): without it, any drug with a missing frequency in even
# one age group gets an NA total and is then dropped from the plot.
total_frequency <- aggregate(
  newdata1$value,
  by = list(DRUG = newdata1$drug),
  FUN = sum,
  na.rm = TRUE
)

# aggregate() names the summed column "x"; give it a meaningful name.
total_frequency <- dplyr::rename(total_frequency, "Total" = "x")
total_frequency

##             DRUG Total
## 1        alcohol 567.0
## 2        cocaine    NA
## 3          crack    NA
## 4   hallucinogen 143.0
## 5         heroin    NA
## 6       inhalant    NA
## 7      marijuana 730.0
## 8           meth    NA
## 9      oxycontin    NA
## 10 pain releiver 250.0
## 11      sedative 329.5
## 12     stimulant 529.5
## 13  tranquilizer 199.5

# Bar chart of the summed frequency value for each drug.
ggplot(total_frequency, aes(x = DRUG, y = Total, fill = DRUG)) +
  geom_col() +
  labs(title = "DRUG CONSUMTION FREQUENCY", x = "DRUG", y = "Frequency") +
  theme(axis.text = element_text(angle = 45))

## Warning: Removed 6 rows containing missing values (position_stack).

Let’s further investigate alcohol and marijuana consumption

Alcohol

# All rows (both "use" and "frequency") for alcohol.
alcohol <- filter(data3, drug == "alcohol")
head(alcohol, 5)

##   age    n    drug       var value
## 1  12 2798 alcohol       use   3.9
## 2  12 2798 alcohol frequency   3.0
## 3  13 2757 alcohol       use   8.5
## 4  13 2757 alcohol frequency   6.0
## 5  14 2792 alcohol       use  18.1

# Alcohol "use" and "frequency" values by age group, coloured by measure.
ggplot(alcohol, aes(x = factor(age), y = value, colour = var, size = 1)) +
  geom_point() +
  labs(x = "age", title = "Alchohol") +
  theme(axis.text = element_text(angle = 45))

Based on the data visualisation, alcohol use is highest between ages 21 and 25 and then declines slightly with age. Interestingly, the frequency of alcohol use reaches its peak at age 21 and remains almost unchanged for the rest of life.

Marijuana

# All rows (both "use" and "frequency") for marijuana.
marijuana <- filter(data3, drug == "marijuana")
head(marijuana, 5)

##   age    n      drug       var value
## 1  12 2798 marijuana       use   1.1
## 2  12 2798 marijuana frequency   4.0
## 3  13 2757 marijuana       use   3.4
## 4  13 2757 marijuana frequency  15.0
## 5  14 2792 marijuana       use   8.7

# Marijuana "use" and "frequency" values by age group, coloured by measure.
ggplot(marijuana, aes(x = factor(age), y = value, colour = var, size = 1)) +
  geom_point() +
  labs(x = "age", title = "Marijuana") +
  theme(axis.text = element_text(angle = 45))

Marijuana smoking reaches its peak between ages 18 and 21 and then declines slightly. Interestingly, the highest frequency is observed in the 30-34 age group.

I was surprised that 12-year-olds were included in the data, so I decided to check which drugs are most commonly used in that age group.

# All drug rows for the 12-year-old age group.
age_12 <- filter(data3, age == "12")

# "use" and "frequency" values of every drug for the 12-year-old group,
# coloured by measure. The x axis shows drugs, so it is labelled "drug"
# (it was previously mislabelled "age").
ggplot(age_12, aes(x = factor(drug), y = value, colour = var, size = 1)) +
  geom_point() +
  labs(x = "drug", title = "12 years old") +
  theme(axis.text = element_text(angle = 45))

## Warning: Removed 2 rows containing missing values (geom_point).

Let’s now review the 65+ age group.

# All drug rows for the "65+" age group.
age_65 <- filter(data3, age == "65+")

# "use" and "frequency" values of every drug for the 65+ group, coloured by
# measure. The x axis shows drugs, so it is labelled "drug" (it was
# previously mislabelled "age").
ggplot(age_65, aes(x = factor(drug), y = value, colour = var, size = 1)) +
  geom_point() +
  labs(x = "drug", title = "65+ years old") +
  theme(axis.text = element_text(angle = 45))

## Warning: Removed 5 rows containing missing values (geom_point).