library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(ggplot2)
# Location of the raw "drug use by age" CSV file
url <- "https://raw.githubusercontent.com/uplotnik/Data607/master/drug%20use%20by%20age.csv"
# Load the CSV; cells holding "-" or "NA" are read as missing values
dataset <- read.csv(
  url,
  header = TRUE,
  sep = ",",
  na.strings = c("-", "NA")
)
head(dataset, 10)
## age n alcohol.use alcohol.frequency marijuana.use
## 1 12 2798 3.9 3 1.1
## 2 13 2757 8.5 6 3.4
## 3 14 2792 18.1 5 8.7
## 4 15 2956 29.2 6 14.5
## 5 16 3058 40.1 10 22.5
## 6 17 3038 49.3 13 28.0
## 7 18 2469 58.7 24 33.7
## 8 19 2223 64.6 36 33.4
## 9 20 2271 69.7 48 34.0
## 10 21 2354 83.2 52 33.0
## marijuana.frequency cocaine.use cocaine.frequency crack.use
## 1 4 0.1 5.0 0.0
## 2 15 0.1 1.0 0.0
## 3 24 0.1 5.5 0.0
## 4 25 0.5 4.0 0.1
## 5 30 1.0 7.0 0.0
## 6 36 2.0 5.0 0.1
## 7 52 3.2 5.0 0.4
## 8 60 4.1 5.5 0.5
## 9 60 4.9 8.0 0.6
## 10 52 4.8 5.0 0.5
## crack.frequency heroin.use heroin.frequency hallucinogen.use
## 1 NA 0.1 35.5 0.2
## 2 3.0 0.0 NA 0.6
## 3 NA 0.1 2.0 1.6
## 4 9.5 0.2 1.0 2.1
## 5 1.0 0.1 66.5 3.4
## 6 21.0 0.1 64.0 4.8
## 7 10.0 0.4 46.0 7.0
## 8 2.0 0.5 180.0 8.6
## 9 5.0 0.9 45.0 7.4
## 10 17.0 0.6 30.0 6.3
## hallucinogen.frequency inhalant.use inhalant.frequency
## 1 52 1.6 19.0
## 2 6 2.5 12.0
## 3 3 2.6 5.0
## 4 4 2.5 5.5
## 5 3 3.0 3.0
## 6 3 2.0 4.0
## 7 4 1.8 4.0
## 8 3 1.4 3.0
## 9 2 1.5 4.0
## 10 4 1.4 2.0
## pain.releiver.use pain.releiver.frequency oxycontin.use
## 1 2.0 36 0.1
## 2 2.4 14 0.1
## 3 3.9 12 0.4
## 4 5.5 10 0.8
## 5 6.2 7 1.1
## 6 8.5 9 1.4
## 7 9.2 12 1.7
## 8 9.4 12 1.5
## 9 10.0 10 1.7
## 10 9.0 15 1.3
## oxycontin.frequency tranquilizer.use tranquilizer.frequency
## 1 24.5 0.2 52.0
## 2 41.0 0.3 25.5
## 3 4.5 0.9 5.0
## 4 3.0 2.0 4.5
## 5 4.0 2.4 11.0
## 6 6.0 3.5 7.0
## 7 7.0 4.9 12.0
## 8 7.5 4.2 4.5
## 9 12.0 5.4 10.0
## 10 13.5 3.9 7.0
## stimulant.use stimulant.frequency meth.use meth.frequency sedative.use
## 1 0.2 2.0 0.0 NA 0.2
## 2 0.3 4.0 0.1 5.0 0.1
## 3 0.8 12.0 0.1 24.0 0.2
## 4 1.5 6.0 0.3 10.5 0.4
## 5 1.8 9.5 0.3 36.0 0.2
## 6 2.8 9.0 0.6 48.0 0.5
## 7 3.0 8.0 0.5 12.0 0.4
## 8 3.3 6.0 0.4 105.0 0.3
## 9 4.0 12.0 0.9 12.0 0.5
## 10 4.1 10.0 0.6 2.0 0.3
## sedative.frequency
## 1 13.0
## 2 19.0
## 3 16.5
## 4 30.0
## 5 3.0
## 6 6.5
## 7 10.0
## 8 6.0
## 9 4.0
## 10 9.0
# Rename the two "pain.releiver" columns so each name contains a single "."
# between the drug name and the measure (use / frequency)
dataset <- dataset %>%
  rename(
    "pain releiver.use" = pain.releiver.use,
    "pain releiver.frequency" = pain.releiver.frequency
  )
# Reshape to long format (one row per age group and drug measure),
# then order the rows by age group
data2 <- dataset %>%
  gather(key = "drug", value = "value", alcohol.use:sedative.frequency) %>%
  arrange(age)
head(data2, 5)
## age n drug value
## 1 12 2798 alcohol.use 3.9
## 2 12 2798 alcohol.frequency 3.0
## 3 12 2798 marijuana.use 1.1
## 4 12 2798 marijuana.frequency 4.0
## 5 12 2798 cocaine.use 0.1
# Split the combined "drug.measure" label at the "." into two columns:
# `drug` (substance name) and `var` (the measure name)
data3 <- separate(
  data2,
  col = drug,
  into = c("drug", "var"),
  sep = "\\."
)
head(data3, 5)
## age n drug var value
## 1 12 2798 alcohol use 3.9
## 2 12 2798 alcohol frequency 3.0
## 3 12 2798 marijuana use 1.1
## 4 12 2798 marijuana frequency 4.0
## 5 12 2798 cocaine use 0.1
# Keep only the "use" rows and order them by drug name
use_rows <- filter(data3, var == "use")
newdata <- arrange(use_rows, drug)
rm(use_rows)
head(newdata, 5)
## age n drug var value
## 1 12 2798 alcohol use 3.9
## 2 13 2757 alcohol use 8.5
## 3 14 2792 alcohol use 18.1
## 4 15 2956 alcohol use 29.2
## 5 16 3058 alcohol use 40.1
# Mean, median and standard deviation of the "use" values within each age group
newdata %>%
  group_by(age) %>%
  summarise(
    mean_usage = mean(value),
    median_usage = median(value),
    sd_usage = sd(value)
  )
## # A tibble: 17 x 4
## age mean_usage median_usage sd_usage
## <fct> <dbl> <dbl> <dbl>
## 1 12 0.746 0.2 1.15
## 2 13 1.42 0.3 2.42
## 3 14 2.88 0.8 5.17
## 4 15 4.58 1.5 8.36
## 5 16 6.32 1.8 11.8
## 6 17 7.97 2 14.5
## 7 18 9.61 3 17.2
## 8 19 10.2 3.3 18.6
## 9 20 10.9 4 19.8
## 10 21 11.5 3.9 23.2
## 11 22-23 11.2 3.6 23.2
## 12 24-25 10.5 2.6 22.8
## 13 26-29 9.73 2.3 22.0
## 14 30-34 8.59 1.4 21.2
## 15 35-49 7.38 0.6 20.5
## 16 50-64 6.26 0.4 18.4
## 17 65+ 3.95 0 13.6
# Total "use" value per drug (summed over all age groups) and each drug's
# share of the grand total. One grouped summarise yields the DRUG, Total and
# perc columns directly, with one row per drug.
total_drug <- newdata %>%
  group_by(DRUG = drug) %>%
  summarise(Total = sum(value)) %>%
  mutate(perc = Total / sum(Total)) # summarise() dropped the grouping, so this is the overall share
head(total_drug, 5)
## # A tibble: 5 x 3
## DRUG Total perc
## <chr> <dbl> <dbl>
## 1 alcohol 942. 0.586
## 2 cocaine 37 0.0230
## 3 crack 5 0.00311
## 4 hallucinogen 57.7 0.0359
## 5 heroin 6 0.00373
# Bar chart of each drug's share of the total "use" values, with the
# percentage printed above every bar
ggplot(total_drug, aes(x = DRUG, y = perc, fill = DRUG, label = scales::percent(perc))) +
  geom_col() + # one bar per drug, so no dodging is needed
  geom_text(
    vjust = -0.5, # nudge above top of bar
    size = 3
  ) +
  scale_y_continuous(labels = scales::percent) +
  theme(axis.text.x = element_text(angle = 90)) + # rotate only the drug names, not the y ticks
  ggtitle("DRUG CONSUMPTION 12 y.o - 65+ y.o")
# For every age group, keep the drug(s) with the highest "use" value
newdata %>%
  group_by(age) %>%
  filter(value == max(value)) %>%
  arrange(age) %>%
  select(age, drug, value)
## # A tibble: 17 x 3
## # Groups: age [17]
## age drug value
## <fct> <chr> <dbl>
## 1 12 alcohol 3.9
## 2 13 alcohol 8.5
## 3 14 alcohol 18.1
## 4 15 alcohol 29.2
## 5 16 alcohol 40.1
## 6 17 alcohol 49.3
## 7 18 alcohol 58.7
## 8 19 alcohol 64.6
## 9 20 alcohol 69.7
## 10 21 alcohol 83.2
## 11 22-23 alcohol 84.2
## 12 24-25 alcohol 83.1
## 13 26-29 alcohol 80.7
## 14 30-34 alcohol 77.5
## 15 35-49 alcohol 75
## 16 50-64 alcohol 67.2
## 17 65+ alcohol 49.3
# For every age group, keep the drug(s) with the lowest "use" value
# (ties produce several rows per age group)
newdata %>%
  group_by(age) %>%
  filter(value == min(value)) %>%
  arrange(age) %>%
  select(age, drug, value)
## # A tibble: 33 x 3
## # Groups: age [17]
## age drug value
## <fct> <chr> <dbl>
## 1 12 crack 0
## 2 12 meth 0
## 3 13 crack 0
## 4 13 heroin 0
## 5 14 crack 0
## 6 15 crack 0.1
## 7 16 crack 0
## 8 17 crack 0.1
## 9 17 heroin 0.1
## 10 18 crack 0.4
## # ... with 23 more rows
# Keep only the "frequency" rows and order them by drug name
newdata1 <- arrange(
  filter(data3, var == "frequency"),
  drug
)
head(newdata1)
## age n drug var value
## 1 12 2798 alcohol frequency 3
## 2 13 2757 alcohol frequency 6
## 3 14 2792 alcohol frequency 5
## 4 15 2956 alcohol frequency 6
## 5 16 3058 alcohol frequency 10
## 6 17 3038 alcohol frequency 13
# Mean, median and standard deviation of the "frequency" values within each
# age group. Some frequency cells are missing (read in as NA), so na.rm = TRUE
# is required; without it every age group containing an NA summarises to NA/NaN.
# NOTE: the printed output below was rendered before na.rm was added; re-knit to refresh it.
newdata1 %>%
  group_by(age) %>%
  summarise(
    mean_frequency = mean(value, na.rm = TRUE),
    median_frequency = median(value, na.rm = TRUE),
    sd_frequency = sd(value, na.rm = TRUE)
  )
## # A tibble: 17 x 4
## age mean_frequency median_frequency sd_frequency
## <fct> <dbl> <dbl> <dbl>
## 1 12 NA NA NaN
## 2 13 NA NA NaN
## 3 14 NA NA NaN
## 4 15 9.15 6 8.66
## 5 16 14.7 7 18.9
## 6 17 17.8 9 19.4
## 7 18 15.8 10 15.6
## 8 19 33.1 6 53.5
## 9 20 17.8 10 19.5
## 10 21 16.8 10 17.3
## 11 22-23 25.5 15 22.3
## 12 24-25 23.8 15 26.3
## 13 26-29 19.2 10 19.6
## 14 30-34 28.5 15 25.5
## 15 35-49 48 15 75.2
## 16 50-64 37.3 36 27.0
## 17 65+ NA NA NaN
# Sum of the "frequency" values per drug across all age groups.
# na.rm = TRUE is forwarded by aggregate() to sum(); without it any drug with
# a single missing frequency gets a total of NA.
# NOTE: the printed output below was rendered before na.rm was added; re-knit to refresh it.
total_frequency <- aggregate(
  newdata1$value,
  by = list(DRUG = newdata1$drug),
  FUN = sum,
  na.rm = TRUE
)
# rename column
total_frequency <- dplyr::rename(total_frequency, "Total" = "x")
total_frequency
## DRUG Total
## 1 alcohol 567.0
## 2 cocaine NA
## 3 crack NA
## 4 hallucinogen 143.0
## 5 heroin NA
## 6 inhalant NA
## 7 marijuana 730.0
## 8 meth NA
## 9 oxycontin NA
## 10 pain releiver 250.0
## 11 sedative 329.5
## 12 stimulant 529.5
## 13 tranquilizer 199.5
# Bar chart of the summed "frequency" value for each drug
ggplot(total_frequency, aes(x = DRUG, y = Total, fill = DRUG)) +
  geom_col() + # same as geom_bar(stat = "identity")
  ggtitle("DRUG CONSUMPTION FREQUENCY") +
  theme(axis.text.x = element_text(angle = 45)) + # rotate only the drug names, not the y ticks
  labs(x = "DRUG", y = "Frequency")
## Warning: Removed 6 rows containing missing values (position_stack).
Let’s further investigate alcohol and marijuana consumption.
# Subset of the long-format data for alcohol only (both use and frequency rows)
alcohol <- filter(data3, drug == "alcohol")
head(alcohol, 5)
## age n drug var value
## 1 12 2798 alcohol use 3.9
## 2 12 2798 alcohol frequency 3.0
## 3 13 2757 alcohol use 8.5
## 4 13 2757 alcohol frequency 6.0
## 5 14 2792 alcohol use 18.1
# Alcohol use and frequency values by age group, coloured by measure.
# Point size is a fixed value, so it is set on the layer rather than inside
# aes(), where it would add a meaningless "size" legend.
ggplot(alcohol, aes(x = factor(age), y = value, colour = var)) +
  geom_point(size = 3) +
  xlab("age") +
  ggtitle("Alcohol") +
  theme(axis.text.x = element_text(angle = 45)) # rotate only the age labels
Based on the data visualisation, alcohol use is highest between ages 21 and 25 and then declines slightly with age. Interestingly, the frequency of alcohol use reaches its peak at age 21 and then remains almost unchanged for the rest of life.
# Subset of the long-format data for marijuana only (both use and frequency rows)
marijuana <- filter(data3, drug == "marijuana")
head(marijuana, 5)
## age n drug var value
## 1 12 2798 marijuana use 1.1
## 2 12 2798 marijuana frequency 4.0
## 3 13 2757 marijuana use 3.4
## 4 13 2757 marijuana frequency 15.0
## 5 14 2792 marijuana use 8.7
# Marijuana use and frequency values by age group, coloured by measure.
# Point size is a fixed value, so it is set on the layer rather than inside
# aes(), where it would add a meaningless "size" legend.
ggplot(marijuana, aes(x = factor(age), y = value, colour = var)) +
  geom_point(size = 3) +
  xlab("age") +
  ggtitle("Marijuana") +
  theme(axis.text.x = element_text(angle = 45)) # rotate only the age labels
Marijuana use reaches its peak between ages 18 and 21 and then declines slightly. Interestingly, the frequency of use is highest in the 30-34 age group.
I was surprised that 12-year-old children were included in the data, so I decided to check which drugs they most commonly use.
# All drug measures for the 12-year-old age group
age_12 <- data3 %>%
  filter(age == "12")
# Use and frequency values per drug for 12-year-olds. The x axis shows drugs,
# so it is labelled "drug"; point size is a fixed value set on the layer
# instead of being mapped in aes().
ggplot(age_12, aes(x = factor(drug), y = value, colour = var)) +
  geom_point(size = 3) +
  xlab("drug") +
  ggtitle("12 years old") +
  theme(axis.text.x = element_text(angle = 45)) # rotate only the drug names
## Warning: Removed 2 rows containing missing values (geom_point).
Let’s now review the 65+ age group.
# All drug measures for the 65+ age group
age_65 <- data3 %>%
  filter(age == "65+")
# Use and frequency values per drug for the 65+ group. The x axis shows drugs,
# so it is labelled "drug"; point size is a fixed value set on the layer
# instead of being mapped in aes().
ggplot(age_65, aes(x = factor(drug), y = value, colour = var)) +
  geom_point(size = 3) +
  xlab("drug") +
  ggtitle("65+ years old") +
  theme(axis.text.x = element_text(angle = 45)) # rotate only the drug names
## Warning: Removed 5 rows containing missing values (geom_point).