# Quantitative univariate visualisation
# Load and wrangle - Youtube data
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Read the Youtube data set from its CSV file into a data frame.
Youtube <- read.csv(
  file = "C:/Users/Jason/Desktop/Grad Cert Data Science/3. Data Visualisation/data/Youtube.csv"
)
# Histogram of duration with the default binning (30 bins), which handles
# this variable poorly.
p6 <- ggplot(data = Youtube, mapping = aes(x = duration))
p6 + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Attach the histogram layer to p6 and build the plot, so that the limits
# (xmin/xmax) and the count of every bin can be read from the computed data.
p6 <- p6 + geom_histogram()
hist <- ggplot_build(plot = p6)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist$data
## [[1]]
## y count x xmin xmax density ncount
## 1 21654 21654 0.0000 -445.5862 445.5862 9.653303e-04 1.000000e+00
## 2 2948 2948 891.1724 445.5862 1336.7586 1.314212e-04 1.361411e-01
## 3 303 303 1782.3448 1336.7586 2227.9310 1.350767e-05 1.399280e-02
## 4 125 125 2673.5172 2227.9310 3119.1034 5.572471e-06 5.772606e-03
## 5 61 61 3564.6897 3119.1034 4010.2759 2.719366e-06 2.817031e-03
## 6 27 27 4455.8621 4010.2759 4901.4483 1.203654e-06 1.246883e-03
## 7 17 17 5347.0345 4901.4483 5792.6207 7.578560e-07 7.850744e-04
## 8 6 6 6238.2069 5792.6207 6683.7931 2.674786e-07 2.770851e-04
## 9 14 14 7129.3793 6683.7931 7574.9655 6.241167e-07 6.465318e-04
## 10 7 7 8020.5517 7574.9655 8466.1379 3.120584e-07 3.232659e-04
## 11 0 0 8911.7241 8466.1379 9357.3103 0.000000e+00 0.000000e+00
## 12 4 4 9802.8966 9357.3103 10248.4828 1.783191e-07 1.847234e-04
## 13 1 1 10694.0690 10248.4828 11139.6552 4.457977e-08 4.618084e-05
## 14 0 0 11585.2414 11139.6552 12030.8276 0.000000e+00 0.000000e+00
## 15 0 0 12476.4138 12030.8276 12922.0000 0.000000e+00 0.000000e+00
## 16 0 0 13367.5862 12922.0000 13813.1724 0.000000e+00 0.000000e+00
## 17 1 1 14258.7586 13813.1724 14704.3448 4.457977e-08 4.618084e-05
## 18 0 0 15149.9310 14704.3448 15595.5172 0.000000e+00 0.000000e+00
## 19 1 1 16041.1034 15595.5172 16486.6897 4.457977e-08 4.618084e-05
## 20 0 0 16932.2759 16486.6897 17377.8621 0.000000e+00 0.000000e+00
## 21 1 1 17823.4483 17377.8621 18269.0345 4.457977e-08 4.618084e-05
## 22 0 0 18714.6207 18269.0345 19160.2069 0.000000e+00 0.000000e+00
## 23 0 0 19605.7931 19160.2069 20051.3793 0.000000e+00 0.000000e+00
## 24 0 0 20496.9655 20051.3793 20942.5517 0.000000e+00 0.000000e+00
## 25 0 0 21388.1379 20942.5517 21833.7241 0.000000e+00 0.000000e+00
## 26 0 0 22279.3103 21833.7241 22724.8966 0.000000e+00 0.000000e+00
## 27 0 0 23170.4828 22724.8966 23616.0690 0.000000e+00 0.000000e+00
## 28 0 0 24061.6552 23616.0690 24507.2414 0.000000e+00 0.000000e+00
## 29 0 0 24952.8276 24507.2414 25398.4138 0.000000e+00 0.000000e+00
## 30 1 1 25844.0000 25398.4138 26289.5862 4.457977e-08 4.618084e-05
## ndensity flipped_aes PANEL group ymin ymax colour fill size linetype
## 1 1.000000e+00 FALSE 1 -1 0 21654 NA grey35 0.5 1
## 2 1.361411e-01 FALSE 1 -1 0 2948 NA grey35 0.5 1
## 3 1.399280e-02 FALSE 1 -1 0 303 NA grey35 0.5 1
## 4 5.772606e-03 FALSE 1 -1 0 125 NA grey35 0.5 1
## 5 2.817031e-03 FALSE 1 -1 0 61 NA grey35 0.5 1
## 6 1.246883e-03 FALSE 1 -1 0 27 NA grey35 0.5 1
## 7 7.850744e-04 FALSE 1 -1 0 17 NA grey35 0.5 1
## 8 2.770851e-04 FALSE 1 -1 0 6 NA grey35 0.5 1
## 9 6.465318e-04 FALSE 1 -1 0 14 NA grey35 0.5 1
## 10 3.232659e-04 FALSE 1 -1 0 7 NA grey35 0.5 1
## 11 0.000000e+00 FALSE 1 -1 0 0 NA grey35 0.5 1
## 12 1.847234e-04 FALSE 1 -1 0 4 NA grey35 0.5 1
## 13 4.618084e-05 FALSE 1 -1 0 1 NA grey35 0.5 1
## 14 0.000000e+00 FALSE 1 -1 0 0 NA grey35 0.5 1
## 15 0.000000e+00 FALSE 1 -1 0 0 NA grey35 0.5 1
## 16 0.000000e+00 FALSE 1 -1 0 0 NA grey35 0.5 1
## 17 4.618084e-05 FALSE 1 -1 0 1 NA grey35 0.5 1
## 18 0.000000e+00 FALSE 1 -1 0 0 NA grey35 0.5 1
## 19 4.618084e-05 FALSE 1 -1 0 1 NA grey35 0.5 1
## 20 0.000000e+00 FALSE 1 -1 0 0 NA grey35 0.5 1
## 21 4.618084e-05 FALSE 1 -1 0 1 NA grey35 0.5 1
## 22 0.000000e+00 FALSE 1 -1 0 0 NA grey35 0.5 1
## 23 0.000000e+00 FALSE 1 -1 0 0 NA grey35 0.5 1
## 24 0.000000e+00 FALSE 1 -1 0 0 NA grey35 0.5 1
## 25 0.000000e+00 FALSE 1 -1 0 0 NA grey35 0.5 1
## 26 0.000000e+00 FALSE 1 -1 0 0 NA grey35 0.5 1
## 27 0.000000e+00 FALSE 1 -1 0 0 NA grey35 0.5 1
## 28 0.000000e+00 FALSE 1 -1 0 0 NA grey35 0.5 1
## 29 0.000000e+00 FALSE 1 -1 0 0 NA grey35 0.5 1
## 30 4.618084e-05 FALSE 1 -1 0 1 NA grey35 0.5 1
## alpha
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
## 7 NA
## 8 NA
## 9 NA
## 10 NA
## 11 NA
## 12 NA
## 13 NA
## 14 NA
## 15 NA
## 16 NA
## 17 NA
## 18 NA
## 19 NA
## 20 NA
## 21 NA
## 22 NA
## 23 NA
## 24 NA
## 25 NA
## 26 NA
## 27 NA
## 28 NA
## 29 NA
## 30 NA
# A box plot shows whether outliers are what is distorting the scale.
p7 <- ggplot(
  data = Youtube,
  mapping = aes(x = factor(1), y = duration)
)
p7 + geom_boxplot(width = 0.25)
# Filter out the outliers flagged by the box plot. With its default
# coef = 1.5, geom_boxplot() draws each whisker to the most extreme observation
# that lies within 1.5 * IQR of the hinge, i.e. no lower than Q1 - 1.5 * IQR
# and no higher than Q3 + 1.5 * IQR; anything beyond is an outlier.
# Use the ggplot_build() function to identify where the whiskers end.
p7 <- p7 + geom_boxplot(width = .25)
box <- ggplot_build(p7)
box$data[[1]][1:5]
## ymin lower middle upper ymax
## 1 1 52 139 281 624
# ymin and ymax are the ends of the lower and upper whiskers (1 and 624 in the
# output above). They are observed, non-outlying values, so the filter must
# keep them: between() is inclusive at both ends. Reading the limits from the
# built plot instead of typing them in keeps the filter in step with the data.
whiskers <- box$data[[1]]
Youtube_clean <- filter(
  Youtube,
  between(duration, whiskers$ymin, whiskers$ymax)
)
# Box plot of the filtered data.
p8 <- ggplot(
  data = Youtube_clean,
  mapping = aes(x = factor(1), y = duration)
)
p8 + geom_boxplot(width = 0.25)
# Histogram of the filtered data, first with the default binning.
p9 <- ggplot(data = Youtube_clean, mapping = aes(x = duration))
p9 + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Better views of the same data: a finer histogram (100 bins, white bar
# outlines), followed by a filled density estimate.
p9 + geom_histogram(bins = 100, colour = "white")
p9 + geom_density(fill = "grey")
# Overlay histogram and density plot
p9 +
  geom_density(fill = "grey") +
  geom_histogram(
    aes(x = duration, y = after_stat(density)),
    colour = "white", alpha = 1 / 2, bins = 100
  )
# To ensure the histogram and density plot share the same scale, the histogram's
# y aesthetic is mapped to the computed density with after_stat(density) (this
# replaces the older ..density.. notation, deprecated since ggplot2 3.4.0). The
# alpha option makes the histogram transparent. The density estimate smooths
# out the minor peaks and troughs that are apparent in the histogram.
# Additional transparency and colour on the density layer help differentiate
# the two plot types:
p9 +
  geom_density(fill = "dodgerblue", alpha = 1 / 2) +
  geom_histogram(
    aes(x = duration, y = after_stat(density)),
    colour = "white", alpha = 1 / 2, bins = 100
  )
# Add markers and annotations for mean and median.
# The histogram is drawn on the density scale via after_stat(density), the
# current replacement for the deprecated ..density.. notation, so that it
# shares the y axis with the density estimate.
p9 <- p9 +
  geom_density(fill = "dodgerblue", alpha = 1 / 2) +
  geom_histogram(
    aes(x = duration, y = after_stat(density)),
    colour = "white", alpha = 1 / 2, bins = 100
  ) +
  # Solid vertical line at the median, labelled beside it.
  geom_vline(xintercept = median(Youtube_clean$duration)) +
  annotate("text", label = "Median", x = 190, y = 0.006) +
  # Dashed vertical line (linetype 2) at the mean, labelled beside it.
  geom_vline(xintercept = mean(Youtube_clean$duration), linetype = 2) +
  annotate("text", label = "Mean", x = 240, y = 0.004)
p9
# Violin plot of the filtered durations.
p10 <- ggplot(
  data = Youtube_clean,
  mapping = aes(x = factor(1), y = duration)
)
p10 + geom_violin(fill = "grey", width = 0.25)
# Draw a semi-transparent box plot over the violin to see how each offers a
# different perspective of the same data.
p10 <- ggplot(
  data = Youtube_clean,
  mapping = aes(x = factor(1), y = duration)
)
p10 +
  geom_violin(fill = "grey", width = 0.25) +
  geom_boxplot(alpha = 0.25, width = 0.25)
# Stacked dot plot
p11 <- ggplot(data = Youtube_clean, mapping = aes(x = duration))
p11 + geom_dotplot()
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
# Set the bin width explicitly and hide the y axis. The sample is still too
# large for a dot plot to be useful.
p11 <- ggplot(data = Youtube_clean, mapping = aes(x = duration))
p11 +
  geom_dotplot(binwidth = 10) +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
# Dot plot of a random sample of 50 rows, with the y axis hidden.
set.seed(462243) # Fixed seed so the sampled plot can be replicated
p11 <- ggplot(
  data = sample_n(Youtube_clean, 50),
  mapping = aes(x = duration)
)
p11 +
  geom_dotplot() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
# Use the cowplot package to align multiple ggplots together
# It is important to assign all the layers to the plotting object before calling the plot_grid() function from the cowplot package
# First ensure plotting layers are all assigned correctly to the p8 & p9 ggplot objects
# Notice how we set the scale limits to scale_x_continuous(limits = c(0, 800)) for the density plot and scale_y_continuous(limits = c(0, 800)) for the box plot. This ensures the plots are perfectly aligned by sharing a common scale.
# Box plot with every layer assigned to the object; the duration (y) axis is
# limited to 0-800.
p8 <- ggplot(Youtube_clean, aes(x = factor(1), y = duration)) +
  geom_boxplot(width = .50) +
  scale_y_continuous(limits = c(0, 800))
# Density/histogram overlay with median (solid) and mean (dashed) markers; the
# duration (x) axis is limited to the same 0-800 range as the box plot.
# after_stat(density) replaces the deprecated ..density.. notation and puts the
# histogram on the density scale.
p9 <- ggplot(Youtube_clean, aes(x = duration)) +
  geom_density(fill = "dodgerblue", alpha = 1 / 2) +
  geom_histogram(
    aes(x = duration, y = after_stat(density)),
    colour = "white", alpha = 1 / 2, bins = 100
  ) +
  geom_vline(xintercept = median(Youtube_clean$duration)) +
  annotate("text", label = "Median", x = 180, y = 0.006) +
  geom_vline(xintercept = mean(Youtube_clean$duration), linetype = 2) +
  annotate("text", label = "Mean", x = 240, y = 0.004) +
  scale_x_continuous(limits = c(0, 800))
# Now install (if needed) and load the cowplot
#install.packages("cowplot")
library(cowplot)
# cowplot may replace the default ggplot grey theme; set it back explicitly.
theme_set(theme_gray())
# Stack the density/histogram plot above the box plot in a single column. The
# box plot is flipped to run horizontally and has its y-axis title, text and
# ticks removed.
plot_grid(
  p9,
  p8 +
    coord_flip() +
    theme(
      axis.title.y = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank()
    ),
  ncol = 1,
  align = "v",
  rel_heights = c(2, 1)
)
## Warning: Removed 2 rows containing missing values (geom_bar).
#We had to flip the box plot to run horizontally using coord_flip(). We also set a number of layout parameters to juxtapose correctly. ncol sets the number of columns to use in the grid, align sets the alignment as vertical or horizontal, and rel_heights sets the relative heights of each plot. In the code above, we set p9 to be twice the size of p8, the box plot.
#Also, notice how the y-axis label for the box plot was suppressed. This is a nice example of juxtaposing visualisations of the same data. Sometimes this can lead to surprising findings and also a greater understanding behind the nature of different methods.