# Quantitative univariate visualisation
## Load and wrangle - Youtube Data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Read the Youtube data set; its `duration` column is plotted below.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
Youtube <- read.csv(
  file = "C:/Users/Jason/Desktop/Grad Cert Data Science/3. Data Visualisation/data/Youtube.csv"
)

## Histogram

# Histogram of the raw durations; the default binning (30 bins) is a poor
# fit for this variable.
p6 <- ggplot(data = Youtube, mapping = aes(x = duration))
p6 + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Attach the histogram layer to the plot object and build it, so that the
# computed bin edges (xmin/xmax) and the count in each bin can be inspected.
p6 <- p6 + geom_histogram()
hist <- ggplot_build(plot = p6)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist$data
## [[1]]
##        y count          x       xmin       xmax      density       ncount
## 1  21654 21654     0.0000  -445.5862   445.5862 9.653303e-04 1.000000e+00
## 2   2948  2948   891.1724   445.5862  1336.7586 1.314212e-04 1.361411e-01
## 3    303   303  1782.3448  1336.7586  2227.9310 1.350767e-05 1.399280e-02
## 4    125   125  2673.5172  2227.9310  3119.1034 5.572471e-06 5.772606e-03
## 5     61    61  3564.6897  3119.1034  4010.2759 2.719366e-06 2.817031e-03
## 6     27    27  4455.8621  4010.2759  4901.4483 1.203654e-06 1.246883e-03
## 7     17    17  5347.0345  4901.4483  5792.6207 7.578560e-07 7.850744e-04
## 8      6     6  6238.2069  5792.6207  6683.7931 2.674786e-07 2.770851e-04
## 9     14    14  7129.3793  6683.7931  7574.9655 6.241167e-07 6.465318e-04
## 10     7     7  8020.5517  7574.9655  8466.1379 3.120584e-07 3.232659e-04
## 11     0     0  8911.7241  8466.1379  9357.3103 0.000000e+00 0.000000e+00
## 12     4     4  9802.8966  9357.3103 10248.4828 1.783191e-07 1.847234e-04
## 13     1     1 10694.0690 10248.4828 11139.6552 4.457977e-08 4.618084e-05
## 14     0     0 11585.2414 11139.6552 12030.8276 0.000000e+00 0.000000e+00
## 15     0     0 12476.4138 12030.8276 12922.0000 0.000000e+00 0.000000e+00
## 16     0     0 13367.5862 12922.0000 13813.1724 0.000000e+00 0.000000e+00
## 17     1     1 14258.7586 13813.1724 14704.3448 4.457977e-08 4.618084e-05
## 18     0     0 15149.9310 14704.3448 15595.5172 0.000000e+00 0.000000e+00
## 19     1     1 16041.1034 15595.5172 16486.6897 4.457977e-08 4.618084e-05
## 20     0     0 16932.2759 16486.6897 17377.8621 0.000000e+00 0.000000e+00
## 21     1     1 17823.4483 17377.8621 18269.0345 4.457977e-08 4.618084e-05
## 22     0     0 18714.6207 18269.0345 19160.2069 0.000000e+00 0.000000e+00
## 23     0     0 19605.7931 19160.2069 20051.3793 0.000000e+00 0.000000e+00
## 24     0     0 20496.9655 20051.3793 20942.5517 0.000000e+00 0.000000e+00
## 25     0     0 21388.1379 20942.5517 21833.7241 0.000000e+00 0.000000e+00
## 26     0     0 22279.3103 21833.7241 22724.8966 0.000000e+00 0.000000e+00
## 27     0     0 23170.4828 22724.8966 23616.0690 0.000000e+00 0.000000e+00
## 28     0     0 24061.6552 23616.0690 24507.2414 0.000000e+00 0.000000e+00
## 29     0     0 24952.8276 24507.2414 25398.4138 0.000000e+00 0.000000e+00
## 30     1     1 25844.0000 25398.4138 26289.5862 4.457977e-08 4.618084e-05
##        ndensity flipped_aes PANEL group ymin  ymax colour   fill size linetype
## 1  1.000000e+00       FALSE     1    -1    0 21654     NA grey35  0.5        1
## 2  1.361411e-01       FALSE     1    -1    0  2948     NA grey35  0.5        1
## 3  1.399280e-02       FALSE     1    -1    0   303     NA grey35  0.5        1
## 4  5.772606e-03       FALSE     1    -1    0   125     NA grey35  0.5        1
## 5  2.817031e-03       FALSE     1    -1    0    61     NA grey35  0.5        1
## 6  1.246883e-03       FALSE     1    -1    0    27     NA grey35  0.5        1
## 7  7.850744e-04       FALSE     1    -1    0    17     NA grey35  0.5        1
## 8  2.770851e-04       FALSE     1    -1    0     6     NA grey35  0.5        1
## 9  6.465318e-04       FALSE     1    -1    0    14     NA grey35  0.5        1
## 10 3.232659e-04       FALSE     1    -1    0     7     NA grey35  0.5        1
## 11 0.000000e+00       FALSE     1    -1    0     0     NA grey35  0.5        1
## 12 1.847234e-04       FALSE     1    -1    0     4     NA grey35  0.5        1
## 13 4.618084e-05       FALSE     1    -1    0     1     NA grey35  0.5        1
## 14 0.000000e+00       FALSE     1    -1    0     0     NA grey35  0.5        1
## 15 0.000000e+00       FALSE     1    -1    0     0     NA grey35  0.5        1
## 16 0.000000e+00       FALSE     1    -1    0     0     NA grey35  0.5        1
## 17 4.618084e-05       FALSE     1    -1    0     1     NA grey35  0.5        1
## 18 0.000000e+00       FALSE     1    -1    0     0     NA grey35  0.5        1
## 19 4.618084e-05       FALSE     1    -1    0     1     NA grey35  0.5        1
## 20 0.000000e+00       FALSE     1    -1    0     0     NA grey35  0.5        1
## 21 4.618084e-05       FALSE     1    -1    0     1     NA grey35  0.5        1
## 22 0.000000e+00       FALSE     1    -1    0     0     NA grey35  0.5        1
## 23 0.000000e+00       FALSE     1    -1    0     0     NA grey35  0.5        1
## 24 0.000000e+00       FALSE     1    -1    0     0     NA grey35  0.5        1
## 25 0.000000e+00       FALSE     1    -1    0     0     NA grey35  0.5        1
## 26 0.000000e+00       FALSE     1    -1    0     0     NA grey35  0.5        1
## 27 0.000000e+00       FALSE     1    -1    0     0     NA grey35  0.5        1
## 28 0.000000e+00       FALSE     1    -1    0     0     NA grey35  0.5        1
## 29 0.000000e+00       FALSE     1    -1    0     0     NA grey35  0.5        1
## 30 4.618084e-05       FALSE     1    -1    0     1     NA grey35  0.5        1
##    alpha
## 1     NA
## 2     NA
## 3     NA
## 4     NA
## 5     NA
## 6     NA
## 7     NA
## 8     NA
## 9     NA
## 10    NA
## 11    NA
## 12    NA
## 13    NA
## 14    NA
## 15    NA
## 16    NA
## 17    NA
## 18    NA
## 19    NA
## 20    NA
## 21    NA
## 22    NA
## 23    NA
## 24    NA
## 25    NA
## 26    NA
## 27    NA
## 28    NA
## 29    NA
## 30    NA
# A box plot shows whether outliers are what is distorting the scale.
p7 <- ggplot(data = Youtube, mapping = aes(x = factor(1), y = duration))
p7 + geom_boxplot(width = 0.25)

# Filter out the outliers. With its default settings geom_boxplot() treats
# points below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR as outliers.
# Use the ggplot_build() function to recover the statistics behind the box plot.
p7 <- p7 + geom_boxplot(width = .25)
box <- ggplot_build(p7)
box$data[[1]][1:5]
##   ymin lower middle upper ymax
## 1    1    52    139   281  624
# ymin and ymax are the whisker ends: the most extreme observations that still
# lie inside the outlier fences (1 and 624 here). They are not outliers
# themselves.

# Filter the outliers. The bounds are read from the built plot instead of
# being copied by hand from the printed output, and the comparisons are
# inclusive so that observations sitting exactly on a whisker end are kept.
Youtube_clean <- filter(
  Youtube,
  duration >= box$data[[1]]$ymin[1],
  duration <= box$data[[1]]$ymax[1]
)

# Redraw the box plot using the data with outliers filtered out.
p8 <- ggplot(data = Youtube_clean, mapping = aes(x = factor(1), y = duration))
p8 + geom_boxplot(width = 0.25)

# Redraw the histogram on the filtered data.
p9 <- ggplot(data = Youtube_clean, mapping = aes(x = duration))
p9 + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Improve the histogram: use 100 bins and outline each bar in white.
p9 + geom_histogram(bins = 100, colour = "white")

## Density Plot

# Kernel density estimate of the filtered durations.
p9 + geom_density(fill = "grey")

## Overlay Histogram and density plot

# after_stat(density) replaces the deprecated `..density..` notation
# (requires ggplot2 >= 3.3.0).
p9 +
  geom_density(fill = "grey") +
  geom_histogram(
    mapping = aes(x = duration, y = after_stat(density)),
    colour = "white",
    alpha = 1 / 2,
    bins = 100
  )

# To ensure the histogram and density plot share the same scale, we map y to
# after_stat(density) in geom_histogram(). We also use the alpha option to make
# the histogram transparent. As you can see, the density estimate smooths out
# the minor peaks and troughs that are apparent in the histogram. You can add
# some additional transparency and colour to help differentiate the two plot
# types. Use the following code:
p9 +
  geom_density(fill = "dodgerblue", alpha = 1 / 2) +
  geom_histogram(
    mapping = aes(x = duration, y = after_stat(density)),
    colour = "white",
    alpha = 1 / 2,
    bins = 100
  )

# Add markers and annotations for mean and median.
# The histogram is drawn on the density scale via after_stat(density), which
# replaces the deprecated `..density..` notation (requires ggplot2 >= 3.3.0).
p9 <- p9 +
  geom_density(fill = "dodgerblue", alpha = 1 / 2) +
  geom_histogram(
    mapping = aes(x = duration, y = after_stat(density)),
    colour = "white",
    alpha = 1 / 2,
    bins = 100
  ) +
  # Solid line marks the median, dashed line (linetype = 2) marks the mean.
  geom_vline(xintercept = median(Youtube_clean$duration)) +
  annotate("text", label = "Median", x = 190, y = 0.006) +
  geom_vline(xintercept = mean(Youtube_clean$duration), linetype = 2) +
  annotate("text", label = "Mean", x = 240, y = 0.004)
p9

## Violin Plot

# Violin plot of the filtered durations.
p10 <- ggplot(data = Youtube_clean, mapping = aes(x = factor(1), y = duration))
p10 + geom_violin(fill = "grey", width = 0.25)

# Draw a semi-transparent box plot over the violin to see how each offers a
# different perspective of the same data.
p10 <- ggplot(data = Youtube_clean, mapping = aes(x = factor(1), y = duration))
p10 +
  geom_violin(fill = "grey", width = 0.25) +
  geom_boxplot(alpha = 0.25, width = 0.25)

##Stacked Dot Plot

# Stacked dot plot of every filtered duration, using the default bin width.
p11 <- ggplot(data = Youtube_clean, mapping = aes(x = duration))
p11 + geom_dotplot()
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

# Set the bin width explicitly and hide the y axis. The sample is still too
# large for a dot plot to be useful.
p11 <- ggplot(data = Youtube_clean, mapping = aes(x = duration))
p11 +
  geom_dotplot(binwidth = 10) +
  theme(
    axis.text.y = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank()
  )

# Plot a random sample of n = 50 rows instead.
set.seed(462243) # Fix the random seed so the sample (and plot) is reproducible
p11 <- ggplot(data = sample_n(Youtube_clean, 50), mapping = aes(x = duration))
p11 +
  geom_dotplot() +
  theme(
    axis.text.y = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank()
  )
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

## Juxtaposing

# Use the cowplot package to align multiple ggplots together
# It is important to assign all the layers to the plotting object before calling the plot_grid() function from the cowplot package
# First ensure plotting layers are all assigned correctly to the p8 & p9 ggplot objects
# Notice how we set the scale limits to scale_x_continuous(limits = c(0, 800)) for the density plot and scale_y_continuous(limits = c(0, 800)) for the box plot. This ensures the plots are perfectly aligned by sharing a common scale.

# Box plot of the filtered durations; the duration axis is fixed to 0-800.
p8 <- ggplot(Youtube_clean, aes(x = factor(1), y = duration)) +
  geom_boxplot(width = .50) +
  scale_y_continuous(limits = c(0, 800))

# Density curve with a histogram overlay plus median (solid) and mean (dashed)
# markers; the duration axis is fixed to the same 0-800 limits as the box plot.
# after_stat(density) replaces the deprecated `..density..` notation
# (requires ggplot2 >= 3.3.0).
p9 <- ggplot(Youtube_clean, aes(x = duration)) +
  geom_density(fill = "dodgerblue", alpha = 1 / 2) +
  geom_histogram(
    mapping = aes(x = duration, y = after_stat(density)),
    colour = "white",
    alpha = 1 / 2,
    bins = 100
  ) +
  geom_vline(xintercept = median(Youtube_clean$duration)) +
  annotate("text", label = "Median", x = 180, y = 0.006) +
  geom_vline(xintercept = mean(Youtube_clean$duration), linetype = 2) +
  annotate("text", label = "Mean", x = 240, y = 0.004) +
  scale_x_continuous(limits = c(0, 800))

# Now install (if needed) and load the cowplot
#install.packages("cowplot")
library(cowplot)

# Loading cowplot may replace the default ggplot grey theme; restore it here.
theme_set(theme_gray())

# Juxtapose the plots in a single column: p9 on top, then p8 flipped to run
# horizontally with its y-axis title, text and ticks hidden. p9 is given twice
# the height of p8.
plot_grid(
  p9,
  p8 +
    coord_flip() +
    theme(
      axis.title.y = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank()
    ),
  ncol = 1,
  align = "v",
  rel_heights = c(2, 1)
)
## Warning: Removed 2 rows containing missing values (geom_bar).

#We had to flip the box plot to run horizontally using coord_flip(). We also set a number of layout parameters to juxtapose correctly. ncol sets the number of columns to use in the grid, align sets the alignment as vertical or horizontal, and rel_heights sets the relative heights of each plot. In the code above, we set p9 to be twice the size of p8, the box plot.

#Also, notice how the y-axis label for the box plot was suppressed. This is a nice example of juxtaposing visualisations of the same data. Sometimes this can lead to surprising findings and also a greater understanding behind the nature of different methods.