Quantitative bivariate visualisation

Scatterplot Matrix

library(ggplot2)
# library(GGally)
# Optional first pass: a pairs plot to scan for the strongest relationships.
# ggpairs(Body, columns = c(3,6,7,10:19),axisLabels = "internal")

# Read the Body data set from disk.
Body <- read.csv("C:/Users/Jason/Desktop/Grad Cert Data Science/3. Data Visualisation/data/Body.csv")

# Base plot object: Abdomen on the x axis, BFP_Siri on the y axis.
# Every plot in this section adds layers to p3.
p3 <- ggplot(Body, aes(Abdomen, BFP_Siri))

# Plain scatterplot
p3 + list(geom_point())

# Scatterplot with a linear fit (default colour) and a smoothed fit (red)
p3 + list(
  geom_point(),
  geom_smooth(method = "lm"),
  geom_smooth(colour = "red")
)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Same plot with semi-transparent rug marks along the axes
p3 + list(
  geom_point(),
  geom_smooth(method = "lm"),
  geom_smooth(colour = "red"),
  geom_rug(alpha = 0.5)
)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# For dense data, overlay 2D density contours on the points
p3 + list(geom_point(), geom_density_2d())

Convert to Long Table

# Read the economics data set from disk.
economics <- read.csv("C:/Users/Jason/Desktop/Grad Cert Data Science/3. Data Visualisation/data/economics.csv")

# Parse the date strings as day/month/year.
# NOTE(review): the head() output echoed further down shows consecutive days
# in January 1967 (1967-01-07, 1967-01-08, ...) for a series plotted as
# 1967 - 2015. That pattern suggests at least some rows are stored
# month-first; verify the CSV's date format before trusting this parse.
economics$date <- as.Date(economics$date, format = "%d/%m/%Y")

# Time series plot of population
p4 <- ggplot(data = economics, aes(x = date, y = pop))
p4 + geom_line() + labs(title = "US Population Growth 1967 - 2015")

# Convert to long dataset format
library(tidyr) # Load tidyr to access gather() function
economics_l <- gather(economics, # Data frame
                      Variable,  # Column that will hold the original variable names
                      Value,     # Column that will hold the variables' values
                      pce:unemploy) # The variables to be merged into long format

# Define and label the Variable factor.
# The levels are listed explicitly so that every label is tied to its column
# name. Without `levels`, factor() sorts the names alphabetically
# ("uempmed" before "unemploy") and the last two labels end up swapped.
# Assumes the CSV follows ggplot2's `economics` data set, where `unemploy` is
# the number unemployed (thousands) and `uempmed` is the median duration of
# unemployment - TODO confirm against the CSV's source.
economics_l$Variable <- factor(economics_l$Variable,
                               levels = c("pce",
                                          "pop",
                                          "psavert",
                                          "unemploy",
                                          "uempmed"),
                               labels = c("PCE",
                                          "Population '000",
                                          "PSR",
                                          "Unemployed '000",
                                          "Unemployed Duration"))
head(economics_l)
##         date Variable Value
## 1 1967-01-07      PCE 507.4
## 2 1967-01-08      PCE 510.5
## 3 1967-01-09      PCE 516.3
## 4 1967-01-10      PCE 512.9
## 5 1967-01-11      PCE 518.1
## 6 1967-01-12      PCE 525.8

# Plot every series in its own facet row, each with a free y scale
p4 <- ggplot(data = economics_l, aes(x = date, y = Value))
p4 + geom_line() + facet_grid(Variable ~ ., scales = "free",
                              labeller = label_value) +
  labs(title = "US Economic Data 1967 - 2015", y = "")

# Smooth the data by using quarterly instead of monthly values
# (i.e. reduce the resolution of the series).
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Add a quarter and a year variable to the wide economics dataset.
# Inside mutate() the column is referenced directly as `date`.
economics <- mutate(economics,
                    quarter = quarters(date),
                    year = format(date, "%Y"))

# Group the data by year and quarter
economics_ag <- group_by(economics, year, quarter)

# Create a summarised dataset with mean values for each year/quarter.
# `date` keeps the latest observation date found within each quarter.
# `.groups = "drop"` returns an ungrouped result, so no leftover grouping by
# `year` is carried into the reshaping and plotting steps below.
economics_ag <- summarise(economics_ag,
                          pce = mean(pce),
                          pop = mean(pop),
                          psavert = mean(psavert),
                          uempmed = mean(uempmed),
                          unemploy = mean(unemploy),
                          date = max(date),
                          .groups = "drop")

# Restructure data to long format
economics_ag_l <- gather(economics_ag,
                         Variable,
                         Value,
                         pce:unemploy)

# Define and label the Variable factor.
# The levels are listed explicitly so that every label is tied to its column
# name. Without `levels`, factor() sorts the names alphabetically
# ("uempmed" before "unemploy") and the last two labels end up swapped.
# Assumes the columns follow ggplot2's `economics` data set, where `unemploy`
# is the number unemployed (thousands) and `uempmed` is the median duration of
# unemployment - TODO confirm against the CSV's source.
economics_ag_l$Variable <- factor(economics_ag_l$Variable,
                                  levels = c("pce",
                                             "pop",
                                             "psavert",
                                             "unemploy",
                                             "uempmed"),
                                  labels = c("PCE",
                                             "Population '000",
                                             "PSR",
                                             "Unemployed '000",
                                             "Unemployed Duration"))
head(economics_ag_l)
## # A tibble: 6 x 5
##   year  quarter date       Variable Value
##   <chr> <chr>   <date>     <fct>    <dbl>
## 1 1967  Q1      1967-01-12 PCE       515.
## 2 1968  Q1      1968-01-12 PCE       557.
## 3 1969  Q1      1969-01-12 PCE       604.
## 4 1970  Q1      1970-03-01 PCE       633.
## 5 1970  Q2      1970-06-01 PCE       643.
## 6 1970  Q3      1970-09-01 PCE       654.
# NOTE(review): in the output above 1967-1969 contribute only a Q1 row each,
# while 1970 has Q1, Q2, Q3. That is consistent with the earlier dates having
# day and month swapped when `date` was parsed - verify the date parsing.

# Plot quarterly data, one facet row per series with a free y scale
p5 <- ggplot(data = economics_ag_l, aes(x = date, y = Value))
p5 + geom_line() + facet_grid(Variable ~ ., scales = "free",
                              labeller = label_value) +
  labs(title = "US Economic Quarterly Data 1967 - 2015 ", y = "")

# One qualitative variable against one quantitative variable

# Read mpg.csv into `mpg`; the plots that follow use its `class` and `cty`
# columns.
# NOTE(review): ggplot2 (loaded earlier in this file) also ships a data set
# named `mpg`; this assignment masks it in the global environment - confirm
# the CSV is the intended source.
mpg <- read.csv("C:/Users/Jason/Desktop/Grad Cert Data Science/3. Data Visualisation/data/mpg.csv")

Side-by-side box plots

# Box plot of cty for each class, before any re-ordering of the classes
p6 <- ggplot(mpg, aes(class, cty))
p6 + list(geom_boxplot())

# Median cty per class, used to rank the classes
mpg_rank <- summarise(group_by(mpg, class), med = median(cty))

# Re-level class by median cty, highest median first
# (order() on the negated medians sorts them in descending order)
mpg$class <- factor(mpg$class, levels = mpg_rank$class[order(-mpg_rank$med)])

# Rebuild the plot object so that it picks up the re-levelled factor
p6 <- ggplot(mpg, aes(class, cty))
p6 + list(geom_boxplot())

# Horizontal version of the same box plot
p6 + list(geom_boxplot(), coord_flip())

## Side-by-side variations

# Each variation below overlays the per-class mean as a red point.
# stat_summary() takes the summary function through `fun`; the older `fun.y`
# argument is deprecated and raised a warning on every plot.

# Violins
p6 + geom_violin() +
  stat_summary(fun = "mean", geom = "point", colour = "red")

# Jittered point plots
p6 + geom_jitter(width = .2, alpha = .25) +
  stat_summary(fun = "mean", geom = "point", colour = "red")

# Stacked dot plots
p6 + geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 1/2, alpha = .25) +
  stat_summary(fun = "mean", geom = "point", colour = "red")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.