Qualitative bivariate visualisation

Load and wrangle - Hair_Eye_Colour Data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
Hair_Eye_Colour <- read.csv("C:/Users/Jason/Desktop/Grad Cert Data Science/3. Data Visualisation/data/Hair_Eye_Colour.csv")

#run the str() function to summarise the data frame
str(Hair_Eye_Colour)
## 'data.frame':    592 obs. of  3 variables:
##  $ Hair  : chr  "Black" "Black" "Black" "Black" ...
##  $ Eyes  : chr  "Brown" "Brown" "Brown" "Brown" ...
##  $ Gender: chr  "Male" "Male" "Male" "Male" ...
#Summarise the 2 qualitative variables as follows
crosstab1 <- table(Hair_Eye_Colour$Hair,Hair_Eye_Colour$Eyes, dnn = c("Hair","Eyes"))
crosstab1
##         Eyes
## Hair     Blue Brown Green Hazel
##   Black    20    68     5    15
##   Blonde   94     7    16    10
##   Brown    84   119    29    54
##   Red      17    26    14    14
margin.table(crosstab1,1) #Row marginals
## Hair
##  Black Blonde  Brown    Red 
##    108    127    286     71
margin.table(crosstab1,2) #Column marginals
## Eyes
##  Blue Brown Green Hazel 
##   215   220    64    93

Bar chart

p12 <- ggplot(data = Hair_Eye_Colour, aes(x = Hair, fill = Eyes))
p12 + geom_bar()

# The stacked barchart has perceptual issues. Fix this by converting counts to proportions
p12 + geom_bar(position = "fill")

#Alternately, we coul dcluster by hair colour so they share a baseline
p12 + geom_bar(position = "dodge")

#To compare proportions we need to create a table of conditional proportions
#Create crosstabulation
crosstab1<-table(Hair_Eye_Colour$Hair,Hair_Eye_Colour$Eyes, dnn = c("Hair","Eyes"))
prop.table(crosstab1, 1) #Row proportions
##         Eyes
## Hair           Blue      Brown      Green      Hazel
##   Black  0.18518519 0.62962963 0.04629630 0.13888889
##   Blonde 0.74015748 0.05511811 0.12598425 0.07874016
##   Brown  0.29370629 0.41608392 0.10139860 0.18881119
##   Red    0.23943662 0.36619718 0.19718310 0.19718310
#Given a person has brown eyes, what is their most likely hair colour?
prop.table(crosstab1, 2) #Column proportions
##         Eyes
## Hair           Blue      Brown      Green      Hazel
##   Black  0.09302326 0.30909091 0.07812500 0.16129032
##   Blonde 0.43720930 0.03181818 0.25000000 0.10752688
##   Brown  0.39069767 0.54090909 0.45312500 0.58064516
##   Red    0.07906977 0.11818182 0.21875000 0.15053763
#Add proportions to dataframe
crosstab1 <- data.frame(prop.table(crosstab1, 1)) #Convert proportion table to df
str(crosstab1) #Data frame summary
## 'data.frame':    16 obs. of  3 variables:
##  $ Hair: Factor w/ 4 levels "Black","Blonde",..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ Eyes: Factor w/ 4 levels "Blue","Brown",..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ Freq: num  0.185 0.74 0.294 0.239 0.63 ...
colnames(crosstab1) <- c("Hair","Eyes","Proportion") #Fix variable names
str(crosstab1)
## 'data.frame':    16 obs. of  3 variables:
##  $ Hair      : Factor w/ 4 levels "Black","Blonde",..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ Eyes      : Factor w/ 4 levels "Blue","Brown",..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ Proportion: num  0.185 0.74 0.294 0.239 0.63 ...
#Now plot barchart
p13 <- ggplot(data = crosstab1, aes(x = Hair, y = Proportion, fill = Eyes))
p13 + geom_bar(stat = "identity",position = "dodge") +
  labs(y = "Proportion within Hair Colour")

#Change colours
p13 + geom_bar(stat = "identity",position = "dodge") +
  labs(y = "Proportion within Hair Colour") +
  scale_fill_manual(values = c("#1569C7","#94703D","#566638","#6B7E47"))

## Mosaic Plots

library(vcd)
## Loading required package: grid
vcd::mosaic(~ Hair + Eyes, data = Hair_Eye_Colour, dnn = c("Hair","Eyes"),
       shade=TRUE, pop = FALSE)

#Add conditional proportions - create table of labels
crosstab1<-table(Hair_Eye_Colour$Hair,Hair_Eye_Colour$Eyes, dnn = c("Hair","Eyes"))
labs<-round(prop.table(crosstab1,1),2)
labs
##         Eyes
## Hair     Blue Brown Green Hazel
##   Black  0.19  0.63  0.05  0.14
##   Blonde 0.74  0.06  0.13  0.08
##   Brown  0.29  0.42  0.10  0.19
##   Red    0.24  0.37  0.20  0.20
#Create mosaic plot and add the labels
vcd::mosaic(crosstab1, pop = FALSE, legend=TRUE,shade=TRUE)
labeling_cells(text = labs, margin=0)(crosstab1)

#install ggmosaic package if needed
#install.packages("ggmosaic")
#library(ggmosaic)

#This package assumes you are using a frequency cross tabulation
#tb <- table(Hair_Eye_Colour$Hair, Hair_Eye_Colour$Eyes)
#tb <- data.frame(tb)
#colnames(tb) <- c("Hair", "Eyes", "Freq")
#tb

#now ggplot
#p14 <- ggplot(tb)
#p14 + geom_mosaic(aes(x = product(Hair), weight = Freq, fill = Eyes)) + labs(x = "Hair Colour")

#add value labels in centre of each stacked bar
#levVar1 <- length(levels(Hair_Eye_Colour$Hair))

#jointTable <- prop.table(table(Hair_Eye_Colour$Hair, Hair_Eye_Colour$Eyes))
#plotData <- as.data.frame(jointTable)
#plotData$marginVar1 <- prop.table(table(Hair_Eye_Colour$Hair))
#plotData$var2Height <- plotData$Freq / plotData$marginVar1
#plotData$var1Center <- c(0, cumsum(plotData$marginVar1)[1:levVar1 -1]) +
#  plotData$marginVar1 / 2
#df<-data.frame(prop.table(table(Hair_Eye_Colour$Hair,Hair_Eye_Colour$Eyes),1))
#df<-group_by(df,Var1)
#df<-transmute(df,
#              csum = (cumsum(Freq)-Freq)+(Freq/2))
#plotData$centerlab <- df$csum

#p14 + geom_mosaic(aes(x = product(Hair), weight = Freq, fill = Eyes)) +
#  labs(x = "Hair Colour", y = "Eye Colour Proportion within Hair Colour") +
#  geom_text(data = plotData, aes(x = var1Center, y = centerlab,label=round(var2Height,2)),
#            inherit.aes = FALSE)