library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Hair_Eye_Colour <- read.csv("C:/Users/Jason/Desktop/Grad Cert Data Science/3. Data Visualisation/data/Hair_Eye_Colour.csv")
#run the str() function to summarise the data frame
str(Hair_Eye_Colour)
## 'data.frame': 592 obs. of 3 variables:
## $ Hair : chr "Black" "Black" "Black" "Black" ...
## $ Eyes : chr "Brown" "Brown" "Brown" "Brown" ...
## $ Gender: chr "Male" "Male" "Male" "Male" ...
#Summarise the 2 qualitative variables as follows
crosstab1 <- table(Hair_Eye_Colour$Hair,Hair_Eye_Colour$Eyes, dnn = c("Hair","Eyes"))
crosstab1
## Eyes
## Hair Blue Brown Green Hazel
## Black 20 68 5 15
## Blonde 94 7 16 10
## Brown 84 119 29 54
## Red 17 26 14 14
margin.table(crosstab1,1) #Row marginals
## Hair
## Black Blonde Brown Red
## 108 127 286 71
margin.table(crosstab1,2) #Column marginals
## Eyes
## Blue Brown Green Hazel
## 215 220 64 93
p12 <- ggplot(data = Hair_Eye_Colour, aes(x = Hair, fill = Eyes))
p12 + geom_bar()
# The stacked barchart has perceptual issues. Fix this by converting counts to proportions
p12 + geom_bar(position = "fill")
#Alternately, we coul dcluster by hair colour so they share a baseline
p12 + geom_bar(position = "dodge")
#To compare proportions we need to create a table of conditional proportions
#Create crosstabulation
crosstab1<-table(Hair_Eye_Colour$Hair,Hair_Eye_Colour$Eyes, dnn = c("Hair","Eyes"))
prop.table(crosstab1, 1) #Row proportions
## Eyes
## Hair Blue Brown Green Hazel
## Black 0.18518519 0.62962963 0.04629630 0.13888889
## Blonde 0.74015748 0.05511811 0.12598425 0.07874016
## Brown 0.29370629 0.41608392 0.10139860 0.18881119
## Red 0.23943662 0.36619718 0.19718310 0.19718310
#Given a person has brown eyes, what is their most likely hair colour?
prop.table(crosstab1, 2) #Column proportions
## Eyes
## Hair Blue Brown Green Hazel
## Black 0.09302326 0.30909091 0.07812500 0.16129032
## Blonde 0.43720930 0.03181818 0.25000000 0.10752688
## Brown 0.39069767 0.54090909 0.45312500 0.58064516
## Red 0.07906977 0.11818182 0.21875000 0.15053763
#Add proportions to dataframe
crosstab1 <- data.frame(prop.table(crosstab1, 1)) #Convert proportion table to df
str(crosstab1) #Data frame summary
## 'data.frame': 16 obs. of 3 variables:
## $ Hair: Factor w/ 4 levels "Black","Blonde",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ Eyes: Factor w/ 4 levels "Blue","Brown",..: 1 1 1 1 2 2 2 2 3 3 ...
## $ Freq: num 0.185 0.74 0.294 0.239 0.63 ...
colnames(crosstab1) <- c("Hair","Eyes","Proportion") #Fix variable names
str(crosstab1)
## 'data.frame': 16 obs. of 3 variables:
## $ Hair : Factor w/ 4 levels "Black","Blonde",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ Eyes : Factor w/ 4 levels "Blue","Brown",..: 1 1 1 1 2 2 2 2 3 3 ...
## $ Proportion: num 0.185 0.74 0.294 0.239 0.63 ...
#Now plot barchart
p13 <- ggplot(data = crosstab1, aes(x = Hair, y = Proportion, fill = Eyes))
p13 + geom_bar(stat = "identity",position = "dodge") +
labs(y = "Proportion within Hair Colour")
#Change colours
p13 + geom_bar(stat = "identity",position = "dodge") +
labs(y = "Proportion within Hair Colour") +
scale_fill_manual(values = c("#1569C7","#94703D","#566638","#6B7E47"))
## Mosaic Plots
library(vcd)
## Loading required package: grid
vcd::mosaic(~ Hair + Eyes, data = Hair_Eye_Colour, dnn = c("Hair","Eyes"),
shade=TRUE, pop = FALSE)
#Add conditional proportions - create table of labels
crosstab1<-table(Hair_Eye_Colour$Hair,Hair_Eye_Colour$Eyes, dnn = c("Hair","Eyes"))
labs<-round(prop.table(crosstab1,1),2)
labs
## Eyes
## Hair Blue Brown Green Hazel
## Black 0.19 0.63 0.05 0.14
## Blonde 0.74 0.06 0.13 0.08
## Brown 0.29 0.42 0.10 0.19
## Red 0.24 0.37 0.20 0.20
#Create mosaic plot and add the labels
vcd::mosaic(crosstab1, pop = FALSE, legend=TRUE,shade=TRUE)
labeling_cells(text = labs, margin=0)(crosstab1)
#install ggmosaic package if needed
#install.packages("ggmosaic")
#library(ggmosaic)
#This package assumes you are using a frequency cross tabulation
#tb <- table(Hair_Eye_Colour$Hair, Hair_Eye_Colour$Eyes)
#tb <- data.frame(tb)
#colnames(tb) <- c("Hair", "Eyes", "Freq")
#tb
#now ggplot
#p14 <- ggplot(tb)
#p14 + geom_mosaic(aes(x = product(Hair), weight = Freq, fill = Eyes)) + labs(x = "Hair Colour")
#add value labels in centre of each stacked bar
#levVar1 <- length(levels(Hair_Eye_Colour$Hair))
#jointTable <- prop.table(table(Hair_Eye_Colour$Hair, Hair_Eye_Colour$Eyes))
#plotData <- as.data.frame(jointTable)
#plotData$marginVar1 <- prop.table(table(Hair_Eye_Colour$Hair))
#plotData$var2Height <- plotData$Freq / plotData$marginVar1
#plotData$var1Center <- c(0, cumsum(plotData$marginVar1)[1:levVar1 -1]) +
# plotData$marginVar1 / 2
#df<-data.frame(prop.table(table(Hair_Eye_Colour$Hair,Hair_Eye_Colour$Eyes),1))
#df<-group_by(df,Var1)
#df<-transmute(df,
# csum = (cumsum(Freq)-Freq)+(Freq/2))
#plotData$centerlab <- df$csum
#p14 + geom_mosaic(aes(x = product(Hair), weight = Freq, fill = Eyes)) +
# labs(x = "Hair Colour", y = "Eye Colour Proportion within Hair Colour") +
# geom_text(data = plotData, aes(x = var1Center, y = centerlab,label=round(var2Height,2)),
# inherit.aes = FALSE)