Introduction

This is a quick exploratory data analysis (EDA) of the Australian Marriage Law Postal Survey dataset. The plots use my own custom “blue” theme.

Data can be found here: https://www.kaggle.com/australian-bureau-of-statistics/australian-marriage-law-postal-survey/data

View Our Data

# Print a compact, column-wise overview of the survey data frame.
vote %>% glimpse()
## Observations: 27,000
## Variables: 11
## $ State           <fctr> New South Wales Divisions, New South Wales Di...
## $ ElectDivision   <fctr> Banks, Banks, Banks, Banks, Banks, Banks, Ban...
## $ Gender          <fctr> female, female, female, female, female, femal...
## $ Age             <fctr> 18-19, 20-24, 25-29, 30-34, 35-39, 40-44, 45-...
## $ Eligible        <fctr> 1,514, 4,095, 3,959, 4,070, 4,362, 4,511, 4,8...
## $ UnderAge        <int> 18, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70...
## $ SeniorAge       <int> 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74...
## $ Vote            <fctr> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, ...
## $ Total_Votes     <int> 37736, 37736, 37736, 37736, 37736, 37736, 3773...
## $ Responses       <fctr> ClearResponse, ClearResponse, ClearResponse, ...
## $ Total_Responses <int> 84079, 84079, 84079, 84079, 84079, 84079, 8407...

Now let’s start with an exploratory data analysis of our dataset.

Data Visualisation

Yes or No Votes per State

# Dodged bar chart of Total_Votes for each State, filled by the Vote value.
# Both axis titles are blanked out.
ggplot(data = vote, mapping = aes(x = State, y = Total_Votes, fill = Vote)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("#66C2A5", "#3288BD")) +
  labs(
    title = "Australian Same-Sex Marriage Law Survey",
    subtitle = "How each territory has voted, 'Yes' or 'No' to legalised same-sex marriages in the country",
    caption = "Source:marriagesurvey.abs.gov.au. Plot@twitter.com/brenborbon",
    x = "",
    y = ""
  ) +
  bluetheme

We can see that the Australian Capital Territory Regions have the most Yes votes, followed by the Victoria Divisions, while the Northern Territory Divisions have the lowest Yes vote total. On the other hand, the New South Wales Divisions and the Victoria Divisions have the highest No vote totals.

Responses per State

# Dodged bar chart of Total_Responses for each State, filled by the
# Responses category. Both axis titles are blanked out.
ggplot(
  data = vote,
  mapping = aes(x = State, y = Total_Responses, fill = Responses)
) +
  geom_col(position = "dodge") +
  bluetheme +
  scale_fill_manual(values = c("#66C2A5", "#3288BD", "#ABDDA4")) +
  labs(
    title = "Australian Same-Sex Marriage Survey Response",
    subtitle = "How each territory has responded with the survey, to allow same-sex marriages in the country",
    caption = "Source:marriagesurvey.abs.gov.au. Plot@twitter.com/brenborbon",
    x = "",
    y = ""
  )

We can observe that only very few responses fall into the UnclearResponse category.

Top Electoral Divisions Which have Voted “Yes”

# Top 10 electoral divisions by "Yes" vote total.

# Step 1: keep the "Yes" rows, collapse them to one row per
# (ElectDivision, Total_Votes) pair and keep the 10 largest totals.
division <- vote %>%
  filter(Vote == "Yes") %>%
  group_by(ElectDivision, Total_Votes) %>%
  summarise(Total = n()) %>% # Total (row count) is not used afterwards
  arrange(desc(Total_Votes)) %>%
  ungroup() %>%
  head(10)

# Step 2: one mean vote total per division, sorted ascending.
reg <- aggregate(
  division$Total_Votes,
  by = list(division$ElectDivision),
  FUN = mean
)
colnames(reg) <- c("division", "votes")
reg <- reg[order(reg$votes), ]
# Pin the factor levels to the sorted order so the plot keeps that order.
reg$division <- factor(reg$division, levels = reg$division)

# Step 3: horizontal bar chart. The fill is mapped to `division`, which is a
# column of `reg`; a bare `colors` is not a column of `reg` and would fall
# back to whatever `colors` means globally (by default the base R function).
ggplot(reg, aes(x = division, y = votes, fill = division)) +
  geom_bar(stat = "identity", width = 0.5) +
  labs(title = "Top 10 Electoral Division Which Have Voted 'Yes'") +
  bluetheme +
  coord_flip() +
  theme(legend.position = "none") # documented value for hiding the legend

Lowest Electoral Divisions Which have Voted “Yes”

# The 10 electoral divisions with the lowest "Yes" vote totals.

# Step 1: keep the "Yes" rows, collapse them to one row per
# (ElectDivision, Total_Votes) pair, sort descending and keep the last 10
# rows, i.e. the smallest totals.
division <- vote %>%
  filter(Vote == "Yes") %>%
  group_by(ElectDivision, Total_Votes) %>%
  summarise(Total = n()) %>% # Total (row count) is not used afterwards
  arrange(desc(Total_Votes)) %>%
  ungroup() %>%
  tail(10)

# Step 2: one mean vote total per division, sorted ascending.
reg <- aggregate(
  division$Total_Votes,
  by = list(division$ElectDivision),
  FUN = mean
)
colnames(reg) <- c("division", "votes")
reg <- reg[order(reg$votes), ]
# Pin the factor levels to the sorted order so the plot keeps that order.
reg$division <- factor(reg$division, levels = reg$division)

# Step 3: horizontal bar chart. The fill is mapped to `division`, which is a
# column of `reg`; a bare `colors` is not a column of `reg` and would fall
# back to whatever `colors` means globally (by default the base R function).
ggplot(reg, aes(x = division, y = votes, fill = division)) +
  geom_bar(stat = "identity", width = 0.5) +
  labs(title = "Lowest Electoral Division Which Have Voted 'Yes'") +
  bluetheme +
  coord_flip() +
  theme(legend.position = "none") # documented value for hiding the legend

Top Electoral Divisions Which have Voted “No”

# Top 10 electoral divisions by "No" vote total.

# Step 1: keep the "No" rows, collapse them to one row per
# (ElectDivision, Total_Votes) pair and keep the 10 largest totals.
division <- vote %>%
  filter(Vote == "No") %>%
  group_by(ElectDivision, Total_Votes) %>%
  summarise(Total = n()) %>% # Total (row count) is not used afterwards
  arrange(desc(Total_Votes)) %>%
  ungroup() %>%
  head(10)

# Step 2: one mean vote total per division, sorted ascending.
reg <- aggregate(
  division$Total_Votes,
  by = list(division$ElectDivision),
  FUN = mean
)
colnames(reg) <- c("division", "votes")
reg <- reg[order(reg$votes), ]
# Pin the factor levels to the sorted order so the plot keeps that order.
reg$division <- factor(reg$division, levels = reg$division)

# Step 3: horizontal bar chart, one fill colour per division, legend hidden.
ggplot(reg, aes(x = division, y = votes, fill = division)) +
  geom_bar(stat = "identity", width = 0.5) +
  labs(title = "Top 10 Electoral Division Which Have Voted 'No'") +
  bluetheme +
  coord_flip() +
  theme(legend.position = "none") # documented value for hiding the legend

Lowest Electoral Divisions Which have Voted “No”

# The 10 electoral divisions with the lowest "No" vote totals.

# Step 1: keep the "No" rows, collapse them to one row per
# (ElectDivision, Total_Votes) pair, sort descending and keep the last 10
# rows, i.e. the smallest totals.
division <- vote %>%
  filter(Vote == "No") %>%
  group_by(ElectDivision, Total_Votes) %>%
  summarise(Total = n()) %>% # Total (row count) is not used afterwards
  arrange(desc(Total_Votes)) %>%
  ungroup() %>%
  tail(10)

# Step 2: one mean vote total per division, sorted ascending.
reg <- aggregate(
  division$Total_Votes,
  by = list(division$ElectDivision),
  FUN = mean
)
colnames(reg) <- c("division", "votes")
reg <- reg[order(reg$votes), ]
# Pin the factor levels to the sorted order so the plot keeps that order.
reg$division <- factor(reg$division, levels = reg$division)

# Step 3: horizontal bar chart, one fill colour per division, legend hidden.
ggplot(reg, aes(x = division, y = votes, fill = division)) +
  geom_bar(stat = "identity", width = 0.5) +
  labs(title = "Lowest Electoral Division Which Have Voted 'No'") +
  bluetheme +
  coord_flip() +
  theme(legend.position = "none") # documented value for hiding the legend