#Load the required libraries for data analysis.
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Which city had more people participating in the poll?
Which age group has the highest and the lowest support for Cullen in each city?
What proportion of the poll participants falls into each age group?
What is the overall support for Cullen?
# Poll counts per city and age group. In every vector, element 1 is the
# "Yes" count and element 2 is the "No" count, matching the order of
# `response`.
response <- c("Yes", "No")
edinburgh.16_24 <- c(80100, 35900)
edinburgh.25_plus <- c(143000, 214800)
glasgow.16_24 <- c(99400, 43000)
glasgow.25_plus <- c(150400, 207000)

# Build the wide table directly as a tibble. This replaces the deprecated
# tbl_df(data.frame(...)) round trip, keeps the response column as character
# whatever the stringsAsFactors default is, and names the first column up
# front instead of patching colnames() afterwards.
# The "<city>.<agegroup>" column names are deliberate: the "." is the
# delimiter used when these names are later split into separate columns.
data_orig <- tibble(
  prefer_cullen = response,
  edinburgh.16_24 = edinburgh.16_24,
  edinburgh.25_plus = edinburgh.25_plus,
  glasgow.16_24 = glasgow.16_24,
  glasgow.25_plus = glasgow.25_plus
)
data_orig
## Source: local data frame [2 x 5]
##
## prefer_cullen edinburgh.16_24 edinburgh.25_plus glasgow.16_24
## 1 Yes 80100 143000 99400
## 2 No 35900 214800 43000
## Variables not shown: glasgow.25_plus (dbl)
# Reshape the wide table to long form: every column except `prefer_cullen`
# is stacked into key/value pairs, then the "<city>.<agegroup>" key is split
# on the literal dot into a `city` column and an `agegroup` column.
data_tidy <- data_orig %>%
  gather(key = city_agegroup, value = count, -prefer_cullen) %>%
  separate(
    col = city_agegroup,
    into = c("city", "agegroup"),
    sep = "\\."
  )
data_tidy
## Source: local data frame [8 x 4]
##
## prefer_cullen city agegroup count
## 1 Yes edinburgh 16_24 80100
## 2 No edinburgh 16_24 35900
## 3 Yes edinburgh 25_plus 143000
## 4 No edinburgh 25_plus 214800
## 5 Yes glasgow 16_24 99400
## 6 No glasgow 16_24 43000
## 7 Yes glasgow 25_plus 150400
## 8 No glasgow 25_plus 207000
Which city had more people participating in the poll?
# Total respondents per city, keeping only the city with the largest total.
# `wt = total` is passed explicitly: without it top_n() falls back to the
# last column of the table and prints a "Selecting by total" message, so a
# change in column order would silently rank by the wrong column.
# The former arrange(desc(total)) step is dropped because top_n() ranks by
# `wt` itself and does not need pre-sorted input.
data_tidy %>%
  group_by(city) %>%
  summarise(total = sum(count)) %>%
  top_n(n = 1, wt = total)
## Source: local data frame [1 x 2]
##
## city total
## 1 glasgow 499800
Which age group has the highest and the lowest support for Cullen in each city?
# Per city, the age group with the largest number of "Yes" responses.
# summarise() removes the innermost grouping level (agegroup), so the result
# is still grouped by city and top_n() keeps the top row within each city.
# NOTE(review): this ranks by raw "Yes" counts, not by the share of "Yes"
# answers within each age group -- confirm that is the intended meaning of
# "support".
data_tidy %>%
  filter(prefer_cullen == "Yes") %>%
  group_by(city, agegroup) %>%
  summarise(total = sum(count)) %>%
  top_n(n = 1, wt = total)
## Source: local data frame [2 x 3]
## Groups: city
##
## city agegroup total
## 1 edinburgh 25_plus 143000
## 2 glasgow 25_plus 150400
# Per city, the age group with the smallest number of "Yes" responses.
# Ranking by desc(total) inverts the order, so top_n() returns the row with
# the lowest total within each city (the result of summarise() is still
# grouped by city).
# NOTE(review): this ranks by raw "Yes" counts, not by the share of "Yes"
# answers within each age group -- confirm that is the intended meaning of
# "support".
data_tidy %>%
  filter(prefer_cullen == "Yes") %>%
  group_by(city, agegroup) %>%
  summarise(total = sum(count)) %>%
  top_n(n = 1, wt = desc(total))
## Source: local data frame [2 x 3]
## Groups: city
##
## city agegroup total
## 1 edinburgh 16_24 80100
## 2 glasgow 16_24 99400
What proportion of the poll participants falls into each age group?
# Respondent totals per age group, summed over both cities and both answers.
# Each result is a one-row tibble whose only column is `total`.
age_16_24 <- data_tidy %>%
  filter(agegroup == "16_24") %>%
  summarise(total = sum(count))
age_25_plus <- data_tidy %>%
  filter(agegroup == "25_plus") %>%
  summarise(total = sum(count))

# Denominator shared by both percentages: every response in the poll.
total_respondents <- sum(data_tidy$count)

# Extract the numeric scalar with `$total` before doing arithmetic. Dividing,
# rounding and pasting the tibble itself only works through implicit
# data-frame coercion, which is fragile and hides the fact that a single
# number is expected here.
paste(
  round(age_16_24$total / total_respondents * 100, 2),
  "% of population is between 16 and 24"
)
## [1] "26.54 % of population is between 16 and 24"
paste(
  round(age_25_plus$total / total_respondents * 100, 2),
  "% of population is between 25 and above"
)
## [1] "73.46 % of population is between 25 and above"
What is the overall support for Cullen?
# Total "Yes" and total "No" responses across all cities and age groups.
# Each result is a one-row tibble whose only column is `total`.
# Note: the variable `prefer_cullen` has the same name as a column of
# data_tidy. Inside filter() the column takes precedence over the variable,
# so both comparisons below still test the column values.
prefer_cullen <- data_tidy %>%
  filter(prefer_cullen == "Yes") %>%
  summarise(total = sum(count))
do_not_prefer_cullen <- data_tidy %>%
  filter(prefer_cullen == "No") %>%
  summarise(total = sum(count))

# Denominator shared by both percentages: every response in the poll.
total_respondents <- sum(data_tidy$count)

# Extract the numeric scalar with `$total` before doing arithmetic, rather
# than relying on implicit data-frame coercion in `/`, round() and paste().
paste(
  round(prefer_cullen$total / total_respondents * 100, 2),
  "% of population prefer Cullen"
)
## [1] "48.57 % of population prefer Cullen"
paste(
  round(do_not_prefer_cullen$total / total_respondents * 100, 2),
  "% of population do NOT prefer Cullen"
)
## [1] "51.43 % of population do NOT prefer Cullen"
From the above analysis, it appears that Partan Bree has slightly more support than Cullen Skink. With the given data, the tidy data set above appears to be sufficient for this analysis; however, it gets tricky when we need to answer questions based on multiple factors such as age group and city. Combining the age groups would probably simplify the data frame, letting us look at just the majority preference in each city and overall.