#1. Write down 3 questions that you might want to answer based on this data.
##1. Is Cullen skink preferred over Partan bree?
##2. Which age group supports Cullen skink more?
##3. Which city does not support Cullen skink?
#2. Create an R data frame with 2 observations to store this data in its current "messy" state.
#Use whatever method you want to re-create and/or load the data.
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.1.3
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.1.3
#a. loaded from CSV file
# messy.df<-read.csv("ScotlandPoll.csv",header=TRUE,sep=",")
# messy.df
#b. Loaded directly into a data frame
# Wide ("messy") layout: one row per city, one column per age-band/answer
# combination. Each column name is an age label followed by "Yes" or "No".
messy <- data.frame(
  city        = c("Edinburgh", "Glasgow"),
  age16_24Yes = c(80000, 99400),
  age25_plYes = c(143000, 150400),
  age16_24No  = c(35900, 43000),
  age25_plNo  = c(214800, 207000)
)
messy
## city age16_24Yes age25_plYes age16_24No age25_plNo
## 1 Edinburgh 80000 143000 35900 214800
## 2 Glasgow 99400 150400 43000 207000
#3. Use the functionality in the tidyr package to convert the data frame to be "tidy data."
# Stack the four count columns into key/value pairs: `key` holds the former
# column name and `value` holds the count, giving one row per city and key.
tidy <- gather(messy, key, value, age16_24Yes:age25_plNo)
tidy
## city key value
## 1 Edinburgh age16_24Yes 80000
## 2 Glasgow age16_24Yes 99400
## 3 Edinburgh age25_plYes 143000
## 4 Glasgow age25_plYes 150400
## 5 Edinburgh age16_24No 35900
## 6 Glasgow age16_24No 43000
## 7 Edinburgh age25_plNo 214800
## 8 Glasgow age25_plNo 207000
# Split each combined key into the age band and the poll answer. Both age
# labels ("age16_24", "age25_pl") are exactly 8 characters long, so a
# positional split after character 8 leaves "Yes"/"No" as the second piece.
tidier <- separate(tidy, key, into = c("age", "pollval"), sep = 8)
tidier
## city age pollval value
## 1 Edinburgh age16_24 Yes 80000
## 2 Glasgow age16_24 Yes 99400
## 3 Edinburgh age25_pl Yes 143000
## 4 Glasgow age25_pl Yes 150400
## 5 Edinburgh age16_24 No 35900
## 6 Glasgow age16_24 No 43000
## 7 Edinburgh age25_pl No 214800
## 8 Glasgow age25_pl No 207000
#4. Use the functionality in the dplyr package to answer the questions that you asked in step 1.
## 1. Is Cullen skink preferred over Partan bree?
# Add up the votes for each answer across every city and age group.
tidier %>%
  group_by(pollval) %>%
  summarise(n = sum(value))
## Source: local data frame [2 x 2]
##
## pollval n
## 1 No 500700
## 2 Yes 472800
## 2. Which age group supports Cullen skink more?
# Rank the age groups by the share of "Yes" votes within each group, not by
# the raw "Yes" count. The groups differ greatly in size, so the larger group
# can cast more "Yes" votes while a smaller fraction of it is in favour.
#   n         - number of "Yes" votes in the age group (the original measure)
#   total     - all votes cast by the age group ("Yes" + "No")
#   yes_share - n / total
tidier %>%
  group_by(age) %>%
  summarise(
    n = sum(value[pollval == "Yes"]),
    total = sum(value),
    yes_share = sum(value[pollval == "Yes"]) / sum(value)
  ) %>%
  arrange(desc(yes_share))
## Expected values (computed by hand from the data above; the exact print
## layout depends on the dplyr version):
##
##        age      n  total yes_share
## 1 age16_24 179400 258300    0.6945
## 2 age25_pl 293400 715200    0.4102
## 3. Which city does not support Cullen skink?
# A city "does not support" when its "No" votes outnumber its "Yes" votes,
# so both answers are totalled per city. Counting "Yes" votes alone only
# shows which city cast fewer of them, not whether it is for or against.
#   yes       - "Yes" votes in the city
#   no        - "No" votes in the city
#   yes_share - yes / (yes + no); below 0.5 means the city does not support
tidier %>%
  group_by(city) %>%
  summarise(
    yes = sum(value[pollval == "Yes"]),
    no = sum(value[pollval == "No"]),
    yes_share = sum(value[pollval == "Yes"]) / sum(value)
  ) %>%
  arrange(yes_share)
## Expected values (computed by hand from the data above; the exact print
## layout depends on the dplyr version):
##
##        city    yes     no yes_share
## 1 Edinburgh 223000 250700    0.4708
## 2   Glasgow 249800 250000    0.4998
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.3
# One subset of the tidy data per city, so each city gets its own chart.
tidier1 <- filter(tidier, city == "Edinburgh")
tidier2 <- filter(tidier, city == "Glasgow")
# Bar chart per city: age band on the x axis, vote count as bar height, and
# the "Yes"/"No" bars placed side by side within each age band.
p1 <- ggplot(data = tidier1, aes(x = age, y = value, fill = pollval)) +
  geom_bar(stat = "identity", position = position_dodge(), colour = "black") +
  ggtitle("Edinburgh")
p2 <- ggplot(data = tidier2, aes(x = age, y = value, fill = pollval)) +
  geom_bar(stat = "identity", position = position_dodge(), colour = "black") +
  ggtitle("Glasgow")
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.1.3
## Loading required package: grid
# Draw both city charts together in a single figure.
grid.arrange(p1, p2)

#5. Having gone through the process,
# would you ask different questions and/or change the way that you structured your data frame?
# 1. Within each age group, what is the percentage difference in support for Cullen skink between the two cities?