# Build the "messy" (wide) poll table POLL.
# Each row is one answer (Yes/No); each remaining column holds the count for
# one city/age-group combination, encoded in the column name itself.
col1       <- c("Yes", "No")
Edin16to24 <- c(80100, 35900)
Edinover25 <- c(143000, 214800)
Glas16to24 <- c(99400, 43000)
Glasover25 <- c(150400, 207000)
POLL <- data.frame(
  col1,
  Edin16to24,
  Edinover25,
  Glas16to24,
  Glasover25
)
print(POLL)
## col1 Edin16to24 Edinover25 Glas16to24 Glasover25
## 1 Yes 80100 143000 99400 150400
## 2 No 35900 214800 43000 207000
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.1.3
## Loading required package: ggplot2
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.1.3
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 3.1.3
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.1.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.1.3
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:Hmisc':
##
## combine, src, summarize
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Reshape POLL from wide to long form.
# gather() stacks the four count columns into key/value pairs, then
# separate() splits each key after its 4th character, so e.g. "Edin16to24"
# becomes City = "Edin" and Age = "16to24".
tidy <- POLL %>%
  gather(key, value, Edin16to24:Glasover25) %>%
  separate(key, into = c("City", "Age"), sep = 4)
print(tidy)
## col1 City Age value
## 1 Yes Edin 16to24 80100
## 2 No Edin 16to24 35900
## 3 Yes Edin over25 143000
## 4 No Edin over25 214800
## 5 Yes Glas 16to24 99400
## 6 No Glas 16to24 43000
## 7 Yes Glas over25 150400
## 8 No Glas over25 207000
## col1 City Age value
## 1 Yes Edin 16to24 80100
## 2 Yes Edin over25 143000
## col1 City Age value
## 1 Yes Edin over25 143000
## 2 Yes Glas over25 150400
## col1 Edin16to24 Edinover25 Glas16to24 Glasover25
## 1 Yes 80100 143000 99400 150400
# Total count per city, split by answer (col1 = Yes/No).
by_city <- tidy %>%
  group_by(City, col1)
result_city <- by_city %>%
  summarise(total = sum(value))
print(result_city)
## Source: local data frame [4 x 3]
## Groups: City
##
## City col1 total
## 1 Edin No 250700
## 2 Edin Yes 223100
## 3 Glas No 250000
## 4 Glas Yes 249800
# Total count per age group, split by answer (col1 = Yes/No).
by_age <- tidy %>%
  group_by(Age, col1)
result_age <- by_age %>%
  summarise(total = sum(value))
print(result_age)
## Source: local data frame [4 x 3]
## Groups: Age
##
## Age col1 total
## 1 16to24 No 78900
## 2 16to24 Yes 179500
## 3 over25 No 421800
## 4 over25 Yes 293400
# Side-by-side bar charts of the Yes/No totals, by city (p1) and by age
# group (p2). The totals are already summarised, so the bars are drawn with
# geom_bar(stat = "identity") rather than geom_histogram(): a histogram bins
# a continuous variable, and combining it with stat = "identity" makes
# current ggplot2 warn about ignored binning parameters.
p1 <- ggplot(result_city, aes(x = City, y = total, fill = col1)) +
  geom_bar(stat = "identity", position = "dodge")
p2 <- ggplot(result_age, aes(x = Age, y = total, fill = col1)) +
  geom_bar(stat = "identity", position = "dodge")
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.1.3
# Lay the two plots out in a single row.
grid.arrange(p1, p2, ncol = 2)
# Rebuild the poll as a tidy table: one row per City/Age group, with the
# Yes and No counts held in their own columns.
City <- c("Edinburgh", "Edinburgh", "Glasgow", "Glasgow")
Age <- c("16-24", "25+", "16-24", "25+")
Yes <- c(80100, 143000, 99400, 150400)
No <- c(35900, 214800, 43000, 207000)
NewPOLL <- data.frame(City, Age, Yes, No)
print(NewPOLL)
## City Age Yes No
## 1 Edinburgh 16-24 80100 35900
## 2 Edinburgh 25+ 143000 214800
## 3 Glasgow 16-24 99400 43000
## 4 Glasgow 25+ 150400 207000
# Total number of responses across every city and age group. Computed from
# the data frame columns so it always matches the table being analysed.
pop <- sum(NewPOLL$Yes) + sum(NewPOLL$No)
pop
## [1] 973600
# Keep only the 16-24 rows and express their Yes/No counts as a fraction of
# the total population (pop), not of the age group itself.
Analysis <- NewPOLL %>%
  filter(Age == "16-24") %>%
  mutate(Yes = Yes / pop, No = No / pop)
Analysis
## City Age Yes No
## 1 Edinburgh 16-24 0.08227198 0.03687346
## 2 Glasgow 16-24 0.10209532 0.04416598
# Bar charts of the 16-24 age group's Yes and No shares, as a percentage of
# the total population, one bar per city.
# geom_bar(stat = "identity") plots the precomputed values directly;
# geom_histogram() is meant for binning a continuous variable. No dodge
# position is needed because there is no fill/group aesthetic to dodge.
yes <- ggplot(Analysis, aes(x = City, y = Yes * 100)) +
  geom_bar(stat = "identity") +
  labs(title = "% of Age 16-24 Yes per Total")
no <- ggplot(Analysis, aes(x = City, y = No * 100)) +
  geom_bar(stat = "identity") +
  labs(title = "% of Age 16-24 No per Total")
# Lay the two plots out in a single row.
grid.arrange(yes, no, ncol = 2)
# I think this data needs two tables: one for city and another for age group.
# Given the constraint of producing a single table from the given data, I was
# not happy with my work, since I wasn't able to use the mutate() and spread()
# functions the way I wanted. I ran into numerous errors. I definitely need
# more practice.