It should be noted that plyr to be loaded first and then dplyr otherwise it will cause issuess when using the functions of both plyr and dplyr since they have similar named functions
setwd('E:\\SMU Assignements\\Visual Analytics\\TakeHome4')
library(plyr)
library(dplyr)
library(tidyr)
library(plotly)
library(reshape2)
popdf=tbl_df(read.csv("respopagsex2000to2016.csv"))
head(popdf)
## # A tibble: 6 x 6
## PA SZ AG Sex Pop Time
## <fctr> <fctr> <fctr> <fctr> <int> <int>
## 1 Ang Mo Kio Cheng San 0_to_4 Males 810 2000
## 2 Ang Mo Kio Cheng San 0_to_4 Females 700 2000
## 3 Ang Mo Kio Cheng San 5_to_9 Males 1030 2000
## 4 Ang Mo Kio Cheng San 5_to_9 Females 960 2000
## 5 Ang Mo Kio Cheng San 10_to_14 Males 970 2000
## 6 Ang Mo Kio Cheng San 10_to_14 Females 890 2000
summary(popdf)
## PA SZ AG
## Bukit Merah : 10404 Hong Kah : 1008 0_to_4 : 10722
## Queenstown : 9216 Marina East : 1008 10_to_14: 10722
## Downtown Core: 7344 Marymount : 1008 15_to_19: 10722
## Toa Payoh : 7344 Pang Sua : 1008 20_to_24: 10722
## Ang Mo Kio : 6552 Tanjong Pagar: 1008 25_to_29: 10722
## Jurong East : 6120 Trafalgar : 1008 30_to_34: 10722
## (Other) :146016 (Other) :186948 (Other) :128664
## Sex Pop Time
## Females:96498 Min. : 0.0 Min. :2000
## Males :96498 1st Qu.: 0.0 1st Qu.:2004
## Median : 70.0 Median :2008
## Mean : 319.6 Mean :2008
## 3rd Qu.: 390.0 3rd Qu.:2012
## Max. :7370.0 Max. :2016
##
sum(is.na(popdf$Pop))
## [1] 0
sum(is.na(popdf$Time))
## [1] 0
sum(is.na(popdf$Sex))
## [1] 0
sum(is.na(popdf$AG))
## [1] 0
popdf_rename=plyr::rename(popdf,c("PA"="ZONE","SZ"="SUBZONE","Pop"="Population","Time"="Year","Sex"="Gender","AG"="Age"))
head(popdf_rename)
## # A tibble: 6 x 6
## ZONE SUBZONE Age Gender Population Year
## <fctr> <fctr> <fctr> <fctr> <int> <int>
## 1 Ang Mo Kio Cheng San 0_to_4 Males 810 2000
## 2 Ang Mo Kio Cheng San 0_to_4 Females 700 2000
## 3 Ang Mo Kio Cheng San 5_to_9 Males 1030 2000
## 4 Ang Mo Kio Cheng San 5_to_9 Females 960 2000
## 5 Ang Mo Kio Cheng San 10_to_14 Males 970 2000
## 6 Ang Mo Kio Cheng San 10_to_14 Females 890 2000
levels(popdf_rename$Age)
## [1] "0_to_4" "10_to_14" "15_to_19" "20_to_24" "25_to_29"
## [6] "30_to_34" "35_to_39" "40_to_44" "45_to_49" "5_to_9"
## [11] "50_to_54" "55_to_59" "60_to_64" "65_to_69" "70_to_74"
## [16] "75_to_79" "80_to_84" "85_and_over"
f <- list(
family = "Calibri",
size = 18,
color = "#3f3f3f"
)
format_x=function(txt1){
list(
title = txt1,
titlefont = f
)}
format_y=function(txt1){
list(
title = txt1,
titlefont = f
)
}
pop_pyramid=popdf_rename%>%filter(Year==2016)%>%select(Age,Gender,Population)%>%group_by(Age,Gender)%>%dplyr::summarise(Population=sum(Population))%>%mutate(Population=ifelse(Gender %in% "Males",-1*Population,Population))
head(pop_pyramid)
## Source: local data frame [6 x 3]
## Groups: Age [3]
##
## # A tibble: 6 x 3
## Age Gender Population
## <fctr> <fctr> <dbl>
## 1 0_to_4 Females 91540
## 2 0_to_4 Males -95720
## 3 10_to_14 Females 101990
## 4 10_to_14 Males -105630
## 5 15_to_19 Females 116870
## 6 15_to_19 Males -122980
p=plot_ly(pop_pyramid, x = pop_pyramid$Population, y = pop_pyramid$Age, type = 'bar', orientation = 'h',split = ~Gender)%>%layout(title="Pyramid Plot for Population/Age group(Year-2016) ",xaxis=format_x('Population in Thousands'),yaxis=format_x("Age Group"),margin=list(l=200),autosize = T)
p
Economy Active (i.e. age 25-64) Young (i.e. 0-24) and Aged (i.e. 65 and above) After dividing them we created a new column named as AgeGroup and then sorted them as per the year and then converted the Agegroup column as factor level.
popdf_age is the master table
Young=c("0_to_4","10_to_14","15_to_19","20_to_24")
Economy_Active=c("25_to_29","30_to_34","35_to_39","40_to_44","45_to_49","5_to_9","50_to_54","55_to_59","60_to_64")
Aged=c("65_to_69","70_to_74","75_to_79","80_to_84","85_and_over")
popdf_age=mutate(popdf_rename,AgeGroup=ifelse(popdf_rename$Age %in% Young,"Young",ifelse(popdf_rename$Age %in% Economy_Active,"Economy Active","Aged"))) %>%select(-Age)%>%arrange(Year) %>% mutate(AgeGroup=factor(AgeGroup))
head(popdf_age)
## # A tibble: 6 x 6
## ZONE SUBZONE Gender Population Year AgeGroup
## <fctr> <fctr> <fctr> <int> <int> <fctr>
## 1 Ang Mo Kio Cheng San Males 810 2000 Young
## 2 Ang Mo Kio Cheng San Females 700 2000 Young
## 3 Ang Mo Kio Cheng San Males 1030 2000 Economy Active
## 4 Ang Mo Kio Cheng San Females 960 2000 Economy Active
## 5 Ang Mo Kio Cheng San Males 970 2000 Young
## 6 Ang Mo Kio Cheng San Females 890 2000 Young
new=popdf_age%>%arrange(Year) %>% mutate(Year=factor(Year))%>%select(Population,Year,AgeGroup)%>%group_by(Year,AgeGroup)%>%dplyr::summarise(Population=sum(Population))
head(new)
## Source: local data frame [6 x 3]
## Groups: Year [2]
##
## # A tibble: 6 x 3
## Year AgeGroup Population
## <fctr> <fctr> <int>
## 1 2000 Aged 235860
## 2 2000 Economy Active 2154440
## 3 2000 Young 885130
## 4 2001 Aged 244260
## 5 2001 Economy Active 2187770
## 6 2001 Young 896080
p=plot_ly(new,x=new$Year,y=new$Population,type = 'scatter',split = ~new$AgeGroup,mode="Scatter",hoverinfo = 'text',
text = ~paste('Year:', new$Year, '<br>AgeGroup:', new$AgeGroup,'<br>Population:',new$Population))%>%layout(title="Population/Year by Age group(2000-2016) ",xaxis=format_x('Year'),yaxis=format_x('Population in Millions'),margin=list(b=100),autosize = T)
p
popdf_spread=popdf_age%>%dcast(Year+SUBZONE+Gender~AgeGroup,value.var="Population",fun.aggregate=sum)
popdf_spread=mutate(popdf_spread,Old_age_ratio=popdf_spread$`Economy Active`/popdf_spread$Aged)
head(popdf_spread)
## Year SUBZONE Gender Aged Economy Active Young Old_age_ratio
## 1 2000 Admiralty Females 130 1060 380 8.153846
## 2 2000 Admiralty Males 140 1020 380 7.285714
## 3 2000 Airport Females 0 0 0 NaN
## 4 2000 Airport Males 0 0 0 NaN
## 5 2000 Alexandra Hill Females 880 4850 1690 5.511364
## 6 2000 Alexandra Hill Males 850 5070 1800 5.964706
popdf_Year=popdf_age%>%dcast(Year+Gender~AgeGroup,value.var="Population",fun.aggregate=sum)%>%mutate(Year=factor(Year))
popdf_Year=mutate(popdf_Year,Old_age_ratio=round(popdf_Year$`Economy Active`/popdf_Year$Aged,2))
head(popdf_Year)
## Year Gender Aged Economy Active Young Old_age_ratio
## 1 2000 Females 129750 1078810 431200 8.31
## 2 2000 Males 106110 1075630 453930 10.14
## 3 2001 Females 134570 1097390 436370 8.15
## 4 2001 Males 109690 1090380 459710 9.94
## 5 2002 Females 138690 1120880 440200 8.08
## 6 2002 Males 112900 1110430 461710 9.84
library(plotly)
p=plot_ly(popdf_Year,x=popdf_Year$Year,y=popdf_Year$Old_age_ratio,type = 'scatter',split = ~Gender,mode="Scatter",hoverinfo = 'text',
text = ~paste('Year:', Year, '<br>Old Age Ratio:', popdf_Year$Old_age_ratio,'<br>Gender:',Gender)) %>%layout(title="Old Age Ratio Support(Economy Active/Old Age)/Year(2000-2016)",xaxis=format_x('Age Group'),yaxis=format_x('Population in Millions'),margin=list(b=80),autosize = T)
p
popdf_gap=popdf_age%>%dcast(Year~Gender,value.var="Population",fun.aggregate=sum)
popdf_gap=mutate(popdf_gap,Gender_gap=popdf_gap$Females-popdf_gap$Males)%>%filter(Females!=0 & Males!=0)%>%filter(Year %in% c("2000","2002","2004","2006","2008","2010","2012","2016"))%>%mutate(Year=factor(Year))
head(popdf_gap)
## Year Females Males Gender_gap
## 1 2000 1639760 1635670 4090
## 2 2002 1699770 1685040 14730
## 3 2004 1719460 1696020 23440
## 4 2006 1779300 1749260 30040
## 5 2008 1840720 1803940 36780
## 6 2010 1911810 1862160 49650
p = plot_ly(popdf_gap, x = ~Females, y = ~Males, type = 'scatter', mode = 'markers', size = ~Gender_gap, color = ~Year, colors = 'Set1',sizes = c(10, 50),
marker = list(opacity = 0.5,sizemode = 'diameter'),hoverinfo = 'text',
text = ~paste('Year:', Year, '<br>Gender gap:', Gender_gap,'<br>Females:',Females,'<br>Males:',Males)) %>%
layout(title = 'Gender Gap by Year',
xaxis = list(showgrid = TRUE),
yaxis = list(showgrid = TRUE),
showlegend = TRUE)
p
label = function(txt) {
list(
text = txt,
x = 0.1, y = 1,
ax = 0.1, ay = 0,
xref = "paper", yref = "paper",
align = "bottom",
font = list(family = "calibri", size = 15, color = "#3f3f3f"),
bgcolor = "white", bordercolor = "Black", borderwidth = 1
)
}
# reusable function for axis formatting
axis = function(txt) {
list(
title = txt, tickformat = ".0%", tickfont = list(size = 10)
)
}
ternaryAxes = list(
aaxis = axis("Ageing Group(A)"),
baxis = axis("Young Group(B)"),
caxis = axis("Economy Active Group(C)")
)
popdf_ternary_Year=popdf_age%>%dcast(Year~AgeGroup,value.var="Population",fun.aggregate=sum)%>%mutate(Year=factor(Year))%>%filter(Year %in% c("2000","2002","2004","2006","2008","2010","2012","2016"))
popdf_ternary_Year=mutate(popdf_ternary_Year,Old_age_ratio=round(popdf_ternary_Year$`Economy Active`/popdf_ternary_Year$Aged,2))
head(popdf_ternary_Year)
## Year Aged Economy Active Young Old_age_ratio
## 1 2000 235860 2154440 885130 9.13
## 2 2002 251590 2231310 901910 8.87
## 3 2004 265150 2244960 905370 8.47
## 4 2006 295380 2315360 917820 7.84
## 5 2008 316520 2391510 936630 7.56
## 6 2010 339110 2484640 950220 7.33
library(plotly)
p <- plot_ly(
popdf_ternary_Year, a = ~Aged, b = ~Young, c = ~`Economy Active`, text=~paste('Old Age Ratio:',Old_age_ratio),type = "scatterternary",color = ~Year,mode="markers",marker=list( size = ~popdf_ternary_Year$Old_age_ratio,opacity=1),colors = "Accent"
) %>%
layout(title="Ternary plot Old Age Support Ratio/Year(2000-2016) ",
annotations = label("Marker size:Old Age Support Ratio "), ternary = ternaryAxes,margin = 0.05,showlegend=TRUE,autosize=TRUE
)
p
popdf_ternary_subzone=popdf_age%>%dcast(Year+SUBZONE~AgeGroup,value.var="Population",fun.aggregate=sum)%>%filter(Year %in% c('2000','2002','2004','2006','2008','2010','2012','2016'))%>%mutate(Year=factor(Year))
popdf_ternary_subzone=mutate(popdf_ternary_subzone,Old_age_ratio=round(popdf_ternary_subzone$`Economy Active`/popdf_ternary_subzone$Aged,2))%>%subset(Old_age_ratio!="NaN")
popdf_ternary_subzone=dplyr::bind_rows(popdf_ternary_subzone,.id='id')
popdf_ternary_subzone_2000=popdf_ternary_subzone%>%filter(Year %in% '2000')
popdf_ternary_subzone_2004=popdf_ternary_subzone%>%filter(Year %in% '2004')
popdf_ternary_subzone_2008=popdf_ternary_subzone%>%filter(Year %in% '2008')
popdf_ternary_subzone_2016=popdf_ternary_subzone%>%filter(Year %in% '2016')
#
head(popdf_ternary_subzone)
## id Year SUBZONE Aged Economy Active Young Old_age_ratio
## 1 1 2000 Admiralty 270 2080 760 7.70
## 2 1 2000 Alexandra Hill 1730 9920 3490 5.73
## 3 1 2000 Aljunied 4840 29400 10450 6.07
## 4 1 2000 Anak Bukit 1420 13790 5800 9.71
## 5 1 2000 Balestier 3640 20800 7080 5.71
## 6 1 2000 Bangkit 1490 17140 7350 11.50
p = plot_ly(
popdf_ternary_subzone_2000, a = ~Aged, b = ~Young, c = ~`Economy Active`, color = ~SUBZONE, type = "scatterternary",text=~paste('Old Age Ratio:',Old_age_ratio,"<br>Subzone:",popdf_ternary_subzone_2000$SUBZONE),size=popdf_ternary_subzone_2000$Old_age_ratio ,mode="markers",marker=list( opacity=1),colors = "Set1"
) %>%
layout(title="Subzone/Old Age SupportRatio(C/A) for Year 2000 ",
annotations = label("Marker size:Old Age Ratio"), ternary = ternaryAxes,margin = 0.05,showlegend = TRUE
)
#split = ~ZONE, mode = "dot"
p
p = plot_ly(
popdf_ternary_subzone_2004, a = ~Aged, b = ~Young, c = ~`Economy Active`, color = ~SUBZONE, type = "scatterternary",text=~paste('Old Age Ratio:',Old_age_ratio,"<br>Subzone:",popdf_ternary_subzone_2004$SUBZONE),size=popdf_ternary_subzone_2004$Old_age_ratio ,mode="markers",marker=list( opacity=1),colors = "Set1"
) %>%
layout(title="Subzone/Old Age Support Ratio(C/A) for Year 2004 ",
annotations = label("Marker size:Old Age Ratio"), ternary = ternaryAxes,margin = 0.05,showlegend = TRUE
)
#split = ~ZONE, mode = "dot"
p
p = plot_ly(
popdf_ternary_subzone_2008, a = ~Aged, b = ~Young, c = ~`Economy Active`, color = ~SUBZONE, type = "scatterternary",text=~paste('Old Age Ratio:',Old_age_ratio,"<br>Subzone:",popdf_ternary_subzone_2008$SUBZONE),size=popdf_ternary_subzone_2008$Old_age_ratio ,mode="markers",marker=list( opacity=1),colors = "Set1"
) %>%
layout(title="Subzone/Old Age Support Ratio(C/A) for Year 2008 ",
annotations = label("Marker size:Old Age Ratio"), ternary = ternaryAxes,margin = 0.05,showlegend = TRUE
)
#split = ~ZONE, mode = "dot"
p
p = plot_ly(
popdf_ternary_subzone_2016, a = ~Aged, b = ~Young, c = ~`Economy Active`, color = ~SUBZONE, type = "scatterternary",text=~paste('Old Age Ratio:',Old_age_ratio,"<br>Subzone:",popdf_ternary_subzone_2016$SUBZONE),size=popdf_ternary_subzone_2016$Old_age_ratio ,mode="markers",marker=list( opacity=1),colors = "Set1"
) %>%
layout(title="Subzone/Old Age Support Ratio(C/A) for Year 2016 ",
annotations = label("Marker size:Old Age Ratio"), ternary = ternaryAxes,margin = 0.05,showlegend = TRUE
)
#split = ~ZONE, mode = "dot"
p