Reading the data

It should be noted that plyr to be loaded first and then dplyr otherwise it will cause issuess when using the functions of both plyr and dplyr since they have similar named functions

setwd('E:\\SMU Assignements\\Visual Analytics\\TakeHome4')
library(plyr)
library(dplyr)
library(tidyr)
library(plotly)
library(reshape2)
popdf=tbl_df(read.csv("respopagsex2000to2016.csv"))

head(popdf)
## # A tibble: 6 x 6
##           PA        SZ       AG     Sex   Pop  Time
##       <fctr>    <fctr>   <fctr>  <fctr> <int> <int>
## 1 Ang Mo Kio Cheng San   0_to_4   Males   810  2000
## 2 Ang Mo Kio Cheng San   0_to_4 Females   700  2000
## 3 Ang Mo Kio Cheng San   5_to_9   Males  1030  2000
## 4 Ang Mo Kio Cheng San   5_to_9 Females   960  2000
## 5 Ang Mo Kio Cheng San 10_to_14   Males   970  2000
## 6 Ang Mo Kio Cheng San 10_to_14 Females   890  2000

let us look into the summary stastics of the data

summary(popdf)
##              PA                     SZ                AG        
##  Bukit Merah  : 10404   Hong Kah     :  1008   0_to_4  : 10722  
##  Queenstown   :  9216   Marina East  :  1008   10_to_14: 10722  
##  Downtown Core:  7344   Marymount    :  1008   15_to_19: 10722  
##  Toa Payoh    :  7344   Pang Sua     :  1008   20_to_24: 10722  
##  Ang Mo Kio   :  6552   Tanjong Pagar:  1008   25_to_29: 10722  
##  Jurong East  :  6120   Trafalgar    :  1008   30_to_34: 10722  
##  (Other)      :146016   (Other)      :186948   (Other) :128664  
##       Sex             Pop              Time     
##  Females:96498   Min.   :   0.0   Min.   :2000  
##  Males  :96498   1st Qu.:   0.0   1st Qu.:2004  
##                  Median :  70.0   Median :2008  
##                  Mean   : 319.6   Mean   :2008  
##                  3rd Qu.: 390.0   3rd Qu.:2012  
##                  Max.   :7370.0   Max.   :2016  
## 

check for missing values in columns

sum(is.na(popdf$Pop))
## [1] 0
sum(is.na(popdf$Time))
## [1] 0
sum(is.na(popdf$Sex))
## [1] 0
sum(is.na(popdf$AG))
## [1] 0

Rename the columns to proper names.Here we need to be careful when using rename since dplyr and plyr both have rename as a function and hence you can get sometime error.

popdf_rename=plyr::rename(popdf,c("PA"="ZONE","SZ"="SUBZONE","Pop"="Population","Time"="Year","Sex"="Gender","AG"="Age"))

head(popdf_rename)
## # A tibble: 6 x 6
##         ZONE   SUBZONE      Age  Gender Population  Year
##       <fctr>    <fctr>   <fctr>  <fctr>      <int> <int>
## 1 Ang Mo Kio Cheng San   0_to_4   Males        810  2000
## 2 Ang Mo Kio Cheng San   0_to_4 Females        700  2000
## 3 Ang Mo Kio Cheng San   5_to_9   Males       1030  2000
## 4 Ang Mo Kio Cheng San   5_to_9 Females        960  2000
## 5 Ang Mo Kio Cheng San 10_to_14   Males        970  2000
## 6 Ang Mo Kio Cheng San 10_to_14 Females        890  2000

check the levels

levels(popdf_rename$Age)
##  [1] "0_to_4"      "10_to_14"    "15_to_19"    "20_to_24"    "25_to_29"   
##  [6] "30_to_34"    "35_to_39"    "40_to_44"    "45_to_49"    "5_to_9"     
## [11] "50_to_54"    "55_to_59"    "60_to_64"    "65_to_69"    "70_to_74"   
## [16] "75_to_79"    "80_to_84"    "85_and_over"

Creating a common function for formating axis and labels

f <- list(
  family = "Calibri",
  size = 18,
  color = "#3f3f3f"
)

format_x=function(txt1){
 list(
  title = txt1,
  titlefont = f
)}

format_y=function(txt1){
 list(
  title = txt1,
  titlefont = f
)
}

Creating Population Pyramid. Here you have to use dplyr quote for summarising since plyr also has the same function name which works differently

pop_pyramid=popdf_rename%>%filter(Year==2016)%>%select(Age,Gender,Population)%>%group_by(Age,Gender)%>%dplyr::summarise(Population=sum(Population))%>%mutate(Population=ifelse(Gender %in% "Males",-1*Population,Population))
head(pop_pyramid)
## Source: local data frame [6 x 3]
## Groups: Age [3]
## 
## # A tibble: 6 x 3
##        Age  Gender Population
##     <fctr>  <fctr>      <dbl>
## 1   0_to_4 Females      91540
## 2   0_to_4   Males     -95720
## 3 10_to_14 Females     101990
## 4 10_to_14   Males    -105630
## 5 15_to_19 Females     116870
## 6 15_to_19   Males    -122980
p=plot_ly(pop_pyramid, x = pop_pyramid$Population, y = pop_pyramid$Age,  type = 'bar', orientation = 'h',split = ~Gender)%>%layout(title="Pyramid Plot for Population/Age group(Year-2016) ",xaxis=format_x('Population in Thousands'),yaxis=format_x("Age Group"),margin=list(l=200),autosize = T)
p

Let us divide the age group into three segments i.e.

Economy Active (i.e. age 25-64) Young (i.e. 0-24) and Aged (i.e. 65 and above) After dividing them we created a new column named as AgeGroup and then sorted them as per the year and then converted the Agegroup column as factor level.

popdf_age is the master table

Young=c("0_to_4","10_to_14","15_to_19","20_to_24")
Economy_Active=c("25_to_29","30_to_34","35_to_39","40_to_44","45_to_49","5_to_9","50_to_54","55_to_59","60_to_64")
Aged=c("65_to_69","70_to_74","75_to_79","80_to_84","85_and_over")

popdf_age=mutate(popdf_rename,AgeGroup=ifelse(popdf_rename$Age %in% Young,"Young",ifelse(popdf_rename$Age %in% Economy_Active,"Economy Active","Aged"))) %>%select(-Age)%>%arrange(Year) %>% mutate(AgeGroup=factor(AgeGroup))

head(popdf_age)
## # A tibble: 6 x 6
##         ZONE   SUBZONE  Gender Population  Year       AgeGroup
##       <fctr>    <fctr>  <fctr>      <int> <int>         <fctr>
## 1 Ang Mo Kio Cheng San   Males        810  2000          Young
## 2 Ang Mo Kio Cheng San Females        700  2000          Young
## 3 Ang Mo Kio Cheng San   Males       1030  2000 Economy Active
## 4 Ang Mo Kio Cheng San Females        960  2000 Economy Active
## 5 Ang Mo Kio Cheng San   Males        970  2000          Young
## 6 Ang Mo Kio Cheng San Females        890  2000          Young

We prepared the data here for ploting the relation between population of different age group throughout different years.

new=popdf_age%>%arrange(Year) %>% mutate(Year=factor(Year))%>%select(Population,Year,AgeGroup)%>%group_by(Year,AgeGroup)%>%dplyr::summarise(Population=sum(Population))

head(new)
## Source: local data frame [6 x 3]
## Groups: Year [2]
## 
## # A tibble: 6 x 3
##     Year       AgeGroup Population
##   <fctr>         <fctr>      <int>
## 1   2000           Aged     235860
## 2   2000 Economy Active    2154440
## 3   2000          Young     885130
## 4   2001           Aged     244260
## 5   2001 Economy Active    2187770
## 6   2001          Young     896080
p=plot_ly(new,x=new$Year,y=new$Population,type = 'scatter',split = ~new$AgeGroup,mode="Scatter",hoverinfo = 'text',
        text = ~paste('Year:', new$Year, '<br>AgeGroup:', new$AgeGroup,'<br>Population:',new$Population))%>%layout(title="Population/Year by Age group(2000-2016) ",xaxis=format_x('Year'),yaxis=format_x('Population in Millions'),margin=list(b=100),autosize = T)
p

Now our aim is to divide the Age Group into different categories as per their population.

popdf_spread=popdf_age%>%dcast(Year+SUBZONE+Gender~AgeGroup,value.var="Population",fun.aggregate=sum)

popdf_spread=mutate(popdf_spread,Old_age_ratio=popdf_spread$`Economy Active`/popdf_spread$Aged)

head(popdf_spread)
##   Year        SUBZONE  Gender Aged Economy Active Young Old_age_ratio
## 1 2000      Admiralty Females  130           1060   380      8.153846
## 2 2000      Admiralty   Males  140           1020   380      7.285714
## 3 2000        Airport Females    0              0     0           NaN
## 4 2000        Airport   Males    0              0     0           NaN
## 5 2000 Alexandra Hill Females  880           4850  1690      5.511364
## 6 2000 Alexandra Hill   Males  850           5070  1800      5.964706

The popdf_year will be used to find out the relationship between old age ratio and year

popdf_Year=popdf_age%>%dcast(Year+Gender~AgeGroup,value.var="Population",fun.aggregate=sum)%>%mutate(Year=factor(Year))

popdf_Year=mutate(popdf_Year,Old_age_ratio=round(popdf_Year$`Economy Active`/popdf_Year$Aged,2))

head(popdf_Year)
##   Year  Gender   Aged Economy Active  Young Old_age_ratio
## 1 2000 Females 129750        1078810 431200          8.31
## 2 2000   Males 106110        1075630 453930         10.14
## 3 2001 Females 134570        1097390 436370          8.15
## 4 2001   Males 109690        1090380 459710          9.94
## 5 2002 Females 138690        1120880 440200          8.08
## 6 2002   Males 112900        1110430 461710          9.84

Finding out the relation between Old age ratio and the year from the dataframe popdf_year

library(plotly)
p=plot_ly(popdf_Year,x=popdf_Year$Year,y=popdf_Year$Old_age_ratio,type = 'scatter',split = ~Gender,mode="Scatter",hoverinfo = 'text',
        text = ~paste('Year:', Year, '<br>Old Age Ratio:', popdf_Year$Old_age_ratio,'<br>Gender:',Gender)) %>%layout(title="Old Age Ratio Support(Economy Active/Old Age)/Year(2000-2016)",xaxis=format_x('Age Group'),yaxis=format_x('Population in Millions'),margin=list(b=80),autosize = T)
p

Now let us see the gender gap between male and female population

popdf_gap=popdf_age%>%dcast(Year~Gender,value.var="Population",fun.aggregate=sum)

popdf_gap=mutate(popdf_gap,Gender_gap=popdf_gap$Females-popdf_gap$Males)%>%filter(Females!=0  & Males!=0)%>%filter(Year %in% c("2000","2002","2004","2006","2008","2010","2012","2016"))%>%mutate(Year=factor(Year))

head(popdf_gap)
##   Year Females   Males Gender_gap
## 1 2000 1639760 1635670       4090
## 2 2002 1699770 1685040      14730
## 3 2004 1719460 1696020      23440
## 4 2006 1779300 1749260      30040
## 5 2008 1840720 1803940      36780
## 6 2010 1911810 1862160      49650
p = plot_ly(popdf_gap, x = ~Females, y = ~Males, type = 'scatter', mode = 'markers', size = ~Gender_gap, color = ~Year, colors = 'Set1',sizes = c(10, 50),
        marker = list(opacity = 0.5,sizemode = 'diameter'),hoverinfo = 'text',
        text = ~paste('Year:', Year, '<br>Gender gap:', Gender_gap,'<br>Females:',Females,'<br>Males:',Males)) %>%
  layout(title = 'Gender Gap by Year',
         xaxis = list(showgrid = TRUE),
         yaxis = list(showgrid = TRUE),
         showlegend = TRUE)

p

A common function is written for the ternary plot for visulaization.

label = function(txt) {
  list(
    text = txt,
    x = 0.1, y = 1,
    ax = 0.1, ay = 0,
    xref = "paper", yref = "paper",
    align = "bottom",
    font = list(family = "calibri", size = 15, color = "#3f3f3f"),
    bgcolor = "white", bordercolor = "Black", borderwidth = 1
    
  )
}


# reusable function for axis formatting
axis = function(txt) {
  list(
    title = txt, tickformat = ".0%", tickfont = list(size = 10)
  )
}

ternaryAxes = list(
  aaxis = axis("Ageing Group(A)"),
  baxis = axis("Young Group(B)"),
  caxis = axis("Economy Active Group(C)")
)

Ternary plot is constructed to see the population spread during different years

popdf_ternary_Year=popdf_age%>%dcast(Year~AgeGroup,value.var="Population",fun.aggregate=sum)%>%mutate(Year=factor(Year))%>%filter(Year %in% c("2000","2002","2004","2006","2008","2010","2012","2016"))

popdf_ternary_Year=mutate(popdf_ternary_Year,Old_age_ratio=round(popdf_ternary_Year$`Economy Active`/popdf_ternary_Year$Aged,2))

head(popdf_ternary_Year)
##   Year   Aged Economy Active  Young Old_age_ratio
## 1 2000 235860        2154440 885130          9.13
## 2 2002 251590        2231310 901910          8.87
## 3 2004 265150        2244960 905370          8.47
## 4 2006 295380        2315360 917820          7.84
## 5 2008 316520        2391510 936630          7.56
## 6 2010 339110        2484640 950220          7.33
library(plotly)


p <- plot_ly(
  popdf_ternary_Year, a = ~Aged, b = ~Young, c = ~`Economy Active`,  text=~paste('Old Age Ratio:',Old_age_ratio),type = "scatterternary",color = ~Year,mode="markers",marker=list( size = ~popdf_ternary_Year$Old_age_ratio,opacity=1),colors = "Accent"
  ) %>%
  layout(title="Ternary plot Old Age Support Ratio/Year(2000-2016) ",
    annotations = label("Marker size:Old Age Support Ratio "), ternary = ternaryAxes,margin = 0.05,showlegend=TRUE,autosize=TRUE
  ) 

p

popdf_ternary_subzone is constructed to plot the ternary plot in order to see the population trend at different planning areas for the years 2000,2004,2008,2016

popdf_ternary_subzone=popdf_age%>%dcast(Year+SUBZONE~AgeGroup,value.var="Population",fun.aggregate=sum)%>%filter(Year %in% c('2000','2002','2004','2006','2008','2010','2012','2016'))%>%mutate(Year=factor(Year))

popdf_ternary_subzone=mutate(popdf_ternary_subzone,Old_age_ratio=round(popdf_ternary_subzone$`Economy Active`/popdf_ternary_subzone$Aged,2))%>%subset(Old_age_ratio!="NaN")

popdf_ternary_subzone=dplyr::bind_rows(popdf_ternary_subzone,.id='id')
popdf_ternary_subzone_2000=popdf_ternary_subzone%>%filter(Year %in% '2000')
popdf_ternary_subzone_2004=popdf_ternary_subzone%>%filter(Year %in% '2004')
popdf_ternary_subzone_2008=popdf_ternary_subzone%>%filter(Year %in% '2008')
popdf_ternary_subzone_2016=popdf_ternary_subzone%>%filter(Year %in% '2016')


#
head(popdf_ternary_subzone)
##   id Year        SUBZONE Aged Economy Active Young Old_age_ratio
## 1  1 2000      Admiralty  270           2080   760          7.70
## 2  1 2000 Alexandra Hill 1730           9920  3490          5.73
## 3  1 2000       Aljunied 4840          29400 10450          6.07
## 4  1 2000     Anak Bukit 1420          13790  5800          9.71
## 5  1 2000      Balestier 3640          20800  7080          5.71
## 6  1 2000        Bangkit 1490          17140  7350         11.50
p = plot_ly(
  popdf_ternary_subzone_2000, a = ~Aged, b = ~Young, c = ~`Economy Active`, color = ~SUBZONE, type = "scatterternary",text=~paste('Old Age Ratio:',Old_age_ratio,"<br>Subzone:",popdf_ternary_subzone_2000$SUBZONE),size=popdf_ternary_subzone_2000$Old_age_ratio ,mode="markers",marker=list( opacity=1),colors = "Set1"
) %>%
  layout(title="Subzone/Old Age SupportRatio(C/A) for Year 2000 ",
    annotations = label("Marker size:Old Age Ratio"), ternary = ternaryAxes,margin = 0.05,showlegend = TRUE
  ) 

#split = ~ZONE, mode = "dot"
p
p = plot_ly(
  popdf_ternary_subzone_2004, a = ~Aged, b = ~Young, c = ~`Economy Active`, color = ~SUBZONE, type = "scatterternary",text=~paste('Old Age Ratio:',Old_age_ratio,"<br>Subzone:",popdf_ternary_subzone_2004$SUBZONE),size=popdf_ternary_subzone_2004$Old_age_ratio ,mode="markers",marker=list( opacity=1),colors = "Set1"
) %>%
  layout(title="Subzone/Old Age Support Ratio(C/A)  for Year 2004 ",
    annotations = label("Marker size:Old Age Ratio"), ternary = ternaryAxes,margin = 0.05,showlegend = TRUE
  ) 

#split = ~ZONE, mode = "dot"
p
p = plot_ly(
  popdf_ternary_subzone_2008, a = ~Aged, b = ~Young, c = ~`Economy Active`, color = ~SUBZONE, type = "scatterternary",text=~paste('Old Age Ratio:',Old_age_ratio,"<br>Subzone:",popdf_ternary_subzone_2008$SUBZONE),size=popdf_ternary_subzone_2008$Old_age_ratio ,mode="markers",marker=list( opacity=1),colors = "Set1"
) %>%
  layout(title="Subzone/Old Age Support Ratio(C/A)  for Year 2008 ",
    annotations = label("Marker size:Old Age Ratio"), ternary = ternaryAxes,margin = 0.05,showlegend = TRUE
  )

#split = ~ZONE, mode = "dot"
p
p = plot_ly(
  popdf_ternary_subzone_2016, a = ~Aged, b = ~Young, c = ~`Economy Active`, color = ~SUBZONE, type = "scatterternary",text=~paste('Old Age Ratio:',Old_age_ratio,"<br>Subzone:",popdf_ternary_subzone_2016$SUBZONE),size=popdf_ternary_subzone_2016$Old_age_ratio ,mode="markers",marker=list( opacity=1),colors = "Set1"
) %>%
  layout(title="Subzone/Old Age Support Ratio(C/A)  for Year 2016 ",
    annotations = label("Marker size:Old Age Ratio"), ternary = ternaryAxes,margin = 0.05,showlegend = TRUE
  ) 

#split = ~ZONE, mode = "dot"
p