Data preparation
- get selected variables: PA, AG, Sex, Pop, Time
table1 = read.csv("respopagesextod2000to2010.csv")
table2 = read.csv("respopagesextod2011to2019.csv")
table1 = table1[,c(1,3,4,6,7)]
table2 = table2[,c(1,3,4,6,7)]
- sum population by PA,AG,SEX,POP to reduce records
table1 <- table1 %>%
group_by(PA,AG,Time)%>%
summarise(pop=sum(Pop))
table2 <- table2 %>%
group_by(PA,AG,Time)%>%
summarise(pop=sum(Pop))
- concatenate two tables
data <- union_all(table1,table2)
glimpse(data)
## Observations: 20,900
## Variables: 4
## Groups: PA, AG [1,083]
## $ PA <chr> "Ang Mo Kio", "Ang Mo Kio", "Ang Mo Kio", "Ang Mo Kio", "Ang M...
## $ AG <fct> 0_to_4, 0_to_4, 0_to_4, 0_to_4, 0_to_4, 0_to_4, 0_to_4, 0_to_4...
## $ Time <int> 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 20...
## $ pop <int> 9290, 8950, 8690, 8370, 7980, 7690, 7490, 7590, 7830, 8310, 80...
- get the total population within each year, each year
age_total <- data %>%
group_by(PA,Time)%>%
summarise(pop=sum(pop))
- calculate the percentage of population within each planning area and year
df = merge(x = data, y = age_total, by = c("PA","Time"), all.x = TRUE)
df$Proportion = df$pop.x/df$pop.y*100
- sort age groups from young to elder
df$AG = factor(df$AG)
levels(df$AG) <- c("0_to_4","5_to_9","10_to_14","15_to_19","20_to_24",
"25_to_29","30_to_34","35_to_39","40_to_44","45_to_49",
"50_to_54","55_to_59","60_to_64","65_to_69","70_to_74",
"75_to_79","80_to_84","85_to_89","90_and_over")
Build the visualization
# Rename axis and remove grid lines
x <- list(
title = "Planning Area",
showgrid = F
)
y <- list(
title = "Age group",
showgrid = F
)
df[df$Time==2019,] %>%
plot_ly(z=~Proportion, y=~AG, x=~PA, type = "heatmap", colors = "Blues",
hovertemplate = paste('<b>Planning area: </br> %{x}',
'<br><b>Age group</b>: %{y}<b>',
'<br><b>AG%</b>: %{z:.2f}%<br>')) %>%
layout(title = "Age structure of Singapore population by planning area in 2019",
width=1200,
height=800,
xaxis=x,
yaxis=y
)