The following will be a data visualization to reveal the demographic structure of Singapore population by age cohort and by planning area in 2019. It will be built using ggplot2 and other appropriate R packages.
The data from https://www.singstat.gov.sg/find-data/search-by-theme/population/geographic-distribution/latest-data comes as a 2011-2019 series. Hence, data cleaning and filtering is required to extract only year 2019 data.
The main variables are Type of Dwelling, Gender and Age Group. The challenge lies in categorizing the variables neatly to show noticeable details.
The proposed data design will bind 2 stacked bar charts side by side to portray Age Group percentage by Area Planning and total Population count by Area Planning. Area Planning will be sorted starting with the highest population count. Labels in percentage will be displayed for bars that have enough width size.
The design aims to find pattern between age group and type of dwelling.
packages = c('ggplot2', 'scales', 'DT', 'funModeling', 'gridExtra', 'tidyverse', 'ggthemes', 'dplyr')
for (p in packages){
if(!require(p, character.only = T)){
install.packages(p)
}
library(p, character.only = T)
}
data_raw <- read_csv("Data/SG2011to2019.csv")
data <- data_raw %>%
filter(`Time` == '2019')
data <- data[-7]
Checking for missing values with data health status
status(data)
## variable q_zeros p_zeros q_na p_na q_inf p_inf type unique
## 1 PA 0 0.0000000 0 0 0 0 character 55
## 2 SZ 0 0.0000000 0 0 0 0 character 323
## 3 AG 0 0.0000000 0 0 0 0 character 19
## 4 Sex 0 0.0000000 0 0 0 0 character 2
## 5 TOD 0 0.0000000 0 0 0 0 character 8
## 6 Pop 68193 0.6944863 0 0 0 0 numeric 208
popdata <- data %>%
pivot_wider(names_from = AG, values_from = Pop)
popdata <- popdata[,-c(2:4)]
popdata <- popdata %>%
group_by(PA) %>%
summarise_all(sum)
popdata <- ungroup(popdata)
popdata <- popdata[apply(popdata[,-1], 1, function(x) !all(x==0)),]
Young (0 to 19), Adult (20 to 64) and Old (65 and above)
popdata <- popdata %>% mutate(Young = rowSums(.[2:5]))
popdata <- popdata %>% mutate(Adult = rowSums(.[6:14]))
popdata <- popdata %>% mutate(Old = rowSums(.[15:20]))
popdata <- popdata[,-c(2:20)]
popdata_pct <- popdata
popdata_pct <- popdata_pct %>% mutate(Young_pct = Young / rowSums(.[2:4]) * 100)
popdata_pct <- popdata_pct %>% mutate(Adult_pct = Adult / rowSums(.[2:4]) * 100)
popdata_pct <- popdata_pct %>% mutate(Old_pct = Old / rowSums(.[2:4]) * 100)
popdata_pct <- popdata_pct[,-c(2:4)]
names(popdata_pct)[names(popdata_pct) == "Young_pct"] <- "Young"
names(popdata_pct)[names(popdata_pct) == "Adult_pct"] <- "Adult"
names(popdata_pct)[names(popdata_pct) == "Old_pct"] <- "Old"
popdata <- gather(popdata, AG, Pop, 2:4)
popdata_pct <- gather(popdata_pct, AG, Pop, 2:4)
popdata_tod <- data %>%
pivot_wider(names_from = AG, values_from = Pop)
popdata_tod <- popdata_tod[,-c(2:3)]
popdata_tod <- popdata_tod %>%
group_by(PA, TOD) %>%
summarise_all(sum)
popdata_tod <- ungroup(popdata_tod)
popdata_tod <- popdata_tod %>% mutate(Pop = rowSums(.[3:21]))
popdata_tod <- popdata_tod[,-c(3:21)]
Binning into 5 categories of ‘HDB1-2RM’, ‘HDB3-4RM’, ‘HDB5RM_EC_HUDC’, ‘Condo’, ‘Landed_Others’ (including Others)
popdata_tod_filtered <- popdata_tod %>%
pivot_wider(names_from = TOD, values_from = Pop)
popdata_tod_filtered <- popdata_tod_filtered %>%
group_by(PA) %>%
summarise_all(sum)
popdata_tod_filtered <- ungroup(popdata_tod_filtered)
popdata_tod_filtered <- popdata_tod_filtered %>% rename(Condo = `Condominiums and Other Apartments`)
popdata_tod_filtered <- popdata_tod_filtered %>% rename(`HDB1-2RM` = `HDB 1- and 2-Room Flats`)
popdata_tod_filtered <- popdata_tod_filtered %>% mutate(`HDB3-4RM` = rowSums(.[4:5]))
popdata_tod_filtered <- popdata_tod_filtered %>% mutate(`HDB5RM_EC_HUDC` = rowSums(.[6:7]))
popdata_tod_filtered <- popdata_tod_filtered %>% mutate(`Landed_Others` = rowSums(.[8:9]))
popdata_tod_filtered <- popdata_tod_filtered[,-c(4:9)]
popdata_tod_filtered <- popdata_tod_filtered[apply(popdata_tod_filtered[,-1], 1, function(x) !all(x==0)),]
popdata_tod_filtered <- gather(popdata_tod_filtered, TOD, Pop, 2:6)
status <- factor(reorder(popdata_tod_filtered$PA, popdata_tod_filtered$Pop, sum), ordered = TRUE)
status <- fct_rev(status)
popdata_pct <-popdata_pct[order(factor(popdata_pct$PA, levels=unique(levels(status)))),]
# Sum the total population of TOD
totals_tod <- popdata_tod_filtered %>%
group_by(PA) %>%
summarize(total = sum(Pop))
# Reorder TOD via factors
popdata_tod_filtered$TOD <- factor(popdata_tod_filtered$TOD, levels = c("HDB1-2RM", "HDB3-4RM", "HDB5RM_EC_HUDC", "Condo", "Landed_Others"))
popdata_tod_filtered <- popdata_tod_filtered %>% group_by(PA, TOD) %>%
summarise(Pop = sum(Pop)) %>%
mutate( pos = cumsum(Pop)-0.5*Pop) %>%
mutate( pct = Pop/sum(Pop)*100)
# Reorder Age Group via factors
popdata_pct$AG <- factor(popdata_pct$AG, levels = c("Young", "Adult", "Old"))
popdata_pct <- popdata_pct %>% group_by(PA) %>%
mutate( pos = cumsum(Pop)-0.5*Pop)
popdata_pct$PA <- factor(popdata_pct$PA, levels=unique(levels(status)))
popdata_pct$PA <- fct_rev(popdata_pct$PA)
popdata_pct$AG <- fct_rev(popdata_pct$AG)
# Plotting % AG of population barchart for the left side, labels will be displayed for % if above 20%
left <- ggplot(popdata_pct) +
geom_bar(width = 0.6, stat="identity", colour="#FF9999") + aes(x = PA, y = Pop, label = Pop, fill = AG) +
theme(axis.text.y = element_text(size=10), axis.title.y = element_text(size=20), axis.title.x = element_text(size=15), legend.position="bottom", legend.key.width = unit(0.3, "cm")) +
coord_flip() +
scale_y_continuous(name="Age Group %", limits=c(0, 101), labels=function(x) format(x, big.mark = ",", scientific = FALSE)) +
scale_x_discrete(name="Planning Area") +
geom_text(aes(label = ifelse(Pop>20, paste0(as.integer(Pop),"%"), NA)), y = popdata_pct$pos, color="black", size = 3) + scale_fill_brewer(palette="RdYlGn")
# Plotting TOD of population barchart for the right side, labels will be displayed for TOD above 15,000 population count
right <- ggplot(popdata_tod_filtered) + geom_bar(width = 0.6, stat="identity", colour="#FF9999") +
aes(x = reorder(PA, Pop, sum), y = Pop, fill = forcats::fct_rev(TOD)) +
theme(axis.ticks.y=element_blank(), axis.text.y=element_blank(), axis.title.y=element_blank(), axis.title.x = element_text(size=15), legend.position="bottom") +
geom_text(aes(PA, total, label = total, fill = NULL), size = 3.3, hjust = -0.3, data = totals_tod) +
geom_text(aes(PA, Pop, label = ifelse(Pop > 15000, paste0(as.integer(pct),"%"), "")), y=popdata_tod_filtered$pos, size = 3, colour="black") +
coord_flip() +
scale_y_continuous(name="Population", limits=c(0, 300000), labels=function(x) format(x, big.mark = ",", scientific = FALSE)) +
scale_x_discrete(name="Planning Area") +
scale_fill_brewer(palette="Spectral") + guides(fill=guide_legend(title="TOD"))
# Add 2 rows of grid to combine the left and right barcharts
#grid.arrange(left, right, nrow=1, widths=c(1, 2))
The data visualization binds 2 barcharts side by side, to portray % of Age Group and the Population count of residents, sharing the same y axis for Planning Area.
On the left, the 3 age groups are displayed as total percentage by Planning Area, sorted in descending total population count. Labels are only be displayed for values above 20%.
On the right, the population count are categorized into 5 groups for Type of Dwelling and displayed by Planning Area, sorted as per the left barchart. Labels for the percentage by TOD groups are only displayed for values above 15,000 population count.
Young age group represents age 0 to 19, Adult represents age 20 to 64 and Old represents age 65 and above.
Bukit Timah has the highest proportion of population staying in private properties with 47% Condo (36840) and 43% Landed and Others (33710). On the other hand, Bedok has the highest count of population residing in private properties with 54430 (19%) in Condo and 46950 (16%) in Landed and Others (33710). It is also worth noticing that Bukit Timah has a relatively high percentage of Young age group (22%) residing in these private establishments.
There is an even distribution, between 60% to 68%, of Adult age groups within areas with more than 70,000 residents. Punggol (29%), Sengkang (25%) and Sembawang (24%) are the top 3 areas with most proportion of Young age groups. It signifies that these 3 areas are hot zones for BTO projects and most of the BTOs are 3 room to 4 room HDB.