The aim of this assignment is to explore the population in Singapore by different planning areas and age groups. Through the visualization of the data, we can see the area that most people live in and if there’s the preference for living.
Statistics Singapore: https://www.singstat.gov.sg/find-data/search-by-theme/population/geographic-distribution/latest-data
For analyzing the population in different age group and gender, the population pyramid chart will be provided. Besides, to see the difference among different planning area, the bar chart of the population comparing by planning area would be used to see the difference.
Although the source of the data is credible, there are still some issues when we take a closer look in the data:
There are too many categories in the data and that would make our data more complicated.
For example, there are no people who live at “Central Water Catchment” but it is still on the list. However, we cannot reduce the “planing area” in our own preference, the demonstration of the data would be one challenge.
The order of the age group is incorrect if we solve directly.
In the original dataset, if we want to order by AG (Age Group), the order of “5-9” would order between “40-49” and “50-59”, which doesn’t make sense. Thus, we have to reorder it.
No obvious percentage either by gender or age so it would be hard to tell the difference.
In order to compare the population, it would be easier for us to tell the difference from the percentage than the total number. Since the information is not provided, we have to create the variable.
Sketch
library(corrplot)
packages = c('ggpubr', 'tidyverse','tidyverse','plotly','broom','stringr','viridis','data.table','formattable')
for(p in packages){
if(!require(p, character.only = T)){
install.packages(p)
}
library(p, character.only = T)
}
Import the data.
Change the value of age group to make them in a better format and avoid the problem when order by age group.
df$AG <- df$AG %>%
str_replace_all("0_to_4", "00_to_04")
df$AG <- df$AG %>%
str_replace_all("5_to_9", "05_to_09")
df$AG <- df$AG %>%
str_replace_all("400_to_044", "40_to_44")
Create new column to reduce age group for planning area analysis
Immaturity: People below 20
Early Adulthood: People between 20 to 39
Adulthood: People between 40 to 64
Aged: People above 65
df <- mutate(df, AG_GROUP_NEW = case_when(
AG %in% c("00_to_04","05_to_09","10_to_14","15_to_19") ~ "Immaturity",
AG %in% c("20_to_24","25_to_29","30_to_34","35_to_39") ~ "Early Adulthood",
AG %in% c("40_to_44","45_to_49","50_to_54","55_to_59","60_to_64") ~ "Adulthood",
AG %in% c("65_to_69","70_to_74","75_to_79","80_to_84","85_to_89", "90_and_over") ~ "Aged"))
Select columns needed to use and filter the place with no population.
df2 = df %>% group_by(PA, Sex,AG_GROUP_NEW, AG, Time) %>%
dplyr::summarise(Pop = sum(Pop))
df2 = filter(df2, Pop != 0)
Focus on the stusy of 2019.
df3 <- df2 %>% filter(Time==2019)
Creat new variable to see the percentage in each age group.
df3 <- df3%>%
select(PA, AG, AG_GROUP_NEW, Sex,Pop) %>%
transform(AGPct = percent(ave(Pop, AG, FUN = prop.table)))
df3 <- df3%>%
transform(AG_GROUP_Pct = percent(ave(Pop, AG_GROUP_NEW, FUN = prop.table)))
Build population bar chart by order by planning area.
all<-ggplot(data = df3, mapping = aes(x=reorder(PA, Pop, sum), y = Pop, fill = AG_GROUP_NEW))+
geom_bar(stat="identity")+
theme_classic() +
coord_flip()+#become side bar
#scale_y_continuous(labels=fancy_scientific) +
labs(title = 'Overall Population by Planning Area',
x = "Planning Area",
y = "Population")+
scale_fill_brewer(palette="Set3")+
scale_y_continuous(name="Population", limits=c(0, 320000),labels=function(x) format(x, big.mark = ",", scientific = FALSE)) +
theme(text=element_text(size=12),
axis.text.x = element_text(angle = 0, vjust = 0.9), #fix the words position by angle
plot.title = element_text(hjust = 0.5))
Build population bar chart by planning area and seperate by age group new
df4_1 <- df3 %>% filter(AG_GROUP_NEW=="Immaturity")
df4_2 <- df3 %>% filter(AG_GROUP_NEW=="Early Adulthood")
df4_3 <- df3 %>% filter(AG_GROUP_NEW=="Adulthood")
df4_4 <- df3 %>% filter(AG_GROUP_NEW=="Aged")
group1 <- ggplot(data = df4_1, mapping = aes(x=reorder(PA, Pop,sum), y = Pop, fill = Sex))+
geom_bar(stat="identity")+
theme_classic() +
coord_flip()+ #become side bar
labs(title = 'Immaturity Population in the Planning Area',
x = "Planning Area",
y = "Population")+
theme(text=element_text(size=8),
axis.text.x = element_text(angle = 0, vjust = 0.9), #fix the words position by angle
plot.title = element_text(hjust = 0.5))
group2 <- ggplot(data = df4_2, mapping = aes(x=reorder(PA, Pop, sum), y = Pop, fill = Sex))+
geom_bar(stat="identity")+
theme_classic() +
coord_flip()+ #become side bar
labs(title = 'Early Adulthood Population in the Planning Area',
x = "Planning Area",
y = "Population")+
theme(text=element_text(size=8),
axis.text.x = element_text(angle = 0, vjust = 0.9), #fix the words position by angle
plot.title = element_text(hjust = 0.5))
group3 <- ggplot(data = df4_3, mapping = aes(x=reorder(PA, Pop, sum), y = Pop, fill = Sex))+
geom_bar(stat="identity")+
theme_classic() +
coord_flip()+ #become side bar
labs(title = 'Adulthood Population in the Planning Area',
x = "Planning Area",
y = "Population")+
theme(text=element_text(size=8),
axis.text.x = element_text(angle = 0, vjust = 0.9), #fix the words position by angle
plot.title = element_text(hjust = 0.5))
group4 <- ggplot(data = df4_4, mapping = aes(x=reorder(PA, Pop, sum), y = Pop, fill = Sex))+
geom_bar(stat="identity")+
theme_classic() +
coord_flip()+ #become side bar
labs(title = 'Aged Population in the Planning Area',
x = "Planning Area",
y = "Population")+
theme(text=element_text(size=8),
axis.text.x = element_text(angle = 0, vjust = 0.9), #fix the words position by angle
plot.title = element_text(hjust = 0.5))
Build population pyramid and population proportion chart
POP1 <- ggplot(data=df3) +
geom_bar(aes(AG,Pop,group=AG,fill=Sex), stat = "identity",subset(df3,df3$Sex=="Females")) +
geom_bar(aes(AG,-Pop,group=AG,fill=Sex), stat = "identity",subset(df3,df3$Sex=="Males")) +
coord_flip()+
theme_classic() +
scale_y_continuous(labels = abs,limits=c(-100000, 100000))+
#scale_y_continuous(scale_y_continuous(breaks=seq(-4000000, 4000000, 100000),labels=abs(seq(-4000000, 4000000, #100000))))+
#scale_y_continuous(breaks = seq(-4000000, 4000000, 100000),
# labels = paste0(as.character(c(4:0, 1:4)), "m")) +
labs(title = 'Population Pyramid of Singapore in 2019',
x = "Age Group",
y = "Population")+
theme(plot.title = element_text(hjust = 0.7))
POP2 <- ggplot(data=df3) +
geom_bar(aes(AG,AGPct,group=AG,fill=Sex), stat = "identity",subset(df3,df3$Sex=="Females")) +
geom_bar(aes(AG,-AGPct,group=AG,fill=Sex), stat = "identity",subset(df3,df3$Sex=="Males")) +
labs(title = 'Gender Population Proportion of Singapore in 2019',
x = "Age Group",
y = "Population Proportion")+
scale_y_continuous(breaks = seq(-1, 1, 0.25),
labels = paste0(as.character(c(seq(1,0,-0.25),
seq(0.25,1,0.25)))))+
coord_flip()+
theme_classic() +
theme(plot.title = element_text(hjust = 0.7))
According to the chart about the demographics in Singapore of 2019 above, we can observe three main findings: