IS428 Assignment 4

Author: Qiu Jun Wei

Date: 17 March 2021

1. Major Data and Design Challenges Faced.

1.1 Geographical coordinates.

My initial idea was to create a Singapore map representing the population in each planning area. However, the dataset gotten from singstat does not include the geographical coordinates.


1.2 Unavailability of Region data

Another idea was to create a area chart presenting the population in each region by different age group. However, the dataset does not have region data and it had to be included manually.

2. How to Overcome Challenges.

2.1 Data Sourcing.

To get the data i need stated above, I went to search through government data sites and downloaded the required datasets.

I managed to join the region data using vlookup in excel as it was just 1 additional column of data. As for the geographical coordinates, I’ve decided to join using R functions as it would be simpler that way.

2.2 Solution Sketch

Solution Sketch

Solution Sketch

3. Step by Step Guide for Data visualization

3.1 Import Packages & Data

library(tidyverse)
library(sf)
library(tmap)
library(DT)

data = read_csv("data/2011_2020data.csv")
#get geospatial data
mpsz <- st_read(dsn = "data/geospatial",
                layer = "MP14_SUBZONE_WEB_PL")
## Reading layer `MP14_SUBZONE_WEB_PL' from data source `C:\Users\Junwe\Desktop\4.2\VA\assignment4\data\geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 323 features and 15 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
## projected CRS:  SVY21

3.2 Data Preprocessing

#turn age group into sortable factors
data$AG <- factor(data$AG,levels = c("0_to_4",
                                                           "5_to_9",
                                                           "10_to_14",
                                                           "15_to_19",
                                                           "20_to_24",
                                                           "25_to_29",
                                                           "30_to_34",
                                                           "35_to_39",
                                                           "40_to_44",
                                                           "45_to_49",
                                                           "50_to_54",
                                                           "55_to_59",
                                                           "60_to_64",
                                                           "65_to_69",
                                                           "70_to_74",
                                                           "75_to_79",
                                                           "80_to_84",
                                                           "85_to_89",
                                                           "90_and_over"))
#Dataset for 2020
data2020 = data %>% 
  filter(Time == 2020)


#join mapdata
for_map = data2020 %>%
  select(PA,Pop) %>%
  mutate(PA = toupper(PA)) %>%
  group_by(PA) %>%
  summarise('Pop' = sum(Pop)/1000)
data2020_map <- left_join( mpsz,for_map,
                              by = c( "PLN_AREA_N" = "PA" ))

3.3 Distribution of Poulation by Planning Area 2020

Pop_by_PA <- tm_shape(data2020_map)+
  tm_fill("Pop",
          palette = "Blues", 
          legend.hist = TRUE,
          legend.is.portrait = TRUE,
          legend.hist.z = 0.1,
          title = "Count")+
  tm_layout(main.title = "Distribution of Population Planning Area 2020 (Thousands)",
            legend.height = 0.45, 
            legend.width = 0.35,
            legend.outside = FALSE,
            legend.position = c("right", "bottom"),
            frame = FALSE)


Pop_by_PA

3.4 Population by Age 2020

pop_by_age_data <- data2020 %>%
  select(AG,Pop) %>%
  group_by(AG) %>%
  summarise('Pop' = sum(Pop)/1000) %>%
  arrange(AG)

pop_by_age <- ggplot(pop_by_age_data, aes(x = AG, y = Pop)) +
  geom_bar(stat='identity',fill="seagreen")  +
  labs(x = "Age Group", y = "Population(per thousand)", title = "Population by Age Group 2020") +
  theme(plot.title = element_text(hjust = 0.5),
      axis.text.x = element_text(angle = 45))

pop_by_age_sex_data <- data2020  %>%
    select(AG,Sex,Pop) %>% 
    group_by(AG,Sex) %>%
    summarise('Pop' = sum(Pop)/1000) %>% 
    mutate(Pop = if_else(Sex == "Males", -Pop, Pop))


females_pop <-
  pop_by_age_sex_data %>% 
  filter(Sex == "Females") %>% 
  arrange(AG)

the_order <- females_pop$AG

pop_by_age_sex <- ggplot(pop_by_age_sex_data, aes(x = AG, y = Pop, fill = Sex)) + 
  geom_bar(stat='identity')  +
  coord_flip() +
  scale_x_discrete(limits = the_order) +
  scale_y_continuous(breaks = seq(-200, 200, 50), 
                     labels = abs(seq(-200, 200, 50))) +
  labs(x = "Age Group", y = "Population(per thousand)", title = "Population by Age by Gender 2020") +
  theme(legend.position = "bottom",
        legend.title = element_blank(),
        plot.title = element_text(hjust = 0.5),
        panel.background = element_rect(fill =  "grey90")) +
  scale_fill_manual(values=c( "deepskyblue1","hotpink1"),
                    name="",
                    breaks=c("Males", "Females"),
                    labels=c("Males", "Female")) 

  
pop_by_age_sex
pop_by_age

3.5 Population Planning Area and Type Of Dwelling 2020

pop_by_TOD_PA_data <- data %>% 
  select(PA,TOD,Pop) %>% 
  group_by(PA,TOD) %>% 
  summarise("Pop" = sum(Pop)) %>% 
   ungroup()

 
pop_by_TOD_PA_chart <- ggplot(pop_by_TOD_PA_data,aes(TOD, PA, fill= Pop)) + 
      geom_tile() +
      scale_fill_distiller(palette = "RdYlBu") +
      theme(axis.text.x = element_text(angle = 90)) +
      labs(title="Population Heatmap by Type of Dwelling and Planning Area 2020",
           x ="Type of Dwelling", y = "Planning Area")+
      theme(plot.title = element_text(hjust = 0.5))

pop_by_TOD_PA_chart

3.6 Population Distribution by Region and Age 2020

by_age_region <- data2020 %>% 
  select(AG,Region,Pop) %>% 
  group_by(AG,Region) %>% 
  summarise("Pop" = sum(Pop)/1000)

by_age_region_chart <- ggplot(by_age_region, aes(x=as.numeric(AG),y=Pop,fill=Region )) +
  stat_smooth(
        geom = 'area', method = 'loess', span = 1/3,
        alpha = 1/2,position = 'identity' )+
  scale_fill_brewer(palette="Spectral") +
  labs(x = "Age Group", y = "Population(per thousand)", title = "Population by Region and Age 2020")+
  theme_classic() + 
    theme(axis.text.x = element_blank(),
          axis.ticks = element_blank(),
plot.title = element_text(hjust = 0.5))
by_age_region_chart

3.7 Ternary Chart

library(ggtern)
#assuming age 0 to 19
data_children <- data2020 %>%
    filter(AG %in% c('0_to_4','5_to_9','10_to_14','15_to_19')) %>%
    group_by(SZ, Region) %>%
    summarise('Children' = sum(Pop))

#assuming age 20 to 59
data_adult <- data2020 %>% filter(AG %in% c('20_to_24', '25_to_29','30_to_34',
                                            '35_to_39','40_to_44','45_to_49',
                                            '50_to_54','55_to_59')) %>%
    group_by(SZ, Region) %>%
    summarise('Adult' = sum(Pop))

#assuming age 60 and above, according to Senior Citizen Act
data_elderly <- data2020 %>% filter(AG %in% c('60_to_64', '65_to_69','70_to_74',
                                              '75_to_79','80_to_84',
                                              '90_and_over')) %>%
    group_by(SZ, Region) %>%
    summarise('Elderly'= sum(Pop))

#prep data
ternary_data <- merge(data_children, data_adult,
                      by=c("SZ", "Region"))
ternary_data <- merge(ternary_data, data_elderly,
                      by=c("SZ", "Region"))


ternary_chart <- ggtern(data=ternary_data, aes(x=Children,y=Adult, z=Elderly, color=Region)) +
    geom_point(alpha=0.5) +
    xlab("Children") +
    ylab("Adults") +
    zlab("Elderly") +
    labs(title="Singapore Population Structure, 2020") +
    theme(plot.title=element_text(hjust = 0.5),
          plot.title.position = "plot",
          plot.subtitle=element_text(size=12, margin=margin(b=12))) +
    theme_rgbg()
ternary_chart
Ternery Chart

Ternery Chart

4. Final Visualizations

5. Insights

5.1 General population residence in Singapore 2020

Insight 1

Insight 1

Based on the chart, we can see that generally the central areas are less populated than the outer areas. Jurong East, Woodlands and Bedok are the are the most populated areas.

5.2 Population by Age group and Gender

Insight 2

Insight 2

From the chart, we can see that most of the population are aged 25 to 64. The lower part of the chart seemed to have a trend of descreasing birth rate as the population decreases as age decrease. The top part of the chart showed that there were more females than males in the elderly age group. It could be implied that generally females in Singapore live longer than Males

The top part of the chart showed that more females in Singapore live to older age.

5.3 Population by type of Dwelling and Planning Area

Insight 3

Insight 3

From the heat map, we can see that majority of residents live in HDB 4 and 5 room flats.

Across every planning area, bedok seemed to be the planning area with the most balanced distribution of population living in different types of dwellings.

5.4

Insight 4

Insight 4

From the chart, we can see that the Central region seemed to have the most elderlies and the Northeast seemed to have the most children and young adults.

The population in North is the least compared to other region.