library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
The first subtask will get you familiarised with this dataset through a set of specific things to do, warming you up for the other subtask.
On 6 January 2022, The Straits Times published the article “Record 261 million-dollar HDB flats in 2021; resale prices rise in December as volume dips” (link: https://www.straitstimes.com/ singapore/housing/record-261-million-dollar-hdb-flats-in-2021-resale-prices-rise-in-december-as- volume-dips). This article is one of many similar articles published periodically in Singaporean media about real estate price trends, and so on. Your job is to use the latest available dataset on HDB resale flat prices, and verify the claims in the article.
Using R and the dataset, verify the following claims in the article:
In some cases, you will not get the exact numbers (in fact, I could not replicate all of them exactly on my own, some have a discrepancy that is larger than a few percents). What is important here is the approach you come up with and to understand the structure of this dataset. If you have developed the correct procedure, but get a different results from the article, you will still score full points.
The code and the number/figure you get must be visible in the report you submit.
resale_2017_onwards <- read_csv("resale-flat-prices/resale-flat-prices-based-on-registration-date-from-jan-2017-onwards.csv")
## Rows: 120363 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): month, town, flat_type, block, street_name, storey_range, flat_mode...
## dbl (3): floor_area_sqm, lease_commence_date, resale_price
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Using R and the dataset, verify the following claims in the article:
resale_2021_onwards <- subset(resale_2017_onwards, month >= "2021-01")
resale_2021 <- subset(resale_2021_onwards, month <= "2021-12")
morethan_1m <- subset(resale_2021, resale_price >= 1000000)
nrow(morethan_1m)
## [1] 259
resale_2021_12 <- subset(resale_2021, month == "2021-12")
sum(resale_2021_12$resale_price)/sum(resale_2021_12$floor_area_sqm)
## [1] 5462.628
price_per_sqm_12 = sum(resale_2021_12$resale_price)/sum(resale_2021_12$floor_area_sqm)
resale_2021_11 <- subset(resale_2021, month == "2021-11")
sum(resale_2021_11$resale_price)/sum(resale_2021_11$floor_area_sqm)
## [1] 5393.432
price_per_sqm_11 = sum(resale_2021_11$resale_price)/sum(resale_2021_11$floor_area_sqm)
HDB resale prices rose ? percent in December 2021 from the previous month:
price_per_sqm_12/price_per_sqm_11
## [1] 1.01283
=1.283%
resale_2020_12 <- subset(resale_2017_onwards, month == "2020-12")
sum(resale_2020_12$resale_price)/sum(resale_2020_12$floor_area_sqm)
## [1] 4861.761
price_per_sqm_202012 = sum(resale_2020_12$resale_price)/sum(resale_2020_12$floor_area_sqm)
HDB resale prices in December 2021 were ? percent higher than a year ago:
price_per_sqm_12/price_per_sqm_202012
## [1] 1.12359
=12.359%
nrow(resale_2021_12)
## [1] 2425
subset1 <- subset(resale_2017_onwards, month >= "2020-12")
plotsubset <- subset(subset1, month <= "2021-12")
resale_2020_12 <- subset(plotsubset, month == "2020-12")
resale_2021_01 <- subset(plotsubset, month == "2021-01")
resale_2021_02 <- subset(plotsubset, month == "2021-02")
resale_2021_03 <- subset(plotsubset, month == "2021-03")
resale_2021_04 <- subset(plotsubset, month == "2021-04")
resale_2021_05 <- subset(plotsubset, month == "2021-05")
resale_2021_06 <- subset(plotsubset, month == "2021-06")
resale_2021_07 <- subset(plotsubset, month == "2021-07")
resale_2021_08 <- subset(plotsubset, month == "2021-08")
resale_2021_09 <- subset(plotsubset, month == "2021-09")
resale_2021_10 <- subset(plotsubset, month == "2021-10")
resale_2021_11 <- subset(plotsubset, month == "2021-11")
resale_2021_12 <- subset(plotsubset, month == "2021-12")
nrow(resale_2020_12)
## [1] 2486
nrow(resale_2021_01)
## [1] 2491
nrow(resale_2021_02)
## [1] 2160
nrow(resale_2021_03)
## [1] 2443
nrow(resale_2021_04)
## [1] 2334
nrow(resale_2021_05)
## [1] 1957
nrow(resale_2021_06)
## [1] 2302
nrow(resale_2021_07)
## [1] 2659
nrow(resale_2021_08)
## [1] 2740
nrow(resale_2021_09)
## [1] 2516
nrow(resale_2021_10)
## [1] 2500
nrow(resale_2021_11)
## [1] 2571
nrow(resale_2021_12)
## [1] 2425
(Time <- c("20.12","21.01","21.02","21.03","21.04","21.05","21.06","21.07","21.08","21.09","21.10","21.11","21.12"))
## [1] "20.12" "21.01" "21.02" "21.03" "21.04" "21.05" "21.06" "21.07" "21.08"
## [10] "21.09" "21.10" "21.11" "21.12"
(Transactions1 <- c(2486,2491,2160,2443,2334,1957,2302,2659,2740,2516,2500,2571,2425))
## [1] 2486 2491 2160 2443 2334 1957 2302 2659 2740 2516 2500 2571 2425
plotframe <- data.frame("Time" = Time, "Transactions1" = Transactions1)
p <- ggplot(data = plotframe, mapping=aes(x=Time, y=Transactions1,fill=Transactions1))
p + geom_bar(stat = 'identity', width = 0.7) + geom_text(mapping = aes(label = Transactions1, vjust = -0.7, hjust = 0.5))
labs(x="Time",
y="No. of transactions",
title="HDB resale volume",
subtitle="No. of transactions",
caption="NOTE: Figure for December 2021 is a flash estimate.
Chart: STRAITS TIMES GRAPHICS Source: 99.co and SRX") +
theme(panel.background = element_rect(fill = "white", colour = "grey50"))
## NULL
You must include the description of the data and an exploratory data analysis.
You must give an impression that your analysis gave you a good understanding of this topic and the data.
Your analysis has to result in at least 5 meaningful insights. Examples of such are: “What is the trend of resale prices?”, “Are similar flats sold across Singapore at the same time?”, “Does the storey level have much influence on the price?”, “Is there a difference in the flats sold years ago and today?”, “What is the influence of the lease time? How was it before?”. Describing each of these in one paragraph is sufficient.
You must include at least 5 meaningful plots or tables, to accompany the aforementioned insights/conclusions. The plots that you generated in the exploratory data analysis count towards these 5 plots as long as they are meaningful and look sensible. You can go beyond 5 plots, but I prefer quality over quantity, together with meaningful insights you can deduce.
Tips:
library(tidyverse)
library(lubridate)
##
## 载入程辑包:'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
mean(resale_2021_01$resale_price)
## [1] 486898.9
mean(resale_2021_02$resale_price)
## [1] 493357.6
mean(resale_2021_03$resale_price)
## [1] 492348.4
mean(resale_2021_04$resale_price)
## [1] 501473.8
mean(resale_2021_05$resale_price)
## [1] 511869.2
mean(resale_2021_06$resale_price)
## [1] 511851.9
mean(resale_2021_07$resale_price)
## [1] 510371
mean(resale_2021_08$resale_price)
## [1] 517729.7
mean(resale_2021_09$resale_price)
## [1] 520277.4
mean(resale_2021_10$resale_price)
## [1] 522441.5
mean(resale_2021_11$resale_price)
## [1] 532362
mean(resale_2021_12$resale_price)
## [1] 531439.4
Time2021 <- c("01","02","03","04","05","06","07","08","09","10","11","12")
sg2021resaleprice <- c(486898.9, 493357.6, 492348.4, 501473.8, 511869.2, 511851.9, 510371, 517729.7, 520277.4, 522441.5, 532362, 531439.4)
frame2021resaleprice <- data.frame("Time2021" = Time2021, "sg2021resaleprice" = sg2021resaleprice)
p <- ggplot(data = frame2021resaleprice, mapping=aes(x=Time2021, y=sg2021resaleprice,fill=sg2021resaleprice))
p + geom_bar(stat = 'identity', width = 0.7) + geom_text(mapping = aes(label = sg2021resaleprice, vjust = 0.5, hjust = 1, color="white")) + coord_flip() +
labs(x="Time",
y="Price",
title="HDB Resale Price Trends in Singapore 2021",
subtitle="resale price / sqm",
caption="NOTE: Figure for December 2021 is a flash estimate.
Chart: STRAITS TIMES GRAPHICS Source: 99.co and SRX") +
theme(panel.background = element_rect(fill = "white", colour = "grey50"))
resale_2017 <- subset(resale_2017_onwards,month <= "2017-12")
mean(resale_2017$floor_area_sqm)
## [1] 97.753
mean(resale_2021$floor_area_sqm)
## [1] 98.2373
mean(resale_2021$floor_area_sqm)-mean(resale_2017$floor_area_sqm)
## [1] 0.4843004
mean(resale_2017$resale_price)
## [1] 443888.5
mean(resale_2021$resale_price)
## [1] 511375.2
mean(resale_2021$resale_price)-mean(resale_2017$resale_price)
## [1] 67486.71
mean(resale_2017$lease_commence_date)
## [1] 1992.725
mean(resale_2021$lease_commence_date)
## [1] 1997.327
mean(resale_2021$lease_commence_date)-mean(resale_2017$lease_commence_date)
## [1] 4.601421
The average size of houses transacted in 2017 was 97.753 square meters. The average size of houses transacted in 2021 is 98.2373 square meters. In 2021, the houses traded in 2017 will be 0.4843004 square meters larger on average, indicating that the area of the houses traded is basically unchanged and slightly larger.
The average transaction value of houses transacted in 2017 was S$443,888.5. The average transaction value for houses transacted in 2021 is S$511,375.2. The average transaction value of houses transacted in 2021 is S$67,486.71 more than in 2017, indicating that the average transaction price of houses has increased.
The average rental start date for houses that transacted in 2017 was 1992. The average rental start date for homes transacted in 2021 is 1997. 2021 is 4.6 years later than the average lease start date for houses that transacted in 2017, indicating that more new homes have entered the transaction.
For 2017-:
r <- ggplot(data=resale_2017_onwards, mapping=aes(x=lease_commence_date, y=resale_price, colour=resale_price))
r + geom_count(alpha=0.2) +
labs(x="lease_commence_date", y="resale_price",
title="Relation between the lease commence date and resale price in 2017_onwards",
subtitle="What is the influence of the lease time? ",
colour="resale_price",
caption="Data: HDB, data.gov.sg (https://data.gov.sg/dataset/resale-flat-prices)")+
theme_minimal() + scale_colour_distiller(palette = "Oranges")
resale_1990_1999 <- read_csv("resale-flat-prices/resale-flat-prices-based-on-approval-date-1990-1999.csv")
## Rows: 287196 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): month, town, flat_type, block, street_name, storey_range, flat_model
## dbl (3): floor_area_sqm, lease_commence_date, resale_price
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
For 1990-1999:
r <- ggplot(data=resale_1990_1999, mapping=aes(x=lease_commence_date, y=resale_price, colour=resale_price))
r + geom_count(alpha=0.2) +
labs(x="lease_commence_date", y="resale_price",
title="Relation between the lease commence date and resale price in 1990-1999",
subtitle="What is the influence of the lease time? ",
colour="resale_price",
caption="Data: HDB, data.gov.sg (https://data.gov.sg/dataset/resale-flat-prices)")+
theme_minimal() + scale_colour_distiller(palette = "Oranges")
In 1990-1999. HDBs built after 1990 are generally more expensive than those built before 1990. HDBs built between 1984-1988 participated in the most transactions. Very few HDBs were built in 1991 and hardly ever traded.
resale_2000_2012 <- read_csv("resale-flat-prices/resale-flat-prices-based-on-approval-date-2000-feb-2012.csv")
## Rows: 369651 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): month, town, flat_type, block, street_name, storey_range, flat_model
## dbl (3): floor_area_sqm, lease_commence_date, resale_price
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
For 2000-2012:
r <- ggplot(data=resale_2000_2012, mapping=aes(x=lease_commence_date, y=resale_price, colour=resale_price))
r + geom_count(alpha=0.2) +
labs(x="lease_commence_date", y="resale_price",
title="Relation between the lease commence date and resale price in 2000-2012",
subtitle="What is the influence of the lease time? ",
colour="resale_price",
caption="Data: HDB, data.gov.sg (https://data.gov.sg/dataset/resale-flat-prices)")+
theme_minimal() + scale_colour_distiller(palette = "Oranges")
In 2000-2012. HDBs built after 1996 are generally more expensive than those built before 1996. HDBs built between 1984-1988 participated in the most transactions. Very few HDBs were built in 1991 and hardly ever traded.
resale2021_0103 <- subset (resale_2021, storey_range == "01 TO 03")
mean(resale2021_0103$resale_price)/mean(resale2021_0103$floor_area_sqm)
## [1] 4669.036
resale2021_0406 <- subset (resale_2021, storey_range == "04 TO 06")
mean(resale2021_0406$resale_price)/mean(resale2021_0406$floor_area_sqm)
## [1] 4895.715
resale2021_0709 <- subset (resale_2021, storey_range == "07 TO 09")
mean(resale2021_0709$resale_price)/mean(resale2021_0709$floor_area_sqm)
## [1] 5043.124
resale2021_1012 <- subset (resale_2021, storey_range == "10 TO 12")
mean(resale2021_1012$resale_price)/mean(resale2021_1012$floor_area_sqm)
## [1] 5149.67
resale2021_1315 <- subset (resale_2021, storey_range == "13 TO 15")
mean(resale2021_1315$resale_price)/mean(resale2021_1315$floor_area_sqm)
## [1] 5506.014
resale2021_1618 <- subset (resale_2021, storey_range == "16 TO 18")
mean(resale2021_1618$resale_price)/mean(resale2021_1618$floor_area_sqm)
## [1] 5917.691
resale2021_1921 <- subset (resale_2021, storey_range == "19 TO 21")
mean(resale2021_1921$resale_price)/mean(resale2021_1921$floor_area_sqm)
## [1] 6570.442
resale2021_2224 <- subset (resale_2021, storey_range == "22 TO 24")
mean(resale2021_2224$resale_price)/mean(resale2021_2224$floor_area_sqm)
## [1] 6918.376
resale2021_2527 <- subset (resale_2021, storey_range == "25 TO 27")
mean(resale2021_2527$resale_price)/mean(resale2021_2527$floor_area_sqm)
## [1] 7469.604
resale2021_2830 <- subset (resale_2021, storey_range == "28 TO 30")
mean(resale2021_2830$resale_price)/mean(resale2021_2830$floor_area_sqm)
## [1] 8556.826
resale2021_3133 <- subset (resale_2021, storey_range == "31 TO 33")
mean(resale2021_3133$resale_price)/mean(resale2021_3133$floor_area_sqm)
## [1] 9197.067
resale2021_3436 <- subset (resale_2021, storey_range == "34 TO 36")
mean(resale2021_3436$resale_price)/mean(resale2021_3436$floor_area_sqm)
## [1] 9664.741
resale2021_3739 <- subset (resale_2021, storey_range == "37 TO 39")
mean(resale2021_3739$resale_price)/mean(resale2021_3739$floor_area_sqm)
## [1] 9562.328
resale2021_4042 <- subset (resale_2021, storey_range == "40 TO 42")
mean(resale2021_4042$resale_price)/mean(resale2021_4042$floor_area_sqm)
## [1] 10405.58
resale2021_4345 <- subset (resale_2021, storey_range == "43 TO 45")
mean(resale2021_4345$resale_price)/mean(resale2021_4345$floor_area_sqm)
## [1] 10648.54
resale2021_4648 <- subset (resale_2021, storey_range == "46 TO 48")
mean(resale2021_4648$resale_price)/mean(resale2021_4648$floor_area_sqm)
## [1] 11391.85
resale2021_4951 <- subset (resale_2021, storey_range == "49 TO 51")
mean(resale2021_4951$resale_price)/mean(resale2021_4951$floor_area_sqm)
## [1] 11708.54
storey_range <- c("01 TO 03","04 TO 06","07 TO 09", "10 TO 12", "13 TO 15", "16 TO 18", "19 TO 21", "22 TO 24", "25 TO 27", "28 TO 30", "31 TO 33", "34 TO 36", "37 TO 39", "40 TO 42", "43 TO 45", "46 TO 48", "49 TO 51")
resale_price_sqm <- c(4669.036, 4895.715, 5043.124, 5149.67, 5506.014, 5917.691, 6570.442, 6918.376, 7469.604, 8556.826, 9197.067, 9664.741, 9562.328, 10405.58, 10648.54, 11391.85, 11708.54)
frame_storey <- data.frame("storey_range" = storey_range, "resale_price_sqm" = resale_price_sqm)
p <- ggplot(data = frame_storey, mapping=aes(x=storey_range, y=(resale_price_sqm),fill=resale_price_sqm))
p + geom_bar(stat = 'identity', width = 0.7) + geom_text(mapping = aes(label = resale_price_sqm, vjust = 0.5, hjust = 1, color="white")) + coord_flip() +
labs(x="Storey range",
y="Resale price / sqm",
fill="Resale price",
title="The relationship between storey level and resale price",
subtitle="Resale in 2021",
caption="NOTE: Figure for December 2021 is a flash estimate.
Chart: STRAITS TIMES GRAPHICS Source: 99.co and SRX") +
theme(panel.background = element_rect(fill = "white", colour = "grey50"))
According to the chart I think it is obvious that as the number of layers increases, the price also increases. HDBs storey level low to high, ranging from around S$4,700 per sqm to around S$11,700 per sqm. The height of the floor has a great influence on the price.
p <- ggplot(data = resale_2021, mapping=aes(x=town, y=flat_type,fill=flat_type))
p + geom_bar(stat = 'identity', width = 0.7) + geom_text(mapping = aes(label = storey_range, vjust = 0.5, hjust = 1, color="white")) + coord_flip() +
labs(x="Town",
y="Flat type",
fill="Flat type",
title="The relationship between Town and Flat type",
subtitle="Resale in 2021",
caption="NOTE: Figure for December 2021 is a flash estimate.
Chart: STRAITS TIMES GRAPHICS Source: 99.co and SRX") +
theme(panel.background = element_rect(fill = "white", colour = "grey50"))
p <- ggplot(data = resale_2021, mapping=aes(x=flat_type, y=town,fill=town))
p + geom_bar(stat = 'identity', width = 0.7) + geom_text(mapping = aes(label = storey_range, vjust = 0.5, hjust = 1, color="white")) + coord_flip() +
labs(x="Flat type",
y="Town",
fill="Town",
title="The relationship between Town and Flat type",
subtitle="Resale in 2021",
caption="NOTE: Figure for December 2021 is a flash estimate.
Chart: STRAITS TIMES GRAPHICS Source: 99.co and SRX") +
theme(panel.background = element_rect(fill = "white", colour = "grey50"))
Not all town’s flat types are the same. For example, SENGKANG and PUNGGOL have the most HDB transactions, while MARINE PARADE, CENTRAL AREA and BUKIT TIMAH have the least. In general, it can be seen that the main hdb flat types in Singapore are 4 room, 5 room, 3 room, executive. Other 2 room, 1 room and multi-generation are very few.
p <- ggplot(data = resale_1990_1999, mapping=aes(x=flat_type, y=town,fill=town))
p + geom_bar(stat = 'identity', width = 0.7) + geom_text(mapping = aes(label = storey_range, vjust = 0.5, hjust = 1, color="white")) + coord_flip() +
labs(x="Flat type",
y="Town",
fill="Town",
title="The relationship between Town and Flat type",
subtitle="Resale in 1990-1999",
caption="NOTE: Figure for December 2021 is a flash estimate.
Chart: STRAITS TIMES GRAPHICS Source: 99.co and SRX") +
theme(panel.background = element_rect(fill = "white", colour = "grey50"))
In 1990-1999, the number of 4 room and 3 room was the largest, followed by 5 room and executive.
p <- ggplot(data = resale_2000_2012, mapping=aes(x=flat_type, y=town,fill=town))
p + geom_bar(stat = 'identity', width = 0.7) + geom_text(mapping = aes(label = storey_range, vjust = 0.5, hjust = 1, color="white")) + coord_flip() +
labs(x="Flat type",
y="Town",
fill="Town",
title="The relationship between Town and Flat type",
subtitle="Resale in 2000-2012",
caption="NOTE: Figure for December 2021 is a flash estimate.
Chart: STRAITS TIMES GRAPHICS Source: 99.co and SRX") +
theme(panel.background = element_rect(fill = "white", colour = "grey50"))
In 2000-2012, the number of transactions from most to least is 4 room, 3 room, 5 room and executive.
resale_2012_2014 <- read_csv("resale-flat-prices/resale-flat-prices-based-on-registration-date-from-mar-2012-to-dec-2014.csv")
## Rows: 52203 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): month, town, flat_type, block, street_name, storey_range, flat_model
## dbl (3): floor_area_sqm, lease_commence_date, resale_price
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
p <- ggplot(data = resale_2012_2014, mapping=aes(x=flat_type, y=town,fill=town))
p + geom_bar(stat = 'identity', width = 0.7) + geom_text(mapping = aes(label = storey_range, vjust = 0.5, hjust = 1, color="white")) + coord_flip() +
labs(x="Flat type",
y="Town",
fill="Town",
title="The relationship between Town and Flat type",
subtitle="Resale in 2012-2014",
caption="NOTE: Figure for December 2021 is a flash estimate.
Chart: STRAITS TIMES GRAPHICS Source: 99.co and SRX") +
theme(panel.background = element_rect(fill = "white", colour = "grey50"))
In 2012-2014, the number of transactions from most to least is 4 room, 3 room, 5 room and executive.
resale_2015_2016 <- read_csv("resale-flat-prices/resale-flat-prices-based-on-registration-date-from-jan-2015-to-dec-2016.csv")
## Rows: 37153 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): month, town, flat_type, block, street_name, storey_range, flat_model
## dbl (4): floor_area_sqm, lease_commence_date, remaining_lease, resale_price
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
p <- ggplot(data = resale_2015_2016, mapping=aes(x=flat_type, y=town,fill=town))
p + geom_bar(stat = 'identity', width = 0.7) + geom_text(mapping = aes(label = storey_range, vjust = 0.5, hjust = 1, color="white")) + coord_flip() +
labs(x="Flat type",
y="Town",
fill="Town",
title="The relationship between Town and Flat type",
subtitle="Resale in 2015-2016",
caption="NOTE: Figure for December 2021 is a flash estimate.
Chart: STRAITS TIMES GRAPHICS Source: 99.co and SRX") +
theme(panel.background = element_rect(fill = "white", colour = "grey50"))
In 2015-2016, the number of transactions from most to least is 4 room, 5 room, 3 room and executive.
p <- ggplot(data = resale_2017_onwards, mapping=aes(x=flat_type, y=town,fill=town))
p + geom_bar(stat = 'identity', width = 0.7) + geom_text(mapping = aes(label = storey_range, vjust = 0.5, hjust = 1, color="white")) + coord_flip() +
labs(x="Flat type",
y="Town",
fill="Town",
title="The relationship between Town and Flat type",
subtitle="Resale in 2017-",
caption="NOTE: Figure for December 2021 is a flash estimate.
Chart: STRAITS TIMES GRAPHICS Source: 99.co and SRX") +
theme(panel.background = element_rect(fill = "white", colour = "grey50"))
In 2017 onwards, the number of transactions from most to least is 4 room, 5 room, 3 room and executive.
Therefore, it can be seen that from 1990 to the present, the HDB with 3 rooms has gradually changed from being as popular as 4 rooms, to about the same as 5 rooms, and gradually surpassed by the number of 5 rooms.
Your task is to recreate the plot included below, using the HDB Resale Flat Prices dataset (the same as in the previous task). This type of plot is called the ridgeline plot (you may find it in literature also under the alternative name joyplot), and it is becoming increasingly popular. A ridgeline plot shows the distribution of a numeric value for multiple groups in the data. Ridgelines in R are made with the package ggridges, which has not been part of the class. For that you need to explore the package and figure out how it works. This task requires also a degree of data transformation to get the data in the form needed to accomplish such a plot. In addition, if you come up with something that communicates the same data in a better way — you’re more than welcome to submit that as well, on top of this plot. Please write one paragraph of conclusions/ observations.
You may struggle a bit with some details (e.g. sorting the towns by average price), but that’s normal and part of the learning process. Keep in mind that some of the code you develop in this assignment is yours and can be reused later (e.g. you can use it to visualise this particular plot type in Assignment 2 and the group project if applicable).
library(ggplot2)
library(ggridges)
resale_2021 %>%
group_by(town) %>%
summarise(min = min(resale_price))
ggplot(resale_2021, aes(x = resale_price, y =town, fill = stat(x))) +
geom_density_ridges_gradient(scale = 1.5,
rel_min_height = 0.005,
color="white",
linetype=1,
lwd=0.5) +
scale_fill_viridis_c(name = "Price", option = "C") +
coord_cartesian(clip = "off") +
theme_minimal() +
labs(title = "HDB resale proces in 2021 by neighbourhood",
subtitle = "Neighbourhoods exhibit large differences",
x="Price per square metre (SGD)",
y="Year",
fill="Town",
caption="Data: Housing and Development Board, Singapore, 2022.")
## Picking joint bandwidth of 39200
| 3. FREEFORM TASK |
| The final task is to find a topic and a dataset on your own, and provide an analysis that is of about half of the scope of the one in the task 1b (i.e. resulting in 2-3 insights and about 2-3 plots). |
| This is an open-ended task. Be creative and do something interesting. Use this opportunity to focus on a topic you personally find compelling and want to explore. The data does not have to necessarily be from Singapore. It can be from anywhere. Preferably it should be related to the urban context, but that is not a must (e.g. it can be a dataset about movies, politics, music, economics, food, covid-19, …), as long as you can demonstrate a solid grasp of what we have done so far during the sessions and that you are capable of using the attained transferrable skills to explore new topics on your own. Don’t know where to get interesting data? There are numerous options besides data.gov.sg, e.g. |
| - Type data() in the console, R comes with some datasets that you can use right away. - Check out this collection: https://data.fivethirtyeight.com from the popular FiveThirtyEight news site (it is also available as an R package called fivethirtyeight). - Explore questions and answers on the Open Data StackExchange (https:// opendata.stackexchange.com), which contains some interesting surprises. - If you have lived somewhere else besides Singapore, how about checking the open data portal of the government of that city or country? |
| While you have substantial freedom in this task, don’t do something overly simple. Do something you personally find interesting. Find an interesting dataset that we have not used. Feel free to use this opportunity to try out new types of plots and new packages. |
vaccination<- read_csv("covid-19-vaccination/primary-series-vaccination-take-up-by-age-group.csv")
## Rows: 3771 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): agecat
## dbl (2): first_dose_pcttakeup, full_regimen_pcttakeup
## date (1): vacc_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Analysis of vaccination status of young adults aged 20-39.
vacc20_39 <-
vaccination %>%
filter(agecat %in% c("20-29","30-39")) %>%
mutate(vaccinated = vacc_date + 14) %>%
select(vaccinated, everything())
p <- ggplot(data = vacc20_39, mapping=aes(x=vacc_date, y=first_dose_pcttakeup,fill=first_dose_pcttakeup))
p + geom_bar(stat = 'identity', width = 0.3) +
labs(x="Time_accination_date1",
y="First dose pcttakeup",
fill="first_dose_pcttakeup",
title="The relationship between Time and First dose pcttakeup in age 20-29",
subtitle="2020.12.30 - 2022.02.21",
caption="NOTE: Figure for December 2021 is a flash estimate.
Chart: STRAITS TIMES GRAPHICS Source: 99.co and SRX") +
theme(panel.background = element_rect(fill = "white", colour = "grey50"))
renting_out_of_flats <- read_csv("renting-out-of-flats/renting-out-of-flats.csv")
## Rows: 39711 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): rent_approval_date, town, block, streetname, flat_type
## dbl (1): monthly_rent
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
1.What is the average monthly rent in Climenti and Queenstown?
rent_focus_blocks <- renting_out_of_flats %>%
filter(town %in% c('QUEENSTOWN', 'CLEMENTI') & monthly_rent >= 2000) %>%
arrange(desc(monthly_rent)) %>%
select(streetname, rent_approval_date,town,flat_type,monthly_rent)
rent_focus_blocks %>%
group_by(town) %>%
summarise(average_monthly_rent = mean(monthly_rent))
The average monthly rent in Climenti is 2412.128 and the average monthly rent in Queenstown is 2499.521.
renting_out_of_flats %>%
group_by(town) %>%
summarise(m=mean(monthly_rent)) %>%
ggplot(mapping=aes(x=town, y=m)) +
geom_point() +
coord_flip() +
labs(x="Town",
y="Monthly rent",
fill="first_dose_pcttakeup",
title="Average monthly rent in different towns",
subtitle="2021.02-2022.01",
caption="Source: Renting Out of Flats.
https://data.gov.sg/dataset/renting-out-of-flats")
renting_out_of_flats %>%
group_by(town, flat_type) %>%
summarise(avg = mean(monthly_rent)) %>%
ggplot(mapping=aes(x=reorder(town, -avg), y=avg, colour=flat_type)) +
geom_point() +
coord_flip() +
labs(x="Town",
y="Monthly rent",
fill="first_dose_pcttakeup",
title="Average monthly rent in different towns of different flat types",
subtitle="2021.02-2022.01",
caption="Source: Renting Out of Flats.
https://data.gov.sg/dataset/renting-out-of-flats")
## `summarise()` has grouped output by 'town'. You can override using the
## `.groups` argument.