Data source: https://data.gov.sg/dataset/resale-flat-prices
Data time range: Jan 2015 - Nov 2017
Libraries used:
library(dplyr)
Here are the variables in the data loaded in.
## [1] "month" "town" "flat_type"
## [4] "block" "street_name" "storey_range"
## [7] "floor_area_sqm" "flat_model" "lease_commence_date"
## [10] "remaining_lease" "resale_price"
Data is read with stringsAsFactors = TRUE by default. Variables that should not be factor type are changed back to character type.
data[['block']] = sapply(data[['block']], as.character)
data[['street_name']] = sapply(data[['street_name']], as.character)
Date comes in “YYYY-mm” format in factor type.
str(data[['month']])
## Factor w/ 35 levels "2015-01","2015-02",..: 1 1 1 1 1 1 1 1 1 1 ...
The following code convert it to a “YYYY-mm-dd” in character type and date type. Day is treated to be “01” but it shall bear no significance.
data[['date_char']] = sapply(data[['month']], as.character)
data[['date_char']] = paste(data[['date_char']], "-01",sep="")
data['date']=as.Date(data[['date_char']],"%Y-%m-%d")
dates = data.frame(matrix(unlist(strsplit(data[['date_char']],'-')),nrow = dim(data)[1], byrow=TRUE),stringsAsFactors = TRUE)
colnames(dates) = c('year','month','day')
data = cbind(dates[,-3], data[,-1])
remove(dates)
str(data[['date_char']])
## chr [1:55979] "2015-01-01" "2015-01-01" "2015-01-01" ...
str(data[['date']])
## Date[1:55979], format: "2015-01-01" "2015-01-01" "2015-01-01" "2015-01-01" ...
Create new variables.
data['floor_area_sqft'] = data['floor_area_sqm']*10.7639
data['age']=as.integer(format(Sys.Date(),'%Y'))-data['lease_commence_date']
data['total_lease']=data['age']+data['remaining_lease']
data ['price_per_sqft'] = data['resale_price']/data['floor_area_sqft']
Let’s map the 26 towns to 5 main regions per HDB’s categorization.
http://www.hdb.gov.sg/cs/infoweb/about-us/history/hdb-towns-your-home
## town region
## 1 ANG MO KIO NORTH-EAST
## 2 BEDOK EAST
## 3 BISHAN CENTRAL
## 4 BUKIT BATOK WEST
## 5 BUKIT MERAH CENTRAL
## 6 BUKIT PANJANG WEST
## 7 BUKIT TIMAH CENTRAL
## 8 CENTRAL AREA CENTRAL
## 9 CHOA CHU KANG WEST
## 10 CLEMENTI WEST
## 11 GEYLANG CENTRAL
## 12 HOUGANG NORTH-EAST
## 13 JURONG EAST WEST
## 14 JURONG WEST WEST
## 15 KALLANG/WHAMPOA CENTRAL
## 16 MARINE PARADE CENTRAL
## 17 PASIR RIS EAST
## 18 PUNGGOL NORTH-EAST
## 19 QUEENSTOWN CENTRAL
## 20 SEMBAWANG NORTH
## 21 SENGKANG NORTH-EAST
## 22 SERANGOON NORTH-EAST
## 23 TAMPINES EAST
## 24 TOA PAYOH CENTRAL
## 25 WOODLANDS NORTH
## 26 YISHUN NORTH
data = full_join(data,regions,by='town')
Bin continuous data.
year_break = seq(as.integer(min(data$lease_commence_date)/10)*10,as.integer(max(data$lease_commence_date)/10)*10+10,10)
year_bin = cut(data[['lease_commence_date']],year_break)
year_bin_names = paste(as.character(year_break[-length(year_break)]),'s',sep='')
levels(year_bin)=year_bin_names
data['lease_commence_year_bin'] = year_bin
remove(year_break, year_bin, year_bin_names)
remaining_lease_break = seq(as.integer(min(data$remaining_lease)/10)*10,as.integer(max(data$remaining_lease)/10)*10+10,10)
remaining_lease_bin = cut(data[['remaining_lease']],remaining_lease_break)
data['remaining_lease_bin'] = remaining_lease_bin
remove(remaining_lease_break, remaining_lease_bin)
age_break = seq(as.integer(min(data$age)/10)*10,as.integer(max(data$age)/10)*10+10,10)
age_bin = cut(data[['age']],age_break)
data['age_bin'] = age_bin
remove(age_break, age_bin)
sqft_break = c(250,500,750,1000,1250,1500, max(data[['floor_area_sqft']]))
sqft_bin = cut(data[['floor_area_sqft']],sqft_break)
sqft_bin_names = paste(as.character(sqft_break),as.character(sqft_break+250), sep="-")
sqft_bin_names = c(sqft_bin_names[1:(length(sqft_bin_names)-2)],">1500")
levels(sqft_bin)=sqft_bin_names
data['sqft_bin'] = sqft_bin
remove(sqft_break, sqft_bin,sqft_bin_names)
lower = as.integer(min(data$resale_price)/100000)*100000
higher = as.integer(max(data$resale_price)/100000)*100000 + 100000
resale_price_break = seq(lower,higher,100000)
resale_price_bin = cut(data[['resale_price']],resale_price_break)
resale_price_bin_names = paste(as.character(resale_price_break/1000),'-',as.character(resale_price_break/1000+100),'k',sep='')
levels(resale_price_bin) = resale_price_bin_names[-length(resale_price_bin_names)]
data['resale_price_bin'] = resale_price_bin
lower = as.integer(min(data$price_per_sqft)/100)*100
higher = as.integer(max(data$price_per_sqft)/100)*100 + 100
price_psf_break = seq(lower,higher,100)
price_psf_bin = cut(data[['price_per_sqft']],price_psf_break)
data['price_psf_bin'] = price_psf_bin
Let’s take a look at the final data.
str(data)
## 'data.frame': 55979 obs. of 25 variables:
## $ year : Factor w/ 3 levels "2015","2016",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ month : Factor w/ 12 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ town : Factor w/ 26 levels "ANG MO KIO","BEDOK",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ flat_type : Factor w/ 7 levels "1 ROOM","2 ROOM",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ block : chr "174" "541" "163" "446" ...
## $ street_name : chr "ANG MO KIO AVE 4" "ANG MO KIO AVE 10" "ANG MO KIO AVE 4" "ANG MO KIO AVE 10" ...
## $ storey_range : Factor w/ 17 levels "01 TO 03","04 TO 06",..: 3 1 1 1 3 3 1 1 1 5 ...
## $ floor_area_sqm : num 60 68 69 68 68 67 68 68 67 68 ...
## $ flat_model : Factor w/ 20 levels "2-room","Adjoined flat",..: 5 12 12 12 12 12 12 12 12 12 ...
## $ lease_commence_date : int 1986 1981 1980 1979 1980 1980 1980 1981 1978 1985 ...
## $ remaining_lease : int 70 65 64 63 64 64 64 65 62 69 ...
## $ resale_price : num 255000 275000 285000 290000 290000 ...
## $ date_char : chr "2015-01-01" "2015-01-01" "2015-01-01" "2015-01-01" ...
## $ date : Date, format: "2015-01-01" "2015-01-01" ...
## $ floor_area_sqft : num 646 732 743 732 732 ...
## $ age : int 32 37 38 39 38 38 38 37 40 33 ...
## $ total_lease : int 102 102 102 102 102 102 102 102 102 102 ...
## $ price_per_sqft : num 395 376 384 396 396 ...
## $ region : Factor w/ 5 levels "CENTRAL","EAST",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ lease_commence_year_bin: Factor w/ 6 levels "1960s","1970s",..: 3 3 2 2 2 2 2 3 2 3 ...
## $ remaining_lease_bin : Factor w/ 6 levels "(40,50]","(50,60]",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ age_bin : Factor w/ 6 levels "(0,10]","(10,20]",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ sqft_bin : Factor w/ 6 levels "250-500","500-750",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ resale_price_bin : Factor w/ 11 levels "100-200k","200-300k",..: 2 2 2 2 2 2 2 2 2 3 ...
## $ price_psf_bin : Factor w/ 8 levels "(200,300]","(300,400]",..: 2 2 2 2 2 3 2 3 3 3 ...
summary(data)
## year month town flat_type
## 2015:17780 05 : 5359 JURONG WEST: 4387 1 ROOM : 24
## 2016:19374 04 : 5277 WOODLANDS : 4029 2 ROOM : 553
## 2017:18825 08 : 5277 SENGKANG : 4008 3 ROOM :14716
## 06 : 5271 TAMPINES : 3848 4 ROOM :23066
## 10 : 5208 BEDOK : 3498 5 ROOM :13291
## 11 : 5034 YISHUN : 3318 EXECUTIVE : 4321
## (Other):24553 (Other) :32891 MULTI-GENERATION: 8
## block street_name storey_range floor_area_sqm
## Length:55979 Length:55979 04 TO 06:13332 Min. : 31.0
## Class :character Class :character 07 TO 09:12290 1st Qu.: 75.0
## Mode :character Mode :character 10 TO 12:10673 Median : 96.0
## 01 TO 03:10341 Mean : 97.2
## 13 TO 15: 4934 3rd Qu.:111.0
## 16 TO 18: 2036 Max. :280.0
## (Other) : 2373
## flat_model lease_commence_date remaining_lease
## Model A :16862 Min. :1966 Min. :48
## Improved :14148 1st Qu.:1984 1st Qu.:66
## New Generation : 9107 Median :1989 Median :72
## Premium Apartment: 5751 Mean :1991 Mean :74
## Simplified : 2787 3rd Qu.:2000 3rd Qu.:83
## Apartment : 2191 Max. :2015 Max. :97
## (Other) : 5133
## resale_price date_char date
## Min. : 175000 Length:55979 Min. :2015-01-01
## 1st Qu.: 340000 Class :character 1st Qu.:2015-10-01
## Median : 408888 Mode :character Median :2016-07-01
## Mean : 438932 Mean :2016-06-23
## 3rd Qu.: 500000 3rd Qu.:2017-04-01
## Max. :1180000 Max. :2017-11-01
##
## floor_area_sqft age total_lease price_per_sqft
## Min. : 333.7 Min. : 3.00 Min. : 99.0 Min. :242.7
## 1st Qu.: 807.3 1st Qu.:18.00 1st Qu.:100.0 1st Qu.:357.3
## Median :1033.3 Median :29.00 Median :100.0 Median :399.2
## Mean :1046.3 Mean :26.51 Mean :100.5 Mean :424.1
## 3rd Qu.:1194.8 3rd Qu.:34.00 3rd Qu.:101.0 3rd Qu.:457.4
## Max. :3013.9 Max. :52.00 Max. :103.0 Max. :989.0
##
## region lease_commence_year_bin remaining_lease_bin
## CENTRAL :10858 1960s: 1289 (40,50] : 456
## EAST : 9108 1970s: 9369 (50,60] : 6168
## NORTH : 8842 1980s:18119 (60,70] :18536
## NORTH-EAST:13490 1990s:13288 (70,80] :12190
## WEST :13681 2000s: 9946 (80,90] :13566
## 2010s: 3968 (90,100]: 5063
##
## age_bin sqft_bin resale_price_bin price_psf_bin
## (0,10] : 5293 250-500 : 394 300-400k:19465 (300,400]:26955
## (10,20]:14182 500-750 :10404 400-500k:15502 (400,500]:18553
## (20,30]:12428 750-1000 :13876 200-300k: 7252 (500,600]: 5296
## (30,40]:18503 1000-1250:19211 500-600k: 6506 (600,700]: 2058
## (40,50]: 5215 1250-1500: 8688 600-700k: 3746 (700,800]: 1327
## (50,60]: 358 >1500 : 3406 700-800k: 2002 (200,300]: 1310
## (Other) : 1506 (Other) : 480
Wow, no missing values in the factor and numeric variables. Let’s check for missing values in the character variables.
sum(is.na(data$block))
## [1] 0
sum(is.na(data$street_name))
## [1] 0
The data is very clean! Thank you data.gov.sg!!!
Now save the cleaned data for use later.
write.csv(data,'cleaned_data.csv',row.names=FALSE)
classes = sapply(data,class)
write.csv(classes, 'cleaned_data_classes.csv')
save(data,file = 'cleaned_data')
End of part 1