Realis Part 02 Terrasse

Background

The data for this R script comes from the URA Realis web site (www.ura.gov.sg, click on Realis menu) The purpose of the web site
is to provide data and information about the caveats lodged for EVERY property transaction in Singapore.

The URA Realis web site contains a lot of data and we will only look at a small subset for this R script. The files for this
script are as follows:

(1) realis_terrasse_newsub_2008_jan_oct_15.csv

Contains information about ALL apartments in project Terrassee ONLY from January 2008 to current (13 October 2012).

library(psych)
library(ltm)
## Loading required package: MASS
## Loading required package: msm
## Loading required package: mvtnorm
## Loading required package: polycor
## Loading required package: sfsmisc
## Attaching package: 'polycor'
## The following object(s) are masked from 'package:psych':
## 
## polyserial
## Attaching package: 'ltm'
## The following object(s) are masked from 'package:psych':
## 
## factor.scores
library(gclus)
## Loading required package: cluster

#|------------------------------------------------------------------------------------------|
#|                                I N I T I A L I Z A T I O N                               |
#|------------------------------------------------------------------------------------------|
RealisReadDfr <- function(fileStr, partNum=1, 
                          workDirStr="C:/Users/denbrige/100 FxOption/103 FxOptionVerBack/080 Fx Git/R-nonsource")
{
  #---  Assert THREE (3) arguments:                                                   
  #       fileStr:      name of the file (without the suffix "_part_xx" and extension ".csv"
  #       partNum:      number of parts                                               
  #       workDirStr:   working directory                                             

  #---  Check that partNum is valid (between 1 to 99)                                 
  if( as.numeric(partNum) < 1 || as.numeric(partNum) > 99 ) 
    stop("partNum MUST be between 1 AND 99")
  #---  Set working directory                                                         
  setwd(workDirStr)
  #---  Read data from split parts
  #       Append suffix to the fileStr
  #       Read each part and merge them together

  if( as.numeric(partNum) > 1 )
  {
    retDfr <- read.csv( paste( fileStr, "_part01.csv", sep="" ), colClasses = "character" )

    for( id in 2:partNum )
    {
    #---  rbind() function will bind two data frames with the same header together
      partStr <- paste( fileStr, "_part", sprintf("%02d", as.numeric(id)), ".csv", sep="" )
      tmpDfr <- read.csv( partStr, colClasses = "character")
      retDfr <- rbind( retDfr, tmpDfr )
    }
  }
  else
    retDfr <- read.csv( paste( fileStr, ".csv", sep="" ), colClasses = "character" )

  #---  Return a data frame
  return(retDfr)
}

#|------------------------------------------------------------------------------------------|
#|                            I N T E R N A L   F U N C T I O N S                           |
#|------------------------------------------------------------------------------------------|
RealisAddressSplitDfr <- function( inChr )                                             
{                                                                                   
#---  Assert THREE (3) arguments:                                                   
#       inChr:      vector of addresses                                             

#---  Split address into parts      
  subChr <- substring(inChr, regexpr('#', inChr)) 
  lvlChr <- substr(subChr, 2, 3)
  untChr <- substr(subChr, 5, 6)

#---  Return a data frame         
  return(data.frame(level=lvlChr, unit=untChr))
}                                                                                   

#|------------------------------------------------------------------------------------------|
#|                                M A I N   P R O C E D U R E                               |
#|------------------------------------------------------------------------------------------|
#---  Init loading data
rawDfr <- RealisReadDfr("realis_terrasse_newsub_2008_jan_oct_15")
#---  Count of cols of data
ncol(rawDfr)
## [1] 19
#---  Count of rows of data
nrow(rawDfr)
## [1] 380
#---  Names of header
names(rawDfr)
##  [1] "Project.Name"                "Address"                    
##  [3] "No..of.Units"                "Area..sqm."                 
##  [5] "Type.of.Area"                "Transacted.Price...."       
##  [7] "Unit.Price....psm."          "Unit.Price....psf."         
##  [9] "Contract.Date"               "Property.Type"              
## [11] "Tenure"                      "Completion.Date"            
## [13] "Type.of.Sale"                "Purchaser.Address.Indicator"
## [15] "Postal.District"             "Postal.Sector"              
## [17] "Postal.Code"                 "Planning.Region"            
## [19] "Planning.Area"
#---  Peek at data
head(rawDfr)
##   Project.Name                   Address No..of.Units Area..sqm.
## 1     TERRASSE 25 Terrasse Lane  #01-24             1         89
## 2     TERRASSE 35 Terrasse Lane  #03-80             1        105
## 3     TERRASSE 33 Terrasse Lane  #01-71             1         89
## 4     TERRASSE 35 Terrasse Lane  #04-80             1        107
## 5     TERRASSE 35 Terrasse Lane  #02-80             1        114
## 6     TERRASSE 35 Terrasse Lane  #01-81             1         89
##   Type.of.Area Transacted.Price.... Unit.Price....psm. Unit.Price....psf.
## 1       Strata               998690              11221               1042
## 2       Strata              1145000              10905               1013
## 3       Strata               970000              10899               1013
## 4       Strata              1150000              10748                998
## 5       Strata              1150000              10088                937
## 6       Strata               950000              10674                992
##   Contract.Date Property.Type                 Tenure Completion.Date
## 1   02-JUL-2012   Condominium 99 Yrs From 23/08/2010     Uncompleted
## 2   12-JUN-2012   Condominium 99 Yrs From 23/08/2010     Uncompleted
## 3   08-JUN-2012   Condominium 99 Yrs From 23/08/2010     Uncompleted
## 4   06-JUN-2012   Condominium 99 Yrs From 23/08/2010     Uncompleted
## 5   06-JUN-2012   Condominium 99 Yrs From 23/08/2010     Uncompleted
## 6   04-JUN-2012   Condominium 99 Yrs From 23/08/2010     Uncompleted
##   Type.of.Sale Purchaser.Address.Indicator Postal.District Postal.Sector
## 1     New Sale                         HDB              19            54
## 2     New Sale                     Private              19            54
## 3     New Sale                         HDB              19            54
## 4     New Sale                         HDB              19            54
## 5     New Sale                     Private              19            54
## 6     New Sale                         HDB              19            54
##   Postal.Code   Planning.Region Planning.Area
## 1      544776 North East Region       Hougang
## 2      544781 North East Region       Hougang
## 3      544780 North East Region       Hougang
## 4      544781 North East Region       Hougang
## 5      544781 North East Region       Hougang
## 6      544781 North East Region       Hougang

#--- Coerce character into numeric or date
rawDfr[, 4] <- suppressWarnings( as.numeric( rawDfr[, 4] ) )    # Area sqm
rawDfr[, 6] <- suppressWarnings( as.numeric( rawDfr[, 6] ) )    # Transacted Price
rawDfr[, 7] <- suppressWarnings( as.numeric( rawDfr[, 7] ) )    # Unit Price psm
rawDfr[, 8] <- suppressWarnings( as.numeric( rawDfr[, 8] ) )    # Unit Price psf
rawDfr[, 9] <- as.Date(rawDfr[, 9], "%d-%b-%Y")

#--- Split address into parts and merge new columns
rawDfr <- cbind( rawDfr, RealisAddressSplitDfr(rawDfr[, 2]) )

Part ONE (1): What is the median value of a property for project Terrasse?

This part answers a really simple question: What is the overall median price, area and psf?

There are THREE HUNDRED AND EIGHTY (380) units that are available with a median price of $1.105 million and an area of 103
sqm, at a median psf of $993.

#
# |------------------------------------------------------------------------------------------|
# | P A R T O N E P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
# --- Count of rows of data
nrow(rawDfr)
## [1] 380
describe(rawDfr[, 4])
##   var   n  mean   sd median trimmed   mad min max range skew kurtosis   se
## 1   1 380 106.9 41.6    103   103.1 41.51  47 232   185  0.9     0.56 2.13
describe(rawDfr[, 6])
##   var   n    mean     sd  median trimmed    mad    min     max   range
## 1   1 380 1103604 332439 1105800 1088157 355824 584400 1954600 1370200
##   skew kurtosis    se
## 1 0.37    -0.56 17054
describe(rawDfr[, 8])
##   var   n  mean    sd median trimmed   mad min  max range  skew kurtosis
## 1   1 380 988.9 92.44    993   990.9 96.37 709 1204   495 -0.35     0.24
##     se
## 1 4.74
# --- Plot a simple histogram
par(mfrow = c(3, 2), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
hist(rawDfr[, 4], xlab = "Area Sqm", main = "Distribution of Floor Area")
boxplot(rawDfr[, 4], outchar = T, main = "Boxplot of Floor Area", col = "slateblue1")
hist(rawDfr[, 6], xlab = "", main = "Distribution of Purchase Price")
boxplot(rawDfr[, 6], outchar = T, main = "Boxplot of Purchase Price", col = "slateblue1")
hist(rawDfr[, 8], xlab = "Unit Price Psf", main = "Distribution of Unit Price Psf")
boxplot(rawDfr[, 8], outchar = T, main = "Boxplot of Unit Price Psf", col = "slateblue1")

plot of chunk unnamed-chunk-2

# --- Make a bubble plot
par(mfrow = c(1, 1), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
# --- Size the circles
radius <- (rawDfr[, 4]/pi)
symbols(rawDfr[, 8], rawDfr[, 6], circles = radius, inches = 0.3, fg = "white", 
    bg = "lightblue", ylab = "", xlab = "Unit Price Psf", main = "Purchase Price vs Unit Price Psf")
text(rawDfr[, 8], rawDfr[, 6], rawDfr[, 4], cex = 0.6, col = "red")
text(993, 1105000, "MEDIAN", cex = 1.2, col = "red")

plot of chunk unnamed-chunk-3

This part answers a really simple question: Which period had the most transactions since launch?

The busiest period appeared to be at the launch last year, but there were significant activities this year.

# --- Plot a timeline
par(mfrow = c(1, 1), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
tempDfr <- rawDfr
tempDfr <- tempDfr[complete.cases(tempDfr[, 8]), ]
tempDfr <- tempDfr[complete.cases(tempDfr[, 9]), ]
plot(tempDfr[, 6] ~ tempDfr[, 9], xlim = c(as.Date("01-06-11", "%d-%m-%y"), 
    as.Date("30-07-12", "%d-%m-%y")), xlab = "Date", ylab = "", main = "Timeline of Transacted Price")

plot of chunk unnamed-chunk-4

Part TWO (2): Which property can we buy with a good view?

This part answers a really simple question: Which property can we buy with a good view on a high level?

There are ONLY FIVE (5) levels maximum per block. Therefore, we assume a LEVEL FIVE (5) unit for a good view.

There are SEVENTY-EIGHT (78) units that are available with a median price of $1.400 million at a median psf of $891.


#
# |------------------------------------------------------------------------------------------|
# | P A R T T W O P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
subDfr <- rawDfr[rawDfr$level == "05", ]
# --- Count of rows of data
nrow(subDfr)
## [1] 78
describe(subDfr[, 4])
##   var  n mean    sd median trimmed  mad min max range  skew kurtosis   se
## 1   1 78  148 39.43    144   152.7 34.1  50 206   156 -1.23     1.35 4.46
describe(subDfr[, 6])
##   var  n    mean     sd  median trimmed    mad    min     max   range
## 1   1 78 1418265 328798 1400150 1454329 221056 599700 1954600 1354900
##    skew kurtosis    se
## 1 -1.17     1.37 37229
describe(subDfr[, 8])
##   var  n  mean    sd median trimmed   mad min  max range skew kurtosis
## 1   1 78 908.8 77.54  891.5   893.2 34.84 823 1148   325 2.08     3.34
##     se
## 1 8.78

# --- Plot a simple histogram
par(mfrow = c(3, 2), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
hist(subDfr[, 4], xlab = "Area Sqm", main = "Distribution of Floor Area")
boxplot(subDfr[, 4], outchar = T, main = "Boxplot of Floor Area", col = "slateblue1")
hist(subDfr[, 6], xlab = "", main = "Distribution of Purchase Price")
boxplot(subDfr[, 6], outchar = T, main = "Boxplot of Purchase Price", col = "slateblue1")
hist(subDfr[, 8], xlab = "Unit Price Psf", main = "Distribution of Unit Price Psf")
boxplot(subDfr[, 8], outchar = T, main = "Boxplot of Unit Price Psf", col = "slateblue1")

plot of chunk unnamed-chunk-5

# --- Make a bubble plot
par(mfrow = c(1, 1), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
# --- Size the circles
radius <- (subDfr[, 4]/pi)
symbols(subDfr[, 8], subDfr[, 6], circles = radius, inches = 0.3, fg = "white", 
    bg = "lightblue", ylab = "", xlab = "Unit Price Psf", main = "Purchase Price vs Unit Price Psf")
text(subDfr[, 8], subDfr[, 6], subDfr[, 4], cex = 0.6, col = "red")
text(891, 1400000, "MEDIAN", cex = 1.2, col = "red")

plot of chunk unnamed-chunk-6

Part THREE (3): Which property can we buy for an area of similar size?

This part answers a really simple question: Which property can we buy for an area of size between 102 sqm and 106 sqm?

There are THIRTY-SIX(36) units that are available with a median price of $1.149 million at a median psf of $1,019.


#
# |------------------------------------------------------------------------------------------|
# | P A R T T H R E E P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
subDfr <- rawDfr[rawDfr[, 4] > 102, ]
subDfr <- subDfr[subDfr[, 4] < 106, ]
# --- Count of rows of data
nrow(subDfr)
## [1] 36
describe(subDfr[, 4])
##   var  n  mean   sd median trimmed mad min max range  skew kurtosis   se
## 1   1 36 104.1 1.01    105   104.1   0 103 105     2 -0.11    -2.04 0.17
describe(subDfr[, 6])
##   var  n    mean    sd  median trimmed   mad     min     max  range  skew
## 1   1 36 1157541 66802 1149200 1158135 86065 1001800 1260000 258200 -0.07
##   kurtosis    se
## 1    -1.15 11134
describe(subDfr[, 8])
##   var  n mean    sd median trimmed   mad min  max range  skew kurtosis
## 1   1 36 1033 52.32   1019    1034 52.63 886 1115   229 -0.29    -0.31
##     se
## 1 8.72

# --- Plot a simple histogram
par(mfrow = c(2, 2), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
hist(subDfr[, 6], xlab = "", main = "Distribution of Purchase Price")
boxplot(subDfr[, 6], outchar = T, main = "Boxplot of Purchase Price", col = "slateblue1")
hist(subDfr[, 8], xlab = "Unit Price Psf", main = "Distribution of Unit Price Psf")
boxplot(subDfr[, 8], outchar = T, main = "Boxplot of Unit Price Psf", col = "slateblue1")

plot of chunk unnamed-chunk-7

# --- Make a bubble plot
par(mfrow = c(1, 1), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
# --- Size the circles
radius <- subDfr$level
symbols(subDfr[, 8], subDfr[, 6], circles = radius, inches = 0.3, fg = "white", 
    bg = "lightblue", ylab = "", xlab = "Unit Price Psf", main = "Purchase Price vs Unit Price Psf")
text(subDfr[, 8], subDfr[, 6], subDfr$level, cex = 0.6, col = "red")
text(1019, 1149000, "MEDIAN", cex = 1.2, col = "red")

plot of chunk unnamed-chunk-8

Part FOUR (4): Which property can we buy for an amount of money?

This part answers a really simple question: Which property can we buy for between $1.4m and $1.5m?

There are FIFTEEN (15) units that are available with a median area of 144 sqm which we can purchase for a median psf of $916.

#
# |------------------------------------------------------------------------------------------|
# | P A R T F O U R P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
subDfr <- rawDfr[rawDfr[, 6] > 1400000, ]
subDfr <- subDfr[subDfr[, 6] < 1500000, ]
# --- Count of rows of data
nrow(subDfr)
## [1] 15
describe(subDfr[, 4])
##   var  n  mean    sd median trimmed   mad min max range skew kurtosis   se
## 1   1 15 144.1 15.27    144   143.5 16.31 127 169    42 0.56    -1.26 3.94
describe(subDfr[, 6])
##   var  n    mean    sd  median trimmed   mad     min     max range skew
## 1   1 15 1436541 32482 1436300 1434647 36472 1400300 1497400 97100 0.53
##   kurtosis   se
## 1    -1.17 8387
describe(subDfr[, 8])
##   var  n  mean    sd median trimmed   mad min  max range  skew kurtosis
## 1   1 15 933.9 82.58    916   933.8 103.8 813 1056   243 -0.05    -1.47
##      se
## 1 21.32

# --- Plot a simple histogram
par(mfrow = c(3, 2), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
hist(subDfr[, 4], xlab = "Area Sqm", main = "Distribution of Floor Area")
boxplot(subDfr[, 4], outchar = T, main = "Boxplot of Floor Area", col = "slateblue1")
hist(subDfr[, 6], xlab = "", main = "Distribution of Purchase Price")
boxplot(subDfr[, 6], outchar = T, main = "Boxplot of Purchase Price", col = "slateblue1")
hist(subDfr[, 8], xlab = "Unit Price Psf", main = "Distribution of Unit Price Psf")
boxplot(subDfr[, 8], outchar = T, main = "Boxplot of Unit Price Psf", col = "slateblue1")

plot of chunk unnamed-chunk-9

# --- Make a bubble plot
par(mfrow = c(1, 1), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
# --- Size the circles
radius <- (subDfr[, 4]/pi)
symbols(subDfr[, 8], subDfr[, 6], circles = radius, inches = 0.3, fg = "white", 
    bg = "lightblue", ylab = "", xlab = "Unit Price Psf", main = "Purchase Price vs Unit Price Psf")
text(subDfr[, 8], subDfr[, 6], subDfr[, 4], cex = 0.6, col = "red")
text(916, 1436300, "MEDIAN", cex = 1.2, col = "red")

plot of chunk unnamed-chunk-10

Part FIVE (4): What is a good predictor of price for the project?

This part answers a really simple question: Which variable can be used to predict price accurately?

The variable 'area' is an EXCELLENT (94.4%) predictor of price for the project.

Note: the same variable only explains 87.3% of price for Watertown project, with other unknown variables accounting for the rest of price.

#
# |------------------------------------------------------------------------------------------|
# | P A R T F I V E P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
# --- Scatterplot and Correlation Analysis (library gclus and ltm)
# Scatterplot
subDfr <- data.frame(price = rawDfr[, 6], area = rawDfr[, 4], psf = rawDfr[, 
    8])
par(mfrow = c(1, 1), las = 1)
cpairs(subDfr, gap = 0.5, panel.colors = dmat.color(abs(cor(subDfr))), main = "RAW Variables Ordered and Colored by Correlations")

plot of chunk unnamed-chunk-11


# --- Correlation matrix
cor(subDfr)
##         price    area     psf
## price  1.0000  0.9718 -0.7037
## area   0.9718  1.0000 -0.8242
## psf   -0.7037 -0.8242  1.0000

# --- Perform correlation test for matrix (library ltm) Correlation null
# hypothesis is that the correlation is zero (not correlated) If the
# p-value is less than the alpha level, then the null hypothesis is
# rejected Check for correlation p<0.05 is correlated
rcor.test(subDfr)
## 
##       price  area   psf   
## price  *****  0.972 -0.704
## area  <0.001  ***** -0.824
## psf   <0.001 <0.001  *****
## 
## upper diagonal part contains correlation coefficient estimates 
## lower diagonal part contains corresponding p-values

# --- Simple Regression (unstandardized) Y = price; X = area;
raw1Lm <- lm(subDfr$price ~ subDfr$area)
summary(raw1Lm)
## 
## Call:
## lm(formula = subDfr$price ~ subDfr$area)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -303879  -44352   -4812   41550  264047 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 273671.3    11104.6    24.6   <2e-16 ***
## subDfr$area   7765.6       96.8    80.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 78400 on 378 degrees of freedom
## Multiple R-squared: 0.944,   Adjusted R-squared: 0.944 
## F-statistic: 6.43e+03 on 1 and 378 DF,  p-value: <2e-16

#
# |------------------------------------------------------------------------------------------|
# | E N D O F S C R I P T |
# |------------------------------------------------------------------------------------------|