Realis Part 03 ALL Properties (including EC)

Background

The data for this R script comes from the URA Realis web site (www.ura.gov.sg, click on Realis menu) The purpose of the web site
is to provide data and information about the caveats lodged for EVERY property transaction in Singapore.

The URA Realis web site contains a lot of data and we will only look at a small subset for this R script. The files for this
script are as follows:

(1) realis_realis_all_type_incl_ec_newsub_2012_jul_oct_16.csv

Contains information about ALL properties in Singapore from July 2012 to current (16 October 2012).

library(psych)
library(ltm)

## Loading required package: MASS

## Loading required package: msm

## Loading required package: mvtnorm

## Loading required package: polycor

## Loading required package: sfsmisc

## Attaching package: 'polycor'

## The following object(s) are masked from 'package:psych':
## 
## polyserial

## Attaching package: 'ltm'

## The following object(s) are masked from 'package:psych':
## 
## factor.scores

library(gclus)

## Loading required package: cluster

library(RColorBrewer)
library(wordcloud)

## Loading required package: Rcpp


#|------------------------------------------------------------------------------------------|
#|                                I N I T I A L I Z A T I O N                               |
#|------------------------------------------------------------------------------------------|
RealisReadDfr <- function(fileStr, partNum=1, 
                          workDirStr="C:/Users/denbrige/100 FxOption/103 FxOptionVerBack/080 Fx Git/R-nonsource")
{
  #---  Assert THREE (3) arguments:                                                   
  #       fileStr:      name of the file (without the suffix "_part_xx" and extension ".csv"
  #       partNum:      number of parts                                               
  #       workDirStr:   working directory                                             

  #---  Check that partNum is valid (between 1 to 99)                                 
  if( as.numeric(partNum) < 1 || as.numeric(partNum) > 99 ) 
    stop("partNum MUST be between 1 AND 99")
  #---  Set working directory                                                         
  setwd(workDirStr)
  #---  Read data from split parts
  #       Append suffix to the fileStr
  #       Read each part and merge them together

  if( as.numeric(partNum) > 1 )
  {
    retDfr <- read.csv( paste( fileStr, "_part01.csv", sep="" ), colClasses = "character" )

    for( id in 2:partNum )
    {
    #---  rbind() function will bind two data frames with the same header together
      partStr <- paste( fileStr, "_part", sprintf("%02d", as.numeric(id)), ".csv", sep="" )
      tmpDfr <- read.csv( partStr, colClasses = "character")
      retDfr <- rbind( retDfr, tmpDfr )
    }
  }
  else
    retDfr <- read.csv( paste( fileStr, ".csv", sep="" ), colClasses = "character" )

  #---  Return a data frame
  return(retDfr)
}

#|------------------------------------------------------------------------------------------|
#|                            I N T E R N A L   F U N C T I O N S                           |
#|------------------------------------------------------------------------------------------|
RealisAddressSplitDfr <- function( inChr )                                             
{                                                                                   
#---  Assert THREE (3) arguments:                                                   
#       inChr:      vector of addresses                                             

#---  Split address into parts      
  subChr <- substring(inChr, regexpr('#', inChr)) 
  lvlChr <- substr(subChr, 2, 3)
  untChr <- substr(subChr, 5, 6)

#---  Return a data frame         
  return(data.frame(level=lvlChr, unit=untChr))
}                                                                                   

freqVtr <- function(inDfr, orderVtr) 
{
  #---  Assert 'directory' is a character vector of length 1 indicating the location of the
  #       CSV files.
  #     'threshold' is a numeric vector of length 1 indicating the number of completely
  #       observed observations (on all variables) required to compute the correlation 
  #       between nitrate and sulfate; the default is 0.
  #     Return a numeric vector of correlations.

  #---  Assert create an empty numeric vector
  outVtr <- numeric(0)

  for(ord in orderVtr)
  {
    #---  Append numeric vector
    outVtr <- c(outVtr, inDfr[inDfr$name==ord,2])
  }

  #---  Assert return value is a numeric vector
  return(outVtr)
}

#|------------------------------------------------------------------------------------------|
#|                                M A I N   P R O C E D U R E                               |
#|------------------------------------------------------------------------------------------|
#---  Init loading data
#rawDfr <- RealisReadDfr("realis_terrasse_newsub_2008_jan_oct_15")
#rawDfr <- RealisReadDfr("realis_watertown_2000_oct_2012_oct_13")
rawDfr <- RealisReadDfr("realis_all_type_incl_ec_newsub_2012_jul_oct_16", 4)

#---  Count of cols of data
ncol(rawDfr)

## [1] 19

#---  Count of rows of data
nrow(rawDfr)

## [1] 3819

#---  Names of header
names(rawDfr)

##  [1] "Project.Name"                "Address"                    
##  [3] "No..of.Units"                "Area..sqm."                 
##  [5] "Type.of.Area"                "Transacted.Price...."       
##  [7] "Unit.Price....psm."          "Unit.Price....psf."         
##  [9] "Contract.Date"               "Property.Type"              
## [11] "Tenure"                      "Completion.Date"            
## [13] "Type.of.Sale"                "Purchaser.Address.Indicator"
## [15] "Postal.District"             "Postal.Sector"              
## [17] "Postal.Code"                 "Planning.Region"            
## [19] "Planning.Area"

#---  Peek at data
head(rawDfr)

##          Project.Name                    Address No..of.Units Area..sqm.
## 1    LEEDON RESIDENCE  6 Leedon Heights  #03-05             1        248
## 2  BARTLEY RESIDENCES 1A Lorong How Sun  #02-05             1         85
## 3 THE WHARF RESIDENCE 11 Tong Watt Road  #02-06             1        124
## 4        ASCENTIA SKY  1 Alexandra View  #29-04             1         89
## 5         THE HILLOFT       131C Jalan Dermawan             1        457
## 6             NOVA 88      8 Bhamo Road  #07-04             1         85
##   Type.of.Area Transacted.Price.... Unit.Price....psm. Unit.Price....psf.
## 1       Strata              5854338              23606               2193
## 2       Strata              1131000              13306               1236
## 3       Strata              2269500              18302               1700
## 4       Strata              1600000              17978               1670
## 5       Strata              2700000               5908                549
## 6       Strata              1210000              14235               1322
##   Contract.Date       Property.Type                  Tenure
## 1   02-OCT-2012         Condominium                Freehold
## 2   01-OCT-2012           Apartment  99 Yrs From 29/06/2011
## 3   01-OCT-2012         Condominium 999 Yrs From 01/07/1841
## 4   01-OCT-2012         Condominium  99 Yrs From 26/03/2008
## 5   01-OCT-2012 Semi-Detached House 999 Yrs From 12/10/1885
## 6   28-SEP-2012           Apartment                Freehold
##   Completion.Date Type.of.Sale Purchaser.Address.Indicator Postal.District
## 1     Uncompleted     New Sale                     Private              10
## 2     Uncompleted     New Sale                     Private              19
## 3            2012     Sub Sale                         HDB              09
## 4     Uncompleted     Sub Sale                     Private              03
## 5            2011     Sub Sale                     Private              23
## 6            2012     Sub Sale                     Private              12
##   Postal.Sector Postal.Code   Planning.Region Planning.Area
## 1            26      266215    Central Region   Bukit Timah
## 2            53      536559 North East Region     Serangoon
## 3            23      238008    Central Region  River Valley
## 4            15      158748    Central Region   Bukit Merah
## 5            66      669117       West Region   Bukit Batok
## 6            32      329640    Central Region        Novena


#--- Coerce character into numeric or date
rawDfr[, 4] <- suppressWarnings( as.numeric( rawDfr[, 4] ) )    # Area sqm
rawDfr[, 6] <- suppressWarnings( as.numeric( rawDfr[, 6] ) )    # Transacted Price
rawDfr[, 7] <- suppressWarnings( as.numeric( rawDfr[, 7] ) )    # Unit Price psm
rawDfr[, 8] <- suppressWarnings( as.numeric( rawDfr[, 8] ) )    # Unit Price psf
rawDfr[, 9] <- as.Date(rawDfr[, 9], "%d-%b-%Y")

#--- Split address into parts and merge new columns
rawDfr <- cbind( rawDfr, RealisAddressSplitDfr(rawDfr[, 2]) )

Part ONE (1)A: What is the median value of a property for Singapore?

This part answers a really simple question: What is the overall median price, area and psf?

There were THREE THOUSAND EIGHT HUNDRED AND NINETEEN (3,819) units that were sold with a median price of $0.98 million and an area

of NINETY THREE (93) sqm, at a median psf of ONE THOUSAND $1,000.

#
# |------------------------------------------------------------------------------------------|
# | P A R T O N E P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
# --- Count of rows of data
nrow(rawDfr)

## [1] 3819

describe(rawDfr[, 4])

##   var    n  mean    sd median trimmed   mad min max range skew kurtosis
## 1   1 3819 101.8 58.71     93   93.67 32.62  31 734   703 3.58    20.97
##     se
## 1 0.95

describe(rawDfr[, 6])

##   var    n    mean      sd median trimmed    mad    min      max    range
## 1   1 3819 1245225 1188097 980000 1053722 366202 437080 30400000 29962920
##    skew kurtosis    se
## 1 11.02      200 19225

describe(rawDfr[, 8])

##   var    n mean    sd median trimmed   mad min  max range skew kurtosis
## 1   1 3819 1164 473.9   1000    1091 330.6 482 4920  4438  1.9     6.02
##     se
## 1 7.67

# --- Plot a simple histogram
par(mfrow = c(3, 2), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
hist(rawDfr[, 4], xlab = "Area Sqm", main = "Distribution of Floor Area")
boxplot(rawDfr[, 4], outchar = T, main = "Boxplot of Floor Area", col = "slateblue1")
hist(rawDfr[, 6], xlab = "", main = "Distribution of Purchase Price")
boxplot(rawDfr[, 6], outchar = T, main = "Boxplot of Purchase Price", col = "slateblue1")
hist(rawDfr[, 8], xlab = "Unit Price Psf", main = "Distribution of Unit Price Psf")
boxplot(rawDfr[, 8], outchar = T, main = "Boxplot of Unit Price Psf", col = "slateblue1")

plot of chunk unnamed-chunk-2

# --- Make a bubble plot
par(mfrow = c(1, 1), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
# --- Size the circles
radius <- (rawDfr[, 4]/pi)
pal <- colorRampPalette(brewer.pal(3, "Blues"))
symbols(rawDfr[, 8], rawDfr[, 6], circles = radius, inches = 0.3, fg = "white", 
    bg = pal(10), ylab = "", xlab = "Unit Price Psf", main = "Purchase Price vs Unit Price Psf")
# text( rawDfr[, 8], rawDfr[, 6], rawDfr[, 4], cex=0.6, col='red' )
text(1000, 980000, "MEDIAN", cex = 1.2, col = "red")

plot of chunk unnamed-chunk-3

This part answers a really simple question: Which period had the most transactions since launch?

# --- Plot a timeline
par(mfrow = c(1, 1), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
tempDfr <- rawDfr
tempDfr <- tempDfr[complete.cases(tempDfr[, 8]), ]
tempDfr <- tempDfr[complete.cases(tempDfr[, 9]), ]
plot(tempDfr[, 6] ~ tempDfr[, 9], col = rgb(red = 0, green = 0, blue = 0, alpha = 0.12), 
    xlim = c(as.Date("01-07-12", "%d-%m-%y"), as.Date("30-09-12", "%d-%m-%y")), 
    xlab = "Date", ylab = "", main = "Timeline of Transacted Price")

plot of chunk unnamed-chunk-4

Part ONE (1)B: Which is the most popular project in Singapore?

This part answers a really simple question: Which project has sold the MOST units in Singapore?

The project that sold the MOST units in Singapore is Parc Centros with THREE HUNDRED AND EIGHTY NINE (389) sales.

#
# |------------------------------------------------------------------------------------------|
# | P A R T C O U N T P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
# --- Plot a wordcloud
if (length(unique(rawDfr$Project.Name)) > 6) {
    nameDfr <- rawDfr$Project.Name

    par(mfrow = c(1, 1), mar = c(2.1, 2.1, 2.1, 2.1))
    wordcloud(gsub(" ", ".", rawDfr$Project.Name), colors = brewer.pal(6, "Dark2"), 
        random.order = FALSE)
}

## Loading required package: tm

plot of chunk unnamed-chunk-5

Part ONE (1)C: Which project is the MOST expensive in Singapore?

This part answers a really simple question: Which project has the highest MEDIAN psf in Singapore?

The MOST expensive project in Singapore is 1919, with a median psf of about TWO THOUSAND $2,000.


if (length(unique(rawDfr$Project.Name)) > 6) {
    # --- Count of freq by state
    table(nameDfr)
    # --- Create a data frame of freq by state Remove row.names
    tableDfr <- data.frame(name = names(tapply(nameDfr, nameDfr, length)), freq = tapply(nameDfr, 
        nameDfr, length))
    rownames(tableDfr) <- NULL

    # --- Create a subset
    outcome2Dfr <- rawDfr[rawDfr$Project.Name %in% subset(tableDfr$name, tableDfr$freq >= 
        20), ]
    # --- Count of rows of data
    nrow(outcome2Dfr)
    # --- Count of freq by state
    table(outcome2Dfr$Project.Name)

    # --- Plot a simple boxplot of the THIRTY(30)-day death rates by state (a)
    # Add a label to the y-axis '30-day Death Rate' (b) Add a title for the
    # histogram 'Heart Attack 30-day Death Rate by State' (c) Set the x- and
    # y-axis tick labels to be perpendicular to the axes (see las) (d) Order
    # the states by the MEDIAN THIRTY(30)-day death rate and plot the boxplot.
    # (e) Shrink the x-axis tick labels so that the abbreviated state names do
    # NOT overlap EACH other.  (f) Alter the x-axis tick labels so that they
    # include the number of hospitals in that state in parentheses. For
    # example, the label for the state of Connecticut would be CT (32). You
    # will need the axis() function and when you call the boxplot() function
    # you will want to set the option xaxt to be 'n'.
    psf <- outcome2Dfr[, 8]
    name <- reorder(outcome2Dfr$Project.Name, outcome2Dfr[, 8], median, na.rm = TRUE)
    orderVtr <- levels(name["scores"])
    countVtr <- freqVtr(tableDfr, orderVtr)

    par(mfrow = c(1, 1), las = 2, mar = c(15.1, 5.1, 4.1, 2.1))
    boxplot(psf ~ name, ylab = "Unit Price psf", xaxt = "n", main = "Unit Price Psf by Project")
    axis(1, at = seq_along(orderVtr), cex.axis = 0.8, labels = eval(substitute(paste(st, 
        " (", n, ")", sep = ""), list(st = orderVtr, n = countVtr))))
}

plot of chunk unnamed-chunk-6

Part FOUR (4): Which property can we buy for an amount of money?

This part answers a really simple question: Which property can we buy for between $1.4m and $1.5m?

There are ONE HUNDRED AND TEN (110) units that are available with a median area of 108 sqm which we can purchase for a median psf of $1,236.

#
# |------------------------------------------------------------------------------------------|
# | P A R T F O U R P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
subDfr <- rawDfr[rawDfr[, 6] > 1400000, ]
subDfr <- subDfr[subDfr[, 6] < 1500000, ]
# --- Count of rows of data
nrow(subDfr)

## [1] 110

describe(subDfr[, 4])

##   var   n  mean    sd median trimmed   mad min max range skew kurtosis  se
## 1   1 110 110.7 37.73    108   108.5 35.58  45 212   167  0.5    -0.16 3.6

describe(subDfr[, 6])

##   var   n    mean    sd  median trimmed   mad     min     max range skew
## 1   1 110 1444664 23803 1447350 1444028 27618 1400730 1490400 89670 0.17
##   kurtosis   se
## 1    -0.93 2270

describe(subDfr[, 8])

##   var   n mean    sd median trimmed   mad min  max range skew kurtosis
## 1   1 110 1373 519.4   1236    1317 401.8 628 2948  2320    1     0.36
##      se
## 1 49.52


# --- Plot a simple histogram
par(mfrow = c(3, 2), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
hist(subDfr[, 4], xlab = "Area Sqm", main = "Distribution of Floor Area")
boxplot(subDfr[, 4], outchar = T, main = "Boxplot of Floor Area", col = "slateblue1")
hist(subDfr[, 6], xlab = "", main = "Distribution of Purchase Price")
boxplot(subDfr[, 6], outchar = T, main = "Boxplot of Purchase Price", col = "slateblue1")
hist(subDfr[, 8], xlab = "Unit Price Psf", main = "Distribution of Unit Price Psf")
boxplot(subDfr[, 8], outchar = T, main = "Boxplot of Unit Price Psf", col = "slateblue1")

plot of chunk unnamed-chunk-7

# --- Make a bubble plot
par(mfrow = c(1, 1), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
# --- Size the circles
radius <- (subDfr[, 4]/pi)
symbols(subDfr[, 8], subDfr[, 6], circles = radius, inches = 0.3, fg = "white", 
    bg = pal(5), ylab = "", xlab = "Unit Price Psf", main = "Purchase Price vs Unit Price Psf")
text(subDfr[, 8], subDfr[, 6], subDfr[, 4], cex = 0.6, col = "red")
text(1236, 1447350, "MEDIAN", cex = 1.2, col = "red")

plot of chunk unnamed-chunk-8

Part FIVE (5): What is a good predictor of price for the project?

This part answers a really simple question: Which variable can be used to predict price accurately?

The variable 'area' is a BELOW AVERAGE (44.3%) predictor of price for a property in Singapore.

Note: the same variable explains 87.3% of price for Watertown project, with other unknown variables accounting for the rest of price.

#
# |------------------------------------------------------------------------------------------|
# | P A R T F I V E P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
# --- Scatterplot and Correlation Analysis (library gclus and ltm)
# Scatterplot
subDfr <- data.frame(price = rawDfr[, 6], area = rawDfr[, 4], psf = rawDfr[, 
    8])
par(mfrow = c(1, 1), las = 1)
cpairs(subDfr, gap = 0.5, panel.colors = dmat.color(abs(cor(subDfr))), col = rgb(0, 
    0, 0, 0.1), main = "RAW Variables Ordered and Colored by Correlations")

plot of chunk unnamed-chunk-9


# --- Correlation matrix
cor(subDfr)

##        price    area     psf
## price 1.0000  0.6653  0.4999
## area  0.6653  1.0000 -0.1036
## psf   0.4999 -0.1036  1.0000


# --- Perform correlation test for matrix (library ltm) Correlation null
# hypothesis is that the correlation is zero (not correlated) If the
# p-value is less than the alpha level, then the null hypothesis is
# rejected Check for correlation p<0.05 is correlated
rcor.test(subDfr)

## 
##       price  area   psf   
## price  *****  0.665  0.500
## area  <0.001  ***** -0.104
## psf   <0.001 <0.001  *****
## 
## upper diagonal part contains correlation coefficient estimates 
## lower diagonal part contains corresponding p-values


# --- Simple Regression (unstandardized) Y = price; X = area;
raw1Lm <- lm(subDfr$price ~ subDfr$area)
summary(raw1Lm)

## 
## Call:
## lm(formula = subDfr$price ~ subDfr$area)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3457089  -320088   -98045   193253 22636043 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -126001      28745   -4.38  1.2e-05 ***
## subDfr$area    13464        244   55.06  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 887000 on 3817 degrees of freedom
## Multiple R-squared: 0.443,   Adjusted R-squared: 0.443 
## F-statistic: 3.03e+03 on 1 and 3817 DF,  p-value: <2e-16


#
# |------------------------------------------------------------------------------------------|
# | E N D O F S C R I P T |
# |------------------------------------------------------------------------------------------|