The data for this R script comes from the URA Realis web site (www.ura.gov.sg, click on Realis menu) The purpose of the web site
is to provide data and information about the caveats lodged for EVERY property transaction in Singapore.
The URA Realis web site contains a lot of data and we will only look at a small subset for this R script. The files for this
script are as follows:
(1) realis_realis_all_type_incl_ec_newsub_2012_jul_oct_16.csv
Contains information about ALL properties in Singapore from July 2012 to current (16 October 2012).
library(psych)
library(ltm)
## Loading required package: MASS
## Loading required package: msm
## Loading required package: mvtnorm
## Loading required package: polycor
## Loading required package: sfsmisc
## Attaching package: 'polycor'
## The following object(s) are masked from 'package:psych':
##
## polyserial
## Attaching package: 'ltm'
## The following object(s) are masked from 'package:psych':
##
## factor.scores
library(gclus)
## Loading required package: cluster
library(RColorBrewer)
library(wordcloud)
## Loading required package: Rcpp
#|------------------------------------------------------------------------------------------|
#| I N I T I A L I Z A T I O N |
#|------------------------------------------------------------------------------------------|
RealisReadDfr <- function(fileStr, partNum=1,
workDirStr="C:/Users/denbrige/100 FxOption/103 FxOptionVerBack/080 Fx Git/R-nonsource")
{
#--- Assert THREE (3) arguments:
# fileStr: name of the file (without the suffix "_part_xx" and extension ".csv"
# partNum: number of parts
# workDirStr: working directory
#--- Check that partNum is valid (between 1 to 99)
if( as.numeric(partNum) < 1 || as.numeric(partNum) > 99 )
stop("partNum MUST be between 1 AND 99")
#--- Set working directory
setwd(workDirStr)
#--- Read data from split parts
# Append suffix to the fileStr
# Read each part and merge them together
if( as.numeric(partNum) > 1 )
{
retDfr <- read.csv( paste( fileStr, "_part01.csv", sep="" ), colClasses = "character" )
for( id in 2:partNum )
{
#--- rbind() function will bind two data frames with the same header together
partStr <- paste( fileStr, "_part", sprintf("%02d", as.numeric(id)), ".csv", sep="" )
tmpDfr <- read.csv( partStr, colClasses = "character")
retDfr <- rbind( retDfr, tmpDfr )
}
}
else
retDfr <- read.csv( paste( fileStr, ".csv", sep="" ), colClasses = "character" )
#--- Return a data frame
return(retDfr)
}
#|------------------------------------------------------------------------------------------|
#| I N T E R N A L F U N C T I O N S |
#|------------------------------------------------------------------------------------------|
RealisAddressSplitDfr <- function( inChr )
{
#--- Assert THREE (3) arguments:
# inChr: vector of addresses
#--- Split address into parts
subChr <- substring(inChr, regexpr('#', inChr))
lvlChr <- substr(subChr, 2, 3)
untChr <- substr(subChr, 5, 6)
#--- Return a data frame
return(data.frame(level=lvlChr, unit=untChr))
}
freqVtr <- function(inDfr, orderVtr)
{
#--- Assert 'directory' is a character vector of length 1 indicating the location of the
# CSV files.
# 'threshold' is a numeric vector of length 1 indicating the number of completely
# observed observations (on all variables) required to compute the correlation
# between nitrate and sulfate; the default is 0.
# Return a numeric vector of correlations.
#--- Assert create an empty numeric vector
outVtr <- numeric(0)
for(ord in orderVtr)
{
#--- Append numeric vector
outVtr <- c(outVtr, inDfr[inDfr$name==ord,2])
}
#--- Assert return value is a numeric vector
return(outVtr)
}
#|------------------------------------------------------------------------------------------|
#| M A I N P R O C E D U R E |
#|------------------------------------------------------------------------------------------|
#--- Init loading data
#rawDfr <- RealisReadDfr("realis_terrasse_newsub_2008_jan_oct_15")
#rawDfr <- RealisReadDfr("realis_watertown_2000_oct_2012_oct_13")
rawDfr <- RealisReadDfr("realis_all_type_incl_ec_newsub_2012_jul_oct_16", 4)
#--- Count of cols of data
ncol(rawDfr)
## [1] 19
#--- Count of rows of data
nrow(rawDfr)
## [1] 3819
#--- Names of header
names(rawDfr)
## [1] "Project.Name" "Address"
## [3] "No..of.Units" "Area..sqm."
## [5] "Type.of.Area" "Transacted.Price...."
## [7] "Unit.Price....psm." "Unit.Price....psf."
## [9] "Contract.Date" "Property.Type"
## [11] "Tenure" "Completion.Date"
## [13] "Type.of.Sale" "Purchaser.Address.Indicator"
## [15] "Postal.District" "Postal.Sector"
## [17] "Postal.Code" "Planning.Region"
## [19] "Planning.Area"
#--- Peek at data
head(rawDfr)
## Project.Name Address No..of.Units Area..sqm.
## 1 LEEDON RESIDENCE 6 Leedon Heights #03-05 1 248
## 2 BARTLEY RESIDENCES 1A Lorong How Sun #02-05 1 85
## 3 THE WHARF RESIDENCE 11 Tong Watt Road #02-06 1 124
## 4 ASCENTIA SKY 1 Alexandra View #29-04 1 89
## 5 THE HILLOFT 131C Jalan Dermawan 1 457
## 6 NOVA 88 8 Bhamo Road #07-04 1 85
## Type.of.Area Transacted.Price.... Unit.Price....psm. Unit.Price....psf.
## 1 Strata 5854338 23606 2193
## 2 Strata 1131000 13306 1236
## 3 Strata 2269500 18302 1700
## 4 Strata 1600000 17978 1670
## 5 Strata 2700000 5908 549
## 6 Strata 1210000 14235 1322
## Contract.Date Property.Type Tenure
## 1 02-OCT-2012 Condominium Freehold
## 2 01-OCT-2012 Apartment 99 Yrs From 29/06/2011
## 3 01-OCT-2012 Condominium 999 Yrs From 01/07/1841
## 4 01-OCT-2012 Condominium 99 Yrs From 26/03/2008
## 5 01-OCT-2012 Semi-Detached House 999 Yrs From 12/10/1885
## 6 28-SEP-2012 Apartment Freehold
## Completion.Date Type.of.Sale Purchaser.Address.Indicator Postal.District
## 1 Uncompleted New Sale Private 10
## 2 Uncompleted New Sale Private 19
## 3 2012 Sub Sale HDB 09
## 4 Uncompleted Sub Sale Private 03
## 5 2011 Sub Sale Private 23
## 6 2012 Sub Sale Private 12
## Postal.Sector Postal.Code Planning.Region Planning.Area
## 1 26 266215 Central Region Bukit Timah
## 2 53 536559 North East Region Serangoon
## 3 23 238008 Central Region River Valley
## 4 15 158748 Central Region Bukit Merah
## 5 66 669117 West Region Bukit Batok
## 6 32 329640 Central Region Novena
#--- Coerce character into numeric or date
rawDfr[, 4] <- suppressWarnings( as.numeric( rawDfr[, 4] ) ) # Area sqm
rawDfr[, 6] <- suppressWarnings( as.numeric( rawDfr[, 6] ) ) # Transacted Price
rawDfr[, 7] <- suppressWarnings( as.numeric( rawDfr[, 7] ) ) # Unit Price psm
rawDfr[, 8] <- suppressWarnings( as.numeric( rawDfr[, 8] ) ) # Unit Price psf
rawDfr[, 9] <- as.Date(rawDfr[, 9], "%d-%b-%Y")
#--- Split address into parts and merge new columns
rawDfr <- cbind( rawDfr, RealisAddressSplitDfr(rawDfr[, 2]) )
#
# |------------------------------------------------------------------------------------------|
# | P A R T O N E P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
# --- Count of rows of data
nrow(rawDfr)
## [1] 3819
describe(rawDfr[, 4])
## var n mean sd median trimmed mad min max range skew kurtosis
## 1 1 3819 101.8 58.71 93 93.67 32.62 31 734 703 3.58 20.97
## se
## 1 0.95
describe(rawDfr[, 6])
## var n mean sd median trimmed mad min max range
## 1 1 3819 1245225 1188097 980000 1053722 366202 437080 30400000 29962920
## skew kurtosis se
## 1 11.02 200 19225
describe(rawDfr[, 8])
## var n mean sd median trimmed mad min max range skew kurtosis
## 1 1 3819 1164 473.9 1000 1091 330.6 482 4920 4438 1.9 6.02
## se
## 1 7.67
# --- Plot a simple histogram
par(mfrow = c(3, 2), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
hist(rawDfr[, 4], xlab = "Area Sqm", main = "Distribution of Floor Area")
boxplot(rawDfr[, 4], outchar = T, main = "Boxplot of Floor Area", col = "slateblue1")
hist(rawDfr[, 6], xlab = "", main = "Distribution of Purchase Price")
boxplot(rawDfr[, 6], outchar = T, main = "Boxplot of Purchase Price", col = "slateblue1")
hist(rawDfr[, 8], xlab = "Unit Price Psf", main = "Distribution of Unit Price Psf")
boxplot(rawDfr[, 8], outchar = T, main = "Boxplot of Unit Price Psf", col = "slateblue1")
# --- Make a bubble plot
par(mfrow = c(1, 1), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
# --- Size the circles
radius <- (rawDfr[, 4]/pi)
pal <- colorRampPalette(brewer.pal(3, "Blues"))
symbols(rawDfr[, 8], rawDfr[, 6], circles = radius, inches = 0.3, fg = "white",
bg = pal(10), ylab = "", xlab = "Unit Price Psf", main = "Purchase Price vs Unit Price Psf")
# text( rawDfr[, 8], rawDfr[, 6], rawDfr[, 4], cex=0.6, col='red' )
text(1000, 980000, "MEDIAN", cex = 1.2, col = "red")
# --- Plot a timeline
par(mfrow = c(1, 1), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
tempDfr <- rawDfr
tempDfr <- tempDfr[complete.cases(tempDfr[, 8]), ]
tempDfr <- tempDfr[complete.cases(tempDfr[, 9]), ]
plot(tempDfr[, 6] ~ tempDfr[, 9], col = rgb(red = 0, green = 0, blue = 0, alpha = 0.12),
xlim = c(as.Date("01-07-12", "%d-%m-%y"), as.Date("30-09-12", "%d-%m-%y")),
xlab = "Date", ylab = "", main = "Timeline of Transacted Price")
#
# |------------------------------------------------------------------------------------------|
# | P A R T C O U N T P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
# --- Plot a wordcloud
if (length(unique(rawDfr$Project.Name)) > 6) {
nameDfr <- rawDfr$Project.Name
par(mfrow = c(1, 1), mar = c(2.1, 2.1, 2.1, 2.1))
wordcloud(gsub(" ", ".", rawDfr$Project.Name), colors = brewer.pal(6, "Dark2"),
random.order = FALSE)
}
## Loading required package: tm
if (length(unique(rawDfr$Project.Name)) > 6) {
# --- Count of freq by state
table(nameDfr)
# --- Create a data frame of freq by state Remove row.names
tableDfr <- data.frame(name = names(tapply(nameDfr, nameDfr, length)), freq = tapply(nameDfr,
nameDfr, length))
rownames(tableDfr) <- NULL
# --- Create a subset
outcome2Dfr <- rawDfr[rawDfr$Project.Name %in% subset(tableDfr$name, tableDfr$freq >=
20), ]
# --- Count of rows of data
nrow(outcome2Dfr)
# --- Count of freq by state
table(outcome2Dfr$Project.Name)
# --- Plot a simple boxplot of the THIRTY(30)-day death rates by state (a)
# Add a label to the y-axis '30-day Death Rate' (b) Add a title for the
# histogram 'Heart Attack 30-day Death Rate by State' (c) Set the x- and
# y-axis tick labels to be perpendicular to the axes (see las) (d) Order
# the states by the MEDIAN THIRTY(30)-day death rate and plot the boxplot.
# (e) Shrink the x-axis tick labels so that the abbreviated state names do
# NOT overlap EACH other. (f) Alter the x-axis tick labels so that they
# include the number of hospitals in that state in parentheses. For
# example, the label for the state of Connecticut would be CT (32). You
# will need the axis() function and when you call the boxplot() function
# you will want to set the option xaxt to be 'n'.
psf <- outcome2Dfr[, 8]
name <- reorder(outcome2Dfr$Project.Name, outcome2Dfr[, 8], median, na.rm = TRUE)
orderVtr <- levels(name["scores"])
countVtr <- freqVtr(tableDfr, orderVtr)
par(mfrow = c(1, 1), las = 2, mar = c(15.1, 5.1, 4.1, 2.1))
boxplot(psf ~ name, ylab = "Unit Price psf", xaxt = "n", main = "Unit Price Psf by Project")
axis(1, at = seq_along(orderVtr), cex.axis = 0.8, labels = eval(substitute(paste(st,
" (", n, ")", sep = ""), list(st = orderVtr, n = countVtr))))
}
#
# |------------------------------------------------------------------------------------------|
# | P A R T F O U R P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
subDfr <- rawDfr[rawDfr[, 6] > 1400000, ]
subDfr <- subDfr[subDfr[, 6] < 1500000, ]
# --- Count of rows of data
nrow(subDfr)
## [1] 110
describe(subDfr[, 4])
## var n mean sd median trimmed mad min max range skew kurtosis se
## 1 1 110 110.7 37.73 108 108.5 35.58 45 212 167 0.5 -0.16 3.6
describe(subDfr[, 6])
## var n mean sd median trimmed mad min max range skew
## 1 1 110 1444664 23803 1447350 1444028 27618 1400730 1490400 89670 0.17
## kurtosis se
## 1 -0.93 2270
describe(subDfr[, 8])
## var n mean sd median trimmed mad min max range skew kurtosis
## 1 1 110 1373 519.4 1236 1317 401.8 628 2948 2320 1 0.36
## se
## 1 49.52
# --- Plot a simple histogram
par(mfrow = c(3, 2), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
hist(subDfr[, 4], xlab = "Area Sqm", main = "Distribution of Floor Area")
boxplot(subDfr[, 4], outchar = T, main = "Boxplot of Floor Area", col = "slateblue1")
hist(subDfr[, 6], xlab = "", main = "Distribution of Purchase Price")
boxplot(subDfr[, 6], outchar = T, main = "Boxplot of Purchase Price", col = "slateblue1")
hist(subDfr[, 8], xlab = "Unit Price Psf", main = "Distribution of Unit Price Psf")
boxplot(subDfr[, 8], outchar = T, main = "Boxplot of Unit Price Psf", col = "slateblue1")
# --- Make a bubble plot
par(mfrow = c(1, 1), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
# --- Size the circles
radius <- (subDfr[, 4]/pi)
symbols(subDfr[, 8], subDfr[, 6], circles = radius, inches = 0.3, fg = "white",
bg = pal(5), ylab = "", xlab = "Unit Price Psf", main = "Purchase Price vs Unit Price Psf")
text(subDfr[, 8], subDfr[, 6], subDfr[, 4], cex = 0.6, col = "red")
text(1236, 1447350, "MEDIAN", cex = 1.2, col = "red")
#
# |------------------------------------------------------------------------------------------|
# | P A R T F I V E P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
# --- Scatterplot and Correlation Analysis (library gclus and ltm)
# Scatterplot
subDfr <- data.frame(price = rawDfr[, 6], area = rawDfr[, 4], psf = rawDfr[,
8])
par(mfrow = c(1, 1), las = 1)
cpairs(subDfr, gap = 0.5, panel.colors = dmat.color(abs(cor(subDfr))), col = rgb(0,
0, 0, 0.1), main = "RAW Variables Ordered and Colored by Correlations")
# --- Correlation matrix
cor(subDfr)
## price area psf
## price 1.0000 0.6653 0.4999
## area 0.6653 1.0000 -0.1036
## psf 0.4999 -0.1036 1.0000
# --- Perform correlation test for matrix (library ltm) Correlation null
# hypothesis is that the correlation is zero (not correlated) If the
# p-value is less than the alpha level, then the null hypothesis is
# rejected Check for correlation p<0.05 is correlated
rcor.test(subDfr)
##
## price area psf
## price ***** 0.665 0.500
## area <0.001 ***** -0.104
## psf <0.001 <0.001 *****
##
## upper diagonal part contains correlation coefficient estimates
## lower diagonal part contains corresponding p-values
# --- Simple Regression (unstandardized) Y = price; X = area;
raw1Lm <- lm(subDfr$price ~ subDfr$area)
summary(raw1Lm)
##
## Call:
## lm(formula = subDfr$price ~ subDfr$area)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3457089 -320088 -98045 193253 22636043
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -126001 28745 -4.38 1.2e-05 ***
## subDfr$area 13464 244 55.06 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 887000 on 3817 degrees of freedom
## Multiple R-squared: 0.443, Adjusted R-squared: 0.443
## F-statistic: 3.03e+03 on 1 and 3817 DF, p-value: <2e-16
#
# |------------------------------------------------------------------------------------------|
# | E N D O F S C R I P T |
# |------------------------------------------------------------------------------------------|