The data for this R script comes from the URA Realis web site (www.ura.gov.sg, click on Realis menu) The purpose of the web site
is to provide data and information about the caveats lodged for EVERY property transaction in Singapore.
The URA Realis web site contains a lot of data and we will only look at a small subset for this R script. The files for this
script are as follows:
(1) realis_ten_projects_treasure_newsub_2000_jan_2012_oct_18.csv
Contains information about ALL properties in Singapore from July 2012 to current (16 October 2012).
(2) realis_ten_projects_treasure_resale_2000_jan_2012_oct_18.csv
Contains information about properties for TEN (10) projects (including Treasure Mansions) from January 2000 to current (18 October 2012).
library(psych)
library(car)
## Loading required package: MASS
## Loading required package: nnet
## Attaching package: 'car'
## The following object(s) are masked from 'package:psych':
##
## logit
library(ltm)
## Loading required package: msm
## Loading required package: mvtnorm
## Loading required package: polycor
## Loading required package: sfsmisc
## Attaching package: 'polycor'
## The following object(s) are masked from 'package:psych':
##
## polyserial
## Attaching package: 'ltm'
## The following object(s) are masked from 'package:psych':
##
## factor.scores
library(gclus)
## Loading required package: cluster
library(RColorBrewer)
library(wordcloud)
## Loading required package: Rcpp
#|------------------------------------------------------------------------------------------|
#| I N I T I A L I Z A T I O N |
#|------------------------------------------------------------------------------------------|
RealisReadDfr <- function(fileStr, partNum=1,
workDirStr="C:/Users/denbrige/100 FxOption/103 FxOptionVerBack/080 Fx Git/R-nonsource")
{
#--- Assert THREE (3) arguments:
# fileStr: name of the file (without the suffix "_part_xx" and extension ".csv"
# partNum: number of parts
# workDirStr: working directory
#--- Check that partNum is valid (between 1 to 99)
if( as.numeric(partNum) < 1 || as.numeric(partNum) > 99 )
stop("partNum MUST be between 1 AND 99")
#--- Set working directory
setwd(workDirStr)
#--- Read data from split parts
# Append suffix to the fileStr
# Read each part and merge them together
if( as.numeric(partNum) > 1 )
{
retDfr <- read.csv( paste( fileStr, "_part01.csv", sep="" ), colClasses = "character" )
for( id in 2:partNum )
{
#--- rbind() function will bind two data frames with the same header together
partStr <- paste( fileStr, "_part", sprintf("%02d", as.numeric(id)), ".csv", sep="" )
tmpDfr <- read.csv( partStr, colClasses = "character")
retDfr <- rbind( retDfr, tmpDfr )
}
}
else
retDfr <- read.csv( paste( fileStr, ".csv", sep="" ), colClasses = "character" )
#--- Return a data frame
return(retDfr)
}
#|------------------------------------------------------------------------------------------|
#| I N T E R N A L F U N C T I O N S |
#|------------------------------------------------------------------------------------------|
RealisBoxplotFtr <- function(inDfr, FUN=median, ...)
{
#--- Plot a simple boxplot of the values by name
# (a) Order the names by the MEDIAN value and plot the boxplot.
# (e) Shrink the x-axis tick labels so that the abbreviated state names do NOT overlap
# EACH other.
# (f) Alter the x-axis tick labels so that they include the number of hospitals in that
# state in parentheses. For example, the label for the state of Connecticut would
# be CT (32). You will need the axis() function and when you call the boxplot()
# function you will want to set the option xaxt to be "n".
valueNum <- inDfr$value
nameFtr <- reorder(inDfr$name, inDfr$value, FUN, na.rm=TRUE)
orderVtr <- levels(nameFtr["scores"])
countVtr <- freqVtr(tableDfr, orderVtr)
boxplot(valueNum ~ nameFtr, xaxt="n", ...)
axis(1, at=seq_along(orderVtr), cex.axis=0.8,
labels=eval(substitute(paste(st," (",n,")",sep=""), list(st=orderVtr, n=countVtr) )))
return(nameFtr)
}
RealisAddressSplitDfr <- function( inChr )
{
#--- Assert THREE (3) arguments:
# inChr: vector of addresses
#--- Split address into parts
subChr <- substring(inChr, regexpr('#', inChr))
lvlChr <- substr(subChr, 2, 3)
untChr <- substr(subChr, 5, 6)
#--- Return a data frame
return(data.frame(level=lvlChr, unit=untChr))
}
freqDfr <- function(inVtr)
{
nameDfr <- inVtr
#--- Count of freq by name
table(nameDfr)
#--- Create a data frame of freq by name
# Remove row.names
tableDfr <- data.frame(name=names(tapply(nameDfr, nameDfr, length)), freq=tapply(nameDfr, nameDfr, length))
rownames(tableDfr) <- NULL
#--- Create a subset
return(tableDfr)
}
freqVtr <- function(inDfr, orderVtr)
{
#--- Assert 'directory' is a character vector of length 1 indicating the location of the
# CSV files.
# 'threshold' is a numeric vector of length 1 indicating the number of completely
# observed observations (on all variables) required to compute the correlation
# between nitrate and sulfate; the default is 0.
# Return a numeric vector of correlations.
#--- Assert create an empty numeric vector
outVtr <- numeric(0)
for(ord in orderVtr)
{
#--- Append numeric vector
outVtr <- c(outVtr, inDfr[inDfr$name==ord,2])
}
#--- Assert return value is a numeric vector
return(outVtr)
}
#|------------------------------------------------------------------------------------------|
#| M A I N P R O C E D U R E |
#|------------------------------------------------------------------------------------------|
#--- Init loading data
rawDfr <- RealisReadDfr("realis_ten_projects_treasure_newsub_2000_jan_2012_oct_18")
postDfr <- RealisReadDfr("realis_ten_projects_treasure_resale_2000_jan_2012_oct_18")
#--- Count of rows of data
nrow(rawDfr)
## [1] 293
#--- Count of rows of data
nrow(postDfr)
## [1] 98
#--- Coerce character into numeric or date
rawDfr[, 4] <- suppressWarnings( as.numeric( rawDfr[, 4] ) ) # Area sqm
rawDfr[, 6] <- suppressWarnings( as.numeric( rawDfr[, 6] ) ) # Transacted Price
rawDfr[, 7] <- suppressWarnings( as.numeric( rawDfr[, 7] ) ) # Unit Price psm
rawDfr[, 8] <- suppressWarnings( as.numeric( rawDfr[, 8] ) ) # Unit Price psf
rawDfr[, 9] <- as.Date(rawDfr[, 9], "%d-%b-%Y")
#--- Coerce character into numeric or date
postDfr[, 4] <- suppressWarnings( as.numeric( postDfr[, 4] ) ) # Area sqm
postDfr[, 6] <- suppressWarnings( as.numeric( postDfr[, 6] ) ) # Transacted Price
postDfr[, 7] <- suppressWarnings( as.numeric( postDfr[, 7] ) ) # Unit Price psm
postDfr[, 8] <- suppressWarnings( as.numeric( postDfr[, 8] ) ) # Unit Price psf
postDfr[, 9] <- as.Date(postDfr[, 9], "%d-%b-%Y")
#--- Split address into parts and merge new columns
rawDfr <- cbind( rawDfr, RealisAddressSplitDfr(rawDfr[, 2]) )
postDfr <- cbind( postDfr, RealisAddressSplitDfr(postDfr[, 2]) )
#
# |------------------------------------------------------------------------------------------|
# | P A R T O N E P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
# --- Count of rows of data
nrow(rawDfr)
## [1] 293
describe(rawDfr[, 4])
## var n mean sd median trimmed mad min max range skew kurtosis
## 1 1 293 108.3 41.84 109 105 35.58 38 257 219 0.79 0.86
## se
## 1 2.44
describe(rawDfr[, 6])
## var n mean sd median trimmed mad min max range skew
## 1 1 293 898604 340455 848000 872586 366202 361000 2680000 2319000 1.19
## kurtosis se
## 1 3.37 19890
describe(rawDfr[, 8])
## var n mean sd median trimmed mad min max range skew kurtosis
## 1 1 293 803.8 198.9 815 799 170.5 358 1346 988 0.1 -0.25
## se
## 1 11.62
nrow(postDfr)
## [1] 98
describe(postDfr[, 4])
## var n mean sd median trimmed mad min max range skew kurtosis se
## 1 1 98 113.9 42.1 103.5 110.9 26.69 38 240 202 0.75 0.33 4.25
describe(postDfr[, 6])
## var n mean sd median trimmed mad min max range skew
## 1 1 98 952769 371988 884000 920689 398226 430000 2500000 2070000 1.14
## kurtosis se
## 1 1.99 37576
describe(postDfr[, 8])
## var n mean sd median trimmed mad min max range skew kurtosis se
## 1 1 98 815 253.4 801.5 799.3 298 391 1467 1076 0.53 -0.26 25.59
# --- Plot a simple histogram
par(mfcol = c(3, 2), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
# hist( rawDfr[, 4], xlab='Area Sqm', main='Distribution of Floor Area' )
boxplot(rawDfr[, 4], outchar = T, ylim = c(30, 300), main = "Boxplot of Floor Area",
col = "slateblue1")
# hist( rawDfr[, 6], xlab='', main='Distribution of Purchase Price' )
boxplot(rawDfr[, 6], outchar = T, ylim = c(3e+05, 3e+06), main = "Boxplot of Purchase Price",
col = "slateblue1")
# hist( rawDfr[, 8], xlab='Unit Price Psf', main='Distribution of Unit
# Price Psf' )
boxplot(rawDfr[, 8], outchar = T, ylim = c(300, 1500), main = "Boxplot of Unit Price Psf",
col = "slateblue1")
# hist( rawDfr[, 4], xlab='Area Sqm', main='Distribution of Floor Area' )
boxplot(postDfr[, 4], outchar = T, ylim = c(30, 300), main = "Boxplot of Floor Area (Resale)",
col = "orangered1")
# hist( rawDfr[, 6], xlab='', main='Distribution of Purchase Price' )
boxplot(postDfr[, 6], outchar = T, ylim = c(3e+05, 3e+06), main = "Boxplot of Purchase Price (Resale)",
col = "orangered1")
# hist( rawDfr[, 8], xlab='Unit Price Psf', main='Distribution of Unit
# Price Psf' )
boxplot(postDfr[, 8], outchar = T, ylim = c(300, 1500), main = "Boxplot of Unit Price Psf (Resale)",
col = "orangered1")
# --- Make a bubble plot
par(mfrow = c(1, 2), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
# --- Size the circles
radius <- (rawDfr[, 4]/pi)
pal <- colorRampPalette(brewer.pal(3, "Blues"))
symbols(rawDfr[, 8], rawDfr[, 6], circles = radius, inches = 0.3, fg = "white",
bg = pal(10), xlim = c(300, 1500), ylim = c(3e+05, 3e+06), ylab = "", xlab = "Unit Price Psf",
main = "Purchase Price vs Unit Price Psf")
text(rawDfr[, 8], rawDfr[, 6], rawDfr[, 4], cex = 0.6, col = "slateblue1")
text(815, 848000, "MEDIAN", cex = 1.2, col = "blue")
# --- Make a bubble plot
radius <- (postDfr[, 4]/pi)
palr <- colorRampPalette(brewer.pal(3, "Reds"))
symbols(postDfr[, 8], postDfr[, 6], circles = radius, inches = 0.3, fg = "white",
bg = palr(10), xlim = c(300, 1500), ylim = c(3e+05, 3e+06), ylab = "", xlab = "Unit Price Psf",
main = "Purchase Price vs Unit Price Psf (Resale)")
text(postDfr[, 8], postDfr[, 6], postDfr[, 4], cex = 0.6, col = "white")
text(802, 884000, "MEDIAN", cex = 1.2, col = "red")
# --- Plot a timeline
par(mfrow = c(1, 1), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
tempDfr <- rawDfr
tempDfr <- tempDfr[complete.cases(tempDfr[, 8]), ]
tempDfr <- tempDfr[complete.cases(tempDfr[, 9]), ]
plot(tempDfr[, 8] ~ tempDfr[, 9], col = rgb(red = 0, green = 0, blue = 1, alpha = 0.21),
ylim = c(300, 1500), xlim = c(as.Date("01-01-02", "%d-%m-%y"), as.Date("30-09-12",
"%d-%m-%y")), xlab = "Date", ylab = "", main = "Timeline of Unit Price Psf (Red: Resale)")
temprDfr <- postDfr
temprDfr <- temprDfr[complete.cases(temprDfr[, 8]), ]
temprDfr <- temprDfr[complete.cases(temprDfr[, 9]), ]
points(temprDfr[, 8] ~ temprDfr[, 9], pch = 21, col = rgb(red = 1, green = 0,
blue = 0, alpha = 0.51), ylim = c(300, 1500), xlim = c(as.Date("01-01-02",
"%d-%m-%y"), as.Date("30-09-12", "%d-%m-%y")))
#
# |------------------------------------------------------------------------------------------|
# | P A R T C O U N T P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
# --- Plot a wordcloud
if (length(unique(rawDfr$Project.Name)) > 1) {
par(mfrow = c(1, 2), mar = c(2.1, 2.1, 2.1, 2.1))
nameDfr <- rawDfr$Project.Name
wordcloud(gsub(" ", ".", rawDfr$Project.Name), scale = c(4, 0.5), colors = brewer.pal(6,
"Set2"), random.order = FALSE)
namerDfr <- postDfr$Project.Name
wordcloud(gsub(" ", ".", postDfr$Project.Name), scale = c(4, 0.5), colors = brewer.pal(6,
"Dark2"), random.order = FALSE)
}
## Loading required package: tm
if (length(unique(rawDfr$Project.Name)) > 1) {
par(mfrow = c(1, 2), las = 2, mar = c(15.1, 5.1, 4.1, 2.1))
# --- Create a boxplot with given arguments
nameDfr <- rawDfr
aColNum <- 1
bColNum <- 8
thresholdNum <- 0
nameFun <- median
mainStr <- "Unit Price Psf by Project (New)"
ylabStr <- "Unit Price Psf"
# --- Start
tableDfr <- freqDfr(nameDfr[, aColNum])
outcomeDfr <- nameDfr[nameDfr[, aColNum] %in% subset(tableDfr$name, tableDfr$freq >=
thresholdNum), ]
outDfr <- data.frame(name = outcomeDfr[, aColNum], value = outcomeDfr[,
bColNum])
nameFtr <- RealisBoxplotFtr(data.frame(name = outDfr$name, value = outDfr$value),
nameFun, ylab = ylabStr, main = mainStr, col = "slateblue1", ylim = c(300,
1500))
# --- Create a boxplot with given arguments
nameDfr <- postDfr
aColNum <- 1
bColNum <- 8
thresholdNum <- 0
nameFun <- median
mainStr <- "Unit Price Psf by Project (Resale)"
ylabStr <- "Unit Price Psf"
# --- Start
tableDfr <- freqDfr(nameDfr[, aColNum])
outcomeDfr <- nameDfr[nameDfr[, aColNum] %in% subset(tableDfr$name, tableDfr$freq >=
thresholdNum), ]
outDfr <- data.frame(name = outcomeDfr[, aColNum], value = outcomeDfr[,
bColNum])
nameFtr <- RealisBoxplotFtr(data.frame(name = outDfr$name, value = outDfr$value),
nameFun, ylab = ylabStr, main = mainStr, col = "orangered1", ylim = c(300,
1500))
par(mfrow = c(1, 2), las = 2, mar = c(15.1, 5.1, 4.1, 2.1))
# --- Create a boxplot with given arguments
nameDfr <- rawDfr
aColNum <- 1
bColNum <- 4
thresholdNum <- 0
nameFun <- median
mainStr <- "Floor Area by Project (New)"
ylabStr <- "Area Sqm"
# --- Start
tableDfr <- freqDfr(nameDfr[, aColNum])
outcomeDfr <- nameDfr[nameDfr[, aColNum] %in% subset(tableDfr$name, tableDfr$freq >=
thresholdNum), ]
outDfr <- data.frame(name = outcomeDfr[, aColNum], value = outcomeDfr[,
bColNum])
nameFtr <- RealisBoxplotFtr(data.frame(name = outDfr$name, value = outDfr$value),
nameFun, ylab = ylabStr, main = mainStr, col = "slateblue4", ylim = c(30,
300))
# --- Create a boxplot with given arguments
nameDfr <- postDfr
aColNum <- 1
bColNum <- 4
thresholdNum <- 0
nameFun <- median
mainStr <- "Floor Area by Project (Resale)"
ylabStr <- "Area Sqm"
# --- Start
tableDfr <- freqDfr(nameDfr[, aColNum])
outcomeDfr <- nameDfr[nameDfr[, aColNum] %in% subset(tableDfr$name, tableDfr$freq >=
thresholdNum), ]
outDfr <- data.frame(name = outcomeDfr[, aColNum], value = outcomeDfr[,
bColNum])
nameFtr <- RealisBoxplotFtr(data.frame(name = outDfr$name, value = outDfr$value),
nameFun, ylab = ylabStr, main = mainStr, col = "orangered4", ylim = c(30,
300))
}
postDfr[postDfr$Project.Name == "TREASURE MANSIONS", c(4, 6, 8, 9, 20, 21)]
## Area..sqm. Transacted.Price.... Unit.Price....psf. Contract.Date level
## 23 81 850000 975 2011-09-20 04
## 45 159 925000 540 2011-01-03 05
## 68 159 750000 438 2008-11-21 05
## 76 80 526000 611 2007-08-31 01
## 96 70 430000 571 2006-04-27 02
## unit
## 23 01
## 45 01
## 68 01
## 76 03
## 96 02
Hypothesis for the independent t-test
The null hypothesis for the independent t-test is that the population means from the two unrelated groups are equal.
Assumption of normality of the dependent variable
The independent t-test requires that the dependent variable is approximately normally distributed within each group. We can test
for this using a multitude of tests but the Shapiro-Wilks Test or a graphical method, such as a Q-Q Plot, are very common.
What to do when you violate the normality assumption
If you find that either one or both of your group's data is not approximately normally distributed and groups sizes differ greatly
then you have TWO (2) options:
(1) transform your data so that the data becomes normally distributed; or
(2) run the Mann-Whitney U Test which is a non-parametric test that does not require the assumption of normality
Assumption of Homogeneity of Variance
The independent t-test assumes the variances of the two groups you are measuring to be equal. The assumption of homogeneity of
variance can be tested using Levene's Test of Equality of Variances.
Overcoming a Violation of the Assumption of Homogeneity of Variance
If the Levene's Test for Equality of Variances is statistically significant and, therefore, indicates unequal variances, we can
correct for this violation by not using the pooled estimate for the error term for the t-statistic and also making adjustments to
the degrees of freedom using the Welch-Satterthwaite method.
For the THREE (3) dependent variables Area, Price and Psf, the results of Shapiro tests indicates that normality of Psf > Area >
Price, therefore Psf should be the dependent variable for the independent t-test. As the Levene's Test for Equality of Variances is
statistically significant, and therefore indicates unequal variance, we performed the Welch independent t-test.
The result of the t-test is insignificant (p=0.6923>0.05), and therefore indicates that the population means are equal.
#
# |------------------------------------------------------------------------------------------|
# | P A R T T W O P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
# --- Create subsets of TWO (2) groups Replace sub with new
bothDfr <- rbind(rawDfr, postDfr)
nrow(bothDfr)
## [1] 391
bothDfr[, 13] <- gsub("Sub Sale", "New Sale", bothDfr[, 13])
nsDfr <- subset(bothDfr, bothDfr[, 13] == "New Sale")
rsDfr <- subset(bothDfr, bothDfr[, 13] == "Resale")
nrow(nsDfr)
## [1] 293
nrow(rsDfr)
## [1] 98
# --- Check for normality p>0.05 is normal Layout
layout(matrix(1:6, 3, 2, byrow = TRUE))
for (cNum in c(4, 6, 8)) {
cNum <- as.numeric(cNum)
# --- Check for normality p>0.05 is normal
p <- shapiro.test(nsDfr[, cNum])$p.value
hist(nsDfr[, cNum], prob = T, main = c(paste("New Sale ", names(nsDfr)[cNum]),
paste("Shapiro p=", prettyNum(p, digits = 2), " (if p>0.05 then normal)")),
xlab = names(nsDfr)[cNum])
lines(density(nsDfr[, cNum]))
# --- Check for normality
qqnorm(nsDfr[, cNum])
}
# --- Check for normality p>0.05 is normal Layout
layout(matrix(1:6, 3, 2, byrow = TRUE))
for (cNum in c(4, 6, 8)) {
cNum <- as.numeric(cNum)
# --- Check for normality p>0.05 is normal
p <- shapiro.test(rsDfr[, cNum])$p.value
hist(rsDfr[, cNum], prob = T, main = c(paste("Resale Sale ", names(rsDfr)[cNum]),
paste("Shapiro p=", prettyNum(p, digits = 2), " (if p>0.05 then normal)")),
xlab = names(nsDfr)[cNum])
lines(density(rsDfr[, cNum]))
# --- Check for normality
qqnorm(rsDfr[, cNum])
}
# --- Independent t-test
t.test(bothDfr[, 8] ~ bothDfr[, 13], var.equal = T)
##
## Two Sample t-test
##
## data: bothDfr[, 8] by bothDfr[, 13]
## t = -0.4469, df = 389, p-value = 0.6552
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -60.19 37.90
## sample estimates:
## mean in group New Sale mean in group Resale
## 803.8 815.0
# --- Levene's test for homogeneity of variance A large F-value means
# significant, therefore violate homogeneity of variance
leveneTest(bothDfr[, 8], bothDfr[, 13], center = "mean")
## Warning: bothDfr[, 13] coerced to factor.
## Levene's Test for Homogeneity of Variance (center = "mean")
## Df F value Pr(>F)
## group 1 10.8 0.0011 **
## 389
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# --- Independent t-test
t.test(bothDfr[, 8] ~ bothDfr[, 13], var.equal = F)
##
## Welch Two Sample t-test
##
## data: bothDfr[, 8] by bothDfr[, 13]
## t = -0.3966, df = 139.1, p-value = 0.6923
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -66.72 44.43
## sample estimates:
## mean in group New Sale mean in group Resale
## 803.8 815.0
# --- Calculate effect size for independent t-test SD_Pooled ^ 2 =
# (DF_1/DF_Total) * SD_1 ^ 2 + (DF_2/DF_Total) * SD_2 ^ 2 d Effect size =
# (Mean(X_1) - Mean(X_2)) / SD_Pooled
nsDescDfr <- describe(nsDfr[, 8])
rsDescDfr <- describe(rsDfr[, 8])
nsDfNum <- nrow(nsDfr) - 1
rsDfNum <- nrow(rsDfr) - 1
totDfNum <- nsDfNum + rsDfNum
sdPooledNum <- sqrt(nsDfNum/totDfNum * nsDescDfr[1, 4]^2 + rsDfNum/totDfNum *
rsDescDfr[1, 4]^2)
(rsDescDfr[1, 3] - nsDescDfr[1, 3])/sdPooledNum
## [1] 0.05215
#
# |------------------------------------------------------------------------------------------|
# | P A R T T H R E E P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
subDfr <- postDfr[postDfr[, 4] > 90, ]
subDfr <- subDfr[subDfr[, 4] < 100, ]
# --- Count of rows of data
nrow(subDfr)
## [1] 15
describe(subDfr[, 4])
## var n mean sd median trimmed mad min max range skew kurtosis se
## 1 1 15 95.53 1.25 95 95.46 1.48 94 98 4 0.65 -0.48 0.32
describe(subDfr[, 6])
## var n mean sd median trimmed mad min max range skew
## 1 1 15 705587 124223 740000 703615 200151 536800 9e+05 363200 0.19
## kurtosis se
## 1 -1.59 32074
describe(subDfr[, 8])
## var n mean sd median trimmed mad min max range skew kurtosis se
## 1 1 15 686.1 120.2 702 684 172 519 880 361 0.19 -1.55 31.03
# --- Make a bubble plot
par(mfrow = c(1, 1), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
# --- Size the circles
radius <- subDfr$level
symbols(subDfr[, 8], subDfr[, 6], circles = radius, inches = 0.3, fg = "white",
bg = "lightblue", ylab = "", xlab = "Unit Price Psf", main = "Purchase Price vs Unit Price Psf (Resale)")
text(subDfr[, 8], subDfr[, 6], subDfr$level, cex = 0.8, col = "red")
text(702, 740000, "MEDIAN", cex = 1.2, col = "red")
#
# |------------------------------------------------------------------------------------------|
# | P A R T F O U R P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
subDfr <- postDfr[postDfr[, 6] > 7e+05, ]
subDfr <- subDfr[subDfr[, 6] < 9e+05, ]
# --- Count of rows of data
nrow(subDfr)
## [1] 21
describe(subDfr[, 4])
## var n mean sd median trimmed mad min max range skew kurtosis se
## 1 1 21 118.6 39.87 98 114.3 17.79 71 198 127 0.96 -0.53 8.7
describe(subDfr[, 6])
## var n mean sd median trimmed mad min max range skew
## 1 1 21 799810 53642 805000 798353 81543 726000 888000 162000 0.12
## kurtosis se
## 1 -1.55 11706
describe(subDfr[, 8])
## var n mean sd median trimmed mad min max range skew kurtosis
## 1 1 21 683.1 191.6 731 678.2 169 391 1060 669 -0.03 -1.02
## se
## 1 41.82
# --- Make a bubble plot
par(mfrow = c(1, 1), las = 2, mar = c(5.1, 5.1, 4.1, 2.1))
# --- Size the circles
radius <- (subDfr[, 4]/pi)
symbols(subDfr[, 8], subDfr[, 6], circles = radius, inches = 0.3, fg = "white",
bg = pal(5), ylab = "", xlab = "Unit Price Psf", main = "Purchase Price vs Unit Price Psf (Resale)")
text(subDfr[, 8], subDfr[, 6], subDfr[, 4], cex = 0.8, col = "red")
text(731, 805000, "MEDIAN", cex = 1.2, col = "red")
#
# |------------------------------------------------------------------------------------------|
# | P A R T F I V E P R O C E D U R E |
# |------------------------------------------------------------------------------------------|
# --- Scatterplot and Correlation Analysis (library gclus and ltm)
# Scatterplot
subDfr <- data.frame(price = rawDfr[, 6], area = rawDfr[, 4], psf = rawDfr[,
8])
par(mfrow = c(1, 1), las = 1)
cpairs(subDfr, gap = 0.5, panel.colors = dmat.color(abs(cor(subDfr))), col = rgb(0,
0, 0, 0.1), main = "RAW Variables Ordered and Colored by Correlations (New)")
# --- Correlation matrix
cor(subDfr)
## price area psf
## price 1.0000 0.7836 0.1545
## area 0.7836 1.0000 -0.4289
## psf 0.1545 -0.4289 1.0000
# --- Perform correlation test for matrix (library ltm) Correlation null
# hypothesis is that the correlation is zero (not correlated) If the
# p-value is less than the alpha level, then the null hypothesis is
# rejected Check for correlation p<0.05 is correlated
rcor.test(subDfr)
##
## price area psf
## price ***** 0.784 0.155
## area <0.001 ***** -0.429
## psf 0.008 <0.001 *****
##
## upper diagonal part contains correlation coefficient estimates
## lower diagonal part contains corresponding p-values
# --- Simple Regression (unstandardized) Y = price; X = area;
raw1Lm <- lm(subDfr$price ~ subDfr$area)
summary(raw1Lm)
##
## Call:
## lm(formula = subDfr$price ~ subDfr$area)
##
## Residuals:
## Min 1Q Median 3Q Max
## -704310 -114635 35690 131078 833128
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 208228 34392 6.05 4.3e-09 ***
## subDfr$area 6376 296 21.52 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 212000 on 291 degrees of freedom
## Multiple R-squared: 0.614, Adjusted R-squared: 0.613
## F-statistic: 463 on 1 and 291 DF, p-value: <2e-16
subrDfr <- data.frame(price = postDfr[, 6], area = postDfr[, 4], psf = postDfr[,
8])
# --- Correlation matrix
cor(subrDfr)
## price area psf
## price 1.0000 0.6716 0.3165
## area 0.6716 1.0000 -0.4067
## psf 0.3165 -0.4067 1.0000
# --- Perform correlation test for matrix (library ltm) Correlation null
# hypothesis is that the correlation is zero (not correlated) If the
# p-value is less than the alpha level, then the null hypothesis is
# rejected Check for correlation p<0.05 is correlated
rcor.test(subrDfr)
##
## price area psf
## price ***** 0.672 0.316
## area <0.001 ***** -0.407
## psf 0.001 <0.001 *****
##
## upper diagonal part contains correlation coefficient estimates
## lower diagonal part contains corresponding p-values
# --- Simple Regression (unstandardized) Y = price; X = area;
rawr1Lm <- lm(subrDfr$price ~ subrDfr$area)
summary(rawr1Lm)
##
## Call:
## lm(formula = subrDfr$price ~ subrDfr$area)
##
## Residuals:
## Min 1Q Median 3Q Max
## -615990 -228805 12708 215721 917536
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 277059 81066 3.42 0.00093 ***
## subrDfr$area 5934 668 8.88 3.8e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 277000 on 96 degrees of freedom
## Multiple R-squared: 0.451, Adjusted R-squared: 0.445
## F-statistic: 78.9 on 1 and 96 DF, p-value: 3.76e-14
#
# |------------------------------------------------------------------------------------------|
# | E N D O F S C R I P T |
# |------------------------------------------------------------------------------------------|