Reading CSV methods

With base `read.csv`, the numeric columns (SQ.FT, PRICE, PSF) are read as factors because the numbers contain commas as thousands separators. They would have to be converted to numbers by hand, which is cumbersome.

library(ggplot2)
# Base read.csv() cannot parse numbers written with thousands separators, so
# SQ.FT, PRICE and PSF are read as text. stringsAsFactors = TRUE is spelled out
# because R >= 4.0.0 no longer converts strings to factors by default; without
# it the factor structure printed by str() below would not be reproduced.
pty <- read.csv("pty.csv", stringsAsFactors = TRUE)
str(pty)

## 'data.frame':    1022 obs. of  10 variables:
##  $ PROJECT        : Factor w/ 386 levels "1 KING ALBERT PARK",..: 117 117 117 117 117 220 220 220 220 220 ...
##  $ PROPERTY.TYPE  : Factor w/ 6 levels "Apartment","Condominium",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ TENURE         : Factor w/ 5 levels "103 years","956 years",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ SALE.DATE      : Factor w/ 15 levels "1-Aug-17","18-Jul-17",..: 6 6 5 2 2 6 6 6 6 4 ...
##  $ SQ.FT          : Factor w/ 295 levels "1,001","1,012",..: 86 245 267 288 288 275 275 275 242 273 ...
##  $ PRICE          : Factor w/ 804 levels "1,000,000","1,005,000",..: 366 95 238 337 321 209 185 163 709 143 ...
##  $ PSF            : Factor w/ 687 levels "1,000","1,002",..: 325 452 422 405 388 339 323 308 335 306 ...
##  $ COMPLETION.DATE: Factor w/ 48 levels "1963","1967",..: 47 47 47 47 47 47 47 47 47 47 ...
##  $ TYPE.OF.SALE   : Factor w/ 3 levels "New Sale","Resale",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ DISTRICT       : int  3 3 3 3 3 3 3 3 3 3 ...

Using library READR

`read_csv` parses numbers that contain commas into plain numeric values. I then convert the result into a data frame for visualization purposes.

library(readr)
# data.frame() converts the readr result to a plain data frame; column names
# that contain spaces (e.g. `PROPERTY TYPE`) come out dotted (PROPERTY.TYPE).
pty <- data.frame(read_csv("pty.csv")) # = read_delim(txt, delim = ",")

## Parsed with column specification:
## cols(
##   PROJECT = col_character(),
##   `PROPERTY TYPE` = col_character(),
##   TENURE = col_character(),
##   `SALE DATE` = col_character(),
##   `SQ FT` = col_number(),
##   PRICE = col_number(),
##   PSF = col_number(),
##   `COMPLETION DATE` = col_character(),
##   `TYPE OF SALE` = col_character(),
##   DISTRICT = col_integer()
## )

# Check the parsed types: SQ.FT, PRICE and PSF should now be numeric.
str(pty)

## 'data.frame':    1022 obs. of  10 variables:
##  $ PROJECT        : chr  "HIGHLINE RESIDENCES" "HIGHLINE RESIDENCES" "HIGHLINE RESIDENCES" "HIGHLINE RESIDENCES" ...
##  $ PROPERTY.TYPE  : chr  "Condominium" "Condominium" "Condominium" "Condominium" ...
##  $ TENURE         : chr  "99 years" "99 years" "99 years" "99 years" ...
##  $ SALE.DATE      : chr  "22-Jul-17" "22-Jul-17" "21-Jul-17" "18-Jul-17" ...
##  $ SQ.FT          : num  1152 506 700 915 915 ...
##  $ PRICE          : num  1961300 1152400 1433600 1830200 1760000 ...
##  $ PSF            : num  1703 2278 2049 2000 1924 ...
##  $ COMPLETION.DATE: chr  "Uncompleted" "Uncompleted" "Uncompleted" "Uncompleted" ...
##  $ TYPE.OF.SALE   : chr  "New Sale" "New Sale" "New Sale" "New Sale" ...
##  $ DISTRICT       : int  3 3 3 3 3 3 3 3 3 3 ...

Using QPLOT

Visualization 1 : Count of Sales based on TYPES

# One bar per type of sale; bar height is the number of transactions and the
# bars are coloured by the same variable.
qplot(
  x = TYPE.OF.SALE,
  data = pty,
  geom = "bar",
  fill = TYPE.OF.SALE
)

Visualization 2 : Type of Sales, broken down by Property Type

# Same count-per-sale-type bars, with each bar split (stacked) by property type.
qplot(
  x = TYPE.OF.SALE,
  data = pty,
  geom = "bar",
  fill = PROPERTY.TYPE
)

Visualization 3 : See where the new launches are (new sales).

# A box plot summarises a continuous variable (DISTRICT) within each level of a
# discrete one (TYPE.OF.SALE). The discrete variable therefore goes on x, which
# every ggplot2 version handles; coord_flip() then puts the sale types back on
# the vertical axis and districts on the horizontal axis.
qplot(TYPE.OF.SALE, DISTRICT, data = pty, geom = "boxplot") +
  coord_flip()

Visualization 4 : Range of PSF (PER SQUARE FOOT $) for the Type of Sales

# PSF distribution per type of sale: a box plot with the individual
# transactions drawn over it as jittered points.
qplot(
  x = TYPE.OF.SALE,
  y = PSF,
  data = pty,
  geom = c("boxplot", "jitter")
)

Visualization 5 : PSF for Property Type

# One point per transaction: PSF on the horizontal axis, property type on the
# vertical axis.
qplot(
  x = PSF,
  y = PROPERTY.TYPE,
  data = pty,
  geom = "point"
)

Visualization 6 : PSF for Property Type based on Tenure type

# Same PSF-by-property-type scatter, with point size mapped to tenure.
# TENURE is discrete, so ggplot2 emits the "Using size for a discrete
# variable is not advised" warning.
qplot(
  x = PSF,
  y = PROPERTY.TYPE,
  data = pty,
  geom = "point",
  size = TENURE
)

## Warning: Using size for a discrete variable is not advised.

Using GGPLOT2

Visualization 7: SUBSET BY PSF

# Histogram of PSF (log10 x axis) for transactions above 1000 PSF, one panel
# per property type, arranged in two rows.
ggplot(
  data = subset(pty, PSF > 1000),
  mapping = aes(PSF)
) +
  geom_histogram() +
  facet_wrap(~ PROPERTY.TYPE, nrow = 2) +
  scale_x_log10()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Visualization 8: SUBSET BY DISTRICT but with FACET_WRAP (~ X+Y+Z variables)

# Histogram of PSF (log10 x axis) for district 10 only, one panel per
# combination of property type and type of sale, all in a single row.
# The filter must use `==`: writing `DISTRICT = 10` passes an unused named
# argument to subset() and silently keeps every row.
ggplot(data = subset(pty, DISTRICT == 10), aes(PSF)) +
  geom_histogram() +
  facet_wrap(~ PROPERTY.TYPE + TYPE.OF.SALE, nrow = 1) +
  scale_x_log10()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using colors for GGPLOT2

library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:readr':
## 
##     col_factor

# Print the hex colour codes of a three-colour default hue palette.
scales::hue_pal()(n = 3)

## [1] "#F8766D" "#00BA38" "#619CFF"

Visualization 9 : bars placed side by side (dodged) instead of stacked

# Count of transactions per type of sale, one bar per property type placed
# side by side. `position` can also be "fill" or "stack".
ggplot(
  data = pty,
  mapping = aes(x = TYPE.OF.SALE, fill = factor(PROPERTY.TYPE))
) +
  geom_bar(position = "dodge")

Visualization 10 : PIE CHART - TYPE OF SALE

# Pie chart: a single full-width stacked bar of sale types, bent into a circle
# by mapping the count axis (y) to the angle.
ggplot(
  data = pty,
  mapping = aes(x = factor(1), fill = factor(TYPE.OF.SALE))
) +
  geom_bar(width = 1) +
  coord_polar(theta = "y")

Visualization 11 : PIE CHART - PROPERTY TYPE

# Pie chart: a single full-width stacked bar of property types, bent into a
# circle by mapping the count axis (y) to the angle.
ggplot(
  data = pty,
  mapping = aes(x = factor(1), fill = factor(PROPERTY.TYPE))
) +
  geom_bar(width = 1) +
  coord_polar(theta = "y")

Visualization 12 : coxcomb plot

# Coxcomb plot of property types: one full-width bar per property type, with
# the x axis mapped to the angle so that the count becomes the wedge radius.
ggplot(
  data = pty,
  mapping = aes(x = PROPERTY.TYPE, fill = factor(PROPERTY.TYPE))
) +
  geom_bar(width = 1) +
  coord_polar(theta = "x")

# Coxcomb plot of sale types: one wedge per type of sale, each stacked by
# property type.
ggplot(
  data = pty,
  mapping = aes(x = TYPE.OF.SALE, fill = factor(PROPERTY.TYPE))
) +
  geom_bar(width = 1) +
  coord_polar(theta = "x")

Visualization on PROPERTY

James Lim (jkklim@hotmail.com)

October 10, 2018