R Markdown

This is an R Markdown document to keep all necessary code to wrangle.

Installing Packages

if(!require('dplyr')) {
  install.packages("dplyr")
}
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
if(!require('tidyverse')) {
  install.packages('tidyverse')
}
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   0.3.5
## ✔ tibble  3.1.8     ✔ stringr 1.5.0
## ✔ tidyr   1.2.1     ✔ forcats 0.5.2
## ✔ readr   2.1.3     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
if(!require('knitr')){
  install.packages('knitr')
}
## Loading required package: knitr
if(!require('readr')){
  install.packages('readr')
}

if(!require('tidyverse')){
  install.packages("tidyverse")
}

if(!require('readxl')){
  install.packages("readxl", "ggplot2")
}
## Loading required package: readxl

Loading Packages

library(dplyr)
library(knitr)
library(readr)
library(tidyverse)
library(readxl)
library(ggplot2)

Importing Data

The data is imported from multiple sources and formats

df_housing_data <- read_csv("C:/Users/user/Downloads/Data Analytics/Data/housing_prices_data.csv",show_col_types = FALSE)
df_housing_data <- read_csv("C:/Users/user/Downloads/Data Analytics/Data/housing_prices_data.csv",show_col_types = FALSE)
df_population <- read_xlsx("C:/Users/user/Downloads/Data Analytics/Data/310104_population.xlsx",sheet=2,na="", guess_max=2000, skip = 9)
df_property_price_index <- read_xlsx("C:/Users/user/Downloads/Data Analytics/Data/641601_property_price_index.xlsx",sheet=2,na="", guess_max=2000, skip = 9)

Testing Imported Data

The imported data should be tested

head(df_housing_data,10)
## # A tibble: 10 × 14
##     year house…¹ house…² house…³ units…⁴ units…⁵ units…⁶ rate_…⁷ rate_…⁸ rate_…⁹
##    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1  2012   20329  280000  309000    4242  240000  263779     2.6     1.2     0.1
##  2  2013   22682  290000  318898    4371  240000  270571     2       3.2     0.4
##  3  2014   23049  300000  332124    4396  248000  278862     1.9     3.8     2.9
##  4  2015   24614  310000  343995    4441  252500  285265     2.1     4.3     4  
##  5  2016   25747  320000  358218    4503  265000  303274     1.5     1.6     2.9
##  6  2017   27889  345000  384371    5112  285000  331945     1.8     3.1     2.8
##  7  2018   27533  365000  409640    4800  289725  328606     1.8     2       3.3
##  8  2019   24655  385000  427037    4185  300000  341273     1.7    -0.1     1.4
##  9  2020   26788  419000  473631    4157  328000  371184    -0.7     1.4     1.2
## 10  2021   32383  506000  578024    5106  380000  435328    -0.7     3.8    10.2
## # … with 4 more variables: population <dbl>, rate_ppl_chg <dbl>,
## #   rate_dwell_chg <dbl>, dwell <dbl>, and abbreviated variable names
## #   ¹​houses_nos, ²​houses_md, ³​houses_mn, ⁴​units_apts_nos, ⁵​units_apts_md,
## #   ⁶​units_apts_mn, ⁷​rate_rental_pi, ⁸​rate_inflation, ⁹​rate_const_pi
head(df_population,16)
## # A tibble: 16 × 28
##    `Series ID`         A2060825C A2060…¹ A2060…² A2060…³ A2060…⁴ A2060…⁵ A2060…⁶
##    <dttm>                  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 1981-06-01 00:00:00   2608351 1958717 1178447  653940  657249  212565   65393
##  2 1981-09-01 00:00:00   2616060 1964139 1189946  655136  663047  212862   66716
##  3 1981-12-01 00:00:00   2624579 1969349 1200504  657014  667381  212935   68023
##  4 1982-03-01 00:00:00   2634534 1975617 1210128  658840  672273  213477   69023
##  5 1982-06-01 00:00:00   2643527 1981619 1219369  660066  676892  213679   69388
##  6 1982-09-01 00:00:00   2649615 1986589 1228791  661669  681539  213868   70025
##  7 1982-12-01 00:00:00   2655478 1991532 1235548  663641  684771  214168   70671
##  8 1983-03-01 00:00:00   2663858 1997990 1242336  666073  688002  214543   71620
##  9 1983-06-01 00:00:00   2668049 2003140 1248666  667942  691681  215090   72336
## 10 1983-09-01 00:00:00   2673036 2007981 1254248  669852  694823  215635   73411
## 11 1983-12-01 00:00:00   2678250 2012443 1259140  671738  697570  216145   74225
## 12 1984-03-01 00:00:00   2685607 2018217 1264361  673493  699900  216669   74835
## 13 1984-06-01 00:00:00   2692083 2023349 1269559  675233  702455  217409   75666
## 14 1984-09-01 00:00:00   2699019 2028241 1275622  676630  705605  217909   76593
## 15 1984-12-01 00:00:00   2706580 2033611 1281035  677950  708066  218417   77307
## 16 1985-03-01 00:00:00   2716617 2039883 1287316  679761  711701  219051   77966
## # … with 20 more variables: A2060832A <dbl>, A2060824A <dbl>, A2060834F <dbl>,
## #   A2060835J <dbl>, A2060836K <dbl>, A2060837L <dbl>, A2060838R <dbl>,
## #   A2060839T <dbl>, A2060840A <dbl>, A2060841C <dbl>, A2060833C <dbl>,
## #   A2060843J <dbl>, A2060844K <dbl>, A2060845L <dbl>, A2060846R <dbl>,
## #   A2060847T <dbl>, A2060848V <dbl>, A2060849W <dbl>, A2060850F <dbl>,
## #   A2060842F <dbl>, and abbreviated variable names ¹​A2060826F, ²​A2060827J,
## #   ³​A2060828K, ⁴​A2060829L, ⁵​A2060830W, ⁶​A2060831X
head(df_property_price_index,16)
## # A tibble: 16 × 28
##    `Series ID`         A837283…¹ A8372…² A8372…³ A8372…⁴ A8372…⁵ A8372…⁶ A8372…⁷
##    <dttm>                  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 2003-09-01 00:00:00      85.3    60.7    64.2    62.2    48.3    61.2    40.5
##  2 2003-12-01 00:00:00      88.2    62.1    69.4    63.9    50.6    66.5    43.3
##  3 2004-03-01 00:00:00      87      60.8    70.7    64.8    52.5    68.7    45.5
##  4 2004-06-01 00:00:00      84.4    60.9    71.7    65.9    53.7    71.8    45.2
##  5 2004-09-01 00:00:00      84.2    60.9    71.2    67.1    54.7    72.3    46.8
##  6 2004-12-01 00:00:00      84.8    62.4    72      68.6    57.1    74.6    49.7
##  7 2005-03-01 00:00:00      83.2    62.5    72.6    68.6    59.4    75.8    53.6
##  8 2005-06-01 00:00:00      82.6    63.2    73.3    69.3    62.6    76.9    54.7
##  9 2005-09-01 00:00:00      81.4    63.1    73.6    70      65.2    76.7    56.1
## 10 2005-12-01 00:00:00      81.8    64      75      71.6    70      79.1    60.2
## 11 2006-03-01 00:00:00      80.8    65.4    75.6    72.2    75.6    81      63.2
## 12 2006-06-01 00:00:00      82.3    67.2    77.5    73.4    86.4    83.4    67.1
## 13 2006-09-01 00:00:00      82      68.1    78.5    73.8    95.1    84.6    67.3
## 14 2006-12-01 00:00:00      82.7    69.5    80.5    75.7    98.4    86.1    70.5
## 15 2007-03-01 00:00:00      81.9    71      83.6    77.2    99      88.5    72.2
## 16 2007-06-01 00:00:00      85.2    76      89      81.9    97.7    89.6    74.1
## # … with 20 more variables: A83728446K <dbl>, A83728455L <dbl>,
## #   A83728389A <dbl>, A83728398C <dbl>, A83728407V <dbl>, A83728416W <dbl>,
## #   A83728425X <dbl>, A83728434A <dbl>, A83728443C <dbl>, A83728452F <dbl>,
## #   A83728461J <dbl>, A83728386V <dbl>, A83728395W <dbl>, A83728404L <dbl>,
## #   A83728413R <dbl>, A83728422T <dbl>, A83728431V <dbl>, A83728440W <dbl>,
## #   A83728449T <dbl>, A83728458V <dbl>, and abbreviated variable names
## #   ¹​A83728383L, ²​A83728392R, ³​A83728401F, ⁴​A83728410J, ⁵​A83728419C, …

Splitting Columns

Selecting Columns with piping

Only numeric columns to be selected

df_housing_data %>%
  select_if(is.numeric)
## # A tibble: 11 × 14
##     year house…¹ house…² house…³ units…⁴ units…⁵ units…⁶ rate_…⁷ rate_…⁸ rate_…⁹
##    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1  2012   20329  280000  309000    4242  240000  263779     2.6     1.2     0.1
##  2  2013   22682  290000  318898    4371  240000  270571     2       3.2     0.4
##  3  2014   23049  300000  332124    4396  248000  278862     1.9     3.8     2.9
##  4  2015   24614  310000  343995    4441  252500  285265     2.1     4.3     4  
##  5  2016   25747  320000  358218    4503  265000  303274     1.5     1.6     2.9
##  6  2017   27889  345000  384371    5112  285000  331945     1.8     3.1     2.8
##  7  2018   27533  365000  409640    4800  289725  328606     1.8     2       3.3
##  8  2019   24655  385000  427037    4185  300000  341273     1.7    -0.1     1.4
##  9  2020   26788  419000  473631    4157  328000  371184    -0.7     1.4     1.2
## 10  2021   32383  506000  578024    5106  380000  435328    -0.7     3.8    10.2
## 11  2022    7181  746667  821977    1167  543333  602917     1.2    20.7    16.8
## # … with 4 more variables: population <dbl>, rate_ppl_chg <dbl>,
## #   rate_dwell_chg <dbl>, dwell <dbl>, and abbreviated variable names
## #   ¹​houses_nos, ²​houses_md, ³​houses_mn, ⁴​units_apts_nos, ⁵​units_apts_md,
## #   ⁶​units_apts_mn, ⁷​rate_rental_pi, ⁸​rate_inflation, ⁹​rate_const_pi

Selecting Columns based on names

Selecting only columns name criteria

df_housing_data %>%
  select(starts_with('D'))
## # A tibble: 11 × 1
##      dwell
##      <dbl>
##  1 2309800
##  2 2353000
##  3 2399800
##  4 2459600
##  5 2521000
##  6 2580800
##  7 2636700
##  8 2690400
##  9 2745100
## 10 2803100
## 11 2853300

Selecting Columns based on names

Selecting only columns name with criteria

df_housing_data %>%
  select("year", contains('rate'))
## # A tibble: 11 × 6
##     year rate_rental_pi rate_inflation rate_const_pi rate_ppl_chg rate_dwell_chg
##    <dbl>          <dbl>          <dbl>         <dbl>        <dbl>          <dbl>
##  1  2012            2.6            1.2           0.1         1.9            1.95
##  2  2013            2              3.2           0.4         2.13           1.87
##  3  2014            1.9            3.8           2.9         2.17           1.99
##  4  2015            2.1            4.3           4           2.14           2.49
##  5  2016            1.5            1.6           2.9         2.39           2.5 
##  6  2017            1.8            3.1           2.8         2.24           2.37
##  7  2018            1.8            2             3.3         1.88           2.17
##  8  2019            1.7           -0.1           1.4         1.85           2.04
##  9  2020           -0.7            1.4           1.2         1.49           2.03
## 10  2021           -0.7            3.8          10.2        -0.87           2.11
## 11  2022            1.2           20.7          16.8         0.63           1.79

Selecting Columns with exclusion

Select all columns except specified

 df_housing_data %>%
  select(-"year",-"rate_inflation")
## # A tibble: 11 × 12
##    houses_nos houses_md houses…¹ units…² units…³ units…⁴ rate_…⁵ rate_…⁶ popul…⁷
##         <dbl>     <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1      20329    280000   309000    4242  240000  263779     2.6     0.1 5625354
##  2      22682    290000   318898    4371  240000  270571     2       0.4 5744915
##  3      23049    300000   332124    4396  248000  278862     1.9     2.9 5869511
##  4      24614    310000   343995    4441  252500  285265     2.1     4   5995205
##  5      25747    320000   358218    4503  265000  303274     1.5     2.9 6138627
##  6      27889    345000   384371    5112  285000  331945     1.8     2.8 6276233
##  7      27533    365000   409640    4800  289725  328606     1.8     3.3 6394316
##  8      24655    385000   427037    4185  300000  341273     1.7     1.4 6512352
##  9      26788    419000   473631    4157  328000  371184    -0.7     1.2 6609129
## 10      32383    506000   578024    5106  380000  435328    -0.7    10.2 6551771
## 11       7181    746667   821977    1167  543333  602917     1.2    16.8 6593314
## # … with 3 more variables: rate_ppl_chg <dbl>, rate_dwell_chg <dbl>,
## #   dwell <dbl>, and abbreviated variable names ¹​houses_mn, ²​units_apts_nos,
## #   ³​units_apts_md, ⁴​units_apts_mn, ⁵​rate_rental_pi, ⁶​rate_const_pi,
## #   ⁷​population

Selecting columns without pipe

head(select(df_housing_data, contains("chg")))
## # A tibble: 6 × 2
##   rate_ppl_chg rate_dwell_chg
##          <dbl>          <dbl>
## 1         1.9            1.95
## 2         2.13           1.87
## 3         2.17           1.99
## 4         2.14           2.49
## 5         2.39           2.5 
## 6         2.24           2.37

Selecting columns without pipe

df_rates <- select(df_housing_data, "year", contains("rate"))
head(df_rates,8)
## # A tibble: 8 × 6
##    year rate_rental_pi rate_inflation rate_const_pi rate_ppl_chg rate_dwell_chg
##   <dbl>          <dbl>          <dbl>         <dbl>        <dbl>          <dbl>
## 1  2012            2.6            1.2           0.1         1.9            1.95
## 2  2013            2              3.2           0.4         2.13           1.87
## 3  2014            1.9            3.8           2.9         2.17           1.99
## 4  2015            2.1            4.3           4           2.14           2.49
## 5  2016            1.5            1.6           2.9         2.39           2.5 
## 6  2017            1.8            3.1           2.8         2.24           2.37
## 7  2018            1.8            2             3.3         1.88           2.17
## 8  2019            1.7           -0.1           1.4         1.85           2.04

Simple Plotting

plot(df_rates$year, df_rates$rate_rental_pi)

### Multi Plotting

df_rates %>%
  ggplot(aes(x = year, y = rate_inflation, colour=rate_const_pi)) +
  geom_point(size=5) +
  geom_smooth(se=F, colour="orange")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

df_rates %>%
  ggplot(aes(year, rate_dwell_chg, color=rate_ppl_chg)) +
  geom_point() +
  geom_point(size=4) +
  ylab('Population (%)')

### Bar Graph

As a refresher, here is the chart you created before:

Dataset Merging

# merge two data frames by ID
df_merged <- merge(df_rates,df_housing_data,by="year")
df_merged
##    year rate_rental_pi.x rate_inflation.x rate_const_pi.x rate_ppl_chg.x
## 1  2012              2.6              1.2             0.1           1.90
## 2  2013              2.0              3.2             0.4           2.13
## 3  2014              1.9              3.8             2.9           2.17
## 4  2015              2.1              4.3             4.0           2.14
## 5  2016              1.5              1.6             2.9           2.39
## 6  2017              1.8              3.1             2.8           2.24
## 7  2018              1.8              2.0             3.3           1.88
## 8  2019              1.7             -0.1             1.4           1.85
## 9  2020             -0.7              1.4             1.2           1.49
## 10 2021             -0.7              3.8            10.2          -0.87
## 11 2022              1.2             20.7            16.8           0.63
##    rate_dwell_chg.x houses_nos houses_md houses_mn units_apts_nos units_apts_md
## 1              1.95      20329    280000    309000           4242        240000
## 2              1.87      22682    290000    318898           4371        240000
## 3              1.99      23049    300000    332124           4396        248000
## 4              2.49      24614    310000    343995           4441        252500
## 5              2.50      25747    320000    358218           4503        265000
## 6              2.37      27889    345000    384371           5112        285000
## 7              2.17      27533    365000    409640           4800        289725
## 8              2.04      24655    385000    427037           4185        300000
## 9              2.03      26788    419000    473631           4157        328000
## 10             2.11      32383    506000    578024           5106        380000
## 11             1.79       7181    746667    821977           1167        543333
##    units_apts_mn rate_rental_pi.y rate_inflation.y rate_const_pi.y population
## 1         263779              2.6              1.2             0.1    5625354
## 2         270571              2.0              3.2             0.4    5744915
## 3         278862              1.9              3.8             2.9    5869511
## 4         285265              2.1              4.3             4.0    5995205
## 5         303274              1.5              1.6             2.9    6138627
## 6         331945              1.8              3.1             2.8    6276233
## 7         328606              1.8              2.0             3.3    6394316
## 8         341273              1.7             -0.1             1.4    6512352
## 9         371184             -0.7              1.4             1.2    6609129
## 10        435328             -0.7              3.8            10.2    6551771
## 11        602917              1.2             20.7            16.8    6593314
##    rate_ppl_chg.y rate_dwell_chg.y   dwell
## 1            1.90             1.95 2309800
## 2            2.13             1.87 2353000
## 3            2.17             1.99 2399800
## 4            2.14             2.49 2459600
## 5            2.39             2.50 2521000
## 6            2.24             2.37 2580800
## 7            1.88             2.17 2636700
## 8            1.85             2.04 2690400
## 9            1.49             2.03 2745100
## 10          -0.87             2.11 2803100
## 11           0.63             1.79 2853300

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.