This is an R Markdown document to keep all necessary code to wrangle.
if(!require('dplyr')) {
install.packages("dplyr")
}
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
if(!require('tidyverse')) {
install.packages('tidyverse')
}
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ stringr 1.5.0
## ✔ tidyr 1.2.1 ✔ forcats 0.5.2
## ✔ readr 2.1.3
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
if(!require('knitr')){
install.packages('knitr')
}
## Loading required package: knitr
if(!require('readr')){
install.packages('readr')
}
if(!require('tidyverse')){
install.packages("tidyverse")
}
if(!require('readxl')){
install.packages("readxl", "ggplot2")
}
## Loading required package: readxl
library(dplyr)
library(knitr)
library(readr)
library(tidyverse)
library(readxl)
library(ggplot2)
The data is imported from multiple sources and formats
df_housing_data <- read_csv("C:/Users/user/Downloads/Data Analytics/Data/housing_prices_data.csv",show_col_types = FALSE)
df_housing_data <- read_csv("C:/Users/user/Downloads/Data Analytics/Data/housing_prices_data.csv",show_col_types = FALSE)
df_population <- read_xlsx("C:/Users/user/Downloads/Data Analytics/Data/310104_population.xlsx",sheet=2,na="", guess_max=2000, skip = 9)
df_property_price_index <- read_xlsx("C:/Users/user/Downloads/Data Analytics/Data/641601_property_price_index.xlsx",sheet=2,na="", guess_max=2000, skip = 9)
The imported data should be tested
head(df_housing_data,10)
## # A tibble: 10 × 14
## year house…¹ house…² house…³ units…⁴ units…⁵ units…⁶ rate_…⁷ rate_…⁸ rate_…⁹
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2012 20329 280000 309000 4242 240000 263779 2.6 1.2 0.1
## 2 2013 22682 290000 318898 4371 240000 270571 2 3.2 0.4
## 3 2014 23049 300000 332124 4396 248000 278862 1.9 3.8 2.9
## 4 2015 24614 310000 343995 4441 252500 285265 2.1 4.3 4
## 5 2016 25747 320000 358218 4503 265000 303274 1.5 1.6 2.9
## 6 2017 27889 345000 384371 5112 285000 331945 1.8 3.1 2.8
## 7 2018 27533 365000 409640 4800 289725 328606 1.8 2 3.3
## 8 2019 24655 385000 427037 4185 300000 341273 1.7 -0.1 1.4
## 9 2020 26788 419000 473631 4157 328000 371184 -0.7 1.4 1.2
## 10 2021 32383 506000 578024 5106 380000 435328 -0.7 3.8 10.2
## # … with 4 more variables: population <dbl>, rate_ppl_chg <dbl>,
## # rate_dwell_chg <dbl>, dwell <dbl>, and abbreviated variable names
## # ¹houses_nos, ²houses_md, ³houses_mn, ⁴units_apts_nos, ⁵units_apts_md,
## # ⁶units_apts_mn, ⁷rate_rental_pi, ⁸rate_inflation, ⁹rate_const_pi
head(df_population,16)
## # A tibble: 16 × 28
## `Series ID` A2060825C A2060…¹ A2060…² A2060…³ A2060…⁴ A2060…⁵ A2060…⁶
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1981-06-01 00:00:00 2608351 1958717 1178447 653940 657249 212565 65393
## 2 1981-09-01 00:00:00 2616060 1964139 1189946 655136 663047 212862 66716
## 3 1981-12-01 00:00:00 2624579 1969349 1200504 657014 667381 212935 68023
## 4 1982-03-01 00:00:00 2634534 1975617 1210128 658840 672273 213477 69023
## 5 1982-06-01 00:00:00 2643527 1981619 1219369 660066 676892 213679 69388
## 6 1982-09-01 00:00:00 2649615 1986589 1228791 661669 681539 213868 70025
## 7 1982-12-01 00:00:00 2655478 1991532 1235548 663641 684771 214168 70671
## 8 1983-03-01 00:00:00 2663858 1997990 1242336 666073 688002 214543 71620
## 9 1983-06-01 00:00:00 2668049 2003140 1248666 667942 691681 215090 72336
## 10 1983-09-01 00:00:00 2673036 2007981 1254248 669852 694823 215635 73411
## 11 1983-12-01 00:00:00 2678250 2012443 1259140 671738 697570 216145 74225
## 12 1984-03-01 00:00:00 2685607 2018217 1264361 673493 699900 216669 74835
## 13 1984-06-01 00:00:00 2692083 2023349 1269559 675233 702455 217409 75666
## 14 1984-09-01 00:00:00 2699019 2028241 1275622 676630 705605 217909 76593
## 15 1984-12-01 00:00:00 2706580 2033611 1281035 677950 708066 218417 77307
## 16 1985-03-01 00:00:00 2716617 2039883 1287316 679761 711701 219051 77966
## # … with 20 more variables: A2060832A <dbl>, A2060824A <dbl>, A2060834F <dbl>,
## # A2060835J <dbl>, A2060836K <dbl>, A2060837L <dbl>, A2060838R <dbl>,
## # A2060839T <dbl>, A2060840A <dbl>, A2060841C <dbl>, A2060833C <dbl>,
## # A2060843J <dbl>, A2060844K <dbl>, A2060845L <dbl>, A2060846R <dbl>,
## # A2060847T <dbl>, A2060848V <dbl>, A2060849W <dbl>, A2060850F <dbl>,
## # A2060842F <dbl>, and abbreviated variable names ¹A2060826F, ²A2060827J,
## # ³A2060828K, ⁴A2060829L, ⁵A2060830W, ⁶A2060831X
head(df_property_price_index,16)
## # A tibble: 16 × 28
## `Series ID` A837283…¹ A8372…² A8372…³ A8372…⁴ A8372…⁵ A8372…⁶ A8372…⁷
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2003-09-01 00:00:00 85.3 60.7 64.2 62.2 48.3 61.2 40.5
## 2 2003-12-01 00:00:00 88.2 62.1 69.4 63.9 50.6 66.5 43.3
## 3 2004-03-01 00:00:00 87 60.8 70.7 64.8 52.5 68.7 45.5
## 4 2004-06-01 00:00:00 84.4 60.9 71.7 65.9 53.7 71.8 45.2
## 5 2004-09-01 00:00:00 84.2 60.9 71.2 67.1 54.7 72.3 46.8
## 6 2004-12-01 00:00:00 84.8 62.4 72 68.6 57.1 74.6 49.7
## 7 2005-03-01 00:00:00 83.2 62.5 72.6 68.6 59.4 75.8 53.6
## 8 2005-06-01 00:00:00 82.6 63.2 73.3 69.3 62.6 76.9 54.7
## 9 2005-09-01 00:00:00 81.4 63.1 73.6 70 65.2 76.7 56.1
## 10 2005-12-01 00:00:00 81.8 64 75 71.6 70 79.1 60.2
## 11 2006-03-01 00:00:00 80.8 65.4 75.6 72.2 75.6 81 63.2
## 12 2006-06-01 00:00:00 82.3 67.2 77.5 73.4 86.4 83.4 67.1
## 13 2006-09-01 00:00:00 82 68.1 78.5 73.8 95.1 84.6 67.3
## 14 2006-12-01 00:00:00 82.7 69.5 80.5 75.7 98.4 86.1 70.5
## 15 2007-03-01 00:00:00 81.9 71 83.6 77.2 99 88.5 72.2
## 16 2007-06-01 00:00:00 85.2 76 89 81.9 97.7 89.6 74.1
## # … with 20 more variables: A83728446K <dbl>, A83728455L <dbl>,
## # A83728389A <dbl>, A83728398C <dbl>, A83728407V <dbl>, A83728416W <dbl>,
## # A83728425X <dbl>, A83728434A <dbl>, A83728443C <dbl>, A83728452F <dbl>,
## # A83728461J <dbl>, A83728386V <dbl>, A83728395W <dbl>, A83728404L <dbl>,
## # A83728413R <dbl>, A83728422T <dbl>, A83728431V <dbl>, A83728440W <dbl>,
## # A83728449T <dbl>, A83728458V <dbl>, and abbreviated variable names
## # ¹A83728383L, ²A83728392R, ³A83728401F, ⁴A83728410J, ⁵A83728419C, …
Only numeric columns to be selected
df_housing_data %>%
select_if(is.numeric)
## # A tibble: 11 × 14
## year house…¹ house…² house…³ units…⁴ units…⁵ units…⁶ rate_…⁷ rate_…⁸ rate_…⁹
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2012 20329 280000 309000 4242 240000 263779 2.6 1.2 0.1
## 2 2013 22682 290000 318898 4371 240000 270571 2 3.2 0.4
## 3 2014 23049 300000 332124 4396 248000 278862 1.9 3.8 2.9
## 4 2015 24614 310000 343995 4441 252500 285265 2.1 4.3 4
## 5 2016 25747 320000 358218 4503 265000 303274 1.5 1.6 2.9
## 6 2017 27889 345000 384371 5112 285000 331945 1.8 3.1 2.8
## 7 2018 27533 365000 409640 4800 289725 328606 1.8 2 3.3
## 8 2019 24655 385000 427037 4185 300000 341273 1.7 -0.1 1.4
## 9 2020 26788 419000 473631 4157 328000 371184 -0.7 1.4 1.2
## 10 2021 32383 506000 578024 5106 380000 435328 -0.7 3.8 10.2
## 11 2022 7181 746667 821977 1167 543333 602917 1.2 20.7 16.8
## # … with 4 more variables: population <dbl>, rate_ppl_chg <dbl>,
## # rate_dwell_chg <dbl>, dwell <dbl>, and abbreviated variable names
## # ¹houses_nos, ²houses_md, ³houses_mn, ⁴units_apts_nos, ⁵units_apts_md,
## # ⁶units_apts_mn, ⁷rate_rental_pi, ⁸rate_inflation, ⁹rate_const_pi
Selecting only columns name criteria
df_housing_data %>%
select(starts_with('D'))
## # A tibble: 11 × 1
## dwell
## <dbl>
## 1 2309800
## 2 2353000
## 3 2399800
## 4 2459600
## 5 2521000
## 6 2580800
## 7 2636700
## 8 2690400
## 9 2745100
## 10 2803100
## 11 2853300
Selecting only columns name with criteria
df_housing_data %>%
select("year", contains('rate'))
## # A tibble: 11 × 6
## year rate_rental_pi rate_inflation rate_const_pi rate_ppl_chg rate_dwell_chg
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2012 2.6 1.2 0.1 1.9 1.95
## 2 2013 2 3.2 0.4 2.13 1.87
## 3 2014 1.9 3.8 2.9 2.17 1.99
## 4 2015 2.1 4.3 4 2.14 2.49
## 5 2016 1.5 1.6 2.9 2.39 2.5
## 6 2017 1.8 3.1 2.8 2.24 2.37
## 7 2018 1.8 2 3.3 1.88 2.17
## 8 2019 1.7 -0.1 1.4 1.85 2.04
## 9 2020 -0.7 1.4 1.2 1.49 2.03
## 10 2021 -0.7 3.8 10.2 -0.87 2.11
## 11 2022 1.2 20.7 16.8 0.63 1.79
Select all columns except specified
df_housing_data %>%
select(-"year",-"rate_inflation")
## # A tibble: 11 × 12
## houses_nos houses_md houses…¹ units…² units…³ units…⁴ rate_…⁵ rate_…⁶ popul…⁷
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 20329 280000 309000 4242 240000 263779 2.6 0.1 5625354
## 2 22682 290000 318898 4371 240000 270571 2 0.4 5744915
## 3 23049 300000 332124 4396 248000 278862 1.9 2.9 5869511
## 4 24614 310000 343995 4441 252500 285265 2.1 4 5995205
## 5 25747 320000 358218 4503 265000 303274 1.5 2.9 6138627
## 6 27889 345000 384371 5112 285000 331945 1.8 2.8 6276233
## 7 27533 365000 409640 4800 289725 328606 1.8 3.3 6394316
## 8 24655 385000 427037 4185 300000 341273 1.7 1.4 6512352
## 9 26788 419000 473631 4157 328000 371184 -0.7 1.2 6609129
## 10 32383 506000 578024 5106 380000 435328 -0.7 10.2 6551771
## 11 7181 746667 821977 1167 543333 602917 1.2 16.8 6593314
## # … with 3 more variables: rate_ppl_chg <dbl>, rate_dwell_chg <dbl>,
## # dwell <dbl>, and abbreviated variable names ¹houses_mn, ²units_apts_nos,
## # ³units_apts_md, ⁴units_apts_mn, ⁵rate_rental_pi, ⁶rate_const_pi,
## # ⁷population
head(select(df_housing_data, contains("chg")))
## # A tibble: 6 × 2
## rate_ppl_chg rate_dwell_chg
## <dbl> <dbl>
## 1 1.9 1.95
## 2 2.13 1.87
## 3 2.17 1.99
## 4 2.14 2.49
## 5 2.39 2.5
## 6 2.24 2.37
df_rates <- select(df_housing_data, "year", contains("rate"))
head(df_rates,8)
## # A tibble: 8 × 6
## year rate_rental_pi rate_inflation rate_const_pi rate_ppl_chg rate_dwell_chg
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2012 2.6 1.2 0.1 1.9 1.95
## 2 2013 2 3.2 0.4 2.13 1.87
## 3 2014 1.9 3.8 2.9 2.17 1.99
## 4 2015 2.1 4.3 4 2.14 2.49
## 5 2016 1.5 1.6 2.9 2.39 2.5
## 6 2017 1.8 3.1 2.8 2.24 2.37
## 7 2018 1.8 2 3.3 1.88 2.17
## 8 2019 1.7 -0.1 1.4 1.85 2.04
plot(df_rates$year, df_rates$rate_rental_pi)
### Multi Plotting
df_rates %>%
ggplot(aes(x = year, y = rate_inflation, colour=rate_const_pi)) +
geom_point(size=5) +
geom_smooth(se=F, colour="orange")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
df_rates %>%
ggplot(aes(year, rate_dwell_chg, color=rate_ppl_chg)) +
geom_point() +
geom_point(size=4) +
ylab('Population (%)')
### Bar Graph
As a refresher, here is the chart you created before:
# merge two data frames by ID
df_merged <- merge(df_rates,df_housing_data,by="year")
df_merged
## year rate_rental_pi.x rate_inflation.x rate_const_pi.x rate_ppl_chg.x
## 1 2012 2.6 1.2 0.1 1.90
## 2 2013 2.0 3.2 0.4 2.13
## 3 2014 1.9 3.8 2.9 2.17
## 4 2015 2.1 4.3 4.0 2.14
## 5 2016 1.5 1.6 2.9 2.39
## 6 2017 1.8 3.1 2.8 2.24
## 7 2018 1.8 2.0 3.3 1.88
## 8 2019 1.7 -0.1 1.4 1.85
## 9 2020 -0.7 1.4 1.2 1.49
## 10 2021 -0.7 3.8 10.2 -0.87
## 11 2022 1.2 20.7 16.8 0.63
## rate_dwell_chg.x houses_nos houses_md houses_mn units_apts_nos units_apts_md
## 1 1.95 20329 280000 309000 4242 240000
## 2 1.87 22682 290000 318898 4371 240000
## 3 1.99 23049 300000 332124 4396 248000
## 4 2.49 24614 310000 343995 4441 252500
## 5 2.50 25747 320000 358218 4503 265000
## 6 2.37 27889 345000 384371 5112 285000
## 7 2.17 27533 365000 409640 4800 289725
## 8 2.04 24655 385000 427037 4185 300000
## 9 2.03 26788 419000 473631 4157 328000
## 10 2.11 32383 506000 578024 5106 380000
## 11 1.79 7181 746667 821977 1167 543333
## units_apts_mn rate_rental_pi.y rate_inflation.y rate_const_pi.y population
## 1 263779 2.6 1.2 0.1 5625354
## 2 270571 2.0 3.2 0.4 5744915
## 3 278862 1.9 3.8 2.9 5869511
## 4 285265 2.1 4.3 4.0 5995205
## 5 303274 1.5 1.6 2.9 6138627
## 6 331945 1.8 3.1 2.8 6276233
## 7 328606 1.8 2.0 3.3 6394316
## 8 341273 1.7 -0.1 1.4 6512352
## 9 371184 -0.7 1.4 1.2 6609129
## 10 435328 -0.7 3.8 10.2 6551771
## 11 602917 1.2 20.7 16.8 6593314
## rate_ppl_chg.y rate_dwell_chg.y dwell
## 1 1.90 1.95 2309800
## 2 2.13 1.87 2353000
## 3 2.17 1.99 2399800
## 4 2.14 2.49 2459600
## 5 2.39 2.50 2521000
## 6 2.24 2.37 2580800
## 7 1.88 2.17 2636700
## 8 1.85 2.04 2690400
## 9 1.49 2.03 2745100
## 10 -0.87 2.11 2803100
## 11 0.63 1.79 2853300
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.