This is an R Markdown document to keep all necessary code to wrangle.
if(!require('dplyr')) {
install.packages("dplyr")
}
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
if(!require('tidyverse')) {
install.packages('tidyverse')
}
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ stringr 1.5.0
## ✔ tidyr 1.2.1 ✔ forcats 0.5.2
## ✔ readr 2.1.3
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
if(!require('knitr')){
install.packages('knitr')
}
## Loading required package: knitr
if(!require('readr')){
install.packages('readr')
}
if(!require('tidyverse')){
install.packages("tidyverse")
}
if(!require('readxl')){
install.packages("readxl")
}
## Loading required package: readxl
if(!require('ggplot2')){
install.packages("ggplot2")
}
library(dplyr)
library(knitr)
library(readr)
library(tidyverse)
library(readxl)
library(ggplot2)
The data is imported from multiple sources and formats
df_housing <- read_csv("C:/Users/user/Downloads/Data Analytics/Data/capitals_index.csv",show_col_types = FALSE)
df_housing_data_list <- read_csv("C:/Users/user/Downloads/Data Analytics/Data/vic_house_parameters_list.csv",show_col_types = FALSE)
df_population <- read_xlsx("C:/Users/user/Downloads/Data Analytics/Data/310104_population.xlsx",sheet=2,na="", guess_max=2000, skip = 9)
df_property_price_index <- read_xlsx("C:/Users/user/Downloads/Data Analytics/Data/641601_property_price_index.xlsx",sheet=2,na="", guess_max=2000, skip = 9)
The imported data should be tested
head(df_housing,50)
## # A tibble: 50 × 8
## Year City HouseValueK HouseNumberK PriceIndexHo…¹ CPI Total…² `Pop-M`
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2011 Sydney 541 2839 98.4 99.8 9049 7.3
## 2 2011 Melbourne 513 2266 100 99.8 22851 5.6
## 3 2011 Brisbane 429 1785 100. 99.8 7327 4.5
## 4 2011 Adelaide 380 722 101. 99.8 5222 1.6
## 5 2011 Perth 525 942 99.4 99.8 12749 2.4
## 6 2011 Hobart 305 231 102. 99.8 827 0.5
## 7 2011 Darwin 523 74 98.2 99.8 471 0.2
## 8 2011 Canberra 548 145 101. 99.8 1766 0.4
## 9 2012 Sydney 545 2863 104. 102. 9240 7.4
## 10 2012 Melbourne 492 2310 100. 102. 18764 5.7
## # … with 40 more rows, and abbreviated variable names ¹PriceIndexHousing,
## # ²TotalBldgPermit
head(df_housing_data_list,16)
## # A tibble: 16 × 4
## Year HouseAttribute Value Type
## <dbl> <chr> <dbl> <chr>
## 1 2012 dwell 2309800 Number
## 2 2012 houses_md 280 K
## 3 2012 houses_mn 309 K
## 4 2012 houses_nos 20329 Number
## 5 2012 population 5.6 M
## 6 2012 rate_const_pi 0.1 Change
## 7 2012 rate_dwell_chg 1.95 Change
## 8 2012 rate_inflation 1.2 Change
## 9 2012 rate_ppl_chg 1.9 Change
## 10 2012 rate_rental_pi 2.6 Change
## 11 2012 units_apts_md 240 K
## 12 2012 units_apts_mn 264 K
## 13 2012 units_apts_nos 4242 Number
## 14 2013 dwell 2353000 Number
## 15 2013 houses_md 290 K
## 16 2013 houses_mn 319 K
#head(df_housing_data,6)
#head(df_Inflation_rate,8)
head(df_population,10)
## # A tibble: 10 × 28
## `Series ID` A2060825C A2060…¹ A2060…² A2060…³ A2060…⁴ A2060…⁵ A2060…⁶
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1981-06-01 00:00:00 2608351 1958717 1178447 653940 657249 212565 65393
## 2 1981-09-01 00:00:00 2616060 1964139 1189946 655136 663047 212862 66716
## 3 1981-12-01 00:00:00 2624579 1969349 1200504 657014 667381 212935 68023
## 4 1982-03-01 00:00:00 2634534 1975617 1210128 658840 672273 213477 69023
## 5 1982-06-01 00:00:00 2643527 1981619 1219369 660066 676892 213679 69388
## 6 1982-09-01 00:00:00 2649615 1986589 1228791 661669 681539 213868 70025
## 7 1982-12-01 00:00:00 2655478 1991532 1235548 663641 684771 214168 70671
## 8 1983-03-01 00:00:00 2663858 1997990 1242336 666073 688002 214543 71620
## 9 1983-06-01 00:00:00 2668049 2003140 1248666 667942 691681 215090 72336
## 10 1983-09-01 00:00:00 2673036 2007981 1254248 669852 694823 215635 73411
## # … with 20 more variables: A2060832A <dbl>, A2060824A <dbl>, A2060834F <dbl>,
## # A2060835J <dbl>, A2060836K <dbl>, A2060837L <dbl>, A2060838R <dbl>,
## # A2060839T <dbl>, A2060840A <dbl>, A2060841C <dbl>, A2060833C <dbl>,
## # A2060843J <dbl>, A2060844K <dbl>, A2060845L <dbl>, A2060846R <dbl>,
## # A2060847T <dbl>, A2060848V <dbl>, A2060849W <dbl>, A2060850F <dbl>,
## # A2060842F <dbl>, and abbreviated variable names ¹A2060826F, ²A2060827J,
## # ³A2060828K, ⁴A2060829L, ⁵A2060830W, ⁶A2060831X
head(df_property_price_index,12)
## # A tibble: 12 × 28
## `Series ID` A837283…¹ A8372…² A8372…³ A8372…⁴ A8372…⁵ A8372…⁶ A8372…⁷
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2003-09-01 00:00:00 85.3 60.7 64.2 62.2 48.3 61.2 40.5
## 2 2003-12-01 00:00:00 88.2 62.1 69.4 63.9 50.6 66.5 43.3
## 3 2004-03-01 00:00:00 87 60.8 70.7 64.8 52.5 68.7 45.5
## 4 2004-06-01 00:00:00 84.4 60.9 71.7 65.9 53.7 71.8 45.2
## 5 2004-09-01 00:00:00 84.2 60.9 71.2 67.1 54.7 72.3 46.8
## 6 2004-12-01 00:00:00 84.8 62.4 72 68.6 57.1 74.6 49.7
## 7 2005-03-01 00:00:00 83.2 62.5 72.6 68.6 59.4 75.8 53.6
## 8 2005-06-01 00:00:00 82.6 63.2 73.3 69.3 62.6 76.9 54.7
## 9 2005-09-01 00:00:00 81.4 63.1 73.6 70 65.2 76.7 56.1
## 10 2005-12-01 00:00:00 81.8 64 75 71.6 70 79.1 60.2
## 11 2006-03-01 00:00:00 80.8 65.4 75.6 72.2 75.6 81 63.2
## 12 2006-06-01 00:00:00 82.3 67.2 77.5 73.4 86.4 83.4 67.1
## # … with 20 more variables: A83728446K <dbl>, A83728455L <dbl>,
## # A83728389A <dbl>, A83728398C <dbl>, A83728407V <dbl>, A83728416W <dbl>,
## # A83728425X <dbl>, A83728434A <dbl>, A83728443C <dbl>, A83728452F <dbl>,
## # A83728461J <dbl>, A83728386V <dbl>, A83728395W <dbl>, A83728404L <dbl>,
## # A83728413R <dbl>, A83728422T <dbl>, A83728431V <dbl>, A83728440W <dbl>,
## # A83728449T <dbl>, A83728458V <dbl>, and abbreviated variable names
## # ¹A83728383L, ²A83728392R, ³A83728401F, ⁴A83728410J, ⁵A83728419C, …
#head(df_housing_price_index_capitals,14)
df_housing
## # A tibble: 96 × 8
## Year City HouseValueK HouseNumberK PriceIndexHo…¹ CPI Total…² `Pop-M`
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2011 Sydney 541 2839 98.4 99.8 9049 7.3
## 2 2011 Melbourne 513 2266 100 99.8 22851 5.6
## 3 2011 Brisbane 429 1785 100. 99.8 7327 4.5
## 4 2011 Adelaide 380 722 101. 99.8 5222 1.6
## 5 2011 Perth 525 942 99.4 99.8 12749 2.4
## 6 2011 Hobart 305 231 102. 99.8 827 0.5
## 7 2011 Darwin 523 74 98.2 99.8 471 0.2
## 8 2011 Canberra 548 145 101. 99.8 1766 0.4
## 9 2012 Sydney 545 2863 104. 102. 9240 7.4
## 10 2012 Melbourne 492 2310 100. 102. 18764 5.7
## # … with 86 more rows, and abbreviated variable names ¹PriceIndexHousing,
## # ²TotalBldgPermit
ggplot(df_housing, aes(Year, CPI, fill = City)) +
geom_col()
#df_housing_price_index_capitals
ggplot(df_housing, aes(Year, TotalBldgPermit, fill = City)) +
geom_col()
ggplot(df_housing, aes(x = Year, y = HouseValueK, color = City )) +
geom_boxplot()
ggplot(df_housing, aes(x = Year, y = HouseNumberK, color = City )) +
geom_boxplot()
ggplot(df_housing, aes(y = HouseValueK, x = Year, color = City)) +
geom_point() +
geom_line()
ggplot(df_housing, aes(x = Year, y = CPI, color = City)) +
geom_line() +
facet_wrap(~City)
As a refresher, here is the chart you created before:
# df_housing_price_index_capitals %>%
# hchart('scatter', hcaes(x = Year, y = Index, color = City))
#
# # Select The columns
# df_housingp <- select(filter(df_housing, Year == 2014),City, Year, HouseNumberK)
# hchart(df_housingp, type = "pie", hcaes(City, HouseNumberK))
ggplot(df_housing, aes(x="", y=HouseNumberK, fill=City)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0)
### Scatter Graph Housing Values
As a refresher, here is the chart you created before:
glimpse(df_housing_data_list)
## Rows: 143
## Columns: 4
## $ Year <dbl> 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2…
## $ HouseAttribute <chr> "dwell", "houses_md", "houses_mn", "houses_nos", "popul…
## $ Value <dbl> 2309800.00, 280.00, 309.00, 20329.00, 5.60, 0.10, 1.95,…
## $ Type <chr> "Number", "K", "K", "Number", "M", "Change", "Change", …
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
hchart(
select(df_housing,Year,City,HouseValueK), "scatter", hcaes(x = Year, y = HouseValueK, group = City))