1 Load packages

library(tidyverse)
library(DT)
library(scales)
library(stringr)
library(forcats)
library(lubridate)
library(corrplot)
library(caret)
library(xgboost)
library(doSNOW)

2 Load data

data1 <- read.csv("C:/r data warehouse/Melbourne housing data/Melbourne_housing_extra_data.csv", stringsAsFactors = F, na.strings = "")

3 Missing values

missing_var <- as.data.frame(colSums(is.na(data1))) %>% 
  rename(value = `colSums(is.na(data1))`) %>% 
  rownames_to_column(var = "Variable") %>% 
  filter(value > 0) %>% 
  arrange(value)

missing_var <-  missing_var %>% 
  mutate(prop = value/nrow(data1)) %>% 
  mutate(Variable = fct_reorder(Variable, prop))

missing_var %>% 
  ggplot(aes(Variable, prop))+
  geom_bar(stat = "identity")+
  scale_y_continuous(labels = scales::percent)+
  labs(x = "Missing variables", y = "Percent", title = "Missing variable Bar Plot")

Key points:

Missing value present in important variables such as Price & Bedroom2

4 Data structure

data1 <- data1 %>% mutate_at(vars(Type, Method, SellerG, Regionname), funs(as.factor(.)))

data1$Date <-  dmy(data1$Date)

Type: 1. br - bedroom(s); 2. h - house,cottage,villa, 3. semi,terrace; u - unit, duplex; 4. t - townhouse; 5. dev site - development site; 6. o res - other residential.

data1 <- data1 %>% 
  mutate(Type= fct_recode(Type,"House" = "h", "Unit" = "u", "Town-House" = "t"))

4.1 Investiage on 10 highest no. property sold in given Suburb

4.1.1 List of 10 highest no. of property sold

Top_highest_house_count <-  data1 %>% count(Suburb) %>% arrange(-n) %>% top_n(n=10)

datatable(Top_highest_house_count)

4.1.2 BoxPlot 10 highest no. of property sold and their average price

data1 %>% filter(Suburb %in% Top_highest_house_count$Suburb) %>%
  mutate_at(vars(Suburb), funs(as.factor(.))) %>% 
  mutate(Suburb = fct_reorder(Suburb, Price, median, na.rm=T)) %>% 
  ggplot(aes(Suburb, Price))+
  geom_boxplot(aes(fill = Type))+
  facet_grid(Type~.)+
  scale_y_continuous(labels = scales::dollar)+
  labs(x = "Suburb", y = "Price", title = "Top 10 highest no of Houses on Sale in the Market for given Suburb")+
  coord_flip()

4.2 Property Stats:

Top_avg_property_price <-   data1 %>% 
  group_by(Suburb) %>% 
  summarise(Avg = round(mean(Price, na.rm = T),2)) %>% 
  arrange(-Avg) %>% 
  top_n(n = 15)

Top_avg_property_price %>% 
  mutate_at(vars(Suburb), funs(as.factor(.))) %>% 
  ggplot(aes(fct_reorder(Suburb, Avg), Avg))+
  geom_bar(stat = "identity")+
  geom_text(aes(Suburb, 1, label = paste0("(",scales::dollar(Avg),")")), hjust = 0, vjust = .5, fontface = "bold")+
  coord_flip()+
  labs(y = "Price", x = "Suburb", title = "Top 15 Most expensive Suburb on average")

data1 %>% filter(Suburb %in% Top_avg_property_price$Suburb) %>% 
  group_by(Suburb, Type) %>% 
  summarise(Avg = mean(Price, na.rm = T), SE = sd(Price, na.rm = T)/sqrt(length(na.omit(Price))), N = length(na.omit(Price))) %>%
  mutate_at(vars(Suburb), funs(as.factor(.))) %>% 
  ggplot(aes(x = Avg, xmin = Avg-SE, xmax = Avg+SE, y = fct_reorder(Suburb, Avg)))+
  geom_point()+
  geom_segment(aes(x = Avg-SE, xend = Avg+SE, y = fct_reorder(Suburb, Avg), yend = fct_reorder(Suburb, Avg)))+
  facet_wrap(~Type)+
  scale_x_continuous(labels = scales::dollar)+
  labs(x = "Average Price", y = "Suburb", title = "Avg property price of Top 15 expensive Suburb with SE")+
  theme(axis.text.x = element_text(angle = 90, vjust = .5))

Bottom_avg_property_price <-data1 %>% 
  group_by(Suburb) %>% 
  summarise(Avg = mean(Price, na.rm = T)) %>% 
  arrange(Avg) %>% 
  top_n(n = -15)

Bottom_avg_property_price %>% 
  mutate_at(vars(Suburb), funs(as.factor(.))) %>% 
  ggplot(aes(fct_reorder(Suburb, Avg), Avg))+
  geom_bar(stat = "identity")+
  geom_text(aes(Suburb, 1, label = paste0("(",scales::dollar(Avg),")")), hjust = 0, vjust = .5, fontface = "bold")+
  coord_flip()+
  labs(y = "Price", x = "Suburb", title = "Bottom 15 Least expensive Suburb on average")

data1 %>% filter(Suburb %in% Bottom_avg_property_price$Suburb) %>% 
  group_by(Suburb, Type) %>% 
  summarise(Avg = mean(Price, na.rm = T), SE = sd(Price, na.rm = T)/sqrt(length(na.omit(Price))), N = length(na.omit(Price))) %>%
  mutate_at(vars(Suburb), funs(as.factor(.))) %>% 
  ggplot(aes(x = Avg, xmin = Avg-SE, xmax = Avg+SE, y = fct_reorder(Suburb, Avg)))+
  geom_point()+
  geom_segment(aes(x = Avg-SE, xend = Avg+SE, y = fct_reorder(Suburb, Avg), yend = fct_reorder(Suburb, Avg)))+
  facet_wrap(~Type)+
  scale_x_continuous(labels = scales::dollar)+
  labs(x = "Average Price", y = "Suburb", title = "Avg property price of Bottom 15 expensive Suburb with SE")+
  theme(axis.text.x = element_text(angle = 90, vjust = .5))

data1 %>%
  ggplot(aes(Date, Price, na.rm=T))+
  geom_jitter(alpha=.5)+
  geom_smooth()+
  scale_x_date(date_breaks = "1 month", date_labels = "%b-%y")+
  theme(axis.text.x = element_text(angle = 90, vjust = .5))+
  facet_grid(~Type)+
  scale_y_continuous(labels = scales::dollar)+
  labs(x = "Date Sold", y = "Price", title = "Date Sold vs Price")

data1 %>% 
  select(Rooms, Type, Price, Method, Bedroom2, Bathroom, Car, BuildingArea, YearBuilt) %>% 
  mutate(Method = fct_recode(Method, "1" = "PI", "2" = "PN", "3" = "S", "4" = "SA", "5" = "SN", "6" = "SP", "7" = "SS", "8" = "VB", "9" = "W"), Type =    fct_recode(Type, "1" = "House", "2" = "Town-House", "3" = "Unit")) %>% 
  mutate_at(vars(Type, Method), funs(as.numeric(.))) %>% 
  cor(use = "complete.obs") %>% 
  corrplot(type = "lower")

data1 %>% 
  ggplot(aes(Price, fill = Type))+
  geom_density(alpha = .5)+
  scale_x_log10()+
  facet_wrap(~Regionname, ncol = 2)+
  theme(axis.text.x = element_text(angle = 90, vjust = .5))+
  labs(x = "log10", y = "Density")

data1 %>% 
  group_by(Rooms) %>% 
  summarise(AvgPrice = mean(Price, na.rm = T)) %>% 
  arrange(Rooms) %>% 
  ggplot(aes(fct_reorder(as.factor(Rooms), AvgPrice), AvgPrice))+
  geom_bar(stat = "identity")+
  scale_y_continuous(labels = scales::dollar)+
  labs(x = "No. of Rooms", title = "Avg Price against no. of Rooms")+
  coord_flip()+
  geom_text(aes(label = paste0("(", scales::dollar(AvgPrice),")", sep = "")), hjust = .9, vjust = .5, fontface = "bold")

set.seed(131)


data1 <- data1 %>% 
  mutate(Months = month(Date), Year = year(Date))



train <- data1 %>% 
 select(-2,-8, -19,-18, -21)


train <- train %>% filter(!is.na(Price))



train.control <- trainControl(method = "repeatedcv",
                              number = 2,
                              repeats = 1,
                              search = "grid")

tune.grid <- expand.grid(eta = .05,
                         nrounds = c(50, 75, 100),
                         max_depth = 3,
                         min_child_weight = 1,
                         colsample_bytree = .8,
                         gamma = 0,
                         subsample = 1)



cl <- makeCluster(2, type = "SOCK")

registerDoSNOW(cl)




train_XGB <- train(Price~., data = train, method = "xgbTree", tuneGrid = tune.grid, trControl = train.control, na.action = na.pass)


stopCluster(cl)


important <- varImp(train_XGB)

Important_vars <- data.frame(Variables = row.names(important$importance), Importance = round(important$importance$Overall, 2))
  

Important_vars <- Important_vars %>% 
  mutate(Rank = paste0("#", dense_rank(desc(Importance))))
  
Important_vars %>% 
  arrange(-Importance) %>% 
  top_n(n=10, wt=Importance) %>% 
  ggplot(aes(fct_reorder(Variables, Importance), Importance))+
  geom_bar(stat = "identity")+
  geom_text(aes(Variables, 1, label = Rank),hjust=0, vjust=.5, size = 4, fontface = 'bold')+
  coord_flip()+
  labs(x = "Variables", y="Importance", title = "Relative variable importance")

data1 %>% 
  ggplot(aes(Distance, Price))+
  geom_jitter(aes(col = Type), alpha=.5)+
  geom_smooth(method = "lm")

data1 %>% 
  ggplot(aes(BuildingArea, Price))+
  geom_point()+
  scale_x_log10(limits = c(10,1000))+
  geom_smooth(method = "lm")

data1 %>% 
  dplyr::group_by(YearBuilt) %>% 
  dplyr::summarize(Avg_price = mean(Price, na.rm=T)) %>% 
  filter(!is.na(Avg_price)) %>% 
  ggplot(aes(YearBuilt, Avg_price))+
  geom_line(stat = "identity")+
  scale_x_continuous(limits = c(1850, 2020))+
  geom_smooth()+
  scale_y_continuous(labels = scales::dollar)

data1 %>% 
  dplyr::group_by(Date) %>% 
  dplyr::summarise(Avg = mean(Price, na.rm = T)) %>% 
  ggplot(aes(Date, Avg))+
  geom_line()+
  geom_smooth()+
  scale_y_continuous(labels = scales::dollar)

data1 %>% 
  dplyr::group_by(Date) %>% 
  dplyr::summarise(n=n()) %>% 
  ggplot(aes(Date, n))+
  geom_bar(stat = "identity")+
  scale_x_date(date_breaks = "1 month", date_labels = "%b-%y")+
  theme(axis.text.x = element_text(angle = 90, vjust = .5))+
  geom_smooth()+
  labs(title = "Sales over time")+
  labs(y = "No. of Property Sold")

5 Junk Commands - I just like to keep it instead of deleting them

# train$Suburb <- train$Suburb %>% str_replace_all(" ", "_")
# 
# 
# 
# 
# str(train)
# 
# train <- train %>%
#   mutate_at(vars(Suburb, CouncilArea), funs(as.factor(.)))
# 
# train_features <- colnames(train)
# 
# colSums(is.na(train))
# 
# dummy_vars <- train %>% 
#   select(-4) %>% 
#   dummyVars(~.,.)
# 
# train_dummy <- predict(dummy_vars, train[, -4])
# 
# 
# pre_process <- preProcess(train_dummy, method = "bagImpute")
# 
# 
# 
# train %>% count(Suburb)
# View(train_dummy)


# train.1 <- train
# 
# str(train.1)
# 
# train.1_features <- colnames(train.1)
# 
# for (f in train.1_features) {
#   if ((class(train.1[[f]])=="factor") || (class(train.1[[f]])=="character")) {
#     levels <- unique(train.1[[f]])
#     train.1[[f]] <- as.numeric(factor(train.1[[f]], levels=levels))
#   }
# }



# melDataXGB = train(Price~., data = train,
#                        method = "xgbTree",trControl = train.control,
#                        tuneGrid = tune.grid,na.action = na.pass,metric="RMSE")




# data1 %>% 
#   ggplot(aes(Date, Price, na.rm=T))+
#   geom_jitter()+
#   geom_boxplot(aes(Date, Price, na.rm=T, group = Date), col = "yellow")+
#   scale_x_date(date_breaks = "1 month", date_labels = "%b-%y")+
#   theme(axis.text.x = element_text(angle = 90, vjust = .5))+
#   facet_grid(~Type)




# data1 %>% group_by(Suburb, Type) %>% 
#   summarise(Avg = mean(Price, na.rm = T), mins = min(Price, na.rm=T), maxs = max(Price, na.rm = T)) %>% 
#   ggplot(aes(x = Avg, xmin = mins, xmax = maxs, y = Suburb))+
#   geom_point()+
#   geom_segment(aes(x = mins, xend = maxs, y = Suburb, yend = Suburb))+
#   coord_flip()+
#   facet_wrap(~Type)

# Top_lowest_house_count <- data1 %>% count(Suburb) %>% arrange(n)
#   
# Top_lowest_house_count %>% filter(n>1) %>% arrange(n) %>% top_n(n=-10)




# range(data1$Date)

# data1 %>% count(CouncilArea)

# chng_to_fct <- c(Type, Method, SellerG, Regionname)
# data1 %>% count(Suburb)

# summary(data1)


# levels(data1$Type)
# 
# unique(data1$Type)

Melbourne Housing Market

Scottish Chemjong

17 July 2017

1 Load packages

2 Load data

3 Missing values

4 Data structure

4.1 Investiage on 10 highest no. property sold in given Suburb

4.1.1 List of 10 highest no. of property sold

4.1.2 BoxPlot 10 highest no. of property sold and their average price

4.2 Property Stats:

5 Junk Commands - I just like to keep it instead of deleting them