** Credit goes to: **

  1. Tony Pino for scraping data and uploading in Kaggle
  2. Bukun - Kaggler for ideas
  3. Heads or Tails - Kaggler for ideas
  4. David Langer - Youtube for ideas

1 Load packages

library(tidyverse)
library(DT)
library(scales)
library(stringr)
library(forcats)
library(lubridate)
library(corrplot)
library(caret)
library(xgboost)
library(doSNOW)

2 Load data

data1 <- read.csv("C:/r data warehouse/Melbourne housing data/Melbourne_housing_extra_data.csv", stringsAsFactors = F, na.strings = "")

3 Missing values

missing_var <- as.data.frame(colSums(is.na(data1))) %>% 
  rename(value = `colSums(is.na(data1))`) %>% 
  rownames_to_column(var = "Variable") %>% 
  filter(value > 0) %>% 
  arrange(value)

missing_var <-  missing_var %>% 
  mutate(prop = value/nrow(data1)) %>% 
  mutate(Variable = fct_reorder(Variable, prop))

missing_var %>% 
  ggplot(aes(Variable, prop))+
  geom_bar(stat = "identity")+
  scale_y_continuous(labels = scales::percent)+
  labs(x = "Missing variables", y = "Percent", title = "Missing variable Bar Plot")

Key points:

  1. Missing value present in important variables such as Price & Bedroom2

4 Data structure

data1 <- data1 %>% mutate_at(vars(Type, Method, SellerG, Regionname), funs(as.factor(.)))

data1$Date <-  dmy(data1$Date)

Type: 1. br - bedroom(s); 2. h - house,cottage,villa, 3. semi,terrace; u - unit, duplex; 4. t - townhouse; 5. dev site - development site; 6. o res - other residential.

data1 <- data1 %>% 
  mutate(Type= fct_recode(Type,"House" = "h", "Unit" = "u", "Town-House" = "t"))

4.1 Investiage on 10 highest no. property sold in given Suburb

4.1.1 List of 10 highest no. of property sold

Top_highest_house_count <-  data1 %>% count(Suburb) %>% arrange(-n) %>% top_n(n=10)

datatable(Top_highest_house_count)

4.1.2 BoxPlot 10 highest no. of property sold and their average price

data1 %>% filter(Suburb %in% Top_highest_house_count$Suburb) %>%
  mutate_at(vars(Suburb), funs(as.factor(.))) %>% 
  mutate(Suburb = fct_reorder(Suburb, Price, median, na.rm=T)) %>% 
  ggplot(aes(Suburb, Price))+
  geom_boxplot(aes(fill = Type))+
  facet_grid(Type~.)+
  scale_y_continuous(labels = scales::dollar)+
  labs(x = "Suburb", y = "Price", title = "Top 10 highest no of Houses on Sale in the Market for given Suburb")+
  coord_flip()

4.2 Property Stats:

Top_avg_property_price <-   data1 %>% 
  group_by(Suburb) %>% 
  summarise(Avg = round(mean(Price, na.rm = T),2)) %>% 
  arrange(-Avg) %>% 
  top_n(n = 15)

Top_avg_property_price %>% 
  mutate_at(vars(Suburb), funs(as.factor(.))) %>% 
  ggplot(aes(fct_reorder(Suburb, Avg), Avg))+
  geom_bar(stat = "identity")+
  geom_text(aes(Suburb, 1, label = paste0("(",scales::dollar(Avg),")")), hjust = 0, vjust = .5, fontface = "bold")+
  coord_flip()+
  labs(y = "Price", x = "Suburb", title = "Top 15 Most expensive Suburb on average")

data1 %>% filter(Suburb %in% Top_avg_property_price$Suburb) %>% 
  group_by(Suburb, Type) %>% 
  summarise(Avg = mean(Price, na.rm = T), SE = sd(Price, na.rm = T)/sqrt(length(na.omit(Price))), N = length(na.omit(Price))) %>%
  mutate_at(vars(Suburb), funs(as.factor(.))) %>% 
  ggplot(aes(x = Avg, xmin = Avg-SE, xmax = Avg+SE, y = fct_reorder(Suburb, Avg)))+
  geom_point()+
  geom_segment(aes(x = Avg-SE, xend = Avg+SE, y = fct_reorder(Suburb, Avg), yend = fct_reorder(Suburb, Avg)))+
  facet_wrap(~Type)+
  scale_x_continuous(labels = scales::dollar)+
  labs(x = "Average Price", y = "Suburb", title = "Avg property price of Top 15 expensive Suburb with SE")+
  theme(axis.text.x = element_text(angle = 90, vjust = .5))

Bottom_avg_property_price <-data1 %>% 
  group_by(Suburb) %>% 
  summarise(Avg = mean(Price, na.rm = T)) %>% 
  arrange(Avg) %>% 
  top_n(n = -15)

Bottom_avg_property_price %>% 
  mutate_at(vars(Suburb), funs(as.factor(.))) %>% 
  ggplot(aes(fct_reorder(Suburb, Avg), Avg))+
  geom_bar(stat = "identity")+
  geom_text(aes(Suburb, 1, label = paste0("(",scales::dollar(Avg),")")), hjust = 0, vjust = .5, fontface = "bold")+
  coord_flip()+
  labs(y = "Price", x = "Suburb", title = "Bottom 15 Least expensive Suburb on average")

data1 %>% filter(Suburb %in% Bottom_avg_property_price$Suburb) %>% 
  group_by(Suburb, Type) %>% 
  summarise(Avg = mean(Price, na.rm = T), SE = sd(Price, na.rm = T)/sqrt(length(na.omit(Price))), N = length(na.omit(Price))) %>%
  mutate_at(vars(Suburb), funs(as.factor(.))) %>% 
  ggplot(aes(x = Avg, xmin = Avg-SE, xmax = Avg+SE, y = fct_reorder(Suburb, Avg)))+
  geom_point()+
  geom_segment(aes(x = Avg-SE, xend = Avg+SE, y = fct_reorder(Suburb, Avg), yend = fct_reorder(Suburb, Avg)))+
  facet_wrap(~Type)+
  scale_x_continuous(labels = scales::dollar)+
  labs(x = "Average Price", y = "Suburb", title = "Avg property price of Bottom 15 expensive Suburb with SE")+
  theme(axis.text.x = element_text(angle = 90, vjust = .5))

data1 %>%
  ggplot(aes(Date, Price, na.rm=T))+
  geom_jitter(alpha=.5)+
  geom_smooth()+
  scale_x_date(date_breaks = "1 month", date_labels = "%b-%y")+
  theme(axis.text.x = element_text(angle = 90, vjust = .5))+
  facet_grid(~Type)+
  scale_y_continuous(labels = scales::dollar)+
  labs(x = "Date Sold", y = "Price", title = "Date Sold vs Price")

data1 %>% 
  select(Rooms, Type, Price, Method, Bedroom2, Bathroom, Car, BuildingArea, YearBuilt) %>% 
  mutate(Method = fct_recode(Method, "1" = "PI", "2" = "PN", "3" = "S", "4" = "SA", "5" = "SN", "6" = "SP", "7" = "SS", "8" = "VB", "9" = "W"), Type =    fct_recode(Type, "1" = "House", "2" = "Town-House", "3" = "Unit")) %>% 
  mutate_at(vars(Type, Method), funs(as.numeric(.))) %>% 
  cor(use = "complete.obs") %>% 
  corrplot(type = "lower")

data1 %>% 
  ggplot(aes(Price, fill = Type))+
  geom_density(alpha = .5)+
  scale_x_log10()+
  facet_wrap(~Regionname, ncol = 2)+
  theme(axis.text.x = element_text(angle = 90, vjust = .5))+
  labs(x = "log10", y = "Density")

data1 %>% 
  group_by(Rooms) %>% 
  summarise(AvgPrice = mean(Price, na.rm = T)) %>% 
  arrange(Rooms) %>% 
  ggplot(aes(fct_reorder(as.factor(Rooms), AvgPrice), AvgPrice))+
  geom_bar(stat = "identity")+
  scale_y_continuous(labels = scales::dollar)+
  labs(x = "No. of Rooms", title = "Avg Price against no. of Rooms")+
  coord_flip()+
  geom_text(aes(label = paste0("(", scales::dollar(AvgPrice),")", sep = "")), hjust = .9, vjust = .5, fontface = "bold")

set.seed(131)


data1 <- data1 %>% 
  mutate(Months = month(Date), Year = year(Date))



train <- data1 %>% 
 select(-2,-8, -19,-18, -21)


train <- train %>% filter(!is.na(Price))



train.control <- trainControl(method = "repeatedcv",
                              number = 2,
                              repeats = 1,
                              search = "grid")

tune.grid <- expand.grid(eta = .05,
                         nrounds = c(50, 75, 100),
                         max_depth = 3,
                         min_child_weight = 1,
                         colsample_bytree = .8,
                         gamma = 0,
                         subsample = 1)



cl <- makeCluster(2, type = "SOCK")

registerDoSNOW(cl)




train_XGB <- train(Price~., data = train, method = "xgbTree", tuneGrid = tune.grid, trControl = train.control, na.action = na.pass)


stopCluster(cl)


important <- varImp(train_XGB)

Important_vars <- data.frame(Variables = row.names(important$importance), Importance = round(important$importance$Overall, 2))
  

Important_vars <- Important_vars %>% 
  mutate(Rank = paste0("#", dense_rank(desc(Importance))))
  
Important_vars %>% 
  arrange(-Importance) %>% 
  top_n(n=10, wt=Importance) %>% 
  ggplot(aes(fct_reorder(Variables, Importance), Importance))+
  geom_bar(stat = "identity")+
  geom_text(aes(Variables, 1, label = Rank),hjust=0, vjust=.5, size = 4, fontface = 'bold')+
  coord_flip()+
  labs(x = "Variables", y="Importance", title = "Relative variable importance")

data1 %>% 
  ggplot(aes(Distance, Price))+
  geom_jitter(aes(col = Type), alpha=.5)+
  geom_smooth(method = "lm")

data1 %>% 
  ggplot(aes(BuildingArea, Price))+
  geom_point()+
  scale_x_log10(limits = c(10,1000))+
  geom_smooth(method = "lm")

data1 %>% 
  dplyr::group_by(YearBuilt) %>% 
  dplyr::summarize(Avg_price = mean(Price, na.rm=T)) %>% 
  filter(!is.na(Avg_price)) %>% 
  ggplot(aes(YearBuilt, Avg_price))+
  geom_line(stat = "identity")+
  scale_x_continuous(limits = c(1850, 2020))+
  geom_smooth()+
  scale_y_continuous(labels = scales::dollar)

data1 %>% 
  dplyr::group_by(Date) %>% 
  dplyr::summarise(Avg = mean(Price, na.rm = T)) %>% 
  ggplot(aes(Date, Avg))+
  geom_line()+
  geom_smooth()+
  scale_y_continuous(labels = scales::dollar)

data1 %>% 
  dplyr::group_by(Date) %>% 
  dplyr::summarise(n=n()) %>% 
  ggplot(aes(Date, n))+
  geom_bar(stat = "identity")+
  scale_x_date(date_breaks = "1 month", date_labels = "%b-%y")+
  theme(axis.text.x = element_text(angle = 90, vjust = .5))+
  geom_smooth()+
  labs(title = "Sales over time")+
  labs(y = "No. of Property Sold")

5 Junk Commands - I just like to keep it instead of deleting them

# train$Suburb <- train$Suburb %>% str_replace_all(" ", "_")
# 
# 
# 
# 
# str(train)
# 
# train <- train %>%
#   mutate_at(vars(Suburb, CouncilArea), funs(as.factor(.)))
# 
# train_features <- colnames(train)
# 
# colSums(is.na(train))
# 
# dummy_vars <- train %>% 
#   select(-4) %>% 
#   dummyVars(~.,.)
# 
# train_dummy <- predict(dummy_vars, train[, -4])
# 
# 
# pre_process <- preProcess(train_dummy, method = "bagImpute")
# 
# 
# 
# train %>% count(Suburb)
# View(train_dummy)


# train.1 <- train
# 
# str(train.1)
# 
# train.1_features <- colnames(train.1)
# 
# for (f in train.1_features) {
#   if ((class(train.1[[f]])=="factor") || (class(train.1[[f]])=="character")) {
#     levels <- unique(train.1[[f]])
#     train.1[[f]] <- as.numeric(factor(train.1[[f]], levels=levels))
#   }
# }



# melDataXGB = train(Price~., data = train,
#                        method = "xgbTree",trControl = train.control,
#                        tuneGrid = tune.grid,na.action = na.pass,metric="RMSE")




# data1 %>% 
#   ggplot(aes(Date, Price, na.rm=T))+
#   geom_jitter()+
#   geom_boxplot(aes(Date, Price, na.rm=T, group = Date), col = "yellow")+
#   scale_x_date(date_breaks = "1 month", date_labels = "%b-%y")+
#   theme(axis.text.x = element_text(angle = 90, vjust = .5))+
#   facet_grid(~Type)




# data1 %>% group_by(Suburb, Type) %>% 
#   summarise(Avg = mean(Price, na.rm = T), mins = min(Price, na.rm=T), maxs = max(Price, na.rm = T)) %>% 
#   ggplot(aes(x = Avg, xmin = mins, xmax = maxs, y = Suburb))+
#   geom_point()+
#   geom_segment(aes(x = mins, xend = maxs, y = Suburb, yend = Suburb))+
#   coord_flip()+
#   facet_wrap(~Type)

# Top_lowest_house_count <- data1 %>% count(Suburb) %>% arrange(n)
#   
# Top_lowest_house_count %>% filter(n>1) %>% arrange(n) %>% top_n(n=-10)




# range(data1$Date)

# data1 %>% count(CouncilArea)

# chng_to_fct <- c(Type, Method, SellerG, Regionname)
# data1 %>% count(Suburb)

# summary(data1)


# levels(data1$Type)
# 
# unique(data1$Type)