** Credit goes to: **
library(tidyverse)
library(DT)
library(scales)
library(stringr)
library(forcats)
library(lubridate)
library(corrplot)
library(caret)
library(xgboost)
library(doSNOW)data1 <- read.csv("C:/r data warehouse/Melbourne housing data/Melbourne_housing_extra_data.csv", stringsAsFactors = F, na.strings = "")missing_var <- as.data.frame(colSums(is.na(data1))) %>%
rename(value = `colSums(is.na(data1))`) %>%
rownames_to_column(var = "Variable") %>%
filter(value > 0) %>%
arrange(value)
missing_var <- missing_var %>%
mutate(prop = value/nrow(data1)) %>%
mutate(Variable = fct_reorder(Variable, prop))
missing_var %>%
ggplot(aes(Variable, prop))+
geom_bar(stat = "identity")+
scale_y_continuous(labels = scales::percent)+
labs(x = "Missing variables", y = "Percent", title = "Missing variable Bar Plot")Key points:
data1 <- data1 %>% mutate_at(vars(Type, Method, SellerG, Regionname), funs(as.factor(.)))
data1$Date <- dmy(data1$Date)Type: 1. br - bedroom(s); 2. h - house,cottage,villa, 3. semi,terrace; u - unit, duplex; 4. t - townhouse; 5. dev site - development site; 6. o res - other residential.
data1 <- data1 %>%
mutate(Type= fct_recode(Type,"House" = "h", "Unit" = "u", "Town-House" = "t"))Top_highest_house_count <- data1 %>% count(Suburb) %>% arrange(-n) %>% top_n(n=10)
datatable(Top_highest_house_count)data1 %>% filter(Suburb %in% Top_highest_house_count$Suburb) %>%
mutate_at(vars(Suburb), funs(as.factor(.))) %>%
mutate(Suburb = fct_reorder(Suburb, Price, median, na.rm=T)) %>%
ggplot(aes(Suburb, Price))+
geom_boxplot(aes(fill = Type))+
facet_grid(Type~.)+
scale_y_continuous(labels = scales::dollar)+
labs(x = "Suburb", y = "Price", title = "Top 10 highest no of Houses on Sale in the Market for given Suburb")+
coord_flip()Top_avg_property_price <- data1 %>%
group_by(Suburb) %>%
summarise(Avg = round(mean(Price, na.rm = T),2)) %>%
arrange(-Avg) %>%
top_n(n = 15)
Top_avg_property_price %>%
mutate_at(vars(Suburb), funs(as.factor(.))) %>%
ggplot(aes(fct_reorder(Suburb, Avg), Avg))+
geom_bar(stat = "identity")+
geom_text(aes(Suburb, 1, label = paste0("(",scales::dollar(Avg),")")), hjust = 0, vjust = .5, fontface = "bold")+
coord_flip()+
labs(y = "Price", x = "Suburb", title = "Top 15 Most expensive Suburb on average")data1 %>% filter(Suburb %in% Top_avg_property_price$Suburb) %>%
group_by(Suburb, Type) %>%
summarise(Avg = mean(Price, na.rm = T), SE = sd(Price, na.rm = T)/sqrt(length(na.omit(Price))), N = length(na.omit(Price))) %>%
mutate_at(vars(Suburb), funs(as.factor(.))) %>%
ggplot(aes(x = Avg, xmin = Avg-SE, xmax = Avg+SE, y = fct_reorder(Suburb, Avg)))+
geom_point()+
geom_segment(aes(x = Avg-SE, xend = Avg+SE, y = fct_reorder(Suburb, Avg), yend = fct_reorder(Suburb, Avg)))+
facet_wrap(~Type)+
scale_x_continuous(labels = scales::dollar)+
labs(x = "Average Price", y = "Suburb", title = "Avg property price of Top 15 expensive Suburb with SE")+
theme(axis.text.x = element_text(angle = 90, vjust = .5))Bottom_avg_property_price <-data1 %>%
group_by(Suburb) %>%
summarise(Avg = mean(Price, na.rm = T)) %>%
arrange(Avg) %>%
top_n(n = -15)
Bottom_avg_property_price %>%
mutate_at(vars(Suburb), funs(as.factor(.))) %>%
ggplot(aes(fct_reorder(Suburb, Avg), Avg))+
geom_bar(stat = "identity")+
geom_text(aes(Suburb, 1, label = paste0("(",scales::dollar(Avg),")")), hjust = 0, vjust = .5, fontface = "bold")+
coord_flip()+
labs(y = "Price", x = "Suburb", title = "Bottom 15 Least expensive Suburb on average")data1 %>% filter(Suburb %in% Bottom_avg_property_price$Suburb) %>%
group_by(Suburb, Type) %>%
summarise(Avg = mean(Price, na.rm = T), SE = sd(Price, na.rm = T)/sqrt(length(na.omit(Price))), N = length(na.omit(Price))) %>%
mutate_at(vars(Suburb), funs(as.factor(.))) %>%
ggplot(aes(x = Avg, xmin = Avg-SE, xmax = Avg+SE, y = fct_reorder(Suburb, Avg)))+
geom_point()+
geom_segment(aes(x = Avg-SE, xend = Avg+SE, y = fct_reorder(Suburb, Avg), yend = fct_reorder(Suburb, Avg)))+
facet_wrap(~Type)+
scale_x_continuous(labels = scales::dollar)+
labs(x = "Average Price", y = "Suburb", title = "Avg property price of Bottom 15 expensive Suburb with SE")+
theme(axis.text.x = element_text(angle = 90, vjust = .5))data1 %>%
ggplot(aes(Date, Price, na.rm=T))+
geom_jitter(alpha=.5)+
geom_smooth()+
scale_x_date(date_breaks = "1 month", date_labels = "%b-%y")+
theme(axis.text.x = element_text(angle = 90, vjust = .5))+
facet_grid(~Type)+
scale_y_continuous(labels = scales::dollar)+
labs(x = "Date Sold", y = "Price", title = "Date Sold vs Price")data1 %>%
select(Rooms, Type, Price, Method, Bedroom2, Bathroom, Car, BuildingArea, YearBuilt) %>%
mutate(Method = fct_recode(Method, "1" = "PI", "2" = "PN", "3" = "S", "4" = "SA", "5" = "SN", "6" = "SP", "7" = "SS", "8" = "VB", "9" = "W"), Type = fct_recode(Type, "1" = "House", "2" = "Town-House", "3" = "Unit")) %>%
mutate_at(vars(Type, Method), funs(as.numeric(.))) %>%
cor(use = "complete.obs") %>%
corrplot(type = "lower")data1 %>%
ggplot(aes(Price, fill = Type))+
geom_density(alpha = .5)+
scale_x_log10()+
facet_wrap(~Regionname, ncol = 2)+
theme(axis.text.x = element_text(angle = 90, vjust = .5))+
labs(x = "log10", y = "Density")data1 %>%
group_by(Rooms) %>%
summarise(AvgPrice = mean(Price, na.rm = T)) %>%
arrange(Rooms) %>%
ggplot(aes(fct_reorder(as.factor(Rooms), AvgPrice), AvgPrice))+
geom_bar(stat = "identity")+
scale_y_continuous(labels = scales::dollar)+
labs(x = "No. of Rooms", title = "Avg Price against no. of Rooms")+
coord_flip()+
geom_text(aes(label = paste0("(", scales::dollar(AvgPrice),")", sep = "")), hjust = .9, vjust = .5, fontface = "bold")set.seed(131)
data1 <- data1 %>%
mutate(Months = month(Date), Year = year(Date))
train <- data1 %>%
select(-2,-8, -19,-18, -21)
train <- train %>% filter(!is.na(Price))
train.control <- trainControl(method = "repeatedcv",
number = 2,
repeats = 1,
search = "grid")
tune.grid <- expand.grid(eta = .05,
nrounds = c(50, 75, 100),
max_depth = 3,
min_child_weight = 1,
colsample_bytree = .8,
gamma = 0,
subsample = 1)
cl <- makeCluster(2, type = "SOCK")
registerDoSNOW(cl)
train_XGB <- train(Price~., data = train, method = "xgbTree", tuneGrid = tune.grid, trControl = train.control, na.action = na.pass)
stopCluster(cl)
important <- varImp(train_XGB)
Important_vars <- data.frame(Variables = row.names(important$importance), Importance = round(important$importance$Overall, 2))
Important_vars <- Important_vars %>%
mutate(Rank = paste0("#", dense_rank(desc(Importance))))
Important_vars %>%
arrange(-Importance) %>%
top_n(n=10, wt=Importance) %>%
ggplot(aes(fct_reorder(Variables, Importance), Importance))+
geom_bar(stat = "identity")+
geom_text(aes(Variables, 1, label = Rank),hjust=0, vjust=.5, size = 4, fontface = 'bold')+
coord_flip()+
labs(x = "Variables", y="Importance", title = "Relative variable importance")data1 %>%
ggplot(aes(Distance, Price))+
geom_jitter(aes(col = Type), alpha=.5)+
geom_smooth(method = "lm")data1 %>%
ggplot(aes(BuildingArea, Price))+
geom_point()+
scale_x_log10(limits = c(10,1000))+
geom_smooth(method = "lm")data1 %>%
dplyr::group_by(YearBuilt) %>%
dplyr::summarize(Avg_price = mean(Price, na.rm=T)) %>%
filter(!is.na(Avg_price)) %>%
ggplot(aes(YearBuilt, Avg_price))+
geom_line(stat = "identity")+
scale_x_continuous(limits = c(1850, 2020))+
geom_smooth()+
scale_y_continuous(labels = scales::dollar)data1 %>%
dplyr::group_by(Date) %>%
dplyr::summarise(Avg = mean(Price, na.rm = T)) %>%
ggplot(aes(Date, Avg))+
geom_line()+
geom_smooth()+
scale_y_continuous(labels = scales::dollar)data1 %>%
dplyr::group_by(Date) %>%
dplyr::summarise(n=n()) %>%
ggplot(aes(Date, n))+
geom_bar(stat = "identity")+
scale_x_date(date_breaks = "1 month", date_labels = "%b-%y")+
theme(axis.text.x = element_text(angle = 90, vjust = .5))+
geom_smooth()+
labs(title = "Sales over time")+
labs(y = "No. of Property Sold")# train$Suburb <- train$Suburb %>% str_replace_all(" ", "_")
#
#
#
#
# str(train)
#
# train <- train %>%
# mutate_at(vars(Suburb, CouncilArea), funs(as.factor(.)))
#
# train_features <- colnames(train)
#
# colSums(is.na(train))
#
# dummy_vars <- train %>%
# select(-4) %>%
# dummyVars(~.,.)
#
# train_dummy <- predict(dummy_vars, train[, -4])
#
#
# pre_process <- preProcess(train_dummy, method = "bagImpute")
#
#
#
# train %>% count(Suburb)
# View(train_dummy)
# train.1 <- train
#
# str(train.1)
#
# train.1_features <- colnames(train.1)
#
# for (f in train.1_features) {
# if ((class(train.1[[f]])=="factor") || (class(train.1[[f]])=="character")) {
# levels <- unique(train.1[[f]])
# train.1[[f]] <- as.numeric(factor(train.1[[f]], levels=levels))
# }
# }
# melDataXGB = train(Price~., data = train,
# method = "xgbTree",trControl = train.control,
# tuneGrid = tune.grid,na.action = na.pass,metric="RMSE")
# data1 %>%
# ggplot(aes(Date, Price, na.rm=T))+
# geom_jitter()+
# geom_boxplot(aes(Date, Price, na.rm=T, group = Date), col = "yellow")+
# scale_x_date(date_breaks = "1 month", date_labels = "%b-%y")+
# theme(axis.text.x = element_text(angle = 90, vjust = .5))+
# facet_grid(~Type)
# data1 %>% group_by(Suburb, Type) %>%
# summarise(Avg = mean(Price, na.rm = T), mins = min(Price, na.rm=T), maxs = max(Price, na.rm = T)) %>%
# ggplot(aes(x = Avg, xmin = mins, xmax = maxs, y = Suburb))+
# geom_point()+
# geom_segment(aes(x = mins, xend = maxs, y = Suburb, yend = Suburb))+
# coord_flip()+
# facet_wrap(~Type)
# Top_lowest_house_count <- data1 %>% count(Suburb) %>% arrange(n)
#
# Top_lowest_house_count %>% filter(n>1) %>% arrange(n) %>% top_n(n=-10)
# range(data1$Date)
# data1 %>% count(CouncilArea)
# chng_to_fct <- c(Type, Method, SellerG, Regionname)
# data1 %>% count(Suburb)
# summary(data1)
# levels(data1$Type)
#
# unique(data1$Type)