Rpubs link for this study: http://rpubs.com/The_Analyst/431304

Required packages

library(readr)
library(magrittr) 
library(dplyr)
library(tidyr)
library(readxl)
library(lubridate) 
library(stringr) 
library(ggplot2)
library(mlr)

Executive Summary

Data collection

Melbourne housing data is taken from the kaggle data set hyperlink,Melbourne Housing Market and Melbourne Crime Statistics has been sourced from hyperlink,Melbourne Crime Statistics

Data_Dictionary :

The followig variables are required:

Melbourne Housing Data:
  • Suburb: Suburbs
  • Rooms: Number of rooms
  • Price: Price in Australian dollars
  • Type: br - bedroom(s); h - house,cottage,villa, semi,terrace; u - unit, duplex; t - townhouse; dev site - development site; o res - other residential.
  • Date: Date sold
  • Distance: Distance from CBD in Kilometres
  • Regionname: General Region (West, North West, North, North east …etc)
Crime Statistics data:
  • Year ending June: Year of crime
  • Postcode: Self explanatory
  • Suburb/Town Name: Suburb
  • Offence Division: offence category
  • Incidents Recorded:Number of incidents
Crime_Inc_PostCode <- read_excel("Data_tables_Criminal_Incidents_Visualisation_year_ending_June_2018.xlsx", sheet = "Table 07" )
head(Crime_Inc_PostCode)
Melb <- read_csv("MELBOURNE_HOUSE_PRICES_LESS.csv") #Read the Housing data set
Parsed with column specification:
cols(
  Suburb = col_character(),
  Address = col_character(),
  Rooms = col_integer(),
  Type = col_character(),
  Price = col_integer(),
  Method = col_character(),
  SellerG = col_character(),
  Date = col_character(),
  Postcode = col_integer(),
  Regionname = col_character(),
  Propertycount = col_integer(),
  Distance = col_double(),
  CouncilArea = col_character()
)
head(Melb)

Structure and manipulations of Melbourne Housing data set

sum(is.na(Melb$Price)) #check for NA
[1] 13836
Melb_drop_na <- drop_na(Melb,Price) #handling NA's
Melb_filtered_data_2017 <- Melb_drop_na %>% filter( Type == 'h' & Rooms == 3 & Distance <100 & Date >= '2017-01-01') #apply the filtering criteria on the dataset
sapply(Melb_filtered_data_2017, class)
       Suburb       Address         Rooms          Type         Price 
  "character"   "character"     "integer"   "character"     "integer" 
       Method       SellerG          Date      Postcode    Regionname 
  "character"   "character"   "character"     "integer"   "character" 
Propertycount      Distance   CouncilArea 
    "integer"     "numeric"   "character" 
#factorise the Region Names 
Melb_filtered_data_2017$Regionname <- as.factor(Melb_filtered_data_2017$Regionname)
levels(Melb_filtered_data_2017$Regionname)
[1] "Eastern Metropolitan"       "Eastern Victoria"          
[3] "Northern Metropolitan"      "Northern Victoria"         
[5] "South-Eastern Metropolitan" "Southern Metropolitan"     
[7] "Western Metropolitan"       "Western Victoria"          
Melb_filtered_data_2017$Date<- dmy(Melb_filtered_data_2017$Date) #date transformation 
#to check the month in the date 
Melb_filtered_data_2017$Date %>% month(label = TRUE, abbr = FALSE) %>% head(10)
 [1] February February February February February February February February
 [9] February February
12 Levels: January < February < March < April < May < June < ... < December
#transform the Suburb variable to upper case
Melb_filtered_data_2017$Suburb <- toupper(Melb_filtered_data_2017$Suburb ) 

Structure and manipulations of Melbourne Crime data set

names(Crime_Inc_PostCode) #check the names of  dataset
[1] "Year ending June"    "Postcode"            "Suburb/Town Name"   
[4] "Offence Division"    "Offence Subdivision" "Offence Subgroup"   
[7] "Incidents Recorded" 
#rename the columns to make increase readablity
Crime_Inc_PostCode <- rename(Crime_Inc_PostCode,
                       Year = `Year ending June`,
                       Suburb = `Suburb/Town Name`,
                       Offence = `Offence Division`,
                       No_of_Incident = `Incidents Recorded`)
sapply(Crime_Inc_PostCode,class) #check datatypes 
               Year            Postcode              Suburb 
          "numeric"           "numeric"         "character" 
            Offence Offence Subdivision    Offence Subgroup 
        "character"         "character"         "character" 
     No_of_Incident 
          "numeric" 
#factorise the offence variable 
Crime_Inc_PostCode$Offence <- factor(Crime_Inc_PostCode$Offence, levels = c("A Crimes against the person","B Property and deception offences","C Drug offences","D Public order and security offences","E Justice procedures offences","F Other offences"),
       labels = c("Crime against person","Property offences","Drug offences","Public order and security","Justice Procedures","Others"))
Crime_Inc_PostCode$Suburb <- as.factor(Crime_Inc_PostCode$Suburb)
#filter the data for 2017 crime statistics 
Crime_Inc_PostCode_2017 <- Crime_Inc_PostCode %>% filter(Year %in% "2017")
sum(is.na(Crime_Inc_PostCode_2017)) #check for NA's
[1] 0

Summarise the data and create new variables.

# Summarise the data and create new columns fro median price and distance. 
Melb_working_set_2017 <- Melb_filtered_data_2017 %>%  group_by(Postcode,Suburb) %>% 
  mutate(median_price = median(Price),
         median_distance = median(Distance)
         ) %>% distinct(Postcode,median_price,median_distance,count,Regionname)
Trying to compute distinct() for variables not found in the data:
- `count`
This is an error, but only a warning is raised for compatibility reasons.
The following variables will be used:
- Postcode
- median_price
- median_distance
- Regionname
Melb_working_set_2017 %>% head(5)
#Summarise the Crime Data to find No of Incidents per Suburb
Suburbs_Crime_2017 <- Crime_Inc_PostCode_2017 %>% group_by(Suburb) %>% summarise(
  Total_Incidents = sum(No_of_Incident)
)
Suburbs_Crime_2017 %>% head(5)

Join the data sets

#Join the Data sets
# On the Melbourne Housing Data , Join the Number of Criminal Incidents , per Suburb
Ready_to_work_data <- left_join(Melb_working_set_2017, Suburbs_Crime_2017, by="Suburb") 
Column `Suburb` joining character vector and factor, coercing into character vector
Ready_to_work_data %>% head(5)

Scanning final data set for inconsistency

#Investigatge if there is any NA's in this data set as result of Join
sum(is.na(Ready_to_work_data))
[1] 1
Ready_to_work_data[!complete.cases(Ready_to_work_data),] #inspect NA rows
#Eliminate this suburb as we dont have info about the total number of Incidents
Ready_to_work_data <- Ready_to_work_data[complete.cases(Ready_to_work_data),]
#create a rule to check if the values are non negative
Rule <- editset(c("median_price >= 0", "median_distance >= 0","Total_Incidents>=0"))
Rule

Edit set:
num1 : 0 <= median_price
num2 : 0 <= median_distance
num3 : 0 <= Total_Incidents 
#apply rule on the Final data set 
rules_violated <- violatedEdits(Rule,Ready_to_work_data)
summary(rules_violated) #visualise if any rules is violated
No violations detected, 0 checks evaluated to NA
NULL
Ready_to_work_data %>% head(5)

Identify the suburb with higher price median (outliers)

outliers_price = boxplot(Ready_to_work_data$median_price ~ Ready_to_work_data$Regionname,plot=FALSE)$out
#Make a new column for labelling the outliers : Suburb/Region with highest number of crime incidents
Ready_to_work_data$label_price <- ifelse((Ready_to_work_data$median_price %in% outliers_price & Ready_to_work_data$median_price>3000000),
                                    paste(Ready_to_work_data$Suburb,"\n",
Ready_to_work_data$median_price),"")
ggplot(aes(y=median_price,x=Regionname,fill=Regionname),data = Ready_to_work_data)+geom_boxplot()+ geom_text(aes(label=label_price))+
  theme(legend.position = "bottom")+scale_x_discrete(labels=c("E.Metro","E.Vic","N.Metro",
                                                              "N.Vic","S-E.Metro","S.Metro","W.Metro",
                                                              "W.Vic")) +  
  ggtitle("Melbourne Property Price  by Region") + 
  xlab("Region Names") + ylab("Median_Price") +
  theme(
    
    plot.title = element_text(colour = "Black",size = 14, face = "bold", hjust = 0.5 )
  )

Identify the suburb with higher crime incidents (outliers)

outliers_crime = boxplot(Ready_to_work_data$Total_Incidents ~ Ready_to_work_data$Regionname,plot=FALSE)$out
#Make a new column for labelling the outliers : Suburb/Region with highest number of crime incidents. Use Paste to concatenate strings. 
Ready_to_work_data$label_crime <- ifelse((Ready_to_work_data$Total_Incidents %in% outliers_crime & Ready_to_work_data$Total_Incidents>15000),
                                    paste(Ready_to_work_data$Suburb,"\n",
Ready_to_work_data$Total_Incidents),"")
#Make a boxplot 
ggplot(aes(y=Total_Incidents,x=Regionname,fill=Regionname),data = Ready_to_work_data)+geom_boxplot()+
  #theme_grey(base_size = 10) +
  theme(legend.position = "bottom")+
  scale_x_discrete(labels=c("E.Metro","E.Vic","N.Metro",
                                                              "N.Vic","S-E.Metro","S.Metro","W.Metro",                                                              "W.Vic")) + geom_text(aes(label=label_crime))+
  ggtitle("Melbourne Crime Incidents by Region") + 
  xlab("Region Names") + ylab("No of Crime Incidents") +
  theme(
    
    plot.title = element_text(colour = "Black",size = 14, face = "bold", hjust = 0.5 )
  )

Excluding outlier

Data_without_Melb_Suburb <- Ready_to_work_data %>% filter(!Suburb %in% "MELBOURNE")
ggplot(aes(y=Total_Incidents,x=Regionname,fill=Regionname),data = Data_without_Melb_Suburb)+geom_boxplot()+
  # theme_grey(base_size = 10)+
  theme(legend.position = "bottom")+
  scale_x_discrete(labels=c("E.Metro","E.Vic","N.Metro",
                                                              "N.Vic","S-E.Metro","S.Metro","W.Metro",                                                              "W.Vic")) +
  ggtitle("Melbourne Crime Incidents by Region") + 
  xlab("Region Names") + ylab("No of Crime Incidents") +
  theme(
    
    plot.title = element_text(colour = "Black",size = 14, face = "bold", hjust = 0.5 )
  )

Multivariate outlier detection:

   qplot(data=Ready_to_work_data, x=median_price, y=Total_Incidents) +geom_text(aes(label=ifelse((Total_Incidents>5*IQR(Total_Incidents)|median_price>4*IQR(median_price)),paste(Suburb,"\n"),"")), hjust=1.1) + 
  ggtitle("Melbourne:  Number of crime incidents against property prices") + 
  ylab("No of Crime Incidents") + xlab("median property price") +
  theme(
    plot.title = element_text(colour = "Black",size = 14, face = "bold", hjust = 0.5 )
  ) # 5*IQR / 4*IQR - random values just for labelling most extreme points. 

Data Transformation for further processing

Further I wanted to explore if there is any correlation between the price and distance (from the central).

  1. Create a new copy of the data set with only median_distance and median_price.
  2. Check for the distrbution of the data.
  3. Apply suitable mathematical operations to transform the data as close to normal as possible. In this case sqrt and logarithmic fuctions gave the distribuions closer to normality.
  4. Transform the data frame copy accordingly.
#create a new copy of data for this task 
ML_dataset <- Melb_filtered_data_2017 %>% group_by(Suburb) %>% summarise( median_distance =median(Distance),
median_price= median(Price)) %>% select(median_distance,median_price)
ML_dataset <- as.data.frame(ML_dataset) #Covert the tibble into a df
hist(ML_dataset$median_distance) #right skewed

hist(ML_dataset$median_price) #right skewed

#apply transformation to achieve normality
hist(sqrt(ML_dataset$median_distance))

hist(log10(ML_dataset$median_price))

# transform the data frame accordingly 
for (i in length(ML_dataset)) {
  
  ML_dataset$median_distance <- sqrt(ML_dataset$median_distance)
  ML_dataset$median_price <- log10(ML_dataset$median_price)
  
}

Machine Learning model to predict the price using distance

Steps:
  • Step 1: Define the task as regression analysis with Price as target variable.
  • Step 2: We will use the classic regression as our algorithm
  • Step 3: Fitting the model
    • Divide the data into train and test set.
    • Train the learner on the training set.
  • Step 4 : Make pricing predictions using the trained model to predict the price in the test set.
  • Step 5: Evaluating the learner: We calculate the mean square error and mean absolute error to see the performance metric of our learner.
# Task
ML_dataset_task <- makeRegrTask(data = ML_dataset,target = "median_price")
#Learner algorithm - classic regression 
lrnr = makeLearner("regr.glm")
#definine the train and test sets
n=nrow(ML_dataset)
training_set = sample(n, size = 2/3*n)
testing_set = setdiff(1:n, training_set)
# make a model of the data and train the learner on the task using the training set
pricing_model = mlr::train(learner = lrnr, task = ML_dataset_task, subset = training_set)
pricing_model
Model for learner.id=regr.glm; learner.class=regr.glm
Trained on: task.id = ML_dataset; obs = 218; features = 1
Hyperparameters: family=gaussian,model=FALSE
#predict the price 
pred_price = predict(pricing_model, task = ML_dataset_task, subset = testing_set)
#evaluate the learner 
performance(pred_price,measures = list(mse,mae))
       mse        mae 
0.01667724 0.10352674 

Findings:

As the absolute error is small, the model performed well against the given set of data. The model can be applied on additional data sets to evaluate its performance. Further an F test can be done for overall model significance.

Further steps:

The data can be used to find a correlation between the housing prices and distance (from Melbourne Central). Also if the relationship changes travelling the same distance but in different metropolitan regions. Further, Crime statistics for the suburb can be looked to explore a correlation, if any between the housing price and crime in that suburb.



---
title: "MATH2349 Semester 2, 2018"
author: "Ahmad Hasnain s3712538"
subtitle: Assignment 3
output:
  html_notebook: default
---

Rpubs link for this study: http://rpubs.com/The_Analyst/431304 


## Required packages 

```{r}

library(readr)
library(magrittr) 
library(dplyr)
library(tidyr)
library(readxl)
library(lubridate) 
library(stringr) 
library(ggplot2)
library(mlr)
```

## Executive Summary 

* In this report, we are trying to analyse the housing prices and crime statistics for the key regional areas of Melbourne answering questions like which region has the highest property price or which region has the lowest number of reported crime incidents.
* Two data sets have been used for this purpose.
* The First one is the Melbourne Housing Data which contains the prices of properties sold, along with the type of property, its suburb, region etc. I have restricted my analysis to three bedroom landed houses sold in 2017 and within 100km from CBD. Also the rows where prices are not available have been removed as the housing prices are skewed and imputing them with mean or median might influence the whole result. Region names have been factorised and other necessary data transformation like changing the class of Date variable has been done. Post this the median price and distance have been calculated. 
* The crime statistics data has been imported  and its variable names have been updated.This was followed by converting variables into suitable data types and labelling the factors. The data for year 2017 has been filtered and checked for possible missing values. 
* Post sanitation, the crime statistics data has been joined to the housing data, using suburb as the grouping variable. Missing values were identified and handled.Obvious error and inconsistency was checked. 
* As the next logical step, we have used a boxplot to look into the outliers for Price and Crime. In this case the suburbs whose property price or crime rate are significantly higher than that of other suburbs in the same region.  
I have not eliminated the outliers as they are not false alarms but a real depiction of the high property price or crime stats of a particular suburb. 
* Going further, I have applied mathematical functions to transform the median price and distance, to achieve normality , post which we have used the mlr package to come with a model to predict the median price of a property using other variables. We have used the generalised linear model and achieved a performance statistics or mean absolute error below 0.1. 
* This dataset can be used in future for further statistical analysis. 

## Data collection 

Melbourne housing data is taken from the kaggle data set hyperlink,[Melbourne Housing Market](https://www.kaggle.com/anthonypino/melbourne-housing-market/home) and Melbourne Crime Statistics has been sourced from hyperlink,[Melbourne Crime Statistics](https://www.crimestatistics.vic.gov.au)   

##### Data_Dictionary :
The followig variables are required:

###### Melbourne Housing Data:
* Suburb: Suburbs 
* Rooms: Number of rooms
* Price: Price in Australian dollars
* Type: br - bedroom(s); h - house,cottage,villa, semi,terrace; u - unit, duplex; t - townhouse; dev site - development site; o res - other residential.
* Date: Date sold
* Distance: Distance from CBD in Kilometres
* Regionname: General Region (West, North West, North, North east ...etc)

###### Crime Statistics data: 
* Year ending June: Year of crime 
* Postcode: Self explanatory
* Suburb/Town Name: Suburb
* Offence Division: offence category
* Incidents Recorded:Number of incidents 



```{r}

Crime_Inc_PostCode <- read_excel("Data_tables_Criminal_Incidents_Visualisation_year_ending_June_2018.xlsx", sheet = "Table 07" )
head(Crime_Inc_PostCode)

Melb <- read_csv("MELBOURNE_HOUSE_PRICES_LESS.csv") #Read the Housing data set
head(Melb)


```


## Structure and manipulations of Melbourne Housing data set

* Check for NAs. 
Out of 60672 rows there are 13,836 items where the Price is missing.
We will  ignore those cases and only look at the rows where we have the price listed.
* For this assignment we are only looking at the price in 2017 for the landed houses ('h') with 3 bedrooms and within 100kms (from Melbourne Central).
* Check the class of variables. 
* Factorise the region names.  
* Change the type of date variable from numeric to date.  
* Transform the suburb into upper case to match with the other data set at time of join. 


```{r}

sum(is.na(Melb$Price)) #check for NA

Melb_drop_na <- drop_na(Melb,Price) #handling NA's
Melb_filtered_data_2017 <- Melb_drop_na %>% filter( Type == 'h' & Rooms == 3 & Distance <100 & Date >= '2017-01-01') #apply the filtering criteria on the dataset


sapply(Melb_filtered_data_2017, class)

#factorise the Region Names 
Melb_filtered_data_2017$Regionname <- as.factor(Melb_filtered_data_2017$Regionname)
levels(Melb_filtered_data_2017$Regionname)

Melb_filtered_data_2017$Date<- dmy(Melb_filtered_data_2017$Date) #date transformation 

#to check the month in the date 
Melb_filtered_data_2017$Date %>% month(label = TRUE, abbr = FALSE) %>% head(10)

#transform the Suburb variable to upper case
Melb_filtered_data_2017$Suburb <- toupper(Melb_filtered_data_2017$Suburb ) 

```

## Structure and manipulations of Melbourne Crime data set


* Rename the column name to ease readability. 
* Check the datatype of variables.
* Factorise the Offence type  and Suburbs. 
* Filter the 2017 crime statistics data for the purpose of this study. 
* Check for NA's in the final dataset. 

```{r}

names(Crime_Inc_PostCode) #check the names of  dataset

#rename the columns to make increase readablity
Crime_Inc_PostCode <- rename(Crime_Inc_PostCode,
                       Year = `Year ending June`,
                       Suburb = `Suburb/Town Name`,
                       Offence = `Offence Division`,
                       No_of_Incident = `Incidents Recorded`)
sapply(Crime_Inc_PostCode,class) #check datatypes 

#factorise the offence variable 
Crime_Inc_PostCode$Offence <- factor(Crime_Inc_PostCode$Offence, levels = c("A Crimes against the person","B Property and deception offences","C Drug offences","D Public order and security offences","E Justice procedures offences","F Other offences"),
       labels = c("Crime against person","Property offences","Drug offences","Public order and security","Justice Procedures","Others"))


Crime_Inc_PostCode$Suburb <- as.factor(Crime_Inc_PostCode$Suburb)


#filter the data for 2017 crime statistics 
Crime_Inc_PostCode_2017 <- Crime_Inc_PostCode %>% filter(Year %in% "2017")

sum(is.na(Crime_Inc_PostCode_2017)) #check for NA's

```


## Summarise the data and create new variables. 

* Use mutate to create two new variables median_price and median_distance for the Melbourne housing data set.
* Group by Suburbs and use summarise to create Total_Incidents by Suburbs. 

```{r}

# Summarise the data and create new columns fro median price and distance. 

Melb_working_set_2017 <- Melb_filtered_data_2017 %>%  group_by(Postcode,Suburb) %>% 
  mutate(median_price = median(Price),
         median_distance = median(Distance)
         ) %>% distinct(Postcode,median_price,median_distance,count,Regionname)

Melb_working_set_2017 %>% head(5)

#Summarise the Crime Data to find No of Incidents per Suburb
Suburbs_Crime_2017 <- Crime_Inc_PostCode_2017 %>% group_by(Suburb) %>% summarise(
  Total_Incidents = sum(No_of_Incident)
)

Suburbs_Crime_2017 %>% head(5)

```

## Join the data sets

* Join the Melbourne housing data with the crime statistics data to make a unified data set with Suburb name, postcode, regiona name, median price, median distance and  total number of crime incidents.  

```{r}

#Join the Data sets

# On the Melbourne Housing Data , Join the Number of Criminal Incidents , per Suburb

Ready_to_work_data <- left_join(Melb_working_set_2017, Suburbs_Crime_2017, by="Suburb") 

Ready_to_work_data %>% head(5)
```


##	Scanning final data set for inconsistency  

* Check for NA's in the final data set. 
* Eliminate the Suburb where we do not have information about the number of incidents committed. 
* Create rule to check for obvious error or inconsistency.

```{r}

#Investigatge if there is any NA's in this data set as result of Join
sum(is.na(Ready_to_work_data))

Ready_to_work_data[!complete.cases(Ready_to_work_data),] #inspect NA rows

#Eliminate this suburb as we dont have info about the total number of Incidents
Ready_to_work_data <- Ready_to_work_data[complete.cases(Ready_to_work_data),]


#create a rule to check if the values are non negative
Rule <- editset(c("median_price >= 0", "median_distance >= 0","Total_Incidents>=0"))
Rule
#apply rule on the Final data set 
rules_violated <- violatedEdits(Rule,Ready_to_work_data)

summary(rules_violated) #visualise if any rules is violated

Ready_to_work_data %>% head(5)

```

## Identify the suburb with higher price median (outliers)

* Group the data by suburbs and plot the median_price against the region.
* Find the Suburb with the highest property price. 
* We can see that South Metropolitan Region appears to be the costliest region with median property price around 1.5 millions and Toorak being the Suburb with highest price with over 3 million Au $. 


```{r fig.height = 08, fig.width = 5}
outliers_price = boxplot(Ready_to_work_data$median_price ~ Ready_to_work_data$Regionname,plot=FALSE)$out

#Make a new column for labelling the outliers : Suburb/Region with highest number of crime incidents
Ready_to_work_data$label_price <- ifelse((Ready_to_work_data$median_price %in% outliers_price & Ready_to_work_data$median_price>3000000),
                                    paste(Ready_to_work_data$Suburb,"\n",
Ready_to_work_data$median_price),"")


ggplot(aes(y=median_price,x=Regionname,fill=Regionname),data = Ready_to_work_data)+geom_boxplot()+ geom_text(aes(label=label_price))+
  theme(legend.position = "bottom")+scale_x_discrete(labels=c("E.Metro","E.Vic","N.Metro",
                                                              "N.Vic","S-E.Metro","S.Metro","W.Metro",
                                                              "W.Vic")) +  
  ggtitle("Melbourne Property Price  by Region") + 
  xlab("Region Names") + ylab("Median_Price") +
  theme(
    
    plot.title = element_text(colour = "Black",size = 14, face = "bold", hjust = 0.5 )
  )


```


##	Identify the suburb with higher crime incidents (outliers)

* Group the suburbs by regions and plot the crime incident for each region. 
* The outlier represents those "Suburbs" where the observed number of crime incidents departs significantly from the number of incidents in other suburbs on the same region. In other words it depicts the Suburbs with higher crime rate in that region. 
* Melbourne suburb with 16826 number of incidents in marked to show the Suburb which has the highest number of crime incidents recorded across all regions. 


```{r fig.height = 08, fig.width = 5}

outliers_crime = boxplot(Ready_to_work_data$Total_Incidents ~ Ready_to_work_data$Regionname,plot=FALSE)$out

#Make a new column for labelling the outliers : Suburb/Region with highest number of crime incidents. Use Paste to concatenate strings. 
Ready_to_work_data$label_crime <- ifelse((Ready_to_work_data$Total_Incidents %in% outliers_crime & Ready_to_work_data$Total_Incidents>15000),
                                    paste(Ready_to_work_data$Suburb,"\n",
Ready_to_work_data$Total_Incidents),"")


#Make a boxplot 
ggplot(aes(y=Total_Incidents,x=Regionname,fill=Regionname),data = Ready_to_work_data)+geom_boxplot()+
  #theme_grey(base_size = 10) +
  theme(legend.position = "bottom")+
  scale_x_discrete(labels=c("E.Metro","E.Vic","N.Metro",
                                                              "N.Vic","S-E.Metro","S.Metro","W.Metro",                                                              "W.Vic")) + geom_text(aes(label=label_crime))+
  ggtitle("Melbourne Crime Incidents by Region") + 
  xlab("Region Names") + ylab("No of Crime Incidents") +
  theme(
    
    plot.title = element_text(colour = "Black",size = 14, face = "bold", hjust = 0.5 )
  )



```


##	Excluding outlier

* Once we see that the suburb Melbourne has the maximum number of Crime Incidents in 2017, lets exclude it and view the boxplot again.Now we can clearly see the median i.e number of crimes in a region is highest for Western Victoria.( this is done only for visualisation)
* Still there are many outliers in the data, but as per this study, they represent the suburbs with higher crimes compared to that region and should not be excluded at any point. 


```{r fig.height = 06, fig.width = 5}

Data_without_Melb_Suburb <- Ready_to_work_data %>% filter(!Suburb %in% "MELBOURNE")

ggplot(aes(y=Total_Incidents,x=Regionname,fill=Regionname),data = Data_without_Melb_Suburb)+geom_boxplot()+
  # theme_grey(base_size = 10)+
  theme(legend.position = "bottom")+
  scale_x_discrete(labels=c("E.Metro","E.Vic","N.Metro",
                                                              "N.Vic","S-E.Metro","S.Metro","W.Metro",                                                              "W.Vic")) +
  ggtitle("Melbourne Crime Incidents by Region") + 
  xlab("Region Names") + ylab("No of Crime Incidents") +
  theme(
    
    plot.title = element_text(colour = "Black",size = 14, face = "bold", hjust = 0.5 )
  )
```

##	Multivariate outlier detection:

* Explore the relation between property price and crime incidents. 
* Use a scatter plot for number of Crime Incidents against property price.
* There are few outliers in this plot which suggest the suburbs with an unusually high property price and crime rate compared to other suburbs. 
* These outliers shouldnt be removed or imputed as they represent important facts.  


```{r fig.height = 06, fig.width = 11}

   qplot(data=Ready_to_work_data, x=median_price, y=Total_Incidents) +geom_text(aes(label=ifelse((Total_Incidents>5*IQR(Total_Incidents)|median_price>4*IQR(median_price)),paste(Suburb,"\n"),"")), hjust=1.1) + 
  ggtitle("Melbourne:  Number of crime incidents against property prices") + 
  ylab("No of Crime Incidents") + xlab("median property price") +
  theme(
    plot.title = element_text(colour = "Black",size = 14, face = "bold", hjust = 0.5 )
  ) # 5*IQR / 4*IQR - random values just for labelling most extreme points. 
```

* Looking at the plot above Toorak doesn't appear to be the suburb with best value for money as there are other suburbs which are cheaper and with lower crime rate, however there might be other factors involved which is beyond the scope of this report. 

## Data Transformation for further processing

Further I wanted  to explore if there is any correlation between the price and distance (from the central).

1. Create a new copy of the data set with only median_distance and median_price.
2. Check for the distrbution of the data. 
3. Apply suitable mathematical operations to transform the data as close to normal as possible. In this case sqrt and logarithmic fuctions gave the distribuions closer to normality. 
4. Transform the data frame copy accordingly.  

```{r}

#create a new copy of data for this task 

ML_dataset <- Melb_filtered_data_2017 %>% group_by(Suburb) %>% summarise( median_distance =median(Distance),
median_price= median(Price)) %>% select(median_distance,median_price)

ML_dataset <- as.data.frame(ML_dataset) #Covert the tibble into a df

hist(ML_dataset$median_distance) #right skewed
hist(ML_dataset$median_price) #right skewed

#apply transformation to achieve normality
hist(sqrt(ML_dataset$median_distance))
hist(log10(ML_dataset$median_price))

# transform the data frame accordingly 

for (i in length(ML_dataset)) {
  
  ML_dataset$median_distance <- sqrt(ML_dataset$median_distance)
  ML_dataset$median_price <- log10(ML_dataset$median_price)
  
}

```

## Machine Learning model to predict the price using distance

##### Steps: 


* Step 1: Define the task as regression analysis with Price as target variable.
* Step 2: We will use the classic regression as our algorithm 
* Step 3: Fitting the model
     + Divide the data into train and test set.
     + Train the learner on the training set. 
* Step 4 : Make pricing predictions using the trained model to predict the price in the test set. 
* Step 5: Evaluating the learner: 
We calculate the mean square error and mean absolute error to see the performance metric of our learner.

```{r}

# Task
ML_dataset_task <- makeRegrTask(data = ML_dataset,target = "median_price")

#Learner algorithm - classic regression 
lrnr = makeLearner("regr.glm")

#definine the train and test sets
n=nrow(ML_dataset)
training_set = sample(n, size = 2/3*n)
testing_set = setdiff(1:n, training_set)

# make a model of the data and train the learner on the task using the training set
pricing_model = mlr::train(learner = lrnr, task = ML_dataset_task, subset = training_set)
pricing_model

#predict the price 

pred_price = predict(pricing_model, task = ML_dataset_task, subset = testing_set)

#evaluate the learner 

performance(pred_price,measures = list(mse,mae))

```

**Findings:**

As the absolute error is small, the model performed well against the given set of data. The model can  be applied on additional data sets to evaluate its performance. Further an F test can be done for overall model significance.  

**Further steps:**

The data can be used to find a correlation between the housing prices and distance (from Melbourne Central). Also if the relationship changes travelling the same distance but in different metropolitan regions.
Further, Crime statistics for the suburb can be looked to explore a correlation, if any between the housing price and crime in that suburb. 

<br>
<br>