library(readr)
library(dplyr)
library(tidyr)
library(infotheo)
library(lubridate)
library(readxl)
library(outliers)
The preprocessed statistics was to evaluate the price of house selling in the Melbourne area and the market effect over a specific year according to the Crime Statistics. See also how the house price is influenced by the quantity of reported crimes. We combine Melbourne data on housing and Australia data on crime. The variable was transformed to appropriate forms and the two datasets were initially tidied and then combined. The data was then transparent and monitored for missing values, anomalies, mistakes, outliers etc. This task enables us to demonstrate various knowledge and skills learned during our course on where and how data is processed, reshaped and prepared for some further evaluation.
Two datasets are being used for this report.
Melbourne housing data were the first data collection Originating from Kaggle authorized by Tony Pino, Under the permit CC BY-NC-SA 4.0.. The dataset provides the price at which each homestay was sold around Melbourne, as well as the form and method of sale. The file is in two versions. We use the file called MEL_HOUSE_PRICES_LESS 2.csv because it contains all prices from 2016 to 2018 and Our goal is to equate the house market value with the number of crimes reported in Australia.
price <- read_csv("~/Desktop/SEM 1/Data Wrangling/Assignment-2/MEL_HOUSE_PRICES_LESS 2.csv") #reading the first dataset.
## Parsed with column specification:
## cols(
## Suburb = col_character(),
## Address = col_character(),
## Rooms = col_double(),
## Type = col_character(),
## Price = col_double(),
## Method = col_character(),
## SellerG = col_character(),
## Date = col_character(),
## Postcode = col_double(),
## Regionname = col_character(),
## Propertycount = col_double(),
## Distance = col_double(),
## CouncilArea = col_character()
## )
head(price)
The definition of the variables from the source defined accordingly.
The second dataset is of Melbourne Crime Statistics originated from crimestatistics.vic.gov.au/. From the dataset we are dealing with suburb against number of crime recorded for targeted year which is 2018.
crime <- read_excel("~/Desktop/SEM 1/Data Wrangling/Assignment-2/Data_tables_Criminal_Incidents_Visualisation_year_ending_December_2018.xlsx", sheet = "Table 07" ) #reading the second dataset.
head(crime)
The variable definition is given below:
The two datasets are merged until the datasets are sorted later. since we only require Suburb on Suburb Crime Statistics of Australia to compare.
summary(price)
## Suburb Address Rooms Type
## Length:63023 Length:63023 Min. : 1.000 Length:63023
## Class :character Class :character 1st Qu.: 3.000 Class :character
## Mode :character Mode :character Median : 3.000 Mode :character
## Mean : 3.111
## 3rd Qu.: 4.000
## Max. :31.000
##
## Price Method SellerG Date
## Min. : 85000 Length:63023 Length:63023 Length:63023
## 1st Qu.: 620000 Class :character Class :character Class :character
## Median : 830000 Mode :character Mode :character Mode :character
## Mean : 997898
## 3rd Qu.: 1220000
## Max. :11200000
## NA's :14590
## Postcode Regionname Propertycount Distance
## Min. :3000 Length:63023 Min. : 39 Min. : 0.00
## 1st Qu.:3056 Class :character 1st Qu.: 4380 1st Qu.: 7.00
## Median :3107 Mode :character Median : 6795 Median :11.40
## Mean :3126 Mean : 7618 Mean :12.68
## 3rd Qu.:3163 3rd Qu.:10412 3rd Qu.:16.70
## Max. :3980 Max. :21650 Max. :64.10
##
## CouncilArea
## Length:63023
## Class :character
## Mode :character
##
##
##
##
str(price) #Verifying the structure.
## tibble [63,023 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Suburb : chr [1:63023] "Abbotsford" "Abbotsford" "Abbotsford" "Aberfeldie" ...
## $ Address : chr [1:63023] "49 Lithgow St" "59A Turner St" "119B Yarra St" "68 Vida St" ...
## $ Rooms : num [1:63023] 3 3 3 3 2 2 2 3 6 3 ...
## $ Type : chr [1:63023] "h" "h" "h" "h" ...
## $ Price : num [1:63023] 1490000 1220000 1420000 1515000 670000 ...
## $ Method : chr [1:63023] "S" "S" "S" "S" ...
## $ SellerG : chr [1:63023] "Jellis" "Marshall" "Nelson" "Barry" ...
## $ Date : chr [1:63023] "1/04/2017" "1/04/2017" "1/04/2017" "1/04/2017" ...
## $ Postcode : num [1:63023] 3067 3067 3067 3040 3042 ...
## $ Regionname : chr [1:63023] "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" "Western Metropolitan" ...
## $ Propertycount: num [1:63023] 4019 4019 4019 1543 3464 ...
## $ Distance : num [1:63023] 3 3 3 7.5 10.4 10.4 10.4 10.4 14 3 ...
## $ CouncilArea : chr [1:63023] "Yarra City Council" "Yarra City Council" "Yarra City Council" "Moonee Valley City Council" ...
## - attr(*, "spec")=
## .. cols(
## .. Suburb = col_character(),
## .. Address = col_character(),
## .. Rooms = col_double(),
## .. Type = col_character(),
## .. Price = col_double(),
## .. Method = col_character(),
## .. SellerG = col_character(),
## .. Date = col_character(),
## .. Postcode = col_double(),
## .. Regionname = col_character(),
## .. Propertycount = col_double(),
## .. Distance = col_double(),
## .. CouncilArea = col_character()
## .. )
price_new=price
price_new <- price%>% mutate(Type=factor(Type, levels=c("t","u","h","br","o res","dev-site"), labels =c("Townhouse","Unit","House","Bedroom","Other residential","Development site")),
Date =as.Date(Date,format ="%d/%m/%Y"),
Suburb = factor(Suburb),
Method = factor(Method),
Postcode = as.character(Postcode),
Regionname= factor(Regionname))
is.factor(price_new$Type)
## [1] TRUE
class(price_new$Date)
## [1] "Date"
is.factor(price_new$Suburb)
## [1] TRUE
is.factor(price_new$Regionname)
## [1] TRUE
is.factor(price_new$Method)
## [1] TRUE
str(price_new) #Modified and verifying the structure again.
## tibble [63,023 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Suburb : Factor w/ 380 levels "Abbotsford","Aberfeldie",..: 1 1 1 2 3 3 3 3 4 5 ...
## $ Address : chr [1:63023] "49 Lithgow St" "59A Turner St" "119B Yarra St" "68 Vida St" ...
## $ Rooms : num [1:63023] 3 3 3 3 2 2 2 3 6 3 ...
## $ Type : Factor w/ 6 levels "Townhouse","Unit",..: 3 3 3 3 3 1 2 3 3 3 ...
## $ Price : num [1:63023] 1490000 1220000 1420000 1515000 670000 ...
## $ Method : Factor w/ 9 levels "PI","PN","S",..: 3 3 3 3 3 3 3 6 1 3 ...
## $ SellerG : chr [1:63023] "Jellis" "Marshall" "Nelson" "Barry" ...
## $ Date : Date[1:63023], format: "2017-04-01" "2017-04-01" ...
## $ Postcode : chr [1:63023] "3067" "3067" "3067" "3040" ...
## $ Regionname : Factor w/ 8 levels "Eastern Metropolitan",..: 3 3 3 7 7 7 7 7 7 6 ...
## $ Propertycount: num [1:63023] 4019 4019 4019 1543 3464 ...
## $ Distance : num [1:63023] 3 3 3 7.5 10.4 10.4 10.4 10.4 14 3 ...
## $ CouncilArea : chr [1:63023] "Yarra City Council" "Yarra City Council" "Yarra City Council" "Moonee Valley City Council" ...
We will here test whether or not the dataset we use is a smart dataset. To render the data set in a decent condition, the following characteristics should be used: Each variable should be having its own column Each value should be having its own row * Each observation should be having its own cell
Since our dataset has all the above-mentioned attributes, the dataset we have used is clean and ready for scanning already.
#To maximise reading capacity, rename the columns
crime <- rename(crime,
Year = `Year ending December`,
Suburb= `Suburb/Town Name`,
Division_of_crimes = `Offence Division`,
subdivision_of_crimes= `Offence Subdivision`,
collection_of_Incident = `Incidents Recorded`)
head(crime)
str(crime) #Modified and verifying the structure again.
## tibble [284,097 × 7] (S3: tbl_df/tbl/data.frame)
## $ Year : num [1:284097] 2009 2009 2009 2009 2009 ...
## $ Postcode : num [1:284097] 3000 3000 3000 3000 3000 ...
## $ Suburb : chr [1:284097] "MELBOURNE" "MELBOURNE" "MELBOURNE" "MELBOURNE" ...
## $ Division_of_crimes : chr [1:284097] "A Crimes against the person" "A Crimes against the person" "A Crimes against the person" "A Crimes against the person" ...
## $ subdivision_of_crimes : chr [1:284097] "A20 Assault and related offences" "A20 Assault and related offences" "A20 Assault and related offences" "A20 Assault and related offences" ...
## $ Offence Subgroup : chr [1:284097] "A232 Non-FV Common assault" "A231 FV Common assault" "A212 Non-FV Serious assault" "A211 FV Serious assault" ...
## $ collection_of_Incident: num [1:284097] 407 26 618 25 182 18 14 2 7 6 ...
crime$Division_of_crimes <- factor(crime$Division_of_crimes, levels = c("A Crimes against the person","B Property and deception offences","C Drug offences","D Public order and security offences","E Justice procedures offences","F Other offences"),labels = c("Offense to the Citizen","Deception and security crimes","Drug crimes","Public defence and order","Procedures for Justice","Others"))
levels(crime$Division_of_crimes) #checking new labels
## [1] "Offense to the Citizen" "Deception and security crimes"
## [3] "Drug crimes" "Public defence and order"
## [5] "Procedures for Justice" "Others"
crime$Suburb<- as.factor(crime$Suburb)
is.factor(crime$Suburb)
## [1] TRUE
NOTE: No columns in the dataset may be changed in order to gain a logical value as the third component. To convey this We must merge the two variables here to illustrate the idea of mutate.
sum(is.na(price_new$Rooms)) #Checking NA Values
## [1] 0
crime_2018 <- crime %>% filter(Year %in% "2018") #applying filter to get the appropriate year for the analysis .
crime_2018
sum(is.na(crime)) #Checking NA Values for second dataset.
## [1] 0
price_new_1=price_new
price_new_1=price_new %>% mutate(Year = year(Date), price_in_million=Price/10^6)
price_new_1$Suburb <- toupper(price_new_1$Suburb) ## this is performed to get the left join eaisily
head(price_new_1)
crime_sub_2018 <-crime_2018 %>% group_by(Suburb) %>%summarise(Sum_of_Incidents=sum(collection_of_Incident))
head(crime_sub_2018)
price_new_1$Year = as.character(price_new_1$Year) # converted year from numeric to character to get left join easily.
is.character(price_new_1$Year)
## [1] TRUE
crime_2018$Year = as.character(crime_2018$Year) # converted year from numeric to character to get left join easily.
is.character(crime_2018$Year)
## [1] TRUE
price_merge=price_new_1
price_merge<- left_join(price_new_1,crime_sub_2018,by="Suburb")
## Warning: Column `Suburb` joining character vector and factor, coercing into
## character vector
head(price_merge)
We see that there are missing values in price variable, price_in_million and Sum_of_Incidents.We conclude the variables is following various suburb patterns,so we seek to impute the mean price for every suburb.After we have imputed it we see that the price variable and price_in_million still comprises 11 missing values and Sum_of_Incidents followed with 284 missing values.We therefore impute these observations using the regionname(region) mean.
sum(is.na(price_merge)) #checking total NA values for newly created dataset.
## [1] 29464
price_merge_impute=price_merge
price_merge_impute <-price_merge %>% group_by(Suburb) %>% mutate (Price = ifelse(is.na(Price),mean(Price,na.rm=TRUE), Price),price_in_million=ifelse(is.na(price_in_million),mean(price_in_million,na.rm=TRUE),price_in_million), Sum_of_Incidents=ifelse(is.na(Sum_of_Incidents), mean(Sum_of_Incidents,na.rm=TRUE), Sum_of_Incidents))
sapply(price_merge_impute, function(x) sum(is.na(x))) #used this function to get the sum of all the null values.
## Suburb Address Rooms Type
## 0 0 0 0
## Price Method SellerG Date
## 11 0 0 0
## Postcode Regionname Propertycount Distance
## 0 0 0 0
## CouncilArea Year price_in_million Sum_of_Incidents
## 0 0 11 284
price_merge_impute <-price_merge %>% group_by(Regionname) %>% mutate (Price = ifelse(is.na(Price),mean(Price,na.rm=TRUE), Price),price_in_million=ifelse(is.na(price_in_million),mean(price_in_million,na.rm=TRUE),price_in_million), Sum_of_Incidents=ifelse(is.na(Sum_of_Incidents), mean(Sum_of_Incidents,na.rm=TRUE), Sum_of_Incidents))
sapply(price_merge_impute, function(x) sum(is.na(x)))
## Suburb Address Rooms Type
## 0 0 0 0
## Price Method SellerG Date
## 0 0 0 0
## Postcode Regionname Propertycount Distance
## 0 0 0 0
## CouncilArea Year price_in_million Sum_of_Incidents
## 0 0 0 0
price_merge_impute%>% group_by(Suburb) %>% summarise (Sum_of_na = sum(is.na(Price)))
price_merge_impute$Rooms <- as.factor(price_merge_impute$Rooms) ## converted to perform boxplot
is.factor(price_merge_impute$Rooms)
## [1] TRUE
price_merge_impute%>%boxplot(`Price`~`Rooms`, data = ., main="Price against Rooms", ylab = "Price in Frequency", xlab = "Rooms Sizes",col = "skyblue")
cap <- function(x){
quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
x}
price_merge_impute <- price_merge_impute%>%group_by(Rooms)%>%mutate(Price= cap(Price))
price_merge_impute%>%boxplot(`Price`~`Rooms`, data = ., main="Price against Rooms", ylab = "Price in Frequency", xlab = "Rooms Sizes",col = "skyblue")
price_merge_impute %>% plot(Price~Sum_of_Incidents, data=., main = "Price vs sum_of_Incidents")
cap <- function(x){
quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
x
}
price_merge_impute <- price_merge_impute%>%mutate(Sum_of_Incidents=cap(Sum_of_Incidents))
price_merge_impute %>% plot(Price~Sum_of_Incidents, data=., main = "Price vs sum_of_Incidents")
This transformation is carried out to better understand the variable and for to better scaling. This transition typically takes place when a data distribution is nonlinear and the further analysis involves a linear distribution.Some techniques of statistical analysis to be applied to data allow the normally distribution of the data, and data transformation is therefore used.
hist(price_merge_impute$Price)
price_merge_impute$Price<- log10(price_merge_impute$Price)
hist(price_merge_impute$Price)
* Applying transformation on second variable.
hist(price_merge_impute$Sum_of_Incidents)
hist(log10(price_merge_impute$Sum_of_Incidents))