1. Exploration of the Melbourne House Data Starts

Reading the Melbourne data & importing required libraries

require(ggplot2)
## Loading required package: ggplot2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(sjmisc)
## 
## Attaching package: 'sjmisc'
## The following object is masked from 'package:tidyr':
## 
##     replace_na
library(corrplot)
## corrplot 0.92 loaded
housing.dataset <- read.csv("D:/Freelancer_questions/shivam/Melbourne_housing/melbourne_data.csv", header = TRUE)

str(housing.dataset)
## 'data.frame':    34857 obs. of  12 variables:
##  $ Date         : chr  "03-09-2016" "03-12-2016" "04-02-2016" "04-02-2016" ...
##  $ Type         : chr  "h" "h" "h" "u" ...
##  $ Price        : int  NA 1480000 1035000 NA 1465000 850000 1600000 NA NA NA ...
##  $ Landsize     : int  126 202 156 0 134 94 120 400 201 202 ...
##  $ BuildingArea : num  NA NA 79 NA 150 NA 142 220 NA NA ...
##  $ Rooms        : int  2 2 2 3 3 3 4 4 2 2 ...
##  $ Bathroom     : int  1 1 1 2 2 2 1 2 1 2 ...
##  $ Car          : int  1 1 0 1 0 1 2 2 2 1 ...
##  $ YearBuilt    : int  NA NA 1900 NA 1900 NA 2014 2006 1900 1900 ...
##  $ Distance     : chr  "2.5" "2.5" "2.5" "2.5" ...
##  $ Regionname   : chr  "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" ...
##  $ Propertycount: chr  "4019" "4019" "4019" "4019" ...

2. Cleaning the data: Outlier removal; replace NA etc

Countine the number of rows and columns for the dataset

nrow(housing.dataset)
## [1] 34857
ncol(housing.dataset)
## [1] 12

Using the quartile function to clip the price below 2th percentile and above 99th percentile

data_x1_quantiles <- quantile(housing.dataset$Price, c(0.02, 0.99), na.rm = TRUE)
data_x1_quantiles
##      2%     99% 
##  349000 3400540
data_subset <- housing.dataset[housing.dataset$Price > data_x1_quantiles[1] & housing.dataset$Price < data_x1_quantiles[2], ]

dropping rows where na is >= 4 columns

delete.na <- function(DF, n) {
  DF[rowSums(is.na(DF)) <= n,]
}

cleaned_df <- delete.na(data_subset,4)
nrow(cleaned_df)
## [1] 20223

Replacing the na values of Price & YearNuilt with Median values

cleaned_df$Price[is.na(cleaned_df$Price)]<-median(cleaned_df$Price,na.rm=TRUE)

cleaned_df$YearBuilt[is.na(cleaned_df$YearBuilt)]<-median(cleaned_df$YearBuilt,na.rm=TRUE)

Replacing the na values of Price & YearBuild & BuildingArea with Median values

cleaned_df$Price[is.na(cleaned_df$Price)]<-median(cleaned_df$Price,na.rm=TRUE)

cleaned_df$YearBuilt[is.na(cleaned_df$YearBuilt)]<-median(cleaned_df$YearBuilt,na.rm=TRUE)

cleaned_df$BuildingArea [is.na(cleaned_df$BuildingArea )]<-median(cleaned_df$BuildingArea ,na.rm=TRUE)

cleaned_df$Landsize [is.na(cleaned_df$Landsize )]<-median(cleaned_df$Landsize ,na.rm=TRUE)

Where cars are na we are replacing with 0 cars

cleaned_df$Car[is.na(cleaned_df$Car)] <- 0

As building area cannot be 0 or < 5 replacing those values or correcting those values to atleast 10

cleaned_df$BuildingArea = replace(cleaned_df$BuildingArea, cleaned_df$BuildingArea < 10, 10 )

As landsize cannot be 0 or < 35 replacing those values or correcting those values to atleast 35

cleaned_df$Landsize = replace(cleaned_df$Landsize, cleaned_df$Landsize < 35, 35 )

converting few variables to categorical variables

names <- c('Rooms' ,'Bathroom','Car','YearBuilt','Propertycount')
cleaned_df[,names] <- lapply(cleaned_df[,names] , factor)

3. Summary of variables : Plotting

Barplot

ggplot(cleaned_df, aes(x=factor(Car)))+
  geom_bar(stat="count", width=0.7, fill="steelblue")+
  theme_minimal() + ggtitle("\t\t\t Count of Cars Histogram") +  xlab("Number of cars") + ylab("Count of Cars")

#### Inference 1. Maximum people have 1-3 cars 2. Very few people have greater than 5 cars

Grouped Bar chart

counts <- table(cleaned_df$Rooms, cleaned_df$Regionname)
barplot(counts, main="Rooms vs Region names",
        xlab="Region names", col=c("darkblue","red","green","yellow","pink","maroon","white","black"),
        legend = rownames(counts), beside=TRUE)

Inference

  1. There are few regions which have 7 rooms in average
  2. Many regions also have 5 rooms in average

Scatter plot

ggplot(cleaned_df, aes(x=Landsize, y=BuildingArea)) + geom_point() + ggtitle("\t\t\t Building Area vs Landsize scater plot") + xlab("Landsize") + ylab("BuildingArea")

Inference

  1. There seems to be a direct relationship between landsize and building area which is as expected

4. Focus on Price Variable

a) histogram of the price variable

sp<- ggplot(cleaned_df, aes(x=Price)) + geom_histogram(bins = 30)
sp + scale_x_continuous(name="Price", limits=c(0, 9000000)) + scale_y_continuous(name="count", limits=c(0, 10000))
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

Inference

  1. The Price distribution is right skewed indicating that majority of population is in the left half of the price range

b)Group houses by some price ranges (like low, medium, high, etc.)

frq(cleaned_df$Price, auto.grp = 3)
## x <integer> 
## # total N=20223 valid N=20223 mean=1068958.90 sd=548268.27
## 
## Value |           Label |     N | Raw % | Valid % | Cum. %
## ----------------------------------------------------------
##     1 |  350000-1369999 | 15540 | 76.84 |   76.84 |  76.84
##     2 | 1370000-2379999 |  3978 | 19.67 |   19.67 |  96.51
##     3 | 2380000-3399999 |   705 |  3.49 |    3.49 | 100.00
##  <NA> |            <NA> |     0 |  0.00 |    <NA> |   <NA>

Inference : based on the above distribution of Prices has been segmented into 3 categories low, medium and high

  1. low = [416000-1029999 ]

  2. Medium = [1030000-1639999 ]

  3. High = [1640000-2249999 ]

###c) boxplot between the price and other variables

qplot(Price,Rooms, data=cleaned_df, geom="boxplot", color=Rooms) + ggtitle("boxplot between price and rooms")
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.

qplot(Price, Regionname, data=cleaned_df, geom="boxplot", color=Regionname) + ggtitle("boxplot between price and region")

Inference :

  1. Maximum outliers are for rooms 1 ,2 ,3
  2. Maximum deviation is between Western/Northern & South-Eastern Metropolitan

###d) How diferent attributes are correlated with the price? Which 3 variables are correlated the most with price?

corr_subset <- cleaned_df[,c("Price", "Landsize", "BuildingArea", "Distance")]

corr_subset$Distance <- as.integer(corr_subset$Distance)

corr_subset<- na.omit(corr_subset)

cor(corr_subset, method = c("pearson"))
##                    Price   Landsize BuildingArea    Distance
## Price         1.00000000 0.03063492   0.06735030 -0.25616254
## Landsize      0.03063492 1.00000000   0.08624689  0.04902107
## BuildingArea  0.06735030 0.08624689   1.00000000  0.05882978
## Distance     -0.25616254 0.04902107   0.05882978  1.00000000

Inference :

  1. Price positively correlated with Landsize & BuildingArea
  2. Price negatively correlated with Distance

5. List the frequencies of houses for various types. Create 2 scatter plots and colour the house price by landsize and type

Frequency of house for various categories

counts <- table(cleaned_df$Type, cleaned_df$Rooms)
barplot(counts, main="rooms vs type",
        xlab="rooms", col=c("darkblue","red","green"),
        legend = c("1. h", "2. u","3. t"), beside=TRUE)

counts <- table(cleaned_df$Rooms, cleaned_df$Propertycount)
barplot(counts, main="Rooms vs Propertycount",xlab="rooms",col=c("darkblue","red","green"),
         beside=TRUE)

Inference :

  1. Propertycount 8870 has maximum property

Scatter plot between Price vs Landsize/ BuildingArea

ggplot(cleaned_df, aes(x=Price, y=BuildingArea)) + geom_point() + ggtitle("\t\t\t Price vs BuildingArea scater plot") + xlab("Price") + ylab("BuildingArea")

ggplot(cleaned_df, aes(x=Price, y=Landsize)) + geom_point() + ggtitle("\t\t\t Price vs Landsize scater plot") + xlab("Price") + ylab("Landsize")

Inference :

  1. Clear relation between Price and Building Area, Price and Landsize ( mostly positive relationship)