Reading the Melbourne data & importing required libraries
require(ggplot2)
## Loading required package: ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(sjmisc)
##
## Attaching package: 'sjmisc'
## The following object is masked from 'package:tidyr':
##
## replace_na
library(corrplot)
## corrplot 0.92 loaded
housing.dataset <- read.csv("D:/Freelancer_questions/shivam/Melbourne_housing/melbourne_data.csv", header = TRUE)
str(housing.dataset)
## 'data.frame': 34857 obs. of 12 variables:
## $ Date : chr "03-09-2016" "03-12-2016" "04-02-2016" "04-02-2016" ...
## $ Type : chr "h" "h" "h" "u" ...
## $ Price : int NA 1480000 1035000 NA 1465000 850000 1600000 NA NA NA ...
## $ Landsize : int 126 202 156 0 134 94 120 400 201 202 ...
## $ BuildingArea : num NA NA 79 NA 150 NA 142 220 NA NA ...
## $ Rooms : int 2 2 2 3 3 3 4 4 2 2 ...
## $ Bathroom : int 1 1 1 2 2 2 1 2 1 2 ...
## $ Car : int 1 1 0 1 0 1 2 2 2 1 ...
## $ YearBuilt : int NA NA 1900 NA 1900 NA 2014 2006 1900 1900 ...
## $ Distance : chr "2.5" "2.5" "2.5" "2.5" ...
## $ Regionname : chr "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" ...
## $ Propertycount: chr "4019" "4019" "4019" "4019" ...
nrow(housing.dataset)
## [1] 34857
ncol(housing.dataset)
## [1] 12
data_x1_quantiles <- quantile(housing.dataset$Price, c(0.02, 0.99), na.rm = TRUE)
data_x1_quantiles
## 2% 99%
## 349000 3400540
data_subset <- housing.dataset[housing.dataset$Price > data_x1_quantiles[1] & housing.dataset$Price < data_x1_quantiles[2], ]
delete.na <- function(DF, n) {
DF[rowSums(is.na(DF)) <= n,]
}
cleaned_df <- delete.na(data_subset,4)
nrow(cleaned_df)
## [1] 20223
cleaned_df$Price[is.na(cleaned_df$Price)]<-median(cleaned_df$Price,na.rm=TRUE)
cleaned_df$YearBuilt[is.na(cleaned_df$YearBuilt)]<-median(cleaned_df$YearBuilt,na.rm=TRUE)
cleaned_df$Price[is.na(cleaned_df$Price)]<-median(cleaned_df$Price,na.rm=TRUE)
cleaned_df$YearBuilt[is.na(cleaned_df$YearBuilt)]<-median(cleaned_df$YearBuilt,na.rm=TRUE)
cleaned_df$BuildingArea [is.na(cleaned_df$BuildingArea )]<-median(cleaned_df$BuildingArea ,na.rm=TRUE)
cleaned_df$Landsize [is.na(cleaned_df$Landsize )]<-median(cleaned_df$Landsize ,na.rm=TRUE)
cleaned_df$Car[is.na(cleaned_df$Car)] <- 0
cleaned_df$BuildingArea = replace(cleaned_df$BuildingArea, cleaned_df$BuildingArea < 10, 10 )
cleaned_df$Landsize = replace(cleaned_df$Landsize, cleaned_df$Landsize < 35, 35 )
names <- c('Rooms' ,'Bathroom','Car','YearBuilt','Propertycount')
cleaned_df[,names] <- lapply(cleaned_df[,names] , factor)
ggplot(cleaned_df, aes(x=factor(Car)))+
geom_bar(stat="count", width=0.7, fill="steelblue")+
theme_minimal() + ggtitle("\t\t\t Count of Cars Histogram") + xlab("Number of cars") + ylab("Count of Cars")
#### Inference 1. Maximum people have 1-3 cars 2. Very few people have
greater than 5 cars
counts <- table(cleaned_df$Rooms, cleaned_df$Regionname)
barplot(counts, main="Rooms vs Region names",
xlab="Region names", col=c("darkblue","red","green","yellow","pink","maroon","white","black"),
legend = rownames(counts), beside=TRUE)
ggplot(cleaned_df, aes(x=Landsize, y=BuildingArea)) + geom_point() + ggtitle("\t\t\t Building Area vs Landsize scater plot") + xlab("Landsize") + ylab("BuildingArea")
sp<- ggplot(cleaned_df, aes(x=Price)) + geom_histogram(bins = 30)
sp + scale_x_continuous(name="Price", limits=c(0, 9000000)) + scale_y_continuous(name="count", limits=c(0, 10000))
## Warning: Removed 2 rows containing missing values (`geom_bar()`).
frq(cleaned_df$Price, auto.grp = 3)
## x <integer>
## # total N=20223 valid N=20223 mean=1068958.90 sd=548268.27
##
## Value | Label | N | Raw % | Valid % | Cum. %
## ----------------------------------------------------------
## 1 | 350000-1369999 | 15540 | 76.84 | 76.84 | 76.84
## 2 | 1370000-2379999 | 3978 | 19.67 | 19.67 | 96.51
## 3 | 2380000-3399999 | 705 | 3.49 | 3.49 | 100.00
## <NA> | <NA> | 0 | 0.00 | <NA> | <NA>
low = [416000-1029999 ]
Medium = [1030000-1639999 ]
High = [1640000-2249999 ]
###c) boxplot between the price and other variables
qplot(Price,Rooms, data=cleaned_df, geom="boxplot", color=Rooms) + ggtitle("boxplot between price and rooms")
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
qplot(Price, Regionname, data=cleaned_df, geom="boxplot", color=Regionname) + ggtitle("boxplot between price and region")
###d) How diferent attributes are correlated with the price? Which 3 variables are correlated the most with price?
corr_subset <- cleaned_df[,c("Price", "Landsize", "BuildingArea", "Distance")]
corr_subset$Distance <- as.integer(corr_subset$Distance)
corr_subset<- na.omit(corr_subset)
cor(corr_subset, method = c("pearson"))
## Price Landsize BuildingArea Distance
## Price 1.00000000 0.03063492 0.06735030 -0.25616254
## Landsize 0.03063492 1.00000000 0.08624689 0.04902107
## BuildingArea 0.06735030 0.08624689 1.00000000 0.05882978
## Distance -0.25616254 0.04902107 0.05882978 1.00000000
counts <- table(cleaned_df$Type, cleaned_df$Rooms)
barplot(counts, main="rooms vs type",
xlab="rooms", col=c("darkblue","red","green"),
legend = c("1. h", "2. u","3. t"), beside=TRUE)
counts <- table(cleaned_df$Rooms, cleaned_df$Propertycount)
barplot(counts, main="Rooms vs Propertycount",xlab="rooms",col=c("darkblue","red","green"),
beside=TRUE)
ggplot(cleaned_df, aes(x=Price, y=BuildingArea)) + geom_point() + ggtitle("\t\t\t Price vs BuildingArea scater plot") + xlab("Price") + ylab("BuildingArea")
ggplot(cleaned_df, aes(x=Price, y=Landsize)) + geom_point() + ggtitle("\t\t\t Price vs Landsize scater plot") + xlab("Price") + ylab("Landsize")