title: “houses” author: “rg” date: “2022-12-01”
Reading the Melbourne data & importing required libraries
require(ggplot2)
## Loading required package: ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(sjmisc)
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
##
## Attaching package: 'sjmisc'
## The following object is masked from 'package:tidyr':
##
## replace_na
library(corrplot)
## corrplot 0.92 loaded
library(fastDummies)
library(caret)
## Loading required package: lattice
library(tidyr)
library(BBmisc)
##
## Attaching package: 'BBmisc'
## The following objects are masked from 'package:sjmisc':
##
## %nin%, seq_col, seq_row
## The following objects are masked from 'package:dplyr':
##
## coalesce, collapse
## The following object is masked from 'package:base':
##
## isFALSE
library(class)
##load the package class
library(class)
library(C50)
library(MASS) # Needed to sample multivariate Gaussian distributions
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(neuralnet) # The package for neural networks in R
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
library(nnet)
# Load the Melbourne housing records from the local CSV export and print the
# column structure as a quick sanity check.
melbourne_csv <- "D:/Freelancer_questions/shivam/Melbourne_housing/melbourne_housing_data.csv"
housing.dataset <- read.csv(file = melbourne_csv, header = TRUE)
str(object = housing.dataset)
## 'data.frame': 48433 obs. of 14 variables:
## $ X : int 1 2 3 4 5 6 7 8 10 11 ...
## $ Suburb : chr "Abbotsford" "Abbotsford" "Abbotsford" "Aberfeldie" ...
## $ Address : chr "49 Lithgow St" "59A Turner St" "119B Yarra St" "68 Vida St" ...
## $ Rooms : int 3 3 3 3 2 2 2 3 3 3 ...
## $ Type : chr "h" "h" "h" "h" ...
## $ Price : int 1490000 1220000 1420000 1515000 670000 530000 540000 715000 1925000 515000 ...
## $ Method : chr "S" "S" "S" "S" ...
## $ SellerG : chr "Jellis" "Marshall" "Nelson" "Barry" ...
## $ Date : chr "1/04/2017" "1/04/2017" "1/04/2017" "1/04/2017" ...
## $ Postcode : int 3067 3067 3067 3040 3042 3042 3042 3042 3206 3020 ...
## $ Regionname : chr "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" "Western Metropolitan" ...
## $ Propertycount: int 4019 4019 4019 1543 3464 3464 3464 3464 3280 2185 ...
## $ Distance : num 3 3 3 7.5 10.4 10.4 10.4 10.4 3 10.5 ...
## $ CouncilArea : chr "Yarra City Council" "Yarra City Council" "Yarra City Council" "Moonee Valley City Council" ...
# Hypothesis testing (one-sample, right-tailed z test)
# H0: the mean price of houses in the Yarra City Council area is less than or
#     equal to the mean price of houses across the whole Melbourne data set.
# H1: the mean price of houses in the Yarra City Council area is greater than
#     the mean price of houses across the whole Melbourne data set.
# The equality has to sit in H0, and the decision rule used below
# ("reject when Z > 1.645") is a right-tailed rule, so the hypotheses are
# stated in that direction.
# Test statistic: Zcalculated = (Xbar - m) / (s / sqrt(n))

# Xbar: sample mean price of houses under Yarra City Council.
houseOf_YarraCity <- housing.dataset[housing.dataset$CouncilArea == "Yarra City Council", ]
# Take n = 40 records from this data frame and find their average price (Xbar).
# NOTE(review): head() takes the first 40 rows in file order, not a random
# sample; use sample() here if a random sample is intended.
n <- 40
houseOf_YarraCity <- head(houseOf_YarraCity, n)
Xbar <- mean(houseOf_YarraCity$Price)
Xbar
## [1] 1212000
# m and s: mean and standard deviation of Price over the entire data set.
m <- mean(housing.dataset$Price)
s <- sd(housing.dataset$Price, na.rm = FALSE)
# Substitute the values into the z-test equation.
Zcal <- (Xbar - m) / (s / sqrt(n))
Zcal
## [1] 2.281552
# Z calculated value is 2.28.
# At a 5% significance level the right-tailed critical value is qnorm(0.95),
# i.e. approximately 1.645.
alpha <- 0.05
Zcrit <- qnorm(1 - alpha)
# Decision: Zcal (2.281552) > 1.645, so H0 is rejected; the sample supports
# Yarra City Council prices being above the overall average.
Zcal > Zcrit
# Compare the price distribution of Yarra City Council with all other areas.
df <- housing.dataset
# Label each record by whether it belongs to Yarra City Council.
df$test <- ifelse(df$CouncilArea == "Yarra City Council", "Yarra City Council", "Others")
# qplot() was deprecated in ggplot2 3.4.0, so the plot is built with ggplot()
# using the same aesthetic mapping.
ggplot(df, aes(x = Price, y = test, color = test)) +
  geom_boxplot() +
  ggtitle("boxplot between price and CouncilArea")
# Hypothesis testing (two-sample, one-sided t test)
# H0: the mean price of houses with more than 3 rooms is less than or equal to
#     the mean price of houses with 1, 2 or 3 rooms.
# H1: the mean price of houses with more than 3 rooms is greater than the mean
#     price of houses with 1, 2 or 3 rooms.
# The two groups are different houses, so the samples are independent: a Welch
# two-sample t test is used instead of a paired test, and the alternative is
# one-sided to match H1.
df2 <- housing.dataset
# The second label also covers houses with exactly 3 rooms (Rooms <= 3); the
# label text is kept unchanged so any code matching on it still works.
df2$test <- ifelse(df2$Rooms > 3, 'More than three rooms', 'less than 3 rooms')
# qplot() was deprecated in ggplot2 3.4.0. Rooms is mapped as a factor so that
# each room count gets its own box instead of a numeric axis.
ggplot(df2, aes(x = Price, y = factor(Rooms), color = test)) +
  geom_boxplot() +
  labs(y = "Rooms") +
  ggtitle("boxplot between price and rooms")
# Take the first 100 prices from each group.
# NOTE(review): these are the first 100 rows in file order, not a random sample.
less_3_rooms <- head(df2$Price[which(df2$test == 'less than 3 rooms')], 100)
more_than_3 <- head(df2$Price[which(df2$test == 'More than three rooms')], 100)
t.test(more_than_3, less_3_rooms, paired = FALSE, alternative = "greater")
# NOTE: the output previously recorded here came from a paired, two-sided test
# (t = -5.4789, df = 99, p-value = 3.259e-07, mean difference -456445.3) and
# no longer applies; re-knit the document to regenerate it.
# Build the modelling table: drop identifier-like columns, one-hot encode the
# categorical columns, force every column to numeric, and keep only the rows
# with a known Price.
dropped_columns <- c("X", "Address", "SellerG", "Date", "Postcode", "Propertycount")
keep_column <- !(colnames(housing.dataset) %in% dropped_columns)
data <- housing.dataset[, keep_column]
categorical_columns <- c("Type", "Suburb", "Method", "Regionname", "CouncilArea")
data <- dummy_cols(
  data,
  select_columns = categorical_columns,
  remove_selected_columns = TRUE
)
# Every column goes through character -> numeric; the result is rebuilt as a
# data frame.
to_numeric <- function(x) as.numeric(as.character(x))
data <- data.frame(apply(data, 2, to_numeric))
data <- drop_na(data, Price)
# Split the data: 75% of the rows for training, the rest for testing.
# The "Rounding" sampler is selected before seeding the generator.
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(417)
idx <- sample(nrow(data), nrow(data) * 0.75)
housing_train <- data[idx, ]
housing_test <- data[-idx, ]
# Linear model of Price on every other column of the training set.
full_additive_model <- lm(Price ~ ., data = housing_train)
summary(full_additive_model)$adj.r.squared
## [1] 0.6449683
# Score the held-out rows and store the predictions next to the actual prices.
housing_test$Predicted_Price <- predict(full_additive_model, housing_test)
## Warning in predict.lm(full_additive_model, housing_test): prediction from a
## rank-deficient fit may be misleading
# Keep only rows where both the actual and the predicted price are available.
housing_test <- drop_na(housing_test, Price)
housing_test <- drop_na(housing_test, Predicted_Price)
# Mean absolute error and root mean squared error on the test set.
with(housing_test, MAE(Predicted_Price, Price))
## [1] 228826.6
with(housing_test, RMSE(Predicted_Price, Price))
## [1] 369807.3
# Refit the linear model on predictors rescaled to the [0, 1] range.
# Price is the response, so it is excluded from scaling by name. Selecting the
# predictors positionally with data[, 2:ncol(data)] removes whatever sits in
# column 1; with the column order produced when `data` is built that column is
# Rooms, not Price, so the scaled model was silently fitted without Rooms.
predictor_columns <- colnames(data)[colnames(data) != "Price"]
preproc_data <- normalize(data[, predictor_columns, drop = FALSE], method = "range", range = c(0, 1))
preproc_data$Price <- data$Price
# Same seed and split fraction as the unscaled model.
set.seed(417)
idx <- sample(nrow(preproc_data), nrow(preproc_data) * 0.75)
housing_train_prec <- preproc_data[idx, ]
housing_test_prec <- preproc_data[-idx, ]
full_additive_model_prec <- lm(Price ~ ., data = housing_train_prec)
summary(full_additive_model_prec)$adj.r.squared
housing_test_prec$Predicted_Price <- predict(full_additive_model_prec, housing_test_prec)
## Warning in predict.lm(full_additive_model_prec, housing_test_prec): prediction
## from a rank-deficient fit may be misleading
# Keep only rows where both the actual and the predicted price are available.
housing_test_prec <- housing_test_prec %>% drop_na(Price)
housing_test_prec <- housing_test_prec %>% drop_na(Predicted_Price)
MAE(housing_test_prec$Predicted_Price, housing_test_prec$Price)
RMSE(housing_test_prec$Predicted_Price, housing_test_prec$Price)
# NOTE: the values previously recorded here (adjusted R-squared 0.5531689,
# MAE 254775.5, RMSE 408628.2) came from the model that omitted Rooms and no
# longer apply; re-knit the document to regenerate them.
# Build the classification table: Rooms, Price and Distance as predictors and
# Type as the class label (kept as the 4th column).
# The columns are taken straight from housing.dataset so that each Type value
# stays on the same row as its predictors. Binding housing.dataset$Type onto
# `data` by position only lines up while no row has been dropped from `data`.
data2 <- housing.dataset[, c("Rooms", "Price", "Distance", "Type")]
data2$Type <- as.factor(data2$Type)
# Remove every row with a missing value in any of the four columns.
data2 <- data2 %>% drop_na()
data2$Rooms <- as.numeric(data2$Rooms)
data2$Price <- as.numeric(data2$Price)
data2$Distance <- as.numeric(data2$Distance)
# Confirm that no missing values remain.
sum(is.na(data2))
## [1] 0
# 80/20 train/test split of the classification table, using the "Rounding"
# sampler and a fixed seed.
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(417)
idx <- sample(nrow(data2), nrow(data2) * 0.80)
housing_train_80 <- data2[idx, ]
housing_test_20 <- data2[-idx, ]
# List the Type values present on each side of the split.
unique(housing_train_80[["Type"]])
## [1] h u t
## Levels: h t u
unique(housing_test_20[["Type"]])
## [1] h u t
## Levels: h t u
# 1-nearest-neighbour classifier for Type; column 4 (Type) is removed from the
# feature tables, and the training labels are supplied through `cl`.
# NOTE(review): the features are not rescaled before computing distances, so
# the column with the largest numeric range (presumably Price) is likely to
# dominate the neighbour search; consider scaling.
modelknn <- knn(train = housing_train_80[, -4], test = housing_test_20[, -4], cl = housing_train_80$Type, k = 1)
# confusionMatrix() expects the predicted classes as `data` and the true
# classes as `reference`. Passing them the other way round transposes the
# table and swaps sensitivity with positive predictive value, specificity with
# negative predictive value, and prevalence with detection prevalence.
caret::confusionMatrix(data = modelknn, reference = housing_test_20$Type)
# NOTE: the output previously recorded here was produced with the two
# arguments swapped. Overall accuracy (0.7427) and kappa (0.4192) do not depend
# on the argument order; the table and the per-class statistics must be
# regenerated by re-knitting the document.
# C5.0 decision tree for Type, trained on every column except column 4 (Type).
c50 <- C5.0(housing_train_80[, -4], housing_train_80$Type)
c50
##
## Call:
## C5.0.default(x = housing_train_80[, -4], y = housing_train_80$Type)
##
## Classification Tree
## Number of samples: 38746
## Number of predictors: 3
##
## Tree size: 323
##
## Non-standard options: attempt to group attributes
# confusionMatrix() expects the predicted classes as `data` and the true
# classes as `reference`. Passing them the other way round transposes the
# table and swaps sensitivity with positive predictive value, specificity with
# negative predictive value, and prevalence with detection prevalence.
c50_predictions <- predict(c50, newdata = housing_test_20[, -4])
caret::confusionMatrix(data = c50_predictions, reference = housing_test_20$Type)
# NOTE: the output previously recorded here was produced with the two
# arguments swapped. Overall accuracy (0.8345) and kappa (0.5905) do not depend
# on the argument order; the table and the per-class statistics must be
# regenerated by re-knitting the document.
# Single-hidden-layer network predicting Type from Distance and Rooms.
# nnet() sets the number of hidden units with `size`; `hidden` is a
# neuralnet() argument, not an nnet() parameter, so it has been removed to
# avoid suggesting a second setting.
# NOTE(review): no seed is set immediately before this call, so the initial
# weights depend on the state of the random number generator at this point.
nn <- nnet(Type ~ Distance + Rooms, data = housing_train_80, size = 5, decay = 5e-4, maxit = 200)
## # weights: 33
## initial value 70814.435729
## iter 10 value 30909.725216
## iter 20 value 26386.171248
## iter 30 value 23626.919114
## iter 40 value 23210.444797
## iter 50 value 23070.440526
## iter 60 value 22975.935494
## iter 70 value 22896.631696
## iter 80 value 22867.220123
## iter 90 value 22840.392091
## iter 100 value 22823.690674
## iter 110 value 22807.449790
## iter 120 value 22802.526424
## iter 130 value 22798.795922
## iter 140 value 22793.018430
## iter 150 value 22789.437406
## iter 160 value 22786.800858
## iter 170 value 22783.835455
## iter 180 value 22782.373644
## iter 190 value 22781.813167
## iter 200 value 22778.745784
## final value 22778.745784
## stopped after 200 iterations
# Predict a class label for each test row from its first three columns.
test <- predict(nn, housing_test_20[, 1:3], type = "class")
# List the distinct predicted labels; in the recorded run only "h" and "u"
# appear, i.e. class "t" is never predicted.
unique(test)
## [1] "h" "u"