title: "houses"
author: "rg"
date: "2022-12-01"
Reading the Melbourne data & importing required libraries
require(ggplot2)
## Loading required package: ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(sjmisc)
##
## Attaching package: 'sjmisc'
## The following object is masked from 'package:tidyr':
##
## replace_na
library(corrplot)
## corrplot 0.92 loaded
library(fastDummies)
library(caret)
## Loading required package: lattice
library(BBmisc)
##
## Attaching package: 'BBmisc'
## The following objects are masked from 'package:sjmisc':
##
## %nin%, seq_col, seq_row
## The following objects are masked from 'package:dplyr':
##
## coalesce, collapse
## The following object is masked from 'package:base':
##
## isFALSE
library(class) # k-nearest neighbours via knn()
library(C50)
library(MASS) # provides mvrnorm() for sampling multivariate Gaussians (note: masks dplyr::select)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(neuralnet) # neural networks (the model below is actually fit with nnet, loaded further down)
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
library(nnet)
housing.dataset <- read.csv("D:/Freelancer_questions/shivam/Melbourne_housing/melbourne_data.csv", header = TRUE)
str(housing.dataset)
## 'data.frame': 34857 obs. of 12 variables:
## $ Date : chr "03-09-2016" "03-12-2016" "04-02-2016" "04-02-2016" ...
## $ Type : chr "h" "h" "h" "u" ...
## $ Price : int NA 1480000 1035000 NA 1465000 850000 1600000 NA NA NA ...
## $ Landsize : int 126 202 156 0 134 94 120 400 201 202 ...
## $ BuildingArea : num NA NA 79 NA 150 NA 142 220 NA NA ...
## $ Rooms : int 2 2 2 3 3 3 4 4 2 2 ...
## $ Bathroom : int 1 1 1 2 2 2 1 2 1 2 ...
## $ Car : int 1 1 0 1 0 1 2 2 2 1 ...
## $ YearBuilt : int NA NA 1900 NA 1900 NA 2014 2006 1900 1900 ...
## $ Distance : chr "2.5" "2.5" "2.5" "2.5" ...
## $ Regionname : chr "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" ...
## $ Propertycount: chr "4019" "4019" "4019" "4019" ...
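# Note: Distance and Propertycount come in as character columns; they are
# coerced to numeric further below. A quick check (sketch) of how many
# Distance values would fail to parse as numbers:
suppressWarnings(sum(is.na(as.numeric(housing.dataset$Distance)) &
                       !is.na(housing.dataset$Distance)))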
ggplot(housing.dataset, aes(x = Distance, y = Price)) + geom_point() +
  ggtitle("Distance vs Price scatter plot") + xlab("Distance") +
  ylab("Price")
## Warning: Removed 7610 rows containing missing values (`geom_point()`).
ggplot(housing.dataset, aes(x = Bathroom, y = Price)) + geom_point() +
  ggtitle("Bathroom vs Price scatter plot") + xlab("Number of bathrooms") +
  ylab("Price")
## Warning: Removed 14057 rows containing missing values (`geom_point()`).
ggplot(housing.dataset, aes(x = Car, y = Price)) + geom_point() +
  ggtitle("Car vs Price scatter plot") + xlab("Number of car spaces") +
  ylab("Price")
## Warning: Removed 14434 rows containing missing values (`geom_point()`).
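# The warnings above come from rows with missing Price (and Bathroom/Car for
# the later plots); Distance is also still a character column here, so ggplot
# treats that x-axis as discrete. Missingness per column can be summarised
# directly (sketch):
colSums(is.na(housing.dataset))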
data <- dummy_cols(housing.dataset,
                   select_columns = c("Type", "Regionname"),
                   remove_selected_columns = TRUE)
data <- data[, !(colnames(data) %in% c("Date","Propertycount","YearBuilt"))]
data <- data.frame(apply(data, 2, function(x) as.numeric(as.character(x))))
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
data <- data %>% drop_na(Price)
### replace missing values (NAs) with the column medians
data$Landsize[is.na(data$Landsize)] <- median(data$Landsize, na.rm = TRUE)
data$BuildingArea[is.na(data$BuildingArea)] <- median(data$BuildingArea, na.rm = TRUE)
data$Rooms[is.na(data$Rooms)] <- median(data$Rooms, na.rm = TRUE)
data$Bathroom[is.na(data$Bathroom)] <- median(data$Bathroom, na.rm = TRUE)
data$Car[is.na(data$Car)] <- median(data$Car, na.rm = TRUE)
data$Distance[is.na(data$Distance)] <- median(data$Distance, na.rm = TRUE)
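# The six assignments above could equivalently be written as a single dplyr
# step (a sketch over the same columns):
data <- data %>%
  mutate(across(c(Landsize, BuildingArea, Rooms, Bathroom, Car, Distance),
                ~ ifelse(is.na(.x), median(.x, na.rm = TRUE), .x)))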
# split the data into 75% training / 25% test sets
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(417)
idx <- sample(nrow(data), nrow(data)* 0.75)
housing_train <- data[idx,]
housing_test <- data[ -idx,]
full_additive_model = lm(Price ~ ., data = housing_train)
summary(full_additive_model)$adj.r.squared
## [1] 0.5571159
housing_test$Predicted_Price <- predict(full_additive_model, housing_test)
## Warning in predict.lm(full_additive_model, housing_test): prediction from a
## rank-deficient fit may be misleading
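# The rank-deficiency warning arises because the one-hot dummies created by
# dummy_cols() sum to 1 within each original factor (Type, Regionname), which
# is perfectly collinear with the intercept; lm() silently drops the aliased
# columns. One way to avoid it (sketch; data_nocollin is an illustrative name)
# is to drop one level per factor when building the dummies:
data_nocollin <- dummy_cols(housing.dataset,
                            select_columns = c("Type", "Regionname"),
                            remove_first_dummy = TRUE,
                            remove_selected_columns = TRUE)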
housing_test <- housing_test %>% drop_na(Price)
housing_test <- housing_test %>% drop_na(Predicted_Price)
MAE(housing_test$Predicted_Price, housing_test$Price)
## [1] 284493.5
RMSE(housing_test$Predicted_Price, housing_test$Price)
## [1] 416837.9
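# For reference, caret's MAE() and RMSE() are just the mean absolute error and
# the root-mean-square error of the predictions, written out here by hand
# (sketch):
mean(abs(housing_test$Predicted_Price - housing_test$Price))       # MAE
sqrt(mean((housing_test$Predicted_Price - housing_test$Price)^2))  # RMSE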
preproc_data = normalize(data[,2:ncol(data)], method = "range", range = c(0, 1))
preproc_data$Price <- data$Price
set.seed(417)
idx <- sample(nrow(preproc_data), nrow(preproc_data)* 0.75)
housing_train_prec <- preproc_data[idx,]
housing_test_prec <- preproc_data[ -idx,]
full_additive_model_prec = lm(Price ~ ., data = housing_train_prec)
summary(full_additive_model_prec)$adj.r.squared
## [1] 0.5571159
housing_test_prec$Predicted_Price <- predict(full_additive_model_prec, housing_test_prec)
## Warning in predict.lm(full_additive_model_prec, housing_test_prec): prediction
## from a rank-deficient fit may be misleading
housing_test_prec <- housing_test_prec %>% drop_na(Price)
housing_test_prec <- housing_test_prec %>% drop_na(Predicted_Price)
MAE(housing_test_prec$Predicted_Price, housing_test_prec$Price)
## [1] 284493.5
RMSE(housing_test_prec$Predicted_Price, housing_test_prec$Price)
## [1] 416837.9
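# The adjusted R-squared, MAE and RMSE match the unscaled model exactly:
# min-max normalisation is a linear rescaling of each predictor, so the
# least-squares fit is unchanged and only the coefficient scale differs.
# A quick check (sketch):
all.equal(summary(full_additive_model)$adj.r.squared,
          summary(full_additive_model_prec)$adj.r.squared)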
data2 <- data
data2 <- data2[, !(colnames(data2) %in% c("Price"))]
data2<- data2 %>% drop_na()
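# Before fixing the number of clusters at 4, an elbow check on the total
# within-cluster sum of squares could be run (sketch; trying k = 1..10 is an
# arbitrary choice):
wss <- sapply(1:10, function(k) kmeans(data2, centers = k, nstart = 10)$tot.withinss)
plot(1:10, wss, type = "b", xlab = "Number of clusters k",
     ylab = "Total within-cluster sum of squares")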
km.res <- kmeans(data2, 4, nstart = 25)
data2$Target <- as.factor(km.res$cluster)
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(417)
idx <- sample(nrow(data2), nrow(data2)* 0.80)
housing_train_80 <- data2[idx,]
housing_test_20 <- data2[ -idx,]
unique(housing_train_80$Target)
## [1] 3 2 1 4
## Levels: 1 2 3 4
unique(housing_test_20$Target)
## [1] 3 2 1
## Levels: 1 2 3 4
modelknn<- knn(train=housing_train_80, test=housing_test_20, cl=housing_train_80$Target, k=21)
caret::confusionMatrix(housing_test_20$Target, modelknn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 0 0 1 0
## 2 0 1503 4 0
## 3 0 2 3940 0
## 4 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.9987
## 95% CI : (0.9974, 0.9995)
## No Information Rate : 0.7239
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9968
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity NA 0.9987 0.9987 NA
## Specificity 0.9998165 0.9990 0.9987 1
## Pos Pred Value NA 0.9973 0.9995 NA
## Neg Pred Value NA 0.9995 0.9967 NA
## Prevalence 0.0000000 0.2761 0.7239 0
## Detection Rate 0.0000000 0.2758 0.7229 0
## Detection Prevalence 0.0001835 0.2765 0.7233 0
## Balanced Accuracy NA 0.9988 0.9987 NA
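# Caveat: housing_train_80 and housing_test_20 still contain the Target column
# (column 19, as used in the C5.0 call below), so the cluster label itself
# enters the kNN distance computation. A leakage-free variant would drop that
# column from both feature matrices (sketch; modelknn2 is an illustrative name
# and its results will differ):
modelknn2 <- knn(train = housing_train_80[, -19],
                 test  = housing_test_20[, -19],
                 cl    = housing_train_80$Target, k = 21)
caret::confusionMatrix(housing_test_20$Target, modelknn2)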
c50 <- C5.0(housing_train_80[,-19], housing_train_80$Target)
c50
##
## Call:
## C5.0.default(x = housing_train_80[, -19], y = housing_train_80$Target)
##
## Classification Tree
## Number of samples: 21797
## Number of predictors: 18
##
## Tree size: 9
##
## Non-standard options: attempt to group attributes
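# The fitted rules can be inspected in full with the package's summary method
# (sketch; the listing is lengthy and not shown here):
summary(c50)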
caret::confusionMatrix(housing_test_20$Target, predict(c50, newdata = housing_test_20[,-19]))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 1 0 0 0
## 2 0 1507 0 0
## 3 0 1 3941 0
## 4 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.9998
## 95% CI : (0.999, 1)
## No Information Rate : 0.7231
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9995
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 1.0000000 0.9993 1.0000 NA
## Specificity 1.0000000 1.0000 0.9993 1
## Pos Pred Value 1.0000000 1.0000 0.9997 NA
## Neg Pred Value 1.0000000 0.9997 1.0000 NA
## Prevalence 0.0001835 0.2767 0.7231 0
## Detection Rate 0.0001835 0.2765 0.7231 0
## Detection Prevalence 0.0001835 0.2765 0.7233 0
## Balanced Accuracy 1.0000000 0.9997 0.9997 NA
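# To see which predictors the tree actually splits on, C50 provides a
# variable-importance helper (sketch):
C5imp(c50, metric = "usage")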
# nnet() has no 'hidden' argument (that belongs to neuralnet); the hidden-layer
# width is set with size =
nn <- nnet(Target ~ Landsize + BuildingArea + Rooms + Bathroom + Car,
           data = housing_train_80, size = 5, decay = 5e-4, maxit = 200)
## # weights: 54
## initial value 34980.666995
## iter 10 value 6368.356372
## iter 20 value 5113.239804
## iter 30 value 4143.094701
## iter 40 value 2560.510333
## iter 50 value 896.130659
## iter 60 value 271.642408
## iter 70 value 154.730789
## iter 80 value 121.184533
## iter 90 value 75.861625
## iter 100 value 46.469550
## iter 110 value 43.212627
## iter 120 value 34.329770
## iter 130 value 30.871260
## iter 140 value 27.917828
## iter 150 value 26.849672
## iter 160 value 22.810192
## iter 170 value 17.703523
## iter 180 value 16.089653
## iter 190 value 14.652899
## iter 200 value 13.770072
## final value 13.770072
## stopped after 200 iterations
test <- predict(nn, housing_test_20[, 1:5], type = "class")
unique(test)
## [1] "3" "2" "1"