title: "houses"
author: "rg"
date: "2022-12-01"
Reading the Melbourne data & importing required libraries
require(ggplot2)
## Loading required package: ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(sjmisc)
##
## Attaching package: 'sjmisc'
## The following object is masked from 'package:tidyr':
##
## replace_na
library(corrplot)
## corrplot 0.92 loaded
library(fastDummies)
library(caret)
## Loading required package: lattice
library(tidyr)
library(BBmisc)
##
## Attaching package: 'BBmisc'
## The following objects are masked from 'package:sjmisc':
##
## %nin%, seq_col, seq_row
## The following objects are masked from 'package:dplyr':
##
## coalesce, collapse
## The following object is masked from 'package:base':
##
## isFALSE
library(class)
##load the package class
library(class)
library(C50)
library(MASS) # Needed to sample multivariate Gaussian distributions
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(neuralnet) # The package for neural networks in R
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
# Location of the Melbourne housing CSV. Set the MELBOURNE_DATA_CSV
# environment variable to point at a local copy; when it is unset the
# original author's path is used, so existing setups keep working.
data_path <- Sys.getenv(
  "MELBOURNE_DATA_CSV",
  unset = "D:/Freelancer_questions/shivam/Melbourne_housing/melbourne_data.csv"
)
# Fail early with a readable message instead of read.csv()'s connection error.
if (!file.exists(data_path)) {
  stop("Housing data file not found: ", data_path, call. = FALSE)
}
housing.dataset <- read.csv(data_path, header = TRUE)
# Print the column types of the raw import.
str(housing.dataset)
## 'data.frame': 34857 obs. of 12 variables:
## $ Date : chr "03-09-2016" "03-12-2016" "04-02-2016" "04-02-2016" ...
## $ Type : chr "h" "h" "h" "u" ...
## $ Price : int NA 1480000 1035000 NA 1465000 850000 1600000 NA NA NA ...
## $ Landsize : int 126 202 156 0 134 94 120 400 201 202 ...
## $ BuildingArea : num NA NA 79 NA 150 NA 142 220 NA NA ...
## $ Rooms : int 2 2 2 3 3 3 4 4 2 2 ...
## $ Bathroom : int 1 1 1 2 2 2 1 2 1 2 ...
## $ Car : int 1 1 0 1 0 1 2 2 2 1 ...
## $ YearBuilt : int NA NA 1900 NA 1900 NA 2014 2006 1900 1900 ...
## $ Distance : chr "2.5" "2.5" "2.5" "2.5" ...
## $ Regionname : chr "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" ...
## $ Propertycount: chr "4019" "4019" "4019" "4019" ...
# One-hot encode the categorical columns; the source columns are removed and
# replaced by their indicator columns.
# NOTE(review): Propertycount is dummy-encoded like a category although its
# values look numeric (e.g. "4019") -- confirm this is intended, since it
# produces one indicator column per distinct count.
data <- dummy_cols(
  housing.dataset,
  select_columns = c("Type", "Regionname", "Propertycount"),
  remove_selected_columns = TRUE
)
# The sale date is not used as a predictor.
data <- data[, !(colnames(data) %in% c("Date")), drop = FALSE]
# Convert every column to double, one column at a time. apply() would first
# collapse the whole data frame into a character matrix (formatting the
# numeric columns as text) before converting back; lapply() leaves columns
# that are already numeric untouched. data.frame() still sanitises the column
# names exactly as before.
# Text entries that are not numbers become NA, and as.numeric() reports
# "NAs introduced by coercion" for them.
data <- data.frame(lapply(data, function(x) {
  if (is.numeric(x)) as.numeric(x) else as.numeric(as.character(x))
}))
# Rows without a sale price cannot be used to fit or score a price model.
data <- data %>% drop_na(Price)
# Partition the rows: a random 75% for training, the remainder for testing.
# The 'Rounding' sampling algorithm is selected before seeding (R warns that
# it is non-uniform) so the draw below is repeatable.
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(417)
idx <- sample.int(nrow(data), size = nrow(data) * 0.75)
housing_train <- data[idx, ]
housing_test <- data[-idx, ]
# Baseline: linear regression of Price on every other column of the
# training set.
full_additive_model <- lm(Price ~ ., data = housing_train)
summary(full_additive_model)$adj.r.squared
## [1] 0.7012419
# Score the hold-out rows and keep the prediction next to the observed price.
housing_test$Predicted_Price <- predict(full_additive_model, housing_test)
## Warning in predict.lm(full_additive_model, housing_test): prediction from a
## rank-deficient fit may be misleading
# Keep only rows that have both an observed and a predicted price before
# computing the error metrics.
housing_test <- drop_na(housing_test, Price, Predicted_Price)
with(housing_test, MAE(Predicted_Price, Price))
## [1] 235951.6
with(housing_test, RMSE(Predicted_Price, Price))
## [1] 359541
# Refit the linear model on predictors rescaled to [0, 1], for comparison
# with the fit on the raw columns.
# The predictors are picked by name instead of by position (2:ncol(data)), so
# Price is excluded from scaling wherever it sits in the data frame.
predictor_cols <- setdiff(names(data), "Price")
preproc_data <- normalize(
  data[, predictor_cols, drop = FALSE],
  method = "range",
  range = c(0, 1)
)
# Price is kept on its original scale so the error metrics stay in currency
# units.
preproc_data$Price <- data$Price
# Same seed and split fraction as the split used for the unscaled model.
set.seed(417)
idx <- sample(nrow(preproc_data), nrow(preproc_data) * 0.75)
housing_train_prec <- preproc_data[idx, ]
housing_test_prec <- preproc_data[-idx, ]
full_additive_model_prec <- lm(Price ~ ., data = housing_train_prec)
summary(full_additive_model_prec)$adj.r.squared
## [1] 0.7012419
housing_test_prec$Predicted_Price <- predict(full_additive_model_prec, housing_test_prec)
## Warning in predict.lm(full_additive_model_prec, housing_test_prec): prediction
## from a rank-deficient fit may be misleading
# Drop rows lacking an observed or predicted price before scoring.
housing_test_prec <- housing_test_prec %>% drop_na(Price)
housing_test_prec <- housing_test_prec %>% drop_na(Predicted_Price)
MAE(housing_test_prec$Predicted_Price, housing_test_prec$Price)
## [1] 235951.6
RMSE(housing_test_prec$Predicted_Price, housing_test_prec$Price)
## [1] 359541
# Build a classification target: keep only complete rows, group them into
# four k-means clusters (25 random starts) and store the cluster id as a
# factor column named Target.
data2 <- drop_na(data)
km.res <- kmeans(x = data2, centers = 4, nstart = 25)
data2$Target <- factor(km.res$cluster)
# 80/20 train/test partition of the clustered data, seeded for repeatability.
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(417)
idx <- sample.int(nrow(data2), size = nrow(data2) * 0.80)
housing_train_80 <- data2[idx, ]
housing_test_20 <- data2[-idx, ]
# Show which cluster labels occur in each partition.
unique(housing_train_80$Target)
## [1] 2 4 1 3
## Levels: 1 2 3 4
unique(housing_test_20$Target)
## [1] 2 4 1 3
## Levels: 1 2 3 4
# k-nearest-neighbour classification (k = 21) of the k-means cluster label.
# Target is the label being predicted, so it must not appear in the feature
# matrices: leaving it in hands the answer to the distance computation.
# NOTE(review): the features are unscaled, so large-valued columns such as
# Price dominate the Euclidean distance -- consider scaling them first.
knn_features <- setdiff(names(housing_train_80), "Target")
modelknn <- knn(
  train = housing_train_80[, knn_features, drop = FALSE],
  test = housing_test_20[, knn_features, drop = FALSE],
  cl = housing_train_80$Target,
  k = 21
)
# confusionMatrix() takes the predictions as `data` and the true classes as
# `reference`; passing them the other way round swaps sensitivity with
# positive predictive value (and the other paired per-class statistics).
# Any printed matrix following this call predates the fix and must be
# regenerated.
caret::confusionMatrix(data = modelknn, reference = housing_test_20$Target)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 210 0 0 0
## 2 2 552 0 0
## 3 0 0 38 0
## 4 0 0 0 977
##
## Overall Statistics
##
## Accuracy : 0.9989
## 95% CI : (0.9959, 0.9999)
## No Information Rate : 0.5492
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9981
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.9906 1.0000 1.00000 1.0000
## Specificity 1.0000 0.9984 1.00000 1.0000
## Pos Pred Value 1.0000 0.9964 1.00000 1.0000
## Neg Pred Value 0.9987 1.0000 1.00000 1.0000
## Prevalence 0.1192 0.3103 0.02136 0.5492
## Detection Rate 0.1180 0.3103 0.02136 0.5492
## Detection Prevalence 0.1180 0.3114 0.02136 0.5492
## Balanced Accuracy 0.9953 0.9992 1.00000 1.0000
# C5.0 decision tree for the cluster label.
# The label column is removed by name. The former hard-coded index (-363)
# removed a predictor column and left Target, the last column of the
# 364-column frame, among the predictors, so the tree could read the answer
# directly.
c50_features <- setdiff(names(housing_train_80), "Target")
c50 <- C5.0(
  x = housing_train_80[, c50_features, drop = FALSE],
  y = housing_train_80$Target
)
# Print the fitted tree summary. Any printed summary following this call
# predates the fix and must be regenerated.
c50
##
## Call:
## C5.0.default(x = housing_train_80[, -363], y = housing_train_80$Target)
##
## Classification Tree
## Number of samples: 7116
## Number of predictors: 363
##
## Tree size: 4
##
## Non-standard options: attempt to group attributes
# Neural-network classifier for the cluster label using five size-related
# predictors and one hidden layer of five units.
set.seed(333)
nn_features <- c("Landsize", "BuildingArea", "Rooms", "Bathroom", "Car")
# The previous fit on the raw columns stopped with "Algorithm did not converge
# ... within the stepmax". The inputs are therefore min-max scaled to [0, 1]
# using the training ranges; nn_min / nn_range are kept so the same
# transformation can be applied to housing_test_20 before predicting.
nn_min <- vapply(housing_train_80[nn_features], min, numeric(1))
nn_range <- vapply(housing_train_80[nn_features], max, numeric(1)) - nn_min
nn_range[nn_range == 0] <- 1 # constant column: avoid division by zero
nn_train <- housing_train_80[, c(nn_features, "Target")]
nn_train[nn_features] <- as.data.frame(
  scale(nn_train[nn_features], center = nn_min, scale = nn_range)
)
# stepmax is raised above its default as a further safeguard.
# NOTE(review): convergence is still not guaranteed, and a larger stepmax can
# lengthen training considerably -- check for the warning after running.
n <- neuralnet(
  Target ~ Landsize + BuildingArea + Rooms + Bathroom + Car,
  data = nn_train,
  hidden = 5, # adjust the hidden layers
  err.fct = "ce",
  linear.output = FALSE,
  stepmax = 1e6
)
# Component overview of the fitted object. Any printed summary following this
# call predates the change and must be regenerated.
summary(n)
## Length Class Mode
## call 6 -none- call
## response 28464 -none- logical
## covariate 35580 -none- numeric
## model.list 2 -none- list
## err.fct 1 -none- function
## act.fct 1 -none- function
## linear.output 1 -none- logical
## data 364 data.frame list
## exclude 0 -none- NULL