# Read data - test and train split
# Part 1: Reading external data and storing it in a data frame
library(data.table)
# Load the raw data. stringsAsFactors = TRUE is requested explicitly because
# the categorical-encoding step further down picks its columns by class
# "factor"; since R 4.0.0 read.csv() keeps text columns as character by
# default, which would leave that step with no categorical variables at all.
data <- read.csv("Training Data.csv", stringsAsFactors = TRUE)
# Drop the 4th, 11th and 12th columns of the raw file (selected by position).
data <- data[, -c(4, 11, 12)]
#dt <- fread(input="DATA_Unique_V5.csv",stringsAsFactors=TRUE)
#attach(houseprices.df)

# Train data: 75% of the sample size
smp_size <- floor(0.75 * nrow(data))

# Set the seed to make the partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(data)), size = smp_size)

# Train and test data. The test rows are chosen as "all row numbers not drawn
# for training" rather than with data[-train_ind, ]: negative indexing with an
# empty train_ind (possible when the file has fewer than two rows) would
# return zero rows instead of every row.
data_train <- data[train_ind, ]
data_test <- data[setdiff(seq_len(nrow(data)), train_ind), ]
# Installing and loading packages
#install.packages("Hmisc")
#install.packages("car")
#install.packages("corrplot")
library(corrplot)
## corrplot 0.84 loaded
library(car)
## Loading required package: carData
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(stats)
library(plyr)
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:Hmisc':
##
## is.discrete, summarize
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# install.packages("pROC")
#install.packages("randomForest")
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Data cleaning and missing-value treatment (done in Excel)
# Count the missing (NA) entries of `vector`.
# Returns a single integer: the number of elements for which is.na() is TRUE.
sum_na <- function(vector) {
  sum(is.na(vector))
}
# Missing value statistics for the training partition:
#   NA_count           - number of NA entries per column
#   Missing_value_var  - names of the columns that contain at least one NA
#   missing_percentage - NA count divided by the number of training rows
#                        (a fraction, not multiplied by 100)
NA_count <- vapply(data_train, sum_na, FUN.VALUE = integer(1))
Missing_value_var <- names(which(NA_count > 0))
n_train_rows <- nrow(data_train)
missing_percentage <- NA_count / n_train_rows
missing_percentage
## Area MacroArea dataId No..of.Rooms
## 0 0 0 0
## No..of.Bathrooms Big.Builder No..of.Photos Furnishing.status
## 0 0 0 0
## Floor.no Car.Parking Total.Floors Flooring
## 0 0 0 0
## Facing Tenants Rent_Final Super.Area.2
## 0 0 0 0
## Carpet.Area.2
## 0
# Working copies of both partitions for the steps that follow.
data_test_miss <- data_test
data_train_miss <- data_train
# Class of every training column, keyed by column name. The categorical
# encoding below uses this to pick out the "factor" columns.
class_data <- sapply(data_train_miss, function(column) class(column))
class_data
## Area MacroArea dataId No..of.Rooms
## "factor" "factor" "integer" "integer"
## No..of.Bathrooms Big.Builder No..of.Photos Furnishing.status
## "integer" "integer" "integer" "factor"
## Floor.no Car.Parking Total.Floors Flooring
## "integer" "factor" "integer" "factor"
## Facing Tenants Rent_Final Super.Area.2
## "factor" "factor" "integer" "numeric"
## Carpet.Area.2
## "numeric"
# Categorical variable transformation: mean (target) encoding.
# Every factor column <name> gets a numeric companion column <name>_char that
# holds the mean Rent_Final of the row's category. The means are computed on
# the training rows only and then looked up for the test rows.
# (Earlier comments called this value "log odds", but it is a plain category
# mean; no logarithm or odds ratio is taken.)
categorical_variable <- colnames(data_train_miss)[class_data == "factor"]
data_char <- data_train_miss
data_char_test <- data_test_miss
#data_char_score <- data_score_miss
for (i in categorical_variable) {
  encoded_name <- paste0(i, "_char")
  # Named numeric vector: category level -> mean Rent_Final in the training
  # rows. drop = TRUE leaves out levels that have no training rows, so those
  # levels end up as NA in the lookup below.
  category_mean <- vapply(
    split(data_char$Rent_Final, data_char[[i]], drop = TRUE),
    mean,
    FUN.VALUE = numeric(1)
  )
  # One vectorised lookup per data set replaces the per-level inner loop.
  # match() is used instead of indexing by name so that an empty-string level
  # is still found. Assigning the whole column guarantees it exists in the
  # test set too; categories not seen in training (and NA categories) get NA.
  train_pos <- match(as.character(data_char[[i]]), names(category_mean))
  test_pos <- match(as.character(data_char_test[[i]]), names(category_mean))
  data_char[[encoded_name]] <- unname(category_mean[train_pos])
  data_char_test[[encoded_name]] <- unname(category_mean[test_pos])
}
# Keep only the non-categorical columns: the original numeric columns plus the
# new *_char encodings. Columns are excluded by name, separately for each data
# set, instead of with -match(): a negative index built from an empty
# categorical_variable would select zero columns rather than all of them.
# drop = FALSE keeps a data frame even if a single column remains.
data_num <- data_char[
  , !(colnames(data_char) %in% categorical_variable), drop = FALSE
]
data_num_test <- data_char_test[
  , !(colnames(data_char_test) %in% categorical_variable), drop = FALSE
]
#data_num_score <- data_char_score[,-c(39:48)]
# Model development - main-effects ("normal") and interaction models - about 76% R-squared on the training data
# Univariate screening: regress Rent_Final on each candidate predictor on its
# own and collect the fit statistics of every single-variable model.
houseprices.df <- data_num
# Column positions of the predictors; positions 1 and 8 are skipped
# (presumably dataId and Rent_Final - verify against colnames(houseprices.df)).
predictor_positions <- c(2:7, 9:17)
summary_fields <- c(
  "Variable", "P_Value", "Coefficient", "Intercept",
  "RSquare", "Adj_RSquare", "VIF"
)
# One row per predictor, pre-filled with NA. The VIF column is never filled
# in by this loop and therefore stays NA.
univariate_summary <- data.frame(
  matrix(nrow = length(predictor_positions), ncol = length(summary_fields))
)
colnames(univariate_summary) <- summary_fields
for (row_idx in seq_along(predictor_positions)) {
  col_idx <- predictor_positions[row_idx]
  fit_summary <- summary(
    lm(Rent_Final ~ houseprices.df[, col_idx], houseprices.df)
  )
  # Coefficient table: row 1 is the intercept, row 2 the predictor;
  # column 1 is the estimate, column 4 the p-value.
  coef_table <- fit_summary$coefficients
  univariate_summary[row_idx, "Variable"] <- colnames(houseprices.df)[col_idx]
  univariate_summary[row_idx, "P_Value"] <- coef_table[2, 4]
  univariate_summary[row_idx, "Coefficient"] <- coef_table[2, 1]
  univariate_summary[row_idx, "Intercept"] <- coef_table[1, 1]
  univariate_summary[row_idx, "RSquare"] <- fit_summary$r.squared
  univariate_summary[row_idx, "Adj_RSquare"] <- fit_summary$adj.r.squared
}
univariate_summary
## Variable P_Value Coefficient Intercept
## 1 No..of.Rooms 6.503163e-79 26488.492722 -6.079734e+02
## 2 No..of.Bathrooms 1.758145e-31 19355.464200 8.173354e+03
## 3 Big.Builder 2.259392e-06 10287.259462 4.309199e+04
## 4 No..of.Photos 6.555831e-01 -48.882907 4.468884e+04
## 5 Floor.no 6.412748e-05 500.276837 4.102718e+04
## 6 Total.Floors 1.291703e-05 327.759817 4.005866e+04
## 7 Super.Area.2 5.863632e-124 55.110402 -6.417137e+03
## 8 Carpet.Area.2 2.272728e-09 6.911705 3.958790e+04
## 9 Area_char 2.279407e-279 1.000000 1.473774e-11
## 10 MacroArea_char 5.519023e-178 1.000000 1.473774e-11
## 11 Furnishing.status_char 1.964371e-26 1.000000 0.000000e+00
## 12 Car.Parking_char 5.687151e-30 1.000000 1.597790e-11
## 13 Flooring_char 1.824911e-57 1.000000 1.845822e-11
## 14 Facing_char 1.223269e-02 1.000000 -4.912580e-11
## 15 Tenants_char 1.127257e-04 1.000000 1.192097e-12
## RSquare Adj_RSquare VIF
## 1 0.2192138903 0.2186678860 NA
## 2 0.0909122489 0.0902765232 NA
## 3 0.0155206756 0.0148322285 NA
## 4 0.0001391533 -0.0005600501 NA
## 5 0.0111141847 0.0104226561 NA
## 6 0.0132191084 0.0125290519 NA
## 7 0.3244607264 0.3239883213 NA
## 8 0.0246812853 0.0239992443 NA
## 9 0.5902918129 0.5900053037 NA
## 10 0.4322252040 0.4318281587 NA
## 11 0.0761251534 0.0754790870 NA
## 12 0.0865124472 0.0858736447 NA
## 13 0.1635478027 0.1629628711 NA
## 14 0.0043812981 0.0036850612 NA
## 15 0.0103764503 0.0096844058 NA
library(ggplot2)
# Bar chart of the adjusted R-squared of each single-predictor model,
# one green bar per row of univariate_summary.
ggplot(
  univariate_summary,
  aes(x = as.factor(Variable), y = Adj_RSquare)
) +
  geom_col(colour = "green", fill = "green") +
  labs(x = "Numerical Variable", y = "Adjusted R square")

# Predictors whose single-variable model reaches an R-squared of at least 0.05.
# The formula below is written out by hand and does not read this vector.
predictive_variables <- with(
  univariate_summary,
  Variable[which(RSquare >= 0.05)]
)
# Main-effects model on the training data.
model_all <- lm(
  Rent_Final ~ No..of.Rooms + No..of.Bathrooms + Super.Area.2 + Area_char +
    Furnishing.status_char + Car.Parking_char + Flooring_char,
  data = houseprices.df
)
df2 <- summary(model_all)
df2[["coefficients"]]
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.287237e+04 3.134854e+03 -20.055919 5.278569e-79
## No..of.Rooms 3.052895e+03 1.184147e+03 2.578140 1.003308e-02
## No..of.Bathrooms 3.005448e+03 1.008111e+03 2.981267 2.919314e-03
## Super.Area.2 2.747883e+01 2.070802e+00 13.269660 5.641152e-38
## Area_char 8.075600e-01 1.843700e-02 43.801059 4.005636e-266
## Furnishing.status_char 2.876635e-01 4.819061e-02 5.969286 3.003586e-09
## Car.Parking_char 2.372806e-01 4.758691e-02 4.986257 6.911819e-07
## Flooring_char 2.740283e-01 3.430020e-02 7.989117 2.781792e-15
# Adjusted R-squared of the main-effects model, kept for later comparison.
m1_rsq <- df2[["adj.r.squared"]]
# Predictions for the test partition (which carries the same *_char columns).
predict_test <- predict(model_all, newdata = data_char_test)
# Main effects plus the rooms x bathrooms interaction. The adjusted R-squared
# printed below is 0.7649; the original author judged this to be no
# significant improvement.
model_int <- lm(
  Rent_Final ~ No..of.Rooms + No..of.Bathrooms + Super.Area.2 + Area_char +
    Furnishing.status_char + Car.Parking_char + Flooring_char +
    No..of.Rooms:No..of.Bathrooms,
  data = houseprices.df
)
df3 <- summary(model_int)
df3[["coefficients"]]
## Estimate Std. Error t value
## (Intercept) -4.962568e+04 5.989297e+03 -8.285728
## No..of.Rooms -6.890235e+03 4.011567e+03 -1.717592
## No..of.Bathrooms -4.255080e+03 2.974587e+03 -1.430478
## Super.Area.2 2.730426e+01 2.067745e+00 13.204844
## Area_char 8.037985e-01 1.845709e-02 43.549570
## Furnishing.status_char 2.804923e-01 4.817340e-02 5.822556
## Car.Parking_char 2.362254e-01 4.749324e-02 4.973873
## Flooring_char 2.761992e-01 3.424166e-02 8.066175
## No..of.Rooms:No..of.Bathrooms 5.436696e+03 2.096103e+03 2.593716
## Pr(>|t|)
## (Intercept) 2.680671e-16
## No..of.Rooms 8.608863e-02
## No..of.Bathrooms 1.527995e-01
## Super.Area.2 1.220216e-37
## Area_char 5.042178e-264
## Furnishing.status_char 7.152587e-09
## Car.Parking_char 7.361063e-07
## Flooring_char 1.526368e-15
## No..of.Rooms:No..of.Bathrooms 9.591936e-03
df3[["adj.r.squared"]]
## [1] 0.7648923
# Main effects plus the rooms x super-area interaction. The adjusted R-squared
# printed below is 0.7650; the original author judged this to be no
# significant improvement.
model_int_2 <- lm(
  Rent_Final ~ No..of.Rooms + No..of.Bathrooms + Super.Area.2 + Area_char +
    Furnishing.status_char + Car.Parking_char + Flooring_char +
    No..of.Rooms:Super.Area.2,
  data = houseprices.df
)
df4 <- summary(model_int_2)
df4[["coefficients"]]
## Estimate Std. Error t value
## (Intercept) -4.455144e+04 7.459163e+03 -5.9727136
## No..of.Rooms -7.635193e+03 4.123360e+03 -1.8516922
## No..of.Bathrooms 3.590867e+03 1.028891e+03 3.4900364
## Super.Area.2 -1.945100e+00 1.107000e+01 -0.1757091
## Area_char 8.069878e-01 1.839744e-02 43.8641466
## Furnishing.status_char 2.902405e-01 4.809346e-02 6.0349276
## Car.Parking_char 2.310073e-01 4.753824e-02 4.8593985
## Flooring_char 2.733863e-01 3.422516e-02 7.9878744
## No..of.Rooms:Super.Area.2 1.570782e+01 5.805811e+00 2.7055341
## Pr(>|t|)
## (Intercept) 2.943110e-09
## No..of.Rooms 6.427707e-02
## No..of.Bathrooms 4.977001e-04
## Super.Area.2 8.605475e-01
## Area_char 1.419569e-266
## Furnishing.status_char 2.025010e-09
## Car.Parking_char 1.307772e-06
## Flooring_char 2.810136e-15
## No..of.Rooms:Super.Area.2 6.900880e-03
df4[["adj.r.squared"]]
## [1] 0.7649897
# Main effects plus the super-area x flooring interaction. The adjusted
# R-squared printed below is 0.7699; the original author judged this to be no
# significant improvement.
model_int_3 <- lm(
  Rent_Final ~ No..of.Rooms + No..of.Bathrooms + Super.Area.2 + Area_char +
    Furnishing.status_char + Car.Parking_char + Flooring_char +
    Super.Area.2:Flooring_char,
  data = houseprices.df
)
# NOTE(review): df4 is reused here, so the summary of model_int_2 stored under
# the same name is overwritten - confirm nothing later still needs it.
df4 <- summary(model_int_3)
df4[["coefficients"]]
## Estimate Std. Error t value
## (Intercept) -3.307800e+04 5.742486e+03 -5.7602235
## No..of.Rooms 4.294272e+03 1.186324e+03 3.6198140
## No..of.Bathrooms 3.256329e+03 9.961170e+02 3.2690221
## Super.Area.2 -3.145401e+00 5.375746e+00 -0.5851096
## Area_char 8.103862e-01 1.820819e-02 44.5066758
## Furnishing.status_char 2.920930e-01 4.758288e-02 6.1386156
## Car.Parking_char 2.164891e-01 4.710253e-02 4.5961247
## Flooring_char -4.236162e-01 1.182158e-01 -3.5834146
## Super.Area.2:Flooring_char 6.588227e-04 1.069590e-04 6.1595789
## Pr(>|t|)
## (Intercept) 1.027905e-08
## No..of.Rooms 3.051498e-04
## No..of.Bathrooms 1.104979e-03
## Super.Area.2 5.585668e-01
## Area_char 8.942659e-272
## Furnishing.status_char 1.077536e-09
## Car.Parking_char 4.685466e-06
## Flooring_char 3.505731e-04
## Super.Area.2:Flooring_char 9.473822e-10
df4[["adj.r.squared"]]
## [1] 0.7699154