Read data - test and train split

Part 1: Reading external data and storing it in a data frame

library(data.table)
# Load the raw training file. stringsAsFactors = TRUE is set explicitly so that
# text columns are read as factors, which is what the column-class summary and
# the categorical encoding further down expect; since R 4.0 the default is
# FALSE and text columns would otherwise arrive as plain character vectors.
data <- read.csv("Training Data.csv", stringsAsFactors = TRUE)
# Drop columns 4, 11 and 12 by position.
# NOTE(review): positions depend on the CSV layout - confirm they still point
# at the intended columns if the input file changes.
data <- data[, -c(4, 11, 12)]
#dt <- fread(input="DATA_Unique_V5.csv",stringsAsFactors=TRUE)
#attach(houseprices.df)



# Reproducible random split of `data`: 75% of the rows go to the training set,
# the remaining rows to the test set.

# Fix the RNG seed first so the same rows are drawn on every run.
set.seed(123)

# Number of training rows (75% of the sample, rounded down) and their indices.
smp_size <- floor(nrow(data) * 0.75)
train_ind <- sample(x = seq_len(nrow(data)), size = smp_size, replace = FALSE)

# Sampled rows form the training set; every other row forms the test set.
data_train <- data[train_ind, ]
data_test <- data[-train_ind, ]

Installing and loading packages

#install.packages("Hmisc")
#install.packages("car")
#install.packages("corrplot")
library(corrplot)

## corrplot 0.84 loaded

library(car)

## Loading required package: carData

library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## Loading required package: ggplot2

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(stats)
library(plyr)

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:Hmisc':
## 
##     is.discrete, summarize

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:Hmisc':
## 
##     src, summarize

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# install.packages("pROC")
#install.packages("randomForest")
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

Data cleaning: missing-value treatment was done in Excel

# Count the missing (NA) entries of a vector.
#
# vector: any object accepted by is.na() (typically one data-frame column).
# Returns a single integer: the number of elements for which is.na() is TRUE.
sum_na <- function(vector) {
  sum(is.na(vector))
}

# Missing-value statistics for the training set.
# NA_count: number of NA entries per column.
NA_count <- sapply(data_train, sum_na)
# Names of the columns that contain at least one NA.
Missing_value_var <- names(NA_count)[NA_count > 0]
# Share of missing rows per column, stored as a fraction (not multiplied by 100).
missing_percentage <- NA_count / nrow(data_train)
missing_percentage

##              Area         MacroArea            dataId      No..of.Rooms 
##                 0                 0                 0                 0 
##  No..of.Bathrooms       Big.Builder     No..of.Photos Furnishing.status 
##                 0                 0                 0                 0 
##          Floor.no       Car.Parking      Total.Floors          Flooring 
##                 0                 0                 0                 0 
##            Facing           Tenants        Rent_Final      Super.Area.2 
##                 0                 0                 0                 0 
##     Carpet.Area.2 
##                 0

# Working copies of the train/test split; the data are copied unchanged.
data_test_miss <- data_test
data_train_miss <- data_train

# Class of every training column, printed for inspection.
class_data <- sapply(X = data_train_miss, FUN = class)
class_data

##              Area         MacroArea            dataId      No..of.Rooms 
##          "factor"          "factor"         "integer"         "integer" 
##  No..of.Bathrooms       Big.Builder     No..of.Photos Furnishing.status 
##         "integer"         "integer"         "integer"          "factor" 
##          Floor.no       Car.Parking      Total.Floors          Flooring 
##         "integer"          "factor"         "integer"          "factor" 
##            Facing           Tenants        Rent_Final      Super.Area.2 
##          "factor"          "factor"         "integer"         "numeric" 
##     Carpet.Area.2 
##         "numeric"

# Target (mean) encoding of the categorical variables.
# Every categorical column <name> gets a numeric companion column <name>_char
# holding the mean Rent_Final of the row's category. The means are computed on
# the TRAINING rows only and then applied to both the training and test data.
# (The stored values are plain category means, not log odds.)

# Treat factor and character columns as categorical so the step also works when
# text columns were not converted to factors on import.
is_categorical <- vapply(
  data_train_miss,
  function(column) is.factor(column) || is.character(column),
  logical(1)
)
categorical_variable <- colnames(data_train_miss)[is_categorical]
data_char <- data_train_miss
data_char_test <- data_test_miss
#data_char_score <- data_score_miss

for (i in categorical_variable) {
  # One row per category: Group.1 = category label, x = mean Rent_Final.
  category_mean <- aggregate(data_char$Rent_Final, list(data_char[[i]]), mean)
  category_mean$log_odds <- category_mean$x  # legacy column name; holds the mean

  encoded_name <- paste0(i, "_char")
  known_levels <- as.character(category_mean$Group.1)

  # Vectorised lookup: match() maps every row to its category's mean in a
  # single pass instead of one data-frame sub-assignment per category.
  # Rows whose category is NA or was not seen in training get NA, and the
  # <name>_char column is always created in both data sets.
  data_char[[encoded_name]] <-
    category_mean$log_odds[match(as.character(data_char[[i]]), known_levels)]
  data_char_test[[encoded_name]] <-
    category_mean$log_odds[match(as.character(data_char_test[[i]]), known_levels)]
  #data_char_score[[encoded_name]] <-
  #  category_mean$log_odds[match(as.character(data_char_score[[i]]), known_levels)]
}


# Keep only the modelling columns: drop the original categorical columns, which
# are represented by their numeric <name>_char companions.
# Columns are removed by name with a logical mask computed per data set:
#  - a negative match() index selects ZERO columns when `categorical_variable`
#    is empty, whereas the mask then simply keeps everything;
#  - the test set is masked by its own column names, so it does not depend on
#    having the same column order as the training set.
# drop = FALSE keeps the result a data frame even if one column remains.
data_num <- data_char[, !(colnames(data_char) %in% categorical_variable), drop = FALSE]
data_num_test <- data_char_test[, !(colnames(data_char_test) %in% categorical_variable), drop = FALSE]
#data_num_score <- data_char_score[,-c(39:48)]

Model development: main-effects and interaction models (R-squared of about 76% on training data)

houseprices.df <- data_num

# Univariate screening: regress Rent_Final on each candidate predictor on its
# own and record the fit statistics, one row per predictor.
# Predictors are selected by name (every column except the row identifier and
# the response) rather than by hard-coded positions and a fixed row count, so
# the table stays correct if columns are added, removed or reordered upstream.
predictor_vars <- setdiff(colnames(houseprices.df), c("dataId", "Rent_Final"))

univariate_summary <- data.frame(matrix(nrow = length(predictor_vars), ncol = 7))
colnames(univariate_summary) <- c(
  "Variable", "P_Value", "Coefficient", "Intercept",
  "RSquare", "Adj_RSquare", "VIF"
)

for (j in seq_along(predictor_vars)) {
  i <- predictor_vars[j]
  regress <- lm(Rent_Final ~ houseprices.df[[i]], data = houseprices.df)
  df <- summary(regress)
  univariate_summary[j, 1] <- i
  # Coefficient table: row 1 = intercept, row 2 = predictor;
  # column 1 = estimate, column 4 = p-value.
  univariate_summary[j, 2] <- df$coefficients[2, 4]
  univariate_summary[j, 3] <- df$coefficients[2, 1]
  univariate_summary[j, 4] <- df$coefficients[1, 1]
  univariate_summary[j, 5] <- df$r.squared
  univariate_summary[j, 6] <- df$adj.r.squared
  # The VIF column is not computed here and stays NA.
}

# Note: the *_char predictors are training-set category means of Rent_Final,
# so their univariate slope is 1 and intercept 0 by construction; their
# R-squared values are in-sample figures.
univariate_summary

##                  Variable       P_Value  Coefficient     Intercept
## 1            No..of.Rooms  6.503163e-79 26488.492722 -6.079734e+02
## 2        No..of.Bathrooms  1.758145e-31 19355.464200  8.173354e+03
## 3             Big.Builder  2.259392e-06 10287.259462  4.309199e+04
## 4           No..of.Photos  6.555831e-01   -48.882907  4.468884e+04
## 5                Floor.no  6.412748e-05   500.276837  4.102718e+04
## 6            Total.Floors  1.291703e-05   327.759817  4.005866e+04
## 7            Super.Area.2 5.863632e-124    55.110402 -6.417137e+03
## 8           Carpet.Area.2  2.272728e-09     6.911705  3.958790e+04
## 9               Area_char 2.279407e-279     1.000000  1.473774e-11
## 10         MacroArea_char 5.519023e-178     1.000000  1.473774e-11
## 11 Furnishing.status_char  1.964371e-26     1.000000  0.000000e+00
## 12       Car.Parking_char  5.687151e-30     1.000000  1.597790e-11
## 13          Flooring_char  1.824911e-57     1.000000  1.845822e-11
## 14            Facing_char  1.223269e-02     1.000000 -4.912580e-11
## 15           Tenants_char  1.127257e-04     1.000000  1.192097e-12
##         RSquare   Adj_RSquare VIF
## 1  0.2192138903  0.2186678860  NA
## 2  0.0909122489  0.0902765232  NA
## 3  0.0155206756  0.0148322285  NA
## 4  0.0001391533 -0.0005600501  NA
## 5  0.0111141847  0.0104226561  NA
## 6  0.0132191084  0.0125290519  NA
## 7  0.3244607264  0.3239883213  NA
## 8  0.0246812853  0.0239992443  NA
## 9  0.5902918129  0.5900053037  NA
## 10 0.4322252040  0.4318281587  NA
## 11 0.0761251534  0.0754790870  NA
## 12 0.0865124472  0.0858736447  NA
## 13 0.1635478027  0.1629628711  NA
## 14 0.0043812981  0.0036850612  NA
## 15 0.0103764503  0.0096844058  NA

library(ggplot2)
# Bar chart of the adjusted R-squared obtained by each single-predictor model.
ggplot(
  data = univariate_summary,
  mapping = aes(x = as.factor(Variable), y = Adj_RSquare)
) +
  geom_bar(stat = "identity", colour = "green", fill = "green") +
  labs(x = "Numerical Variable", y = "Adjusted R square")

# Predictors whose univariate R-squared is at least 0.05.
# The model formula below is written out by hand and does not read this vector.
predictive_variables <- with(univariate_summary, Variable[which(RSquare >= 0.05)])

# Main-effects linear model for Rent_Final.
model_all <- lm(
  Rent_Final ~ No..of.Rooms + No..of.Bathrooms + Super.Area.2 + Area_char +
    Furnishing.status_char + Car.Parking_char + Flooring_char,
  data = houseprices.df
)
df2 <- summary(model_all)
df2$coefficients

##                             Estimate   Std. Error    t value      Pr(>|t|)
## (Intercept)            -6.287237e+04 3.134854e+03 -20.055919  5.278569e-79
## No..of.Rooms            3.052895e+03 1.184147e+03   2.578140  1.003308e-02
## No..of.Bathrooms        3.005448e+03 1.008111e+03   2.981267  2.919314e-03
## Super.Area.2            2.747883e+01 2.070802e+00  13.269660  5.641152e-38
## Area_char               8.075600e-01 1.843700e-02  43.801059 4.005636e-266
## Furnishing.status_char  2.876635e-01 4.819061e-02   5.969286  3.003586e-09
## Car.Parking_char        2.372806e-01 4.758691e-02   4.986257  6.911819e-07
## Flooring_char           2.740283e-01 3.430020e-02   7.989117  2.781792e-15

# Adjusted R-squared of the main-effects model (training data).
m1_rsq <- df2[["adj.r.squared"]]
# Predictions of the main-effects model for the encoded test set.
predict_test <- predict(model_all, newdata = data_char_test)

# Interaction of rooms with bathrooms (author's finding: no significant
# improvement). `a * b` in a formula expands to a + b + a:b.
model_int <- lm(
  Rent_Final ~ No..of.Rooms * No..of.Bathrooms + Super.Area.2 + Area_char +
    Furnishing.status_char + Car.Parking_char + Flooring_char,
  data = houseprices.df
)
df3 <- summary(model_int)
df3$coefficients

##                                    Estimate   Std. Error   t value
## (Intercept)                   -4.962568e+04 5.989297e+03 -8.285728
## No..of.Rooms                  -6.890235e+03 4.011567e+03 -1.717592
## No..of.Bathrooms              -4.255080e+03 2.974587e+03 -1.430478
## Super.Area.2                   2.730426e+01 2.067745e+00 13.204844
## Area_char                      8.037985e-01 1.845709e-02 43.549570
## Furnishing.status_char         2.804923e-01 4.817340e-02  5.822556
## Car.Parking_char               2.362254e-01 4.749324e-02  4.973873
## Flooring_char                  2.761992e-01 3.424166e-02  8.066175
## No..of.Rooms:No..of.Bathrooms  5.436696e+03 2.096103e+03  2.593716
##                                    Pr(>|t|)
## (Intercept)                    2.680671e-16
## No..of.Rooms                   8.608863e-02
## No..of.Bathrooms               1.527995e-01
## Super.Area.2                   1.220216e-37
## Area_char                     5.042178e-264
## Furnishing.status_char         7.152587e-09
## Car.Parking_char               7.361063e-07
## Flooring_char                  1.526368e-15
## No..of.Rooms:No..of.Bathrooms  9.591936e-03

# Adjusted R-squared stored in the model summary `df3`.
df3[["adj.r.squared"]]

## [1] 0.7648923

# Interaction of rooms with super area (author's finding: no significant
# improvement). The main effects are listed explicitly, so the trailing
# `No..of.Rooms * Super.Area.2` only contributes the interaction term.
model_int_2 <- lm(
  Rent_Final ~ No..of.Rooms + No..of.Bathrooms + Super.Area.2 + Area_char +
    Furnishing.status_char + Car.Parking_char + Flooring_char +
    No..of.Rooms * Super.Area.2,
  data = houseprices.df
)
df4 <- summary(model_int_2)
df4$coefficients

##                                Estimate   Std. Error    t value
## (Intercept)               -4.455144e+04 7.459163e+03 -5.9727136
## No..of.Rooms              -7.635193e+03 4.123360e+03 -1.8516922
## No..of.Bathrooms           3.590867e+03 1.028891e+03  3.4900364
## Super.Area.2              -1.945100e+00 1.107000e+01 -0.1757091
## Area_char                  8.069878e-01 1.839744e-02 43.8641466
## Furnishing.status_char     2.902405e-01 4.809346e-02  6.0349276
## Car.Parking_char           2.310073e-01 4.753824e-02  4.8593985
## Flooring_char              2.733863e-01 3.422516e-02  7.9878744
## No..of.Rooms:Super.Area.2  1.570782e+01 5.805811e+00  2.7055341
##                                Pr(>|t|)
## (Intercept)                2.943110e-09
## No..of.Rooms               6.427707e-02
## No..of.Bathrooms           4.977001e-04
## Super.Area.2               8.605475e-01
## Area_char                 1.419569e-266
## Furnishing.status_char     2.025010e-09
## Car.Parking_char           1.307772e-06
## Flooring_char              2.810136e-15
## No..of.Rooms:Super.Area.2  6.900880e-03

# Adjusted R-squared stored in the current model summary `df4`.
df4[["adj.r.squared"]]

## [1] 0.7649897

# Interaction of the flooring encoding with super area (author's finding: no
# significant improvement). The main effects are listed explicitly, so the
# trailing `Flooring_char * Super.Area.2` only contributes the interaction term.
model_int_3 <- lm(
  Rent_Final ~ No..of.Rooms + No..of.Bathrooms + Super.Area.2 + Area_char +
    Furnishing.status_char + Car.Parking_char + Flooring_char +
    Flooring_char * Super.Area.2,
  data = houseprices.df
)
# Reuses the name `df4`, replacing any summary previously stored under it.
df4 <- summary(model_int_3)
df4$coefficients

##                                 Estimate   Std. Error    t value
## (Intercept)                -3.307800e+04 5.742486e+03 -5.7602235
## No..of.Rooms                4.294272e+03 1.186324e+03  3.6198140
## No..of.Bathrooms            3.256329e+03 9.961170e+02  3.2690221
## Super.Area.2               -3.145401e+00 5.375746e+00 -0.5851096
## Area_char                   8.103862e-01 1.820819e-02 44.5066758
## Furnishing.status_char      2.920930e-01 4.758288e-02  6.1386156
## Car.Parking_char            2.164891e-01 4.710253e-02  4.5961247
## Flooring_char              -4.236162e-01 1.182158e-01 -3.5834146
## Super.Area.2:Flooring_char  6.588227e-04 1.069590e-04  6.1595789
##                                 Pr(>|t|)
## (Intercept)                 1.027905e-08
## No..of.Rooms                3.051498e-04
## No..of.Bathrooms            1.104979e-03
## Super.Area.2                5.585668e-01
## Area_char                  8.942659e-272
## Furnishing.status_char      1.077536e-09
## Car.Parking_char            4.685466e-06
## Flooring_char               3.505731e-04
## Super.Area.2:Flooring_char  9.473822e-10

# Adjusted R-squared stored in the current model summary `df4`.
df4[["adj.r.squared"]]

## [1] 0.7699154