Problem Definition

Use the House Property data set. Find out which columns / features affect the price of a house.
Create a linear model using all columns that are correlated with price.

Setup

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(corrgram)
library(gridExtra) 
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine

Functions

#' Count the outliers in a numeric vector using the 1.5 * IQR rule.
#'
#' A value is an outlier when it lies below Q1 - 1.5 * IQR or above
#' Q3 + 1.5 * IQR.
#'
#' @param inp Numeric vector to inspect.
#' @param na.rm Passed to quantile() and IQR(); when TRUE, missing values are
#'   ignored while the fences are computed.
#' @return Integer count of values outside the fences. Values that are already
#'   missing in `inp` are not counted as outliers.
detect_outliers <- function(inp, na.rm = TRUE) {
  i.qnt <- quantile(inp, probs = c(.25, .75), na.rm = na.rm)
  i.max <- 1.5 * IQR(inp, na.rm = na.rm)
  lower <- i.qnt[1] - i.max
  upper <- i.qnt[2] + i.max
  # Count fence violations directly. Counting NAs in a masked copy (the
  # previous approach) also counted values that were missing to begin with.
  sum(inp < lower | inp > upper, na.rm = TRUE)
}
#' Mask outliers in a numeric vector with NA using the 1.5 * IQR rule.
#'
#' @param x Numeric vector.
#' @param na.rm Passed to quantile() and IQR().
#' @param ... Further arguments forwarded to quantile().
#' @return `x` with every value below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR
#'   replaced by NA; all other values are returned unchanged.
Except_outliers <- function(x, na.rm = TRUE, ...) {
  quartiles <- quantile(x, probs = c(.25, .75), na.rm = na.rm, ...)
  whisker <- 1.5 * IQR(x, na.rm = na.rm)
  lower_fence <- quartiles[[1]] - whisker
  upper_fence <- quartiles[[2]] + whisker
  is_outlier <- x < lower_fence | x > upper_fence
  # which() skips positions where the comparison is NA, so values that are
  # already missing are left as they are.
  x[which(is_outlier)] <- NA
  x
}
#' Replace the outliers of a vector with NA.
#'
#' Thin wrapper around Except_outliers(). The result has the same length as
#' `z`; outliers are masked with NA rather than dropped.
#'
#' @param z Numeric vector.
#' @param na.rm Forwarded to Except_outliers(). Previously this argument was
#'   accepted but ignored; the default (TRUE) keeps the earlier behaviour.
#' @return Unnamed vector equal to `z` with outliers set to NA.
Remove_Outliers <- function(z, na.rm = TRUE) {
  # Except_outliers() already yields the masked vector, so the former
  # data.frame + match() round-trip is unnecessary. unname() keeps the
  # name-dropping side effect that round-trip had.
  unname(Except_outliers(z, na.rm = na.rm))
}
#' Draw a box plot of a single vector to visualise its outliers.
#'
#' @param input Numeric vector to plot.
#' @param na.rm Forwarded to geom_boxplot(); when TRUE, missing values are
#'   removed without a warning.
#' @return A ggplot object titled "Outliers".
Graph_Boxplot <- function(input, na.rm = TRUE) {
  # Build the plot data from the argument itself instead of the global
  # `dfrModel`, so the function works for any vector regardless of what is
  # defined in the calling environment.
  plot_data <- data.frame(input = input)
  ggplot(plot_data, aes(x = "", y = input)) +
    geom_boxplot(aes(fill = input), color = "green", na.rm = na.rm) +
    labs(title = "Outliers")
}

Dataset

# NOTE(review): setwd() with an absolute, machine-specific path makes this
# script non-portable. It is kept because the relative path below depends on it.
setwd("D:/Welingkar/Competitions/Anaholix/Round2/Q3")
# Read the house-price data with a header row, keeping strings as character.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
dfrModel <- read.csv("./House_Price.csv", header = TRUE, stringsAsFactors = FALSE)
head(dfrModel)
##   price size bedroom
## 1 42000 5850       3
## 2 38500 4000       2
## 3 49500 3060       3
## 4 60500 6650       3
## 5 61000 6360       2
## 6 66000 4160       3

Observation
All columns are numeric.

Summary

# Summary statistics for each column, returned as a list with one element
# per column of dfrModel.
lapply(dfrModel, summary)
## $price
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25000   49125   62000   68122   82000  190000 
## 
## $size
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1650    3600    4600    5150    6360   16200 
## 
## $bedroom
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   2.965   3.000   6.000
# Number of 1.5 * IQR outliers in each column of dfrModel.
lapply(dfrModel, detect_outliers)
## $price
## [1] 15
## 
## $size
## [1] 12
## 
## $bedroom
## [1] 12

**Observations**
There are only a few outliers (15 in price, 12 in size, 12 in bedroom), so for this model we keep the outliers in the data.

# One box plot per column of dfrModel to visualise the outliers.
lapply(dfrModel, Graph_Boxplot)
## $price

## 
## $size

## 
## $bedroom

Correlation

# Correlation of price with every column of dfrModel (including price itself).
# vapply() replaces the loop that grew the result with c() on every iteration,
# and guarantees exactly one numeric value per column.
dfrCorr <- vapply(
  dfrModel,
  function(col) cor(dfrModel$price, as.numeric(col)),
  numeric(1)
)
# Unnamed copy of the correlations, kept under its original variable name.
vctCorr <- unname(dfrCorr)
dfrCorr
##     price      size   bedroom 
## 1.0000000 0.5357957 0.3664474

Visualize

# Reshape to long format: every column except price is stacked into a
# (variable, value) pair so that the features can be faceted in one plot.
dfrGraph <- gather(dfrModel, key = "variable", value = "value", -price)
head(dfrGraph)
##   price variable value
## 1 42000     size  5850
## 2 38500     size  4000
## 3 49500     size  3060
## 4 60500     size  6650
## 5 61000     size  6360
## 6 66000     size  4160
# Jittered scatter of price against each feature with a straight-line fit
# (no confidence band), one facet per feature with its own x scale.
# The aesthetics shared by both layers are declared once in ggplot().
ggplot(dfrGraph, aes(value, price, colour = variable)) +
  geom_jitter() +
  geom_smooth(method = lm, se = FALSE) +
  facet_wrap(~variable, scales = "free_x") +
  labs(title = "Relation Of Price With Other Features")

Observation
Every feature shows some relationship with price.

Find Best Multi Linear Model
Choose the best linear model using step(), which selects a model by AIC in a stepwise algorithm.
In statistics, stepwise regression is a method of fitting regression models in which the choice of predictive variables is carried out by an automatic procedure. In each step, a variable is considered for addition to or subtraction from the set of explanatory variables based on some prespecified criterion.
The Akaike information criterion (AIC) is a measure of the relative quality of statistical models for a given set of data. Given a collection of models for the data, AIC estimates the quality of each model, relative to each of the other models. Hence, AIC provides a means for model selection.

# Start from the full model (price against all other columns) and let step()
# pick a model by AIC. trace = 0 suppresses the progress output; steps caps
# the number of stepwise iterations. See ?step for details.
stpModel <- step(
  lm(price ~ ., data = dfrModel),
  trace = 0,
  steps = 10000
)
stpSummary <- summary(stpModel)
stpSummary
## 
## Call:
## lm(formula = price ~ size + bedroom, data = dfrModel)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65665 -12498  -2075   8970  97205 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.613e+03  4.103e+03   1.368    0.172    
## size        6.053e+00  4.243e-01  14.265  < 2e-16 ***
## bedroom     1.057e+04  1.248e+03   8.470 2.31e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21230 on 543 degrees of freedom
## Multiple R-squared:  0.3703, Adjusted R-squared:  0.3679 
## F-statistic: 159.6 on 2 and 543 DF,  p-value: < 2.2e-16

Observation
The best model selected is price ~ size + bedroom.

The multiple R-squared is 0.3703, i.e. the model explains about 37% of the variance in price, which is a moderate fit.

Make Final Multi Linear Model

# Predictors (size, bedroom) and response (price) pulled out as plain vectors.
x1 <- dfrModel[["size"]]
x2 <- dfrModel[["bedroom"]]
y <- dfrModel[["price"]]
# NOTE(review): the formula refers to the free vectors above rather than to
# columns of dfrModel, so the coefficients are labelled x1/x2 and
# predict(..., newdata = ) cannot look the predictors up by column name.
slmModel <- lm(y ~ x1 + x2, data = dfrModel)

Observation
No errors. Model successfully created.

Show Model

# Print the residual summary, coefficient table and fit statistics of the final model
summary(slmModel)
## 
## Call:
## lm(formula = y ~ x1 + x2, data = dfrModel)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65665 -12498  -2075   8970  97205 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.613e+03  4.103e+03   1.368    0.172    
## x1          6.053e+00  4.243e-01  14.265  < 2e-16 ***
## x2          1.057e+04  1.248e+03   8.470 2.31e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21230 on 543 degrees of freedom
## Multiple R-squared:  0.3703, Adjusted R-squared:  0.3679 
## F-statistic: 159.6 on 2 and 543 DF,  p-value: < 2.2e-16