Loading the Libraries

#library(devtools)
#install_github("vqv/ggbiplot")
library(leaps)
library(MASS)
library(e1071) 
library(readr)
library(e1071)
library(Metrics)
library(vioplot)
library(randomForest)
library(ROCR)

Importing and understanding the “houses” data

# Read houses.csv (path relative to the working directory) into a tibble.
df <- read_csv("houses.csv")
head(df)                        # Show the first few rows of the "Houses" dataset.
## # A tibble: 6 x 16
##       id  price bedrooms bathrooms sqft_living sqft_lot floors waterfront  view
##    <dbl>  <dbl>    <dbl>     <dbl>       <dbl>    <dbl>  <dbl>      <dbl> <dbl>
## 1 7.13e9 2.22e5        3      1           1180     5650      1          0     0
## 2 6.41e9 5.38e5        3      2.25        2570     7242      2          0     0
## 3 5.63e9 1.80e5        2      1            770    10000      1          0     0
## 4 2.49e9 6.04e5        4      3           1960     5000      1          0     0
## 5 1.95e9 5.10e5        3      2           1680     8080      1          0     0
## 6 7.24e9 1.23e6        4      4.5         5420   101930      1          0     0
## # ... with 7 more variables: condition <dbl>, grade <dbl>, sqft_above <dbl>,
## #   sqft_basement <dbl>, yr_built <dbl>, yr_renovated <dbl>, age <dbl>
tail(df)                        # Show the last few rows of the "Houses" dataset.
## # A tibble: 6 x 16
##       id  price bedrooms bathrooms sqft_living sqft_lot floors waterfront  view
##    <dbl>  <dbl>    <dbl>     <dbl>       <dbl>    <dbl>  <dbl>      <dbl> <dbl>
## 1 3.00e9 475000        3      2.5         1310     1294      2          0     0
## 2 2.63e8 360000        3      2.5         1530     1131      3          0     0
## 3 6.60e9 400000        4      2.5         2310     5813      2          0     0
## 4 1.52e9 402101        2      0.75        1020     1350      2          0     0
## 5 2.91e8 400000        3      2.5         1600     2388      2          0     0
## 6 1.52e9 325000        2      0.75        1020     1076      2          0     0
## # ... with 7 more variables: condition <dbl>, grade <dbl>, sqft_above <dbl>,
## #   sqft_basement <dbl>, yr_built <dbl>, yr_renovated <dbl>, age <dbl>
dim(df)                         # Dimensions of the dataset: number of rows, number of columns.
## [1] 21613    16
str(df)                        # Compact structure: type and first values of every column.
## tibble [21,613 x 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id           : num [1:21613] 7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
##  $ price        : num [1:21613] 221900 538000 180000 604000 510000 ...
##  $ bedrooms     : num [1:21613] 3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num [1:21613] 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : num [1:21613] 1180 2570 770 1960 1680 ...
##  $ sqft_lot     : num [1:21613] 5650 7242 10000 5000 8080 ...
##  $ floors       : num [1:21613] 1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
##  $ condition    : num [1:21613] 3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : num [1:21613] 7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : num [1:21613] 1180 2170 770 1050 1680 ...
##  $ sqft_basement: num [1:21613] 0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : num [1:21613] 1955 1951 1933 1965 1987 ...
##  $ yr_renovated : num [1:21613] 0 1991 0 0 0 ...
##  $ age          : num [1:21613] 59 63 82 49 28 13 19 52 55 12 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_double(),
##   ..   price = col_double(),
##   ..   bedrooms = col_double(),
##   ..   bathrooms = col_double(),
##   ..   sqft_living = col_double(),
##   ..   sqft_lot = col_double(),
##   ..   floors = col_double(),
##   ..   waterfront = col_double(),
##   ..   view = col_double(),
##   ..   condition = col_double(),
##   ..   grade = col_double(),
##   ..   sqft_above = col_double(),
##   ..   sqft_basement = col_double(),
##   ..   yr_built = col_double(),
##   ..   yr_renovated = col_double(),
##   ..   age = col_double()
##   .. )
summary(df)                   # Per-column summary statistics (min, quartiles, mean, max).
##        id                price            bedrooms        bathrooms    
##  Min.   :1.000e+06   Min.   :  75000   Min.   : 0.000   Min.   :0.000  
##  1st Qu.:2.123e+09   1st Qu.: 321950   1st Qu.: 3.000   1st Qu.:1.750  
##  Median :3.905e+09   Median : 450000   Median : 3.000   Median :2.250  
##  Mean   :4.580e+09   Mean   : 540088   Mean   : 3.371   Mean   :2.115  
##  3rd Qu.:7.309e+09   3rd Qu.: 645000   3rd Qu.: 4.000   3rd Qu.:2.500  
##  Max.   :9.900e+09   Max.   :7700000   Max.   :33.000   Max.   :8.000  
##   sqft_living       sqft_lot           floors        waterfront      
##  Min.   :  290   Min.   :    520   Min.   :1.000   Min.   :0.000000  
##  1st Qu.: 1427   1st Qu.:   5040   1st Qu.:1.000   1st Qu.:0.000000  
##  Median : 1910   Median :   7618   Median :1.500   Median :0.000000  
##  Mean   : 2080   Mean   :  15107   Mean   :1.494   Mean   :0.007542  
##  3rd Qu.: 2550   3rd Qu.:  10688   3rd Qu.:2.000   3rd Qu.:0.000000  
##  Max.   :13540   Max.   :1651359   Max.   :3.500   Max.   :1.000000  
##       view          condition         grade          sqft_above  
##  Min.   :0.0000   Min.   :1.000   Min.   : 1.000   Min.   : 290  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.: 7.000   1st Qu.:1190  
##  Median :0.0000   Median :3.000   Median : 7.000   Median :1560  
##  Mean   :0.2343   Mean   :3.409   Mean   : 7.657   Mean   :1788  
##  3rd Qu.:0.0000   3rd Qu.:4.000   3rd Qu.: 8.000   3rd Qu.:2210  
##  Max.   :4.0000   Max.   :5.000   Max.   :13.000   Max.   :9410  
##  sqft_basement       yr_built     yr_renovated         age        
##  Min.   :   0.0   Min.   :1900   Min.   :   0.0   Min.   : -1.00  
##  1st Qu.:   0.0   1st Qu.:1951   1st Qu.:   0.0   1st Qu.: 18.00  
##  Median :   0.0   Median :1975   Median :   0.0   Median : 40.00  
##  Mean   : 291.5   Mean   :1971   Mean   :  84.4   Mean   : 43.32  
##  3rd Qu.: 560.0   3rd Qu.:1997   3rd Qu.:   0.0   3rd Qu.: 63.00  
##  Max.   :4820.0   Max.   :2015   Max.   :2015.0   Max.   :115.00

Checking the frequency of the variables of the “Houses” dataset.

# Frequency tables of the discrete columns; each call is followed by its
# recorded output.
# Bedrooms: the recorded counts include one record with 33 bedrooms
# (NOTE(review): possibly a data-entry outlier — confirm).
table(df$bedrooms)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   33 
##   13  199 2760 9824 6882 1601  272   38   13    6    3    1    1
# Bathrooms are recorded in quarter steps (0.5, 0.75, 1, 1.25, ...).
table(df$bathrooms)
## 
##    0  0.5 0.75    1 1.25  1.5 1.75    2 2.25  2.5 2.75    3 3.25  3.5 3.75    4 
##   10    4   72 3852    9 1446 3048 1930 2047 5380 1185  753  589  731  155  136 
## 4.25  4.5 4.75    5 5.25  5.5 5.75    6 6.25  6.5 6.75  7.5 7.75    8 
##   79  100   23   21   13   10    4    6    2    2    2    1    1    2
# Waterfront is a 0/1 indicator.
table(df$waterfront)
## 
##     0     1 
## 21450   163
# View takes the values 0-4.
table(df$view)
## 
##     0     1     2     3     4 
## 19489   332   963   510   319
# Condition takes the values 1-5.
table(df$condition)
## 
##     1     2     3     4     5 
##    30   172 14031  5679  1701
# Grade takes the values 1-13 (no record has grade 2).
table(df$grade)
## 
##    1    3    4    5    6    7    8    9   10   11   12   13 
##    1    3   29  242 2038 8981 6068 2615 1134  399   90   13

We can see that most houses in the “Houses” dataset have 2–6 bedrooms, 1–5 bathrooms, no waterfront, a condition of 3–5 and a grade of 5–10.

# Drop the record identifier so it does not enter cor(df) or the
# `price ~ .` model formula used further down.
df[["id"]] <- NULL

Correlation of the variables

library(corrplot)
# Correlation matrix of all remaining columns, computed once and reused for
# both the clustered circle plot and the printed table.
cor.matrix <- cor(df)
corrplot(cor.matrix, method = "circle", order = "hclust")

cor.matrix
##                     price     bedrooms   bathrooms sqft_living     sqft_lot
## price          1.00000000  0.308349598  0.52513751  0.70203505  0.089660861
## bedrooms       0.30834960  1.000000000  0.51588364  0.57667069  0.031703243
## bathrooms      0.52513751  0.515883638  1.00000000  0.75466528  0.087739662
## sqft_living    0.70203505  0.576670693  0.75466528  1.00000000  0.172825661
## sqft_lot       0.08966086  0.031703243  0.08773966  0.17282566  1.000000000
## floors         0.25679389  0.175428935  0.50065317  0.35394929 -0.005200991
## waterfront     0.26636943 -0.006582479  0.06374363  0.10381782  0.021603683
## view           0.39729349  0.079531852  0.18773702  0.28461119  0.074710106
## condition      0.03636179  0.028472104 -0.12498193 -0.05875259 -0.008958250
## grade          0.66743426  0.356966725  0.66498253  0.76270448  0.113621124
## sqft_above     0.60556730  0.477600161  0.68534248  0.87659660  0.183512281
## sqft_basement  0.32381602  0.303093375  0.28377003  0.43504297  0.015286202
## yr_built       0.05401153  0.154178069  0.50601944  0.31804877  0.053080367
## yr_renovated   0.12643379  0.018840823  0.05073898  0.05536293  0.007643505
## age           -0.05395078 -0.154323756 -0.50640694 -0.31848848 -0.052989555
##                     floors   waterfront        view   condition       grade
## price          0.256793888  0.266369434  0.39729349  0.03636179  0.66743426
## bedrooms       0.175428935 -0.006582479  0.07953185  0.02847210  0.35696673
## bathrooms      0.500653173  0.063743629  0.18773702 -0.12498193  0.66498253
## sqft_living    0.353949290  0.103817818  0.28461119 -0.05875259  0.76270448
## sqft_lot      -0.005200991  0.021603683  0.07471011 -0.00895825  0.11362112
## floors         1.000000000  0.023698320  0.02944382 -0.26376795  0.45818251
## waterfront     0.023698320  1.000000000  0.40185735  0.01665316  0.08277491
## view           0.029443820  0.401857351  1.00000000  0.04598974  0.25132058
## condition     -0.263767946  0.016653157  0.04598974  1.00000000 -0.14467367
## grade          0.458182514  0.082774914  0.25132058 -0.14467367  1.00000000
## sqft_above     0.523884710  0.072074592  0.16764934 -0.15821362  0.75592294
## sqft_basement -0.245704542  0.080587939  0.27694658  0.17410491  0.16839182
## yr_built       0.489319425 -0.026161086 -0.05343985 -0.36141656  0.44696320
## yr_renovated   0.006338401  0.092884837  0.10391729 -0.06061779  0.01441428
## age           -0.489639965  0.026092934  0.05345777  0.36066523 -0.44741524
##                sqft_above sqft_basement    yr_built yr_renovated         age
## price          0.60556730    0.32381602  0.05401153  0.126433793 -0.05395078
## bedrooms       0.47760016    0.30309338  0.15417807  0.018840823 -0.15432376
## bathrooms      0.68534248    0.28377003  0.50601944  0.050738978 -0.50640694
## sqft_living    0.87659660    0.43504297  0.31804877  0.055362927 -0.31848848
## sqft_lot       0.18351228    0.01528620  0.05308037  0.007643505 -0.05298956
## floors         0.52388471   -0.24570454  0.48931942  0.006338401 -0.48963997
## waterfront     0.07207459    0.08058794 -0.02616109  0.092884837  0.02609293
## view           0.16764934    0.27694658 -0.05343985  0.103917288  0.05345777
## condition     -0.15821362    0.17410491 -0.36141656 -0.060617787  0.36066523
## grade          0.75592294    0.16839182  0.44696320  0.014414281 -0.44741524
## sqft_above     1.00000000   -0.05194331  0.42389835  0.023284688 -0.42424753
## sqft_basement -0.05194331    1.00000000 -0.13312410  0.071322902  0.13286495
## yr_built       0.42389835   -0.13312410  1.00000000 -0.224873518 -0.99987329
## yr_renovated   0.02328469    0.07132290 -0.22487352  1.000000000  0.22448020
## age           -0.42424753    0.13286495 -0.99987329  0.224480202  1.00000000

Splitting the dataset into training and validation sets (60:40)

# Reproducible 60/40 split: draw 60% of the row labels (rounded down) for
# training and keep every remaining row, in original order, for validation.
set.seed(12345)
train.rows <- sample(rownames(df), size = floor(0.6 * nrow(df)))
train.df <- df[train.rows, ]

valid.rows <- setdiff(rownames(df), train.rows)
valid.df <- df[valid.rows, ]

Data Visualization of the “Houses” dataset

### Shared ggplot2 theme giving the visualization plots a cleaner look ###
library(ggplot2)
# No grid lines or panel background, black axis lines, white legend keys and
# 15-point text. Added as the last layer of each plot below.
cleanup <- theme(
  text = element_text(size = 15),
  legend.key = element_rect(fill = "white"),
  axis.line.x = element_line(color = "black"),
  axis.line.y = element_line(color = "black"),
  panel.background = element_blank(),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank()
)

Scatterplots

library(grid)
library(gridExtra)
library("ggplot2")

# Build a scatterplot of price against one predictor of the training data,
# with a fitted linear trend line and the shared `cleanup` theme.
#   x_var      name (string) of the train.df column plotted on the x axis
#   x_label    x-axis label
#   title      plot title
#   line_color colour of the lm trend line
# Returns the ggplot object. Columns are looked up through the `.data`
# pronoun rather than `train.df$col` inside aes(), which ggplot2 discourages.
price_scatter <- function(x_var, x_label, title, line_color = "green1") {
  ggplot(
    data = train.df,
    mapping = aes(x = .data[[x_var]], y = .data[["price"]])
  ) +
    geom_point() +
    ggtitle(title) +
    xlab(x_label) +
    ylab("Price") +
    geom_smooth(method = "lm", color = line_color) +
    theme(plot.title = element_text(hjust = 0.5)) +
    cleanup
}

s1 <- price_scatter(
  "bedrooms", "Number of Bedrooms", "Scatterplot of Price and Bedrooms "
)

s2 <- price_scatter(
  "bathrooms", "Number of Bathrooms", "Scatterplot of Price and Bathrooms "
)

s3 <- price_scatter(
  "sqft_living", "Sqft_living", "Scatterplot of Price and Sqft_living "
)

s4 <- price_scatter(
  "waterfront", "Waterfront", "Scatterplot of Price and waterfront "
)

s5 <- price_scatter(
  "floors", "Floors", "Scatterplot of Price and Floors ",
  line_color = "grey"
)

s6 <- price_scatter("view", "view", "Scatterplot of Price and view ")

# s7 used to be a copy of s6 (x = view) with an unfinished title and a
# "Number of Bedrooms" axis label. It now plots condition, the column that
# sits between view and grade in the dataset.
# NOTE(review): confirm condition is the variable that was intended here.
s7 <- price_scatter(
  "condition", "Condition", "Scatterplot of Price and Condition "
)

s8 <- price_scatter("grade", "Grade", "Scatterplot of Price and Grade ")

s9 <- price_scatter(
  "sqft_above", "Sqft_above", "Scatterplot of Price and Sqft_above ",
  line_color = "grey"
)

s10 <- price_scatter(
  "sqft_basement", "Sqft_basement", "Scatterplot of Price and Sqft_basement "
)

# Render the ten scatterplots in order; print() is what top-level
# auto-printing of a ggplot object calls.
invisible(lapply(list(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10), print))

Histograms

library("ggplot2")

# Build a 30-bin histogram of one column of the full dataset `df`, with a
# centred title and the shared `cleanup` theme.
#   x_var    name (string) of the df column to plot
#   x_label  x-axis label
#   title    plot title
#   fill     bar fill colour
# Returns the ggplot object. Columns are looked up through the `.data`
# pronoun rather than `df$col` inside aes(), which ggplot2 discourages.
county_histogram <- function(x_var, x_label, title, fill) {
  ggplot(data = df, mapping = aes(x = .data[[x_var]])) +
    geom_histogram(bins = 30, fill = fill) +
    labs(x = x_label) +
    ggtitle(title) +
    theme(plot.title = element_text(hjust = 0.5)) +
    cleanup
}

h1 <- county_histogram(
  "price", "Price of the Houses",
  "Histogram of Price of Houses in King County ", "steelblue"
)
h2 <- county_histogram(
  "bedrooms", "Number of Bedrooms",
  "Histogram of number of Bedrooms of Houses in King County ", "darkolivegreen"
)
h3 <- county_histogram(
  "bathrooms", "Number of Bathrooms",
  "Histogram of number of Bathrooms of Houses in King County ", "green1"
)
# The axis label was "Lot Size(Sqft" with the closing parenthesis missing.
h4 <- county_histogram(
  "sqft_lot", "Lot Size (Sqft)",
  "Histogram of Lot size of Houses in King County ", "tomato1"
)
h5 <- county_histogram(
  "waterfront", "Waterfront",
  "Histogram of waterfront for Houses in King County ", "red1"
)
h6 <- county_histogram(
  "yr_built", "Built Year",
  "Histogram of Year Built of Houses in King County ", "brown1"
)
h7 <- county_histogram(
  "yr_renovated", "Renovated Year",
  "Histogram of Year Renovated of Houses in King County ", "grey1"
)
h8 <- county_histogram(
  "age", "Age of the house",
  "Histogram of age of Houses in King County ", "orange1"
)

# Render the eight histograms in order; print() is what top-level
# auto-printing of a ggplot object calls.
invisible(lapply(list(h1, h2, h3, h4, h5, h6, h7, h8), print))

Violin plots (box plots combined with kernel density plots) of the discrete variables of the “Houses” dataset.

# One vioplot() call per discrete column of the full dataset, each drawn in
# its own colour.
vioplot(df$bedrooms,col="gold")

vioplot(df$bathrooms,col="brown1")

vioplot(df$view,col="yellow1")

vioplot(df$waterfront,col="green1")

vioplot(df$grade,col="tomato1")

vioplot(df$condition,col="steelblue")

Data Modelling

Linear Model

Using stepwise AIC selection (both directions, starting from the full model) as the wrapper method for the multiple linear regression model

# Fit price on every other column of the training set (the "." in the
# formula). No family is given, and the recorded summary reports a gaussian
# family, so this is an ordinary linear model.
# In the recorded summary sqft_basement has NA estimates ("not defined because
# of singularities") — presumably because sqft_living equals
# sqft_above + sqft_basement; TODO confirm.
full.model=glm(price~.,data=train.df)
summary(full.model)
## 
## Call:
## glm(formula = price ~ ., data = train.df)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1300891   -107367    -10259     87476   4171402  
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -3.692e+07  8.081e+06  -4.568 4.96e-06 ***
## bedrooms      -3.595e+04  2.497e+03 -14.396  < 2e-16 ***
## bathrooms      3.915e+04  4.390e+03   8.918  < 2e-16 ***
## sqft_living    1.666e+02  5.874e+00  28.361  < 2e-16 ***
## sqft_lot      -2.698e-01  4.334e-02  -6.225 4.96e-10 ***
## floors         2.520e+04  4.723e+03   5.336 9.65e-08 ***
## waterfront     5.424e+05  2.587e+04  20.967  < 2e-16 ***
## view           4.556e+04  2.898e+03  15.724  < 2e-16 ***
## condition      2.284e+04  3.180e+03   7.182 7.24e-13 ***
## grade          1.271e+05  2.757e+03  46.095  < 2e-16 ***
## sqft_above     1.980e+00  5.631e+00   0.352    0.725    
## sqft_basement         NA         NA      NA       NA    
## yr_built       1.782e+04  4.011e+03   4.442 8.98e-06 ***
## yr_renovated   7.016e+00  5.045e+00   1.391    0.164    
## age            2.137e+04  4.010e+03   5.328 1.01e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 45109576356)
## 
##     Null deviance: 1.6817e+15  on 12966  degrees of freedom
## Residual deviance: 5.8430e+14  on 12953  degrees of freedom
## AIC: 354926
## 
## Number of Fisher Scoring iterations: 2
# Stepwise AIC search starting from the full model. direction = "both" allows
# dropped terms to be re-added at later steps, and trace = TRUE prints the
# candidate table for every step.
step.model <- stepAIC(full.model, direction = "both", trace = TRUE)
## Start:  AIC=354925.9
## price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + 
##     waterfront + view + condition + grade + sqft_above + sqft_basement + 
##     yr_built + yr_renovated + age
## 
## 
## Step:  AIC=354925.9
## price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + 
##     waterfront + view + condition + grade + sqft_above + yr_built + 
##     yr_renovated + age
## 
##                Df   Deviance    AIC
## - sqft_above    1 5.8431e+14 354924
## - yr_renovated  1 5.8439e+14 354926
## <none>            5.8430e+14 354926
## - yr_built      1 5.8519e+14 354944
## - age           1 5.8558e+14 354952
## - floors        1 5.8559e+14 354952
## - sqft_lot      1 5.8605e+14 354963
## - condition     1 5.8663e+14 354975
## - bathrooms     1 5.8789e+14 355003
## - bedrooms      1 5.9365e+14 355130
## - view          1 5.9546e+14 355169
## - waterfront    1 6.0413e+14 355357
## - sqft_living   1 6.2059e+14 355705
## - grade         1 6.8015e+14 356893
## 
## Step:  AIC=354924
## price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + 
##     waterfront + view + condition + grade + yr_built + yr_renovated + 
##     age
## 
##                 Df   Deviance    AIC
## - yr_renovated   1 5.8440e+14 354924
## <none>             5.8431e+14 354924
## + sqft_above     1 5.8430e+14 354926
## + sqft_basement  1 5.8430e+14 354926
## - yr_built       1 5.8520e+14 354942
## - age            1 5.8559e+14 354950
## - floors         1 5.8591e+14 354958
## - sqft_lot       1 5.8606e+14 354961
## - condition      1 5.8663e+14 354973
## - bathrooms      1 5.8796e+14 355003
## - bedrooms       1 5.9367e+14 355128
## - view           1 5.9569e+14 355172
## - waterfront     1 6.0419e+14 355356
## - sqft_living    1 6.5804e+14 356463
## - grade          1 6.8265e+14 356939
## 
## Step:  AIC=354923.9
## price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + 
##     waterfront + view + condition + grade + yr_built + age
## 
##                 Df   Deviance    AIC
## <none>             5.8440e+14 354924
## + yr_renovated   1 5.8431e+14 354924
## + sqft_basement  1 5.8439e+14 354926
## + sqft_above     1 5.8439e+14 354926
## - yr_built       1 5.8527e+14 354941
## - age            1 5.8567e+14 354950
## - floors         1 5.8604e+14 354958
## - sqft_lot       1 5.8614e+14 354960
## - condition      1 5.8663e+14 354971
## - bathrooms      1 5.8824e+14 355007
## - bedrooms       1 5.9380e+14 355129
## - view           1 5.9583e+14 355173
## - waterfront     1 6.0463e+14 355363
## - sqft_living    1 6.5810e+14 356462
## - grade          1 6.8282e+14 356940
# Coefficients and fit statistics of the model selected by stepAIC().
summary(step.model)
## 
## Call:
## glm(formula = price ~ bedrooms + bathrooms + sqft_living + sqft_lot + 
##     floors + waterfront + view + condition + grade + yr_built + 
##     age, data = train.df)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1293083   -107303     -9949     87253   4170872  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.664e+07  8.077e+06  -4.537 5.77e-06 ***
## bedrooms    -3.605e+04  2.496e+03 -14.440  < 2e-16 ***
## bathrooms    3.961e+04  4.289e+03   9.234  < 2e-16 ***
## sqft_living  1.680e+02  4.157e+00  40.420  < 2e-16 ***
## sqft_lot    -2.676e-01  4.308e-02  -6.212 5.41e-10 ***
## floors       2.617e+04  4.331e+03   6.043 1.55e-09 ***
## waterfront   5.457e+05  2.577e+04  21.180  < 2e-16 ***
## view         4.550e+04  2.858e+03  15.920  < 2e-16 ***
## condition    2.204e+04  3.130e+03   7.042 1.99e-12 ***
## grade        1.273e+05  2.725e+03  46.710  < 2e-16 ***
## yr_built     1.768e+04  4.009e+03   4.410 1.04e-05 ***
## age          2.126e+04  4.009e+03   5.304 1.15e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 45109774050)
## 
##     Null deviance: 1.6817e+15  on 12966  degrees of freedom
## Residual deviance: 5.8440e+14  on 12955  degrees of freedom
## AIC: 354924
## 
## Number of Fisher Scoring iterations: 2
# Diagnostic plots for the full (pre-selection) model.
plot(full.model)

# Refit the formula chosen by stepAIC() explicitly on the training data.
final.model <- glm(
  price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
    waterfront + view + condition + grade + yr_built + age,
  data = train.df
)
summary(final.model)
## 
## Call:
## glm(formula = price ~ bedrooms + bathrooms + sqft_living + sqft_lot + 
##     floors + waterfront + view + condition + grade + yr_built + 
##     age, data = train.df)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1293083   -107303     -9949     87253   4170872  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.664e+07  8.077e+06  -4.537 5.77e-06 ***
## bedrooms    -3.605e+04  2.496e+03 -14.440  < 2e-16 ***
## bathrooms    3.961e+04  4.289e+03   9.234  < 2e-16 ***
## sqft_living  1.680e+02  4.157e+00  40.420  < 2e-16 ***
## sqft_lot    -2.676e-01  4.308e-02  -6.212 5.41e-10 ***
## floors       2.617e+04  4.331e+03   6.043 1.55e-09 ***
## waterfront   5.457e+05  2.577e+04  21.180  < 2e-16 ***
## view         4.550e+04  2.858e+03  15.920  < 2e-16 ***
## condition    2.204e+04  3.130e+03   7.042 1.99e-12 ***
## grade        1.273e+05  2.725e+03  46.710  < 2e-16 ***
## yr_built     1.768e+04  4.009e+03   4.410 1.04e-05 ***
## age          2.126e+04  4.009e+03   5.304 1.15e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 45109774050)
## 
##     Null deviance: 1.6817e+15  on 12966  degrees of freedom
## Residual deviance: 5.8440e+14  on 12955  degrees of freedom
## AIC: 354924
## 
## Number of Fisher Scoring iterations: 2
anova(final.model)      # Sequential analysis-of-deviance table for the final model (terms added first to last).
## Analysis of Deviance Table
## 
## Model: gaussian, link: identity
## 
## Response: price
## 
## Terms added sequentially (first to last)
## 
## 
##             Df   Deviance Resid. Df Resid. Dev
## NULL                          12966 1.6817e+15
## bedrooms     1 1.5603e+14     12965 1.5257e+15
## bathrooms    1 2.9956e+14     12964 1.2261e+15
## sqft_living  1 4.0230e+14     12963 8.2383e+14
## sqft_lot     1 3.5631e+12     12962 8.2027e+14
## floors       1 2.4316e+10     12961 8.2024e+14
## waterfront   1 4.3570e+13     12960 7.7667e+14
## view         1 3.1303e+13     12959 7.4537e+14
## condition    1 1.2269e+13     12958 7.3310e+14
## grade        1 6.7214e+13     12957 6.6589e+14
## yr_built     1 8.0223e+13     12956 5.8567e+14
## age          1 1.2690e+12     12955 5.8440e+14

Assumption checking

# Diagnostic plots used to check the assumptions of the final linear model.
plot(final.model)

# Predict the validation prices and score them with root-mean-squared error.
prediction_glm <- predict(final.model, newdata = valid.df, type = "response")
rmse(actual = valid.df$price, predicted = prediction_glm)
## [1] 222037

Support Vector Machine

# Linear-kernel support vector regression on the same predictors as the final
# linear model.
# The model is fitted on the training partition only. It was previously fitted
# on the full `df`, which contains every row of valid.df, so the validation
# RMSE was measured on rows the model had already seen.
# NOTE: the recorded output shown after this chunk (Call, number of support
# vectors, validation RMSE) came from that earlier full-data fit and must be
# regenerated.
model.svm <- svm(
  price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
    waterfront + view + condition + grade + yr_built + age,
  data = train.df,
  kernel = "linear"
)
summary(model.svm)
## 
## Call:
## svm(formula = price ~ bedrooms + bathrooms + sqft_living + sqft_lot + 
##     floors + waterfront + view + condition + grade + yr_built + age, 
##     data = df, kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.09090909 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  17028
# NOTE(review): e1071's plot method is aimed at classification fits — confirm
# that this call draws anything for an eps-regression model.
plot(model.svm)
# Predict the validation prices and score them with root-mean-squared error.
prediction_svm <- predict(model.svm, newdata = valid.df, type = "response")
rmse(actual = valid.df$price, predicted = prediction_svm)
## [1] 230840.7

Random Forest

# Seed the RNG so the forest is reproducible.
set.seed(12345)
# Same predictors as the final linear model.
reg.randomForest <- price ~ bedrooms + bathrooms + sqft_living + sqft_lot +
  floors + waterfront + view + condition + grade + yr_built + age
# The forest is fitted on the training partition only. It was previously
# fitted on the full `df`, which contains every row of valid.df, so the
# validation RMSE was measured on rows the forest had already seen.
# `T` is replaced by `TRUE`, since `T` is an ordinary variable that can be
# reassigned.
# NOTE: the recorded output shown after this chunk (Call, mean of squared
# residuals, % Var explained, validation RMSE) came from that earlier
# full-data fit and must be regenerated.
# NOTE(review): proximity = TRUE stores an n-by-n matrix that nothing in the
# visible code reads — consider dropping it if it is not needed elsewhere.
model.randomForest <- randomForest(
  reg.randomForest,
  data = train.df,
  ntree = 100,
  nodesize = 10,
  proximity = TRUE
)
model.randomForest
## 
## Call:
##  randomForest(formula = reg.randomForest, data = df, ntree = 100,      nodesize = 10, proximity = T) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 36590581324
##                     % Var explained: 72.85
 # Plot the fitted forest, then its variable-importance chart.
 plot(model.randomForest)

 varImpPlot(model.randomForest)

 # Predict the validation prices and score them with root-mean-squared error.
 prediction_randomForest <- predict(
   model.randomForest,
   newdata = valid.df,
   type = "response"
 )
 rmse(actual = valid.df$price, predicted = prediction_randomForest)
## [1] 122062.9
# Variance inflation factors for the predictors of the final linear model.
# In the recorded output yr_built and age both have a VIF of about 3957,
# consistent with their -0.9999 correlation in the matrix printed earlier, so
# their individual coefficients are unreliable.
# NOTE(review): consider keeping only one of the two in the model.
library(car)
vif(final.model)
##    bedrooms   bathrooms sqft_living    sqft_lot      floors  waterfront 
##    1.598268    3.115729    4.147641    1.046444    1.571236    1.170105 
##        view   condition       grade    yr_built         age 
##    1.303290    1.185598    2.945794 3956.927950 3957.136879