1. Loading the Libraries

library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caTools)

2. Loading the Dataset

# Load the housing data from a local CSV; character columns are read as factors.
# NOTE(review): this is an absolute, machine-specific path.
csv_path <- "C:/Users/nazir ali khan/Downloads/kc_house_data.csv"
house <- read.csv(file = csv_path, stringsAsFactors = TRUE)

# Preview the first six rows.
head(house, n = 6L)
##           id            date   price bedrooms bathrooms sqft_living sqft_lot
## 1 7129300520 20141013T000000  221900        3      1.00        1180     5650
## 2 6414100192 20141209T000000  538000        3      2.25        2570     7242
## 3 5631500400 20150225T000000  180000        2      1.00         770    10000
## 4 2487200875 20141209T000000  604000        4      3.00        1960     5000
## 5 1954400510 20150218T000000  510000        3      2.00        1680     8080
## 6 7237550310 20140512T000000 1230000        4      4.50        5420   101930
##   floors waterfront view condition grade sqft_above sqft_basement yr_built
## 1      1          0    0         3     7       1180             0     1955
## 2      2          0    0         3     7       2170           400     1951
## 3      1          0    0         3     6        770             0     1933
## 4      1          0    0         5     7       1050           910     1965
## 5      1          0    0         3     8       1680             0     1987
## 6      1          0    0         3    11       3890          1530     2001
##   yr_renovated zipcode     lat     long sqft_living15 sqft_lot15
## 1            0   98178 47.5112 -122.257          1340       5650
## 2         1991   98125 47.7210 -122.319          1690       7639
## 3            0   98028 47.7379 -122.233          2720       8062
## 4            0   98136 47.5208 -122.393          1360       5000
## 5            0   98074 47.6168 -122.045          1800       7503
## 6            0   98053 47.6561 -122.005          4760     101930

3. Dropping Unnecessary Columns

# Remove the identifier, the build/renovation years and the location columns
# so they are not offered to the model as predictors.
house <- select(
  house,
  -c(id, yr_built, yr_renovated, zipcode, lat, long)
)

# Compact overview of the remaining columns and their types.
glimpse(house)
## Rows: 21,613
## Columns: 15
## $ date          <fct> 20141013T000000, 20141209T000000, 20150225T000000, 20141…
## $ price         <dbl> 221900, 538000, 180000, 604000, 510000, 1230000, 257500,…
## $ bedrooms      <int> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2,…
## $ bathrooms     <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.…
## $ sqft_living   <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 189…
## $ sqft_lot      <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470,…
## $ floors        <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1…
## $ waterfront    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,…
## $ condition     <int> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4,…
## $ grade         <int> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7…
## $ sqft_above    <int> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 189…
## $ sqft_basement <int> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, …
## $ sqft_living15 <int> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 23…
## $ sqft_lot15    <int> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, …

4. Dealing with Data Types

# Recode two columns as categorical variables in a single step:
#   * waterfront: 0/1 indicator -> factor
#   * date: keep only the first four characters (the year) of the
#     "YYYYMMDDT000000" stamp, then treat the year as a factor
house <- house %>%
  mutate(
    waterfront = as.factor(waterfront),
    date = as.factor(substr(as.character(date), start = 1, stop = 4))
  )

glimpse(house)
## Rows: 21,613
## Columns: 15
## $ date          <fct> 2014, 2014, 2015, 2014, 2015, 2014, 2014, 2015, 2015, 20…
## $ price         <dbl> 221900, 538000, 180000, 604000, 510000, 1230000, 257500,…
## $ bedrooms      <int> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2,…
## $ bathrooms     <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.…
## $ sqft_living   <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 189…
## $ sqft_lot      <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470,…
## $ floors        <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1…
## $ waterfront    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,…
## $ condition     <int> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4,…
## $ grade         <int> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7…
## $ sqft_above    <int> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 189…
## $ sqft_basement <int> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, …
## $ sqft_living15 <int> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 23…
## $ sqft_lot15    <int> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, …

5. Splitting the Dataset into Train and Test Data

# Fix the RNG state so the partition is reproducible.
set.seed(1000)

# sample.split() returns a logical vector; TRUE marks the rows assigned to the
# training portion (SplitRatio = 0.75 requests roughly three quarters).
# NOTE(review): sample.split() balances on the distinct values of its first
# argument, which here is the continuous price - confirm this is intended.
split <- sample.split(house$price, SplitRatio = 0.75)

# Index with the logical vector itself instead of comparing against T / F,
# which are ordinary variables that can be reassigned.
train_house <- house[split, ]
test_house <- house[!split, ]

6. Model Creation - Phase 1

6.1. Creating the Linear Model

# Fit an ordinary least-squares model of price on every other column.
# All variables are resolved through `data = train_house`, so the data frame is
# deliberately not attach()ed: attaching `house` would place the columns of the
# full dataset (training and test rows alike) on the search path, where they
# can silently shadow, or be shadowed by, other objects.
model <- lm(price ~ ., data = train_house)

# Print the call and the raw coefficient estimates.
print(model)
## 
## Call:
## lm(formula = price ~ ., data = train_house)
## 
## Coefficients:
##   (Intercept)       date2015       bedrooms      bathrooms    sqft_living  
##    -6.962e+05      2.526e+04     -3.863e+04     -1.579e+04      2.298e+02  
##      sqft_lot         floors    waterfront1           view      condition  
##     1.483e-02     -5.350e+03      5.811e+05      5.907e+04      5.474e+04  
##         grade     sqft_above  sqft_basement  sqft_living15     sqft_lot15  
##     9.987e+04     -3.185e+01             NA      9.892e+00     -7.929e-01
# Full coefficient table (estimates, standard errors, t-tests) plus the
# residual summary, R-squared and overall F-statistic of the first model.
summary(model)
## 
## Call:
## lm(formula = price ~ ., data = train_house)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1196321  -124582   -14687    94753  4544909 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -6.962e+05  2.007e+04 -34.691  < 2e-16 ***
## date2015       2.526e+04  3.870e+03   6.526 6.94e-11 ***
## bedrooms      -3.863e+04  2.458e+03 -15.714  < 2e-16 ***
## bathrooms     -1.579e+04  4.003e+03  -3.946 7.99e-05 ***
## sqft_living    2.298e+02  5.601e+00  41.021  < 2e-16 ***
## sqft_lot       1.483e-02  6.198e-02   0.239   0.8109    
## floors        -5.350e+03  4.565e+03  -1.172   0.2412    
## waterfront1    5.811e+05  2.222e+04  26.155  < 2e-16 ***
## view           5.907e+04  2.780e+03  21.246  < 2e-16 ***
## condition      5.474e+04  2.902e+03  18.866  < 2e-16 ***
## grade          9.987e+04  2.722e+03  36.690  < 2e-16 ***
## sqft_above    -3.185e+01  5.545e+00  -5.743 9.47e-09 ***
## sqft_basement         NA         NA      NA       NA    
## sqft_living15  9.892e+00  4.415e+00   2.241   0.0251 *  
## sqft_lot15    -7.929e-01  9.515e-02  -8.333  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 234300 on 16842 degrees of freedom
## Multiple R-squared:  0.6119, Adjusted R-squared:  0.6116 
## F-statistic:  2043 on 13 and 16842 DF,  p-value: < 2.2e-16

6.2. Predicting the Test Data

# Score the held-out rows. predict() reads the scoring data from `newdata`;
# the previous `data = test_house` was swallowed by `...` and ignored, so the
# call returned the fitted values of the training set instead.
# NOTE(review): this model contains an aliased term (sqft_basement), so
# predict() may warn about a rank-deficient fit.
predicted_house <- predict(model, newdata = test_house)
summary(predicted_house)
## (output not shown: re-run this chunk. The summary printed here previously
## described the training-set fitted values, not test-set predictions.)

6.3. Checking the Residual Plot for Accuracy of Model

# Residuals on the test set: observed price minus predicted price.
# (model$residuals belongs to the training rows and does not line up with
# the test-set predictions.)
residuals <- test_house$price - predicted_house

# plot() draws as a side effect and returns NULL, so there is nothing useful
# to assign or print.
plot(predicted_house, residuals, main = "Residual Plot", xlab = "Predicted Values", ylab = "Residuals")

7. Model Creation - Phase 2

7.1. Creating the 2nd Linear Model without Columns that are Aliased or have p-values greater than 1%

# Refit on the training data after removing four predictors from the full
# formula: sqft_lot (p = 0.8109) and floors (p = 0.2412), which were not
# significant in the first fit; sqft_basement, whose coefficient was NA
# (aliased); and sqft_living15.
# NOTE(review): sqft_living15 had p = 0.0251 in the first fit, i.e. below 5%;
# it is dropped here anyway - confirm the intended significance threshold.
model2 <- lm(price ~ . - sqft_lot - floors - sqft_basement - sqft_living15, data = train_house)
# Coefficient table and fit statistics of the reduced model.
summary(model2)
## 
## Call:
## lm(formula = price ~ . - sqft_lot - floors - sqft_basement - 
##     sqft_living15, data = train_house)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1206835  -124138   -15065    94828  4519000 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.007e+05  1.981e+04 -35.371  < 2e-16 ***
## date2015     2.534e+04  3.870e+03   6.548 6.02e-11 ***
## bedrooms    -3.847e+04  2.455e+03 -15.672  < 2e-16 ***
## bathrooms   -1.808e+04  3.744e+03  -4.829 1.38e-06 ***
## sqft_living  2.348e+02  5.154e+00  45.558  < 2e-16 ***
## waterfront1  5.791e+05  2.220e+04  26.080  < 2e-16 ***
## view         5.980e+04  2.759e+03  21.676  < 2e-16 ***
## condition    5.512e+04  2.874e+03  19.181  < 2e-16 ***
## grade        1.012e+05  2.583e+03  39.171  < 2e-16 ***
## sqft_above  -3.275e+01  4.913e+00  -6.666 2.70e-11 ***
## sqft_lot15  -7.562e-01  6.834e-02 -11.065  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 234300 on 16845 degrees of freedom
## Multiple R-squared:  0.6118, Adjusted R-squared:  0.6115 
## F-statistic:  2654 on 10 and 16845 DF,  p-value: < 2.2e-16

7.2 Predicting the Test Data Again

# Score the held-out rows with the reduced model. predict() reads the scoring
# data from `newdata`; the previous `data = test_house` was swallowed by `...`
# and ignored, so the call returned the training-set fitted values instead.
predicted_house2 <- predict(model2, newdata = test_house)
summary(predicted_house2)
## (output not shown: re-run this chunk. The summary printed here previously
## described the training-set fitted values, not test-set predictions.)

7.3 Checking the Residual Plot for Accuracy of Model

# Residuals on the test set: observed price minus predicted price.
# (model2$residuals belongs to the training rows and does not line up with
# the test-set predictions.)
residuals2 <- test_house$price - predicted_house2

# plot() draws as a side effect and returns NULL, so there is nothing useful
# to assign or print.
plot(predicted_house2, residuals2, main = "Residual Plot", xlab = "Predicted Values", ylab = "Residuals")

8. Comparing the Residual Plots

# Draw the two residual plots side by side. par() returns the previous
# settings, which are restored afterwards so later plots are not forced into
# the two-panel layout.
old_par <- par(mfrow = c(1, 2))

# Give each panel its own title so the two models can be told apart.
plot(predicted_house, residuals, main = "Residual Plot - Model 1", xlab = "Predicted Values", ylab = "Residuals")
plot(predicted_house2, residuals2, main = "Residual Plot - Model 2", xlab = "Predicted Values", ylab = "Residuals")

par(old_par)