1. Loading the Libraries
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caTools)
2. Loading the Dataset
# Read the house-sales CSV into a data frame; text columns are loaded as
# factors. Then show the first six rows as a quick sanity check.
house <- read.csv(
  file = "C:/Users/nazir ali khan/Downloads/kc_house_data.csv",
  stringsAsFactors = TRUE
)
head(house, n = 6L)
## id date price bedrooms bathrooms sqft_living sqft_lot
## 1 7129300520 20141013T000000 221900 3 1.00 1180 5650
## 2 6414100192 20141209T000000 538000 3 2.25 2570 7242
## 3 5631500400 20150225T000000 180000 2 1.00 770 10000
## 4 2487200875 20141209T000000 604000 4 3.00 1960 5000
## 5 1954400510 20150218T000000 510000 3 2.00 1680 8080
## 6 7237550310 20140512T000000 1230000 4 4.50 5420 101930
## floors waterfront view condition grade sqft_above sqft_basement yr_built
## 1 1 0 0 3 7 1180 0 1955
## 2 2 0 0 3 7 2170 400 1951
## 3 1 0 0 3 6 770 0 1933
## 4 1 0 0 5 7 1050 910 1965
## 5 1 0 0 3 8 1680 0 1987
## 6 1 0 0 3 11 3890 1530 2001
## yr_renovated zipcode lat long sqft_living15 sqft_lot15
## 1 0 98178 47.5112 -122.257 1340 5650
## 2 1991 98125 47.7210 -122.319 1690 7639
## 3 0 98028 47.7379 -122.233 2720 8062
## 4 0 98136 47.5208 -122.393 1360 5000
## 5 0 98074 47.6168 -122.045 1800 7503
## 6 0 98053 47.6561 -122.005 4760 101930
3. Dropping Unnecessary Columns
# Discard the row identifier, the build/renovation years and the location
# fields (zipcode, lat, long); every other column is kept for modelling.
house <- select(
  house,
  -c(id, yr_built, yr_renovated, zipcode, lat, long)
)
glimpse(house)
## Rows: 21,613
## Columns: 15
## $ date <fct> 20141013T000000, 20141209T000000, 20150225T000000, 20141…
## $ price <dbl> 221900, 538000, 180000, 604000, 510000, 1230000, 257500,…
## $ bedrooms <int> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2,…
## $ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.…
## $ sqft_living <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 189…
## $ sqft_lot <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470,…
## $ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1…
## $ waterfront <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,…
## $ condition <int> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4,…
## $ grade <int> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7…
## $ sqft_above <int> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 189…
## $ sqft_basement <int> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, …
## $ sqft_living15 <int> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 23…
## $ sqft_lot15 <int> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, …
4. Dealing with Data Types
# Recode the categorical predictors as factors in a single step:
#   waterfront - integer flag turned into a factor
#   date       - timestamp text cut down to its first four characters
#                (the sale year) and then turned into a factor
house <- house %>%
  mutate(
    waterfront = as.factor(waterfront),
    date = as.factor(substr(as.character(date), start = 1, stop = 4))
  )
glimpse(house)
## Rows: 21,613
## Columns: 15
## $ date <fct> 2014, 2014, 2015, 2014, 2015, 2014, 2014, 2015, 2015, 20…
## $ price <dbl> 221900, 538000, 180000, 604000, 510000, 1230000, 257500,…
## $ bedrooms <int> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2,…
## $ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.…
## $ sqft_living <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 189…
## $ sqft_lot <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470,…
## $ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1…
## $ waterfront <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,…
## $ condition <int> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4,…
## $ grade <int> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7…
## $ sqft_above <int> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 189…
## $ sqft_basement <int> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, …
## $ sqft_living15 <int> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 23…
## $ sqft_lot15 <int> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, …
5. Splitting the Dataset into Train and Test Data
# Fix the RNG seed so the random train/test partition is reproducible.
set.seed(1000)
# Flag each row for the training share (SplitRatio = 0.75); rows flagged
# TRUE go to the training set, the rest to the test set.
split <- sample.split(house$price, SplitRatio = 0.75)
# Spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned,
# which would silently change which rows are selected.
train_house <- subset(house, split == TRUE)
test_house <- subset(house, split == FALSE)
6. Model Creation - Phase 1
6.1. Creating the Linear Model
# Fit price against every remaining column of the training set.
# The data frame is deliberately not attach()ed: lm() resolves the formula's
# variables through `data`, and attaching the full `house` table would only put
# unsplit copies of the columns on the search path where they can mask or be
# confused with the train/test subsets.
model <- lm(price ~ ., data = train_house)
print(model)
##
## Call:
## lm(formula = price ~ ., data = train_house)
##
## Coefficients:
## (Intercept) date2015 bedrooms bathrooms sqft_living
## -6.962e+05 2.526e+04 -3.863e+04 -1.579e+04 2.298e+02
## sqft_lot floors waterfront1 view condition
## 1.483e-02 -5.350e+03 5.811e+05 5.907e+04 5.474e+04
## grade sqft_above sqft_basement sqft_living15 sqft_lot15
## 9.987e+04 -3.185e+01 NA 9.892e+00 -7.929e-01
# Print the coefficient table, residual quantiles and fit statistics of the
# first model.
# NOTE(review): the output reports one coefficient as not defined because of
# singularities (sqft_basement is NA) - presumably because sqft_living equals
# sqft_above + sqft_basement; confirm against the data.
summary(model)
##
## Call:
## lm(formula = price ~ ., data = train_house)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1196321 -124582 -14687 94753 4544909
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.962e+05 2.007e+04 -34.691 < 2e-16 ***
## date2015 2.526e+04 3.870e+03 6.526 6.94e-11 ***
## bedrooms -3.863e+04 2.458e+03 -15.714 < 2e-16 ***
## bathrooms -1.579e+04 4.003e+03 -3.946 7.99e-05 ***
## sqft_living 2.298e+02 5.601e+00 41.021 < 2e-16 ***
## sqft_lot 1.483e-02 6.198e-02 0.239 0.8109
## floors -5.350e+03 4.565e+03 -1.172 0.2412
## waterfront1 5.811e+05 2.222e+04 26.155 < 2e-16 ***
## view 5.907e+04 2.780e+03 21.246 < 2e-16 ***
## condition 5.474e+04 2.902e+03 18.866 < 2e-16 ***
## grade 9.987e+04 2.722e+03 36.690 < 2e-16 ***
## sqft_above -3.185e+01 5.545e+00 -5.743 9.47e-09 ***
## sqft_basement NA NA NA NA
## sqft_living15 9.892e+00 4.415e+00 2.241 0.0251 *
## sqft_lot15 -7.929e-01 9.515e-02 -8.333 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 234300 on 16842 degrees of freedom
## Multiple R-squared: 0.6119, Adjusted R-squared: 0.6116
## F-statistic: 2043 on 13 and 16842 DF, p-value: < 2.2e-16
6.2. Predicting the Test Data
# predict.lm() reads new observations from `newdata`; an argument named `data`
# is swallowed by `...` and ignored, so the fitted values for the training rows
# would be returned instead of predictions for the test set.
predicted_house <- predict(model, newdata = test_house)
summary(predicted_house)
6.3. Checking the Residual Plot for Accuracy of Model
# Residuals on the held-out rows (observed minus predicted), so each residual
# pairs one-to-one with a test-set prediction computed above.
residuals <- test_house$price - predicted_house
# plot() draws as a side effect and returns NULL, so its result is neither
# stored nor printed.
plot(predicted_house, residuals, main = "Residual Plot", xlab = "Predicted Values", ylab = "Residuals")

7. Model Creation - Phase 2
7.1. Creating the 2nd Linear Model without Columns having p-values
greater than 5%
# Refit on the training set with four predictors removed from the full formula:
# sqft_lot, floors, sqft_basement and sqft_living15.
# In the first model's summary sqft_lot (p = 0.8109) and floors (p = 0.2412)
# were not significant and sqft_basement had no estimate (NA).
# NOTE(review): sqft_living15 had p = 0.0251 there, i.e. below the 5% threshold
# named in the section heading - confirm that dropping it is intended.
model2 <- lm(price ~ . - sqft_lot - floors - sqft_basement - sqft_living15, data = train_house)
summary(model2)
##
## Call:
## lm(formula = price ~ . - sqft_lot - floors - sqft_basement -
## sqft_living15, data = train_house)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1206835 -124138 -15065 94828 4519000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.007e+05 1.981e+04 -35.371 < 2e-16 ***
## date2015 2.534e+04 3.870e+03 6.548 6.02e-11 ***
## bedrooms -3.847e+04 2.455e+03 -15.672 < 2e-16 ***
## bathrooms -1.808e+04 3.744e+03 -4.829 1.38e-06 ***
## sqft_living 2.348e+02 5.154e+00 45.558 < 2e-16 ***
## waterfront1 5.791e+05 2.220e+04 26.080 < 2e-16 ***
## view 5.980e+04 2.759e+03 21.676 < 2e-16 ***
## condition 5.512e+04 2.874e+03 19.181 < 2e-16 ***
## grade 1.012e+05 2.583e+03 39.171 < 2e-16 ***
## sqft_above -3.275e+01 4.913e+00 -6.666 2.70e-11 ***
## sqft_lot15 -7.562e-01 6.834e-02 -11.065 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 234300 on 16845 degrees of freedom
## Multiple R-squared: 0.6118, Adjusted R-squared: 0.6115
## F-statistic: 2654 on 10 and 16845 DF, p-value: < 2.2e-16
7.2 Predicting the Test Data Again
# predict.lm() reads new observations from `newdata`; an argument named `data`
# is swallowed by `...` and ignored, so the fitted values for the training rows
# would be returned instead of predictions for the test set.
predicted_house2 <- predict(model2, newdata = test_house)
summary(predicted_house2)
7.3 Checking the Residual Plot for Accuracy of Model
# Residuals on the held-out rows (observed minus predicted), so each residual
# pairs one-to-one with a test-set prediction computed above.
residuals2 <- test_house$price - predicted_house2
# plot() draws as a side effect and returns NULL, so its result is neither
# stored nor printed.
plot(predicted_house2, residuals2, main = "Residual Plot", xlab = "Predicted Values", ylab = "Residuals")

8. Comparing the Residual Plots
# Draw the two residual plots side by side for comparison.
# par() returns the previous graphics settings; they are restored afterwards so
# later plots are not forced into the two-panel layout.
old_par <- par(mfrow = c(1, 2))
# Distinct titles so the panels can be told apart.
plot(predicted_house, residuals, main = "Residual Plot - Model 1", xlab = "Predicted Values", ylab = "Residuals")
plot(predicted_house2, residuals2, main = "Residual Plot - Model 2", xlab = "Predicted Values", ylab = "Residuals")
par(old_par)
