Read file

# Read kc_house_data.csv into a data frame; character columns become factors.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data <- read.csv(
  "kc_house_data.csv",
  header = TRUE,
  stringsAsFactors = TRUE,
  sep = ","
)

Omit missing data

# Keep only the rows that have no missing value in any column.
data <- stats::na.omit(data)

Removing Unnecessary Variables

# Work on a copy of the data without the columns listed below.
data1 <- subset(
  data,
  select = -c(id, sqft_living, sqft_lot, zipcode, lat, long, view)
)

Converting Date to Standard Format

# Parse the raw date values (year, month, day followed by a literal
# "T000000" suffix) into Date objects.
data1$date <- as.Date(data1$date, format = "%Y%m%dT000000")

Extracting Year from Date Variable

# Four-digit year of the sale, stored as a number.
data1$year_sale <- as.numeric(format(data1$date, format = "%Y"))

Removing Date Variable and calculating the age of house

# Drop the date column by name: a positional index would silently remove the
# wrong column if the column order ever changed. The sale year is already
# stored in year_sale.
data2 <- subset(data1, select = -date)
# House age at the time of sale: sale year minus construction year.
data2$house_age <- data2$year_sale - data2$yr_built

Since ‘house_age’ contains negative values, we add 2 to every value to make it positive

# Shift every age up by a constant 2 (the accompanying text says the raw ages
# include negative values).
data2$house_age <- data2$house_age + 2

Removing unnecessary variables

# The two year columns are no longer needed once house_age exists.
data3 <- subset(data2, select = c(-yr_built, -year_sale))

Loading library DPLYR

library(dplyr)

Converting ‘Renovated’ variable as factor and renaming the levels as ‘Yes’ and ‘No’

# Flag renovated houses: a positive yr_renovated maps to "Yes", zero to "No".
# The column is referenced through mutate()'s data mask rather than
# data3$yr_renovated, so the expression always sees the rows being mutated.
data3 <- data3 %>%
  mutate(
    renovated = as.factor(case_when(
      yr_renovated > 0 ~ "Yes",
      yr_renovated == 0 ~ "No"
    ))
  )

Removing unnecessary variables

# yr_renovated is now represented by the renovated factor, so drop it.
data4 <- subset(data3, select = -yr_renovated)

Converting ‘Waterfront’ variable as factor and renaming the levels

# Recode the 0/1 waterfront indicator as a factor labelled "No"/"Yes".
data4$waterfront <- factor(
  data4$waterfront,
  levels = c(0, 1),
  labels = c("No", "Yes")
)

Load library ‘ggcorrplot’ and plot correlation of all continuous variables

library(ggcorrplot)
# Keep only the numeric columns. select(where(...)) replaces the superseded
# select_if().
data.continuous <- data4 %>% dplyr::select(where(is.numeric))
# Pairwise correlations between the numeric columns.
cor.matrix <- cor(data.continuous)
# Plot the lower triangle with the coefficients printed in each cell; TRUE is
# spelled out because T can be reassigned.
ggcorrplot(cor.matrix, type = "lower", hc.order = TRUE, lab = TRUE)

Removing outliers from ‘Price’ variable using boxplot method

# Prices that boxplot() reports as outliers (this call also draws the boxplot).
outlier <- boxplot(data4$price)$out

# Use a logical mask rather than -which(...): when there are no outliers,
# which() returns integer(0) and data4[-integer(0), ] would drop every row
# instead of keeping them all.
is_outlier <- data4$price %in% outlier
outlier_data <- data4[is_outlier, ]
data5 <- data4[!is_outlier, ]

Set seed and create training data and test data

# Reproducible 80/20 train/test split of data5.
set.seed(12345)
n_rows <- nrow(data5)
# seq_len() is safe for a zero-row frame (1:0 would give c(1, 0)), and floor()
# makes the truncation of the fractional sample size explicit.
index <- sample(seq_len(n_rows), floor(0.80 * n_rows))
train_data <- data5[index, ]
# Take the complement by position: data5[-index, ] would return zero rows if
# index were empty.
test_data <- data5[setdiff(seq_len(n_rows), index), ]

Generate Linear model using lm function

# Strongly prefer fixed over scientific notation in printed numbers.
options(scipen = 100)
# Regress price on every other column of the training data.
model1 <- lm(formula = price ~ ., data = train_data)
summary(model1)
## 
## Call:
## lm(formula = price ~ ., data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -546128  -92099   -6929   80846  717683 
## 
## Coefficients:
##                    Estimate    Std. Error t value             Pr(>|t|)    
## (Intercept)   -718698.71159   13061.97029 -55.022 < 0.0000000000000002 ***
## bedrooms       -13961.91598    1480.43064  -9.431 < 0.0000000000000002 ***
## bathrooms       16454.80290    2191.45882   7.509  0.00000000000006284 ***
## floors          58195.68658    2789.03400  20.866 < 0.0000000000000002 ***
## waterfrontYes  150730.68257   18802.45368   8.017  0.00000000000000116 ***
## condition       19949.03995    1816.67189  10.981 < 0.0000000000000002 ***
## grade           92044.86860    1679.38800  54.809 < 0.0000000000000002 ***
## sqft_above         57.19854       3.11098  18.386 < 0.0000000000000002 ***
## sqft_basement      98.78673       3.50041  28.221 < 0.0000000000000002 ***
## sqft_living15      60.05082       2.81401  21.340 < 0.0000000000000002 ***
## sqft_lot15         -0.14399       0.04125  -3.491             0.000483 ***
## house_age        2617.92563      52.72464  49.653 < 0.0000000000000002 ***
## renovatedYes    10626.08345    5943.40565   1.788             0.073814 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 136000 on 16348 degrees of freedom
## Multiple R-squared:  0.5718, Adjusted R-squared:  0.5715 
## F-statistic:  1820 on 12 and 16348 DF,  p-value: < 0.00000000000000022
# Draw the diagnostic plots for model1 on a 2x2 grid, then restore the
# previous graphics settings so later plots are not forced into that layout.
old_par <- par(mfrow = c(2, 2))
plot(model1)
par(old_par)

Load library car for checking the Variance Inflation Factor (VIF)

library(car)
# Variance inflation factor of each predictor in model1.
car::vif(model1)
##      bedrooms     bathrooms        floors    waterfront     condition 
##      1.618321      2.845416      1.979201      1.009768      1.214990 
##         grade    sqft_above sqft_basement sqft_living15    sqft_lot15 
##      2.676297      4.514739      1.828056      2.652167      1.074721 
##     house_age     renovated 
##      2.097744      1.123333

Load library lm.beta to obtain the standardized regression coefficients

library(lm.beta)
# Standardized coefficients of model1.
lm.beta::lm.beta(model1)
## 
## Call:
## lm(formula = price ~ ., data = train_data)
## 
## Standardized Coefficients::
##   (Intercept)      bedrooms     bathrooms        floors waterfrontYes 
##            NA  -0.061398244   0.064818492   0.150227232   0.041225374 
##     condition         grade    sqft_above sqft_basement sqft_living15 
##   0.061943893   0.458862285   0.199926856   0.195272377   0.177852762 
##    sqft_lot15     house_age  renovatedYes 
##  -0.018520771   0.368032885   0.009697477

Breusch-Pagan test for checking heteroscedasticity

library(lmtest)
# H0: the residual variance is constant.
# Ha: the residual variance is not constant.
lmtest::bptest(model1)
## 
##  studentized Breusch-Pagan test
## 
## data:  model1
## BP = 732.25, df = 12, p-value < 0.00000000000000022
# The Breusch-Pagan p-value is below 0.05, so the null hypothesis is rejected:
# the residual variance is not constant, i.e. heteroscedasticity is present.
# Run car's non-constant variance score test on the same model.
library(car)
car::ncvTest(model1)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 851.7915, Df = 1, p = < 0.000000000000000222

Use the model to predict the ‘Price’ of the test data and compare the actual and predicted prices

# Predict prices for the held-out rows and store them next to the inputs.
test_data$Predicted <- predict(model1, newdata = test_data)
# Actual and predicted price side by side.
test_comparison <- data.frame(
  Actual_price = test_data$price,
  Predicted_Price = test_data$Predicted
)
# Show the first ten rows.
head(test_comparison, n = 10)
##    Actual_price Predicted_Price
## 1        291850        334366.0
## 2        310000        487125.6
## 3        385000        428928.8
## 4        233000        366961.9
## 5        667000        650602.6
## 6        322500        399117.2
## 7        696000        667469.4
## 8        785000        680987.1
## 9        920000        707694.0
## 10       885000        612687.8

Use MAPE (Mean Absolute Percentage Error) for checking the error of the model

# Mean absolute percentage error, as a fraction: the average of
# |actual - predicted| / actual over the test rows.
MAPE <- with(
  test_comparison,
  mean(abs(Actual_price - Predicted_Price) / Actual_price)
)
MAPE
## [1] 0.2585842
# Complement of MAPE, expressed as a percentage.
100 * (1 - MAPE)
## [1] 74.14158

Calculate RMSE (Root Mean Squared Error)

# Root mean squared error: square each prediction error first, then average,
# then take the square root. Squaring after mean() (sqrt(mean(err)^2)) only
# yields the absolute value of the mean error, because positive and negative
# errors cancel before squaring.
RMSE <- sqrt(mean((test_comparison$Actual_price - test_comparison$Predicted_Price)^2))
RMSE
## (re-run to refresh this output: the previously printed 985.6164 was |mean error|, not the RMSE)