Read file
# Read the house data from kc_house_data.csv; text columns are loaded as
# factors. TRUE is written out in full because `T` is an ordinary variable
# that can be reassigned, whereas TRUE is a reserved word.
data <- read.csv(
  "kc_house_data.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
Omit missing data
# Keep only the rows that have no missing value in any column.
data <- stats::na.omit(data)
Removing Unnecessary Variables
# Drop the columns that are not used in the rest of the analysis.
data1 <- subset(
  data,
  select = -c(id, sqft_living, sqft_lot, zipcode, lat, long, view)
)
Removing Date Variable and calculating the age of house
# Drop the first column by position (the notes describe it as the date
# variable -- confirm against the CSV layout), keeping a data frame.
data2 <- data1[-1]
# House age = year of sale minus year built.
data2$house_age <- with(data2, year_sale - yr_built)
Some ‘Age’ values are negative, so we add 2 to make ‘Age’ positive
# Shift every age up by 2.
data2$house_age <- data2$house_age + 2
Removing unnecessary variables
# The two year columns were only needed to derive house_age; remove them.
data3 <- subset(
  data2,
  select = -c(yr_built, year_sale)
)
Loading library DPLYR
library(dplyr)
Converting ‘Renovated’ variable as factor and renaming the levels as ‘Yes’ and ‘No’
# Add a two-level factor `renovated`: a positive renovation year maps to
# "Yes" and a zero maps to "No"; any other value is left as NA by case_when().
data3 <- data3 %>%
  mutate(
    renovated = as.factor(
      case_when(
        yr_renovated > 0 ~ "Yes",
        yr_renovated == 0 ~ "No"
      )
    )
  )
Removing unnecessary variables
# yr_renovated is now represented by the `renovated` factor; drop it.
data4 <- subset(
  data3,
  select = -c(yr_renovated)
)
Converting ‘Waterfront’ variable as factor and renaming the levels
# Recode the 0/1 waterfront indicator as a labelled factor (0 = "No", 1 = "Yes").
data4[["waterfront"]] <- factor(
  data4[["waterfront"]],
  levels = c(0, 1),
  labels = c("No", "Yes")
)
Load library ‘ggcorrplot’ and plot correlation of all continuous variables
library(ggcorrplot)
# Keep only the numeric columns; factors cannot be passed to cor().
# A base-R column filter replaces the superseded dplyr::select_if().
data.continuous <- data4[vapply(data4, is.numeric, logical(1))]
# Pairwise correlation matrix of the numeric columns.
cor.matrix <- cor(data.continuous)
# Lower-triangle correlation plot, variables ordered by hierarchical
# clustering, with the coefficients printed in the cells. TRUE is written
# out in full because `T` can be reassigned.
ggcorrplot(cor.matrix, type = "lower", hc.order = TRUE, lab = TRUE)

Removing outliers from ‘Price’ variable using boxplot method
# Draw the boxplot of price and collect the values it flags as outliers.
outlier <- boxplot(data4$price)$out

# Logical mask of the rows whose price is one of the flagged values.
is_outlier <- data4$price %in% outlier
outlier_data <- data4[is_outlier, ]
# Negate the mask instead of using -which(...): when no outliers are found,
# -which(...) is a zero-length index and would select zero rows rather than
# keeping every row.
data5 <- data4[!is_outlier, ]
Set seed and create training data and test data
# Fix the RNG seed so the split is reproducible.
set.seed(12345)
n_rows <- nrow(data5)
# Sample 80% of the row numbers for training. seq_len() is safe for a
# zero-row frame (1:0 would give c(1, 0)), and floor() makes explicit the
# truncation that sample() otherwise applies silently to a fractional size.
index <- sample(seq_len(n_rows), size = floor(0.80 * n_rows))
train_data <- data5[index, ]
# The remaining rows form the test set. setdiff() is used instead of
# data5[-index, ] because a zero-length negative index selects no rows.
test_data <- data5[setdiff(seq_len(n_rows), index), ]
Generate Linear model using lm function
# Strongly prefer fixed over scientific notation in printed output.
options(scipen = 100)
# Ordinary least squares: price regressed on every other column of train_data.
model1 <- lm(price ~ ., data = train_data)
# Coefficient table, residual summary and fit statistics.
summary(model1)
##
## Call:
## lm(formula = price ~ ., data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -546128 -92099 -6929 80846 717683
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -718698.71159 13061.97029 -55.022 < 0.0000000000000002 ***
## bedrooms -13961.91598 1480.43064 -9.431 < 0.0000000000000002 ***
## bathrooms 16454.80290 2191.45882 7.509 0.00000000000006284 ***
## floors 58195.68658 2789.03400 20.866 < 0.0000000000000002 ***
## waterfrontYes 150730.68257 18802.45368 8.017 0.00000000000000116 ***
## condition 19949.03995 1816.67189 10.981 < 0.0000000000000002 ***
## grade 92044.86860 1679.38800 54.809 < 0.0000000000000002 ***
## sqft_above 57.19854 3.11098 18.386 < 0.0000000000000002 ***
## sqft_basement 98.78673 3.50041 28.221 < 0.0000000000000002 ***
## sqft_living15 60.05082 2.81401 21.340 < 0.0000000000000002 ***
## sqft_lot15 -0.14399 0.04125 -3.491 0.000483 ***
## house_age 2617.92563 52.72464 49.653 < 0.0000000000000002 ***
## renovatedYes 10626.08345 5943.40565 1.788 0.073814 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 136000 on 16348 degrees of freedom
## Multiple R-squared: 0.5718, Adjusted R-squared: 0.5715
## F-statistic: 1820 on 12 and 16348 DF, p-value: < 0.00000000000000022
# Arrange the plotting device as a 2 x 2 grid, then draw the default lm
# diagnostic plots for the fitted model into it.
par(mfrow = c(2, 2))
plot(model1)

Load library car for checking the Variance Inflation Factor (VIF)
library(car)
# Variance inflation factor of each predictor in model1.
car::vif(model1)
## bedrooms bathrooms floors waterfront condition
## 1.618321 2.845416 1.979201 1.009768 1.214990
## grade sqft_above sqft_basement sqft_living15 sqft_lot15
## 2.676297 4.514739 1.828056 2.652167 1.074721
## house_age renovated
## 2.097744 1.123333
Load library lm.beta to check the standardized regression coefficients
library(lm.beta)
# Standardized coefficients of model1.
lm.beta::lm.beta(model1)
##
## Call:
## lm(formula = price ~ ., data = train_data)
##
## Standardized Coefficients::
## (Intercept) bedrooms bathrooms floors waterfrontYes
## NA -0.061398244 0.064818492 0.150227232 0.041225374
## condition grade sqft_above sqft_basement sqft_living15
## 0.061943893 0.458862285 0.199926856 0.195272377 0.177852762
## sqft_lot15 house_age renovatedYes
## -0.018520771 0.368032885 0.009697477
Breusch-Pagan test for checking heteroscedasticity
library(lmtest)
# H0: the variance of the residuals is constant
# Ha: the variance of the residuals is not constant
lmtest::bptest(model1)
##
## studentized Breusch-Pagan test
##
## data: model1
## BP = 732.25, df = 12, p-value < 0.00000000000000022
# Since the p-value is < 0.05, we reject the null hypothesis: the variance of the residuals is not constant.
# Hence, heteroscedasticity is present.
library(car)
# Score test for non-constant error variance of model1.
car::ncvTest(model1)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 851.7915, Df = 1, p = < 0.000000000000000222
Use the model to predict the ‘Price’ of the test data and compare the actual and predicted prices
# Store model1's predictions for the held-out rows as a new column.
test_data$Predicted <- predict(model1, newdata = test_data)
# Side-by-side table of actual and predicted prices.
test_comparison <- data.frame(
  Actual_price = test_data$price,
  Predicted_Price = test_data$Predicted
)
# Show the first ten rows.
head(test_comparison, n = 10)
## Actual_price Predicted_Price
## 1 291850 334366.0
## 2 310000 487125.6
## 3 385000 428928.8
## 4 233000 366961.9
## 5 667000 650602.6
## 6 322500 399117.2
## 7 696000 667469.4
## 8 785000 680987.1
## 9 920000 707694.0
## 10 885000 612687.8
Use MAPE (Mean Absolute Percentage Error) for checking the error of the model
# Mean absolute percentage error on the test set, as a fraction (it is not
# multiplied by 100 here).
MAPE <- with(
  test_comparison,
  mean(abs(Actual_price - Predicted_Price) / Actual_price)
)
MAPE
## [1] 0.2585842
# Complement of MAPE, expressed as a percentage.
100 * (1 - MAPE)
## [1] 74.14158
Calculate RMSE (Root Mean Squared Error)
# Root mean squared error on the test set: square each residual, average the
# squares, then take the square root. The squaring must happen inside mean().
# Squaring the mean residual instead gives |mean error|, where positive and
# negative errors cancel, so any value recorded from that form is not an RMSE
# and must be regenerated.
RMSE <- sqrt(
  mean((test_comparison$Actual_price - test_comparison$Predicted_Price)^2)
)
RMSE
## [1] 985.6164