price represents the sale price of a house in Rs.; area gives the total size of a property in square feet; bedrooms represents the number of bedrooms; bathrooms shows the number of bathrooms; stories shows the number of stories excluding the basement; mainroad = 1 if the house faces a main road; guestroom = 1 if the house has a separate living room or a drawing room for guests; basement shows if the house has a basement; hotwaterheating = 1 if the house uses gas for hot water heating; airconditioning = 1 if there is central air conditioning; parking shows the number of cars that can be parked; prefarea = 1 if the house is located in the preferred neighbourhood of the city.
# Import the housing data from the local CSV file URL.
data <- read.csv(
  file = "file:///C:/Users/badal/Desktop/datset_/usecase/Housing.csv"
)
# Preview the first rows, then check whether any value is missing.
head(x = data)
any(is.na(data))
[1] FALSE
# Inspect the column types and first values of the loaded data frame.
str(data)
'data.frame': 545 obs. of 13 variables:
$ price : int 13300000 12250000 12250000 12215000 11410000 10850000 10150000 10150000 9870000 9800000 ...
$ area : int 7420 8960 9960 7500 7420 7500 8580 16200 8100 5750 ...
$ bedrooms : int 4 4 3 4 4 3 4 5 4 3 ...
$ bathrooms : int 2 4 2 2 1 3 3 3 1 2 ...
$ stories : int 3 4 2 2 2 1 4 2 2 4 ...
$ mainroad : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
$ guestroom : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 1 1 2 2 ...
$ basement : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 2 1 ...
$ hotwaterheating : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
$ airconditioning : Factor w/ 2 levels "no","yes": 2 2 1 2 2 2 2 1 2 2 ...
$ parking : int 2 3 2 3 2 2 2 0 2 1 ...
$ prefarea : Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 2 1 2 2 ...
$ furnishingstatus: Factor w/ 3 levels "furnished","semi-furnished",..: 1 1 2 1 1 2 2 3 1 3 ...
library(dplyr)
# Relabel the categorical columns. The work is done inside local() so the
# helper vector of column names does not leak into the workspace.
data <- local({
  recoded <- data

  # Two-level columns: relabel the two levels as "No" / "Yes".
  yes_no_cols <- c(
    "mainroad", "guestroom", "basement",
    "hotwaterheating", "airconditioning", "prefarea"
  )
  recoded[yes_no_cols] <- lapply(
    recoded[yes_no_cols],
    factor,
    labels = c("No", "Yes")
  )

  # parking: the four distinct values become named categories, assigned in
  # sorted order of the raw values.
  recoded$parking <- factor(
    recoded$parking,
    labels = c("No", "Small", "Medium", "Large")
  )

  # furnishingstatus: three-level factor with its labels spelled out.
  recoded$furnishingstatus <- factor(
    recoded$furnishingstatus,
    labels = c("furnished", "semi-furnished", "unfurnished")
  )

  recoded
})
# Count of houses per furnishing status.
summary(factor(data$furnishingstatus))
furnished semi-furnished unfurnished
140 227 178
# One-hot encode furnishingstatus. model.matrix() yields an intercept column
# plus one 0/1 indicator per non-reference level; data.frame() turns the
# hyphen in "semi-furnished" into a dot, giving the column names
# furnishingstatussemi.furnished and furnishingstatusunfurnished.
fst <- data.frame(model.matrix(~ furnishingstatus, data))
# Preview only the first rows instead of printing the whole matrix.
head(fst)
# Drop the intercept column, keeping only the indicator columns.
fst <- fst[, -1, drop = FALSE]
# Replace the original factor with its indicators. The column is selected by
# name rather than by position (previously a hard-coded -13), so this step
# still works if columns are added or reordered upstream.
data <- cbind(data[, names(data) != "furnishingstatus", drop = FALSE], fst)
summary(data)
price area bedrooms bathrooms stories mainroad guestroom basement hotwaterheating airconditioning parking prefarea
Min. : 1750000 Min. : 1650 Min. :1.000 Min. :1.000 Min. :1.000 No : 77 No :448 No :354 No :520 No :373 No :299 No :417
1st Qu.: 3430000 1st Qu.: 3600 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000 Yes:468 Yes: 97 Yes:191 Yes: 25 Yes:172 Small :126 Yes:128
Median : 4340000 Median : 4600 Median :3.000 Median :1.000 Median :2.000 Medium:108
Mean : 4766729 Mean : 5151 Mean :2.965 Mean :1.286 Mean :1.806 Large : 12
3rd Qu.: 5740000 3rd Qu.: 6360 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:2.000
Max. :13300000 Max. :16200 Max. :6.000 Max. :4.000 Max. :4.000
furnishingstatussemi.furnished furnishingstatusunfurnished
Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.0000
Median :0.0000 Median :0.0000
Mean :0.4165 Mean :0.3266
3rd Qu.:1.0000 3rd Qu.:1.0000
Max. :1.0000 Max. :1.0000
library(ggplot2)
# Distribution of sale prices: a plain histogram, then a blue/red styled one.
ggplot(data = data, mapping = aes(x = price)) +
  geom_histogram(bins = 30)
ggplot(data = data, mapping = aes(x = price)) +
  geom_histogram(bins = 30, fill = "blue", colour = "red")

# Price by hot-water heating and by air conditioning.
ggplot(data, aes(x = hotwaterheating, y = price, fill = hotwaterheating)) +
  geom_boxplot()
ggplot(data, aes(x = airconditioning, y = price, fill = airconditioning)) +
  geom_boxplot()

# Price against area with a straight-line fit and no confidence band.
ggplot(data, aes(x = area, y = price)) +
  geom_point(colour = "red") +
  geom_smooth(method = "lm", se = FALSE)

# Same scatter with one linear fit per bedroom count. The points stay light
# blue because the constant colour on geom_point() overrides the mapping for
# that layer; only the fitted lines are coloured by bedrooms.
ggplot(data, aes(x = area, y = price, colour = factor(bedrooms))) +
  geom_point(colour = "lightblue") +
  geom_smooth(method = "lm", se = FALSE) +
  labs(colour = "bedrooms")
# Derived feature: area divided by the number of bedrooms.
data[["areaperbedroom"]] <- with(data, area / bedrooms)
# Derived feature: number of bathrooms divided by the number of bedrooms.
data[["bbratio"]] <- with(data, bathrooms / bedrooms)
Splitting the data into training and test sets
library(caTools)
# Fix the RNG seed so the split is reproducible.
set.seed(123)
# Logical mask from caTools::sample.split(): TRUE marks rows assigned to the
# training portion (split ratio 0.65).
index <- sample.split(data$price, SplitRatio = 0.65)
# Rows flagged TRUE form the training set; the remaining rows form the test set.
train <- data[index, , drop = FALSE]
test <- data[!index, , drop = FALSE]
# Number of rows in the training set.
nrow(train)
[1] 385
# Number of rows held out in the test set.
nrow(test)
[1] 160
Model building
# Baseline model: regress price on every other column of the training set.
# The argument must be spelled `formula` (lower case). R argument names are
# case-sensitive, so the earlier `Formula=` was not matched and lm() warned
# "extra argument 'Formula' will be disregarded". The fit then appears to
# have fallen back on the data frame's implicit formula, which uses the first
# column as the response.
model1 <- lm(formula = price ~ ., data = train)
summary(model1)
Call:
lm(data = train, Formula = price ~ .)
Residuals:
Min 1Q Median 3Q Max
-2504113 -634879 -80599 511075 5060460
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 874315.2 848433.8 1.031 0.303452
area 360.8 105.8 3.410 0.000722 ***
bedrooms -211715.4 249807.2 -0.848 0.397260
bathrooms 1398493.1 558261.1 2.505 0.012675 *
stories 415843.4 79577.0 5.226 2.92e-07 ***
mainroadYes 560392.0 180753.6 3.100 0.002083 **
guestroomYes 434052.3 163788.7 2.650 0.008396 **
basementYes 241724.3 137721.0 1.755 0.080064 .
hotwaterheatingYes 780537.4 268829.4 2.903 0.003914 **
airconditioningYes 874359.9 135570.6 6.449 3.54e-10 ***
parkingSmall 457162.3 145060.5 3.152 0.001758 **
parkingMedium 590082.4 164977.0 3.577 0.000394 ***
parkingLarge 314956.0 466080.5 0.676 0.499622
prefareaYes 684629.3 142398.9 4.808 2.23e-06 ***
furnishingstatussemi.furnished 48379.0 147553.0 0.328 0.743193
furnishingstatusunfurnished -323248.4 158810.0 -2.035 0.042525 *
areaperbedroom -381.2 286.5 -1.330 0.184208
bbratio -1024676.6 1704794.7 -0.601 0.548174
---
Signif. codes: 0 â***â 0.001 â**â 0.01 â*â 0.05 â.â 0.1 â â 1
Residual standard error: 1108000 on 367 degrees of freedom
Multiple R-squared: 0.6846, Adjusted R-squared: 0.67
F-statistic: 46.85 on 17 and 367 DF, p-value: < 2.2e-16
# Compute variance inflation factors for model1 with vif() from the car
# package, to check the predictors for multicollinearity.
library(car)
vif(model1)
GVIF Df GVIF^(1/(2*Df))
area 16.023437 1 4.002928
bedrooms 10.515692 1 3.242791
bathrooms 23.322562 1 4.829344
stories 1.572269 1 1.253901
mainroad 1.177956 1 1.085337
guestroom 1.252031 1 1.118942
basement 1.372742 1 1.171641
hotwaterheating 1.063719 1 1.031367
airconditioning 1.269521 1 1.126730
parking 1.318628 3 1.047178
prefarea 1.182424 1 1.087393
furnishingstatus 1.167263 2 1.039423
areaperbedroom 17.814876 1 4.220767
bbratio 22.452484 1 4.738405
# model2: the model1 predictors with bbratio left out.
model2 <- lm(
  formula = price ~
    area + bedrooms + bathrooms + stories +
    mainroad + guestroom + basement +
    hotwaterheating + airconditioning + parking + prefarea +
    furnishingstatusunfurnished + furnishingstatussemi.furnished +
    areaperbedroom,
  data = train
)
summary(model2)
Call:
lm(formula = price ~ area + bedrooms + bathrooms + stories +
mainroad + guestroom + basement + hotwaterheating + airconditioning +
parking + prefarea + furnishingstatusunfurnished + furnishingstatussemi.furnished +
areaperbedroom, data = train)
Residuals:
Min 1Q Median 3Q Max
-2550916 -638396 -98127 516615 5072424
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 504181.15 583121.69 0.865 0.387808
area 382.71 99.29 3.854 0.000137 ***
bedrooms -100804.09 168237.94 -0.599 0.549424
bathrooms 1072508.43 132203.38 8.113 7.48e-15 ***
stories 414366.54 79470.03 5.214 3.09e-07 ***
mainroadYes 558913.32 180579.95 3.095 0.002118 **
guestroomYes 435813.52 163620.33 2.664 0.008072 **
basementYes 232302.09 136707.18 1.699 0.090114 .
hotwaterheatingYes 793375.52 267746.91 2.963 0.003243 **
airconditioningYes 876153.64 135420.09 6.470 3.13e-10 ***
parkingSmall 454774.39 144880.17 3.139 0.001832 **
parkingMedium 586211.81 164708.19 3.559 0.000421 ***
parkingLarge 313365.81 465668.30 0.673 0.501409
prefareaYes 694044.67 141411.80 4.908 1.39e-06 ***
furnishingstatusunfurnished -319135.12 158524.78 -2.013 0.044826 *
furnishingstatussemi.furnished 52813.02 147240.55 0.359 0.720036
areaperbedroom -440.19 268.90 -1.637 0.102490
---
Signif. codes: 0 â***â 0.001 â**â 0.01 â*â 0.05 â.â 0.1 â â 1
Residual standard error: 1107000 on 368 degrees of freedom
Multiple R-squared: 0.6843, Adjusted R-squared: 0.6705
F-statistic: 49.85 on 16 and 368 DF, p-value: < 2.2e-16
# Variance inflation factors for model2.
vif(model2)
GVIF Df GVIF^(1/(2*Df))
area 14.130284 1 3.759027
bedrooms 4.777825 1 2.185824
bathrooms 1.310211 1 1.144644
stories 1.570770 1 1.253303
mainroad 1.177738 1 1.085236
guestroom 1.251630 1 1.118763
basement 1.354957 1 1.164026
hotwaterheating 1.057004 1 1.028107
airconditioning 1.268906 1 1.126457
parking 1.316335 3 1.046874
prefarea 1.168115 1 1.080794
furnishingstatusunfurnished 1.738832 1 1.318648
furnishingstatussemi.furnished 1.660610 1 1.288647
areaperbedroom 15.721413 1 3.965024
# model3: model2 without bedrooms, areaperbedroom and
# furnishingstatussemi.furnished.
model3 <- lm(
  formula = price ~
    area + bathrooms + stories +
    mainroad + guestroom + basement +
    hotwaterheating + airconditioning + parking + prefarea +
    furnishingstatusunfurnished,
  data = train
)
summary(model3)
Call:
lm(formula = price ~ area + bathrooms + stories + mainroad +
guestroom + basement + hotwaterheating + airconditioning +
parking + prefarea + furnishingstatusunfurnished, data = train)
Residuals:
Min 1Q Median 3Q Max
-2717894 -697257 -82137 520522 5133541
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 67407.81 258253.68 0.261 0.794227
area 230.66 31.05 7.429 7.62e-13 ***
bathrooms 1117247.71 130199.08 8.581 2.61e-16 ***
stories 473118.95 74185.73 6.377 5.36e-10 ***
mainroadYes 513703.01 179125.33 2.868 0.004369 **
guestroomYes 453188.36 163358.83 2.774 0.005814 **
basementYes 263532.17 135836.00 1.940 0.053128 .
hotwaterheatingYes 839534.98 266952.44 3.145 0.001796 **
airconditioningYes 872528.97 134318.11 6.496 2.66e-10 ***
parkingSmall 452148.84 144948.03 3.119 0.001954 **
parkingMedium 633548.13 163661.27 3.871 0.000128 ***
parkingLarge 326493.45 466152.48 0.700 0.484116
prefareaYes 713937.40 141381.32 5.050 6.95e-07 ***
furnishingstatusunfurnished -353514.59 125478.26 -2.817 0.005101 **
---
Signif. codes: 0 â***â 0.001 â**â 0.01 â*â 0.05 â.â 0.1 â â 1
Residual standard error: 1110000 on 371 degrees of freedom
Multiple R-squared: 0.6799, Adjusted R-squared: 0.6687
F-statistic: 60.61 on 13 and 371 DF, p-value: < 2.2e-16
# Variance inflation factors for model3.
vif(model3)
GVIF Df GVIF^(1/(2*Df))
area 1.373955 1 1.172158
bathrooms 1.263554 1 1.124079
stories 1.361032 1 1.166633
mainroad 1.152246 1 1.073427
guestroom 1.240534 1 1.113792
basement 1.330131 1 1.153313
hotwaterheating 1.044762 1 1.022136
airconditioning 1.241236 1 1.114108
parking 1.279746 3 1.041967
prefarea 1.160968 1 1.077482
furnishingstatusunfurnished 1.083233 1 1.040785
# model4: model3 without mainroad and basement.
model4 <- lm(
  formula = price ~
    area + bathrooms + stories +
    guestroom + hotwaterheating + airconditioning +
    parking + prefarea +
    furnishingstatusunfurnished,
  data = train
)
summary(model4)
Call:
lm(formula = price ~ area + bathrooms + stories + guestroom +
hotwaterheating + airconditioning + parking + prefarea +
furnishingstatusunfurnished, data = train)
Residuals:
Min 1Q Median 3Q Max
-2470160 -705623 -65999 542942 4978397
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 524701.2 226280.8 2.319 0.020945 *
area 240.5 30.6 7.861 4.14e-14 ***
bathrooms 1122153.8 130017.8 8.631 < 2e-16 ***
stories 464459.9 72257.6 6.428 3.96e-10 ***
guestroomYes 592026.6 154061.0 3.843 0.000143 ***
hotwaterheatingYes 828926.7 270674.9 3.062 0.002355 **
airconditioningYes 885373.1 135998.4 6.510 2.43e-10 ***
parkingSmall 420311.3 146567.5 2.868 0.004370 **
parkingMedium 680754.2 164628.6 4.135 4.39e-05 ***
parkingLarge 343451.2 472485.0 0.727 0.467741
prefareaYes 808448.2 140061.2 5.772 1.65e-08 ***
furnishingstatusunfurnished -389529.7 126695.6 -3.075 0.002264 **
---
Signif. codes: 0 â***â 0.001 â**â 0.01 â*â 0.05 â.â 0.1 â â 1
Residual standard error: 1126000 on 373 degrees of freedom
Multiple R-squared: 0.6691, Adjusted R-squared: 0.6593
F-statistic: 68.55 on 11 and 373 DF, p-value: < 2.2e-16
# Variance inflation factors for model4.
vif(model4)
# model5: model4 without hotwaterheating.
model5 <- lm(
  formula = price ~
    area + bathrooms + stories +
    guestroom + airconditioning +
    parking + prefarea +
    furnishingstatusunfurnished,
  data = train
)
summary(model5)
Call:
lm(formula = price ~ area + bathrooms + stories + guestroom +
airconditioning + parking + prefarea + furnishingstatusunfurnished,
data = train)
Residuals:
Min 1Q Median 3Q Max
-2492575 -726320 -59881 537245 4937129
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 547793.05 228674.29 2.396 0.017088 *
area 237.67 30.92 7.686 1.35e-13 ***
bathrooms 1152405.25 131086.12 8.791 < 2e-16 ***
stories 470095.47 73038.79 6.436 3.76e-10 ***
guestroomYes 606471.32 155704.11 3.895 0.000116 ***
airconditioningYes 822323.40 135928.50 6.050 3.52e-09 ***
parkingSmall 434164.62 148129.56 2.931 0.003587 **
parkingMedium 724408.27 165837.22 4.368 1.62e-05 ***
parkingLarge 329687.82 477726.52 0.690 0.490548
prefareaYes 789730.13 141486.46 5.582 4.58e-08 ***
furnishingstatusunfurnished -407778.37 127965.13 -3.187 0.001560 **
---
Signif. codes: 0 â***â 0.001 â**â 0.01 â*â 0.05 â.â 0.1 â â 1
Residual standard error: 1138000 on 374 degrees of freedom
Multiple R-squared: 0.6607, Adjusted R-squared: 0.6517
F-statistic: 72.84 on 10 and 374 DF, p-value: < 2.2e-16
# Variance inflation factors for model5.
vif(model5)
GVIF Df GVIF^(1/(2*Df))
area 1.296377 1 1.138585
bathrooms 1.218382 1 1.103803
stories 1.254952 1 1.120246
guestroom 1.072052 1 1.035399
airconditioning 1.209201 1 1.099637
parking 1.237144 3 1.036104
prefarea 1.106008 1 1.051669
furnishingstatusunfurnished 1.071669 1 1.035214
# Predict prices for the held-out test rows with the last fitted model
# (model5). `newdata = test` is required: without it predict() returns fitted
# values for the training rows. The earlier run paired test$price with a
# prediction vector of a different length, so cbind() recycled the shorter
# one and warned "number of rows of result is not a multiple of vector
# length", leaving actual and predicted values misaligned.
price_pred <- predict(model5, newdata = test)
head(price_pred, 10)
# Guard against any length mismatch before pairing the two vectors.
stopifnot(length(price_pred) == nrow(test))
# Put actual and predicted test prices side by side, one row per test house.
compare_result <- cbind(actual = test$price, predicted = price_pred)
compare_result <- as.data.frame(compare_result)
# Prediction error on the test set: actual minus predicted price.
error <- compare_result$actual - compare_result$predicted
cbind(compare_result, error)