#library(devtools)
#install_github("vqv/ggbiplot")
library(leaps)
library(MASS)
library(e1071)
library(readr)
library(e1071)
library(Metrics)
library(vioplot)
library(randomForest)
library(ROCR)
# Load the housing data; the relative path is resolved against the current
# working directory.
df <- read_csv("houses.csv")
head(df) # Show the first few rows of the "Houses" dataset.
## # A tibble: 6 x 16
## id price bedrooms bathrooms sqft_living sqft_lot floors waterfront view
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.13e9 2.22e5 3 1 1180 5650 1 0 0
## 2 6.41e9 5.38e5 3 2.25 2570 7242 2 0 0
## 3 5.63e9 1.80e5 2 1 770 10000 1 0 0
## 4 2.49e9 6.04e5 4 3 1960 5000 1 0 0
## 5 1.95e9 5.10e5 3 2 1680 8080 1 0 0
## 6 7.24e9 1.23e6 4 4.5 5420 101930 1 0 0
## # ... with 7 more variables: condition <dbl>, grade <dbl>, sqft_above <dbl>,
## # sqft_basement <dbl>, yr_built <dbl>, yr_renovated <dbl>, age <dbl>
tail(df) # Show the last few rows of the "Houses" dataset.
## # A tibble: 6 x 16
## id price bedrooms bathrooms sqft_living sqft_lot floors waterfront view
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.00e9 475000 3 2.5 1310 1294 2 0 0
## 2 2.63e8 360000 3 2.5 1530 1131 3 0 0
## 3 6.60e9 400000 4 2.5 2310 5813 2 0 0
## 4 1.52e9 402101 2 0.75 1020 1350 2 0 0
## 5 2.91e8 400000 3 2.5 1600 2388 2 0 0
## 6 1.52e9 325000 2 0.75 1020 1076 2 0 0
## # ... with 7 more variables: condition <dbl>, grade <dbl>, sqft_above <dbl>,
## # sqft_basement <dbl>, yr_built <dbl>, yr_renovated <dbl>, age <dbl>
dim(df) # Check the dimensions (rows, columns) of the dataset.
## [1] 21613 16
str(df) # Inspect the structure (column names and types) of the "Houses" dataset.
## tibble [21,613 x 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : num [1:21613] 7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
## $ price : num [1:21613] 221900 538000 180000 604000 510000 ...
## $ bedrooms : num [1:21613] 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num [1:21613] 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : num [1:21613] 1180 2570 770 1960 1680 ...
## $ sqft_lot : num [1:21613] 5650 7242 10000 5000 8080 ...
## $ floors : num [1:21613] 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
## $ view : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : num [1:21613] 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : num [1:21613] 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : num [1:21613] 1180 2170 770 1050 1680 ...
## $ sqft_basement: num [1:21613] 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : num [1:21613] 1955 1951 1933 1965 1987 ...
## $ yr_renovated : num [1:21613] 0 1991 0 0 0 ...
## $ age : num [1:21613] 59 63 82 49 28 13 19 52 55 12 ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_double(),
## .. price = col_double(),
## .. bedrooms = col_double(),
## .. bathrooms = col_double(),
## .. sqft_living = col_double(),
## .. sqft_lot = col_double(),
## .. floors = col_double(),
## .. waterfront = col_double(),
## .. view = col_double(),
## .. condition = col_double(),
## .. grade = col_double(),
## .. sqft_above = col_double(),
## .. sqft_basement = col_double(),
## .. yr_built = col_double(),
## .. yr_renovated = col_double(),
## .. age = col_double()
## .. )
summary(df) # Print per-column summary statistics of the "Houses" dataset.
## id price bedrooms bathrooms
## Min. :1.000e+06 Min. : 75000 Min. : 0.000 Min. :0.000
## 1st Qu.:2.123e+09 1st Qu.: 321950 1st Qu.: 3.000 1st Qu.:1.750
## Median :3.905e+09 Median : 450000 Median : 3.000 Median :2.250
## Mean :4.580e+09 Mean : 540088 Mean : 3.371 Mean :2.115
## 3rd Qu.:7.309e+09 3rd Qu.: 645000 3rd Qu.: 4.000 3rd Qu.:2.500
## Max. :9.900e+09 Max. :7700000 Max. :33.000 Max. :8.000
## sqft_living sqft_lot floors waterfront
## Min. : 290 Min. : 520 Min. :1.000 Min. :0.000000
## 1st Qu.: 1427 1st Qu.: 5040 1st Qu.:1.000 1st Qu.:0.000000
## Median : 1910 Median : 7618 Median :1.500 Median :0.000000
## Mean : 2080 Mean : 15107 Mean :1.494 Mean :0.007542
## 3rd Qu.: 2550 3rd Qu.: 10688 3rd Qu.:2.000 3rd Qu.:0.000000
## Max. :13540 Max. :1651359 Max. :3.500 Max. :1.000000
## view condition grade sqft_above
## Min. :0.0000 Min. :1.000 Min. : 1.000 Min. : 290
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.: 7.000 1st Qu.:1190
## Median :0.0000 Median :3.000 Median : 7.000 Median :1560
## Mean :0.2343 Mean :3.409 Mean : 7.657 Mean :1788
## 3rd Qu.:0.0000 3rd Qu.:4.000 3rd Qu.: 8.000 3rd Qu.:2210
## Max. :4.0000 Max. :5.000 Max. :13.000 Max. :9410
## sqft_basement yr_built yr_renovated age
## Min. : 0.0 Min. :1900 Min. : 0.0 Min. : -1.00
## 1st Qu.: 0.0 1st Qu.:1951 1st Qu.: 0.0 1st Qu.: 18.00
## Median : 0.0 Median :1975 Median : 0.0 Median : 40.00
## Mean : 291.5 Mean :1971 Mean : 84.4 Mean : 43.32
## 3rd Qu.: 560.0 3rd Qu.:1997 3rd Qu.: 0.0 3rd Qu.: 63.00
## Max. :4820.0 Max. :2015 Max. :2015.0 Max. :115.00
# Frequency tables of the discrete variables, starting with the bedroom counts;
# the table() calls that follow do the same for bathrooms, waterfront, view,
# condition and grade.
table(df$bedrooms)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 33
## 13 199 2760 9824 6882 1601 272 38 13 6 3 1 1
table(df$bathrooms)
##
## 0 0.5 0.75 1 1.25 1.5 1.75 2 2.25 2.5 2.75 3 3.25 3.5 3.75 4
## 10 4 72 3852 9 1446 3048 1930 2047 5380 1185 753 589 731 155 136
## 4.25 4.5 4.75 5 5.25 5.5 5.75 6 6.25 6.5 6.75 7.5 7.75 8
## 79 100 23 21 13 10 4 6 2 2 2 1 1 2
table(df$waterfront)
##
## 0 1
## 21450 163
table(df$view)
##
## 0 1 2 3 4
## 19489 332 963 510 319
table(df$condition)
##
## 1 2 3 4 5
## 30 172 14031 5679 1701
table(df$grade)
##
## 1 3 4 5 6 7 8 9 10 11 12 13
## 1 3 29 242 2038 8981 6068 2615 1134 399 90 13
We can see that most houses in the “Houses” dataset have 2-6 bedrooms and 1-5 bathrooms, are not on the waterfront, and have a condition of 3-5 and a grade of 5-10.
# Drop the identifier column so that it does not enter the correlation matrix.
df[["id"]] <- NULL
library(corrplot)
# Compute the pairwise correlations once, draw them as a clustered circle plot,
# then print the matrix itself.
cor_matrix <- cor(df)
corrplot(cor_matrix, method = "circle", order = "hclust")
cor_matrix
## price bedrooms bathrooms sqft_living sqft_lot
## price 1.00000000 0.308349598 0.52513751 0.70203505 0.089660861
## bedrooms 0.30834960 1.000000000 0.51588364 0.57667069 0.031703243
## bathrooms 0.52513751 0.515883638 1.00000000 0.75466528 0.087739662
## sqft_living 0.70203505 0.576670693 0.75466528 1.00000000 0.172825661
## sqft_lot 0.08966086 0.031703243 0.08773966 0.17282566 1.000000000
## floors 0.25679389 0.175428935 0.50065317 0.35394929 -0.005200991
## waterfront 0.26636943 -0.006582479 0.06374363 0.10381782 0.021603683
## view 0.39729349 0.079531852 0.18773702 0.28461119 0.074710106
## condition 0.03636179 0.028472104 -0.12498193 -0.05875259 -0.008958250
## grade 0.66743426 0.356966725 0.66498253 0.76270448 0.113621124
## sqft_above 0.60556730 0.477600161 0.68534248 0.87659660 0.183512281
## sqft_basement 0.32381602 0.303093375 0.28377003 0.43504297 0.015286202
## yr_built 0.05401153 0.154178069 0.50601944 0.31804877 0.053080367
## yr_renovated 0.12643379 0.018840823 0.05073898 0.05536293 0.007643505
## age -0.05395078 -0.154323756 -0.50640694 -0.31848848 -0.052989555
## floors waterfront view condition grade
## price 0.256793888 0.266369434 0.39729349 0.03636179 0.66743426
## bedrooms 0.175428935 -0.006582479 0.07953185 0.02847210 0.35696673
## bathrooms 0.500653173 0.063743629 0.18773702 -0.12498193 0.66498253
## sqft_living 0.353949290 0.103817818 0.28461119 -0.05875259 0.76270448
## sqft_lot -0.005200991 0.021603683 0.07471011 -0.00895825 0.11362112
## floors 1.000000000 0.023698320 0.02944382 -0.26376795 0.45818251
## waterfront 0.023698320 1.000000000 0.40185735 0.01665316 0.08277491
## view 0.029443820 0.401857351 1.00000000 0.04598974 0.25132058
## condition -0.263767946 0.016653157 0.04598974 1.00000000 -0.14467367
## grade 0.458182514 0.082774914 0.25132058 -0.14467367 1.00000000
## sqft_above 0.523884710 0.072074592 0.16764934 -0.15821362 0.75592294
## sqft_basement -0.245704542 0.080587939 0.27694658 0.17410491 0.16839182
## yr_built 0.489319425 -0.026161086 -0.05343985 -0.36141656 0.44696320
## yr_renovated 0.006338401 0.092884837 0.10391729 -0.06061779 0.01441428
## age -0.489639965 0.026092934 0.05345777 0.36066523 -0.44741524
## sqft_above sqft_basement yr_built yr_renovated age
## price 0.60556730 0.32381602 0.05401153 0.126433793 -0.05395078
## bedrooms 0.47760016 0.30309338 0.15417807 0.018840823 -0.15432376
## bathrooms 0.68534248 0.28377003 0.50601944 0.050738978 -0.50640694
## sqft_living 0.87659660 0.43504297 0.31804877 0.055362927 -0.31848848
## sqft_lot 0.18351228 0.01528620 0.05308037 0.007643505 -0.05298956
## floors 0.52388471 -0.24570454 0.48931942 0.006338401 -0.48963997
## waterfront 0.07207459 0.08058794 -0.02616109 0.092884837 0.02609293
## view 0.16764934 0.27694658 -0.05343985 0.103917288 0.05345777
## condition -0.15821362 0.17410491 -0.36141656 -0.060617787 0.36066523
## grade 0.75592294 0.16839182 0.44696320 0.014414281 -0.44741524
## sqft_above 1.00000000 -0.05194331 0.42389835 0.023284688 -0.42424753
## sqft_basement -0.05194331 1.00000000 -0.13312410 0.071322902 0.13286495
## yr_built 0.42389835 -0.13312410 1.00000000 -0.224873518 -0.99987329
## yr_renovated 0.02328469 0.07132290 -0.22487352 1.000000000 0.22448020
## age -0.42424753 0.13286495 -0.99987329 0.224480202 1.00000000
# Reproducible 60/40 train/validation split.
# Rows are addressed by integer position rather than by row-name strings:
# `df` was created by read_csv() and is therefore a tibble, and tibbles do not
# carry meaningful row names. The training-set size is floored explicitly
# instead of handing a fractional size to the sampler.
set.seed(12345)
n_rows <- nrow(df)
train.rows <- sample.int(n_rows, floor(n_rows * 0.6))
train.df <- df[train.rows, ]
# Every position that was not drawn for training goes to the validation set.
valid.rows <- setdiff(seq_len(n_rows), train.rows)
valid.df <- df[valid.rows, ]
### Shared theme that gives the visualization plots a clean look ###
library("ggplot2")
# Blank out the grid lines and the panel background, draw black axis lines,
# use a white legend key and set the text size to 15.
cleanup <- theme(
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_blank(),
  axis.line.x = element_line(color = "black"),
  axis.line.y = element_line(color = "black"),
  legend.key = element_rect(fill = "white"),
  text = element_text(size = 15)
)
library(grid)
library(gridExtra)
library("ggplot2")
# Build a scatterplot of price against one predictor with a fitted
# linear-regression line, styled with the shared `cleanup` theme.
#
# x_var        Name (string) of the column of `data` shown on the x axis.
# title        Plot title.
# x_label      Label of the x axis.
# smooth_color Colour of the fitted line.
# data         Data frame to plot; defaults to the training set.
#
# Returns the ggplot object. Columns are looked up inside `data` through the
# `.data` pronoun instead of writing `train.df$col` inside aes(), which
# ggplot2 discourages.
price_scatter <- function(x_var, title, x_label, smooth_color = "green1",
                          data = train.df) {
  ggplot(
    data = data,
    mapping = aes(x = .data[[x_var]], y = .data[["price"]])
  ) +
    geom_point() +
    ggtitle(title) +
    xlab(x_label) +
    ylab("Price") +
    geom_smooth(method = "lm", color = smooth_color) +
    theme(plot.title = element_text(hjust = 0.5)) +
    cleanup
}

s1 <- price_scatter(
  "bedrooms", "Scatterplot of Price and Bedrooms ", "Number of Bedrooms"
)
s2 <- price_scatter(
  "bathrooms", "Scatterplot of Price and Bathrooms ", "Number of Bathrooms"
)
s3 <- price_scatter(
  "sqft_living", "Scatterplot of Price and Sqft_living ", "Sqft_living"
)
s4 <- price_scatter(
  "waterfront", "Scatterplot of Price and waterfront ", "Waterfront"
)
s5 <- price_scatter(
  "floors", "Scatterplot of Price and Floors ", "Floors",
  smooth_color = "grey"
)
s6 <- price_scatter("view", "Scatterplot of Price and view ", "view")
# NOTE(review): this plot used to be an exact copy of s6 (x = view) with an
# unfinished title and a leftover "Number of Bedrooms" axis label. `condition`
# is the column between view and grade that had no scatterplot, so it is
# plotted here -- confirm that this was the intended variable.
s7 <- price_scatter(
  "condition", "Scatterplot of Price and Condition ", "Condition"
)
s8 <- price_scatter("grade", "Scatterplot of Price and Grade ", "Grade")
s9 <- price_scatter(
  "sqft_above", "Scatterplot of Price and Sqft_above ", "Sqft_above",
  smooth_color = "grey"
)
s10 <- price_scatter(
  "sqft_basement", "Scatterplot of Price and Sqft_basement ", "Sqft_basement"
)
s1
s2
s3
s4
s5
s6
s7
s8
s9
s10
library("ggplot2")
# Build a 30-bin histogram of one column, styled with the shared `cleanup`
# theme.
#
# x_var   Name (string) of the column of `data` to bin.
# x_label Label of the x axis.
# title   Plot title.
# fill    Fill colour of the bars.
# data    Data frame to plot; defaults to the complete dataset.
#
# Returns the ggplot object. The column is looked up inside `data` through the
# `.data` pronoun instead of writing `df$col` inside aes(), which ggplot2
# discourages.
county_histogram <- function(x_var, x_label, title, fill, data = df) {
  ggplot(data = data, mapping = aes(x = .data[[x_var]])) +
    geom_histogram(bins = 30, fill = fill) +
    labs(x = x_label) +
    ggtitle(title) +
    theme(plot.title = element_text(hjust = 0.5)) +
    cleanup
}

h1 <- county_histogram(
  "price", "Price of the Houses",
  "Histogram of Price of Houses in King County ", "steelblue"
)
h2 <- county_histogram(
  "bedrooms", "Number of Bedrooms",
  "Histogram of number of Bedrooms of Houses in King County ", "darkolivegreen"
)
h3 <- county_histogram(
  "bathrooms", "Number of Bathrooms",
  "Histogram of number of Bathrooms of Houses in King County ", "green1"
)
# The axis label previously read "Lot Size(Sqft" (unbalanced parenthesis).
h4 <- county_histogram(
  "sqft_lot", "Lot Size (Sqft)",
  "Histogram of Lot size of Houses in King County ", "tomato1"
)
h5 <- county_histogram(
  "waterfront", "Waterfront",
  "Histogram of waterfront for Houses in King County ", "red1"
)
h6 <- county_histogram(
  "yr_built", "Built Year",
  "Histogram of Year Built of Houses in King County ", "brown1"
)
h7 <- county_histogram(
  "yr_renovated", "Renovated Year",
  "Histogram of Year Renovated of Houses in King County ", "grey1"
)
h8 <- county_histogram(
  "age", "Age of the house",
  "Histogram of age of Houses in King County ", "orange1"
)
h1
h2
h3
h4
h5
h6
h7
h8
Violin plots (a box plot combined with a kernel density plot) of the discrete variables of the “Houses” dataset.
# Draw one violin plot per discrete variable, each with its own fill colour.
# The vector is named by column, so the plotting order is the vector order.
violin_colours <- c(
  bedrooms = "gold",
  bathrooms = "brown1",
  view = "yellow1",
  waterfront = "green1",
  grade = "tomato1",
  condition = "steelblue"
)
for (column in names(violin_colours)) {
  vioplot(df[[column]], col = violin_colours[[column]])
}
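Using a stepwise wrapper method (AIC-based selection with stepAIC, starting from the full model and allowing terms to be both removed and re-added) for the multiple linear regression model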
# Fit a GLM of price on every other column of the training set (no family is
# given, so glm() uses its default) and print the coefficient table.
full.model <- glm(price ~ ., data = train.df)
summary(full.model)
##
## Call:
## glm(formula = price ~ ., data = train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1300891 -107367 -10259 87476 4171402
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.692e+07 8.081e+06 -4.568 4.96e-06 ***
## bedrooms -3.595e+04 2.497e+03 -14.396 < 2e-16 ***
## bathrooms 3.915e+04 4.390e+03 8.918 < 2e-16 ***
## sqft_living 1.666e+02 5.874e+00 28.361 < 2e-16 ***
## sqft_lot -2.698e-01 4.334e-02 -6.225 4.96e-10 ***
## floors 2.520e+04 4.723e+03 5.336 9.65e-08 ***
## waterfront 5.424e+05 2.587e+04 20.967 < 2e-16 ***
## view 4.556e+04 2.898e+03 15.724 < 2e-16 ***
## condition 2.284e+04 3.180e+03 7.182 7.24e-13 ***
## grade 1.271e+05 2.757e+03 46.095 < 2e-16 ***
## sqft_above 1.980e+00 5.631e+00 0.352 0.725
## sqft_basement NA NA NA NA
## yr_built 1.782e+04 4.011e+03 4.442 8.98e-06 ***
## yr_renovated 7.016e+00 5.045e+00 1.391 0.164
## age 2.137e+04 4.010e+03 5.328 1.01e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 45109576356)
##
## Null deviance: 1.6817e+15 on 12966 degrees of freedom
## Residual deviance: 5.8430e+14 on 12953 degrees of freedom
## AIC: 354926
##
## Number of Fisher Scoring iterations: 2
# AIC-driven stepwise search that starts from the full model.
# direction = "both" lets every step either drop a term or add back one that
# was dropped earlier; trace = TRUE prints each step of the search.
step.model <- stepAIC(full.model, direction = "both", trace = TRUE)
## Start: AIC=354925.9
## price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
## waterfront + view + condition + grade + sqft_above + sqft_basement +
## yr_built + yr_renovated + age
##
##
## Step: AIC=354925.9
## price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
## waterfront + view + condition + grade + sqft_above + yr_built +
## yr_renovated + age
##
## Df Deviance AIC
## - sqft_above 1 5.8431e+14 354924
## - yr_renovated 1 5.8439e+14 354926
## <none> 5.8430e+14 354926
## - yr_built 1 5.8519e+14 354944
## - age 1 5.8558e+14 354952
## - floors 1 5.8559e+14 354952
## - sqft_lot 1 5.8605e+14 354963
## - condition 1 5.8663e+14 354975
## - bathrooms 1 5.8789e+14 355003
## - bedrooms 1 5.9365e+14 355130
## - view 1 5.9546e+14 355169
## - waterfront 1 6.0413e+14 355357
## - sqft_living 1 6.2059e+14 355705
## - grade 1 6.8015e+14 356893
##
## Step: AIC=354924
## price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
## waterfront + view + condition + grade + yr_built + yr_renovated +
## age
##
## Df Deviance AIC
## - yr_renovated 1 5.8440e+14 354924
## <none> 5.8431e+14 354924
## + sqft_above 1 5.8430e+14 354926
## + sqft_basement 1 5.8430e+14 354926
## - yr_built 1 5.8520e+14 354942
## - age 1 5.8559e+14 354950
## - floors 1 5.8591e+14 354958
## - sqft_lot 1 5.8606e+14 354961
## - condition 1 5.8663e+14 354973
## - bathrooms 1 5.8796e+14 355003
## - bedrooms 1 5.9367e+14 355128
## - view 1 5.9569e+14 355172
## - waterfront 1 6.0419e+14 355356
## - sqft_living 1 6.5804e+14 356463
## - grade 1 6.8265e+14 356939
##
## Step: AIC=354923.9
## price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
## waterfront + view + condition + grade + yr_built + age
##
## Df Deviance AIC
## <none> 5.8440e+14 354924
## + yr_renovated 1 5.8431e+14 354924
## + sqft_basement 1 5.8439e+14 354926
## + sqft_above 1 5.8439e+14 354926
## - yr_built 1 5.8527e+14 354941
## - age 1 5.8567e+14 354950
## - floors 1 5.8604e+14 354958
## - sqft_lot 1 5.8614e+14 354960
## - condition 1 5.8663e+14 354971
## - bathrooms 1 5.8824e+14 355007
## - bedrooms 1 5.9380e+14 355129
## - view 1 5.9583e+14 355173
## - waterfront 1 6.0463e+14 355363
## - sqft_living 1 6.5810e+14 356462
## - grade 1 6.8282e+14 356940
summary(step.model)
##
## Call:
## glm(formula = price ~ bedrooms + bathrooms + sqft_living + sqft_lot +
## floors + waterfront + view + condition + grade + yr_built +
## age, data = train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1293083 -107303 -9949 87253 4170872
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.664e+07 8.077e+06 -4.537 5.77e-06 ***
## bedrooms -3.605e+04 2.496e+03 -14.440 < 2e-16 ***
## bathrooms 3.961e+04 4.289e+03 9.234 < 2e-16 ***
## sqft_living 1.680e+02 4.157e+00 40.420 < 2e-16 ***
## sqft_lot -2.676e-01 4.308e-02 -6.212 5.41e-10 ***
## floors 2.617e+04 4.331e+03 6.043 1.55e-09 ***
## waterfront 5.457e+05 2.577e+04 21.180 < 2e-16 ***
## view 4.550e+04 2.858e+03 15.920 < 2e-16 ***
## condition 2.204e+04 3.130e+03 7.042 1.99e-12 ***
## grade 1.273e+05 2.725e+03 46.710 < 2e-16 ***
## yr_built 1.768e+04 4.009e+03 4.410 1.04e-05 ***
## age 2.126e+04 4.009e+03 5.304 1.15e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 45109774050)
##
## Null deviance: 1.6817e+15 on 12966 degrees of freedom
## Residual deviance: 5.8440e+14 on 12955 degrees of freedom
## AIC: 354924
##
## Number of Fisher Scoring iterations: 2
# Plot the full model (the one fitted before the stepwise search), not the
# selected one.
plot(full.model)
# Refit on the training set with exactly the predictors that the stepwise
# search retained, then print the coefficient table.
final.model <- glm(
  price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
    waterfront + view + condition + grade + yr_built + age,
  data = train.df
)
summary(final.model)
##
## Call:
## glm(formula = price ~ bedrooms + bathrooms + sqft_living + sqft_lot +
## floors + waterfront + view + condition + grade + yr_built +
## age, data = train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1293083 -107303 -9949 87253 4170872
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.664e+07 8.077e+06 -4.537 5.77e-06 ***
## bedrooms -3.605e+04 2.496e+03 -14.440 < 2e-16 ***
## bathrooms 3.961e+04 4.289e+03 9.234 < 2e-16 ***
## sqft_living 1.680e+02 4.157e+00 40.420 < 2e-16 ***
## sqft_lot -2.676e-01 4.308e-02 -6.212 5.41e-10 ***
## floors 2.617e+04 4.331e+03 6.043 1.55e-09 ***
## waterfront 5.457e+05 2.577e+04 21.180 < 2e-16 ***
## view 4.550e+04 2.858e+03 15.920 < 2e-16 ***
## condition 2.204e+04 3.130e+03 7.042 1.99e-12 ***
## grade 1.273e+05 2.725e+03 46.710 < 2e-16 ***
## yr_built 1.768e+04 4.009e+03 4.410 1.04e-05 ***
## age 2.126e+04 4.009e+03 5.304 1.15e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 45109774050)
##
## Null deviance: 1.6817e+15 on 12966 degrees of freedom
## Residual deviance: 5.8440e+14 on 12955 degrees of freedom
## AIC: 354924
##
## Number of Fisher Scoring iterations: 2
anova(final.model) #Analysis of variance of the final model
## Analysis of Deviance Table
##
## Model: gaussian, link: identity
##
## Response: price
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 12966 1.6817e+15
## bedrooms 1 1.5603e+14 12965 1.5257e+15
## bathrooms 1 2.9956e+14 12964 1.2261e+15
## sqft_living 1 4.0230e+14 12963 8.2383e+14
## sqft_lot 1 3.5631e+12 12962 8.2027e+14
## floors 1 2.4316e+10 12961 8.2024e+14
## waterfront 1 4.3570e+13 12960 7.7667e+14
## view 1 3.1303e+13 12959 7.4537e+14
## condition 1 1.2269e+13 12958 7.3310e+14
## grade 1 6.7214e+13 12957 6.6589e+14
## yr_built 1 8.0223e+13 12956 5.8567e+14
## age 1 1.2690e+12 12955 5.8440e+14
# Plot the selected model, then score it on the held-out validation rows and
# report the root-mean-squared error of the predicted prices.
plot(final.model)
prediction_glm <- predict(final.model, newdata = valid.df, type = "response")
rmse(actual = valid.df$price, predicted = prediction_glm)
## [1] 222037
# Linear-kernel support vector regression on the predictors kept by the
# stepwise search.
# The model is fitted on the training split only. The call used to pass the
# complete `df`, so the validation rows it is scored on afterwards had already
# been seen during fitting and the validation RMSE was not an out-of-sample
# estimate.
# NOTE(review): the summary and RMSE output recorded in this file were
# produced by the fit on `df` and need to be regenerated.
model.svm <- svm(
  price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
    waterfront + view + condition + grade + yr_built + age,
  data = train.df,
  kernel = "linear"
)
summary(model.svm)
##
## Call:
## svm(formula = price ~ bedrooms + bathrooms + sqft_living + sqft_lot +
## floors + waterfront + view + condition + grade + yr_built + age,
## data = df, kernel = "linear")
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 1
## gamma: 0.09090909
## epsilon: 0.1
##
##
## Number of Support Vectors: 17028
# NOTE(review): e1071 documents plot() for svm objects as a plot for
# classification fits -- confirm that this call draws anything for a
# regression model.
plot(model.svm)
# Score the SVM on the validation rows and report the RMSE of the predictions.
prediction_svm <- predict(model.svm, newdata = valid.df, type = "response")
rmse(actual = valid.df$price, predicted = prediction_svm)
## [1] 230840.7
# Random-forest regression (100 trees, minimum terminal-node size 10) on the
# predictors kept by the stepwise search; the seed makes the forest
# reproducible.
# The forest is grown on the training split only. The call used to pass the
# complete `df`, so the validation rows it is scored on afterwards had already
# been seen during fitting and the validation RMSE was not an out-of-sample
# estimate.
# NOTE(review): the model printout and RMSE recorded in this file were
# produced by the fit on `df` and need to be regenerated.
# NOTE(review): proximity = TRUE stores a rows-by-rows proximity matrix that
# nothing in the visible code reads -- consider dropping it to save memory.
set.seed(12345)
reg.randomForest <- price ~ bedrooms + bathrooms + sqft_living + sqft_lot +
  floors + waterfront + view + condition + grade + yr_built + age
model.randomForest <- randomForest(
  reg.randomForest,
  data = train.df,
  ntree = 100,
  nodesize = 10,
  proximity = TRUE
)
model.randomForest
##
## Call:
## randomForest(formula = reg.randomForest, data = df, ntree = 100, nodesize = 10, proximity = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 36590581324
## % Var explained: 72.85
# Plot the fitted forest and its variable-importance chart, then score the
# forest on the validation rows and report the RMSE of the predictions.
plot(model.randomForest)
varImpPlot(model.randomForest)
prediction_randomForest <- predict(
  model.randomForest,
  newdata = valid.df,
  type = "response"
)
rmse(actual = valid.df$price, predicted = prediction_randomForest)
## [1] 122062.9
library(car)
# Variance inflation factors of the selected model. The output recorded below
# shows values of roughly 3957 for both yr_built and age, matching their
# correlation of about -0.9999 in the correlation matrix printed earlier.
# NOTE(review): consider keeping only one of the two predictors.
vif(final.model)
## bedrooms bathrooms sqft_living sqft_lot floors waterfront
## 1.598268 3.115729 4.147641 1.046444 1.571236 1.170105
## view condition grade yr_built age
## 1.303290 1.185598 2.945794 3956.927950 3957.136879