the aim of this analysis to use weighted regression to predict the houses prices
plz feel free to contact me for any advice or recommendations: vet.m.mohamed@gmail.com
find the data here
firstly lets load the libraries we are going to use
library(tidyverse)
library(caret)
library(mice)
library(car)
library(lm.beta)
library(knitr)
Next lets import the data
houses<-read_csv(file = "./data sets/kc_house_data.csv",col_types = "cTninnnifffinnddf??nn")
head(houses)%>%kable("markdown")
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7129300520 | 2014-10-13 | 221900 | 3 | 1.00 | 1180 | 5650 | 1 | 0 | 0 | 3 | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 |
| 6414100192 | 2014-12-09 | 538000 | 3 | 2.25 | 2570 | 7242 | 2 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 |
| 5631500400 | 2015-02-25 | 180000 | 2 | 1.00 | 770 | 10000 | 1 | 0 | 0 | 3 | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 |
| 2487200875 | 2014-12-09 | 604000 | 4 | 3.00 | 1960 | 5000 | 1 | 0 | 0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 |
| 1954400510 | 2015-02-18 | 510000 | 3 | 2.00 | 1680 | 8080 | 1 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 |
| 7237550310 | 2014-05-12 | 1225000 | 4 | 4.50 | 5420 | 101930 | 1 | 0 | 0 | 3 | 11 | 3890 | 1530 | 2001 | 0 | 98053 | 47.6561 | -122.005 | 4760 | 101930 |
checking the structure of the data
str(houses)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 21613 obs. of 21 variables:
## $ id : chr "7129300520" "6414100192" "5631500400" "2487200875" ...
## $ date : POSIXct, format: "2014-10-13" "2014-12-09" ...
## $ price : num 221900 538000 180000 604000 510000 ...
## $ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : num 1180 2570 770 1960 1680 ...
## $ sqft_lot : num 5650 7242 10000 5000 8080 ...
## $ floors : int 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ view : Factor w/ 5 levels "0","3","4","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ condition : Factor w/ 5 levels "3","5","4","1",..: 1 1 1 2 1 1 1 1 1 1 ...
## $ grade : int 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : num 1180 2170 770 1050 1680 ...
## $ sqft_basement: num 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : num 1955 1951 1933 1965 1987 ...
## $ yr_renovated : num 0 1991 0 0 0 ...
## $ zipcode : Factor w/ 70 levels "98178","98125",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ lat : num 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num -122 -122 -122 -122 -122 ...
## $ sqft_living15: num 1340 1690 2720 1360 1800 ...
## $ sqft_lot15 : num 5650 7639 8062 5000 7503 ...
## - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 2079 obs. of 5 variables:
## ..$ row : int 13 15 18 26 28 33 34 36 55 66 ...
## ..$ col : chr "floors" "floors" "floors" "floors" ...
## ..$ expected: chr "no trailing characters" "no trailing characters" "no trailing characters" "no trailing characters" ...
## ..$ actual : chr ".5" ".5" ".5" ".5" ...
## ..$ file : chr "'./data sets/kc_house_data.csv'" "'./data sets/kc_house_data.csv'" "'./data sets/kc_house_data.csv'" "'./data sets/kc_house_data.csv'" ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_character(),
## .. date = col_datetime(format = ""),
## .. price = col_number(),
## .. bedrooms = col_integer(),
## .. bathrooms = col_number(),
## .. sqft_living = col_number(),
## .. sqft_lot = col_number(),
## .. floors = col_integer(),
## .. waterfront = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. view = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. condition = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. grade = col_integer(),
## .. sqft_above = col_number(),
## .. sqft_basement = col_number(),
## .. yr_built = col_double(),
## .. yr_renovated = col_double(),
## .. zipcode = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. lat = col_double(),
## .. long = col_double(),
## .. sqft_living15 = col_number(),
## .. sqft_lot15 = col_number()
## .. )
removing id lat long date
houses<-houses%>%select(-c(id,date,lat,long))
need to check the condition if it need to be ordered
unique(houses$condition)
## [1] 3 5 4 1 2
## Levels: 3 5 4 1 2
i think that is preferable if it is ordered
houses<-houses%>%mutate(condition=factor(condition,levels = c("1","2","3","4","5"),labels = c("1","2","3","4","5"),ordered = T))
checking the structure again
str(houses)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 21613 obs. of 17 variables:
## $ price : num 221900 538000 180000 604000 510000 ...
## $ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : num 1180 2570 770 1960 1680 ...
## $ sqft_lot : num 5650 7242 10000 5000 8080 ...
## $ floors : int 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ view : Factor w/ 5 levels "0","3","4","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ condition : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : int 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : num 1180 2170 770 1050 1680 ...
## $ sqft_basement: num 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : num 1955 1951 1933 1965 1987 ...
## $ yr_renovated : num 0 1991 0 0 0 ...
## $ zipcode : Factor w/ 70 levels "98178","98125",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ sqft_living15: num 1340 1690 2720 1360 1800 ...
## $ sqft_lot15 : num 5650 7639 8062 5000 7503 ...
now we want to test some assumption starting with is the missing values , what about it ?
summary(houses)
## price bedrooms bathrooms sqft_living
## Min. : 75000 Min. : 0.000 Min. :0.000 Min. : 290
## 1st Qu.: 321950 1st Qu.: 3.000 1st Qu.:1.750 1st Qu.: 1427
## Median : 450000 Median : 3.000 Median :2.250 Median : 1910
## Mean : 540088 Mean : 3.371 Mean :2.115 Mean : 2080
## 3rd Qu.: 645000 3rd Qu.: 4.000 3rd Qu.:2.500 3rd Qu.: 2550
## Max. :7700000 Max. :33.000 Max. :8.000 Max. :13540
##
## sqft_lot floors waterfront view condition
## Min. : 520 Min. :1.000 0:21450 0:19489 1: 30
## 1st Qu.: 5040 1st Qu.:1.000 1: 163 3: 510 2: 172
## Median : 7618 Median :1.000 4: 319 3:14031
## Mean : 15107 Mean :1.485 2: 963 4: 5679
## 3rd Qu.: 10688 3rd Qu.:2.000 1: 332 5: 1701
## Max. :1651359 Max. :3.000
## NA's :2079
## grade sqft_above sqft_basement yr_built
## Min. : 1.000 Min. : 290 Min. : 0.0 Min. :1900
## 1st Qu.: 7.000 1st Qu.:1190 1st Qu.: 0.0 1st Qu.:1951
## Median : 7.000 Median :1560 Median : 0.0 Median :1975
## Mean : 7.657 Mean :1788 Mean : 291.5 Mean :1971
## 3rd Qu.: 8.000 3rd Qu.:2210 3rd Qu.: 560.0 3rd Qu.:1997
## Max. :13.000 Max. :9410 Max. :4820.0 Max. :2015
##
## yr_renovated zipcode sqft_living15 sqft_lot15
## Min. : 0.0 98103 : 602 Min. : 399 Min. : 651
## 1st Qu.: 0.0 98038 : 590 1st Qu.:1490 1st Qu.: 5100
## Median : 0.0 98115 : 583 Median :1840 Median : 7620
## Mean : 84.4 98052 : 574 Mean :1987 Mean : 12768
## 3rd Qu.: 0.0 98117 : 553 3rd Qu.:2360 3rd Qu.: 10083
## Max. :2015.0 98042 : 548 Max. :6210 Max. :871200
## (Other):18163
we have a problem only with floor
lets check the percent of missing
(sum(is.na(houses$floors))/nrow(houses)
)*100
## [1] 9.619211
only 10% of missing values , so i will impute it using multiple imputation method
first i want to extract the numeric variables
num<-houses%>%select(price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_living15,sqft_lot15)%>%data.frame()
set.seed(3215)
imp<-mice(data = num)
##
## iter imp variable
## 1 1 floors
## 1 2 floors
## 1 3 floors
## 1 4 floors
## 1 5 floors
## 2 1 floors
## 2 2 floors
## 2 3 floors
## 2 4 floors
## 2 5 floors
## 3 1 floors
## 3 2 floors
## 3 3 floors
## 3 4 floors
## 3 5 floors
## 4 1 floors
## 4 2 floors
## 4 3 floors
## 4 4 floors
## 4 5 floors
## 5 1 floors
## 5 2 floors
## 5 3 floors
## 5 4 floors
## 5 5 floors
impnum<-complete(imp)%>%data.frame()
houses<-houses%>%mutate(floors=impnum$floors,
floors=as.integer(floors))
check again for missing
(sum(is.na(houses$floors))/nrow(houses)
)*100
## [1] 0
great
now lets run regular regression to test the assumption
fit<-houses%>%with(lm(price~.,data=houses))
summary(fit)
##
## Call:
## lm(formula = price ~ ., data = houses)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1238733 -70188 -618 62402 4415643
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.143e+05 1.250e+05 7.316 2.64e-13 ***
## bedrooms -2.733e+04 1.528e+03 -17.887 < 2e-16 ***
## bathrooms 2.559e+04 2.647e+03 9.669 < 2e-16 ***
## sqft_living 1.272e+02 3.547e+00 35.857 < 2e-16 ***
## sqft_lot 2.433e-01 3.833e-02 6.347 2.23e-10 ***
## floors -4.887e+04 3.092e+03 -15.807 < 2e-16 ***
## waterfront1 5.634e+05 1.603e+04 35.149 < 2e-16 ***
## view3 1.529e+05 7.617e+03 20.070 < 2e-16 ***
## view4 3.373e+05 1.167e+04 28.913 < 2e-16 ***
## view2 6.829e+04 5.559e+03 12.284 < 2e-16 ***
## view1 7.658e+04 9.099e+03 8.416 < 2e-16 ***
## condition.L 6.642e+04 1.923e+04 3.453 0.000555 ***
## condition.Q 3.217e+02 1.626e+04 0.020 0.984212
## condition.C 3.137e+04 1.223e+04 2.564 0.010345 *
## condition^4 -1.478e+04 7.047e+03 -2.098 0.035950 *
## grade 5.754e+04 1.814e+03 31.716 < 2e-16 ***
## sqft_above 8.021e+01 3.596e+00 22.305 < 2e-16 ***
## sqft_basement NA NA NA NA
## yr_built -6.588e+02 6.588e+01 -10.000 < 2e-16 ***
## yr_renovated 1.764e+01 2.949e+00 5.980 2.27e-09 ***
## zipcode98125 1.596e+05 1.271e+04 12.563 < 2e-16 ***
## zipcode98028 7.666e+04 1.384e+04 5.540 3.06e-08 ***
## zipcode98136 2.054e+05 1.408e+04 14.594 < 2e-16 ***
## zipcode98074 1.252e+05 1.280e+04 9.781 < 2e-16 ***
## zipcode98053 1.451e+05 1.309e+04 11.089 < 2e-16 ***
## zipcode98003 -5.580e+04 1.387e+04 -4.024 5.74e-05 ***
## zipcode98198 -4.202e+04 1.382e+04 -3.040 0.002370 **
## zipcode98146 6.460e+04 1.370e+04 4.714 2.45e-06 ***
## zipcode98038 -1.268e+04 1.216e+04 -1.043 0.297161
## zipcode98007 2.016e+05 1.690e+04 11.933 < 2e-16 ***
## zipcode98115 2.898e+05 1.202e+04 24.102 < 2e-16 ***
## zipcode98107 3.095e+05 1.414e+04 21.890 < 2e-16 ***
## zipcode98126 1.583e+05 1.313e+04 12.063 < 2e-16 ***
## zipcode98019 4.584e+04 1.549e+04 2.960 0.003084 **
## zipcode98103 3.004e+05 1.209e+04 24.850 < 2e-16 ***
## zipcode98002 -1.429e+04 1.519e+04 -0.941 0.346577
## zipcode98133 1.272e+05 1.232e+04 10.326 < 2e-16 ***
## zipcode98040 4.700e+05 1.412e+04 33.285 < 2e-16 ***
## zipcode98092 -8.049e+04 1.332e+04 -6.044 1.52e-09 ***
## zipcode98030 -3.796e+04 1.422e+04 -2.670 0.007598 **
## zipcode98119 4.395e+05 1.566e+04 28.072 < 2e-16 ***
## zipcode98112 5.786e+05 1.427e+04 40.555 < 2e-16 ***
## zipcode98052 1.832e+05 1.219e+04 15.034 < 2e-16 ***
## zipcode98027 1.241e+05 1.291e+04 9.611 < 2e-16 ***
## zipcode98117 2.769e+05 1.214e+04 22.802 < 2e-16 ***
## zipcode98058 -1.494e+04 1.258e+04 -1.187 0.235173
## zipcode98001 -4.373e+04 1.311e+04 -3.335 0.000854 ***
## zipcode98056 5.351e+04 1.284e+04 4.168 3.09e-05 ***
## zipcode98166 1.566e+04 1.417e+04 1.105 0.269143
## zipcode98023 -7.646e+04 1.237e+04 -6.184 6.37e-10 ***
## zipcode98070 -5.017e+04 1.833e+04 -2.736 0.006217 **
## zipcode98148 2.885e+04 2.347e+04 1.229 0.219014
## zipcode98105 4.286e+05 1.470e+04 29.144 < 2e-16 ***
## zipcode98042 -3.875e+04 1.225e+04 -3.162 0.001569 **
## zipcode98008 2.079e+05 1.384e+04 15.023 < 2e-16 ***
## zipcode98059 3.939e+04 1.264e+04 3.116 0.001837 **
## zipcode98122 2.984e+05 1.389e+04 21.481 < 2e-16 ***
## zipcode98144 2.411e+05 1.326e+04 18.184 < 2e-16 ***
## zipcode98004 7.317e+05 1.368e+04 53.499 < 2e-16 ***
## zipcode98005 2.565e+05 1.610e+04 15.928 < 2e-16 ***
## zipcode98034 1.577e+05 1.214e+04 12.987 < 2e-16 ***
## zipcode98075 1.188e+05 1.346e+04 8.822 < 2e-16 ***
## zipcode98116 2.445e+05 1.340e+04 18.246 < 2e-16 ***
## zipcode98010 2.339e+04 1.905e+04 1.228 0.219631
## zipcode98118 1.304e+05 1.223e+04 10.663 < 2e-16 ***
## zipcode98199 3.490e+05 1.353e+04 25.796 < 2e-16 ***
## zipcode98032 -3.643e+04 1.749e+04 -2.083 0.037261 *
## zipcode98045 4.988e+04 1.480e+04 3.369 0.000755 ***
## zipcode98102 4.715e+05 1.877e+04 25.123 < 2e-16 ***
## zipcode98077 7.336e+04 1.551e+04 4.730 2.26e-06 ***
## zipcode98108 8.659e+04 1.541e+04 5.619 1.94e-08 ***
## zipcode98168 3.608e+04 1.395e+04 2.587 0.009693 **
## zipcode98177 1.702e+05 1.421e+04 11.980 < 2e-16 ***
## zipcode98065 3.907e+04 1.375e+04 2.842 0.004484 **
## zipcode98029 1.674e+05 1.360e+04 12.304 < 2e-16 ***
## zipcode98006 2.219e+05 1.259e+04 17.635 < 2e-16 ***
## zipcode98109 4.584e+05 1.847e+04 24.824 < 2e-16 ***
## zipcode98022 -5.148e+04 1.470e+04 -3.502 0.000463 ***
## zipcode98033 3.208e+05 1.271e+04 25.233 < 2e-16 ***
## zipcode98155 1.043e+05 1.251e+04 8.335 < 2e-16 ***
## zipcode98024 1.187e+05 2.071e+04 5.730 1.02e-08 ***
## zipcode98011 7.712e+04 1.529e+04 5.043 4.62e-07 ***
## zipcode98031 -2.838e+04 1.399e+04 -2.029 0.042490 *
## zipcode98106 1.081e+05 1.327e+04 8.149 3.86e-16 ***
## zipcode98072 1.073e+05 1.407e+04 7.631 2.43e-14 ***
## zipcode98188 -4.004e+03 1.696e+04 -0.236 0.813388
## zipcode98014 5.725e+04 1.785e+04 3.206 0.001346 **
## zipcode98055 4.549e+03 1.398e+04 0.325 0.744864
## zipcode98039 1.271e+06 2.507e+04 50.694 < 2e-16 ***
## sqft_living15 1.125e+01 2.884e+00 3.901 9.62e-05 ***
## sqft_lot15 -1.420e-01 6.024e-02 -2.357 0.018455 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 160200 on 21523 degrees of freedom
## Multiple R-squared: 0.8103, Adjusted R-squared: 0.8095
## F-statistic: 1033 on 89 and 21523 DF, p-value: < 2.2e-16
here from round 1 we can see that most of predictors are significant
also we see that the ares of basement has nothing to do for prediction , so i will remove it and run the analysis again
houses<-houses%>%select(-sqft_basement)
fit<-houses%>%with(lm(price~.,data=houses))
summary(fit)
##
## Call:
## lm(formula = price ~ ., data = houses)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1238733 -70188 -618 62402 4415643
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.143e+05 1.250e+05 7.316 2.64e-13 ***
## bedrooms -2.733e+04 1.528e+03 -17.887 < 2e-16 ***
## bathrooms 2.559e+04 2.647e+03 9.669 < 2e-16 ***
## sqft_living 1.272e+02 3.547e+00 35.857 < 2e-16 ***
## sqft_lot 2.433e-01 3.833e-02 6.347 2.23e-10 ***
## floors -4.887e+04 3.092e+03 -15.807 < 2e-16 ***
## waterfront1 5.634e+05 1.603e+04 35.149 < 2e-16 ***
## view3 1.529e+05 7.617e+03 20.070 < 2e-16 ***
## view4 3.373e+05 1.167e+04 28.913 < 2e-16 ***
## view2 6.829e+04 5.559e+03 12.284 < 2e-16 ***
## view1 7.658e+04 9.099e+03 8.416 < 2e-16 ***
## condition.L 6.642e+04 1.923e+04 3.453 0.000555 ***
## condition.Q 3.217e+02 1.626e+04 0.020 0.984212
## condition.C 3.137e+04 1.223e+04 2.564 0.010345 *
## condition^4 -1.478e+04 7.047e+03 -2.098 0.035950 *
## grade 5.754e+04 1.814e+03 31.716 < 2e-16 ***
## sqft_above 8.021e+01 3.596e+00 22.305 < 2e-16 ***
## yr_built -6.588e+02 6.588e+01 -10.000 < 2e-16 ***
## yr_renovated 1.764e+01 2.949e+00 5.980 2.27e-09 ***
## zipcode98125 1.596e+05 1.271e+04 12.563 < 2e-16 ***
## zipcode98028 7.666e+04 1.384e+04 5.540 3.06e-08 ***
## zipcode98136 2.054e+05 1.408e+04 14.594 < 2e-16 ***
## zipcode98074 1.252e+05 1.280e+04 9.781 < 2e-16 ***
## zipcode98053 1.451e+05 1.309e+04 11.089 < 2e-16 ***
## zipcode98003 -5.580e+04 1.387e+04 -4.024 5.74e-05 ***
## zipcode98198 -4.202e+04 1.382e+04 -3.040 0.002370 **
## zipcode98146 6.460e+04 1.370e+04 4.714 2.45e-06 ***
## zipcode98038 -1.268e+04 1.216e+04 -1.043 0.297161
## zipcode98007 2.016e+05 1.690e+04 11.933 < 2e-16 ***
## zipcode98115 2.898e+05 1.202e+04 24.102 < 2e-16 ***
## zipcode98107 3.095e+05 1.414e+04 21.890 < 2e-16 ***
## zipcode98126 1.583e+05 1.313e+04 12.063 < 2e-16 ***
## zipcode98019 4.584e+04 1.549e+04 2.960 0.003084 **
## zipcode98103 3.004e+05 1.209e+04 24.850 < 2e-16 ***
## zipcode98002 -1.429e+04 1.519e+04 -0.941 0.346577
## zipcode98133 1.272e+05 1.232e+04 10.326 < 2e-16 ***
## zipcode98040 4.700e+05 1.412e+04 33.285 < 2e-16 ***
## zipcode98092 -8.049e+04 1.332e+04 -6.044 1.52e-09 ***
## zipcode98030 -3.796e+04 1.422e+04 -2.670 0.007598 **
## zipcode98119 4.395e+05 1.566e+04 28.072 < 2e-16 ***
## zipcode98112 5.786e+05 1.427e+04 40.555 < 2e-16 ***
## zipcode98052 1.832e+05 1.219e+04 15.034 < 2e-16 ***
## zipcode98027 1.241e+05 1.291e+04 9.611 < 2e-16 ***
## zipcode98117 2.769e+05 1.214e+04 22.802 < 2e-16 ***
## zipcode98058 -1.494e+04 1.258e+04 -1.187 0.235173
## zipcode98001 -4.373e+04 1.311e+04 -3.335 0.000854 ***
## zipcode98056 5.351e+04 1.284e+04 4.168 3.09e-05 ***
## zipcode98166 1.566e+04 1.417e+04 1.105 0.269143
## zipcode98023 -7.646e+04 1.237e+04 -6.184 6.37e-10 ***
## zipcode98070 -5.017e+04 1.833e+04 -2.736 0.006217 **
## zipcode98148 2.885e+04 2.347e+04 1.229 0.219014
## zipcode98105 4.286e+05 1.470e+04 29.144 < 2e-16 ***
## zipcode98042 -3.875e+04 1.225e+04 -3.162 0.001569 **
## zipcode98008 2.079e+05 1.384e+04 15.023 < 2e-16 ***
## zipcode98059 3.939e+04 1.264e+04 3.116 0.001837 **
## zipcode98122 2.984e+05 1.389e+04 21.481 < 2e-16 ***
## zipcode98144 2.411e+05 1.326e+04 18.184 < 2e-16 ***
## zipcode98004 7.317e+05 1.368e+04 53.499 < 2e-16 ***
## zipcode98005 2.565e+05 1.610e+04 15.928 < 2e-16 ***
## zipcode98034 1.577e+05 1.214e+04 12.987 < 2e-16 ***
## zipcode98075 1.188e+05 1.346e+04 8.822 < 2e-16 ***
## zipcode98116 2.445e+05 1.340e+04 18.246 < 2e-16 ***
## zipcode98010 2.339e+04 1.905e+04 1.228 0.219631
## zipcode98118 1.304e+05 1.223e+04 10.663 < 2e-16 ***
## zipcode98199 3.490e+05 1.353e+04 25.796 < 2e-16 ***
## zipcode98032 -3.643e+04 1.749e+04 -2.083 0.037261 *
## zipcode98045 4.988e+04 1.480e+04 3.369 0.000755 ***
## zipcode98102 4.715e+05 1.877e+04 25.123 < 2e-16 ***
## zipcode98077 7.336e+04 1.551e+04 4.730 2.26e-06 ***
## zipcode98108 8.659e+04 1.541e+04 5.619 1.94e-08 ***
## zipcode98168 3.608e+04 1.395e+04 2.587 0.009693 **
## zipcode98177 1.702e+05 1.421e+04 11.980 < 2e-16 ***
## zipcode98065 3.907e+04 1.375e+04 2.842 0.004484 **
## zipcode98029 1.674e+05 1.360e+04 12.304 < 2e-16 ***
## zipcode98006 2.219e+05 1.259e+04 17.635 < 2e-16 ***
## zipcode98109 4.584e+05 1.847e+04 24.824 < 2e-16 ***
## zipcode98022 -5.148e+04 1.470e+04 -3.502 0.000463 ***
## zipcode98033 3.208e+05 1.271e+04 25.233 < 2e-16 ***
## zipcode98155 1.043e+05 1.251e+04 8.335 < 2e-16 ***
## zipcode98024 1.187e+05 2.071e+04 5.730 1.02e-08 ***
## zipcode98011 7.712e+04 1.529e+04 5.043 4.62e-07 ***
## zipcode98031 -2.838e+04 1.399e+04 -2.029 0.042490 *
## zipcode98106 1.081e+05 1.327e+04 8.149 3.86e-16 ***
## zipcode98072 1.073e+05 1.407e+04 7.631 2.43e-14 ***
## zipcode98188 -4.004e+03 1.696e+04 -0.236 0.813388
## zipcode98014 5.725e+04 1.785e+04 3.206 0.001346 **
## zipcode98055 4.549e+03 1.398e+04 0.325 0.744864
## zipcode98039 1.271e+06 2.507e+04 50.694 < 2e-16 ***
## sqft_living15 1.125e+01 2.884e+00 3.901 9.62e-05 ***
## sqft_lot15 -1.420e-01 6.024e-02 -2.357 0.018455 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 160200 on 21523 degrees of freedom
## Multiple R-squared: 0.8103, Adjusted R-squared: 0.8095
## F-statistic: 1033 on 89 and 21523 DF, p-value: < 2.2e-16
another thing to take care with is that zibcode has a lot of levels and we need to round it
according to residual i will try to round it to small numbers of areas using residuals
grouping<-houses%>%mutate(resid=resid(fit))%>%
group_by(zipcode)%>%summarize(medresid=median(resid),
cnt=n())%>%arrange(medresid)%>%
mutate(cumcnt=cumsum(cnt),
group=ntile(cumcnt,5))
houses<-houses%>%left_join(grouping[,c("zipcode","group")])%>%mutate(group=factor(group))
now remove the zip from the data and run fit again
houses<-houses%>%select(-zipcode)
fit<-houses%>%with(lm(price~.,data=houses))
summary(fit)
##
## Call:
## lm(formula = price ~ ., data = houses)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1442272 -96219 -2491 82020 4346063
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.666e+06 1.240e+05 29.576 < 2e-16 ***
## bedrooms -3.549e+04 1.766e+03 -20.102 < 2e-16 ***
## bathrooms 3.544e+04 3.077e+03 11.518 < 2e-16 ***
## sqft_living 1.445e+02 4.084e+00 35.384 < 2e-16 ***
## sqft_lot 1.418e-01 4.464e-02 3.175 0.001499 **
## floors -2.230e+04 3.343e+03 -6.672 2.58e-11 ***
## waterfront1 5.423e+05 1.840e+04 29.474 < 2e-16 ***
## view3 1.248e+05 8.777e+03 14.213 < 2e-16 ***
## view4 3.287e+05 1.347e+04 24.399 < 2e-16 ***
## view2 5.990e+04 6.399e+03 9.361 < 2e-16 ***
## view1 1.073e+05 1.053e+04 10.192 < 2e-16 ***
## condition.L 4.642e+04 2.247e+04 2.066 0.038845 *
## condition.Q -2.564e+03 1.901e+04 -0.135 0.892712
## condition.C 2.743e+04 1.430e+04 1.918 0.055123 .
## condition^4 -8.245e+03 8.226e+03 -1.002 0.316153
## grade 7.973e+04 2.021e+03 39.455 < 2e-16 ***
## sqft_above 5.174e+01 3.999e+00 12.939 < 2e-16 ***
## yr_built -1.951e+03 6.585e+01 -29.625 < 2e-16 ***
## yr_renovated 1.237e+01 3.416e+00 3.620 0.000295 ***
## sqft_living15 3.527e+00 3.140e+00 1.123 0.261383
## sqft_lot15 -2.867e-01 6.825e-02 -4.200 2.68e-05 ***
## group2 -1.898e+05 4.245e+03 -44.714 < 2e-16 ***
## group3 -2.847e+05 4.573e+03 -62.261 < 2e-16 ***
## group4 -2.700e+05 4.470e+03 -60.399 < 2e-16 ***
## group5 -3.798e+05 4.807e+03 -79.017 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 187800 on 21588 degrees of freedom
## Multiple R-squared: 0.7387, Adjusted R-squared: 0.7384
## F-statistic: 2542 on 24 and 21588 DF, p-value: < 2.2e-16
most coefficients here are beautiful
now what about outlier!!
#using mahalanobis distance for multivariate outliers
num2<-houses%>%select(price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_living15,sqft_lot15)
mah<-mahalanobis(x = num2,center = colMeans(num2),cov = cov(num2))
cutmahup<-qchisq(.99,ncol(num2))
cutmahdown<-qchisq(.01,ncol(num2))
outidxup<-mah>cutmahup%>%as.numeric()
outidxdown<-mah<cutmahdown%>%as.numeric()
table(outidxup)
## outidxup
## FALSE TRUE
## 20489 1124
table(outidxdown)
## outidxdown
## FALSE TRUE
## 20434 1179
next testing for influential points
inf<-hatvalues(fit)
cutinf<-(2*(ncol(houses)-1)+1)/nrow(houses)
table(inf>cutinf)
##
## FALSE TRUE
## 17832 3781
horrible !!
infidx<-inf>cutinf%>%as.numeric()
testing cooks distance
cook<-cooks.distance(fit)
cutcook<-4/(nrow(houses)-(ncol(houses)-1)-1)
table(cook>cutcook)
##
## FALSE TRUE
## 20450 1163
cookidx<-cook>cutcook%>%as.numeric()
testing for variance inflation
vif<-vif(fit)
vif
## GVIF Df GVIF^(1/(2*Df))
## bedrooms 1.652718 1 1.285581
## bathrooms 3.442414 1 1.855374
## sqft_living 8.620412 1 2.936054
## sqft_lot 2.095620 1 1.447626
## floors 2.123134 1 1.457098
## waterfront 1.552769 1 1.246101
## view 1.852788 4 1.080135
## condition 1.325168 4 1.035819
## grade 3.457940 1 1.859554
## sqft_above 6.719385 1 2.592178
## yr_built 2.292582 1 1.514128
## yr_renovated 1.153622 1 1.074068
## sqft_living15 2.839157 1 1.684980
## sqft_lot15 2.128469 1 1.458927
## group 1.313838 4 1.034708
good news , we don’t have here any vif value over 10 and this indicate absence of multicolinearity
testing auto correlation
auto<-durbinWatsonTest(fit,simulate = T,reps = 1000,method = "resample")
auto
## lag Autocorrelation D-W Statistic p-value
## 1 0.01596825 1.968049 0.024
## Alternative hypothesis: rho != 0
great , p value>.01 indicate that there is no auto correlation
Now lets see the most bad records in our data
allbad<-outidxup+outidxdown+infidx+cookidx
table(allbad)
## allbad
## 0 1 2 3
## 16491 3473 1173 476
i will delete all records which break 2 rules
bad<-allbad>=2
table(bad)
## bad
## FALSE TRUE
## 19964 1649
houses<-houses%>%anti_join(houses[bad,])
running the fit again
fit<-houses%>%with(lm(price~.,data=houses))
summary(fit)
##
## Call:
## lm(formula = price ~ ., data = houses)
##
## Residuals:
## Min 1Q Median 3Q Max
## -523005 -78395 -4085 68184 929903
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.282e+06 8.637e+04 37.999 < 2e-16 ***
## bedrooms -1.981e+04 1.334e+03 -14.845 < 2e-16 ***
## bathrooms 2.669e+04 2.234e+03 11.951 < 2e-16 ***
## sqft_living 1.027e+02 3.129e+00 32.826 < 2e-16 ***
## sqft_lot 4.835e-01 1.229e-01 3.934 8.37e-05 ***
## floors 4.338e+03 2.362e+03 1.836 0.0664 .
## waterfront1 5.293e+05 3.527e+04 15.006 < 2e-16 ***
## view3 1.195e+05 7.224e+03 16.547 < 2e-16 ***
## view4 3.497e+05 1.561e+04 22.400 < 2e-16 ***
## view2 6.127e+04 4.665e+03 13.135 < 2e-16 ***
## view1 1.020e+05 8.631e+03 11.813 < 2e-16 ***
## condition.L 5.875e+04 3.487e+04 1.685 0.0921 .
## condition.Q 3.234e+02 2.947e+04 0.011 0.9912
## condition.C 1.060e+04 1.884e+04 0.563 0.5735
## condition^4 3.950e+03 8.677e+03 0.455 0.6489
## grade 7.506e+04 1.428e+03 52.582 < 2e-16 ***
## sqft_above 2.623e+01 2.954e+00 8.880 < 2e-16 ***
## yr_built -1.760e+03 4.563e+01 -38.576 < 2e-16 ***
## yr_renovated 1.293e+01 2.527e+00 5.118 3.11e-07 ***
## sqft_living15 2.192e+01 2.424e+00 9.045 < 2e-16 ***
## sqft_lot15 -6.147e-01 1.506e-01 -4.082 4.48e-05 ***
## group2 -1.369e+05 2.930e+03 -46.710 < 2e-16 ***
## group3 -2.334e+05 3.141e+03 -74.298 < 2e-16 ***
## group4 -2.113e+05 3.097e+03 -68.238 < 2e-16 ***
## group5 -3.185e+05 3.334e+03 -95.529 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 122200 on 19939 degrees of freedom
## Multiple R-squared: 0.7602, Adjusted R-squared: 0.7599
## F-statistic: 2634 on 24 and 19939 DF, p-value: < 2.2e-16
plot(fit,3)
we see here that we have some degree of heteroscedasticity
checking the normality
qqPlot(fit)
## [1] 5718 18796
OK , the qqplot shows that the data has a degree of skewness
and to prove it lets see histogram
hist(rstudent(fit))
some right skew but the data still good
the only issue we have to handle is heteroscedasticity
now i will check partial residual to know the problematic variable
resid<-resid(fit)
term<-predict(fit,houses,type = "terms")%>%data.frame()
partial.resid<-resid+term
houses.org<-houses
names(houses.org)<-paste("org",names(houses),sep = ".")
names(partial.resid)<-paste("part",names(partial.resid),sep = ".")
names(term)<-paste("term",names(term),sep = ".")
allpartial<-cbind(houses.org[,-1],partial.resid,term)
allpartial%>%ggplot(aes(org.sqft_living,part.sqft_living))+
geom_point()+
geom_smooth()+
geom_line(aes(x = org.sqft_living,y=term.sqft_living),color="red",lwd=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
allpartial%>%ggplot(aes(org.sqft_lot,part.sqft_lot))+
geom_point()+
geom_smooth()+
geom_line(aes(x = org.sqft_lot,y=term.sqft_lot),color="red",lwd=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
allpartial%>%ggplot(aes(org.sqft_above,part.sqft_above))+
geom_point()+
geom_smooth()+
geom_line(aes(x = org.sqft_above,y=term.sqft_above),color="red",lwd=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
allpartial%>%ggplot(aes(org.sqft_living15,part.sqft_living15))+
geom_point()+
geom_smooth()+
geom_line(aes(x = org.sqft_living15,y=term.sqft_living15),color="red",lwd=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
allpartial%>%ggplot(aes(org.sqft_lot15,part.sqft_lot15))+
geom_point()+
geom_smooth()+
geom_line(aes(x = org.sqft_lot15,y=term.sqft_lot15),color="red",lwd=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
allpartial%>%ggplot(aes(org.yr_built,part.yr_built))+
geom_point()+
geom_smooth()+
geom_line(aes(x = org.yr_built,y=term.yr_built),color="red",lwd=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
allpartial%>%ggplot(aes(org.yr_renovated,part.yr_renovated))+
geom_point()+
geom_smooth()+
geom_line(aes(x = org.yr_renovated,y=term.yr_renovated),color="red",lwd=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
here we can find that the most problems associated with square foot of lot and the same variable in 2015 and the year of reonivation
i will categorize the renovation years variable into 4 categories according to residuals
houses$yr_renovated%>%table()%>%kable("html")
| . | Freq |
|---|---|
| 0 | 19281 |
| 1940 | 2 |
| 1944 | 1 |
| 1945 | 3 |
| 1946 | 2 |
| 1948 | 1 |
| 1950 | 2 |
| 1951 | 1 |
| 1953 | 3 |
| 1954 | 1 |
| 1955 | 3 |
| 1956 | 1 |
| 1957 | 3 |
| 1958 | 5 |
| 1959 | 1 |
| 1960 | 3 |
| 1962 | 1 |
| 1963 | 3 |
| 1964 | 5 |
| 1965 | 3 |
| 1967 | 2 |
| 1968 | 6 |
| 1969 | 3 |
| 1970 | 8 |
| 1971 | 2 |
| 1972 | 2 |
| 1973 | 5 |
| 1974 | 3 |
| 1975 | 6 |
| 1976 | 2 |
| 1977 | 7 |
| 1978 | 4 |
| 1979 | 7 |
| 1980 | 9 |
| 1981 | 3 |
| 1982 | 7 |
| 1983 | 14 |
| 1984 | 15 |
| 1985 | 12 |
| 1986 | 8 |
| 1987 | 12 |
| 1988 | 12 |
| 1989 | 17 |
| 1990 | 20 |
| 1991 | 13 |
| 1992 | 12 |
| 1993 | 15 |
| 1994 | 13 |
| 1995 | 13 |
| 1996 | 9 |
| 1997 | 11 |
| 1998 | 13 |
| 1999 | 9 |
| 2000 | 21 |
| 2001 | 15 |
| 2002 | 12 |
| 2003 | 25 |
| 2004 | 20 |
| 2005 | 27 |
| 2006 | 18 |
| 2007 | 26 |
| 2008 | 9 |
| 2009 | 19 |
| 2010 | 15 |
| 2011 | 12 |
| 2012 | 11 |
| 2013 | 28 |
| 2014 | 78 |
| 2015 | 9 |
stdresid<-rstudent(fit)
catigor<-houses%>%mutate(stdresid)%>%
group_by(yr_renovated)%>%
summarize(medstd=median(stdresid),cnt=n())%>%
arrange(medstd)%>%
mutate(cumcnt=cumsum(cnt),renovC=ntile(cumcnt,4))
catigor%>%kable("markdown")
| yr_renovated | medstd | cnt | cumcnt | renovC |
|---|---|---|---|---|
| 1967 | -1.3146319 | 2 | 2 | 1 |
| 1976 | -1.1681682 | 2 | 4 | 1 |
| 1963 | -1.1644635 | 3 | 7 | 1 |
| 1969 | -1.1364518 | 3 | 10 | 1 |
| 1964 | -1.1031153 | 5 | 15 | 1 |
| 1970 | -1.0855883 | 8 | 23 | 1 |
| 1996 | -1.0752276 | 9 | 32 | 1 |
| 1953 | -1.0183430 | 3 | 35 | 1 |
| 1945 | -0.9554270 | 3 | 38 | 1 |
| 1960 | -0.8671719 | 3 | 41 | 1 |
| 1959 | -0.6575658 | 1 | 42 | 1 |
| 1950 | -0.6536802 | 2 | 44 | 1 |
| 1978 | -0.5931954 | 4 | 48 | 1 |
| 1982 | -0.5847983 | 7 | 55 | 1 |
| 1973 | -0.5711407 | 5 | 60 | 1 |
| 1971 | -0.5155870 | 2 | 62 | 1 |
| 1981 | -0.4807747 | 3 | 65 | 1 |
| 1957 | -0.4715717 | 3 | 68 | 1 |
| 1995 | -0.4594498 | 13 | 81 | 2 |
| 1983 | -0.4412839 | 14 | 95 | 2 |
| 1977 | -0.4311997 | 7 | 102 | 2 |
| 1984 | -0.4227199 | 15 | 117 | 2 |
| 1997 | -0.4130501 | 11 | 128 | 2 |
| 1968 | -0.4060072 | 6 | 134 | 2 |
| 1974 | -0.3795095 | 3 | 137 | 2 |
| 1946 | -0.3785293 | 2 | 139 | 2 |
| 1940 | -0.3263868 | 2 | 141 | 2 |
| 1994 | -0.3208032 | 13 | 154 | 2 |
| 1993 | -0.2803082 | 15 | 169 | 2 |
| 1988 | -0.2740405 | 12 | 181 | 2 |
| 1948 | -0.2578666 | 1 | 182 | 2 |
| 1991 | -0.2541159 | 13 | 195 | 2 |
| 1992 | -0.2498357 | 12 | 207 | 2 |
| 1989 | -0.2221852 | 17 | 224 | 2 |
| 1958 | -0.2079038 | 5 | 229 | 2 |
| 2001 | -0.2000301 | 15 | 244 | 3 |
| 1987 | -0.1873965 | 12 | 256 | 3 |
| 2000 | -0.1789553 | 21 | 277 | 3 |
| 1990 | -0.1543012 | 20 | 297 | 3 |
| 1980 | -0.1477219 | 9 | 306 | 3 |
| 1972 | -0.1006371 | 2 | 308 | 3 |
| 1965 | -0.0930242 | 3 | 311 | 3 |
| 2013 | -0.0892279 | 28 | 339 | 3 |
| 1955 | -0.0803034 | 3 | 342 | 3 |
| 1986 | -0.0612799 | 8 | 350 | 3 |
| 2012 | -0.0611215 | 11 | 361 | 3 |
| 1985 | -0.0536939 | 12 | 373 | 3 |
| 0 | -0.0324502 | 19281 | 19654 | 3 |
| 1975 | 0.0167476 | 6 | 19660 | 3 |
| 2015 | 0.0227493 | 9 | 19669 | 3 |
| 2009 | 0.0288613 | 19 | 19688 | 3 |
| 2011 | 0.0822008 | 12 | 19700 | 3 |
| 1998 | 0.0945444 | 13 | 19713 | 4 |
| 2007 | 0.1200742 | 26 | 19739 | 4 |
| 1979 | 0.1245174 | 7 | 19746 | 4 |
| 2004 | 0.2679457 | 20 | 19766 | 4 |
| 2006 | 0.2908748 | 18 | 19784 | 4 |
| 2003 | 0.2965783 | 25 | 19809 | 4 |
| 1944 | 0.3629567 | 1 | 19810 | 4 |
| 1951 | 0.4672665 | 1 | 19811 | 4 |
| 2014 | 0.4996164 | 78 | 19889 | 4 |
| 2005 | 0.5223866 | 27 | 19916 | 4 |
| 2010 | 0.5686400 | 15 | 19931 | 4 |
| 2002 | 0.6547362 | 12 | 19943 | 4 |
| 1956 | 0.8582450 | 1 | 19944 | 4 |
| 1999 | 0.9077240 | 9 | 19953 | 4 |
| 2008 | 1.0379758 | 9 | 19962 | 4 |
| 1962 | 1.2237277 | 1 | 19963 | 4 |
| 1954 | 2.1236397 | 1 | 19964 | 4 |
catigor<-catigor%>%select(yr_renovated,renovC)
houses<-(houses)%>%left_join(catigor)%>%mutate(renovC=factor(renovC))
## Joining, by = "yr_renovated"
houses<-houses%>%select(-yr_renovated)
houses%>%head%>%kable("markdown")
| price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | yr_built | sqft_living15 | sqft_lot15 | group | renovC |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 221900 | 3 | 1.00 | 1180 | 5650 | 1 | 0 | 0 | 3 | 7 | 1180 | 1955 | 1340 | 5650 | 5 | 3 |
| 538000 | 3 | 2.25 | 2570 | 7242 | 2 | 0 | 0 | 3 | 7 | 2170 | 1951 | 1690 | 7639 | 4 | 2 |
| 180000 | 2 | 1.00 | 770 | 10000 | 1 | 0 | 0 | 3 | 6 | 770 | 1933 | 2720 | 8062 | 4 | 3 |
| 604000 | 4 | 3.00 | 1960 | 5000 | 1 | 0 | 0 | 5 | 7 | 1050 | 1965 | 1360 | 5000 | 4 | 3 |
| 510000 | 3 | 2.00 | 1680 | 8080 | 1 | 0 | 0 | 3 | 8 | 1680 | 1987 | 1800 | 7503 | 4 | 3 |
| 257500 | 3 | 2.25 | 1715 | 6819 | 2 | 0 | 0 | 3 | 7 | 1715 | 1995 | 2238 | 6819 | 5 | 3 |
table(houses$renovC)
##
## 1 2 3 4
## 68 161 19471 264
recalculating the fit
fit<-houses%>%with(lm(price~.,data=houses))
summary(fit)
##
## Call:
## lm(formula = price ~ ., data = houses)
##
## Residuals:
## Min 1Q Median 3Q Max
## -523310 -77900 -4030 67725 930210
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.239e+06 8.498e+04 38.117 < 2e-16 ***
## bedrooms -1.986e+04 1.332e+03 -14.911 < 2e-16 ***
## bathrooms 2.627e+04 2.226e+03 11.801 < 2e-16 ***
## sqft_living 1.028e+02 3.123e+00 32.907 < 2e-16 ***
## sqft_lot 4.764e-01 1.227e-01 3.884 0.000103 ***
## floors 5.000e+03 2.358e+03 2.120 0.034018 *
## waterfront1 5.332e+05 3.522e+04 15.140 < 2e-16 ***
## view3 1.200e+05 7.211e+03 16.637 < 2e-16 ***
## view4 3.492e+05 1.559e+04 22.398 < 2e-16 ***
## view2 6.184e+04 4.657e+03 13.277 < 2e-16 ***
## view1 1.023e+05 8.615e+03 11.877 < 2e-16 ***
## condition.L 5.938e+04 3.481e+04 1.706 0.088056 .
## condition.Q -5.216e+02 2.942e+04 -0.018 0.985853
## condition.C 1.148e+04 1.881e+04 0.611 0.541475
## condition^4 3.053e+03 8.662e+03 0.352 0.724501
## grade 7.469e+04 1.426e+03 52.397 < 2e-16 ***
## sqft_above 2.671e+01 2.949e+00 9.056 < 2e-16 ***
## yr_built -1.772e+03 4.501e+01 -39.360 < 2e-16 ***
## sqft_living15 2.181e+01 2.417e+00 9.024 < 2e-16 ***
## sqft_lot15 -5.939e-01 1.503e-01 -3.950 7.83e-05 ***
## group2 -1.367e+05 2.925e+03 -46.718 < 2e-16 ***
## group3 -2.332e+05 3.136e+03 -74.364 < 2e-16 ***
## group4 -2.107e+05 3.092e+03 -68.141 < 2e-16 ***
## group5 -3.184e+05 3.328e+03 -95.665 < 2e-16 ***
## renovC2 6.885e+04 1.767e+04 3.897 9.79e-05 ***
## renovC3 6.777e+04 1.493e+04 4.541 5.64e-06 ***
## renovC4 1.366e+05 1.664e+04 8.204 2.46e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 122000 on 19937 degrees of freedom
## Multiple R-squared: 0.7611, Adjusted R-squared: 0.7608
## F-statistic: 2443 on 26 and 19937 DF, p-value: < 2.2e-16
Good , all categories are significant
now lets try the weighted regression and start with calculating the weights
residl<-resid(fit)%>%abs()
fited<-fitted(fit)
residfit<-lm(residl~fited)
fittresid<-fitted(residfit)
wts<-1/fittresid^2
wtsfit<-houses%>%with(lm(price~.,data=houses,weights = wts))
summary(wtsfit)
##
## Call:
## lm(formula = price ~ ., data = houses, weights = wts)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -4.0657 -0.8911 -0.0765 0.7666 13.6759
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.780e+06 7.615e+04 36.506 < 2e-16 ***
## bedrooms -1.943e+04 1.158e+03 -16.779 < 2e-16 ***
## bathrooms 1.874e+04 1.974e+03 9.495 < 2e-16 ***
## sqft_living 9.526e+01 2.969e+00 32.083 < 2e-16 ***
## sqft_lot 6.726e-01 1.059e-01 6.349 2.22e-10 ***
## floors 1.150e+04 2.101e+03 5.473 4.47e-08 ***
## waterfront1 5.600e+05 7.409e+04 7.558 4.27e-14 ***
## view3 1.071e+05 8.500e+03 12.597 < 2e-16 ***
## view4 3.550e+05 2.536e+04 13.999 < 2e-16 ***
## view2 5.989e+04 4.929e+03 12.149 < 2e-16 ***
## view1 9.405e+04 9.276e+03 10.140 < 2e-16 ***
## condition.L 8.421e+04 1.866e+04 4.512 6.47e-06 ***
## condition.Q -1.985e+04 1.577e+04 -1.259 0.208127
## condition.C 1.014e+04 1.036e+04 0.979 0.327814
## condition^4 8.627e+03 5.048e+03 1.709 0.087436 .
## grade 5.794e+04 1.244e+03 46.559 < 2e-16 ***
## sqft_above 1.802e+01 2.819e+00 6.392 1.67e-10 ***
## yr_built -1.481e+03 4.079e+01 -36.304 < 2e-16 ***
## sqft_living15 3.203e+01 2.260e+00 14.173 < 2e-16 ***
## sqft_lot15 -4.333e-01 1.249e-01 -3.471 0.000520 ***
## group2 -1.205e+05 3.246e+03 -37.123 < 2e-16 ***
## group3 -2.220e+05 3.243e+03 -68.442 < 2e-16 ***
## group4 -1.953e+05 3.262e+03 -59.878 < 2e-16 ***
## group5 -2.901e+05 3.290e+03 -88.173 < 2e-16 ***
## renovC2 5.253e+04 1.454e+04 3.613 0.000304 ***
## renovC3 6.112e+04 1.166e+04 5.244 1.59e-07 ***
## renovC4 1.237e+05 1.393e+04 8.878 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.26 on 19937 degrees of freedom
## Multiple R-squared: 0.7181, Adjusted R-squared: 0.7177
## F-statistic: 1953 on 26 and 19937 DF, p-value: < 2.2e-16
plot(wtsfit,3,main = "weighted regression")
plot(fit,3,main = "Ordinary regression")
we see here an improve in the linearity and heteroscedasticity
comparing the standardized coefficients of OLS model and weighted model
data.frame(cbind(lm.beta(fit)$standardized.coefficients,lm.beta(wtsfit)$standardized.coefficients))%>%kable(format = "markdown",digits = 3,col.names = c("Ordinary","Weighted"))
| Ordinary | Weighted | |
|---|---|---|
| (Intercept) | 0.000 | 0.000 |
| bedrooms | -0.068 | -0.067 |
| bathrooms | 0.074 | 0.053 |
| sqft_living | 0.318 | 0.294 |
| sqft_lot | 0.025 | 0.035 |
| floors | 0.011 | 0.026 |
| waterfront1 | 0.057 | 0.059 |
| view3 | 0.059 | 0.053 |
| view4 | 0.085 | 0.086 |
| view2 | 0.047 | 0.046 |
| view1 | 0.041 | 0.038 |
| condition.L | 0.048 | 0.068 |
| condition.Q | -0.001 | -0.023 |
| condition.C | 0.014 | 0.013 |
| condition^4 | 0.006 | 0.018 |
| grade | 0.320 | 0.248 |
| sqft_above | 0.078 | 0.053 |
| yr_built | -0.208 | -0.173 |
| sqft_living15 | 0.055 | 0.081 |
| sqft_lot15 | -0.025 | -0.018 |
| group2 | -0.236 | -0.208 |
| group3 | -0.377 | -0.359 |
| group4 | -0.348 | -0.322 |
| group5 | -0.497 | -0.453 |
| renovC2 | 0.025 | 0.019 |
| renovC3 | 0.042 | 0.038 |
| renovC4 | 0.063 | 0.057 |
The Coefficients are so close to each other
Now lets see the ability of our model to predict out sample data
houses<-cbind(houses,wts)
set.seed(4785)
trainidx<-createDataPartition(y = houses$price,p = .75,list = F)
traindata<-houses[trainidx,]
testdata<-houses[-trainidx,]
running the model
set.seed(3454)
trainfit<-traindata%>%with(lm(price~.,data = traindata[,-length(names(traindata))],weights = traindata$wts))
summary(trainfit)
##
## Call:
## lm(formula = price ~ ., data = traindata[, -length(names(traindata))],
## weights = traindata$wts)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -4.1019 -0.8939 -0.0758 0.7716 13.7050
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.807e+06 8.800e+04 31.900 < 2e-16 ***
## bedrooms -1.950e+04 1.336e+03 -14.597 < 2e-16 ***
## bathrooms 1.838e+04 2.272e+03 8.090 6.40e-16 ***
## sqft_living 9.243e+01 3.417e+00 27.049 < 2e-16 ***
## sqft_lot 7.483e-01 1.235e-01 6.059 1.40e-09 ***
## floors 1.257e+04 2.433e+03 5.165 2.43e-07 ***
## waterfront1 5.638e+05 7.454e+04 7.564 4.13e-14 ***
## view3 1.107e+05 9.718e+03 11.394 < 2e-16 ***
## view4 3.500e+05 2.768e+04 12.643 < 2e-16 ***
## view2 6.173e+04 5.685e+03 10.859 < 2e-16 ***
## view1 9.688e+04 1.020e+04 9.502 < 2e-16 ***
## condition.L 9.235e+04 2.330e+04 3.964 7.41e-05 ***
## condition.Q -2.660e+04 1.968e+04 -1.352 0.17643
## condition.C 1.896e+04 1.275e+04 1.487 0.13693
## condition^4 4.314e+03 6.060e+03 0.712 0.47647
## grade 5.897e+04 1.439e+03 40.973 < 2e-16 ***
## sqft_above 2.151e+01 3.234e+00 6.651 3.00e-11 ***
## yr_built -1.498e+03 4.712e+01 -31.799 < 2e-16 ***
## sqft_living15 3.170e+01 2.624e+00 12.084 < 2e-16 ***
## sqft_lot15 -5.765e-01 1.480e-01 -3.896 9.84e-05 ***
## group2 -1.225e+05 3.767e+03 -32.536 < 2e-16 ***
## group3 -2.252e+05 3.761e+03 -59.880 < 2e-16 ***
## group4 -1.974e+05 3.780e+03 -52.208 < 2e-16 ***
## group5 -2.945e+05 3.816e+03 -77.169 < 2e-16 ***
## renovC2 5.127e+04 1.678e+04 3.055 0.00226 **
## renovC3 6.332e+04 1.342e+04 4.718 2.41e-06 ***
## renovC4 1.256e+05 1.635e+04 7.684 1.64e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.259 on 14947 degrees of freedom
## Multiple R-squared: 0.7204, Adjusted R-squared: 0.7199
## F-statistic: 1481 on 26 and 14947 DF, p-value: < 2.2e-16
predicted<-predict(trainfit,testdata)
cbind(RMSE=RMSE(pred = predicted,obs = testdata$price),R2=R2(pred = predicted,obs = testdata$price))%>%kable("markdown")
| RMSE | R2 |
|---|---|
| 124517.3 | 0.7519751 |