the aim of this analysis to use weighted regression to predict the houses prices

plz feel free to contact me for any advice or recommendations: vet.m.mohamed@gmail.com

find the data here

firstly lets load the libraries we are going to use

library(tidyverse)
library(caret)
library(mice)
library(car)
library(lm.beta)
library(knitr)

Next lets import the data

houses<-read_csv(file = "./data sets/kc_house_data.csv",col_types = "cTninnnifffinnddf??nn")

head(houses)%>%kable("markdown")
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
7129300520 2014-10-13 221900 3 1.00 1180 5650 1 0 0 3 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
6414100192 2014-12-09 538000 3 2.25 2570 7242 2 0 0 3 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
5631500400 2015-02-25 180000 2 1.00 770 10000 1 0 0 3 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
2487200875 2014-12-09 604000 4 3.00 1960 5000 1 0 0 5 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
1954400510 2015-02-18 510000 3 2.00 1680 8080 1 0 0 3 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503
7237550310 2014-05-12 1225000 4 4.50 5420 101930 1 0 0 3 11 3890 1530 2001 0 98053 47.6561 -122.005 4760 101930

checking the structure of the data

str(houses)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 21613 obs. of  21 variables:
##  $ id           : chr  "7129300520" "6414100192" "5631500400" "2487200875" ...
##  $ date         : POSIXct, format: "2014-10-13" "2014-12-09" ...
##  $ price        : num  221900 538000 180000 604000 510000 ...
##  $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : num  1180 2570 770 1960 1680 ...
##  $ sqft_lot     : num  5650 7242 10000 5000 8080 ...
##  $ floors       : int  1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ view         : Factor w/ 5 levels "0","3","4","2",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ condition    : Factor w/ 5 levels "3","5","4","1",..: 1 1 1 2 1 1 1 1 1 1 ...
##  $ grade        : int  7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : num  1180 2170 770 1050 1680 ...
##  $ sqft_basement: num  0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : num  1955 1951 1933 1965 1987 ...
##  $ yr_renovated : num  0 1991 0 0 0 ...
##  $ zipcode      : Factor w/ 70 levels "98178","98125",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ lat          : num  47.5 47.7 47.7 47.5 47.6 ...
##  $ long         : num  -122 -122 -122 -122 -122 ...
##  $ sqft_living15: num  1340 1690 2720 1360 1800 ...
##  $ sqft_lot15   : num  5650 7639 8062 5000 7503 ...
##  - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 2079 obs. of  5 variables:
##   ..$ row     : int  13 15 18 26 28 33 34 36 55 66 ...
##   ..$ col     : chr  "floors" "floors" "floors" "floors" ...
##   ..$ expected: chr  "no trailing characters" "no trailing characters" "no trailing characters" "no trailing characters" ...
##   ..$ actual  : chr  ".5" ".5" ".5" ".5" ...
##   ..$ file    : chr  "'./data sets/kc_house_data.csv'" "'./data sets/kc_house_data.csv'" "'./data sets/kc_house_data.csv'" "'./data sets/kc_house_data.csv'" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_character(),
##   ..   date = col_datetime(format = ""),
##   ..   price = col_number(),
##   ..   bedrooms = col_integer(),
##   ..   bathrooms = col_number(),
##   ..   sqft_living = col_number(),
##   ..   sqft_lot = col_number(),
##   ..   floors = col_integer(),
##   ..   waterfront = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   view = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   condition = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   grade = col_integer(),
##   ..   sqft_above = col_number(),
##   ..   sqft_basement = col_number(),
##   ..   yr_built = col_double(),
##   ..   yr_renovated = col_double(),
##   ..   zipcode = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   lat = col_double(),
##   ..   long = col_double(),
##   ..   sqft_living15 = col_number(),
##   ..   sqft_lot15 = col_number()
##   .. )

removing id lat long date

houses<-houses%>%select(-c(id,date,lat,long))

need to check the condition if it need to be ordered

unique(houses$condition)
## [1] 3 5 4 1 2
## Levels: 3 5 4 1 2

i think that is preferable if it is ordered

houses<-houses%>%mutate(condition=factor(condition,levels = c("1","2","3","4","5"),labels = c("1","2","3","4","5"),ordered = T))

checking the structure again

str(houses)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 21613 obs. of  17 variables:
##  $ price        : num  221900 538000 180000 604000 510000 ...
##  $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : num  1180 2570 770 1960 1680 ...
##  $ sqft_lot     : num  5650 7242 10000 5000 8080 ...
##  $ floors       : int  1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ view         : Factor w/ 5 levels "0","3","4","2",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ condition    : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : int  7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : num  1180 2170 770 1050 1680 ...
##  $ sqft_basement: num  0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : num  1955 1951 1933 1965 1987 ...
##  $ yr_renovated : num  0 1991 0 0 0 ...
##  $ zipcode      : Factor w/ 70 levels "98178","98125",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ sqft_living15: num  1340 1690 2720 1360 1800 ...
##  $ sqft_lot15   : num  5650 7639 8062 5000 7503 ...

now we want to test some assumption starting with is the missing values , what about it ?

summary(houses)
##      price            bedrooms        bathrooms      sqft_living   
##  Min.   :  75000   Min.   : 0.000   Min.   :0.000   Min.   :  290  
##  1st Qu.: 321950   1st Qu.: 3.000   1st Qu.:1.750   1st Qu.: 1427  
##  Median : 450000   Median : 3.000   Median :2.250   Median : 1910  
##  Mean   : 540088   Mean   : 3.371   Mean   :2.115   Mean   : 2080  
##  3rd Qu.: 645000   3rd Qu.: 4.000   3rd Qu.:2.500   3rd Qu.: 2550  
##  Max.   :7700000   Max.   :33.000   Max.   :8.000   Max.   :13540  
##                                                                    
##     sqft_lot           floors      waterfront view      condition
##  Min.   :    520   Min.   :1.000   0:21450    0:19489   1:   30  
##  1st Qu.:   5040   1st Qu.:1.000   1:  163    3:  510   2:  172  
##  Median :   7618   Median :1.000              4:  319   3:14031  
##  Mean   :  15107   Mean   :1.485              2:  963   4: 5679  
##  3rd Qu.:  10688   3rd Qu.:2.000              1:  332   5: 1701  
##  Max.   :1651359   Max.   :3.000                                 
##                    NA's   :2079                                  
##      grade          sqft_above   sqft_basement       yr_built   
##  Min.   : 1.000   Min.   : 290   Min.   :   0.0   Min.   :1900  
##  1st Qu.: 7.000   1st Qu.:1190   1st Qu.:   0.0   1st Qu.:1951  
##  Median : 7.000   Median :1560   Median :   0.0   Median :1975  
##  Mean   : 7.657   Mean   :1788   Mean   : 291.5   Mean   :1971  
##  3rd Qu.: 8.000   3rd Qu.:2210   3rd Qu.: 560.0   3rd Qu.:1997  
##  Max.   :13.000   Max.   :9410   Max.   :4820.0   Max.   :2015  
##                                                                 
##   yr_renovated       zipcode      sqft_living15    sqft_lot15    
##  Min.   :   0.0   98103  :  602   Min.   : 399   Min.   :   651  
##  1st Qu.:   0.0   98038  :  590   1st Qu.:1490   1st Qu.:  5100  
##  Median :   0.0   98115  :  583   Median :1840   Median :  7620  
##  Mean   :  84.4   98052  :  574   Mean   :1987   Mean   : 12768  
##  3rd Qu.:   0.0   98117  :  553   3rd Qu.:2360   3rd Qu.: 10083  
##  Max.   :2015.0   98042  :  548   Max.   :6210   Max.   :871200  
##                   (Other):18163

we have a problem only with floor

lets check the percent of missing

(sum(is.na(houses$floors))/nrow(houses)
)*100
## [1] 9.619211

only 10% of missing values , so i will impute it using multiple imputation method

first i want to extract the numeric variables

num<-houses%>%select(price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_living15,sqft_lot15)%>%data.frame()

set.seed(3215)

imp<-mice(data = num)
## 
##  iter imp variable
##   1   1  floors
##   1   2  floors
##   1   3  floors
##   1   4  floors
##   1   5  floors
##   2   1  floors
##   2   2  floors
##   2   3  floors
##   2   4  floors
##   2   5  floors
##   3   1  floors
##   3   2  floors
##   3   3  floors
##   3   4  floors
##   3   5  floors
##   4   1  floors
##   4   2  floors
##   4   3  floors
##   4   4  floors
##   4   5  floors
##   5   1  floors
##   5   2  floors
##   5   3  floors
##   5   4  floors
##   5   5  floors
impnum<-complete(imp)%>%data.frame()

houses<-houses%>%mutate(floors=impnum$floors,
                        floors=as.integer(floors))

check again for missing

(sum(is.na(houses$floors))/nrow(houses)
)*100
## [1] 0

great

now lets run regular regression to test the assumption

fit<-houses%>%with(lm(price~.,data=houses))
summary(fit)
## 
## Call:
## lm(formula = price ~ ., data = houses)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1238733   -70188     -618    62402  4415643 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.143e+05  1.250e+05   7.316 2.64e-13 ***
## bedrooms      -2.733e+04  1.528e+03 -17.887  < 2e-16 ***
## bathrooms      2.559e+04  2.647e+03   9.669  < 2e-16 ***
## sqft_living    1.272e+02  3.547e+00  35.857  < 2e-16 ***
## sqft_lot       2.433e-01  3.833e-02   6.347 2.23e-10 ***
## floors        -4.887e+04  3.092e+03 -15.807  < 2e-16 ***
## waterfront1    5.634e+05  1.603e+04  35.149  < 2e-16 ***
## view3          1.529e+05  7.617e+03  20.070  < 2e-16 ***
## view4          3.373e+05  1.167e+04  28.913  < 2e-16 ***
## view2          6.829e+04  5.559e+03  12.284  < 2e-16 ***
## view1          7.658e+04  9.099e+03   8.416  < 2e-16 ***
## condition.L    6.642e+04  1.923e+04   3.453 0.000555 ***
## condition.Q    3.217e+02  1.626e+04   0.020 0.984212    
## condition.C    3.137e+04  1.223e+04   2.564 0.010345 *  
## condition^4   -1.478e+04  7.047e+03  -2.098 0.035950 *  
## grade          5.754e+04  1.814e+03  31.716  < 2e-16 ***
## sqft_above     8.021e+01  3.596e+00  22.305  < 2e-16 ***
## sqft_basement         NA         NA      NA       NA    
## yr_built      -6.588e+02  6.588e+01 -10.000  < 2e-16 ***
## yr_renovated   1.764e+01  2.949e+00   5.980 2.27e-09 ***
## zipcode98125   1.596e+05  1.271e+04  12.563  < 2e-16 ***
## zipcode98028   7.666e+04  1.384e+04   5.540 3.06e-08 ***
## zipcode98136   2.054e+05  1.408e+04  14.594  < 2e-16 ***
## zipcode98074   1.252e+05  1.280e+04   9.781  < 2e-16 ***
## zipcode98053   1.451e+05  1.309e+04  11.089  < 2e-16 ***
## zipcode98003  -5.580e+04  1.387e+04  -4.024 5.74e-05 ***
## zipcode98198  -4.202e+04  1.382e+04  -3.040 0.002370 ** 
## zipcode98146   6.460e+04  1.370e+04   4.714 2.45e-06 ***
## zipcode98038  -1.268e+04  1.216e+04  -1.043 0.297161    
## zipcode98007   2.016e+05  1.690e+04  11.933  < 2e-16 ***
## zipcode98115   2.898e+05  1.202e+04  24.102  < 2e-16 ***
## zipcode98107   3.095e+05  1.414e+04  21.890  < 2e-16 ***
## zipcode98126   1.583e+05  1.313e+04  12.063  < 2e-16 ***
## zipcode98019   4.584e+04  1.549e+04   2.960 0.003084 ** 
## zipcode98103   3.004e+05  1.209e+04  24.850  < 2e-16 ***
## zipcode98002  -1.429e+04  1.519e+04  -0.941 0.346577    
## zipcode98133   1.272e+05  1.232e+04  10.326  < 2e-16 ***
## zipcode98040   4.700e+05  1.412e+04  33.285  < 2e-16 ***
## zipcode98092  -8.049e+04  1.332e+04  -6.044 1.52e-09 ***
## zipcode98030  -3.796e+04  1.422e+04  -2.670 0.007598 ** 
## zipcode98119   4.395e+05  1.566e+04  28.072  < 2e-16 ***
## zipcode98112   5.786e+05  1.427e+04  40.555  < 2e-16 ***
## zipcode98052   1.832e+05  1.219e+04  15.034  < 2e-16 ***
## zipcode98027   1.241e+05  1.291e+04   9.611  < 2e-16 ***
## zipcode98117   2.769e+05  1.214e+04  22.802  < 2e-16 ***
## zipcode98058  -1.494e+04  1.258e+04  -1.187 0.235173    
## zipcode98001  -4.373e+04  1.311e+04  -3.335 0.000854 ***
## zipcode98056   5.351e+04  1.284e+04   4.168 3.09e-05 ***
## zipcode98166   1.566e+04  1.417e+04   1.105 0.269143    
## zipcode98023  -7.646e+04  1.237e+04  -6.184 6.37e-10 ***
## zipcode98070  -5.017e+04  1.833e+04  -2.736 0.006217 ** 
## zipcode98148   2.885e+04  2.347e+04   1.229 0.219014    
## zipcode98105   4.286e+05  1.470e+04  29.144  < 2e-16 ***
## zipcode98042  -3.875e+04  1.225e+04  -3.162 0.001569 ** 
## zipcode98008   2.079e+05  1.384e+04  15.023  < 2e-16 ***
## zipcode98059   3.939e+04  1.264e+04   3.116 0.001837 ** 
## zipcode98122   2.984e+05  1.389e+04  21.481  < 2e-16 ***
## zipcode98144   2.411e+05  1.326e+04  18.184  < 2e-16 ***
## zipcode98004   7.317e+05  1.368e+04  53.499  < 2e-16 ***
## zipcode98005   2.565e+05  1.610e+04  15.928  < 2e-16 ***
## zipcode98034   1.577e+05  1.214e+04  12.987  < 2e-16 ***
## zipcode98075   1.188e+05  1.346e+04   8.822  < 2e-16 ***
## zipcode98116   2.445e+05  1.340e+04  18.246  < 2e-16 ***
## zipcode98010   2.339e+04  1.905e+04   1.228 0.219631    
## zipcode98118   1.304e+05  1.223e+04  10.663  < 2e-16 ***
## zipcode98199   3.490e+05  1.353e+04  25.796  < 2e-16 ***
## zipcode98032  -3.643e+04  1.749e+04  -2.083 0.037261 *  
## zipcode98045   4.988e+04  1.480e+04   3.369 0.000755 ***
## zipcode98102   4.715e+05  1.877e+04  25.123  < 2e-16 ***
## zipcode98077   7.336e+04  1.551e+04   4.730 2.26e-06 ***
## zipcode98108   8.659e+04  1.541e+04   5.619 1.94e-08 ***
## zipcode98168   3.608e+04  1.395e+04   2.587 0.009693 ** 
## zipcode98177   1.702e+05  1.421e+04  11.980  < 2e-16 ***
## zipcode98065   3.907e+04  1.375e+04   2.842 0.004484 ** 
## zipcode98029   1.674e+05  1.360e+04  12.304  < 2e-16 ***
## zipcode98006   2.219e+05  1.259e+04  17.635  < 2e-16 ***
## zipcode98109   4.584e+05  1.847e+04  24.824  < 2e-16 ***
## zipcode98022  -5.148e+04  1.470e+04  -3.502 0.000463 ***
## zipcode98033   3.208e+05  1.271e+04  25.233  < 2e-16 ***
## zipcode98155   1.043e+05  1.251e+04   8.335  < 2e-16 ***
## zipcode98024   1.187e+05  2.071e+04   5.730 1.02e-08 ***
## zipcode98011   7.712e+04  1.529e+04   5.043 4.62e-07 ***
## zipcode98031  -2.838e+04  1.399e+04  -2.029 0.042490 *  
## zipcode98106   1.081e+05  1.327e+04   8.149 3.86e-16 ***
## zipcode98072   1.073e+05  1.407e+04   7.631 2.43e-14 ***
## zipcode98188  -4.004e+03  1.696e+04  -0.236 0.813388    
## zipcode98014   5.725e+04  1.785e+04   3.206 0.001346 ** 
## zipcode98055   4.549e+03  1.398e+04   0.325 0.744864    
## zipcode98039   1.271e+06  2.507e+04  50.694  < 2e-16 ***
## sqft_living15  1.125e+01  2.884e+00   3.901 9.62e-05 ***
## sqft_lot15    -1.420e-01  6.024e-02  -2.357 0.018455 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 160200 on 21523 degrees of freedom
## Multiple R-squared:  0.8103, Adjusted R-squared:  0.8095 
## F-statistic:  1033 on 89 and 21523 DF,  p-value: < 2.2e-16

here from round 1 we can see that most of predictors are significant

also we see that the ares of basement has nothing to do for prediction , so i will remove it and run the analysis again

houses<-houses%>%select(-sqft_basement)

fit<-houses%>%with(lm(price~.,data=houses))
summary(fit)
## 
## Call:
## lm(formula = price ~ ., data = houses)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1238733   -70188     -618    62402  4415643 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.143e+05  1.250e+05   7.316 2.64e-13 ***
## bedrooms      -2.733e+04  1.528e+03 -17.887  < 2e-16 ***
## bathrooms      2.559e+04  2.647e+03   9.669  < 2e-16 ***
## sqft_living    1.272e+02  3.547e+00  35.857  < 2e-16 ***
## sqft_lot       2.433e-01  3.833e-02   6.347 2.23e-10 ***
## floors        -4.887e+04  3.092e+03 -15.807  < 2e-16 ***
## waterfront1    5.634e+05  1.603e+04  35.149  < 2e-16 ***
## view3          1.529e+05  7.617e+03  20.070  < 2e-16 ***
## view4          3.373e+05  1.167e+04  28.913  < 2e-16 ***
## view2          6.829e+04  5.559e+03  12.284  < 2e-16 ***
## view1          7.658e+04  9.099e+03   8.416  < 2e-16 ***
## condition.L    6.642e+04  1.923e+04   3.453 0.000555 ***
## condition.Q    3.217e+02  1.626e+04   0.020 0.984212    
## condition.C    3.137e+04  1.223e+04   2.564 0.010345 *  
## condition^4   -1.478e+04  7.047e+03  -2.098 0.035950 *  
## grade          5.754e+04  1.814e+03  31.716  < 2e-16 ***
## sqft_above     8.021e+01  3.596e+00  22.305  < 2e-16 ***
## yr_built      -6.588e+02  6.588e+01 -10.000  < 2e-16 ***
## yr_renovated   1.764e+01  2.949e+00   5.980 2.27e-09 ***
## zipcode98125   1.596e+05  1.271e+04  12.563  < 2e-16 ***
## zipcode98028   7.666e+04  1.384e+04   5.540 3.06e-08 ***
## zipcode98136   2.054e+05  1.408e+04  14.594  < 2e-16 ***
## zipcode98074   1.252e+05  1.280e+04   9.781  < 2e-16 ***
## zipcode98053   1.451e+05  1.309e+04  11.089  < 2e-16 ***
## zipcode98003  -5.580e+04  1.387e+04  -4.024 5.74e-05 ***
## zipcode98198  -4.202e+04  1.382e+04  -3.040 0.002370 ** 
## zipcode98146   6.460e+04  1.370e+04   4.714 2.45e-06 ***
## zipcode98038  -1.268e+04  1.216e+04  -1.043 0.297161    
## zipcode98007   2.016e+05  1.690e+04  11.933  < 2e-16 ***
## zipcode98115   2.898e+05  1.202e+04  24.102  < 2e-16 ***
## zipcode98107   3.095e+05  1.414e+04  21.890  < 2e-16 ***
## zipcode98126   1.583e+05  1.313e+04  12.063  < 2e-16 ***
## zipcode98019   4.584e+04  1.549e+04   2.960 0.003084 ** 
## zipcode98103   3.004e+05  1.209e+04  24.850  < 2e-16 ***
## zipcode98002  -1.429e+04  1.519e+04  -0.941 0.346577    
## zipcode98133   1.272e+05  1.232e+04  10.326  < 2e-16 ***
## zipcode98040   4.700e+05  1.412e+04  33.285  < 2e-16 ***
## zipcode98092  -8.049e+04  1.332e+04  -6.044 1.52e-09 ***
## zipcode98030  -3.796e+04  1.422e+04  -2.670 0.007598 ** 
## zipcode98119   4.395e+05  1.566e+04  28.072  < 2e-16 ***
## zipcode98112   5.786e+05  1.427e+04  40.555  < 2e-16 ***
## zipcode98052   1.832e+05  1.219e+04  15.034  < 2e-16 ***
## zipcode98027   1.241e+05  1.291e+04   9.611  < 2e-16 ***
## zipcode98117   2.769e+05  1.214e+04  22.802  < 2e-16 ***
## zipcode98058  -1.494e+04  1.258e+04  -1.187 0.235173    
## zipcode98001  -4.373e+04  1.311e+04  -3.335 0.000854 ***
## zipcode98056   5.351e+04  1.284e+04   4.168 3.09e-05 ***
## zipcode98166   1.566e+04  1.417e+04   1.105 0.269143    
## zipcode98023  -7.646e+04  1.237e+04  -6.184 6.37e-10 ***
## zipcode98070  -5.017e+04  1.833e+04  -2.736 0.006217 ** 
## zipcode98148   2.885e+04  2.347e+04   1.229 0.219014    
## zipcode98105   4.286e+05  1.470e+04  29.144  < 2e-16 ***
## zipcode98042  -3.875e+04  1.225e+04  -3.162 0.001569 ** 
## zipcode98008   2.079e+05  1.384e+04  15.023  < 2e-16 ***
## zipcode98059   3.939e+04  1.264e+04   3.116 0.001837 ** 
## zipcode98122   2.984e+05  1.389e+04  21.481  < 2e-16 ***
## zipcode98144   2.411e+05  1.326e+04  18.184  < 2e-16 ***
## zipcode98004   7.317e+05  1.368e+04  53.499  < 2e-16 ***
## zipcode98005   2.565e+05  1.610e+04  15.928  < 2e-16 ***
## zipcode98034   1.577e+05  1.214e+04  12.987  < 2e-16 ***
## zipcode98075   1.188e+05  1.346e+04   8.822  < 2e-16 ***
## zipcode98116   2.445e+05  1.340e+04  18.246  < 2e-16 ***
## zipcode98010   2.339e+04  1.905e+04   1.228 0.219631    
## zipcode98118   1.304e+05  1.223e+04  10.663  < 2e-16 ***
## zipcode98199   3.490e+05  1.353e+04  25.796  < 2e-16 ***
## zipcode98032  -3.643e+04  1.749e+04  -2.083 0.037261 *  
## zipcode98045   4.988e+04  1.480e+04   3.369 0.000755 ***
## zipcode98102   4.715e+05  1.877e+04  25.123  < 2e-16 ***
## zipcode98077   7.336e+04  1.551e+04   4.730 2.26e-06 ***
## zipcode98108   8.659e+04  1.541e+04   5.619 1.94e-08 ***
## zipcode98168   3.608e+04  1.395e+04   2.587 0.009693 ** 
## zipcode98177   1.702e+05  1.421e+04  11.980  < 2e-16 ***
## zipcode98065   3.907e+04  1.375e+04   2.842 0.004484 ** 
## zipcode98029   1.674e+05  1.360e+04  12.304  < 2e-16 ***
## zipcode98006   2.219e+05  1.259e+04  17.635  < 2e-16 ***
## zipcode98109   4.584e+05  1.847e+04  24.824  < 2e-16 ***
## zipcode98022  -5.148e+04  1.470e+04  -3.502 0.000463 ***
## zipcode98033   3.208e+05  1.271e+04  25.233  < 2e-16 ***
## zipcode98155   1.043e+05  1.251e+04   8.335  < 2e-16 ***
## zipcode98024   1.187e+05  2.071e+04   5.730 1.02e-08 ***
## zipcode98011   7.712e+04  1.529e+04   5.043 4.62e-07 ***
## zipcode98031  -2.838e+04  1.399e+04  -2.029 0.042490 *  
## zipcode98106   1.081e+05  1.327e+04   8.149 3.86e-16 ***
## zipcode98072   1.073e+05  1.407e+04   7.631 2.43e-14 ***
## zipcode98188  -4.004e+03  1.696e+04  -0.236 0.813388    
## zipcode98014   5.725e+04  1.785e+04   3.206 0.001346 ** 
## zipcode98055   4.549e+03  1.398e+04   0.325 0.744864    
## zipcode98039   1.271e+06  2.507e+04  50.694  < 2e-16 ***
## sqft_living15  1.125e+01  2.884e+00   3.901 9.62e-05 ***
## sqft_lot15    -1.420e-01  6.024e-02  -2.357 0.018455 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 160200 on 21523 degrees of freedom
## Multiple R-squared:  0.8103, Adjusted R-squared:  0.8095 
## F-statistic:  1033 on 89 and 21523 DF,  p-value: < 2.2e-16

another thing to take care with is that zibcode has a lot of levels and we need to round it

according to residual i will try to round it to small numbers of areas using residuals

grouping<-houses%>%mutate(resid=resid(fit))%>%
        group_by(zipcode)%>%summarize(medresid=median(resid),
                                      cnt=n())%>%arrange(medresid)%>%
        mutate(cumcnt=cumsum(cnt),
               group=ntile(cumcnt,5))

houses<-houses%>%left_join(grouping[,c("zipcode","group")])%>%mutate(group=factor(group))

now remove the zip from the data and run fit again

houses<-houses%>%select(-zipcode)

fit<-houses%>%with(lm(price~.,data=houses))
summary(fit)
## 
## Call:
## lm(formula = price ~ ., data = houses)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1442272   -96219    -2491    82020  4346063 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.666e+06  1.240e+05  29.576  < 2e-16 ***
## bedrooms      -3.549e+04  1.766e+03 -20.102  < 2e-16 ***
## bathrooms      3.544e+04  3.077e+03  11.518  < 2e-16 ***
## sqft_living    1.445e+02  4.084e+00  35.384  < 2e-16 ***
## sqft_lot       1.418e-01  4.464e-02   3.175 0.001499 ** 
## floors        -2.230e+04  3.343e+03  -6.672 2.58e-11 ***
## waterfront1    5.423e+05  1.840e+04  29.474  < 2e-16 ***
## view3          1.248e+05  8.777e+03  14.213  < 2e-16 ***
## view4          3.287e+05  1.347e+04  24.399  < 2e-16 ***
## view2          5.990e+04  6.399e+03   9.361  < 2e-16 ***
## view1          1.073e+05  1.053e+04  10.192  < 2e-16 ***
## condition.L    4.642e+04  2.247e+04   2.066 0.038845 *  
## condition.Q   -2.564e+03  1.901e+04  -0.135 0.892712    
## condition.C    2.743e+04  1.430e+04   1.918 0.055123 .  
## condition^4   -8.245e+03  8.226e+03  -1.002 0.316153    
## grade          7.973e+04  2.021e+03  39.455  < 2e-16 ***
## sqft_above     5.174e+01  3.999e+00  12.939  < 2e-16 ***
## yr_built      -1.951e+03  6.585e+01 -29.625  < 2e-16 ***
## yr_renovated   1.237e+01  3.416e+00   3.620 0.000295 ***
## sqft_living15  3.527e+00  3.140e+00   1.123 0.261383    
## sqft_lot15    -2.867e-01  6.825e-02  -4.200 2.68e-05 ***
## group2        -1.898e+05  4.245e+03 -44.714  < 2e-16 ***
## group3        -2.847e+05  4.573e+03 -62.261  < 2e-16 ***
## group4        -2.700e+05  4.470e+03 -60.399  < 2e-16 ***
## group5        -3.798e+05  4.807e+03 -79.017  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 187800 on 21588 degrees of freedom
## Multiple R-squared:  0.7387, Adjusted R-squared:  0.7384 
## F-statistic:  2542 on 24 and 21588 DF,  p-value: < 2.2e-16

most coefficients here are beautiful

now what about outlier!!

#using mahalanobis distance for multivariate outliers 

num2<-houses%>%select(price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_living15,sqft_lot15)

mah<-mahalanobis(x = num2,center = colMeans(num2),cov = cov(num2))

cutmahup<-qchisq(.99,ncol(num2))
cutmahdown<-qchisq(.01,ncol(num2))

outidxup<-mah>cutmahup%>%as.numeric()
outidxdown<-mah<cutmahdown%>%as.numeric()

table(outidxup)
## outidxup
## FALSE  TRUE 
## 20489  1124
table(outidxdown)
## outidxdown
## FALSE  TRUE 
## 20434  1179

next testing for influential points

inf<-hatvalues(fit)
cutinf<-(2*(ncol(houses)-1)+1)/nrow(houses)

table(inf>cutinf)
## 
## FALSE  TRUE 
## 17832  3781

horrible !!

infidx<-inf>cutinf%>%as.numeric()

testing cooks distance

cook<-cooks.distance(fit)

cutcook<-4/(nrow(houses)-(ncol(houses)-1)-1)

table(cook>cutcook)
## 
## FALSE  TRUE 
## 20450  1163
cookidx<-cook>cutcook%>%as.numeric()

testing for variance inflation

vif<-vif(fit)
vif
##                   GVIF Df GVIF^(1/(2*Df))
## bedrooms      1.652718  1        1.285581
## bathrooms     3.442414  1        1.855374
## sqft_living   8.620412  1        2.936054
## sqft_lot      2.095620  1        1.447626
## floors        2.123134  1        1.457098
## waterfront    1.552769  1        1.246101
## view          1.852788  4        1.080135
## condition     1.325168  4        1.035819
## grade         3.457940  1        1.859554
## sqft_above    6.719385  1        2.592178
## yr_built      2.292582  1        1.514128
## yr_renovated  1.153622  1        1.074068
## sqft_living15 2.839157  1        1.684980
## sqft_lot15    2.128469  1        1.458927
## group         1.313838  4        1.034708

good news , we don’t have here any vif value over 10 and this indicate absence of multicolinearity

testing auto correlation

auto<-durbinWatsonTest(fit,simulate = T,reps = 1000,method = "resample")
auto
##  lag Autocorrelation D-W Statistic p-value
##    1      0.01596825      1.968049   0.024
##  Alternative hypothesis: rho != 0

great , p value>.01 indicate that there is no auto correlation

Now lets see the most bad records in our data

allbad<-outidxup+outidxdown+infidx+cookidx

table(allbad)
## allbad
##     0     1     2     3 
## 16491  3473  1173   476

i will delete all records which break 2 rules

bad<-allbad>=2

table(bad)
## bad
## FALSE  TRUE 
## 19964  1649
houses<-houses%>%anti_join(houses[bad,])

running the fit again

fit<-houses%>%with(lm(price~.,data=houses))
summary(fit)
## 
## Call:
## lm(formula = price ~ ., data = houses)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -523005  -78395   -4085   68184  929903 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.282e+06  8.637e+04  37.999  < 2e-16 ***
## bedrooms      -1.981e+04  1.334e+03 -14.845  < 2e-16 ***
## bathrooms      2.669e+04  2.234e+03  11.951  < 2e-16 ***
## sqft_living    1.027e+02  3.129e+00  32.826  < 2e-16 ***
## sqft_lot       4.835e-01  1.229e-01   3.934 8.37e-05 ***
## floors         4.338e+03  2.362e+03   1.836   0.0664 .  
## waterfront1    5.293e+05  3.527e+04  15.006  < 2e-16 ***
## view3          1.195e+05  7.224e+03  16.547  < 2e-16 ***
## view4          3.497e+05  1.561e+04  22.400  < 2e-16 ***
## view2          6.127e+04  4.665e+03  13.135  < 2e-16 ***
## view1          1.020e+05  8.631e+03  11.813  < 2e-16 ***
## condition.L    5.875e+04  3.487e+04   1.685   0.0921 .  
## condition.Q    3.234e+02  2.947e+04   0.011   0.9912    
## condition.C    1.060e+04  1.884e+04   0.563   0.5735    
## condition^4    3.950e+03  8.677e+03   0.455   0.6489    
## grade          7.506e+04  1.428e+03  52.582  < 2e-16 ***
## sqft_above     2.623e+01  2.954e+00   8.880  < 2e-16 ***
## yr_built      -1.760e+03  4.563e+01 -38.576  < 2e-16 ***
## yr_renovated   1.293e+01  2.527e+00   5.118 3.11e-07 ***
## sqft_living15  2.192e+01  2.424e+00   9.045  < 2e-16 ***
## sqft_lot15    -6.147e-01  1.506e-01  -4.082 4.48e-05 ***
## group2        -1.369e+05  2.930e+03 -46.710  < 2e-16 ***
## group3        -2.334e+05  3.141e+03 -74.298  < 2e-16 ***
## group4        -2.113e+05  3.097e+03 -68.238  < 2e-16 ***
## group5        -3.185e+05  3.334e+03 -95.529  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 122200 on 19939 degrees of freedom
## Multiple R-squared:  0.7602, Adjusted R-squared:  0.7599 
## F-statistic:  2634 on 24 and 19939 DF,  p-value: < 2.2e-16
plot(fit,3)

we see here that we have some degree of heteroscedasticity

checking the normality

qqPlot(fit)

## [1]  5718 18796

OK , the qqplot shows that the data has a degree of skewness

and to prove it lets see histogram

hist(rstudent(fit))

some right skew but the data still good

the only issue we have to handle is heteroscedasticity

now i will check partial residual to know the problematic variable

resid<-resid(fit)
term<-predict(fit,houses,type = "terms")%>%data.frame()

partial.resid<-resid+term
houses.org<-houses
names(houses.org)<-paste("org",names(houses),sep = ".")
names(partial.resid)<-paste("part",names(partial.resid),sep = ".")
names(term)<-paste("term",names(term),sep = ".")
allpartial<-cbind(houses.org[,-1],partial.resid,term)


allpartial%>%ggplot(aes(org.sqft_living,part.sqft_living))+
        geom_point()+
        geom_smooth()+
        geom_line(aes(x = org.sqft_living,y=term.sqft_living),color="red",lwd=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

allpartial%>%ggplot(aes(org.sqft_lot,part.sqft_lot))+
        geom_point()+
        geom_smooth()+
        geom_line(aes(x = org.sqft_lot,y=term.sqft_lot),color="red",lwd=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

allpartial%>%ggplot(aes(org.sqft_above,part.sqft_above))+
        geom_point()+
        geom_smooth()+
        geom_line(aes(x = org.sqft_above,y=term.sqft_above),color="red",lwd=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

allpartial%>%ggplot(aes(org.sqft_living15,part.sqft_living15))+
        geom_point()+
        geom_smooth()+
        geom_line(aes(x = org.sqft_living15,y=term.sqft_living15),color="red",lwd=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

allpartial%>%ggplot(aes(org.sqft_lot15,part.sqft_lot15))+
        geom_point()+
        geom_smooth()+
        geom_line(aes(x = org.sqft_lot15,y=term.sqft_lot15),color="red",lwd=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

allpartial%>%ggplot(aes(org.yr_built,part.yr_built))+
        geom_point()+
        geom_smooth()+
        geom_line(aes(x = org.yr_built,y=term.yr_built),color="red",lwd=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

allpartial%>%ggplot(aes(org.yr_renovated,part.yr_renovated))+
        geom_point()+
        geom_smooth()+
        geom_line(aes(x = org.yr_renovated,y=term.yr_renovated),color="red",lwd=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

here we can find that the most problems associated with square foot of lot and the same variable in 2015 and the year of reonivation

i will categorize the renovation years variable into 4 categories according to residuals

houses$yr_renovated%>%table()%>%kable("html")
. Freq
0 19281
1940 2
1944 1
1945 3
1946 2
1948 1
1950 2
1951 1
1953 3
1954 1
1955 3
1956 1
1957 3
1958 5
1959 1
1960 3
1962 1
1963 3
1964 5
1965 3
1967 2
1968 6
1969 3
1970 8
1971 2
1972 2
1973 5
1974 3
1975 6
1976 2
1977 7
1978 4
1979 7
1980 9
1981 3
1982 7
1983 14
1984 15
1985 12
1986 8
1987 12
1988 12
1989 17
1990 20
1991 13
1992 12
1993 15
1994 13
1995 13
1996 9
1997 11
1998 13
1999 9
2000 21
2001 15
2002 12
2003 25
2004 20
2005 27
2006 18
2007 26
2008 9
2009 19
2010 15
2011 12
2012 11
2013 28
2014 78
2015 9
stdresid<-rstudent(fit)

catigor<-houses%>%mutate(stdresid)%>%
        group_by(yr_renovated)%>%
        summarize(medstd=median(stdresid),cnt=n())%>%
        arrange(medstd)%>%
        mutate(cumcnt=cumsum(cnt),renovC=ntile(cumcnt,4))

catigor%>%kable("markdown")
yr_renovated medstd cnt cumcnt renovC
1967 -1.3146319 2 2 1
1976 -1.1681682 2 4 1
1963 -1.1644635 3 7 1
1969 -1.1364518 3 10 1
1964 -1.1031153 5 15 1
1970 -1.0855883 8 23 1
1996 -1.0752276 9 32 1
1953 -1.0183430 3 35 1
1945 -0.9554270 3 38 1
1960 -0.8671719 3 41 1
1959 -0.6575658 1 42 1
1950 -0.6536802 2 44 1
1978 -0.5931954 4 48 1
1982 -0.5847983 7 55 1
1973 -0.5711407 5 60 1
1971 -0.5155870 2 62 1
1981 -0.4807747 3 65 1
1957 -0.4715717 3 68 1
1995 -0.4594498 13 81 2
1983 -0.4412839 14 95 2
1977 -0.4311997 7 102 2
1984 -0.4227199 15 117 2
1997 -0.4130501 11 128 2
1968 -0.4060072 6 134 2
1974 -0.3795095 3 137 2
1946 -0.3785293 2 139 2
1940 -0.3263868 2 141 2
1994 -0.3208032 13 154 2
1993 -0.2803082 15 169 2
1988 -0.2740405 12 181 2
1948 -0.2578666 1 182 2
1991 -0.2541159 13 195 2
1992 -0.2498357 12 207 2
1989 -0.2221852 17 224 2
1958 -0.2079038 5 229 2
2001 -0.2000301 15 244 3
1987 -0.1873965 12 256 3
2000 -0.1789553 21 277 3
1990 -0.1543012 20 297 3
1980 -0.1477219 9 306 3
1972 -0.1006371 2 308 3
1965 -0.0930242 3 311 3
2013 -0.0892279 28 339 3
1955 -0.0803034 3 342 3
1986 -0.0612799 8 350 3
2012 -0.0611215 11 361 3
1985 -0.0536939 12 373 3
0 -0.0324502 19281 19654 3
1975 0.0167476 6 19660 3
2015 0.0227493 9 19669 3
2009 0.0288613 19 19688 3
2011 0.0822008 12 19700 3
1998 0.0945444 13 19713 4
2007 0.1200742 26 19739 4
1979 0.1245174 7 19746 4
2004 0.2679457 20 19766 4
2006 0.2908748 18 19784 4
2003 0.2965783 25 19809 4
1944 0.3629567 1 19810 4
1951 0.4672665 1 19811 4
2014 0.4996164 78 19889 4
2005 0.5223866 27 19916 4
2010 0.5686400 15 19931 4
2002 0.6547362 12 19943 4
1956 0.8582450 1 19944 4
1999 0.9077240 9 19953 4
2008 1.0379758 9 19962 4
1962 1.2237277 1 19963 4
1954 2.1236397 1 19964 4
catigor<-catigor%>%select(yr_renovated,renovC)


houses<-(houses)%>%left_join(catigor)%>%mutate(renovC=factor(renovC))
## Joining, by = "yr_renovated"
houses<-houses%>%select(-yr_renovated)

houses%>%head%>%kable("markdown")
price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade sqft_above yr_built sqft_living15 sqft_lot15 group renovC
221900 3 1.00 1180 5650 1 0 0 3 7 1180 1955 1340 5650 5 3
538000 3 2.25 2570 7242 2 0 0 3 7 2170 1951 1690 7639 4 2
180000 2 1.00 770 10000 1 0 0 3 6 770 1933 2720 8062 4 3
604000 4 3.00 1960 5000 1 0 0 5 7 1050 1965 1360 5000 4 3
510000 3 2.00 1680 8080 1 0 0 3 8 1680 1987 1800 7503 4 3
257500 3 2.25 1715 6819 2 0 0 3 7 1715 1995 2238 6819 5 3
table(houses$renovC)
## 
##     1     2     3     4 
##    68   161 19471   264

recalculating the fit

fit<-houses%>%with(lm(price~.,data=houses))
summary(fit)
## 
## Call:
## lm(formula = price ~ ., data = houses)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -523310  -77900   -4030   67725  930210 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.239e+06  8.498e+04  38.117  < 2e-16 ***
## bedrooms      -1.986e+04  1.332e+03 -14.911  < 2e-16 ***
## bathrooms      2.627e+04  2.226e+03  11.801  < 2e-16 ***
## sqft_living    1.028e+02  3.123e+00  32.907  < 2e-16 ***
## sqft_lot       4.764e-01  1.227e-01   3.884 0.000103 ***
## floors         5.000e+03  2.358e+03   2.120 0.034018 *  
## waterfront1    5.332e+05  3.522e+04  15.140  < 2e-16 ***
## view3          1.200e+05  7.211e+03  16.637  < 2e-16 ***
## view4          3.492e+05  1.559e+04  22.398  < 2e-16 ***
## view2          6.184e+04  4.657e+03  13.277  < 2e-16 ***
## view1          1.023e+05  8.615e+03  11.877  < 2e-16 ***
## condition.L    5.938e+04  3.481e+04   1.706 0.088056 .  
## condition.Q   -5.216e+02  2.942e+04  -0.018 0.985853    
## condition.C    1.148e+04  1.881e+04   0.611 0.541475    
## condition^4    3.053e+03  8.662e+03   0.352 0.724501    
## grade          7.469e+04  1.426e+03  52.397  < 2e-16 ***
## sqft_above     2.671e+01  2.949e+00   9.056  < 2e-16 ***
## yr_built      -1.772e+03  4.501e+01 -39.360  < 2e-16 ***
## sqft_living15  2.181e+01  2.417e+00   9.024  < 2e-16 ***
## sqft_lot15    -5.939e-01  1.503e-01  -3.950 7.83e-05 ***
## group2        -1.367e+05  2.925e+03 -46.718  < 2e-16 ***
## group3        -2.332e+05  3.136e+03 -74.364  < 2e-16 ***
## group4        -2.107e+05  3.092e+03 -68.141  < 2e-16 ***
## group5        -3.184e+05  3.328e+03 -95.665  < 2e-16 ***
## renovC2        6.885e+04  1.767e+04   3.897 9.79e-05 ***
## renovC3        6.777e+04  1.493e+04   4.541 5.64e-06 ***
## renovC4        1.366e+05  1.664e+04   8.204 2.46e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 122000 on 19937 degrees of freedom
## Multiple R-squared:  0.7611, Adjusted R-squared:  0.7608 
## F-statistic:  2443 on 26 and 19937 DF,  p-value: < 2.2e-16

Good , all categories are significant

now lets try the weighted regression and start with calculating the weights

residl<-resid(fit)%>%abs()
fited<-fitted(fit)
residfit<-lm(residl~fited)
fittresid<-fitted(residfit)
wts<-1/fittresid^2

wtsfit<-houses%>%with(lm(price~.,data=houses,weights = wts))
summary(wtsfit)
## 
## Call:
## lm(formula = price ~ ., data = houses, weights = wts)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0657 -0.8911 -0.0765  0.7666 13.6759 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.780e+06  7.615e+04  36.506  < 2e-16 ***
## bedrooms      -1.943e+04  1.158e+03 -16.779  < 2e-16 ***
## bathrooms      1.874e+04  1.974e+03   9.495  < 2e-16 ***
## sqft_living    9.526e+01  2.969e+00  32.083  < 2e-16 ***
## sqft_lot       6.726e-01  1.059e-01   6.349 2.22e-10 ***
## floors         1.150e+04  2.101e+03   5.473 4.47e-08 ***
## waterfront1    5.600e+05  7.409e+04   7.558 4.27e-14 ***
## view3          1.071e+05  8.500e+03  12.597  < 2e-16 ***
## view4          3.550e+05  2.536e+04  13.999  < 2e-16 ***
## view2          5.989e+04  4.929e+03  12.149  < 2e-16 ***
## view1          9.405e+04  9.276e+03  10.140  < 2e-16 ***
## condition.L    8.421e+04  1.866e+04   4.512 6.47e-06 ***
## condition.Q   -1.985e+04  1.577e+04  -1.259 0.208127    
## condition.C    1.014e+04  1.036e+04   0.979 0.327814    
## condition^4    8.627e+03  5.048e+03   1.709 0.087436 .  
## grade          5.794e+04  1.244e+03  46.559  < 2e-16 ***
## sqft_above     1.802e+01  2.819e+00   6.392 1.67e-10 ***
## yr_built      -1.481e+03  4.079e+01 -36.304  < 2e-16 ***
## sqft_living15  3.203e+01  2.260e+00  14.173  < 2e-16 ***
## sqft_lot15    -4.333e-01  1.249e-01  -3.471 0.000520 ***
## group2        -1.205e+05  3.246e+03 -37.123  < 2e-16 ***
## group3        -2.220e+05  3.243e+03 -68.442  < 2e-16 ***
## group4        -1.953e+05  3.262e+03 -59.878  < 2e-16 ***
## group5        -2.901e+05  3.290e+03 -88.173  < 2e-16 ***
## renovC2        5.253e+04  1.454e+04   3.613 0.000304 ***
## renovC3        6.112e+04  1.166e+04   5.244 1.59e-07 ***
## renovC4        1.237e+05  1.393e+04   8.878  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.26 on 19937 degrees of freedom
## Multiple R-squared:  0.7181, Adjusted R-squared:  0.7177 
## F-statistic:  1953 on 26 and 19937 DF,  p-value: < 2.2e-16
plot(wtsfit,3,main = "weighted regression")

plot(fit,3,main = "Ordinary regression")

we see here an improve in the linearity and heteroscedasticity

comparing the standardized coefficients of OLS model and weighted model

data.frame(cbind(lm.beta(fit)$standardized.coefficients,lm.beta(wtsfit)$standardized.coefficients))%>%kable(format = "markdown",digits = 3,col.names = c("Ordinary","Weighted"))
Ordinary Weighted
(Intercept) 0.000 0.000
bedrooms -0.068 -0.067
bathrooms 0.074 0.053
sqft_living 0.318 0.294
sqft_lot 0.025 0.035
floors 0.011 0.026
waterfront1 0.057 0.059
view3 0.059 0.053
view4 0.085 0.086
view2 0.047 0.046
view1 0.041 0.038
condition.L 0.048 0.068
condition.Q -0.001 -0.023
condition.C 0.014 0.013
condition^4 0.006 0.018
grade 0.320 0.248
sqft_above 0.078 0.053
yr_built -0.208 -0.173
sqft_living15 0.055 0.081
sqft_lot15 -0.025 -0.018
group2 -0.236 -0.208
group3 -0.377 -0.359
group4 -0.348 -0.322
group5 -0.497 -0.453
renovC2 0.025 0.019
renovC3 0.042 0.038
renovC4 0.063 0.057

The Coefficients are so close to each other

Now lets see the ability of our model to predict out sample data

houses<-cbind(houses,wts)

set.seed(4785)

trainidx<-createDataPartition(y = houses$price,p = .75,list = F)
traindata<-houses[trainidx,]
testdata<-houses[-trainidx,]

running the model

set.seed(3454)
trainfit<-traindata%>%with(lm(price~.,data = traindata[,-length(names(traindata))],weights = traindata$wts))
summary(trainfit)
## 
## Call:
## lm(formula = price ~ ., data = traindata[, -length(names(traindata))], 
##     weights = traindata$wts)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1019 -0.8939 -0.0758  0.7716 13.7050 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.807e+06  8.800e+04  31.900  < 2e-16 ***
## bedrooms      -1.950e+04  1.336e+03 -14.597  < 2e-16 ***
## bathrooms      1.838e+04  2.272e+03   8.090 6.40e-16 ***
## sqft_living    9.243e+01  3.417e+00  27.049  < 2e-16 ***
## sqft_lot       7.483e-01  1.235e-01   6.059 1.40e-09 ***
## floors         1.257e+04  2.433e+03   5.165 2.43e-07 ***
## waterfront1    5.638e+05  7.454e+04   7.564 4.13e-14 ***
## view3          1.107e+05  9.718e+03  11.394  < 2e-16 ***
## view4          3.500e+05  2.768e+04  12.643  < 2e-16 ***
## view2          6.173e+04  5.685e+03  10.859  < 2e-16 ***
## view1          9.688e+04  1.020e+04   9.502  < 2e-16 ***
## condition.L    9.235e+04  2.330e+04   3.964 7.41e-05 ***
## condition.Q   -2.660e+04  1.968e+04  -1.352  0.17643    
## condition.C    1.896e+04  1.275e+04   1.487  0.13693    
## condition^4    4.314e+03  6.060e+03   0.712  0.47647    
## grade          5.897e+04  1.439e+03  40.973  < 2e-16 ***
## sqft_above     2.151e+01  3.234e+00   6.651 3.00e-11 ***
## yr_built      -1.498e+03  4.712e+01 -31.799  < 2e-16 ***
## sqft_living15  3.170e+01  2.624e+00  12.084  < 2e-16 ***
## sqft_lot15    -5.765e-01  1.480e-01  -3.896 9.84e-05 ***
## group2        -1.225e+05  3.767e+03 -32.536  < 2e-16 ***
## group3        -2.252e+05  3.761e+03 -59.880  < 2e-16 ***
## group4        -1.974e+05  3.780e+03 -52.208  < 2e-16 ***
## group5        -2.945e+05  3.816e+03 -77.169  < 2e-16 ***
## renovC2        5.127e+04  1.678e+04   3.055  0.00226 ** 
## renovC3        6.332e+04  1.342e+04   4.718 2.41e-06 ***
## renovC4        1.256e+05  1.635e+04   7.684 1.64e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.259 on 14947 degrees of freedom
## Multiple R-squared:  0.7204, Adjusted R-squared:  0.7199 
## F-statistic:  1481 on 26 and 14947 DF,  p-value: < 2.2e-16
predicted<-predict(trainfit,testdata)

cbind(RMSE=RMSE(pred = predicted,obs = testdata$price),R2=R2(pred = predicted,obs = testdata$price))%>%kable("markdown")
RMSE R2
124517.3 0.7519751

REGARDS