Predicting houses prices

the aim of this analysis to use weighted regression to predict the houses prices

plz feel free to contact me for any advice or recommendations: vet.m.mohamed@gmail.com

find the data here

firstly lets load the libraries we are going to use

library(tidyverse)
library(caret)
library(mice)
library(car)
library(lm.beta)
library(knitr)

Next lets import the data

houses<-read_csv(file = "./data sets/kc_house_data.csv",col_types = "cTninnnifffinnddf??nn")

head(houses)%>%kable("markdown")

id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
7129300520	2014-10-13	221900	3	1.00	1180	5650	1	3	7	1180	0	1955	0	98178	47.5112	-122.257	1340	5650
6414100192	2014-12-09	538000	3	2.25	2570	7242	2	3	7	2170	400	1951	1991	98125	47.7210	-122.319	1690	7639
5631500400	2015-02-25	180000	2	1.00	770	10000	1	3	6	770	0	1933	0	98028	47.7379	-122.233	2720	8062
2487200875	2014-12-09	604000	4	3.00	1960	5000	1	5	7	1050	910	1965	0	98136	47.5208	-122.393	1360	5000
1954400510	2015-02-18	510000	3	2.00	1680	8080	1	3	8	1680	0	1987	0	98074	47.6168	-122.045	1800	7503
7237550310	2014-05-12	1225000	4	4.50	5420	101930	1	3	11	3890	1530	2001	0	98053	47.6561	-122.005	4760	101930

checking the structure of the data

str(houses)

## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 21613 obs. of  21 variables:
##  $ id           : chr  "7129300520" "6414100192" "5631500400" "2487200875" ...
##  $ date         : POSIXct, format: "2014-10-13" "2014-12-09" ...
##  $ price        : num  221900 538000 180000 604000 510000 ...
##  $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : num  1180 2570 770 1960 1680 ...
##  $ sqft_lot     : num  5650 7242 10000 5000 8080 ...
##  $ floors       : int  1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ view         : Factor w/ 5 levels "0","3","4","2",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ condition    : Factor w/ 5 levels "3","5","4","1",..: 1 1 1 2 1 1 1 1 1 1 ...
##  $ grade        : int  7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : num  1180 2170 770 1050 1680 ...
##  $ sqft_basement: num  0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : num  1955 1951 1933 1965 1987 ...
##  $ yr_renovated : num  0 1991 0 0 0 ...
##  $ zipcode      : Factor w/ 70 levels "98178","98125",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ lat          : num  47.5 47.7 47.7 47.5 47.6 ...
##  $ long         : num  -122 -122 -122 -122 -122 ...
##  $ sqft_living15: num  1340 1690 2720 1360 1800 ...
##  $ sqft_lot15   : num  5650 7639 8062 5000 7503 ...
##  - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 2079 obs. of  5 variables:
##   ..$ row     : int  13 15 18 26 28 33 34 36 55 66 ...
##   ..$ col     : chr  "floors" "floors" "floors" "floors" ...
##   ..$ expected: chr  "no trailing characters" "no trailing characters" "no trailing characters" "no trailing characters" ...
##   ..$ actual  : chr  ".5" ".5" ".5" ".5" ...
##   ..$ file    : chr  "'./data sets/kc_house_data.csv'" "'./data sets/kc_house_data.csv'" "'./data sets/kc_house_data.csv'" "'./data sets/kc_house_data.csv'" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_character(),
##   ..   date = col_datetime(format = ""),
##   ..   price = col_number(),
##   ..   bedrooms = col_integer(),
##   ..   bathrooms = col_number(),
##   ..   sqft_living = col_number(),
##   ..   sqft_lot = col_number(),
##   ..   floors = col_integer(),
##   ..   waterfront = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   view = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   condition = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   grade = col_integer(),
##   ..   sqft_above = col_number(),
##   ..   sqft_basement = col_number(),
##   ..   yr_built = col_double(),
##   ..   yr_renovated = col_double(),
##   ..   zipcode = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   lat = col_double(),
##   ..   long = col_double(),
##   ..   sqft_living15 = col_number(),
##   ..   sqft_lot15 = col_number()
##   .. )

removing id lat long date

houses<-houses%>%select(-c(id,date,lat,long))

need to check the condition if it need to be ordered

unique(houses$condition)

## [1] 3 5 4 1 2
## Levels: 3 5 4 1 2

i think that is preferable if it is ordered

houses<-houses%>%mutate(condition=factor(condition,levels = c("1","2","3","4","5"),labels = c("1","2","3","4","5"),ordered = T))

checking the structure again

str(houses)

## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 21613 obs. of  17 variables:
##  $ price        : num  221900 538000 180000 604000 510000 ...
##  $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : num  1180 2570 770 1960 1680 ...
##  $ sqft_lot     : num  5650 7242 10000 5000 8080 ...
##  $ floors       : int  1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ view         : Factor w/ 5 levels "0","3","4","2",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ condition    : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : int  7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : num  1180 2170 770 1050 1680 ...
##  $ sqft_basement: num  0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : num  1955 1951 1933 1965 1987 ...
##  $ yr_renovated : num  0 1991 0 0 0 ...
##  $ zipcode      : Factor w/ 70 levels "98178","98125",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ sqft_living15: num  1340 1690 2720 1360 1800 ...
##  $ sqft_lot15   : num  5650 7639 8062 5000 7503 ...

now we want to test some assumption starting with is the missing values , what about it ?

summary(houses)

##      price            bedrooms        bathrooms      sqft_living   
##  Min.   :  75000   Min.   : 0.000   Min.   :0.000   Min.   :  290  
##  1st Qu.: 321950   1st Qu.: 3.000   1st Qu.:1.750   1st Qu.: 1427  
##  Median : 450000   Median : 3.000   Median :2.250   Median : 1910  
##  Mean   : 540088   Mean   : 3.371   Mean   :2.115   Mean   : 2080  
##  3rd Qu.: 645000   3rd Qu.: 4.000   3rd Qu.:2.500   3rd Qu.: 2550  
##  Max.   :7700000   Max.   :33.000   Max.   :8.000   Max.   :13540  
##                                                                    
##     sqft_lot           floors      waterfront view      condition
##  Min.   :    520   Min.   :1.000   0:21450    0:19489   1:   30  
##  1st Qu.:   5040   1st Qu.:1.000   1:  163    3:  510   2:  172  
##  Median :   7618   Median :1.000              4:  319   3:14031  
##  Mean   :  15107   Mean   :1.485              2:  963   4: 5679  
##  3rd Qu.:  10688   3rd Qu.:2.000              1:  332   5: 1701  
##  Max.   :1651359   Max.   :3.000                                 
##                    NA's   :2079                                  
##      grade          sqft_above   sqft_basement       yr_built   
##  Min.   : 1.000   Min.   : 290   Min.   :   0.0   Min.   :1900  
##  1st Qu.: 7.000   1st Qu.:1190   1st Qu.:   0.0   1st Qu.:1951  
##  Median : 7.000   Median :1560   Median :   0.0   Median :1975  
##  Mean   : 7.657   Mean   :1788   Mean   : 291.5   Mean   :1971  
##  3rd Qu.: 8.000   3rd Qu.:2210   3rd Qu.: 560.0   3rd Qu.:1997  
##  Max.   :13.000   Max.   :9410   Max.   :4820.0   Max.   :2015  
##                                                                 
##   yr_renovated       zipcode      sqft_living15    sqft_lot15    
##  Min.   :   0.0   98103  :  602   Min.   : 399   Min.   :   651  
##  1st Qu.:   0.0   98038  :  590   1st Qu.:1490   1st Qu.:  5100  
##  Median :   0.0   98115  :  583   Median :1840   Median :  7620  
##  Mean   :  84.4   98052  :  574   Mean   :1987   Mean   : 12768  
##  3rd Qu.:   0.0   98117  :  553   3rd Qu.:2360   3rd Qu.: 10083  
##  Max.   :2015.0   98042  :  548   Max.   :6210   Max.   :871200  
##                   (Other):18163

we have a problem only with floor

lets check the percent of missing

(sum(is.na(houses$floors))/nrow(houses)
)*100

## [1] 9.619211

only 10% of missing values , so i will impute it using multiple imputation method

first i want to extract the numeric variables

num<-houses%>%select(price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_living15,sqft_lot15)%>%data.frame()

set.seed(3215)

imp<-mice(data = num)

## 
##  iter imp variable
##   1   1  floors
##   1   2  floors
##   1   3  floors
##   1   4  floors
##   1   5  floors
##   2   1  floors
##   2   2  floors
##   2   3  floors
##   2   4  floors
##   2   5  floors
##   3   1  floors
##   3   2  floors
##   3   3  floors
##   3   4  floors
##   3   5  floors
##   4   1  floors
##   4   2  floors
##   4   3  floors
##   4   4  floors
##   4   5  floors
##   5   1  floors
##   5   2  floors
##   5   3  floors
##   5   4  floors
##   5   5  floors

impnum<-complete(imp)%>%data.frame()

houses<-houses%>%mutate(floors=impnum$floors,
                        floors=as.integer(floors))

check again for missing

(sum(is.na(houses$floors))/nrow(houses)
)*100

## [1] 0

great

now lets run regular regression to test the assumption

fit<-houses%>%with(lm(price~.,data=houses))
summary(fit)

## 
## Call:
## lm(formula = price ~ ., data = houses)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1238733   -70188     -618    62402  4415643 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.143e+05  1.250e+05   7.316 2.64e-13 ***
## bedrooms      -2.733e+04  1.528e+03 -17.887  < 2e-16 ***
## bathrooms      2.559e+04  2.647e+03   9.669  < 2e-16 ***
## sqft_living    1.272e+02  3.547e+00  35.857  < 2e-16 ***
## sqft_lot       2.433e-01  3.833e-02   6.347 2.23e-10 ***
## floors        -4.887e+04  3.092e+03 -15.807  < 2e-16 ***
## waterfront1    5.634e+05  1.603e+04  35.149  < 2e-16 ***
## view3          1.529e+05  7.617e+03  20.070  < 2e-16 ***
## view4          3.373e+05  1.167e+04  28.913  < 2e-16 ***
## view2          6.829e+04  5.559e+03  12.284  < 2e-16 ***
## view1          7.658e+04  9.099e+03   8.416  < 2e-16 ***
## condition.L    6.642e+04  1.923e+04   3.453 0.000555 ***
## condition.Q    3.217e+02  1.626e+04   0.020 0.984212    
## condition.C    3.137e+04  1.223e+04   2.564 0.010345 *  
## condition^4   -1.478e+04  7.047e+03  -2.098 0.035950 *  
## grade          5.754e+04  1.814e+03  31.716  < 2e-16 ***
## sqft_above     8.021e+01  3.596e+00  22.305  < 2e-16 ***
## sqft_basement         NA         NA      NA       NA    
## yr_built      -6.588e+02  6.588e+01 -10.000  < 2e-16 ***
## yr_renovated   1.764e+01  2.949e+00   5.980 2.27e-09 ***
## zipcode98125   1.596e+05  1.271e+04  12.563  < 2e-16 ***
## zipcode98028   7.666e+04  1.384e+04   5.540 3.06e-08 ***
## zipcode98136   2.054e+05  1.408e+04  14.594  < 2e-16 ***
## zipcode98074   1.252e+05  1.280e+04   9.781  < 2e-16 ***
## zipcode98053   1.451e+05  1.309e+04  11.089  < 2e-16 ***
## zipcode98003  -5.580e+04  1.387e+04  -4.024 5.74e-05 ***
## zipcode98198  -4.202e+04  1.382e+04  -3.040 0.002370 ** 
## zipcode98146   6.460e+04  1.370e+04   4.714 2.45e-06 ***
## zipcode98038  -1.268e+04  1.216e+04  -1.043 0.297161    
## zipcode98007   2.016e+05  1.690e+04  11.933  < 2e-16 ***
## zipcode98115   2.898e+05  1.202e+04  24.102  < 2e-16 ***
## zipcode98107   3.095e+05  1.414e+04  21.890  < 2e-16 ***
## zipcode98126   1.583e+05  1.313e+04  12.063  < 2e-16 ***
## zipcode98019   4.584e+04  1.549e+04   2.960 0.003084 ** 
## zipcode98103   3.004e+05  1.209e+04  24.850  < 2e-16 ***
## zipcode98002  -1.429e+04  1.519e+04  -0.941 0.346577    
## zipcode98133   1.272e+05  1.232e+04  10.326  < 2e-16 ***
## zipcode98040   4.700e+05  1.412e+04  33.285  < 2e-16 ***
## zipcode98092  -8.049e+04  1.332e+04  -6.044 1.52e-09 ***
## zipcode98030  -3.796e+04  1.422e+04  -2.670 0.007598 ** 
## zipcode98119   4.395e+05  1.566e+04  28.072  < 2e-16 ***
## zipcode98112   5.786e+05  1.427e+04  40.555  < 2e-16 ***
## zipcode98052   1.832e+05  1.219e+04  15.034  < 2e-16 ***
## zipcode98027   1.241e+05  1.291e+04   9.611  < 2e-16 ***
## zipcode98117   2.769e+05  1.214e+04  22.802  < 2e-16 ***
## zipcode98058  -1.494e+04  1.258e+04  -1.187 0.235173    
## zipcode98001  -4.373e+04  1.311e+04  -3.335 0.000854 ***
## zipcode98056   5.351e+04  1.284e+04   4.168 3.09e-05 ***
## zipcode98166   1.566e+04  1.417e+04   1.105 0.269143    
## zipcode98023  -7.646e+04  1.237e+04  -6.184 6.37e-10 ***
## zipcode98070  -5.017e+04  1.833e+04  -2.736 0.006217 ** 
## zipcode98148   2.885e+04  2.347e+04   1.229 0.219014    
## zipcode98105   4.286e+05  1.470e+04  29.144  < 2e-16 ***
## zipcode98042  -3.875e+04  1.225e+04  -3.162 0.001569 ** 
## zipcode98008   2.079e+05  1.384e+04  15.023  < 2e-16 ***
## zipcode98059   3.939e+04  1.264e+04   3.116 0.001837 ** 
## zipcode98122   2.984e+05  1.389e+04  21.481  < 2e-16 ***
## zipcode98144   2.411e+05  1.326e+04  18.184  < 2e-16 ***
## zipcode98004   7.317e+05  1.368e+04  53.499  < 2e-16 ***
## zipcode98005   2.565e+05  1.610e+04  15.928  < 2e-16 ***
## zipcode98034   1.577e+05  1.214e+04  12.987  < 2e-16 ***
## zipcode98075   1.188e+05  1.346e+04   8.822  < 2e-16 ***
## zipcode98116   2.445e+05  1.340e+04  18.246  < 2e-16 ***
## zipcode98010   2.339e+04  1.905e+04   1.228 0.219631    
## zipcode98118   1.304e+05  1.223e+04  10.663  < 2e-16 ***
## zipcode98199   3.490e+05  1.353e+04  25.796  < 2e-16 ***
## zipcode98032  -3.643e+04  1.749e+04  -2.083 0.037261 *  
## zipcode98045   4.988e+04  1.480e+04   3.369 0.000755 ***
## zipcode98102   4.715e+05  1.877e+04  25.123  < 2e-16 ***
## zipcode98077   7.336e+04  1.551e+04   4.730 2.26e-06 ***
## zipcode98108   8.659e+04  1.541e+04   5.619 1.94e-08 ***
## zipcode98168   3.608e+04  1.395e+04   2.587 0.009693 ** 
## zipcode98177   1.702e+05  1.421e+04  11.980  < 2e-16 ***
## zipcode98065   3.907e+04  1.375e+04   2.842 0.004484 ** 
## zipcode98029   1.674e+05  1.360e+04  12.304  < 2e-16 ***
## zipcode98006   2.219e+05  1.259e+04  17.635  < 2e-16 ***
## zipcode98109   4.584e+05  1.847e+04  24.824  < 2e-16 ***
## zipcode98022  -5.148e+04  1.470e+04  -3.502 0.000463 ***
## zipcode98033   3.208e+05  1.271e+04  25.233  < 2e-16 ***
## zipcode98155   1.043e+05  1.251e+04   8.335  < 2e-16 ***
## zipcode98024   1.187e+05  2.071e+04   5.730 1.02e-08 ***
## zipcode98011   7.712e+04  1.529e+04   5.043 4.62e-07 ***
## zipcode98031  -2.838e+04  1.399e+04  -2.029 0.042490 *  
## zipcode98106   1.081e+05  1.327e+04   8.149 3.86e-16 ***
## zipcode98072   1.073e+05  1.407e+04   7.631 2.43e-14 ***
## zipcode98188  -4.004e+03  1.696e+04  -0.236 0.813388    
## zipcode98014   5.725e+04  1.785e+04   3.206 0.001346 ** 
## zipcode98055   4.549e+03  1.398e+04   0.325 0.744864    
## zipcode98039   1.271e+06  2.507e+04  50.694  < 2e-16 ***
## sqft_living15  1.125e+01  2.884e+00   3.901 9.62e-05 ***
## sqft_lot15    -1.420e-01  6.024e-02  -2.357 0.018455 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 160200 on 21523 degrees of freedom
## Multiple R-squared:  0.8103, Adjusted R-squared:  0.8095 
## F-statistic:  1033 on 89 and 21523 DF,  p-value: < 2.2e-16

here from round 1 we can see that most of predictors are significant

also we see that the ares of basement has nothing to do for prediction , so i will remove it and run the analysis again

houses<-houses%>%select(-sqft_basement)

fit<-houses%>%with(lm(price~.,data=houses))
summary(fit)

## 
## Call:
## lm(formula = price ~ ., data = houses)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1238733   -70188     -618    62402  4415643 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.143e+05  1.250e+05   7.316 2.64e-13 ***
## bedrooms      -2.733e+04  1.528e+03 -17.887  < 2e-16 ***
## bathrooms      2.559e+04  2.647e+03   9.669  < 2e-16 ***
## sqft_living    1.272e+02  3.547e+00  35.857  < 2e-16 ***
## sqft_lot       2.433e-01  3.833e-02   6.347 2.23e-10 ***
## floors        -4.887e+04  3.092e+03 -15.807  < 2e-16 ***
## waterfront1    5.634e+05  1.603e+04  35.149  < 2e-16 ***
## view3          1.529e+05  7.617e+03  20.070  < 2e-16 ***
## view4          3.373e+05  1.167e+04  28.913  < 2e-16 ***
## view2          6.829e+04  5.559e+03  12.284  < 2e-16 ***
## view1          7.658e+04  9.099e+03   8.416  < 2e-16 ***
## condition.L    6.642e+04  1.923e+04   3.453 0.000555 ***
## condition.Q    3.217e+02  1.626e+04   0.020 0.984212    
## condition.C    3.137e+04  1.223e+04   2.564 0.010345 *  
## condition^4   -1.478e+04  7.047e+03  -2.098 0.035950 *  
## grade          5.754e+04  1.814e+03  31.716  < 2e-16 ***
## sqft_above     8.021e+01  3.596e+00  22.305  < 2e-16 ***
## yr_built      -6.588e+02  6.588e+01 -10.000  < 2e-16 ***
## yr_renovated   1.764e+01  2.949e+00   5.980 2.27e-09 ***
## zipcode98125   1.596e+05  1.271e+04  12.563  < 2e-16 ***
## zipcode98028   7.666e+04  1.384e+04   5.540 3.06e-08 ***
## zipcode98136   2.054e+05  1.408e+04  14.594  < 2e-16 ***
## zipcode98074   1.252e+05  1.280e+04   9.781  < 2e-16 ***
## zipcode98053   1.451e+05  1.309e+04  11.089  < 2e-16 ***
## zipcode98003  -5.580e+04  1.387e+04  -4.024 5.74e-05 ***
## zipcode98198  -4.202e+04  1.382e+04  -3.040 0.002370 ** 
## zipcode98146   6.460e+04  1.370e+04   4.714 2.45e-06 ***
## zipcode98038  -1.268e+04  1.216e+04  -1.043 0.297161    
## zipcode98007   2.016e+05  1.690e+04  11.933  < 2e-16 ***
## zipcode98115   2.898e+05  1.202e+04  24.102  < 2e-16 ***
## zipcode98107   3.095e+05  1.414e+04  21.890  < 2e-16 ***
## zipcode98126   1.583e+05  1.313e+04  12.063  < 2e-16 ***
## zipcode98019   4.584e+04  1.549e+04   2.960 0.003084 ** 
## zipcode98103   3.004e+05  1.209e+04  24.850  < 2e-16 ***
## zipcode98002  -1.429e+04  1.519e+04  -0.941 0.346577    
## zipcode98133   1.272e+05  1.232e+04  10.326  < 2e-16 ***
## zipcode98040   4.700e+05  1.412e+04  33.285  < 2e-16 ***
## zipcode98092  -8.049e+04  1.332e+04  -6.044 1.52e-09 ***
## zipcode98030  -3.796e+04  1.422e+04  -2.670 0.007598 ** 
## zipcode98119   4.395e+05  1.566e+04  28.072  < 2e-16 ***
## zipcode98112   5.786e+05  1.427e+04  40.555  < 2e-16 ***
## zipcode98052   1.832e+05  1.219e+04  15.034  < 2e-16 ***
## zipcode98027   1.241e+05  1.291e+04   9.611  < 2e-16 ***
## zipcode98117   2.769e+05  1.214e+04  22.802  < 2e-16 ***
## zipcode98058  -1.494e+04  1.258e+04  -1.187 0.235173    
## zipcode98001  -4.373e+04  1.311e+04  -3.335 0.000854 ***
## zipcode98056   5.351e+04  1.284e+04   4.168 3.09e-05 ***
## zipcode98166   1.566e+04  1.417e+04   1.105 0.269143    
## zipcode98023  -7.646e+04  1.237e+04  -6.184 6.37e-10 ***
## zipcode98070  -5.017e+04  1.833e+04  -2.736 0.006217 ** 
## zipcode98148   2.885e+04  2.347e+04   1.229 0.219014    
## zipcode98105   4.286e+05  1.470e+04  29.144  < 2e-16 ***
## zipcode98042  -3.875e+04  1.225e+04  -3.162 0.001569 ** 
## zipcode98008   2.079e+05  1.384e+04  15.023  < 2e-16 ***
## zipcode98059   3.939e+04  1.264e+04   3.116 0.001837 ** 
## zipcode98122   2.984e+05  1.389e+04  21.481  < 2e-16 ***
## zipcode98144   2.411e+05  1.326e+04  18.184  < 2e-16 ***
## zipcode98004   7.317e+05  1.368e+04  53.499  < 2e-16 ***
## zipcode98005   2.565e+05  1.610e+04  15.928  < 2e-16 ***
## zipcode98034   1.577e+05  1.214e+04  12.987  < 2e-16 ***
## zipcode98075   1.188e+05  1.346e+04   8.822  < 2e-16 ***
## zipcode98116   2.445e+05  1.340e+04  18.246  < 2e-16 ***
## zipcode98010   2.339e+04  1.905e+04   1.228 0.219631    
## zipcode98118   1.304e+05  1.223e+04  10.663  < 2e-16 ***
## zipcode98199   3.490e+05  1.353e+04  25.796  < 2e-16 ***
## zipcode98032  -3.643e+04  1.749e+04  -2.083 0.037261 *  
## zipcode98045   4.988e+04  1.480e+04   3.369 0.000755 ***
## zipcode98102   4.715e+05  1.877e+04  25.123  < 2e-16 ***
## zipcode98077   7.336e+04  1.551e+04   4.730 2.26e-06 ***
## zipcode98108   8.659e+04  1.541e+04   5.619 1.94e-08 ***
## zipcode98168   3.608e+04  1.395e+04   2.587 0.009693 ** 
## zipcode98177   1.702e+05  1.421e+04  11.980  < 2e-16 ***
## zipcode98065   3.907e+04  1.375e+04   2.842 0.004484 ** 
## zipcode98029   1.674e+05  1.360e+04  12.304  < 2e-16 ***
## zipcode98006   2.219e+05  1.259e+04  17.635  < 2e-16 ***
## zipcode98109   4.584e+05  1.847e+04  24.824  < 2e-16 ***
## zipcode98022  -5.148e+04  1.470e+04  -3.502 0.000463 ***
## zipcode98033   3.208e+05  1.271e+04  25.233  < 2e-16 ***
## zipcode98155   1.043e+05  1.251e+04   8.335  < 2e-16 ***
## zipcode98024   1.187e+05  2.071e+04   5.730 1.02e-08 ***
## zipcode98011   7.712e+04  1.529e+04   5.043 4.62e-07 ***
## zipcode98031  -2.838e+04  1.399e+04  -2.029 0.042490 *  
## zipcode98106   1.081e+05  1.327e+04   8.149 3.86e-16 ***
## zipcode98072   1.073e+05  1.407e+04   7.631 2.43e-14 ***
## zipcode98188  -4.004e+03  1.696e+04  -0.236 0.813388    
## zipcode98014   5.725e+04  1.785e+04   3.206 0.001346 ** 
## zipcode98055   4.549e+03  1.398e+04   0.325 0.744864    
## zipcode98039   1.271e+06  2.507e+04  50.694  < 2e-16 ***
## sqft_living15  1.125e+01  2.884e+00   3.901 9.62e-05 ***
## sqft_lot15    -1.420e-01  6.024e-02  -2.357 0.018455 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 160200 on 21523 degrees of freedom
## Multiple R-squared:  0.8103, Adjusted R-squared:  0.8095 
## F-statistic:  1033 on 89 and 21523 DF,  p-value: < 2.2e-16

another thing to take care with is that zibcode has a lot of levels and we need to round it

according to residual i will try to round it to small numbers of areas using residuals

grouping<-houses%>%mutate(resid=resid(fit))%>%
        group_by(zipcode)%>%summarize(medresid=median(resid),
                                      cnt=n())%>%arrange(medresid)%>%
        mutate(cumcnt=cumsum(cnt),
               group=ntile(cumcnt,5))

houses<-houses%>%left_join(grouping[,c("zipcode","group")])%>%mutate(group=factor(group))

now remove the zip from the data and run fit again

houses<-houses%>%select(-zipcode)

fit<-houses%>%with(lm(price~.,data=houses))
summary(fit)

## 
## Call:
## lm(formula = price ~ ., data = houses)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1442272   -96219    -2491    82020  4346063 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.666e+06  1.240e+05  29.576  < 2e-16 ***
## bedrooms      -3.549e+04  1.766e+03 -20.102  < 2e-16 ***
## bathrooms      3.544e+04  3.077e+03  11.518  < 2e-16 ***
## sqft_living    1.445e+02  4.084e+00  35.384  < 2e-16 ***
## sqft_lot       1.418e-01  4.464e-02   3.175 0.001499 ** 
## floors        -2.230e+04  3.343e+03  -6.672 2.58e-11 ***
## waterfront1    5.423e+05  1.840e+04  29.474  < 2e-16 ***
## view3          1.248e+05  8.777e+03  14.213  < 2e-16 ***
## view4          3.287e+05  1.347e+04  24.399  < 2e-16 ***
## view2          5.990e+04  6.399e+03   9.361  < 2e-16 ***
## view1          1.073e+05  1.053e+04  10.192  < 2e-16 ***
## condition.L    4.642e+04  2.247e+04   2.066 0.038845 *  
## condition.Q   -2.564e+03  1.901e+04  -0.135 0.892712    
## condition.C    2.743e+04  1.430e+04   1.918 0.055123 .  
## condition^4   -8.245e+03  8.226e+03  -1.002 0.316153    
## grade          7.973e+04  2.021e+03  39.455  < 2e-16 ***
## sqft_above     5.174e+01  3.999e+00  12.939  < 2e-16 ***
## yr_built      -1.951e+03  6.585e+01 -29.625  < 2e-16 ***
## yr_renovated   1.237e+01  3.416e+00   3.620 0.000295 ***
## sqft_living15  3.527e+00  3.140e+00   1.123 0.261383    
## sqft_lot15    -2.867e-01  6.825e-02  -4.200 2.68e-05 ***
## group2        -1.898e+05  4.245e+03 -44.714  < 2e-16 ***
## group3        -2.847e+05  4.573e+03 -62.261  < 2e-16 ***
## group4        -2.700e+05  4.470e+03 -60.399  < 2e-16 ***
## group5        -3.798e+05  4.807e+03 -79.017  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 187800 on 21588 degrees of freedom
## Multiple R-squared:  0.7387, Adjusted R-squared:  0.7384 
## F-statistic:  2542 on 24 and 21588 DF,  p-value: < 2.2e-16

most coefficients here are beautiful

now what about outlier!!

#using mahalanobis distance for multivariate outliers 

num2<-houses%>%select(price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_living15,sqft_lot15)

mah<-mahalanobis(x = num2,center = colMeans(num2),cov = cov(num2))

cutmahup<-qchisq(.99,ncol(num2))
cutmahdown<-qchisq(.01,ncol(num2))

outidxup<-mah>cutmahup%>%as.numeric()
outidxdown<-mah<cutmahdown%>%as.numeric()

table(outidxup)

## outidxup
## FALSE  TRUE 
## 20489  1124

table(outidxdown)

## outidxdown
## FALSE  TRUE 
## 20434  1179

next testing for influential points

inf<-hatvalues(fit)
cutinf<-(2*(ncol(houses)-1)+1)/nrow(houses)

table(inf>cutinf)

## 
## FALSE  TRUE 
## 17832  3781

horrible !!

infidx<-inf>cutinf%>%as.numeric()

testing cooks distance

cook<-cooks.distance(fit)

cutcook<-4/(nrow(houses)-(ncol(houses)-1)-1)

table(cook>cutcook)

## 
## FALSE  TRUE 
## 20450  1163

cookidx<-cook>cutcook%>%as.numeric()

testing for variance inflation

vif<-vif(fit)
vif

##                   GVIF Df GVIF^(1/(2*Df))
## bedrooms      1.652718  1        1.285581
## bathrooms     3.442414  1        1.855374
## sqft_living   8.620412  1        2.936054
## sqft_lot      2.095620  1        1.447626
## floors        2.123134  1        1.457098
## waterfront    1.552769  1        1.246101
## view          1.852788  4        1.080135
## condition     1.325168  4        1.035819
## grade         3.457940  1        1.859554
## sqft_above    6.719385  1        2.592178
## yr_built      2.292582  1        1.514128
## yr_renovated  1.153622  1        1.074068
## sqft_living15 2.839157  1        1.684980
## sqft_lot15    2.128469  1        1.458927
## group         1.313838  4        1.034708

good news , we don’t have here any vif value over 10 and this indicate absence of multicolinearity

testing auto correlation

auto<-durbinWatsonTest(fit,simulate = T,reps = 1000,method = "resample")
auto

##  lag Autocorrelation D-W Statistic p-value
##    1      0.01596825      1.968049   0.024
##  Alternative hypothesis: rho != 0

great , p value>.01 indicate that there is no auto correlation

Now lets see the most bad records in our data

allbad<-outidxup+outidxdown+infidx+cookidx

table(allbad)

## allbad
##     0     1     2     3 
## 16491  3473  1173   476

i will delete all records which break 2 rules

bad<-allbad>=2

table(bad)

## bad
## FALSE  TRUE 
## 19964  1649

houses<-houses%>%anti_join(houses[bad,])

running the fit again

fit<-houses%>%with(lm(price~.,data=houses))
summary(fit)

## 
## Call:
## lm(formula = price ~ ., data = houses)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -523005  -78395   -4085   68184  929903 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.282e+06  8.637e+04  37.999  < 2e-16 ***
## bedrooms      -1.981e+04  1.334e+03 -14.845  < 2e-16 ***
## bathrooms      2.669e+04  2.234e+03  11.951  < 2e-16 ***
## sqft_living    1.027e+02  3.129e+00  32.826  < 2e-16 ***
## sqft_lot       4.835e-01  1.229e-01   3.934 8.37e-05 ***
## floors         4.338e+03  2.362e+03   1.836   0.0664 .  
## waterfront1    5.293e+05  3.527e+04  15.006  < 2e-16 ***
## view3          1.195e+05  7.224e+03  16.547  < 2e-16 ***
## view4          3.497e+05  1.561e+04  22.400  < 2e-16 ***
## view2          6.127e+04  4.665e+03  13.135  < 2e-16 ***
## view1          1.020e+05  8.631e+03  11.813  < 2e-16 ***
## condition.L    5.875e+04  3.487e+04   1.685   0.0921 .  
## condition.Q    3.234e+02  2.947e+04   0.011   0.9912    
## condition.C    1.060e+04  1.884e+04   0.563   0.5735    
## condition^4    3.950e+03  8.677e+03   0.455   0.6489    
## grade          7.506e+04  1.428e+03  52.582  < 2e-16 ***
## sqft_above     2.623e+01  2.954e+00   8.880  < 2e-16 ***
## yr_built      -1.760e+03  4.563e+01 -38.576  < 2e-16 ***
## yr_renovated   1.293e+01  2.527e+00   5.118 3.11e-07 ***
## sqft_living15  2.192e+01  2.424e+00   9.045  < 2e-16 ***
## sqft_lot15    -6.147e-01  1.506e-01  -4.082 4.48e-05 ***
## group2        -1.369e+05  2.930e+03 -46.710  < 2e-16 ***
## group3        -2.334e+05  3.141e+03 -74.298  < 2e-16 ***
## group4        -2.113e+05  3.097e+03 -68.238  < 2e-16 ***
## group5        -3.185e+05  3.334e+03 -95.529  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 122200 on 19939 degrees of freedom
## Multiple R-squared:  0.7602, Adjusted R-squared:  0.7599 
## F-statistic:  2634 on 24 and 19939 DF,  p-value: < 2.2e-16

plot(fit,3)

we see here that we have some degree of heteroscedasticity

checking the normality

qqPlot(fit)

## [1]  5718 18796

OK , the qqplot shows that the data has a degree of skewness

and to prove it lets see histogram

hist(rstudent(fit))

some right skew but the data still good

the only issue we have to handle is heteroscedasticity

now i will check partial residual to know the problematic variable

resid<-resid(fit)
term<-predict(fit,houses,type = "terms")%>%data.frame()

partial.resid<-resid+term
houses.org<-houses
names(houses.org)<-paste("org",names(houses),sep = ".")
names(partial.resid)<-paste("part",names(partial.resid),sep = ".")
names(term)<-paste("term",names(term),sep = ".")
allpartial<-cbind(houses.org[,-1],partial.resid,term)


allpartial%>%ggplot(aes(org.sqft_living,part.sqft_living))+
        geom_point()+
        geom_smooth()+
        geom_line(aes(x = org.sqft_living,y=term.sqft_living),color="red",lwd=1)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

allpartial%>%ggplot(aes(org.sqft_lot,part.sqft_lot))+
        geom_point()+
        geom_smooth()+
        geom_line(aes(x = org.sqft_lot,y=term.sqft_lot),color="red",lwd=1)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

allpartial%>%ggplot(aes(org.sqft_above,part.sqft_above))+
        geom_point()+
        geom_smooth()+
        geom_line(aes(x = org.sqft_above,y=term.sqft_above),color="red",lwd=1)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

allpartial%>%ggplot(aes(org.sqft_living15,part.sqft_living15))+
        geom_point()+
        geom_smooth()+
        geom_line(aes(x = org.sqft_living15,y=term.sqft_living15),color="red",lwd=1)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

allpartial%>%ggplot(aes(org.sqft_lot15,part.sqft_lot15))+
        geom_point()+
        geom_smooth()+
        geom_line(aes(x = org.sqft_lot15,y=term.sqft_lot15),color="red",lwd=1)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

allpartial%>%ggplot(aes(org.yr_built,part.yr_built))+
        geom_point()+
        geom_smooth()+
        geom_line(aes(x = org.yr_built,y=term.yr_built),color="red",lwd=1)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

allpartial%>%ggplot(aes(org.yr_renovated,part.yr_renovated))+
        geom_point()+
        geom_smooth()+
        geom_line(aes(x = org.yr_renovated,y=term.yr_renovated),color="red",lwd=1)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

here we can find that the most problems associated with square foot of lot and the same variable in 2015 and the year of reonivation

i will categorize the renovation years variable into 4 categories according to residuals

houses$yr_renovated%>%table()%>%kable("html")

.	Freq
0	19281
1940	2
1944	1
1945	3
1946	2
1948	1
1950	2
1951	1
1953	3
1954	1
1955	3
1956	1
1957	3
1958	5
1959	1
1960	3
1962	1
1963	3
1964	5
1965	3
1967	2
1968	6
1969	3
1970	8
1971	2
1972	2
1973	5
1974	3
1975	6
1976	2
1977	7
1978	4
1979	7
1980	9
1981	3
1982	7
1983	14
1984	15
1985	12
1986	8
1987	12
1988	12
1989	17
1990	20
1991	13
1992	12
1993	15
1994	13
1995	13
1996	9
1997	11
1998	13
1999	9
2000	21
2001	15
2002	12
2003	25
2004	20
2005	27
2006	18
2007	26
2008	9
2009	19
2010	15
2011	12
2012	11
2013	28
2014	78
2015	9

stdresid<-rstudent(fit)

catigor<-houses%>%mutate(stdresid)%>%
        group_by(yr_renovated)%>%
        summarize(medstd=median(stdresid),cnt=n())%>%
        arrange(medstd)%>%
        mutate(cumcnt=cumsum(cnt),renovC=ntile(cumcnt,4))

catigor%>%kable("markdown")

yr_renovated	medstd	cnt	cumcnt	renovC
1967	-1.3146319	2	2	1
1976	-1.1681682	2	4	1
1963	-1.1644635	3	7	1
1969	-1.1364518	3	10	1
1964	-1.1031153	5	15	1
1970	-1.0855883	8	23	1
1996	-1.0752276	9	32	1
1953	-1.0183430	3	35	1
1945	-0.9554270	3	38	1
1960	-0.8671719	3	41	1
1959	-0.6575658	1	42	1
1950	-0.6536802	2	44	1
1978	-0.5931954	4	48	1
1982	-0.5847983	7	55	1
1973	-0.5711407	5	60	1
1971	-0.5155870	2	62	1
1981	-0.4807747	3	65	1
1957	-0.4715717	3	68	1
1995	-0.4594498	13	81	2
1983	-0.4412839	14	95	2
1977	-0.4311997	7	102	2
1984	-0.4227199	15	117	2
1997	-0.4130501	11	128	2
1968	-0.4060072	6	134	2
1974	-0.3795095	3	137	2
1946	-0.3785293	2	139	2
1940	-0.3263868	2	141	2
1994	-0.3208032	13	154	2
1993	-0.2803082	15	169	2
1988	-0.2740405	12	181	2
1948	-0.2578666	1	182	2
1991	-0.2541159	13	195	2
1992	-0.2498357	12	207	2
1989	-0.2221852	17	224	2
1958	-0.2079038	5	229	2
2001	-0.2000301	15	244	3
1987	-0.1873965	12	256	3
2000	-0.1789553	21	277	3
1990	-0.1543012	20	297	3
1980	-0.1477219	9	306	3
1972	-0.1006371	2	308	3
1965	-0.0930242	3	311	3
2013	-0.0892279	28	339	3
1955	-0.0803034	3	342	3
1986	-0.0612799	8	350	3
2012	-0.0611215	11	361	3
1985	-0.0536939	12	373	3
0	-0.0324502	19281	19654	3
1975	0.0167476	6	19660	3
2015	0.0227493	9	19669	3
2009	0.0288613	19	19688	3
2011	0.0822008	12	19700	3
1998	0.0945444	13	19713	4
2007	0.1200742	26	19739	4
1979	0.1245174	7	19746	4
2004	0.2679457	20	19766	4
2006	0.2908748	18	19784	4
2003	0.2965783	25	19809	4
1944	0.3629567	1	19810	4
1951	0.4672665	1	19811	4
2014	0.4996164	78	19889	4
2005	0.5223866	27	19916	4
2010	0.5686400	15	19931	4
2002	0.6547362	12	19943	4
1956	0.8582450	1	19944	4
1999	0.9077240	9	19953	4
2008	1.0379758	9	19962	4
1962	1.2237277	1	19963	4
1954	2.1236397	1	19964	4

catigor<-catigor%>%select(yr_renovated,renovC)


houses<-(houses)%>%left_join(catigor)%>%mutate(renovC=factor(renovC))

## Joining, by = "yr_renovated"

houses<-houses%>%select(-yr_renovated)

houses%>%head%>%kable("markdown")

price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	condition	grade	sqft_above	yr_built	sqft_living15	sqft_lot15	group	renovC
221900	3	1.00	1180	5650	1	3	7	1180	1955	1340	5650	5	3
538000	3	2.25	2570	7242	2	3	7	2170	1951	1690	7639	4	2
180000	2	1.00	770	10000	1	3	6	770	1933	2720	8062	4	3
604000	4	3.00	1960	5000	1	5	7	1050	1965	1360	5000	4	3
510000	3	2.00	1680	8080	1	3	8	1680	1987	1800	7503	4	3
257500	3	2.25	1715	6819	2	3	7	1715	1995	2238	6819	5	3

table(houses$renovC)

## 
##     1     2     3     4 
##    68   161 19471   264

recalculating the fit

fit<-houses%>%with(lm(price~.,data=houses))
summary(fit)

## 
## Call:
## lm(formula = price ~ ., data = houses)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -523310  -77900   -4030   67725  930210 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.239e+06  8.498e+04  38.117  < 2e-16 ***
## bedrooms      -1.986e+04  1.332e+03 -14.911  < 2e-16 ***
## bathrooms      2.627e+04  2.226e+03  11.801  < 2e-16 ***
## sqft_living    1.028e+02  3.123e+00  32.907  < 2e-16 ***
## sqft_lot       4.764e-01  1.227e-01   3.884 0.000103 ***
## floors         5.000e+03  2.358e+03   2.120 0.034018 *  
## waterfront1    5.332e+05  3.522e+04  15.140  < 2e-16 ***
## view3          1.200e+05  7.211e+03  16.637  < 2e-16 ***
## view4          3.492e+05  1.559e+04  22.398  < 2e-16 ***
## view2          6.184e+04  4.657e+03  13.277  < 2e-16 ***
## view1          1.023e+05  8.615e+03  11.877  < 2e-16 ***
## condition.L    5.938e+04  3.481e+04   1.706 0.088056 .  
## condition.Q   -5.216e+02  2.942e+04  -0.018 0.985853    
## condition.C    1.148e+04  1.881e+04   0.611 0.541475    
## condition^4    3.053e+03  8.662e+03   0.352 0.724501    
## grade          7.469e+04  1.426e+03  52.397  < 2e-16 ***
## sqft_above     2.671e+01  2.949e+00   9.056  < 2e-16 ***
## yr_built      -1.772e+03  4.501e+01 -39.360  < 2e-16 ***
## sqft_living15  2.181e+01  2.417e+00   9.024  < 2e-16 ***
## sqft_lot15    -5.939e-01  1.503e-01  -3.950 7.83e-05 ***
## group2        -1.367e+05  2.925e+03 -46.718  < 2e-16 ***
## group3        -2.332e+05  3.136e+03 -74.364  < 2e-16 ***
## group4        -2.107e+05  3.092e+03 -68.141  < 2e-16 ***
## group5        -3.184e+05  3.328e+03 -95.665  < 2e-16 ***
## renovC2        6.885e+04  1.767e+04   3.897 9.79e-05 ***
## renovC3        6.777e+04  1.493e+04   4.541 5.64e-06 ***
## renovC4        1.366e+05  1.664e+04   8.204 2.46e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 122000 on 19937 degrees of freedom
## Multiple R-squared:  0.7611, Adjusted R-squared:  0.7608 
## F-statistic:  2443 on 26 and 19937 DF,  p-value: < 2.2e-16

Good , all categories are significant

now lets try the weighted regression and start with calculating the weights

residl<-resid(fit)%>%abs()
fited<-fitted(fit)
residfit<-lm(residl~fited)
fittresid<-fitted(residfit)
wts<-1/fittresid^2

wtsfit<-houses%>%with(lm(price~.,data=houses,weights = wts))
summary(wtsfit)

## 
## Call:
## lm(formula = price ~ ., data = houses, weights = wts)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0657 -0.8911 -0.0765  0.7666 13.6759 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.780e+06  7.615e+04  36.506  < 2e-16 ***
## bedrooms      -1.943e+04  1.158e+03 -16.779  < 2e-16 ***
## bathrooms      1.874e+04  1.974e+03   9.495  < 2e-16 ***
## sqft_living    9.526e+01  2.969e+00  32.083  < 2e-16 ***
## sqft_lot       6.726e-01  1.059e-01   6.349 2.22e-10 ***
## floors         1.150e+04  2.101e+03   5.473 4.47e-08 ***
## waterfront1    5.600e+05  7.409e+04   7.558 4.27e-14 ***
## view3          1.071e+05  8.500e+03  12.597  < 2e-16 ***
## view4          3.550e+05  2.536e+04  13.999  < 2e-16 ***
## view2          5.989e+04  4.929e+03  12.149  < 2e-16 ***
## view1          9.405e+04  9.276e+03  10.140  < 2e-16 ***
## condition.L    8.421e+04  1.866e+04   4.512 6.47e-06 ***
## condition.Q   -1.985e+04  1.577e+04  -1.259 0.208127    
## condition.C    1.014e+04  1.036e+04   0.979 0.327814    
## condition^4    8.627e+03  5.048e+03   1.709 0.087436 .  
## grade          5.794e+04  1.244e+03  46.559  < 2e-16 ***
## sqft_above     1.802e+01  2.819e+00   6.392 1.67e-10 ***
## yr_built      -1.481e+03  4.079e+01 -36.304  < 2e-16 ***
## sqft_living15  3.203e+01  2.260e+00  14.173  < 2e-16 ***
## sqft_lot15    -4.333e-01  1.249e-01  -3.471 0.000520 ***
## group2        -1.205e+05  3.246e+03 -37.123  < 2e-16 ***
## group3        -2.220e+05  3.243e+03 -68.442  < 2e-16 ***
## group4        -1.953e+05  3.262e+03 -59.878  < 2e-16 ***
## group5        -2.901e+05  3.290e+03 -88.173  < 2e-16 ***
## renovC2        5.253e+04  1.454e+04   3.613 0.000304 ***
## renovC3        6.112e+04  1.166e+04   5.244 1.59e-07 ***
## renovC4        1.237e+05  1.393e+04   8.878  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.26 on 19937 degrees of freedom
## Multiple R-squared:  0.7181, Adjusted R-squared:  0.7177 
## F-statistic:  1953 on 26 and 19937 DF,  p-value: < 2.2e-16

plot(wtsfit,3,main = "weighted regression")

plot(fit,3,main = "Ordinary regression")

we see here an improve in the linearity and heteroscedasticity

comparing the standardized coefficients of OLS model and weighted model

data.frame(cbind(lm.beta(fit)$standardized.coefficients,lm.beta(wtsfit)$standardized.coefficients))%>%kable(format = "markdown",digits = 3,col.names = c("Ordinary","Weighted"))

	Ordinary	Weighted
(Intercept)	0.000	0.000
bedrooms	-0.068	-0.067
bathrooms	0.074	0.053
sqft_living	0.318	0.294
sqft_lot	0.025	0.035
floors	0.011	0.026
waterfront1	0.057	0.059
view3	0.059	0.053
view4	0.085	0.086
view2	0.047	0.046
view1	0.041	0.038
condition.L	0.048	0.068
condition.Q	-0.001	-0.023
condition.C	0.014	0.013
condition^4	0.006	0.018
grade	0.320	0.248
sqft_above	0.078	0.053
yr_built	-0.208	-0.173
sqft_living15	0.055	0.081
sqft_lot15	-0.025	-0.018
group2	-0.236	-0.208
group3	-0.377	-0.359
group4	-0.348	-0.322
group5	-0.497	-0.453
renovC2	0.025	0.019
renovC3	0.042	0.038
renovC4	0.063	0.057

The Coefficients are so close to each other

Now lets see the ability of our model to predict out sample data

houses<-cbind(houses,wts)

set.seed(4785)

trainidx<-createDataPartition(y = houses$price,p = .75,list = F)
traindata<-houses[trainidx,]
testdata<-houses[-trainidx,]

running the model

set.seed(3454)
trainfit<-traindata%>%with(lm(price~.,data = traindata[,-length(names(traindata))],weights = traindata$wts))
summary(trainfit)

## 
## Call:
## lm(formula = price ~ ., data = traindata[, -length(names(traindata))], 
##     weights = traindata$wts)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1019 -0.8939 -0.0758  0.7716 13.7050 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.807e+06  8.800e+04  31.900  < 2e-16 ***
## bedrooms      -1.950e+04  1.336e+03 -14.597  < 2e-16 ***
## bathrooms      1.838e+04  2.272e+03   8.090 6.40e-16 ***
## sqft_living    9.243e+01  3.417e+00  27.049  < 2e-16 ***
## sqft_lot       7.483e-01  1.235e-01   6.059 1.40e-09 ***
## floors         1.257e+04  2.433e+03   5.165 2.43e-07 ***
## waterfront1    5.638e+05  7.454e+04   7.564 4.13e-14 ***
## view3          1.107e+05  9.718e+03  11.394  < 2e-16 ***
## view4          3.500e+05  2.768e+04  12.643  < 2e-16 ***
## view2          6.173e+04  5.685e+03  10.859  < 2e-16 ***
## view1          9.688e+04  1.020e+04   9.502  < 2e-16 ***
## condition.L    9.235e+04  2.330e+04   3.964 7.41e-05 ***
## condition.Q   -2.660e+04  1.968e+04  -1.352  0.17643    
## condition.C    1.896e+04  1.275e+04   1.487  0.13693    
## condition^4    4.314e+03  6.060e+03   0.712  0.47647    
## grade          5.897e+04  1.439e+03  40.973  < 2e-16 ***
## sqft_above     2.151e+01  3.234e+00   6.651 3.00e-11 ***
## yr_built      -1.498e+03  4.712e+01 -31.799  < 2e-16 ***
## sqft_living15  3.170e+01  2.624e+00  12.084  < 2e-16 ***
## sqft_lot15    -5.765e-01  1.480e-01  -3.896 9.84e-05 ***
## group2        -1.225e+05  3.767e+03 -32.536  < 2e-16 ***
## group3        -2.252e+05  3.761e+03 -59.880  < 2e-16 ***
## group4        -1.974e+05  3.780e+03 -52.208  < 2e-16 ***
## group5        -2.945e+05  3.816e+03 -77.169  < 2e-16 ***
## renovC2        5.127e+04  1.678e+04   3.055  0.00226 ** 
## renovC3        6.332e+04  1.342e+04   4.718 2.41e-06 ***
## renovC4        1.256e+05  1.635e+04   7.684 1.64e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.259 on 14947 degrees of freedom
## Multiple R-squared:  0.7204, Adjusted R-squared:  0.7199 
## F-statistic:  1481 on 26 and 14947 DF,  p-value: < 2.2e-16

predicted<-predict(trainfit,testdata)

cbind(RMSE=RMSE(pred = predicted,obs = testdata$price),R2=R2(pred = predicted,obs = testdata$price))%>%kable("markdown")

RMSE	R2
124517.3	0.7519751

Predicting houses prices

Mo’men Mohamed

April 19, 2019

REGARDS