library(data.table)
# Read the housing CSV into a data frame and report its size (rows, columns).
house <- read.csv(file = "kc_house_data.csv")
dim(house)
[1] 21613 21
# Show the first and last ten rows as data.tables, then the structure
# (column names and types) of the full data frame.
data.table(head(house, n = 10))
data.table(tail(house, n = 10))
str(house)
'data.frame': 21613 obs. of 21 variables:
$ id : num 7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
$ date : Factor w/ 372 levels "20140502T000000",..: 165 221 291 221 284 11 57 252 340 306 ...
$ price : num 221900 538000 180000 604000 510000 ...
$ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
$ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
$ sqft_living : int 1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
$ sqft_lot : int 5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
$ floors : num 1 2 1 1 1 1 2 1 1 2 ...
$ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
$ view : int 0 0 0 0 0 0 0 0 0 0 ...
$ condition : int 3 3 3 5 3 3 3 3 3 3 ...
$ grade : int 7 7 6 7 8 11 7 7 7 7 ...
$ sqft_above : int 1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
$ sqft_basement: int 0 400 0 910 0 1530 0 0 730 0 ...
$ yr_built : int 1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
$ yr_renovated : int 0 1991 0 0 0 0 0 0 0 0 ...
$ zipcode : int 98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
$ lat : num 47.5 47.7 47.7 47.5 47.6 ...
$ long : num -122 -122 -122 -122 -122 ...
$ sqft_living15: int 1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
$ sqft_lot15 : int 5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
library(dplyr)
# Column-wise preview of the data frame (one line per variable: name, type,
# leading values).
glimpse(house)
Observations: 21,613
Variables: 21
$ id <dbl> 7129300520, 6414100192, 5631500400, 2487200875, 1954400510, 7237550310, 132...
$ date <fctr> 20141013T000000, 20141209T000000, 20150225T000000, 20141209T000000, 201502...
$ price <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, 257500, 291850, 229500, 32...
$ bedrooms <int> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2, 3, 4, 3, 5, 2, 3, ...
$ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.50, 2.50, 1.00, 1.0...
$ sqft_living <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 1890, 3560, 1160, 1430...
$ sqft_lot <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470, 6560, 9796, 6000, ...
$ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.5, 1.0, 1.5, ...
$ waterfront <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ view <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 4, 0, 0, 0, ...
$ condition <int> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 4, ...
$ grade <int> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7, 7, 7, 9, 8, 7, 8,...
$ sqft_above <int> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 1890, 1860, 860, 1430,...
$ sqft_basement <int> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, 0, 970, 0, 0, 0, 0,...
$ yr_built <int> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 2003, 1965, 1942, 192...
$ yr_renovated <int> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ zipcode <int> 98178, 98125, 98028, 98136, 98074, 98053, 98003, 98198, 98146, 98038, 98007...
$ lat <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561, 47.3097, 47.4095, 47....
$ long <dbl> -122.257, -122.319, -122.233, -122.393, -122.045, -122.005, -122.327, -122....
$ sqft_living15 <int> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 2390, 2210, 1330, 178...
$ sqft_lot15 <int> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, 7570, 8925, 6000, 1...
## DATA WRANGLING
#### Is any data missing?
# Print, for every column of a data frame, how many of its values are NA.
#
# Args:
#   x: A data frame.
#
# Returns:
#   NULL, invisibly. The function is called for its printed report: one line
#   per column of the form "In column <name> total NA values are: <count>".
miss <- function(x) {
  # seq_len() gives an empty sequence for a zero-column input; 1:ncol(x)
  # would instead iterate over c(1, 0) and fail when indexing the column.
  for (i in seq_len(ncol(x))) {
    cat(
      "In column", names(x)[i],
      "total NA values are:", sum(is.na(x[[i]])),
      "\n"
    )
  }
  invisible(NULL)
}
# Report the NA count of every column in the housing data.
miss(house)
In column id total NA values are: 0
In column date total NA values are: 0
In column price total NA values are: 0
In column bedrooms total NA values are: 0
In column bathrooms total NA values are: 0
In column sqft_living total NA values are: 0
In column sqft_lot total NA values are: 0
In column floors total NA values are: 0
In column waterfront total NA values are: 0
In column view total NA values are: 0
In column condition total NA values are: 0
In column grade total NA values are: 0
In column sqft_above total NA values are: 0
In column sqft_basement total NA values are: 0
In column yr_built total NA values are: 0
In column yr_renovated total NA values are: 0
In column zipcode total NA values are: 0
In column lat total NA values are: 0
In column long total NA values are: 0
In column sqft_living15 total NA values are: 0
In column sqft_lot15 total NA values are: 0
# Print, for every column of a data frame, how many of its values are the
# empty string "".
#
# Args:
#   x: A data frame.
#
# Returns:
#   NULL, invisibly. The function is called for its printed report: one line
#   per column of the form "In column <name> total blank values are: <count>".
blank <- function(x) {
  # seq_len() gives an empty sequence for a zero-column input; 1:ncol(x)
  # would instead iterate over c(1, 0) and fail when indexing the column.
  for (i in seq_len(ncol(x))) {
    # NA == "" is NA, so without na.rm a single missing value would turn the
    # whole count into NA and hide the real number of blanks.
    n_blank <- sum(x[[i]] == "", na.rm = TRUE)
    cat(
      "In column", names(x)[i],
      "total blank values are:", n_blank,
      "\n"
    )
  }
  invisible(NULL)
}
# Report the count of empty-string values for every column in the housing data.
blank(house)
In column id total blank values are: 0
In column date total blank values are: 0
In column price total blank values are: 0
In column bedrooms total blank values are: 0
In column bathrooms total blank values are: 0
In column sqft_living total blank values are: 0
In column sqft_lot total blank values are: 0
In column floors total blank values are: 0
In column waterfront total blank values are: 0
In column view total blank values are: 0
In column condition total blank values are: 0
In column grade total blank values are: 0
In column sqft_above total blank values are: 0
In column sqft_basement total blank values are: 0
In column yr_built total blank values are: 0
In column yr_renovated total blank values are: 0
In column zipcode total blank values are: 0
In column lat total blank values are: 0
In column long total blank values are: 0
In column sqft_living15 total blank values are: 0
In column sqft_lot15 total blank values are: 0
# Drop the id and date columns before modelling. They are selected by name
# rather than by position (-c(1, 2)), so the result stays correct even if the
# column order of the input file changes.
house1 <- house[, !(names(house) %in% c("id", "date")), drop = FALSE]
summary(house1)
price bedrooms bathrooms sqft_living sqft_lot floors
Min. : 75000 Min. : 0.000 Min. :0.000 Min. : 290 Min. : 520 Min. :1.000
1st Qu.: 321950 1st Qu.: 3.000 1st Qu.:1.750 1st Qu.: 1427 1st Qu.: 5040 1st Qu.:1.000
Median : 450000 Median : 3.000 Median :2.250 Median : 1910 Median : 7618 Median :1.500
Mean : 540088 Mean : 3.371 Mean :2.115 Mean : 2080 Mean : 15107 Mean :1.494
3rd Qu.: 645000 3rd Qu.: 4.000 3rd Qu.:2.500 3rd Qu.: 2550 3rd Qu.: 10688 3rd Qu.:2.000
Max. :7700000 Max. :33.000 Max. :8.000 Max. :13540 Max. :1651359 Max. :3.500
waterfront view condition grade sqft_above sqft_basement
Min. :0.000000 Min. :0.0000 Min. :1.000 Min. : 1.000 Min. : 290 Min. : 0.0
1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.: 7.000 1st Qu.:1190 1st Qu.: 0.0
Median :0.000000 Median :0.0000 Median :3.000 Median : 7.000 Median :1560 Median : 0.0
Mean :0.007542 Mean :0.2343 Mean :3.409 Mean : 7.657 Mean :1788 Mean : 291.5
3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:4.000 3rd Qu.: 8.000 3rd Qu.:2210 3rd Qu.: 560.0
Max. :1.000000 Max. :4.0000 Max. :5.000 Max. :13.000 Max. :9410 Max. :4820.0
yr_built yr_renovated zipcode lat long sqft_living15
Min. :1900 Min. : 0.0 Min. :98001 Min. :47.16 Min. :-122.5 Min. : 399
1st Qu.:1951 1st Qu.: 0.0 1st Qu.:98033 1st Qu.:47.47 1st Qu.:-122.3 1st Qu.:1490
Median :1975 Median : 0.0 Median :98065 Median :47.57 Median :-122.2 Median :1840
Mean :1971 Mean : 84.4 Mean :98078 Mean :47.56 Mean :-122.2 Mean :1987
3rd Qu.:1997 3rd Qu.: 0.0 3rd Qu.:98118 3rd Qu.:47.68 3rd Qu.:-122.1 3rd Qu.:2360
Max. :2015 Max. :2015.0 Max. :98199 Max. :47.78 Max. :-121.3 Max. :6210
sqft_lot15
Min. : 651
1st Qu.: 5100
Median : 7620
Mean : 12768
3rd Qu.: 10083
Max. :871200
library(caTools)
package ‘caTools’ was built under R version 3.4.4
# Seeded 75/25 partition of the rows: `split` is TRUE for training rows and
# FALSE for test rows.
set.seed(123)
split <- sample.split(house1$price, SplitRatio = 0.75)
training_set <- house1[split, ]
test_set <- house1[!split, ]
library(corrplot)
corrplot 0.84 loaded
# Correlation heat map of every variable except the coordinates.
options(repr.plot.width = 10, repr.plot.height = 10)
# Choose columns by name: the original positional selection c(1:15, 18, 19)
# skipped positions 16 and 17 of house1, which are lat and long.
corr <- cor(house1[, setdiff(names(house1), c("lat", "long"))])
corrplot(
  corr,
  method = "color",
  outline = TRUE, # spelled out: T is an ordinary variable and can be reassigned
  addgrid.col = "darkgray",
  order = "hclust",
  addrect = 4,
  rect.col = "black",
  rect.lwd = 5,
  cl.pos = "b",
  tl.col = "indianred4",
  tl.cex = 1.5,
  cl.cex = 1.5,
  addCoef.col = "black",
  number.digits = 2,
  number.cex = 0.75,
  col = colorRampPalette(c("green4", "white", "red"))(100)
)
# First model: linear regression of price on the available predictors.
#
# sqft_basement is deliberately left out. In the rows shown by str(house),
# sqft_living equals sqft_above + sqft_basement, and with all three terms in
# the formula lm() reported one coefficient "not defined because of
# singularities" (sqft_basement = NA) and predict() warned about a
# rank-deficient fit. lm() was already dropping the aliased term, so removing
# it leaves the other estimates unchanged and only silences the warning.
regressor <- lm(
  price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
    waterfront + view + condition + grade + sqft_above + yr_built +
    yr_renovated + zipcode + sqft_living15 + sqft_lot15,
  data = training_set
)
# Print numbers (including p-values) in fixed rather than scientific notation.
options(scipen = 999)
summary(regressor)
Call:
lm(formula = price ~ bedrooms + bathrooms + sqft_living + sqft_lot +
floors + waterfront + view + condition + grade + sqft_above +
sqft_basement + yr_built + yr_renovated + zipcode + sqft_living15 +
sqft_lot15, data = training_set)
Residuals:
Min 1Q Median 3Q Max
-1296939 -111677 -9825 92081 4168237
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9699903.30218 3634933.32384 2.669 0.00763 **
bedrooms -42737.27095 2337.32266 -18.285 < 0.0000000000000002 ***
bathrooms 45152.95452 4046.07923 11.160 < 0.0000000000000002 ***
sqft_living 181.61504 5.42746 33.462 < 0.0000000000000002 ***
sqft_lot 0.02038 0.05948 0.343 0.73193
floors 23342.45448 4472.82155 5.219 0.000000182277830 ***
waterfront 622641.20338 21429.25029 29.056 < 0.0000000000000002 ***
view 43166.46999 2622.78965 16.458 < 0.0000000000000002 ***
condition 18590.10956 2950.98054 6.300 0.000000000305699 ***
grade 119691.42897 2629.77930 45.514 < 0.0000000000000002 ***
sqft_above -5.59422 5.32644 -1.050 0.29361
sqft_basement NA NA NA NA
yr_built -3603.01098 86.55945 -41.625 < 0.0000000000000002 ***
yr_renovated 9.96793 4.54884 2.191 0.02844 *
zipcode -34.99556 36.48498 -0.959 0.33748
sqft_living15 17.41835 4.23536 4.113 0.000039307331800 ***
sqft_lot15 -0.65627 0.09155 -7.169 0.000000000000789 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 222900 on 16940 degrees of freedom
Multiple R-squared: 0.6612, Adjusted R-squared: 0.6609
F-statistic: 2204 on 15 and 16940 DF, p-value: < 0.00000000000000022
# Predicted prices for the held-out test rows from the first model.
pricePred <- predict(object = regressor, newdata = test_set)
prediction from a rank-deficient fit may be misleading
# Inspect the first 30 predictions of the first model.
head(pricePred, n = 30)
4 12 15 17 19 48 49 51 54 65
474906.6 389701.7 565319.5 344345.8 470651.4 713791.7 349678.9 279401.6 414766.4 343844.2
71 72 75 80 84 88 90 94 95 109
1394889.4 309000.9 414544.9 938408.8 490465.5 759349.3 171798.4 560398.5 560398.5 936329.6
123 128 143 145 147 149 154 156 157 166
148009.7 525819.0 361023.8 584687.7 822615.7 555430.6 1656548.8 257226.2 943530.5 321495.1
# Test-set residuals of the first model: actual price minus predicted price.
priceDiff <- test_set[["price"]] - pricePred
head(priceDiff, n = 30)
4 12 15 17 19 48 49 51 54
129093.39 78298.28 -35319.50 50654.20 -281651.44 71208.29 100321.06 -51401.62 170233.63
65 71 72 75 80 84 88 90 94
81155.82 -354889.36 15999.07 -65544.90 -218408.76 -134465.52 -298349.33 163201.58 -130398.52
95 109 123 128 143 145 147 149 154
139601.48 -256329.55 246990.35 -265818.99 -156023.77 -84687.69 67384.28 -297430.56 593451.23
156 157 166
-42226.19 -293530.53 48504.86
library(ggplot2)
library(gridExtra)
Attaching package: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
# Normality diagnostics for the first model on the test set:
#   g1 / g2 - histogram (with a fitted normal curve) and Q-Q plot of price
#   g3 / g4 - the same two plots for the residuals in priceDiff
g1 <- ggplot(test_set, aes(x = price)) +
  geom_histogram(aes(y = ..density..), color = "white", fill = "purple") +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(test_set$price), sd = sd(test_set$price)),
    col = "black"
  )
g2 <- ggplot(test_set, aes(sample = c(scale(price)))) +
  stat_qq() +
  geom_abline(intercept = 0, slope = 1)
g3 <- ggplot(data.frame(priceDiff), aes(priceDiff)) +
  geom_histogram(aes(y = ..density..), color = "white", fill = "purple") +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(priceDiff), sd = sd(priceDiff)),
    col = "black"
  )
g4 <- ggplot(data.frame(priceDiff), aes(sample = c(scale(priceDiff)))) +
  stat_qq() +
  geom_abline(intercept = 0, slope = 1)
grid.arrange(g1, g2, g3, g4, nrow = 2)

# Base-graphics diagnostic plots of the fitted model, laid out on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(regressor)
# Correlation between actual and predicted test-set prices for the first model.
df1 <- data.frame(actual = test_set$price, predicted = pricePred)
corraccuracy <- cor(df1)
corraccuracy
actual predicted
actual 1.0000000 0.7869606
predicted 0.7869606 1.0000000
# Second, reduced model: the first model's formula without sqft_lot,
# sqft_above, sqft_basement, yr_renovated and zipcode.
regressor1 <- lm(
  price ~ bedrooms + bathrooms + sqft_living + floors + waterfront + view +
    condition + grade + yr_built + sqft_living15 + sqft_lot15,
  data = training_set
)
summary(regressor1)
Call:
lm(formula = price ~ bedrooms + bathrooms + sqft_living + floors +
waterfront + view + condition + grade + yr_built + sqft_living15 +
sqft_lot15, data = training_set)
Residuals:
Min 1Q Median 3Q Max
-1307416 -112001 -10135 92130 4175456
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6350684.88941 151067.08913 42.039 < 0.0000000000000002 ***
bedrooms -42780.75211 2333.01079 -18.337 < 0.0000000000000002 ***
bathrooms 47047.82502 3949.21716 11.913 < 0.0000000000000002 ***
sqft_living 177.98913 4.11935 43.208 < 0.0000000000000002 ***
floors 21280.27611 3996.52367 5.325 0.000000102411 ***
waterfront 624194.65274 21385.63463 29.188 < 0.0000000000000002 ***
view 43514.16897 2563.21234 16.976 < 0.0000000000000002 ***
condition 18004.05223 2872.28743 6.268 0.000000000374 ***
grade 119404.88001 2608.82004 45.770 < 0.0000000000000002 ***
yr_built -3643.75905 77.76533 -46.856 < 0.0000000000000002 ***
sqft_living15 16.87998 4.08846 4.129 0.000036658928 ***
sqft_lot15 -0.63156 0.06415 -9.846 < 0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 223000 on 16944 degrees of freedom
Multiple R-squared: 0.661, Adjusted R-squared: 0.6608
F-statistic: 3004 on 11 and 16944 DF, p-value: < 0.00000000000000022
# Predicted prices for the held-out test rows from the reduced model.
pricePred2 <- predict(regressor1, newdata = test_set)
# Inspect the reduced model's predictions. The previous call printed pricePred
# (the first model's predictions) by mistake, which is why the values recorded
# after it in this transcript are identical to the first model's output.
head(pricePred2, 30)
4 12 15 17 19 48 49 51 54 65
474906.6 389701.7 565319.5 344345.8 470651.4 713791.7 349678.9 279401.6 414766.4 343844.2
71 72 75 80 84 88 90 94 95 109
1394889.4 309000.9 414544.9 938408.8 490465.5 759349.3 171798.4 560398.5 560398.5 936329.6
123 128 143 145 147 149 154 156 157 166
148009.7 525819.0 361023.8 584687.7 822615.7 555430.6 1656548.8 257226.2 943530.5 321495.1
# Test-set residuals of the reduced model: actual price minus predicted price.
priceDiff2 <- test_set[["price"]] - pricePred2
head(priceDiff2, n = 30)
4 12 15 17 19 48 49 51 54
127488.80 77749.79 -41555.36 52360.94 -280902.67 72693.95 96815.40 -54948.45 171557.59
65 71 72 75 80 84 88 90 94
79479.95 -351316.31 15971.18 -62260.27 -218503.39 -135796.97 -296544.58 164500.02 -133079.43
95 109 123 128 143 145 147 149 154
136920.57 -258334.63 245241.44 -263026.16 -152786.99 -81930.95 64457.53 -293488.75 597772.82
156 157 166
-41517.99 -290485.64 50095.64
# Normality diagnostics for the reduced model on the test set:
#   g1 / g2 - histogram (with a fitted normal curve) and Q-Q plot of price
#   g3 / g4 - the same two plots for the residuals in priceDiff2
g1 <- ggplot(test_set, aes(x = price)) +
  geom_histogram(aes(y = ..density..), color = "white", fill = "purple") +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(test_set$price), sd = sd(test_set$price)),
    col = "black"
  )
g2 <- ggplot(test_set, aes(sample = c(scale(price)))) +
  stat_qq() +
  geom_abline(intercept = 0, slope = 1)
g3 <- ggplot(data.frame(priceDiff2), aes(priceDiff2)) +
  geom_histogram(aes(y = ..density..), color = "white", fill = "purple") +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(priceDiff2), sd = sd(priceDiff2)),
    col = "black"
  )
g4 <- ggplot(data.frame(priceDiff2), aes(sample = c(scale(priceDiff2)))) +
  stat_qq() +
  geom_abline(intercept = 0, slope = 1)
grid.arrange(g1, g2, g3, g4, nrow = 2)

# Base-graphics diagnostic plots of the fitted model, laid out on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(regressor1)
# Correlation between actual and predicted test-set prices for the reduced
# model (overwrites the corraccuracy computed for the first model).
df2 <- data.frame(actual = test_set$price, predicted = pricePred2)
corraccuracy <- cor(df2)
corraccuracy
actual predicted
actual 1.0000000 0.7868934
predicted 0.7868934 1.0000000
# Run gvlma on the reduced model. Called through the namespace, so the package
# is not attached. NOTE(review): presumably this is gvlma's global validation
# of linear-model assumptions - confirm against the package documentation.
gvlma::gvlma(regressor1)
Call:
lm(formula = price ~ bedrooms + bathrooms + sqft_living + floors +
waterfront + view + condition + grade + yr_built + sqft_living15 +
sqft_lot15, data = training_set)
Coefficients:
(Intercept) bedrooms bathrooms sqft_living floors waterfront
6350684.8894 -42780.7521 47047.8250 177.9891 21280.2761 624194.6527
view condition grade yr_built sqft_living15 sqft_lot15
43514.1690 18004.0522 119404.8800 -3643.7590 16.8800 -0.6316