House Prices: Advanced Regression Techniques

Rafal Decowski

May 2018

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
library(knitr)
## Warning: package 'knitr' was built under R version 3.4.4
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.4
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.4.4
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.3
## corrplot 0.84 loaded
library(psych)
## Warning: package 'psych' was built under R version 3.4.4
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
########################################################
# Loading data and simple transformations
########################################################
train <- read.csv('train.csv', stringsAsFactors=FALSE)
test <-  read.csv('test.csv', stringsAsFactors=FALSE)


# Variable choice 
# Above grade (ground) living area square feet
X <- train$GrLivArea
Y <- train$SalePrice
  
  
# Skewness 
ggplot(train, aes(x = GrLivArea)) + 
  stat_density(aes(y = ..density..), fill = 'black', alpha = 0.3) +
  geom_histogram(aes(y = ..density..), fill = 'blue', alpha = 0.5) +
  geom_density() + 
  scale_y_continuous(labels = scales::percent) +
  labs(x = 'Ground Living Area', y = 'Density') +
  theme(panel.background = element_rect(fill = "white", colour = "gray"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Probability

# Find Quantiles values and counts
p_df <- train[,c('GrLivArea', 'SalePrice')]

quartiles_X <- summary(X)
quartiles_Y <- summary(Y)

x <- as.vector(quartiles_X)[2]
y <- as.vector(quartiles_Y)[2]

a. P(X>x | Y>y)

What is the probability of X being greater than 1129.5 (x) given that Y is greater than 129975 (y)

\[ P(A | B) = P(A \& B) / P(B) \]

p_df$x1[p_df$GrLivArea <= x ] <- 0
p_df$x1[p_df$GrLivArea >  x ] <- 1

p_df$y1[p_df$SalePrice <= y ] <- 0
p_df$y1[p_df$SalePrice >  y ] <- 1

conf_mx <- table(p_df[,c(3:4)])
prob_conf_mx <- round(prop.table(conf_mx),2)
final_a <- prob_conf_mx[2,2] / (prob_conf_mx[1,2] + prob_conf_mx[2,2]) 
final_a
## [1] 0.8666667

b. P(X>x, Y>y)

final_b <- prob_conf_mx[2,2]
final_b
## [1] 0.65
  1. P(Xy)

\[ P(X<x | Y>y) = P(X<x \& Y>y) / P(Y>y) \]

final_c <- (prob_conf_mx[1,1] + prob_conf_mx[1,2]) / (prob_conf_mx[1,2] + prob_conf_mx[2,2]) 
final_c 
## [1] 0.3333333
A <- sum(p_df$x1 == 1)
B <- sum(p_df$y1 == 1)


PAxPB  <- 0.75 * 0.75

# Chi Square test for association
chisq.test(conf_mx)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  conf_mx
## X-squared = 340.75, df = 1, p-value < 2.2e-16

2nd and 3rd quartiles

p_df$x2[p_df$GrLivArea < as.vector(quartiles_X)[3] ] <- 0
p_df$x2[p_df$GrLivArea > as.vector(quartiles_X)[3] ] <- 1
p_df$x3[p_df$GrLivArea < as.vector(quartiles_X)[4] ] <- 0
p_df$x3[p_df$GrLivArea > as.vector(quartiles_X)[4] ] <- 1

p_df$y2[p_df$SalePrice < as.vector(quartiles_Y)[3] ] <- 0
p_df$y2[p_df$SalePrice > as.vector(quartiles_Y)[3] ] <- 1
p_df$y3[p_df$SalePrice < as.vector(quartiles_Y)[4] ] <- 0
p_df$y3[p_df$SalePrice > as.vector(quartiles_Y)[4] ] <- 1

Descriptive and Inferential Statistics

# Summary Stats for GrLivArea
summary(X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1130    1464    1515    1777    5642
# Scatter for GrLivArea & SalePrice
ggplot(p_df, aes(x=GrLivArea, y=SalePrice)) +
    geom_point() +
    scale_y_continuous(labels = scales::comma) +
    theme_minimal() +
    geom_smooth(method='lm') +
    labs(x = 'Ground Living Area', y = 'Sale Price')

#Correlation

vars <- c('X1stFlrSF','LotArea', 'SalePrice')

selected_vars <- train[,vars]
selected_vars <- as.data.frame(sapply(selected_vars, as.numeric ))


cormat <- cor(train[,vars], use="complete")

res <- cor.mtest(selected_vars, conf.level = 0.92)

corrplot(cormat, type = "lower", order = "hclust", tl.col = "black", tl.srt = 45)

In statistics, family-wise error rate (FWER) is the probability of making one or more false discoveries, or type I errors when performing multiple hypotheses tests.

A type I error occurs when the null hypothesis (H0) is true, but is rejected. It is asserting something that is absent, a false hit. A type I error may be likened to a so-called false positive (a result that indicates that a given condition is present when it actually is not present).

Linear Algebra and Correlation

# Correlation matrix inverse
# This is known as the precision matrix and contains variance inflation factors on the diagonal.

det(cormat) # When != 0, inverse of matrix exists
## [1] 0.5693869
precision_matrix <- solve(cormat)

cor_by_prec <- round(cormat %*% precision_matrix,2)
prec_by_cor <- round(precision_matrix %*% cormat,2)


# LU Decomposition
library(matrixcalc)
lu <- lu.decomposition(cormat)



knitr::kable(lu ,  caption = "LU Decomposition")
LU Decomposition
1.0000000 0.0000000 0
0.2994746 1.0000000 0
0.6058522 0.0905247 1
1 0.2994746 0.6058522
0 0.9103150 0.0824060
0 0.0000000 0.6254833

Calculus-Based Probability & Statistics

# Fit an exponential probability density function.
mlf <- fitdistr(train$GrLivArea, "exponential")

# Simulated values using the most optimal lambda value for the exponential distribution
simdata = rexp(length(train$GrLivArea),rate=mlf$estimate[[1]])

# Push into one dataframe for plotting
simulation_df <- data.frame(Original=train$GrLivArea, Simulated=simdata)
simulation_df_plot <- melt(data.frame(Original=train$GrLivArea, Simulated=simdata))
## No id variables; using all as measure variables
# Histogram of simulated explonential vs original distribution
ggplot(simulation_df_plot, aes(x=value, fill=variable)) + 
  geom_histogram(alpha=.7, position="identity") +
  theme(panel.background = element_rect(fill = "white", colour = "gray"), legend.position = c(0.8, 0.8)) +
  labs(x = 'Ground Living Area', y = 'Frequency') +
  scale_x_continuous(limits = c(0, 7000)) +
  ggtitle("Simulated vs Original Distribution")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 12 rows containing non-finite values (stat_bin).

# Calcualte 5% and 95% values 
sim_q <- round(quantile(simulation_df$Original, c(0.05, 0.95)))
# GGPLOT supported ECDF
ggplot (NULL, aes(x=simulation_df$Original)) + 
  geom_step(stat="ecdf") +
  geom_vline(aes(xintercept=sim_q),linetype = "dashed") +
  scale_x_continuous(breaks = sim_q,labels = sim_q) +
  theme(panel.background = element_rect(fill = "white", colour = "gray"),plot.title = element_text(hjust = 0.5)) +
  labs(x = 'Ground Living Area', y = 'Percentile') +
  ggtitle("Cumulative Distribution Function") +
  scale_x_continuous(limits = c(0, 3500)) +
  annotate("text", x = 500, y = 0.75, label = paste("5th Percentile \n Value: ", sim_q[[1]])) +
  annotate("text", x = 2800, y = 0.25, label = paste("95th Percentile \n Value: ", sim_q[[2]]))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Warning: Removed 6 rows containing non-finite values (stat_ecdf).

# Confidence intervals
a <- mean(simulation_df$Simulated)
s <- sd(simulation_df$Simulated)
n <- length(simulation_df$Simulated)
error <- qnorm(0.975)*s/sqrt(n)
lower_ci <- a-error
upper_ci <- a+error
round(lower_ci)
## [1] 1413
round(upper_ci)
## [1] 1564
# Calcualte 5% and 95% values 
sim_q <- round(quantile(simulation_df$Simulated, c(0.05, 0.95)))
# GGPLOT supported ECDF
ggplot (NULL, aes(x=simulation_df$Simulated)) + 
  geom_step(stat="ecdf") +
  geom_vline(aes(xintercept=sim_q),linetype = "dashed") +
  scale_x_continuous(breaks = sim_q,labels = sim_q) +
  theme(panel.background = element_rect(fill = "white", colour = "gray"),plot.title = element_text(hjust = 0.5)) +
  labs(x = 'Ground Living Area', y = 'Percentile') +
  ggtitle("Empirical Cumulative Distribution Function") +
  scale_x_continuous(limits = c(0, 5000)) +
  annotate("text", x = 550, y = 0.75, label = paste("5th Percentile \n Value: ", sim_q[[1]])) +
  annotate("text", x = 4000, y = 0.25, label = paste("95th Percentile \n Value: ", sim_q[[2]]))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Warning: Removed 45 rows containing non-finite values (stat_ecdf).

Modeling

library(mice)
## Warning: package 'mice' was built under R version 3.4.4
## Loading required package: lattice
## 
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
## 
##     complete
# Populate missing
train2 <- mice(train, printFlag = FALSE)
train2 <- mice::complete(train2)


########################################################
# Size Model
########################################################

size_model <- glm(SalePrice ~  LotArea + BsmtFinSF1 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + X1stFlrSF + GrLivArea +
                                        GarageArea + WoodDeckSF + OpenPorchSF + EnclosedPorch + ScreenPorch + PoolArea, 
                                        data = train2)



########################################################
# Age/time Model
########################################################
age_model <- glm(SalePrice ~ YearBuilt + YearRemodAdd + GarageYrBlt + MoSold + YrSold,
                                      data = train2)



########################################################
# Quality & Condition Model
########################################################
# Variables listed
qual_cond_features <- c("SalePrice","OverallQual", "LowQualFinSF", "Condition1", "Condition2","ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "BsmtFinType1", "BsmtFinType2", "HeatingQC", "KitchenQual", "Functional", "GarageFinish", "GarageQual", "GarageCond", "SaleCondition")

# Only full cases selected
qual_cond_df <- train2[,qual_cond_features]
qual_cond_df <- qual_cond_df[complete.cases(qual_cond_df),]
 
# Convert all char class columns to factors
qual_cond_df <- qual_cond_df %>% mutate_if(is.character, as.factor)
## Warning: package 'bindrcpp' was built under R version 3.4.4
qual_cond_model <- glm(SalePrice ~ .,data = qual_cond_df)




########################################################
# Features Model
########################################################

# Variables listed
property_features <- c("SalePrice", "Utilities", "Exterior1st", "Heating", "CentralAir", "Electrical", "Fireplaces", "GarageCars", "PavedDrive", "MiscVal", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "BsmtFinType1")

# Only full cases selected
property_features_df <- train2[,property_features]
describe(property_features_df)
## Warning in describe(property_features_df): NAs introduced by coercion
## Warning in describe(property_features_df): NAs introduced by coercion

## Warning in describe(property_features_df): NAs introduced by coercion

## Warning in describe(property_features_df): NAs introduced by coercion

## Warning in describe(property_features_df): NAs introduced by coercion

## Warning in describe(property_features_df): NAs introduced by coercion

## Warning in describe(property_features_df): NAs introduced by coercion

## Warning in describe(property_features_df): NAs introduced by coercion

## Warning in describe(property_features_df): NAs introduced by coercion

## Warning in describe(property_features_df): NAs introduced by coercion

## Warning in describe(property_features_df): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
##                vars    n      mean       sd median   trimmed      mad
## SalePrice         1 1460 180921.20 79442.50 163000 170783.29 56338.80
## Utilities*        2 1460       NaN       NA     NA       NaN       NA
## Exterior1st*      3 1460       NaN       NA     NA       NaN       NA
## Heating*          4 1460       NaN       NA     NA       NaN       NA
## CentralAir*       5 1460       NaN       NA     NA       NaN       NA
## Electrical*       6 1459       NaN       NA     NA       NaN       NA
## Fireplaces        7 1460      0.61     0.64      1      0.53     1.48
## GarageCars        8 1460      1.77     0.75      2      1.77     0.00
## PavedDrive*       9 1460       NaN       NA     NA       NaN       NA
## MiscVal          10 1460     43.49   496.12      0      0.00     0.00
## RoofMatl*        11 1460       NaN       NA     NA       NaN       NA
## Exterior1st.1*   12 1460       NaN       NA     NA       NaN       NA
## Exterior2nd*     13 1460       NaN       NA     NA       NaN       NA
## MasVnrType*      14 1452       NaN       NA     NA       NaN       NA
## BsmtFinType1*    15 1423       NaN       NA     NA       NaN       NA
##                  min    max  range  skew kurtosis      se
## SalePrice      34900 755000 720100  1.88     6.50 2079.11
## Utilities*       Inf   -Inf   -Inf    NA       NA      NA
## Exterior1st*     Inf   -Inf   -Inf    NA       NA      NA
## Heating*         Inf   -Inf   -Inf    NA       NA      NA
## CentralAir*      Inf   -Inf   -Inf    NA       NA      NA
## Electrical*      Inf   -Inf   -Inf    NA       NA      NA
## Fireplaces         0      3      3  0.65    -0.22    0.02
## GarageCars         0      4      4 -0.34     0.21    0.02
## PavedDrive*      Inf   -Inf   -Inf    NA       NA      NA
## MiscVal            0  15500  15500 24.43   697.64   12.98
## RoofMatl*        Inf   -Inf   -Inf    NA       NA      NA
## Exterior1st.1*   Inf   -Inf   -Inf    NA       NA      NA
## Exterior2nd*     Inf   -Inf   -Inf    NA       NA      NA
## MasVnrType*      Inf   -Inf   -Inf    NA       NA      NA
## BsmtFinType1*    Inf   -Inf   -Inf    NA       NA      NA
property_features_df <- property_features_df[complete.cases(property_features_df),]

# Convert all char class columns to factors
property_features_df <- property_features_df %>% mutate_if(is.character, as.factor)

features_model <- glm(SalePrice ~ ., data = property_features_df)


########################################################
# Rooms Model
########################################################
room_vars <- c("SalePrice","BsmtFullBath", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "TotRmsAbvGrd")

# Only full cases selected
rooms <- train2[,room_vars]
rooms <- rooms[complete.cases(rooms),]

# Convert all char class columns to factors
rooms <- rooms %>% mutate_if(is.character, as.factor)

rooms_model<- glm(SalePrice ~  ., data = rooms)



########################################################
# Layout Model
########################################################

# List all Relevant Variables
layout_vars <- c("SalePrice", "LotShape", "LotConfig", "LandSlope", "BldgType", "HouseStyle", "RoofStyle", "Foundation", "BsmtExposure", "GarageType")

# Only full cases selected
layout_df <- train2[,layout_vars]
layout_df <- layout_df[complete.cases(layout_df),]

# Convert all char class columns to factors
layout_df <- layout_df %>% mutate_if(is.character, as.factor)

# Random Forest Model
layout_model<- glm(SalePrice ~  LotShape + ., data = layout_df)





model_names <- c('size_model', 'age_model','qual_cond_model','features_model', 'rooms_model', 'layout_model')

Model Evaluation

# Model Summary and selected features 
summary(size_model) 
## 
## Call:
## glm(formula = SalePrice ~ LotArea + BsmtFinSF1 + BsmtUnfSF + 
##     TotalBsmtSF + X1stFlrSF + X1stFlrSF + GrLivArea + GarageArea + 
##     WoodDeckSF + OpenPorchSF + EnclosedPorch + ScreenPorch + 
##     PoolArea, data = train2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -641199   -18703     -175    18318   298631  
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.723e+04  4.247e+03  -4.057 5.24e-05 ***
## LotArea        1.321e-01  1.259e-01   1.050 0.294056    
## BsmtFinSF1     2.740e+01  7.712e+00   3.553 0.000393 ***
## BsmtUnfSF      1.346e+01  7.527e+00   1.789 0.073850 .  
## TotalBsmtSF    3.056e+01  8.681e+00   3.520 0.000444 ***
## X1stFlrSF     -8.081e+00  5.802e+00  -1.393 0.163936    
## GrLivArea      6.842e+01  3.012e+00  22.715  < 2e-16 ***
## GarageArea     9.339e+01  6.748e+00  13.839  < 2e-16 ***
## WoodDeckSF     5.621e+01  1.006e+01   5.586 2.78e-08 ***
## OpenPorchSF    4.044e+01  1.912e+01   2.114 0.034650 *  
## EnclosedPorch -6.811e+01  1.981e+01  -3.438 0.000603 ***
## ScreenPorch    4.885e+01  2.155e+01   2.267 0.023546 *  
## PoolArea      -8.762e+01  2.997e+01  -2.923 0.003516 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 2009198727)
## 
##     Null deviance: 9.2079e+12  on 1459  degrees of freedom
## Residual deviance: 2.9073e+12  on 1447  degrees of freedom
## AIC: 35433
## 
## Number of Fisher Scoring iterations: 2
summary(age_model)
## 
## Call:
## glm(formula = SalePrice ~ YearBuilt + YearRemodAdd + GarageYrBlt + 
##     MoSold + YrSold, data = train2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -145207   -40362   -10765    22461   544681  
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -473739.56 2589277.90  -0.183 0.854853    
## YearBuilt        649.48      98.25   6.611 5.36e-11 ***
## YearRemodAdd    1072.66     105.83  10.136  < 2e-16 ***
## GarageYrBlt      418.50     117.19   3.571 0.000367 ***
## MoSold           923.50     632.42   1.460 0.144430    
## YrSold         -1786.93    1289.21  -1.386 0.165939    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 4171492107)
## 
##     Null deviance: 9.2079e+12  on 1459  degrees of freedom
## Residual deviance: 6.0653e+12  on 1454  degrees of freedom
## AIC: 36493
## 
## Number of Fisher Scoring iterations: 2
summary(qual_cond_model)
## 
## Call:
## glm(formula = SalePrice ~ ., data = qual_cond_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -266556   -22511    -2792    17179   362633  
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.627e+04  6.654e+04   0.545 0.585782    
## OverallQual           2.935e+04  1.519e+03  19.330  < 2e-16 ***
## LowQualFinSF          4.968e+00  3.241e+01   0.153 0.878186    
## Condition1Feedr       1.220e+04  9.127e+03   1.336 0.181677    
## Condition1Norm        1.597e+04  7.340e+03   2.176 0.029753 *  
## Condition1PosA        3.758e+04  1.705e+04   2.204 0.027725 *  
## Condition1PosN        3.686e+04  1.270e+04   2.902 0.003776 ** 
## Condition1RRAe        6.782e+03  1.616e+04   0.420 0.674838    
## Condition1RRAn        1.553e+04  1.174e+04   1.322 0.186371    
## Condition1RRNe        9.568e+01  3.126e+04   0.003 0.997558    
## Condition1RRNn       -5.480e+01  2.146e+04  -0.003 0.997963    
## Condition2Feedr      -3.744e+04  3.878e+04  -0.965 0.334505    
## Condition2Norm       -1.753e+04  3.243e+04  -0.541 0.588877    
## Condition2PosA        5.513e+04  6.904e+04   0.799 0.424669    
## Condition2PosN       -1.839e+05  4.613e+04  -3.986 7.09e-05 ***
## Condition2RRAe        1.828e+04  5.434e+04   0.336 0.736665    
## Condition2RRAn        1.188e+04  5.466e+04   0.217 0.828039    
## Condition2RRNn       -2.495e+04  4.551e+04  -0.548 0.583636    
## ExterQualFa          -4.998e+04  1.948e+04  -2.566 0.010403 *  
## ExterQualGd          -2.878e+04  8.174e+03  -3.521 0.000446 ***
## ExterQualTA          -3.438e+04  9.068e+03  -3.792 0.000157 ***
## ExterCondFa           4.009e+04  4.466e+04   0.898 0.369543    
## ExterCondGd           5.125e+04  4.319e+04   1.187 0.235551    
## ExterCondTA           4.448e+04  4.307e+04   1.033 0.301942    
## BsmtQualFa           -3.986e+04  1.073e+04  -3.715 0.000212 ***
## BsmtQualGd           -4.148e+04  5.480e+03  -7.570 7.12e-14 ***
## BsmtQualTA           -4.694e+04  6.499e+03  -7.224 8.65e-13 ***
## BsmtCondGd           -8.326e+02  9.535e+03  -0.087 0.930433    
## BsmtCondPo            1.610e+04  5.913e+04   0.272 0.785499    
## BsmtCondTA            4.725e+03  7.758e+03   0.609 0.542608    
## BsmtFinType1BLQ      -7.978e+01  4.803e+03  -0.017 0.986749    
## BsmtFinType1GLQ       2.465e+02  4.269e+03   0.058 0.953974    
## BsmtFinType1LwQ      -6.240e+03  6.410e+03  -0.973 0.330490    
## BsmtFinType1Rec       1.757e+03  5.034e+03   0.349 0.727165    
## BsmtFinType1Unf      -1.807e+04  4.100e+03  -4.407 1.14e-05 ***
## BsmtFinType2BLQ      -2.809e+04  1.278e+04  -2.199 0.028043 *  
## BsmtFinType2GLQ      -2.625e+04  1.628e+04  -1.613 0.107072    
## BsmtFinType2LwQ      -2.129e+04  1.221e+04  -1.744 0.081368 .  
## BsmtFinType2Rec      -1.037e+04  1.200e+04  -0.864 0.387621    
## BsmtFinType2Unf      -2.255e+04  1.044e+04  -2.160 0.030929 *  
## HeatingQCFa          -4.310e+03  8.067e+03  -0.534 0.593226    
## HeatingQCGd          -2.863e+03  3.658e+03  -0.783 0.433929    
## HeatingQCPo          -3.738e+04  4.526e+04  -0.826 0.409039    
## HeatingQCTA          -4.004e+03  3.498e+03  -1.144 0.252655    
## KitchenQualFa        -5.049e+04  1.157e+04  -4.363 1.39e-05 ***
## KitchenQualGd        -3.252e+04  6.080e+03  -5.349 1.05e-07 ***
## KitchenQualTA        -4.389e+04  6.803e+03  -6.452 1.56e-10 ***
## FunctionalMaj2       -2.548e+04  2.880e+04  -0.885 0.376460    
## FunctionalMin1        2.098e+04  1.553e+04   1.351 0.177059    
## FunctionalMin2        3.004e+04  1.551e+04   1.937 0.052925 .  
## FunctionalMod         4.945e+04  1.862e+04   2.655 0.008029 ** 
## FunctionalSev        -2.184e+04  4.604e+04  -0.474 0.635319    
## FunctionalTyp         1.783e+04  1.327e+04   1.344 0.179119    
## GarageFinishRFn      -7.335e+03  3.255e+03  -2.254 0.024383 *  
## GarageFinishUnf      -1.376e+04  3.694e+03  -3.725 0.000204 ***
## GarageQualFa         -1.777e+05  4.829e+04  -3.680 0.000243 ***
## GarageQualGd         -1.620e+05  4.969e+04  -3.260 0.001144 ** 
## GarageQualPo         -1.891e+05  6.182e+04  -3.060 0.002261 ** 
## GarageQualTA         -1.695e+05  4.767e+04  -3.556 0.000390 ***
## GarageCondFa          2.085e+05  5.725e+04   3.642 0.000282 ***
## GarageCondGd          2.074e+05  5.939e+04   3.492 0.000496 ***
## GarageCondPo          2.193e+05  6.157e+04   3.562 0.000381 ***
## GarageCondTA          2.077e+05  5.664e+04   3.667 0.000256 ***
## SaleConditionAdjLand  5.161e+03  4.363e+04   0.118 0.905861    
## SaleConditionAlloca   4.052e+04  1.651e+04   2.455 0.014219 *  
## SaleConditionFamily   3.590e+03  1.076e+04   0.334 0.738762    
## SaleConditionNormal   8.496e+03  4.938e+03   1.721 0.085568 .  
## SaleConditionPartial  1.918e+04  6.622e+03   2.897 0.003837 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1817551053)
## 
##     Null deviance: 8.4079e+12  on 1347  degrees of freedom
## Residual deviance: 2.3265e+12  on 1280  degrees of freedom
## AIC: 32634
## 
## Number of Fisher Scoring iterations: 2
summary(features_model)
## 
## Call:
## glm(formula = SalePrice ~ ., data = property_features_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -158177   -28644    -2984    22023   380888  
## 
## Coefficients: (14 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.525e+05  5.608e+04  -2.720 0.006621 ** 
## UtilitiesNoSeWa      -8.104e+04  5.071e+04  -1.598 0.110257    
## Exterior1stBrkComm   -2.073e+04  6.098e+04  -0.340 0.733929    
## Exterior1stBrkFace    4.851e+04  2.597e+04   1.868 0.061952 .  
## Exterior1stCBlock     1.230e+04  5.204e+04   0.236 0.813246    
## Exterior1stCemntBd    9.345e+04  3.833e+04   2.438 0.014903 *  
## Exterior1stHdBoard    3.375e+03  2.591e+04   0.130 0.896366    
## Exterior1stImStucc   -4.392e+04  5.834e+04  -0.753 0.451660    
## Exterior1stMetalSd    2.552e+04  2.942e+04   0.868 0.385729    
## Exterior1stPlywood   -6.151e+03  2.567e+04  -0.240 0.810693    
## Exterior1stStone      8.760e+04  5.098e+04   1.718 0.085969 .  
## Exterior1stStucco     6.057e+04  2.880e+04   2.103 0.035615 *  
## Exterior1stVinylSd    1.032e+04  2.689e+04   0.384 0.701345    
## Exterior1stWd Sdng    2.342e+04  2.503e+04   0.936 0.349583    
## Exterior1stWdShing    3.008e+03  2.716e+04   0.111 0.911829    
## HeatingGasW           1.514e+04  1.361e+04   1.112 0.266235    
## HeatingGrav          -4.044e+03  2.023e+04  -0.200 0.841553    
## HeatingOthW           2.353e+04  3.708e+04   0.635 0.525804    
## CentralAirY           1.025e+04  7.272e+03   1.410 0.158764    
## ElectricalFuseF      -5.666e+03  1.277e+04  -0.444 0.657279    
## ElectricalFuseP      -4.323e+03  3.675e+04  -0.118 0.906394    
## ElectricalMix        -3.730e+04  5.086e+04  -0.733 0.463502    
## ElectricalSBrkr       5.416e+03  6.085e+03   0.890 0.373585    
## Fireplaces            3.477e+04  2.315e+03  15.020  < 2e-16 ***
## GarageCars            4.048e+04  2.260e+03  17.909  < 2e-16 ***
## PavedDriveP          -1.209e+04  1.118e+04  -1.081 0.279737    
## PavedDriveY          -5.665e+02  6.529e+03  -0.087 0.930868    
## MiscVal               1.785e+00  2.721e+00   0.656 0.511837    
## RoofMatlCompShg       1.748e+05  5.198e+04   3.362 0.000794 ***
## RoofMatlMembran       2.136e+05  7.267e+04   2.940 0.003340 ** 
## RoofMatlMetal         1.565e+05  7.240e+04   2.162 0.030815 *  
## RoofMatlRoll          1.451e+05  7.380e+04   1.966 0.049539 *  
## RoofMatlTar&Grv       1.969e+05  5.442e+04   3.617 0.000308 ***
## RoofMatlWdShake       2.126e+05  5.744e+04   3.701 0.000223 ***
## RoofMatlWdShngl       3.379e+05  5.596e+04   6.038 2.01e-09 ***
## Exterior1st.1BrkComm         NA         NA      NA       NA    
## Exterior1st.1BrkFace         NA         NA      NA       NA    
## Exterior1st.1CBlock          NA         NA      NA       NA    
## Exterior1st.1CemntBd         NA         NA      NA       NA    
## Exterior1st.1HdBoard         NA         NA      NA       NA    
## Exterior1st.1ImStucc         NA         NA      NA       NA    
## Exterior1st.1MetalSd         NA         NA      NA       NA    
## Exterior1st.1Plywood         NA         NA      NA       NA    
## Exterior1st.1Stone           NA         NA      NA       NA    
## Exterior1st.1Stucco          NA         NA      NA       NA    
## Exterior1st.1VinylSd         NA         NA      NA       NA    
## Exterior1st.1Wd Sdng         NA         NA      NA       NA    
## Exterior1st.1WdShing         NA         NA      NA       NA    
## Exterior2ndAsphShn   -1.230e+02  4.391e+04  -0.003 0.997766    
## Exterior2ndBrk Cmn   -1.280e+04  3.377e+04  -0.379 0.704744    
## Exterior2ndBrkFace   -1.072e+04  2.702e+04  -0.397 0.691788    
## Exterior2ndCBlock            NA         NA      NA       NA    
## Exterior2ndCmentBd   -5.013e+04  3.807e+04  -1.317 0.188097    
## Exterior2ndHdBoard    3.704e+03  2.526e+04   0.147 0.883451    
## Exterior2ndImStucc    6.091e+04  2.912e+04   2.091 0.036680 *  
## Exterior2ndMetalSd   -1.532e+04  2.901e+04  -0.528 0.597432    
## Exterior2ndOther      5.133e+04  5.663e+04   0.906 0.364880    
## Exterior2ndPlywood    1.696e+03  2.456e+04   0.069 0.944942    
## Exterior2ndStone     -3.563e+04  4.841e+04  -0.736 0.461909    
## Exterior2ndStucco    -3.711e+04  2.765e+04  -1.342 0.179794    
## Exterior2ndVinylSd    1.647e+04  2.607e+04   0.632 0.527674    
## Exterior2ndWd Sdng   -1.237e+04  2.432e+04  -0.509 0.611065    
## Exterior2ndWd Shng   -1.234e+04  2.546e+04  -0.485 0.627854    
## MasVnrTypeBrkFace     2.721e+04  1.356e+04   2.006 0.045083 *  
## MasVnrTypeNone        1.486e+04  1.344e+04   1.106 0.268998    
## MasVnrTypeStone       5.211e+04  1.427e+04   3.652 0.000270 ***
## BsmtFinType1BLQ       1.627e+01  5.510e+03   0.003 0.997644    
## BsmtFinType1GLQ       3.658e+04  4.592e+03   7.968 3.40e-15 ***
## BsmtFinType1LwQ       4.382e+03  7.020e+03   0.624 0.532608    
## BsmtFinType1Rec       1.700e+03  5.870e+03   0.290 0.772140    
## BsmtFinType1Unf       6.242e+03  4.555e+03   1.370 0.170803    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 2512706806)
## 
##     Null deviance: 8.8768e+12  on 1413  degrees of freedom
## Residual deviance: 3.4097e+12  on 1357  degrees of freedom
## AIC: 34676
## 
## Number of Fisher Scoring iterations: 2
summary(rooms_model)
## 
## Call:
## glm(formula = SalePrice ~ ., data = rooms)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -261395   -31166    -4507    25200   412948  
## 
## Coefficients: (2 not defined because of singularities)
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       68748       8013   8.580  < 2e-16 ***
## BsmtFullBath      36694       2652  13.836  < 2e-16 ***
## BsmtFullBath.1       NA         NA      NA       NA    
## BsmtHalfBath      19467       5710   3.409 0.000669 ***
## FullBath          57230       2943  19.444  < 2e-16 ***
## HalfBath          15061       2892   5.207 2.19e-07 ***
## BedroomAbvGr     -28250       2268 -12.459  < 2e-16 ***
## KitchenAbvGr     -91921       6408 -14.345  < 2e-16 ***
## TotRmsAbvGrd      27201       1334  20.388  < 2e-16 ***
## TotRmsAbvGrd.1       NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 2625806246)
## 
##     Null deviance: 9.2079e+12  on 1459  degrees of freedom
## Residual deviance: 3.8127e+12  on 1452  degrees of freedom
## AIC: 35819
## 
## Number of Fisher Scoring iterations: 2
summary(layout_model)
## 
## Call:
## glm(formula = SalePrice ~ LotShape + ., data = layout_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -192099   -32473    -5033    23747   441363  
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         181639      32011   5.674 1.71e-08 ***
## LotShapeIR2           5359       9666   0.554 0.579394    
## LotShapeIR3         -38349      18342  -2.091 0.036738 *  
## LotShapeReg         -14127       3628  -3.894 0.000104 ***
## LotConfigCulDSac     11328       7382   1.535 0.125139    
## LotConfigFR2        -22983       9530  -2.412 0.016021 *  
## LotConfigFR3         -1874      28778  -0.065 0.948096    
## LotConfigInside      -3712       4147  -0.895 0.370844    
## LandSlopeMod         -3328       8067  -0.413 0.680033    
## LandSlopeSev        -12293      18542  -0.663 0.507473    
## BldgType2fmCon      -31210      12589  -2.479 0.013296 *  
## BldgTypeDuplex        5875      11726   0.501 0.616412    
## BldgTypeTwnhs       -29711       9799  -3.032 0.002476 ** 
## BldgTypeTwnhsE      -24849       5827  -4.264 2.15e-05 ***
## HouseStyle1.5Unf    -25217      17834  -1.414 0.157613    
## HouseStyle1Story    -13813       5984  -2.308 0.021128 *  
## HouseStyle2.5Fin     88317      23957   3.687 0.000237 ***
## HouseStyle2.5Unf     17768      18818   0.944 0.345244    
## HouseStyle2Story     11716       6224   1.883 0.059979 .  
## HouseStyleSFoyer    -59267      12633  -4.691 3.00e-06 ***
## HouseStyleSLvl      -48709       9542  -5.104 3.81e-07 ***
## RoofStyleGable       12867      18947   0.679 0.497186    
## RoofStyleGambrel     23692      26297   0.901 0.367798    
## RoofStyleHip         54380      19104   2.847 0.004488 ** 
## RoofStyleMansard     51565      30187   1.708 0.087835 .  
## RoofStyleShed        10295      44498   0.231 0.817065    
## FoundationCBlock     -9667       6399  -1.511 0.131104    
## FoundationPConc      41887       6750   6.206 7.29e-10 ***
## FoundationStone       3757      24195   0.155 0.876622    
## FoundationWood        2861      33897   0.084 0.932745    
## BsmtExposureGd       47109       6650   7.084 2.28e-12 ***
## BsmtExposureMn      -16920       6955  -2.433 0.015123 *  
## BsmtExposureNo      -33664       4824  -6.978 4.75e-12 ***
## GarageTypeAttchd     18418      23824   0.773 0.439608    
## GarageTypeBasment   -10935      27305  -0.400 0.688878    
## GarageTypeBuiltIn    43830      24691   1.775 0.076104 .  
## GarageTypeCarPort   -34824      31756  -1.097 0.273009    
## GarageTypeDetchd    -17649      23810  -0.741 0.458675    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 3194693609)
## 
##     Null deviance: 8.4173e+12  on 1347  degrees of freedom
## Residual deviance: 4.1850e+12  on 1310  degrees of freedom
## AIC: 33366
## 
## Number of Fisher Scoring iterations: 2
predict1 <- predict(size_model, type = 'response')
predict2 <- predict(age_model, type = 'response')
predict3 <- predict(qual_cond_model, type = 'response')
predict4 <- predict(features_model, type = 'response')
predict5 <- predict(rooms_model, type = 'response')
predict6 <- predict(layout_model, type = 'response')


actuals1 <- data.frame(cbind(actuals=train2$SalePrice, predicteds=predict1))
actuals2 <- data.frame(cbind(actuals=train2$SalePrice, predicteds=predict2))
actuals3 <- data.frame(cbind(actuals=train2$SalePrice, predicteds=predict3))
## Warning in cbind(actuals = train2$SalePrice, predicteds = predict3): number
## of rows of result is not a multiple of vector length (arg 2)
actuals4 <- data.frame(cbind(actuals=train2$SalePrice, predicteds=predict4))
## Warning in cbind(actuals = train2$SalePrice, predicteds = predict4): number
## of rows of result is not a multiple of vector length (arg 2)
actuals5 <- data.frame(cbind(actuals=train2$SalePrice, predicteds=predict5))
actuals6 <- data.frame(cbind(actuals=train2$SalePrice, predicteds=predict6))
## Warning in cbind(actuals = train2$SalePrice, predicteds = predict6): number
## of rows of result is not a multiple of vector length (arg 2)
min_max_accuracy1 <- mean(apply(actuals1, 1, min) / apply(actuals1, 1, max)) 
min_max_accuracy2 <- mean(apply(actuals2, 1, min) / apply(actuals2, 1, max)) 
min_max_accuracy3 <- mean(apply(actuals3, 1, min) / apply(actuals3, 1, max)) 
min_max_accuracy4 <- mean(apply(actuals4, 1, min) / apply(actuals4, 1, max)) 
min_max_accuracy5 <- mean(apply(actuals5, 1, min) / apply(actuals5, 1, max)) 
min_max_accuracy6 <- mean(apply(actuals6, 1, min) / apply(actuals6, 1, max)) 


mape1 <- mean(abs((actuals1$predicteds - actuals1$actuals))/actuals1$actuals)
mape2 <- mean(abs((actuals2$predicteds - actuals2$actuals))/actuals2$actuals)
mape3 <- mean(abs((actuals3$predicteds - actuals3$actuals))/actuals3$actuals)
mape4 <- mean(abs((actuals4$predicteds - actuals4$actuals))/actuals4$actuals)
mape5 <- mean(abs((actuals5$predicteds - actuals5$actuals))/actuals5$actuals)
mape6 <- mean(abs((actuals6$predicteds - actuals6$actuals))/actuals6$actuals)


mlr <- data.frame(row.names = c('MinMax', 'MAPE'), 
                  size_model=c(min_max_accuracy1, mape1), 
                  age_model=c(min_max_accuracy2, mape2), 
                  qual_cond_model=c(min_max_accuracy3, mape3), 
                  features_model=c(min_max_accuracy4, mape4), 
                  rooms_model=c(min_max_accuracy5, mape5), 
                  layout_model=c(min_max_accuracy6, mape6))


mlr
##        size_model age_model qual_cond_model features_model rooms_model
## MinMax  0.8654670 0.7979043       0.6966195      0.6867300   0.8250866
## MAPE    0.1652379 0.2583608       0.4749015      0.4663379   0.2163855
##        layout_model
## MinMax    0.6988340
## MAPE      0.4608254

Predictions

final_prediction <- predict(size_model, test, type = 'response')

test$SalePrice<- round(final_prediction,2)

write.csv(test[,c('Id', 'SalePrice')], 'D:\\Rafal\\CUNY\\605\\final\\price_predictions.csv', row.names=FALSE)