library(dplyr)## Warning: package 'dplyr' was built under R version 3.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)## Warning: package 'tidyr' was built under R version 3.4.4
library(knitr)## Warning: package 'knitr' was built under R version 3.4.4
library(stringr)## Warning: package 'stringr' was built under R version 3.4.4
library(reshape2)## Warning: package 'reshape2' was built under R version 3.4.4
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(ggplot2)
library(corrplot)## Warning: package 'corrplot' was built under R version 3.4.3
## corrplot 0.84 loaded
library(psych)## Warning: package 'psych' was built under R version 3.4.4
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(MASS)##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
########################################################
# Loading data and simple transformations
########################################################
train <- read.csv('train.csv', stringsAsFactors=FALSE)
test <- read.csv('test.csv', stringsAsFactors=FALSE)
# Variable choice
# Above grade (ground) living area square feet
X <- train$GrLivArea
Y <- train$SalePrice
# Skewness
ggplot(train, aes(x = GrLivArea)) +
stat_density(aes(y = ..density..), fill = 'black', alpha = 0.3) +
geom_histogram(aes(y = ..density..), fill = 'blue', alpha = 0.5) +
geom_density() +
scale_y_continuous(labels = scales::percent) +
labs(x = 'Ground Living Area', y = 'Density') +
theme(panel.background = element_rect(fill = "white", colour = "gray"))## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Probability
# Find Quantiles values and counts
p_df <- train[,c('GrLivArea', 'SalePrice')]
quartiles_X <- summary(X)
quartiles_Y <- summary(Y)
x <- as.vector(quartiles_X)[2]
y <- as.vector(quartiles_Y)[2]a. P(X>x | Y>y)
What is the probability of X being greater than 1129.5 (x) given that Y is greater than 129975 (y)
\[ P(A | B) = P(A \& B) / P(B) \]
p_df$x1[p_df$GrLivArea <= x ] <- 0
p_df$x1[p_df$GrLivArea > x ] <- 1
p_df$y1[p_df$SalePrice <= y ] <- 0
p_df$y1[p_df$SalePrice > y ] <- 1
conf_mx <- table(p_df[,c(3:4)])
prob_conf_mx <- round(prop.table(conf_mx),2)final_a <- prob_conf_mx[2,2] / (prob_conf_mx[1,2] + prob_conf_mx[2,2])
final_a## [1] 0.8666667
b. P(X>x, Y>y)
final_b <- prob_conf_mx[2,2]
final_b## [1] 0.65
- P(X
y)
\[ P(X<x | Y>y) = P(X<x \& Y>y) / P(Y>y) \]
final_c <- (prob_conf_mx[1,1] + prob_conf_mx[1,2]) / (prob_conf_mx[1,2] + prob_conf_mx[2,2])
final_c ## [1] 0.3333333
A <- sum(p_df$x1 == 1)
B <- sum(p_df$y1 == 1)
PAxPB <- 0.75 * 0.75
# Chi Square test for association
chisq.test(conf_mx)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: conf_mx
## X-squared = 340.75, df = 1, p-value < 2.2e-16
2nd and 3rd quartiles
p_df$x2[p_df$GrLivArea < as.vector(quartiles_X)[3] ] <- 0
p_df$x2[p_df$GrLivArea > as.vector(quartiles_X)[3] ] <- 1
p_df$x3[p_df$GrLivArea < as.vector(quartiles_X)[4] ] <- 0
p_df$x3[p_df$GrLivArea > as.vector(quartiles_X)[4] ] <- 1
p_df$y2[p_df$SalePrice < as.vector(quartiles_Y)[3] ] <- 0
p_df$y2[p_df$SalePrice > as.vector(quartiles_Y)[3] ] <- 1
p_df$y3[p_df$SalePrice < as.vector(quartiles_Y)[4] ] <- 0
p_df$y3[p_df$SalePrice > as.vector(quartiles_Y)[4] ] <- 1Descriptive and Inferential Statistics
# Summary Stats for GrLivArea
summary(X)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1130 1464 1515 1777 5642
# Scatter for GrLivArea & SalePrice
ggplot(p_df, aes(x=GrLivArea, y=SalePrice)) +
geom_point() +
scale_y_continuous(labels = scales::comma) +
theme_minimal() +
geom_smooth(method='lm') +
labs(x = 'Ground Living Area', y = 'Sale Price')#Correlation
vars <- c('X1stFlrSF','LotArea', 'SalePrice')
selected_vars <- train[,vars]
selected_vars <- as.data.frame(sapply(selected_vars, as.numeric ))
cormat <- cor(train[,vars], use="complete")
res <- cor.mtest(selected_vars, conf.level = 0.92)
corrplot(cormat, type = "lower", order = "hclust", tl.col = "black", tl.srt = 45)In statistics, family-wise error rate (FWER) is the probability of making one or more false discoveries, or type I errors when performing multiple hypotheses tests.
A type I error occurs when the null hypothesis (H0) is true, but is rejected. It is asserting something that is absent, a false hit. A type I error may be likened to a so-called false positive (a result that indicates that a given condition is present when it actually is not present).
Linear Algebra and Correlation
# Correlation matrix inverse
# This is known as the precision matrix and contains variance inflation factors on the diagonal.
det(cormat) # When != 0, inverse of matrix exists## [1] 0.5693869
precision_matrix <- solve(cormat)
cor_by_prec <- round(cormat %*% precision_matrix,2)
prec_by_cor <- round(precision_matrix %*% cormat,2)
# LU Decomposition
library(matrixcalc)
lu <- lu.decomposition(cormat)
knitr::kable(lu , caption = "LU Decomposition")
|
|
Calculus-Based Probability & Statistics
# Fit an exponential probability density function.
mlf <- fitdistr(train$GrLivArea, "exponential")
# Simulated values using the most optimal lambda value for the exponential distribution
simdata = rexp(length(train$GrLivArea),rate=mlf$estimate[[1]])
# Push into one dataframe for plotting
simulation_df <- data.frame(Original=train$GrLivArea, Simulated=simdata)
simulation_df_plot <- melt(data.frame(Original=train$GrLivArea, Simulated=simdata))## No id variables; using all as measure variables
# Histogram of simulated explonential vs original distribution
ggplot(simulation_df_plot, aes(x=value, fill=variable)) +
geom_histogram(alpha=.7, position="identity") +
theme(panel.background = element_rect(fill = "white", colour = "gray"), legend.position = c(0.8, 0.8)) +
labs(x = 'Ground Living Area', y = 'Frequency') +
scale_x_continuous(limits = c(0, 7000)) +
ggtitle("Simulated vs Original Distribution")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 12 rows containing non-finite values (stat_bin).
# Calcualte 5% and 95% values
sim_q <- round(quantile(simulation_df$Original, c(0.05, 0.95)))
# GGPLOT supported ECDF
ggplot (NULL, aes(x=simulation_df$Original)) +
geom_step(stat="ecdf") +
geom_vline(aes(xintercept=sim_q),linetype = "dashed") +
scale_x_continuous(breaks = sim_q,labels = sim_q) +
theme(panel.background = element_rect(fill = "white", colour = "gray"),plot.title = element_text(hjust = 0.5)) +
labs(x = 'Ground Living Area', y = 'Percentile') +
ggtitle("Cumulative Distribution Function") +
scale_x_continuous(limits = c(0, 3500)) +
annotate("text", x = 500, y = 0.75, label = paste("5th Percentile \n Value: ", sim_q[[1]])) +
annotate("text", x = 2800, y = 0.25, label = paste("95th Percentile \n Value: ", sim_q[[2]]))## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Warning: Removed 6 rows containing non-finite values (stat_ecdf).
# Confidence intervals
a <- mean(simulation_df$Simulated)
s <- sd(simulation_df$Simulated)
n <- length(simulation_df$Simulated)
error <- qnorm(0.975)*s/sqrt(n)
lower_ci <- a-error
upper_ci <- a+error
round(lower_ci)## [1] 1413
round(upper_ci)## [1] 1564
# Calcualte 5% and 95% values
sim_q <- round(quantile(simulation_df$Simulated, c(0.05, 0.95)))
# GGPLOT supported ECDF
ggplot (NULL, aes(x=simulation_df$Simulated)) +
geom_step(stat="ecdf") +
geom_vline(aes(xintercept=sim_q),linetype = "dashed") +
scale_x_continuous(breaks = sim_q,labels = sim_q) +
theme(panel.background = element_rect(fill = "white", colour = "gray"),plot.title = element_text(hjust = 0.5)) +
labs(x = 'Ground Living Area', y = 'Percentile') +
ggtitle("Empirical Cumulative Distribution Function") +
scale_x_continuous(limits = c(0, 5000)) +
annotate("text", x = 550, y = 0.75, label = paste("5th Percentile \n Value: ", sim_q[[1]])) +
annotate("text", x = 4000, y = 0.25, label = paste("95th Percentile \n Value: ", sim_q[[2]]))## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Warning: Removed 45 rows containing non-finite values (stat_ecdf).
Modeling
library(mice)## Warning: package 'mice' was built under R version 3.4.4
## Loading required package: lattice
##
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
##
## complete
# Populate missing
train2 <- mice(train, printFlag = FALSE)
train2 <- mice::complete(train2)
########################################################
# Size Model
########################################################
size_model <- glm(SalePrice ~ LotArea + BsmtFinSF1 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + X1stFlrSF + GrLivArea +
GarageArea + WoodDeckSF + OpenPorchSF + EnclosedPorch + ScreenPorch + PoolArea,
data = train2)
########################################################
# Age/time Model
########################################################
age_model <- glm(SalePrice ~ YearBuilt + YearRemodAdd + GarageYrBlt + MoSold + YrSold,
data = train2)
########################################################
# Quality & Condition Model
########################################################
# Variables listed
qual_cond_features <- c("SalePrice","OverallQual", "LowQualFinSF", "Condition1", "Condition2","ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "BsmtFinType1", "BsmtFinType2", "HeatingQC", "KitchenQual", "Functional", "GarageFinish", "GarageQual", "GarageCond", "SaleCondition")
# Only full cases selected
qual_cond_df <- train2[,qual_cond_features]
qual_cond_df <- qual_cond_df[complete.cases(qual_cond_df),]
# Convert all char class columns to factors
qual_cond_df <- qual_cond_df %>% mutate_if(is.character, as.factor)## Warning: package 'bindrcpp' was built under R version 3.4.4
qual_cond_model <- glm(SalePrice ~ .,data = qual_cond_df)
########################################################
# Features Model
########################################################
# Variables listed
property_features <- c("SalePrice", "Utilities", "Exterior1st", "Heating", "CentralAir", "Electrical", "Fireplaces", "GarageCars", "PavedDrive", "MiscVal", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "BsmtFinType1")
# Only full cases selected
property_features_df <- train2[,property_features]
describe(property_features_df)## Warning in describe(property_features_df): NAs introduced by coercion
## Warning in describe(property_features_df): NAs introduced by coercion
## Warning in describe(property_features_df): NAs introduced by coercion
## Warning in describe(property_features_df): NAs introduced by coercion
## Warning in describe(property_features_df): NAs introduced by coercion
## Warning in describe(property_features_df): NAs introduced by coercion
## Warning in describe(property_features_df): NAs introduced by coercion
## Warning in describe(property_features_df): NAs introduced by coercion
## Warning in describe(property_features_df): NAs introduced by coercion
## Warning in describe(property_features_df): NAs introduced by coercion
## Warning in describe(property_features_df): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## vars n mean sd median trimmed mad
## SalePrice 1 1460 180921.20 79442.50 163000 170783.29 56338.80
## Utilities* 2 1460 NaN NA NA NaN NA
## Exterior1st* 3 1460 NaN NA NA NaN NA
## Heating* 4 1460 NaN NA NA NaN NA
## CentralAir* 5 1460 NaN NA NA NaN NA
## Electrical* 6 1459 NaN NA NA NaN NA
## Fireplaces 7 1460 0.61 0.64 1 0.53 1.48
## GarageCars 8 1460 1.77 0.75 2 1.77 0.00
## PavedDrive* 9 1460 NaN NA NA NaN NA
## MiscVal 10 1460 43.49 496.12 0 0.00 0.00
## RoofMatl* 11 1460 NaN NA NA NaN NA
## Exterior1st.1* 12 1460 NaN NA NA NaN NA
## Exterior2nd* 13 1460 NaN NA NA NaN NA
## MasVnrType* 14 1452 NaN NA NA NaN NA
## BsmtFinType1* 15 1423 NaN NA NA NaN NA
## min max range skew kurtosis se
## SalePrice 34900 755000 720100 1.88 6.50 2079.11
## Utilities* Inf -Inf -Inf NA NA NA
## Exterior1st* Inf -Inf -Inf NA NA NA
## Heating* Inf -Inf -Inf NA NA NA
## CentralAir* Inf -Inf -Inf NA NA NA
## Electrical* Inf -Inf -Inf NA NA NA
## Fireplaces 0 3 3 0.65 -0.22 0.02
## GarageCars 0 4 4 -0.34 0.21 0.02
## PavedDrive* Inf -Inf -Inf NA NA NA
## MiscVal 0 15500 15500 24.43 697.64 12.98
## RoofMatl* Inf -Inf -Inf NA NA NA
## Exterior1st.1* Inf -Inf -Inf NA NA NA
## Exterior2nd* Inf -Inf -Inf NA NA NA
## MasVnrType* Inf -Inf -Inf NA NA NA
## BsmtFinType1* Inf -Inf -Inf NA NA NA
property_features_df <- property_features_df[complete.cases(property_features_df),]
# Convert all char class columns to factors
property_features_df <- property_features_df %>% mutate_if(is.character, as.factor)
features_model <- glm(SalePrice ~ ., data = property_features_df)
########################################################
# Rooms Model
########################################################
room_vars <- c("SalePrice","BsmtFullBath", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "TotRmsAbvGrd")
# Only full cases selected
rooms <- train2[,room_vars]
rooms <- rooms[complete.cases(rooms),]
# Convert all char class columns to factors
rooms <- rooms %>% mutate_if(is.character, as.factor)
rooms_model<- glm(SalePrice ~ ., data = rooms)
########################################################
# Layout Model
########################################################
# List all Relevant Variables
layout_vars <- c("SalePrice", "LotShape", "LotConfig", "LandSlope", "BldgType", "HouseStyle", "RoofStyle", "Foundation", "BsmtExposure", "GarageType")
# Only full cases selected
layout_df <- train2[,layout_vars]
layout_df <- layout_df[complete.cases(layout_df),]
# Convert all char class columns to factors
layout_df <- layout_df %>% mutate_if(is.character, as.factor)
# Random Forest Model
layout_model<- glm(SalePrice ~ LotShape + ., data = layout_df)
model_names <- c('size_model', 'age_model','qual_cond_model','features_model', 'rooms_model', 'layout_model')Model Evaluation
# Model Summary and selected features
summary(size_model) ##
## Call:
## glm(formula = SalePrice ~ LotArea + BsmtFinSF1 + BsmtUnfSF +
## TotalBsmtSF + X1stFlrSF + X1stFlrSF + GrLivArea + GarageArea +
## WoodDeckSF + OpenPorchSF + EnclosedPorch + ScreenPorch +
## PoolArea, data = train2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -641199 -18703 -175 18318 298631
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.723e+04 4.247e+03 -4.057 5.24e-05 ***
## LotArea 1.321e-01 1.259e-01 1.050 0.294056
## BsmtFinSF1 2.740e+01 7.712e+00 3.553 0.000393 ***
## BsmtUnfSF 1.346e+01 7.527e+00 1.789 0.073850 .
## TotalBsmtSF 3.056e+01 8.681e+00 3.520 0.000444 ***
## X1stFlrSF -8.081e+00 5.802e+00 -1.393 0.163936
## GrLivArea 6.842e+01 3.012e+00 22.715 < 2e-16 ***
## GarageArea 9.339e+01 6.748e+00 13.839 < 2e-16 ***
## WoodDeckSF 5.621e+01 1.006e+01 5.586 2.78e-08 ***
## OpenPorchSF 4.044e+01 1.912e+01 2.114 0.034650 *
## EnclosedPorch -6.811e+01 1.981e+01 -3.438 0.000603 ***
## ScreenPorch 4.885e+01 2.155e+01 2.267 0.023546 *
## PoolArea -8.762e+01 2.997e+01 -2.923 0.003516 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 2009198727)
##
## Null deviance: 9.2079e+12 on 1459 degrees of freedom
## Residual deviance: 2.9073e+12 on 1447 degrees of freedom
## AIC: 35433
##
## Number of Fisher Scoring iterations: 2
summary(age_model)##
## Call:
## glm(formula = SalePrice ~ YearBuilt + YearRemodAdd + GarageYrBlt +
## MoSold + YrSold, data = train2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -145207 -40362 -10765 22461 544681
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -473739.56 2589277.90 -0.183 0.854853
## YearBuilt 649.48 98.25 6.611 5.36e-11 ***
## YearRemodAdd 1072.66 105.83 10.136 < 2e-16 ***
## GarageYrBlt 418.50 117.19 3.571 0.000367 ***
## MoSold 923.50 632.42 1.460 0.144430
## YrSold -1786.93 1289.21 -1.386 0.165939
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 4171492107)
##
## Null deviance: 9.2079e+12 on 1459 degrees of freedom
## Residual deviance: 6.0653e+12 on 1454 degrees of freedom
## AIC: 36493
##
## Number of Fisher Scoring iterations: 2
summary(qual_cond_model)##
## Call:
## glm(formula = SalePrice ~ ., data = qual_cond_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -266556 -22511 -2792 17179 362633
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.627e+04 6.654e+04 0.545 0.585782
## OverallQual 2.935e+04 1.519e+03 19.330 < 2e-16 ***
## LowQualFinSF 4.968e+00 3.241e+01 0.153 0.878186
## Condition1Feedr 1.220e+04 9.127e+03 1.336 0.181677
## Condition1Norm 1.597e+04 7.340e+03 2.176 0.029753 *
## Condition1PosA 3.758e+04 1.705e+04 2.204 0.027725 *
## Condition1PosN 3.686e+04 1.270e+04 2.902 0.003776 **
## Condition1RRAe 6.782e+03 1.616e+04 0.420 0.674838
## Condition1RRAn 1.553e+04 1.174e+04 1.322 0.186371
## Condition1RRNe 9.568e+01 3.126e+04 0.003 0.997558
## Condition1RRNn -5.480e+01 2.146e+04 -0.003 0.997963
## Condition2Feedr -3.744e+04 3.878e+04 -0.965 0.334505
## Condition2Norm -1.753e+04 3.243e+04 -0.541 0.588877
## Condition2PosA 5.513e+04 6.904e+04 0.799 0.424669
## Condition2PosN -1.839e+05 4.613e+04 -3.986 7.09e-05 ***
## Condition2RRAe 1.828e+04 5.434e+04 0.336 0.736665
## Condition2RRAn 1.188e+04 5.466e+04 0.217 0.828039
## Condition2RRNn -2.495e+04 4.551e+04 -0.548 0.583636
## ExterQualFa -4.998e+04 1.948e+04 -2.566 0.010403 *
## ExterQualGd -2.878e+04 8.174e+03 -3.521 0.000446 ***
## ExterQualTA -3.438e+04 9.068e+03 -3.792 0.000157 ***
## ExterCondFa 4.009e+04 4.466e+04 0.898 0.369543
## ExterCondGd 5.125e+04 4.319e+04 1.187 0.235551
## ExterCondTA 4.448e+04 4.307e+04 1.033 0.301942
## BsmtQualFa -3.986e+04 1.073e+04 -3.715 0.000212 ***
## BsmtQualGd -4.148e+04 5.480e+03 -7.570 7.12e-14 ***
## BsmtQualTA -4.694e+04 6.499e+03 -7.224 8.65e-13 ***
## BsmtCondGd -8.326e+02 9.535e+03 -0.087 0.930433
## BsmtCondPo 1.610e+04 5.913e+04 0.272 0.785499
## BsmtCondTA 4.725e+03 7.758e+03 0.609 0.542608
## BsmtFinType1BLQ -7.978e+01 4.803e+03 -0.017 0.986749
## BsmtFinType1GLQ 2.465e+02 4.269e+03 0.058 0.953974
## BsmtFinType1LwQ -6.240e+03 6.410e+03 -0.973 0.330490
## BsmtFinType1Rec 1.757e+03 5.034e+03 0.349 0.727165
## BsmtFinType1Unf -1.807e+04 4.100e+03 -4.407 1.14e-05 ***
## BsmtFinType2BLQ -2.809e+04 1.278e+04 -2.199 0.028043 *
## BsmtFinType2GLQ -2.625e+04 1.628e+04 -1.613 0.107072
## BsmtFinType2LwQ -2.129e+04 1.221e+04 -1.744 0.081368 .
## BsmtFinType2Rec -1.037e+04 1.200e+04 -0.864 0.387621
## BsmtFinType2Unf -2.255e+04 1.044e+04 -2.160 0.030929 *
## HeatingQCFa -4.310e+03 8.067e+03 -0.534 0.593226
## HeatingQCGd -2.863e+03 3.658e+03 -0.783 0.433929
## HeatingQCPo -3.738e+04 4.526e+04 -0.826 0.409039
## HeatingQCTA -4.004e+03 3.498e+03 -1.144 0.252655
## KitchenQualFa -5.049e+04 1.157e+04 -4.363 1.39e-05 ***
## KitchenQualGd -3.252e+04 6.080e+03 -5.349 1.05e-07 ***
## KitchenQualTA -4.389e+04 6.803e+03 -6.452 1.56e-10 ***
## FunctionalMaj2 -2.548e+04 2.880e+04 -0.885 0.376460
## FunctionalMin1 2.098e+04 1.553e+04 1.351 0.177059
## FunctionalMin2 3.004e+04 1.551e+04 1.937 0.052925 .
## FunctionalMod 4.945e+04 1.862e+04 2.655 0.008029 **
## FunctionalSev -2.184e+04 4.604e+04 -0.474 0.635319
## FunctionalTyp 1.783e+04 1.327e+04 1.344 0.179119
## GarageFinishRFn -7.335e+03 3.255e+03 -2.254 0.024383 *
## GarageFinishUnf -1.376e+04 3.694e+03 -3.725 0.000204 ***
## GarageQualFa -1.777e+05 4.829e+04 -3.680 0.000243 ***
## GarageQualGd -1.620e+05 4.969e+04 -3.260 0.001144 **
## GarageQualPo -1.891e+05 6.182e+04 -3.060 0.002261 **
## GarageQualTA -1.695e+05 4.767e+04 -3.556 0.000390 ***
## GarageCondFa 2.085e+05 5.725e+04 3.642 0.000282 ***
## GarageCondGd 2.074e+05 5.939e+04 3.492 0.000496 ***
## GarageCondPo 2.193e+05 6.157e+04 3.562 0.000381 ***
## GarageCondTA 2.077e+05 5.664e+04 3.667 0.000256 ***
## SaleConditionAdjLand 5.161e+03 4.363e+04 0.118 0.905861
## SaleConditionAlloca 4.052e+04 1.651e+04 2.455 0.014219 *
## SaleConditionFamily 3.590e+03 1.076e+04 0.334 0.738762
## SaleConditionNormal 8.496e+03 4.938e+03 1.721 0.085568 .
## SaleConditionPartial 1.918e+04 6.622e+03 2.897 0.003837 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1817551053)
##
## Null deviance: 8.4079e+12 on 1347 degrees of freedom
## Residual deviance: 2.3265e+12 on 1280 degrees of freedom
## AIC: 32634
##
## Number of Fisher Scoring iterations: 2
summary(features_model)##
## Call:
## glm(formula = SalePrice ~ ., data = property_features_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -158177 -28644 -2984 22023 380888
##
## Coefficients: (14 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.525e+05 5.608e+04 -2.720 0.006621 **
## UtilitiesNoSeWa -8.104e+04 5.071e+04 -1.598 0.110257
## Exterior1stBrkComm -2.073e+04 6.098e+04 -0.340 0.733929
## Exterior1stBrkFace 4.851e+04 2.597e+04 1.868 0.061952 .
## Exterior1stCBlock 1.230e+04 5.204e+04 0.236 0.813246
## Exterior1stCemntBd 9.345e+04 3.833e+04 2.438 0.014903 *
## Exterior1stHdBoard 3.375e+03 2.591e+04 0.130 0.896366
## Exterior1stImStucc -4.392e+04 5.834e+04 -0.753 0.451660
## Exterior1stMetalSd 2.552e+04 2.942e+04 0.868 0.385729
## Exterior1stPlywood -6.151e+03 2.567e+04 -0.240 0.810693
## Exterior1stStone 8.760e+04 5.098e+04 1.718 0.085969 .
## Exterior1stStucco 6.057e+04 2.880e+04 2.103 0.035615 *
## Exterior1stVinylSd 1.032e+04 2.689e+04 0.384 0.701345
## Exterior1stWd Sdng 2.342e+04 2.503e+04 0.936 0.349583
## Exterior1stWdShing 3.008e+03 2.716e+04 0.111 0.911829
## HeatingGasW 1.514e+04 1.361e+04 1.112 0.266235
## HeatingGrav -4.044e+03 2.023e+04 -0.200 0.841553
## HeatingOthW 2.353e+04 3.708e+04 0.635 0.525804
## CentralAirY 1.025e+04 7.272e+03 1.410 0.158764
## ElectricalFuseF -5.666e+03 1.277e+04 -0.444 0.657279
## ElectricalFuseP -4.323e+03 3.675e+04 -0.118 0.906394
## ElectricalMix -3.730e+04 5.086e+04 -0.733 0.463502
## ElectricalSBrkr 5.416e+03 6.085e+03 0.890 0.373585
## Fireplaces 3.477e+04 2.315e+03 15.020 < 2e-16 ***
## GarageCars 4.048e+04 2.260e+03 17.909 < 2e-16 ***
## PavedDriveP -1.209e+04 1.118e+04 -1.081 0.279737
## PavedDriveY -5.665e+02 6.529e+03 -0.087 0.930868
## MiscVal 1.785e+00 2.721e+00 0.656 0.511837
## RoofMatlCompShg 1.748e+05 5.198e+04 3.362 0.000794 ***
## RoofMatlMembran 2.136e+05 7.267e+04 2.940 0.003340 **
## RoofMatlMetal 1.565e+05 7.240e+04 2.162 0.030815 *
## RoofMatlRoll 1.451e+05 7.380e+04 1.966 0.049539 *
## RoofMatlTar&Grv 1.969e+05 5.442e+04 3.617 0.000308 ***
## RoofMatlWdShake 2.126e+05 5.744e+04 3.701 0.000223 ***
## RoofMatlWdShngl 3.379e+05 5.596e+04 6.038 2.01e-09 ***
## Exterior1st.1BrkComm NA NA NA NA
## Exterior1st.1BrkFace NA NA NA NA
## Exterior1st.1CBlock NA NA NA NA
## Exterior1st.1CemntBd NA NA NA NA
## Exterior1st.1HdBoard NA NA NA NA
## Exterior1st.1ImStucc NA NA NA NA
## Exterior1st.1MetalSd NA NA NA NA
## Exterior1st.1Plywood NA NA NA NA
## Exterior1st.1Stone NA NA NA NA
## Exterior1st.1Stucco NA NA NA NA
## Exterior1st.1VinylSd NA NA NA NA
## Exterior1st.1Wd Sdng NA NA NA NA
## Exterior1st.1WdShing NA NA NA NA
## Exterior2ndAsphShn -1.230e+02 4.391e+04 -0.003 0.997766
## Exterior2ndBrk Cmn -1.280e+04 3.377e+04 -0.379 0.704744
## Exterior2ndBrkFace -1.072e+04 2.702e+04 -0.397 0.691788
## Exterior2ndCBlock NA NA NA NA
## Exterior2ndCmentBd -5.013e+04 3.807e+04 -1.317 0.188097
## Exterior2ndHdBoard 3.704e+03 2.526e+04 0.147 0.883451
## Exterior2ndImStucc 6.091e+04 2.912e+04 2.091 0.036680 *
## Exterior2ndMetalSd -1.532e+04 2.901e+04 -0.528 0.597432
## Exterior2ndOther 5.133e+04 5.663e+04 0.906 0.364880
## Exterior2ndPlywood 1.696e+03 2.456e+04 0.069 0.944942
## Exterior2ndStone -3.563e+04 4.841e+04 -0.736 0.461909
## Exterior2ndStucco -3.711e+04 2.765e+04 -1.342 0.179794
## Exterior2ndVinylSd 1.647e+04 2.607e+04 0.632 0.527674
## Exterior2ndWd Sdng -1.237e+04 2.432e+04 -0.509 0.611065
## Exterior2ndWd Shng -1.234e+04 2.546e+04 -0.485 0.627854
## MasVnrTypeBrkFace 2.721e+04 1.356e+04 2.006 0.045083 *
## MasVnrTypeNone 1.486e+04 1.344e+04 1.106 0.268998
## MasVnrTypeStone 5.211e+04 1.427e+04 3.652 0.000270 ***
## BsmtFinType1BLQ 1.627e+01 5.510e+03 0.003 0.997644
## BsmtFinType1GLQ 3.658e+04 4.592e+03 7.968 3.40e-15 ***
## BsmtFinType1LwQ 4.382e+03 7.020e+03 0.624 0.532608
## BsmtFinType1Rec 1.700e+03 5.870e+03 0.290 0.772140
## BsmtFinType1Unf 6.242e+03 4.555e+03 1.370 0.170803
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 2512706806)
##
## Null deviance: 8.8768e+12 on 1413 degrees of freedom
## Residual deviance: 3.4097e+12 on 1357 degrees of freedom
## AIC: 34676
##
## Number of Fisher Scoring iterations: 2
summary(rooms_model)##
## Call:
## glm(formula = SalePrice ~ ., data = rooms)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -261395 -31166 -4507 25200 412948
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68748 8013 8.580 < 2e-16 ***
## BsmtFullBath 36694 2652 13.836 < 2e-16 ***
## BsmtFullBath.1 NA NA NA NA
## BsmtHalfBath 19467 5710 3.409 0.000669 ***
## FullBath 57230 2943 19.444 < 2e-16 ***
## HalfBath 15061 2892 5.207 2.19e-07 ***
## BedroomAbvGr -28250 2268 -12.459 < 2e-16 ***
## KitchenAbvGr -91921 6408 -14.345 < 2e-16 ***
## TotRmsAbvGrd 27201 1334 20.388 < 2e-16 ***
## TotRmsAbvGrd.1 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 2625806246)
##
## Null deviance: 9.2079e+12 on 1459 degrees of freedom
## Residual deviance: 3.8127e+12 on 1452 degrees of freedom
## AIC: 35819
##
## Number of Fisher Scoring iterations: 2
summary(layout_model)##
## Call:
## glm(formula = SalePrice ~ LotShape + ., data = layout_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -192099 -32473 -5033 23747 441363
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 181639 32011 5.674 1.71e-08 ***
## LotShapeIR2 5359 9666 0.554 0.579394
## LotShapeIR3 -38349 18342 -2.091 0.036738 *
## LotShapeReg -14127 3628 -3.894 0.000104 ***
## LotConfigCulDSac 11328 7382 1.535 0.125139
## LotConfigFR2 -22983 9530 -2.412 0.016021 *
## LotConfigFR3 -1874 28778 -0.065 0.948096
## LotConfigInside -3712 4147 -0.895 0.370844
## LandSlopeMod -3328 8067 -0.413 0.680033
## LandSlopeSev -12293 18542 -0.663 0.507473
## BldgType2fmCon -31210 12589 -2.479 0.013296 *
## BldgTypeDuplex 5875 11726 0.501 0.616412
## BldgTypeTwnhs -29711 9799 -3.032 0.002476 **
## BldgTypeTwnhsE -24849 5827 -4.264 2.15e-05 ***
## HouseStyle1.5Unf -25217 17834 -1.414 0.157613
## HouseStyle1Story -13813 5984 -2.308 0.021128 *
## HouseStyle2.5Fin 88317 23957 3.687 0.000237 ***
## HouseStyle2.5Unf 17768 18818 0.944 0.345244
## HouseStyle2Story 11716 6224 1.883 0.059979 .
## HouseStyleSFoyer -59267 12633 -4.691 3.00e-06 ***
## HouseStyleSLvl -48709 9542 -5.104 3.81e-07 ***
## RoofStyleGable 12867 18947 0.679 0.497186
## RoofStyleGambrel 23692 26297 0.901 0.367798
## RoofStyleHip 54380 19104 2.847 0.004488 **
## RoofStyleMansard 51565 30187 1.708 0.087835 .
## RoofStyleShed 10295 44498 0.231 0.817065
## FoundationCBlock -9667 6399 -1.511 0.131104
## FoundationPConc 41887 6750 6.206 7.29e-10 ***
## FoundationStone 3757 24195 0.155 0.876622
## FoundationWood 2861 33897 0.084 0.932745
## BsmtExposureGd 47109 6650 7.084 2.28e-12 ***
## BsmtExposureMn -16920 6955 -2.433 0.015123 *
## BsmtExposureNo -33664 4824 -6.978 4.75e-12 ***
## GarageTypeAttchd 18418 23824 0.773 0.439608
## GarageTypeBasment -10935 27305 -0.400 0.688878
## GarageTypeBuiltIn 43830 24691 1.775 0.076104 .
## GarageTypeCarPort -34824 31756 -1.097 0.273009
## GarageTypeDetchd -17649 23810 -0.741 0.458675
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 3194693609)
##
## Null deviance: 8.4173e+12 on 1347 degrees of freedom
## Residual deviance: 4.1850e+12 on 1310 degrees of freedom
## AIC: 33366
##
## Number of Fisher Scoring iterations: 2
predict1 <- predict(size_model, type = 'response')
predict2 <- predict(age_model, type = 'response')
predict3 <- predict(qual_cond_model, type = 'response')
predict4 <- predict(features_model, type = 'response')
predict5 <- predict(rooms_model, type = 'response')
predict6 <- predict(layout_model, type = 'response')
actuals1 <- data.frame(cbind(actuals=train2$SalePrice, predicteds=predict1))
actuals2 <- data.frame(cbind(actuals=train2$SalePrice, predicteds=predict2))
actuals3 <- data.frame(cbind(actuals=train2$SalePrice, predicteds=predict3))## Warning in cbind(actuals = train2$SalePrice, predicteds = predict3): number
## of rows of result is not a multiple of vector length (arg 2)
actuals4 <- data.frame(cbind(actuals=train2$SalePrice, predicteds=predict4))## Warning in cbind(actuals = train2$SalePrice, predicteds = predict4): number
## of rows of result is not a multiple of vector length (arg 2)
actuals5 <- data.frame(cbind(actuals=train2$SalePrice, predicteds=predict5))
actuals6 <- data.frame(cbind(actuals=train2$SalePrice, predicteds=predict6))## Warning in cbind(actuals = train2$SalePrice, predicteds = predict6): number
## of rows of result is not a multiple of vector length (arg 2)
min_max_accuracy1 <- mean(apply(actuals1, 1, min) / apply(actuals1, 1, max))
min_max_accuracy2 <- mean(apply(actuals2, 1, min) / apply(actuals2, 1, max))
min_max_accuracy3 <- mean(apply(actuals3, 1, min) / apply(actuals3, 1, max))
min_max_accuracy4 <- mean(apply(actuals4, 1, min) / apply(actuals4, 1, max))
min_max_accuracy5 <- mean(apply(actuals5, 1, min) / apply(actuals5, 1, max))
min_max_accuracy6 <- mean(apply(actuals6, 1, min) / apply(actuals6, 1, max))
mape1 <- mean(abs((actuals1$predicteds - actuals1$actuals))/actuals1$actuals)
mape2 <- mean(abs((actuals2$predicteds - actuals2$actuals))/actuals2$actuals)
mape3 <- mean(abs((actuals3$predicteds - actuals3$actuals))/actuals3$actuals)
mape4 <- mean(abs((actuals4$predicteds - actuals4$actuals))/actuals4$actuals)
mape5 <- mean(abs((actuals5$predicteds - actuals5$actuals))/actuals5$actuals)
mape6 <- mean(abs((actuals6$predicteds - actuals6$actuals))/actuals6$actuals)
mlr <- data.frame(row.names = c('MinMax', 'MAPE'),
size_model=c(min_max_accuracy1, mape1),
age_model=c(min_max_accuracy2, mape2),
qual_cond_model=c(min_max_accuracy3, mape3),
features_model=c(min_max_accuracy4, mape4),
rooms_model=c(min_max_accuracy5, mape5),
layout_model=c(min_max_accuracy6, mape6))
mlr## size_model age_model qual_cond_model features_model rooms_model
## MinMax 0.8654670 0.7979043 0.6966195 0.6867300 0.8250866
## MAPE 0.1652379 0.2583608 0.4749015 0.4663379 0.2163855
## layout_model
## MinMax 0.6988340
## MAPE 0.4608254
Predictions
final_prediction <- predict(size_model, test, type = 'response')
test$SalePrice<- round(final_prediction,2)
write.csv(test[,c('Id', 'SalePrice')], 'D:\\Rafal\\CUNY\\605\\final\\price_predictions.csv', row.names=FALSE)