Data source: https://www.kaggle.com/c/house-prices-advanced-regression-techniques

X variable: LotArea
Y variable: SalePrice

# Load libraries 
library(knitr)
library(MASS)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
library(stats)
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
## 
##     combine, src, summarize
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# turn off scientific notation in output
options(scipen = 999)
# Read data
train <- read.csv('train.csv')

# Extract X, Y variables
X <- train$LotArea
Y <- train$SalePrice

# View in a table
trainxy <- as.data.frame(cbind(X,Y))
colnames(trainxy) <- c("LotArea", "SalePrice")
kable(head(trainxy))
 LotArea   SalePrice
--------  ----------
    8450      208500
    9600      181500
   11250      223500
    9550      140000
   14260      250000
   14115      143000

Probability

Pick x and y

# x = 3rd quartile of LotArea
x <- summary(train$LotArea)[5]

# y = 2nd quartile (median) of SalePrice
y <- summary(train$SalePrice)[3]

cat("3rd quartile of X (LotArea) = ",x,"\n")
## 3rd quartile of X (LotArea) =  11600
cat("2nd quartile of Y (SalePrice)= ",y)
## 2nd quartile of Y (SalePrice)=  163000
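
The same cut points can be read off quantile() directly; a quick check worth noting, since summary() rounds its values:

# quantile() returns the unrounded cut points; summary() rounds to ~4
# significant digits, so these may differ slightly from x and y above
quantile(train$LotArea, 0.75)
quantile(train$SalePrice, 0.50)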

(a) Calculate P(X>x|Y>y)

p_Xgrx <- length(X[X>x])/length(X)
p_Ygry <- length(Y[Y>y])/length(Y)
p_Xgrx_and_Ygry <- nrow(trainxy[trainxy$LotArea > x & trainxy$SalePrice > y,])/nrow(trainxy)
p_Xgrx_gvn_Ygry <- p_Xgrx_and_Ygry/p_Ygry
p_Xgrx_gvn_Ygry
## [1] 0.3791209

Given that a house's SalePrice is above the median, there is a 37.91% probability that its LotArea falls above the 75th percentile of the LotArea data provided.
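
Since the mean of a logical vector is a proportion, the same conditional probability comes out of one line of base R:

# P(X > x | Y > y) via vectorized proportions
mean(X > x & Y > y) / mean(Y > y)
## [1] 0.3791209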

(b) Calculate P(X>x, Y>y)

# calculated above
p_Xgrx_and_Ygry 
## [1] 0.1890411

There is an 18.90% chance that LotArea is above the 75th percentile and SalePrice is above the median.

(c) Calculate P(X<x | Y>y)

p_Xlex_and_Ygry <- nrow(trainxy[trainxy$LotArea < x & trainxy$SalePrice > y,])/nrow(trainxy)
p_Xlex_gvn_Ygry <- p_Xlex_and_Ygry/p_Ygry
p_Xlex_gvn_Ygry
## [1] 0.6208791

There is a 62.09% chance that LotArea is below the 75th percentile, given that SalePrice is greater than the median.

(d) Table of counts

# X <= 3rd quartile and Y <= 2nd quartile
n_Xleqx_Yleqy <- nrow(subset(trainxy, LotArea <= x & SalePrice <= y))

# X <= 3rd quartile and Y > 2nd quartile
n_Xleqx_Ygry <- nrow(subset(trainxy, LotArea <= x & SalePrice > y))

# X > 3rd quartile and Y <= 2nd quartile
n_Xgrx_Yleqy <- nrow(subset(trainxy, LotArea > x & SalePrice <= y))

# X > 3rd quartile and Y > 2nd quartile
n_Xgrx_Ygry <- nrow(subset(trainxy, LotArea > x & SalePrice > y))

r1total <- sum(c(n_Xleqx_Yleqy,n_Xleqx_Ygry))
r2total <- sum(c(n_Xgrx_Yleqy,n_Xgrx_Ygry))

c1total <- sum(c(n_Xleqx_Yleqy,n_Xgrx_Yleqy))
c2total <- sum(c(n_Xleqx_Ygry,n_Xgrx_Ygry))

cnttable <- as.data.frame(rbind(c(n_Xleqx_Yleqy,n_Xleqx_Ygry, r1total),
                  c(n_Xgrx_Yleqy,n_Xgrx_Ygry,r2total),
                  c(c1total,c2total,sum(c(r1total,r2total)))))
colnames(cnttable) <- c("<=2nd quartile", ">2nd quartile", "Total")
rownames(cnttable) <- c("<=3rd quartile", ">3rd quartile", "Total")
kable(cnttable)
                 <=2nd quartile   >2nd quartile   Total
---------------  ---------------  --------------  ------
<=3rd quartile               643             452    1095
>3rd quartile                 89             276     365
Total                        732             728    1460
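
The same table can also be produced straight from base R; a sketch using table() and addmargins() (the rows and columns come out labeled FALSE/TRUE rather than by quartile):

# contingency table of the two indicators, with margins added
addmargins(table(LotArea = X <= x, SalePrice = Y <= y))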

Check for independence, i.e. whether P(A \(\cap\) B) = P(A)P(B), where A = {X > x} and B = {Y > y}

p_A <- 365/1460
p_B <- 728/1460
p_A_and_B <- 276/1460
p_A_gvn_B <- p_A_and_B/p_B
p_AB <- p_A * p_B

cat("P(A and B) = ",p_A_and_B,"\n")
## P(A and B) =  0.1890411
cat("P(A|B) = ",p_A_gvn_B,"\n")
## P(A|B) =  0.3791209
cat("P(A)P(B) = ", p_AB)
## P(A)P(B) =  0.1246575

Since P(A \(\cap\) B) = 0.1890 \(\ne\) P(A)P(B) = 0.1247 (equivalently, P(A|B) = 0.3791 \(\ne\) P(A) = 0.25), the variables are not independent.

Chi-square test for association

# Chisq test

tbl <- table(trainxy$LotArea, trainxy$SalePrice)
chisq.test(tbl)
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  tbl
## X-squared = 735090, df = 709660, p-value < 0.00000000000000022

The p-value of the chi-square test is below 0.05, so we reject the null hypothesis that the two variables are independent. (The warning arises because the table of raw LotArea by SalePrice values has many sparse cells, which makes the chi-squared approximation unreliable.)
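
A more defensible version of the same test runs on the 2x2 quartile/median counts derived above; a sketch (counts taken from the table, margins excluded):

# chi-squared test of association on the 2x2 count table
cnt2x2 <- matrix(c(643, 89, 452, 276), nrow = 2)
chisq.test(cnt2x2)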

Descriptive and Inferential Statistics

Provide univariate descriptive statistics for the training data set

stargazer::stargazer(train,type = "html")
Statistic        N      Mean         St. Dev.    Min      Max
--------------  ------  -----------  ----------  -------  --------
Id               1,460      730.500     421.610        1     1,460
MSSubClass       1,460       56.897      42.301       20       190
LotFrontage      1,201       70.050      24.285       21       313
LotArea          1,460   10,516.830   9,981.265    1,300   215,245
OverallQual      1,460        6.099       1.383        1        10
OverallCond      1,460        5.575       1.113        1         9
YearBuilt        1,460    1,971.268      30.203    1,872     2,010
YearRemodAdd     1,460    1,984.866      20.645    1,950     2,010
MasVnrArea       1,452      103.685     181.066        0     1,600
BsmtFinSF1       1,460      443.640     456.098        0     5,644
BsmtFinSF2       1,460       46.549     161.319        0     1,474
BsmtUnfSF        1,460      567.240     441.867        0     2,336
TotalBsmtSF      1,460    1,057.429     438.705        0     6,110
X1stFlrSF        1,460    1,162.627     386.588      334     4,692
X2ndFlrSF        1,460      346.992     436.528        0     2,065
LowQualFinSF     1,460        5.845      48.623        0       572
GrLivArea        1,460    1,515.464     525.480      334     5,642
BsmtFullBath     1,460        0.425       0.519        0         3
BsmtHalfBath     1,460        0.058       0.239        0         2
FullBath         1,460        1.565       0.551        0         3
HalfBath         1,460        0.383       0.503        0         2
BedroomAbvGr     1,460        2.866       0.816        0         8
KitchenAbvGr     1,460        1.047       0.220        0         3
TotRmsAbvGrd     1,460        6.518       1.625        2        14
Fireplaces       1,460        0.613       0.645        0         3
GarageYrBlt      1,379    1,978.506      24.690    1,900     2,010
GarageCars       1,460        1.767       0.747        0         4
GarageArea       1,460      472.980     213.805        0     1,418
WoodDeckSF       1,460       94.245     125.339        0       857
OpenPorchSF      1,460       46.660      66.256        0       547
EnclosedPorch    1,460       21.954      61.119        0       552
X3SsnPorch       1,460        3.410      29.317        0       508
ScreenPorch      1,460       15.061      55.757        0       480
PoolArea         1,460        2.759      40.177        0       738
MiscVal          1,460       43.489     496.123        0    15,500
MoSold           1,460        6.322       2.704        1        12
YrSold           1,460    2,007.816       1.328    2,006     2,010
SalePrice        1,460  180,921.200  79,442.500   34,900   755,000

Correlation plot for some of the square footages, price, and year built

train_plot <- dplyr::select(train,LotFrontage, 
                               LotArea, 
                               MasVnrArea, 
                               TotalBsmtSF,
                               X1stFlrSF,
                               X2ndFlrSF,
                               GrLivArea,
                               YearBuilt,
                               PoolArea,
                               SalePrice)
pairs.panels(train_plot)

Histogram for X and Y

ggplot(data=train, aes(x = LotArea)) + 
  geom_histogram(aes(y = ..density..), breaks = seq(10, 50000, by = 200), fill=I("darkblue"), alpha=.2) +
  geom_density(col=2) + 
  labs(title="Histogram for LotArea") +
  labs(x="LotArea", y="Density")

ggplot(data=train, aes(x = SalePrice)) + 
  geom_histogram(aes(y = ..density..), breaks = seq(10, 500000, by = 2000), fill="darkgreen", alpha=.2) +
  geom_density(col=2) + 
  labs(title="Histogram for SalePrice") +
  labs(x="SalePrice", y="Density")

Provide a scatterplot of X and Y

qplot(LotArea, SalePrice, data=trainxy)

# exclude extreme lots (LotArea >= 50,000 sq ft) to see the bulk of the data
trainxy_sub <- trainxy[trainxy$LotArea<50000,]
qplot(LotArea, SalePrice, data=trainxy_sub)
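
To make the weak positive trend visible, a sketch that overlays a least-squares line on the subset (ggplot2 is already loaded):

ggplot(trainxy_sub, aes(LotArea, SalePrice)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm")   # fitted line with its confidence band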

Provide a 95% confidence interval for the difference in the mean of the variables

t.test(trainxy$LotArea, trainxy$SalePrice, paired = TRUE)
## 
##  Paired t-test
## 
## data:  trainxy$LotArea and trainxy$SalePrice
## t = -84.112, df = 1459, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -174378.4 -166430.4
## sample estimates:
## mean of the differences 
##               -170404.4

The 95% confidence interval for the mean difference is [-174378.4, -166430.4]. The p-value is below the 0.05 significance level, so we reject the null hypothesis that there is no difference in means.
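
As a sanity check, the paired mean difference should equal the difference of the two sample means:

# should reproduce the "mean of the differences" reported above
mean(trainxy$LotArea) - mean(trainxy$SalePrice)
## [1] -170404.4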

Derive a correlation matrix for two of the quantitative variables you selected.

cormatrix <- cor(trainxy)
cormatrix
##             LotArea SalePrice
## LotArea   1.0000000 0.2638434
## SalePrice 0.2638434 1.0000000

Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval

cor.test(trainxy$LotArea, trainxy$SalePrice, conf.level = 0.99 )
## 
##  Pearson's product-moment correlation
## 
## data:  trainxy$LotArea and trainxy$SalePrice
## t = 10.445, df = 1458, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
##  0.2000196 0.3254375
## sample estimates:
##       cor 
## 0.2638434

The p-value is lower than 0.01, so we reject the null hypothesis and conclude with 99% confidence that there is a weak positive linear relationship between lot area and sale price.
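
The 99% interval can also be derived by hand with the Fisher z-transformation, which is how cor.test constructs its confidence interval; a sketch using r and n from the output above:

# Fisher z CI: atanh(r) +/- z_{0.995}/sqrt(n - 3), back-transformed with tanh
r <- 0.2638434; n <- 1460
half_width <- qnorm(0.995) / sqrt(n - 3)
tanh(atanh(r) + c(-1, 1) * half_width)   # ~ (0.200, 0.325), matching cor.test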

Linear Algebra and Correlation

Invert your correlation matrix

precision_matrix <- ginv(cormatrix)

# multiply correlation matrix by the precision matrix, 
# and then multiply the precision matrix by the correlation matrix

cormatrix %*% precision_matrix %*% cormatrix
##             LotArea SalePrice
## LotArea   1.0000000 0.2638434
## SalePrice 0.2638434 1.0000000
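
Because precision_matrix is the (generalized) inverse of cormatrix, the product cormatrix %*% precision_matrix is the identity, so the triple product simply returns cormatrix, as shown above. A quick check, along with the base-R alternative to MASS::ginv for a non-singular matrix:

# the product with the inverse should be (numerically) the 2x2 identity
round(cormatrix %*% precision_matrix, 12)

# base R inverse for an invertible matrix
solve(cormatrix)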

Conduct principal components analysis

Principal component analysis transforms a set of (possibly correlated) numeric variables into uncorrelated components ordered by the amount of variance they explain. PCA works only with numeric data, so I subset the data to the numeric columns and impute the NA values. The princomp function performs the analysis.

# training data set: impute missing values
train$LotFrontage <- as.numeric(impute(train$LotFrontage, mean))
train$MasVnrArea <- as.numeric(impute(train$MasVnrArea, mean))
train$GarageYrBlt <- as.numeric(impute(train$GarageYrBlt, median))



# select only numeric data 
train_num <- select_if(train, is.numeric)
train_num <- train_num[,2:ncol(train_num)]

# run principal component analysis
pc <- princomp(train_num, cor=TRUE, scores = TRUE)
plot(pc)

summary(pc)
## Importance of components:
##                           Comp.1     Comp.2     Comp.3     Comp.4
## Standard deviation     2.8104618 1.79048112 1.60475896 1.42330388
## Proportion of Variance 0.2134783 0.08664386 0.06960139 0.05475119
## Cumulative Proportion  0.2134783 0.30012211 0.36972350 0.42447468
##                            Comp.5     Comp.6     Comp.7    Comp.8
## Standard deviation     1.22101074 1.09260549 1.07328455 1.0648935
## Proportion of Variance 0.04029371 0.03226451 0.03113351 0.0306486
## Cumulative Proportion  0.46476839 0.49703290 0.52816641 0.5588150
##                            Comp.9    Comp.10    Comp.11    Comp.12
## Standard deviation     1.05322496 1.04647008 1.02503272 1.01095453
## Proportion of Variance 0.02998062 0.02959729 0.02839708 0.02762241
## Cumulative Proportion  0.58879563 0.61839291 0.64679000 0.67441240
##                           Comp.13   Comp.14    Comp.15    Comp.16
## Standard deviation     1.00562761 0.9775285 0.95818171 0.94807145
## Proportion of Variance 0.02733208 0.0258260 0.02481384 0.02429296
## Cumulative Proportion  0.70174448 0.7275705 0.75238432 0.77667728
##                           Comp.17    Comp.18    Comp.19    Comp.20
## Standard deviation     0.91918036 0.90155569 0.88618408 0.85985013
## Proportion of Variance 0.02283493 0.02196764 0.02122493 0.01998222
## Cumulative Proportion  0.79951221 0.82147985 0.84270478 0.86268700
##                           Comp.21   Comp.22    Comp.23    Comp.24
## Standard deviation     0.82091814 0.7902795 0.76629939 0.73534884
## Proportion of Variance 0.01821369 0.0168795 0.01587067 0.01461454
## Cumulative Proportion  0.88090069 0.8977802 0.91365087 0.92826540
##                           Comp.25    Comp.26     Comp.27     Comp.28
## Standard deviation     0.65581567 0.63354871 0.567672597 0.539558986
## Proportion of Variance 0.01162417 0.01084822 0.008709518 0.007868213
## Cumulative Proportion  0.93988957 0.95073779 0.959447305 0.967315518
##                            Comp.29     Comp.30     Comp.31     Comp.32
## Standard deviation     0.513570858 0.492135861 0.441660989 0.389426605
## Proportion of Variance 0.007128514 0.006545884 0.005272012 0.004098732
## Cumulative Proportion  0.974444032 0.980989916 0.986261928 0.990360660
##                           Comp.33     Comp.34     Comp.35
## Standard deviation     0.37523153 0.349743631 0.305837012
## Proportion of Variance 0.00380537 0.003305962 0.002528008
## Cumulative Proportion  0.99416603 0.997471992 1.000000000
##                                         Comp.36                  Comp.37
## Standard deviation     0.0000000798013763402817 0.0000000772583438416897
## Proportion of Variance 0.0000000000000001721151 0.0000000000000001613203
## Cumulative Proportion  0.9999999999999997779554 1.0000000000000000000000

Component 1 explains roughly 21% of the variance in the data. The cumulative proportion from Component 1 through Component 26 explains 95% of the variance.
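
The component count for a given variance target can be pulled out programmatically from the princomp object's standard deviations; a small sketch:

# smallest number of components whose cumulative variance reaches 95%
cumvar <- cumsum(pc$sdev^2) / sum(pc$sdev^2)
which(cumvar >= 0.95)[1]   # should land on Comp.26, per the summary above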

Calculus-Based Probability & Statistics

For your variable that is skewed to the right, shift it so that the minimum value is above zero. Then load the MASS package and run fitdistr to fit an exponential probability density function. LotArea's minimum is 1,300, which is already above zero, so no shift is needed.

LotArea_exp <- fitdistr(train$LotArea,"exponential")
lambda <- LotArea_exp$estimate
LotArea_exp_sample <- as.data.frame(rexp(1000,lambda))
colnames(LotArea_exp_sample) <- "LotArea"


ggplot(data=LotArea_exp_sample, aes(x = LotArea)) + 
  geom_histogram(breaks = seq(10, 50000, by = 300), fill="blue", alpha=.2) +
  labs(title="Exponential Distribution for LotArea") +
  labs(x="LotArea", y="Count")

train_lotarea_sub <- train[train$LotArea<50000,]
ggplot(data=train_lotarea_sub, aes(x = LotArea)) + 
  geom_histogram(breaks = seq(10, 50000, by = 200), fill="blue", alpha=.2) +
  labs(title="Original Distribution for LotArea") +
  labs(x="LotArea", y="Count")

find the 5th and 95th percentiles using the cumulative distribution function (CDF)

quantile(LotArea_exp_sample$LotArea, c(0.05,0.95))
##         5%        95% 
##   459.9923 32261.2770
quantile(train$LotArea, c(0.05,0.95))
##       5%      95% 
##  3311.70 17401.15
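
Note these are empirical quantiles of a simulated sample. Since the exponential MLE rate is 1/mean(LotArea), the fitted CDF can instead be inverted in closed form, \(F^{-1}(p) = -\log(1-p)/\lambda\); a sketch:

# theoretical 5th and 95th percentiles of the fitted exponential
qexp(c(0.05, 0.95), rate = lambda)

These come out to approximately 539 and 31,506, close to the simulated quantiles above.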

generate a 95% confidence interval from the empirical data, assuming normality

qnorm(0.95, mean(train$LotArea), sd(train$LotArea))
## [1] 26934.55
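
The single qnorm call above gives only the one-sided 95th percentile. A two-sided 95% interval under the normality assumption uses the 2.5th and 97.5th percentiles; a sketch:

# two-sided 95% interval for LotArea, assuming normality
qnorm(c(0.025, 0.975), mean(train$LotArea), sd(train$LotArea))

This is roughly (-9,046, 30,080); the negative lower bound shows how poorly a normal model fits this right-skewed variable. (A confidence interval for the mean would instead use sd(train$LotArea)/sqrt(nrow(train)) as the standard error.)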

Modeling

I will use a stepwise model-selection strategy with backward elimination.

# build the regression formula from all numeric predictors except SalePrice
train_names <- names(train_num)
train_names <- train_names[train_names != "SalePrice"]
train_fmla <- as.formula(paste("SalePrice ~", paste(train_names, collapse="+")))
fit <- lm(train_fmla, data=train_num)
summary(fit)
## 
## Call:
## lm(formula = train_fmla, data = train_num)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -471008  -16501   -2038   13863  302577 
## 
## Coefficients: (2 not defined because of singularities)
##                   Estimate   Std. Error t value             Pr(>|t|)    
## (Intercept)    462778.8692 1413527.1082   0.327             0.743419    
## MSSubClass       -181.7062      27.6713  -6.567      0.0000000000719 ***
## LotFrontage       -56.1963      51.7692  -1.086             0.277876    
## LotArea             0.4303       0.1021   4.214      0.0000266980127 ***
## OverallQual     17320.7293    1187.5402  14.585 < 0.0000000000000002 ***
## OverallCond      4680.5348    1032.5016   4.533      0.0000062962059 ***
## YearBuilt         269.4994      67.4129   3.998      0.0000672250033 ***
## YearRemodAdd      134.3047      68.5842   1.958             0.050396 .  
## MasVnrArea         31.4455       5.9491   5.286      0.0000001446829 ***
## BsmtFinSF1         19.2053       4.6669   4.115      0.0000409017230 ***
## BsmtFinSF2          8.3132       7.0570   1.178             0.238993    
## BsmtUnfSF           9.3065       4.1938   2.219             0.026636 *  
## TotalBsmtSF             NA           NA      NA                   NA    
## X1stFlrSF          48.9705       5.8103   8.428 < 0.0000000000000002 ***
## X2ndFlrSF          49.0140       4.9836   9.835 < 0.0000000000000002 ***
## LowQualFinSF       25.3127      19.9690   1.268             0.205149    
## GrLivArea               NA           NA      NA                   NA    
## BsmtFullBath     9355.2260    2611.5166   3.582             0.000352 ***
## BsmtHalfBath     2049.1849    4090.9839   0.501             0.616517    
## FullBath         3419.0253    2836.5346   1.205             0.228267    
## HalfBath        -1903.7569    2662.9675  -0.715             0.474788    
## BedroomAbvGr   -10087.7212    1701.6749  -5.928      0.0000000038384 ***
## KitchenAbvGr   -12230.3391    5211.4037  -2.347             0.019070 *  
## TotRmsAbvGrd     5059.0816    1236.9326   4.090      0.0000455460633 ***
## Fireplaces       3985.6769    1776.6968   2.243             0.025030 *  
## GarageYrBlt       126.4950      68.9776   1.834             0.066884 .  
## GarageCars      11294.3950    2876.3587   3.927      0.0000902606349 ***
## GarageArea         -4.3535       9.9407  -0.438             0.661491    
## WoodDeckSF         23.9664       8.0114   2.992             0.002823 ** 
## OpenPorchSF        -2.9775      15.1803  -0.196             0.844528    
## EnclosedPorch      11.8417      16.8632   0.702             0.482658    
## X3SsnPorch         20.5174      31.3903   0.654             0.513461    
## ScreenPorch        56.0462      17.1900   3.260             0.001139 ** 
## PoolArea          -29.0098      23.8071  -1.219             0.223223    
## MiscVal            -0.7290       1.8548  -0.393             0.694331    
## MoSold            -49.6533     344.7623  -0.144             0.885504    
## YrSold           -780.2940     702.4832  -1.111             0.266857    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34750 on 1425 degrees of freedom
## Multiple R-squared:  0.8132, Adjusted R-squared:  0.8087 
## F-statistic: 182.4 on 34 and 1425 DF,  p-value: < 0.00000000000000022

The NA coefficients for TotalBsmtSF and GrLivArea mean they cannot be estimated: each is an exact linear combination of other predictors (TotalBsmtSF is the sum of the basement square-footage columns, and GrLivArea the sum of the floor square-footage columns), so lm drops them as singular. Excluding them leaves the fit unchanged.
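
As an aside, the backward pass below is done by hand; MASS::stepAIC (MASS is already loaded) can automate backward elimination by AIC. A minimal sketch, assuming the collinear columns are dropped first (this is an alternative, not the model actually used here):

# automated backward elimination by AIC
fit_full <- lm(SalePrice ~ . - TotalBsmtSF - GrLivArea, data = train_num)
step_fit <- stepAIC(fit_full, direction = "backward", trace = FALSE)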

# drop the collinear columns and the predictors removed in the backward-elimination step
train_names <- train_names[!train_names %in% c('LotFrontage',
                                               'YearRemodAdd',
                                              'BsmtFinSF2',
                                              'TotalBsmtSF',
                                              'LowQualFinSF',
                                              'GrLivArea',
                                              'BsmtHalfBath',
                                              'FullBath',
                                              'HalfBath',
                                              'GarageYrBlt',
                                              'GarageArea',
                                              'OpenPorchSF',
                                              'EnclosedPorch',
                                              'X3SsnPorch',
                                              'ScreenPorch',
                                              'PoolArea',
                                              'MiscVal',
                                              'MoSold',
                                              'YrSold') ]


train_fmla <- as.formula(paste("SalePrice ~", paste(train_names, collapse="+")))
fit <- lm(formula = train_fmla, data=train)
summary(fit)
## 
## Call:
## lm(formula = train_fmla, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -497266  -16027   -1982   13430  281677 
## 
## Coefficients:
##                  Estimate   Std. Error t value             Pr(>|t|)    
## (Intercept)  -802841.6059   88817.2625  -9.039 < 0.0000000000000002 ***
## MSSubClass      -158.7004      25.9291  -6.121 0.000000001199554531 ***
## LotArea            0.4114       0.1004   4.096 0.000044431411301891 ***
## OverallQual    18365.5683    1161.9374  15.806 < 0.0000000000000002 ***
## OverallCond     5366.6109     928.2592   5.781 0.000000009068577346 ***
## YearBuilt        373.1563      45.0783   8.278 0.000000000000000282 ***
## MasVnrArea        29.2194       5.8863   4.964 0.000000772760240138 ***
## BsmtFinSF1        15.1807       3.9262   3.866             0.000115 ***
## BsmtUnfSF          7.2111       3.6447   1.978             0.048063 *  
## X1stFlrSF         52.9347       5.1065  10.366 < 0.0000000000000002 ***
## X2ndFlrSF         48.5568       4.0932  11.863 < 0.0000000000000002 ***
## BsmtFullBath    9625.3950    2406.7022   3.999 0.000066717756195610 ***
## BedroomAbvGr  -10438.5120    1649.2894  -6.329 0.000000000328477548 ***
## KitchenAbvGr  -14182.0715    5081.7497  -2.791             0.005327 ** 
## TotRmsAbvGrd    5528.3000    1213.7019   4.555 0.000005682680123286 ***
## Fireplaces      3384.3991    1720.7570   1.967             0.049397 *  
## GarageCars     10803.0010    1697.7420   6.363 0.000000000264859174 ***
## WoodDeckSF        22.9435       7.8647   2.917             0.003586 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34950 on 1442 degrees of freedom
## Multiple R-squared:  0.8087, Adjusted R-squared:  0.8065 
## F-statistic: 358.6 on 17 and 1442 DF,  p-value: < 0.00000000000000022
# Let's plot the fit
plot(fit)

The first plot (residuals vs. fitted) shows a mild degree of non-linearity, with a handful of extreme values pulling the smoother off the zero line; transforming the response (for example, taking logs) or excluding those few points might improve linearity.

The second plot (normal Q-Q) shows points at both tails drifting off the line; ideally all points would lie on it, which would indicate normally distributed residuals.

The third plot (scale-location) shows how the spread of the residuals varies with the fitted values; the spread is fairly even for mid-range prices and less stable at the low and high ends.

The last plot (residuals vs. leverage) flags influential observations. In our case the extreme values do not appear to pull the model off course.

Run the model against the test data.

# Read the file
test_data <- read.csv('test.csv')

# Select only numeric columns
test_num <- select_if(test_data, is.numeric)

# Impute columns with NA values
test_data$LotFrontage <- as.numeric(impute(test_data$LotFrontage, median))
test_data$MasVnrArea <- as.numeric(impute(test_data$MasVnrArea, mean))
test_data$BsmtFinSF1 <- as.numeric(impute(test_data$BsmtFinSF1, mean))
test_data$BsmtFinSF2 <- as.numeric(impute(test_data$BsmtFinSF2, mean))
test_data$BsmtUnfSF <- as.numeric(impute(test_data$BsmtUnfSF, mean))
test_data$TotalBsmtSF <- as.numeric(impute(test_data$TotalBsmtSF, mean))
test_data$BsmtFullBath <- as.numeric(impute(test_data$BsmtFullBath, median))
test_data$BsmtHalfBath <- as.numeric(impute(test_data$BsmtHalfBath, median))
test_data$GarageYrBlt <- as.numeric(impute(test_data$GarageYrBlt, median))
test_data$GarageCars  <- as.numeric(impute(test_data$GarageCars , median))
test_data$GarageArea  <- as.numeric(impute(test_data$GarageArea , mean))

test_predicted <- as.data.frame(predict(fit, test_data))
colnames(test_predicted) <- "SalePrice"   # the model predicts SalePrice, not LotArea
# hist(test_predicted$SalePrice)
#write.csv(cbind(test_data$Id, predict(fit, test_data)),"house_sale_price.csv", row.names = FALSE)

qplot(test_predicted$SalePrice,
      geom="histogram",
      main = "Predicted SalePrice Distribution", 
      xlab = "Predicted SalePrice",  
      fill=I("darkgreen"),
      col = I('red'),alpha=I(.2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The model predicts SalePrice for the test data; I submitted the predictions on Kaggle under Puneet Auluck and received a score of 0.24823.