Data source: https://www.kaggle.com/c/house-prices-advanced-regression-techniques
X variable: LotArea
Y Variable: SalePrice
# Load libraries
library(knitr)
library(MASS)
library(Hmisc)
library(stats)
library(psych)
library(dplyr)
library(ggplot2)
# turn off scientific notation in printed output
options(scipen = 999)
# Read data
train <- read.csv('train.csv')
# Extract X, Y variables
X <- train$LotArea
Y <- train$SalePrice
# View in a table
trainxy <- data.frame(LotArea = X, SalePrice = Y)
kable(head(trainxy))
| LotArea | SalePrice |
|---|---|
| 8450 | 208500 |
| 9600 | 181500 |
| 11250 | 223500 |
| 9550 | 140000 |
| 14260 | 250000 |
| 14115 | 143000 |
# 3rd quartile of X (LotArea)
x <- summary(train$LotArea)[5]
# 2nd quartile (median) of Y (SalePrice)
y <- summary(train$SalePrice)[3]
cat("3rd quartile of X (LotArea) = ",x,"\n")
## 3rd quartile of X (LotArea) = 11600
cat("2nd quartile of Y (SalePrice)= ",y)
## 2nd quartile of Y (SalePrice)= 163000
p_Xgrx <- length(X[X > x]) / length(X)
p_Ygry <- length(Y[Y > y]) / length(Y)
p_Xgrx_and_Ygry <- nrow(trainxy[trainxy$LotArea > x & trainxy$SalePrice > y, ]) / nrow(trainxy)
p_Xgrx_gvn_Ygry <- p_Xgrx_and_Ygry / p_Ygry
p_Xgrx_gvn_Ygry
## [1] 0.3791209
Given that a house's SalePrice is above the median, there is a 37.91% probability, P(X > x | Y > y), that its LotArea falls above the 75th percentile of the LotArea data.
# calculated above
p_Xgrx_and_Ygry
## [1] 0.1890411
There is an 18.90% chance that LotArea is above its 75th percentile and SalePrice is above its median, i.e., the joint probability P(X > x, Y > y).
# use <= so this event is the exact complement of LotArea > x
p_Xlex_and_Ygry <- nrow(trainxy[trainxy$LotArea <= x & trainxy$SalePrice > y, ]) / nrow(trainxy)
p_Xlex_gvn_Ygry <- p_Xlex_and_Ygry / p_Ygry
p_Xlex_gvn_Ygry
## [1] 0.6208791
There is a 62.09% chance that LotArea is at or below the 75th percentile, given that SalePrice is greater than the median.
# X <= 3rd quartile and Y <= 2nd quartile (median)
n_Xleqx_Yleqy <- nrow(subset(trainxy, LotArea <= x & SalePrice <= y))
# X <= 3rd quartile and Y > 2nd quartile
n_Xleqx_Ygry <- nrow(subset(trainxy, LotArea <= x & SalePrice > y))
# X > 3rd quartile and Y <= 2nd quartile
n_Xgrx_Yleqy <- nrow(subset(trainxy, LotArea > x & SalePrice <= y))
# X > 3rd quartile and Y > 2nd quartile
n_Xgrx_Ygry <- nrow(subset(trainxy, LotArea > x & SalePrice > y))
r1total <- sum(c(n_Xleqx_Yleqy,n_Xleqx_Ygry))
r2total <- sum(c(n_Xgrx_Yleqy,n_Xgrx_Ygry))
c1total <- sum(c(n_Xleqx_Yleqy,n_Xgrx_Yleqy))
c2total <- sum(c(n_Xleqx_Ygry,n_Xgrx_Ygry))
cnttable <- as.data.frame(rbind(c(n_Xleqx_Yleqy,n_Xleqx_Ygry, r1total),
c(n_Xgrx_Yleqy,n_Xgrx_Ygry,r2total),
c(c1total,c2total,sum(c(r1total,r2total)))))
colnames(cnttable) <- c("<=2nd quartile", ">2nd quartile", "Total")
rownames(cnttable) <- c("<=3rd quartile", ">3rd quartile", "Total")
kable(cnttable)
| | <=2nd quartile | >2nd quartile | Total |
|---|---|---|---|
| <=3rd quartile | 643 | 452 | 1095 |
| >3rd quartile | 89 | 276 | 365 |
| Total | 732 | 728 | 1460 |
# A: LotArea > 3rd quartile; B: SalePrice > median (counts from the table above)
p_A <- 365/1460
p_B <- 728/1460
p_A_and_B <- 276/1460
p_A_gvn_B <- p_A_and_B/p_B
p_AB <- p_A * p_B
cat("P(A|B) = ",p_A_gvn_B,"\n")
## P(A|B) = 0.3791209
cat("P(A)P(B) = ", p_AB)
## P(A)P(B) = 0.1246575
Since P(A|B) = 0.379 \(\ne\) P(A) = 0.25 (equivalently, P(A \(\cap\) B) = 0.189 \(\ne\) P(A)P(B) = 0.125), the variables are not independent.
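The comparison can be made explicit with the quantities computed above; if A and B were independent, conditioning on B would not change the probability of A:
# under independence, P(A|B) would equal P(A)
cat("P(A)   = ", p_A, "\n")
cat("P(A|B) = ", p_A_gvn_B)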
# Chi-squared test on the raw values (a very sparse contingency table)
tbl <- table(trainxy$LotArea, trainxy$SalePrice)
chisq.test(tbl)
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tbl
## X-squared = 735090, df = 709660, p-value < 0.00000000000000022
The p-value of the chi-squared test is below 0.05, so we reject the null hypothesis that these two variables are independent. Note the warning, though: tabulating raw values gives an extremely sparse table, so the approximation may be unreliable; a test on the 2x2 quartile table is sketched below.
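A quick sketch (not part of the original analysis) running the same test on the 2x2 quartile table built above, which avoids the sparse-cell warning:
# chi-squared test of independence on the quartile counts; rows are
# LotArea (<=x, >x), columns are SalePrice (<=y, >y)
quartile_tbl <- matrix(c(n_Xleqx_Yleqy, n_Xgrx_Yleqy,
                         n_Xleqx_Ygry, n_Xgrx_Ygry), nrow = 2)
chisq.test(quartile_tbl)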
stargazer::stargazer(train,type = "html")
| Statistic | N | Mean | St. Dev. | Min | Max |
|---|---|---|---|---|---|
| Id | 1,460 | 730.500 | 421.610 | 1 | 1,460 |
| MSSubClass | 1,460 | 56.897 | 42.301 | 20 | 190 |
| LotFrontage | 1,201 | 70.050 | 24.285 | 21 | 313 |
| LotArea | 1,460 | 10,516.830 | 9,981.265 | 1,300 | 215,245 |
| OverallQual | 1,460 | 6.099 | 1.383 | 1 | 10 |
| OverallCond | 1,460 | 5.575 | 1.113 | 1 | 9 |
| YearBuilt | 1,460 | 1,971.268 | 30.203 | 1,872 | 2,010 |
| YearRemodAdd | 1,460 | 1,984.866 | 20.645 | 1,950 | 2,010 |
| MasVnrArea | 1,452 | 103.685 | 181.066 | 0 | 1,600 |
| BsmtFinSF1 | 1,460 | 443.640 | 456.098 | 0 | 5,644 |
| BsmtFinSF2 | 1,460 | 46.549 | 161.319 | 0 | 1,474 |
| BsmtUnfSF | 1,460 | 567.240 | 441.867 | 0 | 2,336 |
| TotalBsmtSF | 1,460 | 1,057.429 | 438.705 | 0 | 6,110 |
| X1stFlrSF | 1,460 | 1,162.627 | 386.588 | 334 | 4,692 |
| X2ndFlrSF | 1,460 | 346.992 | 436.528 | 0 | 2,065 |
| LowQualFinSF | 1,460 | 5.845 | 48.623 | 0 | 572 |
| GrLivArea | 1,460 | 1,515.464 | 525.480 | 334 | 5,642 |
| BsmtFullBath | 1,460 | 0.425 | 0.519 | 0 | 3 |
| BsmtHalfBath | 1,460 | 0.058 | 0.239 | 0 | 2 |
| FullBath | 1,460 | 1.565 | 0.551 | 0 | 3 |
| HalfBath | 1,460 | 0.383 | 0.503 | 0 | 2 |
| BedroomAbvGr | 1,460 | 2.866 | 0.816 | 0 | 8 |
| KitchenAbvGr | 1,460 | 1.047 | 0.220 | 0 | 3 |
| TotRmsAbvGrd | 1,460 | 6.518 | 1.625 | 2 | 14 |
| Fireplaces | 1,460 | 0.613 | 0.645 | 0 | 3 |
| GarageYrBlt | 1,379 | 1,978.506 | 24.690 | 1,900 | 2,010 |
| GarageCars | 1,460 | 1.767 | 0.747 | 0 | 4 |
| GarageArea | 1,460 | 472.980 | 213.805 | 0 | 1,418 |
| WoodDeckSF | 1,460 | 94.245 | 125.339 | 0 | 857 |
| OpenPorchSF | 1,460 | 46.660 | 66.256 | 0 | 547 |
| EnclosedPorch | 1,460 | 21.954 | 61.119 | 0 | 552 |
| X3SsnPorch | 1,460 | 3.410 | 29.317 | 0 | 508 |
| ScreenPorch | 1,460 | 15.061 | 55.757 | 0 | 480 |
| PoolArea | 1,460 | 2.759 | 40.177 | 0 | 738 |
| MiscVal | 1,460 | 43.489 | 496.123 | 0 | 15,500 |
| MoSold | 1,460 | 6.322 | 2.704 | 1 | 12 |
| YrSold | 1,460 | 2,007.816 | 1.328 | 2,006 | 2,010 |
| SalePrice | 1,460 | 180,921.200 | 79,442.500 | 34,900 | 755,000 |
# select a subset of numeric variables for the scatterplot matrix
train_plot <- dplyr::select(train,LotFrontage,
LotArea,
MasVnrArea,
TotalBsmtSF,
X1stFlrSF,
X2ndFlrSF,
GrLivArea,
YearBuilt,
PoolArea,
SalePrice)
pairs.panels(train_plot)
ggplot(data = train, aes(x = LotArea)) +
  geom_histogram(aes(y = ..density..), breaks = seq(10, 50000, by = 200), fill = I("darkblue"), alpha = .2) +
  geom_density(col = 2) +
  labs(title = "Histogram for LotArea") +
  labs(x = "LotArea", y = "Density")
ggplot(data = train, aes(x = SalePrice)) +
  geom_histogram(aes(y = ..density..), breaks = seq(10, 500000, by = 2000), fill = "darkgreen", alpha = .2) +
  geom_density(col = 2) +
  labs(title = "Histogram for SalePrice") +
  labs(x = "SalePrice", y = "Density")
# scatterplot of LotArea vs SalePrice
qplot(LotArea, SalePrice, data = trainxy)
# zoom in past the extreme outliers by restricting LotArea < 50000
trainxy_sub <- trainxy[trainxy$LotArea < 50000, ]
qplot(LotArea, SalePrice, data = trainxy_sub)
t.test(trainxy$LotArea, trainxy$SalePrice, paired = TRUE)
##
## Paired t-test
##
## data: trainxy$LotArea and trainxy$SalePrice
## t = -84.112, df = 1459, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -174378.4 -166430.4
## sample estimates:
## mean of the differences
## -170404.4
The 95% confidence interval for the mean difference is [-174378.4, -166430.4]. The p-value is below the 0.05 significance level, so we reject the null hypothesis that there is no difference in means.
cormatrix <- cor(trainxy)
cormatrix
## LotArea SalePrice
## LotArea 1.0000000 0.2638434
## SalePrice 0.2638434 1.0000000
cor.test(trainxy$LotArea, trainxy$SalePrice, conf.level = 0.99 )
##
## Pearson's product-moment correlation
##
## data: trainxy$LotArea and trainxy$SalePrice
## t = 10.445, df = 1458, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.2000196 0.3254375
## sample estimates:
## cor
## 0.2638434
The p-value is well below 0.01, so we reject the null hypothesis and conclude with 99% confidence that there is a linear relationship between the lot area and the sale price of the house.
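Because LotArea is heavily right-skewed, a rank-based correlation is a reasonable robustness check (a sketch, not part of the original analysis):
# Spearman rank correlation is less sensitive to the extreme LotArea
# values than Pearson's r
cor.test(trainxy$LotArea, trainxy$SalePrice, method = "spearman")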
# invert the correlation matrix to get the precision matrix
precision_matrix <- ginv(cormatrix)
# multiply the correlation matrix by the precision matrix, and that product
# by the correlation matrix again; since the precision matrix is the inverse
# of the correlation matrix, this returns the original correlation matrix
cormatrix %*% precision_matrix %*% cormatrix
## LotArea SalePrice
## LotArea 1.0000000 0.2638434
## SalePrice 0.2638434 1.0000000
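Both orderings of the product can also be checked directly; each should be (numerically) the 2x2 identity matrix (a verification sketch, not part of the original analysis):
# C %*% P and P %*% C should both equal the identity, up to rounding
round(cormatrix %*% precision_matrix, 10)
round(precision_matrix %*% cormatrix, 10)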
Principal component analysis transforms a large set of correlated variables into a smaller set of uncorrelated components ordered by the amount of variance they explain. PCA works only with numeric data, so I subset the data to the numeric columns and impute the NA values. The princomp function performs the principal component analysis.
# training data set: impute missing values
train$LotFrontage <- as.numeric(impute(train$LotFrontage, mean))
train$MasVnrArea <- as.numeric(impute(train$MasVnrArea, mean))
train$GarageYrBlt <- as.numeric(impute(train$GarageYrBlt, median))
# select only numeric data
train_num <- select_if(train, is.numeric)
train_num <- train_num[,2:ncol(train_num)]
# run principal component analysis
pc <- princomp(train_num, cor=TRUE, scores = TRUE)
plot(pc)
summary(pc)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 2.8104618 1.79048112 1.60475896 1.42330388
## Proportion of Variance 0.2134783 0.08664386 0.06960139 0.05475119
## Cumulative Proportion 0.2134783 0.30012211 0.36972350 0.42447468
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 1.22101074 1.09260549 1.07328455 1.0648935
## Proportion of Variance 0.04029371 0.03226451 0.03113351 0.0306486
## Cumulative Proportion 0.46476839 0.49703290 0.52816641 0.5588150
## Comp.9 Comp.10 Comp.11 Comp.12
## Standard deviation 1.05322496 1.04647008 1.02503272 1.01095453
## Proportion of Variance 0.02998062 0.02959729 0.02839708 0.02762241
## Cumulative Proportion 0.58879563 0.61839291 0.64679000 0.67441240
## Comp.13 Comp.14 Comp.15 Comp.16
## Standard deviation 1.00562761 0.9775285 0.95818171 0.94807145
## Proportion of Variance 0.02733208 0.0258260 0.02481384 0.02429296
## Cumulative Proportion 0.70174448 0.7275705 0.75238432 0.77667728
## Comp.17 Comp.18 Comp.19 Comp.20
## Standard deviation 0.91918036 0.90155569 0.88618408 0.85985013
## Proportion of Variance 0.02283493 0.02196764 0.02122493 0.01998222
## Cumulative Proportion 0.79951221 0.82147985 0.84270478 0.86268700
## Comp.21 Comp.22 Comp.23 Comp.24
## Standard deviation 0.82091814 0.7902795 0.76629939 0.73534884
## Proportion of Variance 0.01821369 0.0168795 0.01587067 0.01461454
## Cumulative Proportion 0.88090069 0.8977802 0.91365087 0.92826540
## Comp.25 Comp.26 Comp.27 Comp.28
## Standard deviation 0.65581567 0.63354871 0.567672597 0.539558986
## Proportion of Variance 0.01162417 0.01084822 0.008709518 0.007868213
## Cumulative Proportion 0.93988957 0.95073779 0.959447305 0.967315518
## Comp.29 Comp.30 Comp.31 Comp.32
## Standard deviation 0.513570858 0.492135861 0.441660989 0.389426605
## Proportion of Variance 0.007128514 0.006545884 0.005272012 0.004098732
## Cumulative Proportion 0.974444032 0.980989916 0.986261928 0.990360660
## Comp.33 Comp.34 Comp.35
## Standard deviation 0.37523153 0.349743631 0.305837012
## Proportion of Variance 0.00380537 0.003305962 0.002528008
## Cumulative Proportion 0.99416603 0.997471992 1.000000000
## Comp.36 Comp.37
## Standard deviation 0.0000000798013763402817 0.0000000772583438416897
## Proportion of Variance 0.0000000000000001721151 0.0000000000000001613203
## Cumulative Proportion 0.9999999999999997779554 1.0000000000000000000000
Component 1 explains roughly 21% of the variance in the data. The cumulative proportion from component 1 through component 26 explains about 95% of the variance.
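As a check on the cumulative-proportion table, the number of components needed to reach 95% of the variance can be computed directly from the pc object (a small sketch):
# proportion of variance per component, from the princomp standard deviations
prop_var <- pc$sdev^2 / sum(pc$sdev^2)
# first component at which the cumulative proportion reaches 95%
which(cumsum(prop_var) >= 0.95)[1] # expect 26, per the table above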
# fit an exponential distribution to LotArea, then draw 1,000 samples from it
LotArea_exp <- fitdistr(train$LotArea, "exponential")
lambda <- LotArea_exp$estimate
LotArea_exp_sample <- as.data.frame(rexp(1000, lambda))
colnames(LotArea_exp_sample) <- "LotArea"
ggplot(data = LotArea_exp_sample, aes(x = LotArea)) +
  geom_histogram(breaks = seq(10, 50000, by = 300), fill = "blue", alpha = .2) +
  labs(title = "Exponential Distribution for LotArea") +
  labs(x = "LotArea", y = "Count")
train_lotarea_sub <- train[train$LotArea < 50000, ]
ggplot(data = train_lotarea_sub, aes(x = LotArea)) +
  geom_histogram(breaks = seq(10, 50000, by = 200), fill = "blue", alpha = .2) +
  labs(title = "Original Distribution for LotArea") +
  labs(x = "LotArea", y = "Count")
# 5th and 95th percentiles of the simulated exponential data
quantile(LotArea_exp_sample$LotArea, c(0.05,0.95))
## 5% 95%
## 459.9923 32261.2770
# empirical 5th and 95th percentiles of the observed LotArea
quantile(train$LotArea, c(0.05,0.95))
## 5% 95%
## 3311.70 17401.15
# 95th percentile under a normal assumption with the empirical mean and sd
qnorm(0.95, mean(train$LotArea), sd(train$LotArea))
## [1] 26934.55
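The simulated quantiles can also be checked against the closed-form quantiles of the fitted exponential, \(q_p = -\ln(1-p)/\lambda\) (a sketch using the lambda estimated above):
# theoretical quantiles of the fitted exponential; these should be close
# to the simulated 5% / 95% quantiles above
qexp(c(0.05, 0.95), rate = lambda)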
I will use a stepwise model-selection strategy with backward elimination: fit a full model on all numeric predictors, then remove the non-significant ones.
# build a formula regressing SalePrice on all other numeric predictors
train_names <- names(train_num)
train_names <- train_names[train_names != "SalePrice"]
train_fmla <- as.formula(paste("SalePrice ~", paste(train_names, collapse="+")))
fit <- lm(train_fmla, data=train_num)
summary(fit)
##
## Call:
## lm(formula = train_fmla, data = train_num)
##
## Residuals:
## Min 1Q Median 3Q Max
## -471008 -16501 -2038 13863 302577
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 462778.8692 1413527.1082 0.327 0.743419
## MSSubClass -181.7062 27.6713 -6.567 0.0000000000719 ***
## LotFrontage -56.1963 51.7692 -1.086 0.277876
## LotArea 0.4303 0.1021 4.214 0.0000266980127 ***
## OverallQual 17320.7293 1187.5402 14.585 < 0.0000000000000002 ***
## OverallCond 4680.5348 1032.5016 4.533 0.0000062962059 ***
## YearBuilt 269.4994 67.4129 3.998 0.0000672250033 ***
## YearRemodAdd 134.3047 68.5842 1.958 0.050396 .
## MasVnrArea 31.4455 5.9491 5.286 0.0000001446829 ***
## BsmtFinSF1 19.2053 4.6669 4.115 0.0000409017230 ***
## BsmtFinSF2 8.3132 7.0570 1.178 0.238993
## BsmtUnfSF 9.3065 4.1938 2.219 0.026636 *
## TotalBsmtSF NA NA NA NA
## X1stFlrSF 48.9705 5.8103 8.428 < 0.0000000000000002 ***
## X2ndFlrSF 49.0140 4.9836 9.835 < 0.0000000000000002 ***
## LowQualFinSF 25.3127 19.9690 1.268 0.205149
## GrLivArea NA NA NA NA
## BsmtFullBath 9355.2260 2611.5166 3.582 0.000352 ***
## BsmtHalfBath 2049.1849 4090.9839 0.501 0.616517
## FullBath 3419.0253 2836.5346 1.205 0.228267
## HalfBath -1903.7569 2662.9675 -0.715 0.474788
## BedroomAbvGr -10087.7212 1701.6749 -5.928 0.0000000038384 ***
## KitchenAbvGr -12230.3391 5211.4037 -2.347 0.019070 *
## TotRmsAbvGrd 5059.0816 1236.9326 4.090 0.0000455460633 ***
## Fireplaces 3985.6769 1776.6968 2.243 0.025030 *
## GarageYrBlt 126.4950 68.9776 1.834 0.066884 .
## GarageCars 11294.3950 2876.3587 3.927 0.0000902606349 ***
## GarageArea -4.3535 9.9407 -0.438 0.661491
## WoodDeckSF 23.9664 8.0114 2.992 0.002823 **
## OpenPorchSF -2.9775 15.1803 -0.196 0.844528
## EnclosedPorch 11.8417 16.8632 0.702 0.482658
## X3SsnPorch 20.5174 31.3903 0.654 0.513461
## ScreenPorch 56.0462 17.1900 3.260 0.001139 **
## PoolArea -29.0098 23.8071 -1.219 0.223223
## MiscVal -0.7290 1.8548 -0.393 0.694331
## MoSold -49.6533 344.7623 -0.144 0.885504
## YrSold -780.2940 702.4832 -1.111 0.266857
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34750 on 1425 degrees of freedom
## Multiple R-squared: 0.8132, Adjusted R-squared: 0.8087
## F-statistic: 182.4 on 34 and 1425 DF, p-value: < 0.00000000000000022
The NA coefficients for TotalBsmtSF and GrLivArea mean they cannot be estimated: each is an exact linear combination of other predictors (the basement and above-ground square-footage columns, respectively), so they are aliased. Excluding them results in the same fit.
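Instead of the manual p-value-based removal below, backward elimination can also be automated by AIC with stepAIC from the already-loaded MASS package; a sketch (not the selection actually used in this report), after dropping the two aliased predictors:
# hedged sketch: automated backward elimination by AIC
nonaliased <- setdiff(names(train_num), c("TotalBsmtSF", "GrLivArea"))
full_fit <- lm(SalePrice ~ ., data = train_num[, nonaliased])
step_fit <- stepAIC(full_fit, direction = "backward", trace = FALSE)
# summary(step_fit) # compare with the manually reduced model below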
train_names <- train_names[!train_names %in% c('LotFrontage',
'YearRemodAdd',
'BsmtFinSF2',
'TotalBsmtSF',
'LowQualFinSF',
'GrLivArea',
'BsmtHalfBath',
'FullBath',
'HalfBath',
'GarageYrBlt',
'GarageArea',
'OpenPorchSF',
'EnclosedPorch',
'X3SsnPorch',
'ScreenPorch',
'PoolArea',
'MiscVal',
'MoSold',
'YrSold') ]
train_fmla <- as.formula(paste("SalePrice ~", paste(train_names, collapse="+")))
fit <- lm(formula = train_fmla, data=train)
summary(fit)
##
## Call:
## lm(formula = train_fmla, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -497266 -16027 -1982 13430 281677
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -802841.6059 88817.2625 -9.039 < 0.0000000000000002 ***
## MSSubClass -158.7004 25.9291 -6.121 0.000000001199554531 ***
## LotArea 0.4114 0.1004 4.096 0.000044431411301891 ***
## OverallQual 18365.5683 1161.9374 15.806 < 0.0000000000000002 ***
## OverallCond 5366.6109 928.2592 5.781 0.000000009068577346 ***
## YearBuilt 373.1563 45.0783 8.278 0.000000000000000282 ***
## MasVnrArea 29.2194 5.8863 4.964 0.000000772760240138 ***
## BsmtFinSF1 15.1807 3.9262 3.866 0.000115 ***
## BsmtUnfSF 7.2111 3.6447 1.978 0.048063 *
## X1stFlrSF 52.9347 5.1065 10.366 < 0.0000000000000002 ***
## X2ndFlrSF 48.5568 4.0932 11.863 < 0.0000000000000002 ***
## BsmtFullBath 9625.3950 2406.7022 3.999 0.000066717756195610 ***
## BedroomAbvGr -10438.5120 1649.2894 -6.329 0.000000000328477548 ***
## KitchenAbvGr -14182.0715 5081.7497 -2.791 0.005327 **
## TotRmsAbvGrd 5528.3000 1213.7019 4.555 0.000005682680123286 ***
## Fireplaces 3384.3991 1720.7570 1.967 0.049397 *
## GarageCars 10803.0010 1697.7420 6.363 0.000000000264859174 ***
## WoodDeckSF 22.9435 7.8647 2.917 0.003586 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34950 on 1442 degrees of freedom
## Multiple R-squared: 0.8087, Adjusted R-squared: 0.8065
## F-statistic: 358.6 on 17 and 1442 DF, p-value: < 0.00000000000000022
# Let's plot the fit
plot(fit)
The first plot (Residuals vs Fitted) shows a small degree of non-linearity in the smoothing curve, with some points falling well below zero. We probably should have transformed the data. There are about three extreme values, and excluding them might make the data look more linear.
The second plot (Normal Q-Q) shows the points at the beginning and end drifting off the line. Ideally, all points would fall on the line, which would indicate normally distributed residuals for the generated model.
The third plot (Scale-Location) shows the spread of the residuals around the linear model. Most prices fall in the middle of the range, with fewer at the high and low ends.
The last plot (Residuals vs Leverage) shows the influence of extreme values on the model. In our case, the extreme values do not pull the fitted model away from the line.
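Following up on the transformation suggested by the first diagnostic plot, a minimal sketch (not the model submitted below) refits the reduced model with a log-transformed response:
# hedged sketch: log-transform the response to address the curvature in
# the Residuals vs Fitted plot
log_fmla <- update(train_fmla, log(SalePrice) ~ .)
log_fit <- lm(log_fmla, data = train)
# plot(log_fit) # check whether the residual curvature is reduced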
# Read the file
test_data <- read.csv('test.csv')
# Select only numeric columns
test_num <- select_if(test_data, is.numeric)
# Impute columns with NA values
test_data$LotFrontage <- as.numeric(impute(test_data$LotFrontage, median))
test_data$MasVnrArea <- as.numeric(impute(test_data$MasVnrArea, mean))
test_data$BsmtFinSF1 <- as.numeric(impute(test_data$BsmtFinSF1, mean))
test_data$BsmtFinSF2 <- as.numeric(impute(test_data$BsmtFinSF2, mean))
test_data$BsmtUnfSF <- as.numeric(impute(test_data$BsmtUnfSF, mean))
test_data$TotalBsmtSF <- as.numeric(impute(test_data$TotalBsmtSF, mean))
test_data$BsmtFullBath <- as.numeric(impute(test_data$BsmtFullBath, median))
test_data$BsmtHalfBath <- as.numeric(impute(test_data$BsmtHalfBath, median))
test_data$GarageYrBlt <- as.numeric(impute(test_data$GarageYrBlt, median))
test_data$GarageCars <- as.numeric(impute(test_data$GarageCars , median))
test_data$GarageArea <- as.numeric(impute(test_data$GarageArea , mean))
test_predicted <- as.data.frame(predict(fit, test_data))
# the predictions are sale prices, so name the column accordingly
colnames(test_predicted) <- "SalePrice"
#write.csv(cbind(test_data$Id, predict(fit, test_data)),"house_sale_price.csv", row.names = FALSE)
qplot(test_predicted$SalePrice,
      geom = "histogram",
      main = "Predicted SalePrice Distribution",
      xlab = "Predicted SalePrice",
      fill = I("darkgreen"),
      col = I('red'), alpha = I(.2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The model predicts SalePrice on the test data; I submitted the predictions on Kaggle under Puneet Auluck and received a score of 0.24823.