# Simulate the two samples used by the probability questions below.
# Fixing the seed makes the random draws reproducible.
set.seed(10000)
N <- 25
# X: 10000 uniform draws on [1, N], rounded to whole numbers.
X <- round(runif(10000, 1, N))
# Y: 10000 normal draws whose mean and sd are both (N+1)/2, rounded likewise.
Y <- round(rnorm(10000, mean = (N+1)/2, sd = (N+1)/2))
# x is the sample median of X; y is the sample first quartile of Y
# (names = FALSE drops the "25%" label so y is a bare number).
x <- median(X)
y <- quantile(Y,0.25,names=FALSE)
# NOTE(review): `|` is R's element-wise logical OR, not "given". The argument
# is therefore a logical vector, which pnorm() coerces to 0/1, so every
# element is pnorm(0) = 0.5 or pnorm(1); min() returns the recorded 0.5.
# This does not estimate P(X>x | X>y). An empirical estimate would be
# mean(X > x & X > y) / mean(X > y) -- TODO confirm intent and regenerate
# the recorded output and the accompanying text.
(a<-min(pnorm(X>x | X>y)))
## [1] 0.5
The minimum probability of the random uniform variable X being greater than its median x, given that X is greater than the first-quartile value y, is 0.5.
# NOTE(review): the comma makes `Y>y` the `mean` argument of pnorm(), so this
# evaluates pnorm(q, mean) with q and mean each coerced to 0 or 1. The
# smallest possible value is pnorm(0, mean = 1) = 0.1586553, which is the
# recorded result. It is not an estimate of P(X>x and Y>y); the empirical
# joint probability would be mean(X > x & Y > y) -- TODO confirm intent and
# regenerate the recorded output and the accompanying text.
(b<-min(pnorm(X>x ,Y>y)))
## [1] 0.1586553
The minimum probability of the random uniform variable X being greater than its median x and the random normal variable Y being greater than the first-quartile value y is 0.16.
# NOTE(review): as in the previous statement, `X>y` is passed as the `mean`
# argument of pnorm(), so the result is min over pnorm(q, mean) with q and
# mean each 0 or 1, i.e. pnorm(0, mean = 1) = 0.1586553. It is not an
# estimate of P(X<x and X>y); mean(X < x & X > y) would be the empirical
# value -- TODO confirm intent. Also note that this assignment masks base::c
# as a variable name (function calls to c() still resolve to the function).
(c<-min(pnorm(X<x, X>y)))
## [1] 0.1586553
The minimum probability of the random uniform variable X being less than its median x and X being greater than the first-quartile value y is 0.16.
# Build a table intended to compare P(X>x)P(Y>y) with P(X>x and Y>y).
# The names a and b are reused here, overwriting the scalars computed above.
# a: element-wise product pnorm(X>x) * pnorm(Y>y); each factor is pnorm(0) or
#    pnorm(1), so a takes three distinct values (0.25, 0.42, 0.71 rounded).
a<-pnorm(X>x)*pnorm(Y>y)
#a<-rbinom(n=6, size = 10000, prob =dnorm((X>x)*(Y>y)))/10000
# b: pnorm of the 0/1 indicator (X>x)*(Y>y), so b takes two distinct values
#    (0.5 and 0.84 rounded).
b<-pnorm((X>x)*(Y>y))
#b<-rbinom(n=6, size = 10000, prob =dnorm(X>x)*dnorm(Y>y))/10000
# NOTE(review): table(a) has three cells and table(b) has two, so rbind()
# recycles table(b) to fill the third column (hence the warning below); the
# third entry of row 2 is a repeat of its first count, not real data.
r<-rbind(table(a),table(b))
## Warning in rbind(table(a), table(b)): number of columns of result is not a
## multiple of vector length (arg 2)
#r<-rbind(a[1:6],b[1:6])
# NOTE(review): row 1 holds table(a), where a is the product of two terms, yet
# it is labelled 'P(X>x and Y>y)'; row 2 holds table(b), built from the joint
# indicator, yet it is labelled 'P(X>x)P(Y>y)'. The labels look swapped --
# TODO confirm. An independence check would normally start from the counts
# table(X > x, Y > y) instead of tabulating pnorm() values.
row.names(r)<-c('P(X>x and Y>y)','P(X>x)P(Y>y)')
colnames(r)<-names(table(round(a,2)))
#colnames(r)<-c(1,2,3,4,5,6)
# rp: cell proportions of r (including the recycled cell) with row/column
# sums added, rounded to two decimals.
rp<-round(addmargins(prop.table(r)),2)
# Frequency of each distinct (rounded) value of a, then of b.
ftable(round(a,2))
## 0.25 0.42 0.71
##
## 1334 5058 3608
ftable(round(b,2))
## 0.5 0.84
##
## 6392 3608
rp
## 0.25 0.42 0.71 Sum
## P(X>x and Y>y) 0.05 0.19 0.14 0.38
## P(X>x)P(Y>y) 0.24 0.14 0.24 0.62
## Sum 0.29 0.33 0.38 1.00
# Run Fisher's exact test and the chi-square test and print their p-values.
# NOTE(review): both calls receive two numeric vectors (the rows of rp, which
# are rounded proportions and include the "Sum" column) as the `x` and `y`
# arguments. In that form the functions cross-tabulate x against y as if they
# were factors; they are not given a contingency table of counts. The
# p-values below therefore do not test independence of X>x and Y>y. Passing
# table(X > x, Y > y) would -- TODO confirm intent and regenerate the output.
fst<-fisher.test(rp[1,],rp[2,])
cst<-chisq.test(rp[1,],rp[2,])
## Warning in chisq.test(rp[1, ], rp[2, ]): Chi-squared approximation may be
## incorrect
print(fst$p.value)
## [1] 1
print(cst$p.value)
## [1] 0.2381033
To check independence we use the chi-square test and Fisher's exact test. Fisher's exact test is the one suited to small data sets. Fisher's test gives a p-value of 1 and the chi-square test gives a p-value of 0.24; both are well above 0.05, so neither test rejects the null hypothesis of independence.
The variables I am using from the dataset are SalePrice, GrLivArea, BedroomAbvGr, and YearBuilt.
# Load the training data and describe GrLivArea.
df.train <- read.csv("train.csv")
summary(df.train$GrLivArea)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1130 1464 1515 1777 5642
# Parameters of the normal curve that is overlaid on the histogram.
a_sd <- sd(df.train$GrLivArea)
a_mean <- mean(df.train$GrLivArea)
a_max <- max(df.train$GrLivArea)
a_min <- min(df.train$GrLivArea)
# Evaluate the normal density on every integer from 0 up to the maximum.
a_x <- 0:a_max
a_y <- dnorm(x = a_x, mean = a_mean, sd = a_sd)
# probability = TRUE puts the histogram on the density scale so that the
# curve and the bars are comparable. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
hist(df.train$GrLivArea, probability = TRUE)
lines(x = a_x, y = a_y, col = "red")
summary(df.train$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
# Parameters of the normal curve overlaid on the SalePrice histogram
# (the a_* helpers from the GrLivArea plot are reused and overwritten).
a_sd <- sd(df.train$SalePrice)
a_mean <- mean(df.train$SalePrice)
a_max <- max(df.train$SalePrice)
a_min <- min(df.train$SalePrice)
a_x <- 0:a_max
a_y <- dnorm(x = a_x, mean = a_mean, sd = a_sd)
# Density-scaled histogram; TRUE is spelled out because T can be reassigned.
hist(df.train$SalePrice, probability = TRUE)
lines(x = a_x, y = a_y, col = "red")
# Scatter Plot with Regression Line
plot(SalePrice ~ GrLivArea, data = df.train)
a_lm <- lm(SalePrice ~ GrLivArea, data = df.train)
abline(a_lm, col = "blue")
# Residual Analysis
# Residuals against fitted values, with a dotted reference line at zero.
plot(fitted(a_lm), resid(a_lm), main = "Residuals")
abline(h = 0, lty = 3)
# Normal Q-Q plot of the residuals; resid() is used throughout instead of
# mixing it with direct access to a_lm$residuals.
qqnorm(resid(a_lm), main = "Q-Q plot")
qqline(resid(a_lm), col = "blue")
After comparing the GrLivArea and SalePrice variables, both seem to be nearly normal. The scatter plot and the residual plots show that a linear correlation exists between them.
# Pairwise scatter plots and the correlation matrix for the four variables
# under study; the column selection is shared by both calls.
pair_vars <- c("SalePrice", "GrLivArea", "BedroomAbvGr", "YearBuilt")
plot(df.train[, pair_vars])
# The outer parentheses print the matrix as well as storing it in cm_a.
(cm_a <- cor(df.train[, pair_vars]))
## SalePrice GrLivArea BedroomAbvGr YearBuilt
## SalePrice 1.0000000 0.7086245 0.16821315 0.52289733
## GrLivArea 0.7086245 1.0000000 0.52126951 0.19900971
## BedroomAbvGr 0.1682132 0.5212695 1.00000000 -0.07065122
## YearBuilt 0.5228973 0.1990097 -0.07065122 1.00000000
# Test each pair of predictors for zero correlation and report an 80%
# confidence interval for the correlation (conf.level = 0.8).
# Pair 1: GrLivArea vs YearBuilt.
cor.test(~GrLivArea+YearBuilt, data = df.train, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: GrLivArea and YearBuilt
## t = 7.754, df = 1458, p-value = 1.66e-14
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.1665605 0.2310283
## sample estimates:
## cor
## 0.1990097
# Pair 2: BedroomAbvGr vs YearBuilt.
cor.test(~BedroomAbvGr+YearBuilt, data = df.train, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: BedroomAbvGr and YearBuilt
## t = -2.7045, df = 1458, p-value = 0.006921
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.10396633 -0.03717773
## sample estimates:
## cor
## -0.07065122
# Pair 3: BedroomAbvGr vs GrLivArea.
cor.test(~BedroomAbvGr+GrLivArea, data = df.train, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: BedroomAbvGr and GrLivArea
## t = 23.323, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4963921 0.5452915
## sample estimates:
## cor
## 0.5212695
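Although all three null hypotheses of zero correlation are rejected, I would still be worried about the familywise error rate, because several tests are run on the same data. With three tests each at the 0.2 level implied by the 80% confidence intervals, the chance of at least one false rejection would be about 1 - 0.8^3 ≈ 0.49 if the tests were independent, so an adjustment such as Bonferroni would be advisable.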
# Invert the correlation matrix (the result is the precision matrix); the
# outer parentheses print it as well as storing it in cm_b.
(cm_b <- solve(cm_a))
## SalePrice GrLivArea BedroomAbvGr YearBuilt
## SalePrice 3.1146672 -2.2892182 0.58943471 -1.13143025
## GrLivArea -2.2892182 3.1697849 -1.23338893 0.47906764
## BedroomAbvGr 0.5894347 -1.2333889 1.54706581 0.04654462
## YearBuilt -1.1314302 0.4790676 0.04654462 1.49957118
# Multiplying the matrix by its inverse in either order should give the
# identity; the off-diagonal values of order 1e-16 are floating-point
# rounding error.
cm_a %*% cm_b
## SalePrice GrLivArea BedroomAbvGr YearBuilt
## SalePrice 1.000000e+00 0.000000e+00 2.081668e-17 1.110223e-16
## GrLivArea -1.942890e-16 1.000000e+00 3.469447e-18 1.110223e-16
## BedroomAbvGr -1.387779e-17 -2.775558e-17 1.000000e+00 -1.387779e-17
## YearBuilt -2.220446e-16 -2.220446e-16 0.000000e+00 1.000000e+00
cm_b %*% cm_a
## SalePrice GrLivArea BedroomAbvGr YearBuilt
## SalePrice 1.000000e+00 -6.383782e-16 -2.359224e-16 -2.220446e-16
## GrLivArea 4.996004e-16 1.000000e+00 1.942890e-16 0.000000e+00
## BedroomAbvGr -1.977585e-16 -2.168404e-16 1.000000e+00 -4.857226e-17
## YearBuilt 1.110223e-16 5.551115e-17 -2.775558e-17 1.000000e+00
# LU-decompose the precision matrix cm_b with matrixcalc::lu.decomposition();
# the result is a list with a lower-triangular factor $L and an
# upper-triangular factor $U (printed below).
library(matrixcalc)
(m_lu <- lu.decomposition(cm_b))
## $L
## [,1] [,2] [,3] [,4]
## [1,] 1.0000000 0.0000000 0.00000000 0
## [2,] -0.7349800 1.0000000 0.00000000 0
## [3,] 0.1892448 -0.5380153 1.00000000 0
## [4,] -0.3632588 -0.2370212 0.07065122 1
##
## $U
## [,1] [,2] [,3] [,4]
## [1,] 3.114667 -2.289218e+00 0.5894347 -1.13143025
## [2,] 0.000000 1.487255e+00 -0.8001662 -0.35251098
## [3,] 0.000000 1.110223e-16 1.0050166 0.07100565
## [4,] 0.000000 -7.843861e-18 0.0000000 1.00000000
# Validating the decomposition by multiplying both halves of the matrix to get the original one.
message("Validating the decomposition, by getting the original matrix")
## Validating the decomposition, by getting the original matrix
# L %*% U should reproduce cm_b (dimnames are not carried over).
m_lu$L %*% m_lu$U
## [,1] [,2] [,3] [,4]
## [1,] 3.1146672 -2.2892182 0.58943471 -1.13143025
## [2,] -2.2892182 3.1697849 -1.23338893 0.47906764
## [3,] 0.5894347 -1.2333889 1.54706581 0.04654462
## [4,] -1.1314302 0.4790676 0.04654462 1.49957118
library(MASS)
hs_liv <- df.train$GrLivArea
# Fit an exponential distribution to GrLivArea; the printed value is the
# estimated rate with its standard error in parentheses.
(fd_rate <- fitdistr(hs_liv, "exponential"))
## rate
## 6.598640e-04
## (1.726943e-05)
# Draw a sample of size 1000 from the fitted exponential distribution.
fd_liv <- rexp(1000, rate = fd_rate$estimate)
# Show the observed and simulated histograms side by side. par() returns the
# previous settings; they are restored afterwards so that later plots are not
# squeezed into the 1 x 2 layout when the file is run as a script.
old_par <- par(mfrow = c(1, 2))
hist(hs_liv, main = "Histogram of GrLivArea")
hist(fd_liv, main = "Histogram of fitted distribution")
par(old_par)
# Find the 5th and 95th percentiles of the fitted exponential distribution
qexp(c(0.05, 0.95), rate = fd_rate$estimate)
## [1] 77.73313 4539.92351
# 5th and 95th percentiles of a normal distribution that has the sample mean
# and sd of GrLivArea. This is the central 90% range of the fitted normal,
# not a 95% confidence interval for the mean.
(a_qn <- qnorm(c(0.05, 0.95), mean = mean(hs_liv), sd = sd(hs_liv)))
## [1] 651.1254 2379.8020
# Provide the empirical 5th and 95th percentiles of the data.
# quantile() works on the raw vector directly, so no ecdf() wrapper is needed.
quantile(hs_liv, c(0.05, 0.95))
## 5% 95%
## 848.0 2466.1
# Overlay the fitted normal density (blue) and its 5%/95% cut-offs (red) on
# the density-scaled histogram of the data.
a_x <- seq(1, max(hs_liv), length.out = length(hs_liv))
a_y <- dnorm(x = a_x, mean = mean(hs_liv), sd = sd(hs_liv))
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
hist(hs_liv, probability = TRUE)
lines(x = a_x, y = a_y, col = "blue")
abline(v = a_qn, col = "red")
Even though the data for the GrLivArea variable appear to be right-skewed, the interval values and the plots above show that they are better described by a normal distribution than by an exponential one.
# Columns of df.train kept as candidate predictors for the regression model.
keep_cols <- c(
  "MSSubClass", "MSZoning", "LotFrontage", "LotArea", "LotShape",
  "LandContour", "LotConfig", "LandSlope", "Neighborhood", "Condition1",
  "Condition2", "BldgType", "HouseStyle", "OverallQual", "OverallCond",
  "Exterior1st", "Exterior2nd", "ExterQual", "ExterCond", "Foundation",
  "HeatingQC", "CentralAir", "GrLivArea", "TotRmsAbvGrd", "GarageArea"
)
# Logical indexing keeps the columns in their df.train order.
trn <- df.train[, names(df.train) %in% keep_cols]
# Impute missing data
# Missing LotFrontage values are replaced by the mean of the observed values,
# truncated to a whole number. The mean is computed directly with
# na.rm = TRUE instead of being read back out of summary(), whose result was
# rounded to four significant digits in older R versions.
mean_LotFrontage <- as.integer(mean(trn$LotFrontage, na.rm = TRUE))
trn$LotFrontage <- replace(trn$LotFrontage, is.na(trn$LotFrontage), mean_LotFrontage)
# Derive/Calculate additional features
# Ages in years at the time of sale, counted inclusively (hence the + 1).
trn$AgeSold <- df.train$YrSold - df.train$YearBuilt + 1
trn$AgeRemod <- df.train$YrSold - df.train$YearRemodAdd + 1
# Rescale numeric data
# Use standardization (z-scores): subtract the mean and divide by the
# standard deviation, so the result is centred on zero with unit variance.
#
# Arguments:
#   x      numeric vector to rescale.
#   na.rm  if TRUE, NA values are ignored when computing the mean and sd
#          (they remain NA in the result). The default FALSE keeps the
#          original behaviour, where any NA makes the whole result NA.
# Returns a numeric vector of the same length as x. A constant vector has
# sd 0 and therefore yields NaN.
standardScaler <- function(x, na.rm = FALSE) {
  centre <- mean(x, na.rm = na.rm)
  spread <- sd(x, na.rm = na.rm)
  (x - centre) / spread
}
# Standardize the four continuous predictors in place.
scaled_cols <- c("GrLivArea", "GarageArea", "AgeSold", "AgeRemod")
trn[scaled_cols] <- lapply(trn[scaled_cols], standardScaler)
# Attach the response so the model can be fitted from trn alone.
trn$SalePrice <- df.train$SalePrice
# Full model with every candidate predictor. The doubled `+ +` before
# HouseStyle and ExterCond is a unary plus; it does not change the model and
# is kept so that the call echoed by summary() stays the same.
hs.lm <- lm(
  SalePrice ~ MSSubClass + MSZoning + LotFrontage + LotArea + LotShape +
    LandContour + LotConfig + LandSlope + Neighborhood + Condition1 +
    Condition2 + BldgType + +HouseStyle + OverallQual + OverallCond +
    Exterior1st + Exterior2nd + ExterQual + +ExterCond + Foundation +
    HeatingQC + CentralAir + GrLivArea + TotRmsAbvGrd + GarageArea +
    AgeSold + AgeRemod,
  data = trn
)
summary(hs.lm)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + MSZoning + LotFrontage +
## LotArea + LotShape + LandContour + LotConfig + LandSlope +
## Neighborhood + Condition1 + Condition2 + BldgType + +HouseStyle +
## OverallQual + OverallCond + Exterior1st + Exterior2nd + ExterQual +
## +ExterCond + Foundation + HeatingQC + CentralAir + GrLivArea +
## TotRmsAbvGrd + GarageArea + AgeSold + AgeRemod, data = trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -346415 -13916 -356 11491 236400
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.094e+05 4.093e+04 2.673 0.007605 **
## MSSubClass -1.093e+02 1.025e+02 -1.066 0.286558
## MSZoningFV 2.890e+04 1.487e+04 1.943 0.052206 .
## MSZoningRH 1.760e+04 1.496e+04 1.176 0.239730
## MSZoningRL 2.342e+04 1.263e+04 1.854 0.064022 .
## MSZoningRM 2.414e+04 1.181e+04 2.045 0.041040 *
## LotFrontage -8.340e+01 5.257e+01 -1.586 0.112864
## LotArea 6.713e-01 1.214e-01 5.529 3.86e-08 ***
## LotShapeIR2 3.082e+03 5.394e+03 0.571 0.567924
## LotShapeIR3 -3.563e+04 1.079e+04 -3.301 0.000987 ***
## LotShapeReg -3.984e+02 2.085e+03 -0.191 0.848528
## LandContourHLS 2.506e+04 6.509e+03 3.851 0.000123 ***
## LandContourLow 2.056e+04 7.853e+03 2.618 0.008942 **
## LandContourLvl 1.636e+04 4.623e+03 3.538 0.000417 ***
## LotConfigCulDSac 8.088e+03 4.185e+03 1.933 0.053482 .
## LotConfigFR2 -5.291e+03 5.181e+03 -1.021 0.307316
## LotConfigFR3 -1.083e+04 1.651e+04 -0.656 0.511969
## LotConfigInside 3.137e+01 2.287e+03 0.014 0.989058
## LandSlopeMod 1.096e+04 4.961e+03 2.210 0.027271 *
## LandSlopeSev -1.578e+04 1.170e+04 -1.349 0.177630
## NeighborhoodBlueste 2.180e+03 2.431e+04 0.090 0.928555
## NeighborhoodBrDale 2.554e+03 1.372e+04 0.186 0.852289
## NeighborhoodBrkSide -1.035e+04 1.160e+04 -0.892 0.372621
## NeighborhoodClearCr -7.798e+03 1.140e+04 -0.684 0.494039
## NeighborhoodCollgCr -1.480e+04 8.987e+03 -1.647 0.099734 .
## NeighborhoodCrawfor 6.833e+03 1.068e+04 0.640 0.522272
## NeighborhoodEdwards -2.372e+04 9.827e+03 -2.413 0.015937 *
## NeighborhoodGilbert -1.776e+04 9.756e+03 -1.821 0.068906 .
## NeighborhoodIDOTRR -1.922e+04 1.328e+04 -1.448 0.147896
## NeighborhoodMeadowV -1.203e+04 1.396e+04 -0.861 0.389249
## NeighborhoodMitchel -2.424e+04 1.014e+04 -2.391 0.016958 *
## NeighborhoodNAmes -2.027e+04 9.614e+03 -2.108 0.035179 *
## NeighborhoodNoRidge 3.882e+04 1.032e+04 3.763 0.000175 ***
## NeighborhoodNPkVill 4.562e+03 1.787e+04 0.255 0.798528
## NeighborhoodNridgHt 3.930e+04 9.148e+03 4.296 1.86e-05 ***
## NeighborhoodNWAmes -2.384e+04 1.001e+04 -2.381 0.017396 *
## NeighborhoodOldTown -2.563e+04 1.190e+04 -2.154 0.031450 *
## NeighborhoodSawyer -1.545e+04 1.012e+04 -1.527 0.126943
## NeighborhoodSawyerW -8.704e+03 9.678e+03 -0.899 0.368658
## NeighborhoodSomerst -2.648e+03 1.114e+04 -0.238 0.812118
## NeighborhoodStoneBr 4.815e+04 1.044e+04 4.612 4.37e-06 ***
## NeighborhoodSWISU -1.794e+04 1.210e+04 -1.483 0.138239
## NeighborhoodTimber -4.441e+03 1.033e+04 -0.430 0.667404
## NeighborhoodVeenker 2.134e+04 1.312e+04 1.627 0.103995
## Condition1Feedr -5.298e+03 6.292e+03 -0.842 0.399916
## Condition1Norm 7.738e+03 5.197e+03 1.489 0.136777
## Condition1PosA 4.580e+03 1.260e+04 0.363 0.716337
## Condition1PosN 1.467e+04 9.372e+03 1.565 0.117864
## Condition1RRAe -2.396e+04 1.158e+04 -2.070 0.038687 *
## Condition1RRAn 1.133e+04 8.694e+03 1.303 0.192736
## Condition1RRNe -7.436e+03 2.304e+04 -0.323 0.746919
## Condition1RRNn 7.789e+03 1.623e+04 0.480 0.631440
## Condition2Feedr -3.011e+04 2.848e+04 -1.057 0.290667
## Condition2Norm -2.000e+04 2.436e+04 -0.821 0.411966
## Condition2PosA -2.021e+04 4.658e+04 -0.434 0.664396
## Condition2PosN -2.057e+05 3.442e+04 -5.975 2.94e-09 ***
## Condition2RRAe -3.665e+04 4.023e+04 -0.911 0.362567
## Condition2RRAn -3.415e+04 4.016e+04 -0.850 0.395291
## Condition2RRNn -1.689e+04 3.376e+04 -0.500 0.617025
## BldgType2fmCon 7.447e+03 1.530e+04 0.487 0.626653
## BldgTypeDuplex -9.971e+03 7.282e+03 -1.369 0.171124
## BldgTypeTwnhs -2.994e+04 1.246e+04 -2.403 0.016382 *
## BldgTypeTwnhsE -2.011e+04 1.114e+04 -1.805 0.071275 .
## HouseStyle1.5Unf 7.205e+03 9.553e+03 0.754 0.450844
## HouseStyle1Story 1.360e+04 4.316e+03 3.152 0.001660 **
## HouseStyle2.5Fin -1.523e+04 1.262e+04 -1.207 0.227614
## HouseStyle2.5Unf -8.911e+03 1.116e+04 -0.799 0.424679
## HouseStyle2Story -3.354e+03 3.852e+03 -0.871 0.384099
## HouseStyleSFoyer 2.525e+04 7.255e+03 3.481 0.000516 ***
## HouseStyleSLvl 1.354e+04 6.213e+03 2.179 0.029527 *
## OverallQual 1.267e+04 1.200e+03 10.555 < 2e-16 ***
## OverallCond 5.449e+03 1.046e+03 5.210 2.18e-07 ***
## Exterior1stAsphShn -4.097e+04 4.123e+04 -0.994 0.320442
## Exterior1stBrkComm -3.722e+04 3.400e+04 -1.095 0.273924
## Exterior1stBrkFace 1.482e+04 1.515e+04 0.979 0.327989
## Exterior1stCBlock 1.862e+03 3.426e+04 0.054 0.956661
## Exterior1stCemntBd 6.079e+03 2.375e+04 0.256 0.798055
## Exterior1stHdBoard -3.197e+03 1.512e+04 -0.211 0.832540
## Exterior1stImStucc -6.420e+04 3.575e+04 -1.796 0.072772 .
## Exterior1stMetalSd 2.818e+03 1.748e+04 0.161 0.871957
## Exterior1stPlywood 7.193e+02 1.485e+04 0.048 0.961380
## Exterior1stStone -1.561e+04 2.767e+04 -0.564 0.572890
## Exterior1stStucco -4.527e+03 1.682e+04 -0.269 0.787818
## Exterior1stVinylSd -1.357e+04 1.598e+04 -0.849 0.396109
## Exterior1stWd Sdng -6.157e+03 1.463e+04 -0.421 0.674010
## Exterior1stWdShing 6.430e+03 1.582e+04 0.406 0.684477
## Exterior2ndAsphShn 9.195e+03 2.687e+04 0.342 0.732202
## Exterior2ndBrk Cmn 1.366e+04 2.554e+04 0.535 0.592967
## Exterior2ndBrkFace 2.615e+03 1.623e+04 0.161 0.872043
## Exterior2ndCBlock NA NA NA NA
## Exterior2ndCmentBd 7.904e+03 2.384e+04 0.332 0.740304
## Exterior2ndHdBoard 1.077e+03 1.498e+04 0.072 0.942718
## Exterior2ndImStucc 3.606e+04 1.760e+04 2.049 0.040687 *
## Exterior2ndMetalSd 3.663e+02 1.753e+04 0.021 0.983331
## Exterior2ndOther 1.506e+04 3.529e+04 0.427 0.669736
## Exterior2ndPlywood 8.642e+02 1.446e+04 0.060 0.952346
## Exterior2ndStone -9.046e+03 2.086e+04 -0.434 0.664618
## Exterior2ndStucco -6.917e+03 1.659e+04 -0.417 0.676756
## Exterior2ndVinylSd 1.665e+04 1.587e+04 1.049 0.294155
## Exterior2ndWd Sdng 7.366e+03 1.455e+04 0.506 0.612871
## Exterior2ndWd Shng -3.772e+03 1.516e+04 -0.249 0.803613
## ExterQualFa -3.947e+04 1.232e+04 -3.203 0.001393 **
## ExterQualGd -4.951e+04 5.478e+03 -9.039 < 2e-16 ***
## ExterQualTA -4.967e+04 6.230e+03 -7.972 3.31e-15 ***
## ExterCondFa 9.502e+01 2.331e+04 0.004 0.996748
## ExterCondGd -4.339e+03 2.221e+04 -0.195 0.845175
## ExterCondPo -1.009e+04 3.940e+04 -0.256 0.797812
## ExterCondTA -1.954e+03 2.218e+04 -0.088 0.929812
## FoundationCBlock 3.729e+03 3.940e+03 0.946 0.344179
## FoundationPConc 7.952e+03 4.344e+03 1.830 0.067406 .
## FoundationSlab -9.945e+03 7.774e+03 -1.279 0.201043
## FoundationStone -8.599e+03 1.342e+04 -0.641 0.521741
## FoundationWood -1.791e+04 1.873e+04 -0.956 0.339058
## HeatingQCFa -5.481e+02 5.451e+03 -0.101 0.919920
## HeatingQCGd -4.563e+03 2.646e+03 -1.725 0.084792 .
## HeatingQCPo 1.439e+04 3.442e+04 0.418 0.675987
## HeatingQCTA -4.280e+03 2.610e+03 -1.640 0.101243
## CentralAirY -3.364e+03 4.516e+03 -0.745 0.456418
## GrLivArea 3.489e+04 2.103e+03 16.588 < 2e-16 ***
## TotRmsAbvGrd -6.577e+02 1.046e+03 -0.629 0.529585
## GarageArea 5.961e+03 1.171e+03 5.093 4.04e-07 ***
## AgeSold -9.305e+03 2.474e+03 -3.760 0.000177 ***
## AgeRemod -2.277e+03 1.339e+03 -1.700 0.089284 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30660 on 1338 degrees of freedom
## Multiple R-squared: 0.8634, Adjusted R-squared: 0.8511
## F-statistic: 69.91 on 121 and 1338 DF, p-value: < 2.2e-16
# Remove insignificant features and rebuild the model
# Removed Features: LotConfig, Exterior1st, Exterior2nd, ExterCond, HeatingQC
# The `+` that starts the HouseStyle and Foundation continuation lines follows
# a trailing `+`, so it parses as a unary plus (echoed as `+ +HouseStyle` in
# the Call below) and does not change the set of model terms.
hs.lm <- lm(SalePrice~MSSubClass + MSZoning + LotFrontage + LotArea + LotShape + LandContour + LandSlope + Neighborhood+ Condition1 + Condition2 + BldgType +
+ HouseStyle + OverallQual+ OverallCond + ExterQual +
+ Foundation + CentralAir + GrLivArea + TotRmsAbvGrd
+ GarageArea + AgeSold + AgeRemod, data = trn)
summary(hs.lm)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + MSZoning + LotFrontage +
## LotArea + LotShape + LandContour + LandSlope + Neighborhood +
## Condition1 + Condition2 + BldgType + +HouseStyle + OverallQual +
## OverallCond + ExterQual + +Foundation + CentralAir + GrLivArea +
## TotRmsAbvGrd + GarageArea + AgeSold + AgeRemod, data = trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -365214 -13095 -757 12322 261389
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.149e+05 3.290e+04 3.491 0.000496 ***
## MSSubClass -1.530e+02 1.020e+02 -1.500 0.133862
## MSZoningFV 2.440e+04 1.476e+04 1.654 0.098366 .
## MSZoningRH 1.287e+04 1.489e+04 0.864 0.387549
## MSZoningRL 1.992e+04 1.248e+04 1.596 0.110696
## MSZoningRM 2.054e+04 1.168e+04 1.759 0.078879 .
## LotFrontage -1.046e+02 5.032e+01 -2.079 0.037761 *
## LotArea 6.772e-01 1.192e-01 5.679 1.65e-08 ***
## LotShapeIR2 2.721e+03 5.345e+03 0.509 0.610711
## LotShapeIR3 -3.441e+04 1.081e+04 -3.183 0.001489 **
## LotShapeReg -9.790e+02 1.983e+03 -0.494 0.621647
## LandContourHLS 2.509e+04 6.422e+03 3.907 9.79e-05 ***
## LandContourLow 1.969e+04 7.732e+03 2.546 0.011004 *
## LandContourLvl 1.696e+04 4.553e+03 3.724 0.000204 ***
## LandSlopeMod 1.229e+04 4.854e+03 2.531 0.011486 *
## LandSlopeSev -1.217e+04 1.157e+04 -1.051 0.293307
## NeighborhoodBlueste 1.976e+03 2.416e+04 0.082 0.934825
## NeighborhoodBrDale 3.621e+03 1.340e+04 0.270 0.787003
## NeighborhoodBrkSide -1.007e+04 1.137e+04 -0.886 0.375920
## NeighborhoodClearCr -5.353e+03 1.121e+04 -0.478 0.633054
## NeighborhoodCollgCr -1.419e+04 8.957e+03 -1.585 0.113273
## NeighborhoodCrawfor 7.042e+03 1.046e+04 0.673 0.500824
## NeighborhoodEdwards -2.339e+04 9.709e+03 -2.409 0.016114 *
## NeighborhoodGilbert -1.853e+04 9.667e+03 -1.917 0.055481 .
## NeighborhoodIDOTRR -1.709e+04 1.298e+04 -1.317 0.188104
## NeighborhoodMeadowV 3.939e+02 1.270e+04 0.031 0.975261
## NeighborhoodMitchel -2.185e+04 1.000e+04 -2.184 0.029127 *
## NeighborhoodNAmes -1.944e+04 9.460e+03 -2.055 0.040077 *
## NeighborhoodNoRidge 3.868e+04 1.016e+04 3.807 0.000147 ***
## NeighborhoodNPkVill 1.058e+04 1.346e+04 0.786 0.431989
## NeighborhoodNridgHt 3.734e+04 9.079e+03 4.112 4.15e-05 ***
## NeighborhoodNWAmes -2.520e+04 9.797e+03 -2.572 0.010215 *
## NeighborhoodOldTown -2.452e+04 1.167e+04 -2.100 0.035912 *
## NeighborhoodSawyer -1.695e+04 9.985e+03 -1.698 0.089786 .
## NeighborhoodSawyerW -1.030e+04 9.494e+03 -1.085 0.278116
## NeighborhoodSomerst -1.231e+02 1.107e+04 -0.011 0.991130
## NeighborhoodStoneBr 4.894e+04 1.020e+04 4.797 1.78e-06 ***
## NeighborhoodSWISU -1.604e+04 1.201e+04 -1.336 0.181700
## NeighborhoodTimber -6.282e+03 1.021e+04 -0.615 0.538367
## NeighborhoodVeenker 2.394e+04 1.265e+04 1.892 0.058667 .
## Condition1Feedr -6.302e+03 6.187e+03 -1.019 0.308575
## Condition1Norm 6.751e+03 5.140e+03 1.313 0.189255
## Condition1PosA 6.200e+03 1.241e+04 0.500 0.617390
## Condition1PosN 1.572e+04 9.321e+03 1.686 0.091949 .
## Condition1RRAe -2.112e+04 1.110e+04 -1.903 0.057277 .
## Condition1RRAn 1.188e+04 8.598e+03 1.381 0.167373
## Condition1RRNe -7.284e+03 2.296e+04 -0.317 0.751150
## Condition1RRNn 9.337e+03 1.563e+04 0.597 0.550332
## Condition2Feedr -3.415e+04 2.819e+04 -1.211 0.225928
## Condition2Norm -1.836e+04 2.420e+04 -0.759 0.448155
## Condition2PosA -2.331e+04 4.094e+04 -0.570 0.569104
## Condition2PosN -1.988e+05 3.407e+04 -5.836 6.67e-09 ***
## Condition2RRAe -3.131e+04 4.010e+04 -0.781 0.435168
## Condition2RRAn -2.587e+04 3.983e+04 -0.650 0.516056
## Condition2RRNn -1.313e+04 3.339e+04 -0.393 0.694197
## BldgType2fmCon 1.345e+04 1.508e+04 0.892 0.372544
## BldgTypeDuplex -9.923e+03 7.173e+03 -1.383 0.166781
## BldgTypeTwnhs -2.756e+04 1.229e+04 -2.243 0.025086 *
## BldgTypeTwnhsE -1.630e+04 1.104e+04 -1.477 0.139966
## HouseStyle1.5Unf 1.071e+04 9.207e+03 1.163 0.244844
## HouseStyle1Story 1.263e+04 4.242e+03 2.978 0.002952 **
## HouseStyle2.5Fin -1.315e+04 1.249e+04 -1.053 0.292575
## HouseStyle2.5Unf -9.817e+03 1.099e+04 -0.893 0.371836
## HouseStyle2Story -3.857e+03 3.798e+03 -1.016 0.310030
## HouseStyleSFoyer 2.786e+04 7.171e+03 3.886 0.000107 ***
## HouseStyleSLvl 1.410e+04 6.123e+03 2.302 0.021461 *
## OverallQual 1.294e+04 1.176e+03 11.007 < 2e-16 ***
## OverallCond 5.651e+03 9.802e+02 5.765 1.01e-08 ***
## ExterQualFa -4.451e+04 1.157e+04 -3.848 0.000125 ***
## ExterQualGd -5.253e+04 5.338e+03 -9.842 < 2e-16 ***
## ExterQualTA -5.438e+04 6.074e+03 -8.953 < 2e-16 ***
## FoundationCBlock 2.795e+03 3.871e+03 0.722 0.470317
## FoundationPConc 7.753e+03 4.291e+03 1.807 0.070998 .
## FoundationSlab -8.395e+03 7.634e+03 -1.100 0.271644
## FoundationStone -1.061e+04 1.331e+04 -0.797 0.425667
## FoundationWood -1.843e+04 1.879e+04 -0.981 0.326766
## CentralAirY -2.649e+03 4.193e+03 -0.632 0.527638
## GrLivArea 3.562e+04 2.048e+03 17.395 < 2e-16 ***
## TotRmsAbvGrd -6.337e+02 1.038e+03 -0.610 0.541824
## GarageArea 5.684e+03 1.154e+03 4.924 9.53e-07 ***
## AgeSold -1.086e+04 2.355e+03 -4.612 4.36e-06 ***
## AgeRemod -2.021e+03 1.292e+03 -1.563 0.118170
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30850 on 1378 degrees of freedom
## Multiple R-squared: 0.8575, Adjusted R-squared: 0.8492
## F-statistic: 102.4 on 81 and 1378 DF, p-value: < 2.2e-16
# Backward Elimination Process
# Refit the model without TotRmsAbvGrd, whose coefficient has p = 0.54 in the
# summary of the previous fit; `.~. - term` keeps the response and all other
# terms unchanged.
hs.lm <- update(hs.lm, .~. - TotRmsAbvGrd, data = trn)
summary(hs.lm)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + MSZoning + LotFrontage +
## LotArea + LotShape + LandContour + LandSlope + Neighborhood +
## Condition1 + Condition2 + BldgType + HouseStyle + OverallQual +
## OverallCond + ExterQual + Foundation + CentralAir + GrLivArea +
## GarageArea + AgeSold + AgeRemod, data = trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -362823 -13089 -760 12351 263131
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.095e+05 3.168e+04 3.455 0.000567 ***
## MSSubClass -1.540e+02 1.020e+02 -1.510 0.131259
## MSZoningFV 2.489e+04 1.473e+04 1.690 0.091344 .
## MSZoningRH 1.283e+04 1.489e+04 0.862 0.388742
## MSZoningRL 2.004e+04 1.248e+04 1.607 0.108360
## MSZoningRM 2.061e+04 1.168e+04 1.765 0.077865 .
## LotFrontage -1.041e+02 5.030e+01 -2.070 0.038635 *
## LotArea 6.763e-01 1.192e-01 5.673 1.71e-08 ***
## LotShapeIR2 2.812e+03 5.341e+03 0.526 0.598710
## LotShapeIR3 -3.411e+04 1.080e+04 -3.159 0.001618 **
## LotShapeReg -9.795e+02 1.983e+03 -0.494 0.621409
## LandContourHLS 2.507e+04 6.421e+03 3.905 9.87e-05 ***
## LandContourLow 1.982e+04 7.728e+03 2.564 0.010445 *
## LandContourLvl 1.702e+04 4.551e+03 3.739 0.000192 ***
## LandSlopeMod 1.249e+04 4.842e+03 2.580 0.009970 **
## LandSlopeSev -1.155e+04 1.152e+04 -1.002 0.316616
## NeighborhoodBlueste 2.820e+03 2.412e+04 0.117 0.906942
## NeighborhoodBrDale 4.024e+03 1.338e+04 0.301 0.763639
## NeighborhoodBrkSide -8.817e+03 1.118e+04 -0.789 0.430425
## NeighborhoodClearCr -4.470e+03 1.111e+04 -0.402 0.687600
## NeighborhoodCollgCr -1.333e+04 8.841e+03 -1.507 0.131996
## NeighborhoodCrawfor 8.072e+03 1.032e+04 0.782 0.434127
## NeighborhoodEdwards -2.244e+04 9.580e+03 -2.342 0.019318 *
## NeighborhoodGilbert -1.767e+04 9.563e+03 -1.848 0.064778 .
## NeighborhoodIDOTRR -1.603e+04 1.286e+04 -1.247 0.212637
## NeighborhoodMeadowV 1.039e+03 1.265e+04 0.082 0.934600
## NeighborhoodMitchel -2.093e+04 9.887e+03 -2.117 0.034431 *
## NeighborhoodNAmes -1.858e+04 9.352e+03 -1.987 0.047162 *
## NeighborhoodNoRidge 3.996e+04 9.936e+03 4.022 6.08e-05 ***
## NeighborhoodNPkVill 1.085e+04 1.345e+04 0.807 0.419914
## NeighborhoodNridgHt 3.784e+04 9.039e+03 4.186 3.02e-05 ***
## NeighborhoodNWAmes -2.444e+04 9.715e+03 -2.515 0.012005 *
## NeighborhoodOldTown -2.333e+04 1.151e+04 -2.027 0.042851 *
## NeighborhoodSawyer -1.608e+04 9.881e+03 -1.628 0.103809
## NeighborhoodSawyerW -9.408e+03 9.379e+03 -1.003 0.315985
## NeighborhoodSomerst 4.873e+02 1.102e+04 0.044 0.964737
## NeighborhoodStoneBr 4.968e+04 1.013e+04 4.906 1.04e-06 ***
## NeighborhoodSWISU -1.483e+04 1.184e+04 -1.253 0.210458
## NeighborhoodTimber -5.533e+03 1.013e+04 -0.546 0.584999
## NeighborhoodVeenker 2.488e+04 1.255e+04 1.982 0.047629 *
## Condition1Feedr -6.363e+03 6.185e+03 -1.029 0.303756
## Condition1Norm 6.714e+03 5.138e+03 1.307 0.191542
## Condition1PosA 6.217e+03 1.241e+04 0.501 0.616316
## Condition1PosN 1.550e+04 9.312e+03 1.665 0.096159 .
## Condition1RRAe -2.132e+04 1.109e+04 -1.922 0.054807 .
## Condition1RRAn 1.169e+04 8.590e+03 1.360 0.173933
## Condition1RRNe -7.121e+03 2.296e+04 -0.310 0.756473
## Condition1RRNn 9.201e+03 1.562e+04 0.589 0.556034
## Condition2Feedr -3.423e+04 2.818e+04 -1.215 0.224656
## Condition2Norm -1.865e+04 2.419e+04 -0.771 0.440722
## Condition2PosA -2.266e+04 4.091e+04 -0.554 0.579743
## Condition2PosN -1.975e+05 3.399e+04 -5.810 7.75e-09 ***
## Condition2RRAe -3.268e+04 4.003e+04 -0.816 0.414388
## Condition2RRAn -2.609e+04 3.982e+04 -0.655 0.512366
## Condition2RRNn -1.327e+04 3.338e+04 -0.398 0.691026
## BldgType2fmCon 1.323e+04 1.507e+04 0.878 0.380367
## BldgTypeDuplex -1.060e+04 7.086e+03 -1.496 0.134964
## BldgTypeTwnhs -2.679e+04 1.222e+04 -2.191 0.028584 *
## BldgTypeTwnhsE -1.544e+04 1.095e+04 -1.411 0.158589
## HouseStyle1.5Unf 1.097e+04 9.195e+03 1.193 0.233043
## HouseStyle1Story 1.273e+04 4.238e+03 3.004 0.002716 **
## HouseStyle2.5Fin -1.380e+04 1.244e+04 -1.110 0.267307
## HouseStyle2.5Unf -1.036e+04 1.095e+04 -0.946 0.344377
## HouseStyle2Story -4.005e+03 3.789e+03 -1.057 0.290812
## HouseStyleSFoyer 2.824e+04 7.143e+03 3.953 8.10e-05 ***
## HouseStyleSLvl 1.412e+04 6.122e+03 2.307 0.021224 *
## OverallQual 1.297e+04 1.175e+03 11.045 < 2e-16 ***
## OverallCond 5.666e+03 9.796e+02 5.784 9.02e-09 ***
## ExterQualFa -4.422e+04 1.156e+04 -3.827 0.000136 ***
## ExterQualGd -5.228e+04 5.321e+03 -9.826 < 2e-16 ***
## ExterQualTA -5.415e+04 6.061e+03 -8.935 < 2e-16 ***
## FoundationCBlock 2.979e+03 3.858e+03 0.772 0.440104
## FoundationPConc 7.818e+03 4.289e+03 1.823 0.068546 .
## FoundationSlab -8.208e+03 7.626e+03 -1.076 0.281947
## FoundationStone -1.084e+04 1.330e+04 -0.815 0.415448
## FoundationWood -1.808e+04 1.877e+04 -0.963 0.335615
## CentralAirY -2.793e+03 4.186e+03 -0.667 0.504664
## GrLivArea 3.485e+04 1.611e+03 21.635 < 2e-16 ***
## GarageArea 5.720e+03 1.153e+03 4.963 7.82e-07 ***
## AgeSold -1.093e+04 2.351e+03 -4.650 3.64e-06 ***
## AgeRemod -2.004e+03 1.292e+03 -1.551 0.121019
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30850 on 1379 degrees of freedom
## Multiple R-squared: 0.8575, Adjusted R-squared: 0.8492
## F-statistic: 103.7 on 80 and 1379 DF, p-value: < 2.2e-16
# Next elimination step: refit without CentralAir, whose coefficient
# (CentralAirY) has p = 0.50 in the summary of the previous fit.
hs.lm <- update(hs.lm, .~. - CentralAir, data = trn)
summary(hs.lm)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + MSZoning + LotFrontage +
## LotArea + LotShape + LandContour + LandSlope + Neighborhood +
## Condition1 + Condition2 + BldgType + HouseStyle + OverallQual +
## OverallCond + ExterQual + Foundation + GrLivArea + GarageArea +
## AgeSold + AgeRemod, data = trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -362618 -13168 -864 12332 263313
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.067e+05 3.141e+04 3.398 0.000698 ***
## MSSubClass -1.510e+02 1.018e+02 -1.483 0.138322
## MSZoningFV 2.443e+04 1.471e+04 1.661 0.097005 .
## MSZoningRH 1.289e+04 1.488e+04 0.866 0.386660
## MSZoningRL 1.970e+04 1.246e+04 1.581 0.114138
## MSZoningRM 2.029e+04 1.167e+04 1.739 0.082289 .
## LotFrontage -1.038e+02 5.029e+01 -2.064 0.039164 *
## LotArea 6.745e-01 1.192e-01 5.660 1.83e-08 ***
## LotShapeIR2 2.814e+03 5.340e+03 0.527 0.598384
## LotShapeIR3 -3.385e+04 1.079e+04 -3.138 0.001738 **
## LotShapeReg -9.988e+02 1.982e+03 -0.504 0.614434
## LandContourHLS 2.500e+04 6.418e+03 3.895 0.000103 ***
## LandContourLow 1.965e+04 7.722e+03 2.545 0.011050 *
## LandContourLvl 1.698e+04 4.550e+03 3.732 0.000197 ***
## LandSlopeMod 1.251e+04 4.840e+03 2.584 0.009860 **
## LandSlopeSev -1.151e+04 1.152e+04 -0.999 0.318164
## NeighborhoodBlueste 2.737e+03 2.411e+04 0.113 0.909653
## NeighborhoodBrDale 3.731e+03 1.337e+04 0.279 0.780207
## NeighborhoodBrkSide -8.982e+03 1.117e+04 -0.804 0.421638
## NeighborhoodClearCr -4.619e+03 1.111e+04 -0.416 0.677624
## NeighborhoodCollgCr -1.342e+04 8.838e+03 -1.519 0.129085
## NeighborhoodCrawfor 7.856e+03 1.031e+04 0.762 0.446238
## NeighborhoodEdwards -2.248e+04 9.578e+03 -2.347 0.019062 *
## NeighborhoodGilbert -1.779e+04 9.559e+03 -1.861 0.062911 .
## NeighborhoodIDOTRR -1.588e+04 1.285e+04 -1.235 0.216921
## NeighborhoodMeadowV 7.465e+02 1.264e+04 0.059 0.952928
## NeighborhoodMitchel -2.116e+04 9.880e+03 -2.141 0.032418 *
## NeighborhoodNAmes -1.886e+04 9.341e+03 -2.019 0.043669 *
## NeighborhoodNoRidge 3.991e+04 9.933e+03 4.018 6.18e-05 ***
## NeighborhoodNPkVill 1.071e+04 1.345e+04 0.797 0.425862
## NeighborhoodNridgHt 3.785e+04 9.037e+03 4.189 2.98e-05 ***
## NeighborhoodNWAmes -2.456e+04 9.711e+03 -2.529 0.011537 *
## NeighborhoodOldTown -2.353e+04 1.150e+04 -2.046 0.040940 *
## NeighborhoodSawyer -1.635e+04 9.870e+03 -1.657 0.097760 .
## NeighborhoodSawyerW -9.657e+03 9.369e+03 -1.031 0.302878
## NeighborhoodSomerst 5.437e+02 1.102e+04 0.049 0.960653
## NeighborhoodStoneBr 4.964e+04 1.013e+04 4.903 1.06e-06 ***
## NeighborhoodSWISU -1.482e+04 1.184e+04 -1.252 0.210706
## NeighborhoodTimber -5.612e+03 1.013e+04 -0.554 0.579556
## NeighborhoodVeenker 2.488e+04 1.255e+04 1.982 0.047628 *
## Condition1Feedr -6.555e+03 6.177e+03 -1.061 0.288750
## Condition1Norm 6.635e+03 5.136e+03 1.292 0.196621
## Condition1PosA 6.275e+03 1.240e+04 0.506 0.612967
## Condition1PosN 1.563e+04 9.309e+03 1.679 0.093357 .
## Condition1RRAe -2.120e+04 1.109e+04 -1.912 0.056105 .
## Condition1RRAn 1.157e+04 8.587e+03 1.348 0.177971
## Condition1RRNe -7.128e+03 2.295e+04 -0.311 0.756201
## Condition1RRNn 9.162e+03 1.562e+04 0.586 0.557636
## Condition2Feedr -3.225e+04 2.802e+04 -1.151 0.249954
## Condition2Norm -1.686e+04 2.403e+04 -0.702 0.483091
## Condition2PosA -2.238e+04 4.090e+04 -0.547 0.584295
## Condition2PosN -1.957e+05 3.388e+04 -5.777 9.39e-09 ***
## Condition2RRAe -3.080e+04 3.992e+04 -0.772 0.440542
## Condition2RRAn -2.464e+04 3.975e+04 -0.620 0.535441
## Condition2RRNn -1.038e+04 3.309e+04 -0.314 0.753838
## BldgType2fmCon 1.360e+04 1.506e+04 0.903 0.366442
## BldgTypeDuplex -1.034e+04 7.074e+03 -1.462 0.143939
## BldgTypeTwnhs -2.715e+04 1.221e+04 -2.224 0.026334 *
## BldgTypeTwnhsE -1.580e+04 1.093e+04 -1.446 0.148466
## HouseStyle1.5Unf 1.148e+04 9.162e+03 1.252 0.210601
## HouseStyle1Story 1.305e+04 4.211e+03 3.098 0.001986 **
## HouseStyle2.5Fin -1.401e+04 1.243e+04 -1.127 0.259951
## HouseStyle2.5Unf -9.121e+03 1.079e+04 -0.845 0.398116
## HouseStyle2Story -3.734e+03 3.767e+03 -0.991 0.321720
## HouseStyleSFoyer 2.832e+04 7.140e+03 3.966 7.69e-05 ***
## HouseStyleSLvl 1.428e+04 6.116e+03 2.335 0.019676 *
## OverallQual 1.292e+04 1.172e+03 11.027 < 2e-16 ***
## OverallCond 5.541e+03 9.612e+02 5.764 1.01e-08 ***
## ExterQualFa -4.404e+04 1.155e+04 -3.812 0.000144 ***
## ExterQualGd -5.242e+04 5.316e+03 -9.861 < 2e-16 ***
## ExterQualTA -5.447e+04 6.041e+03 -9.018 < 2e-16 ***
## FoundationCBlock 2.722e+03 3.838e+03 0.709 0.478308
## FoundationPConc 7.587e+03 4.274e+03 1.775 0.076086 .
## FoundationSlab -8.012e+03 7.619e+03 -1.052 0.293176
## FoundationStone -1.034e+04 1.328e+04 -0.779 0.436404
## FoundationWood -1.807e+04 1.877e+04 -0.963 0.335779
## GrLivArea 3.482e+04 1.610e+03 21.629 < 2e-16 ***
## GarageArea 5.677e+03 1.151e+03 4.934 9.02e-07 ***
## AgeSold -1.069e+04 2.322e+03 -4.603 4.55e-06 ***
## AgeRemod -1.962e+03 1.290e+03 -1.521 0.128540
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30840 on 1380 degrees of freedom
## Multiple R-squared: 0.8574, Adjusted R-squared: 0.8493
## F-statistic: 105.1 on 79 and 1380 DF, p-value: < 2.2e-16
# Backward-elimination step: remove the Foundation term from the model formula
# and refit on trn. In the summary printed above, none of the Foundation levels
# has a p-value below 0.05.
hs.lm <- update(hs.lm, formula. = . ~ . - Foundation, data = trn)
# Print the coefficient table and fit statistics of the refitted model.
summary(hs.lm)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + MSZoning + LotFrontage +
## LotArea + LotShape + LandContour + LandSlope + Neighborhood +
## Condition1 + Condition2 + BldgType + HouseStyle + OverallQual +
## OverallCond + ExterQual + GrLivArea + GarageArea + AgeSold +
## AgeRemod, data = trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -361478 -13279 -700 12134 263181
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.090e+05 3.131e+04 3.481 0.000515 ***
## MSSubClass -1.599e+02 1.010e+02 -1.583 0.113621
## MSZoningFV 2.574e+04 1.465e+04 1.757 0.079215 .
## MSZoningRH 1.399e+04 1.482e+04 0.944 0.345222
## MSZoningRL 2.106e+04 1.238e+04 1.701 0.089252 .
## MSZoningRM 2.122e+04 1.159e+04 1.831 0.067280 .
## LotFrontage -1.082e+02 5.000e+01 -2.164 0.030619 *
## LotArea 6.738e-01 1.187e-01 5.677 1.66e-08 ***
## LotShapeIR2 2.331e+03 5.312e+03 0.439 0.660944
## LotShapeIR3 -3.384e+04 1.079e+04 -3.136 0.001751 **
## LotShapeReg -8.739e+02 1.983e+03 -0.441 0.659509
## LandContourHLS 2.587e+04 6.393e+03 4.046 5.49e-05 ***
## LandContourLow 1.973e+04 7.708e+03 2.559 0.010589 *
## LandContourLvl 1.711e+04 4.549e+03 3.760 0.000177 ***
## LandSlopeMod 1.207e+04 4.835e+03 2.497 0.012627 *
## LandSlopeSev -1.092e+04 1.150e+04 -0.950 0.342510
## NeighborhoodBlueste 6.399e+02 2.403e+04 0.027 0.978760
## NeighborhoodBrDale 2.158e+03 1.319e+04 0.164 0.870031
## NeighborhoodBrkSide -8.318e+03 1.118e+04 -0.744 0.457099
## NeighborhoodClearCr -4.278e+03 1.109e+04 -0.386 0.699742
## NeighborhoodCollgCr -1.280e+04 8.848e+03 -1.446 0.148303
## NeighborhoodCrawfor 8.467e+03 1.032e+04 0.821 0.411975
## NeighborhoodEdwards -2.230e+04 9.582e+03 -2.327 0.020101 *
## NeighborhoodGilbert -1.720e+04 9.562e+03 -1.798 0.072349 .
## NeighborhoodIDOTRR -1.388e+04 1.282e+04 -1.083 0.279113
## NeighborhoodMeadowV 8.211e+02 1.258e+04 0.065 0.947970
## NeighborhoodMitchel -2.128e+04 9.875e+03 -2.155 0.031319 *
## NeighborhoodNAmes -1.927e+04 9.273e+03 -2.078 0.037912 *
## NeighborhoodNoRidge 4.103e+04 9.939e+03 4.129 3.87e-05 ***
## NeighborhoodNPkVill 8.516e+03 1.334e+04 0.638 0.523262
## NeighborhoodNridgHt 3.817e+04 9.049e+03 4.218 2.62e-05 ***
## NeighborhoodNWAmes -2.565e+04 9.633e+03 -2.663 0.007840 **
## NeighborhoodOldTown -2.257e+04 1.150e+04 -1.963 0.049895 *
## NeighborhoodSawyer -1.663e+04 9.823e+03 -1.693 0.090605 .
## NeighborhoodSawyerW -1.021e+04 9.374e+03 -1.090 0.276086
## NeighborhoodSomerst 9.093e+02 1.103e+04 0.082 0.934325
## NeighborhoodStoneBr 4.966e+04 1.013e+04 4.901 1.07e-06 ***
## NeighborhoodSWISU -1.280e+04 1.182e+04 -1.083 0.279042
## NeighborhoodTimber -6.628e+03 1.011e+04 -0.655 0.512383
## NeighborhoodVeenker 2.444e+04 1.255e+04 1.948 0.051636 .
## Condition1Feedr -7.155e+03 6.182e+03 -1.157 0.247284
## Condition1Norm 5.444e+03 5.123e+03 1.063 0.288166
## Condition1PosA 5.176e+03 1.241e+04 0.417 0.676737
## Condition1PosN 1.458e+04 9.316e+03 1.565 0.117841
## Condition1RRAe -2.259e+04 1.109e+04 -2.037 0.041871 *
## Condition1RRAn 1.017e+04 8.577e+03 1.186 0.235852
## Condition1RRNe -6.075e+03 2.297e+04 -0.264 0.791438
## Condition1RRNn 8.254e+03 1.563e+04 0.528 0.597422
## Condition2Feedr -3.099e+04 2.805e+04 -1.105 0.269517
## Condition2Norm -1.644e+04 2.406e+04 -0.683 0.494607
## Condition2PosA -2.377e+04 4.096e+04 -0.580 0.561836
## Condition2PosN -1.950e+05 3.393e+04 -5.748 1.11e-08 ***
## Condition2RRAe -2.888e+04 3.996e+04 -0.723 0.469919
## Condition2RRAn -2.210e+04 3.978e+04 -0.556 0.578598
## Condition2RRNn -1.163e+04 3.313e+04 -0.351 0.725677
## BldgType2fmCon 1.477e+04 1.493e+04 0.989 0.322834
## BldgTypeDuplex -1.243e+04 6.972e+03 -1.783 0.074744 .
## BldgTypeTwnhs -2.616e+04 1.213e+04 -2.157 0.031141 *
## BldgTypeTwnhsE -1.447e+04 1.085e+04 -1.334 0.182312
## HouseStyle1.5Unf 1.155e+04 9.156e+03 1.261 0.207352
## HouseStyle1Story 1.332e+04 4.191e+03 3.178 0.001516 **
## HouseStyle2.5Fin -1.229e+04 1.242e+04 -0.990 0.322463
## HouseStyle2.5Unf -8.255e+03 1.079e+04 -0.765 0.444416
## HouseStyle2Story -2.829e+03 3.736e+03 -0.757 0.449058
## HouseStyleSFoyer 2.877e+04 7.091e+03 4.057 5.26e-05 ***
## HouseStyleSLvl 1.516e+04 6.059e+03 2.502 0.012474 *
## OverallQual 1.324e+04 1.154e+03 11.468 < 2e-16 ***
## OverallCond 5.359e+03 9.454e+02 5.669 1.75e-08 ***
## ExterQualFa -4.452e+04 1.151e+04 -3.868 0.000115 ***
## ExterQualGd -5.180e+04 5.312e+03 -9.751 < 2e-16 ***
## ExterQualTA -5.406e+04 6.020e+03 -8.981 < 2e-16 ***
## GrLivArea 3.474e+04 1.608e+03 21.598 < 2e-16 ***
## GarageArea 5.646e+03 1.148e+03 4.919 9.72e-07 ***
## AgeSold -1.238e+04 2.133e+03 -5.804 8.03e-09 ***
## AgeRemod -2.270e+03 1.274e+03 -1.782 0.075022 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30890 on 1385 degrees of freedom
## Multiple R-squared: 0.8565, Adjusted R-squared: 0.8488
## F-statistic: 111.7 on 74 and 1385 DF, p-value: < 2.2e-16
# Backward-elimination step: remove the MSZoning term from the model formula
# and refit on trn. In the summary printed above, none of the MSZoning levels
# has a p-value below 0.05.
hs.lm <- update(hs.lm, formula. = . ~ . - MSZoning, data = trn)
# Print the coefficient table and fit statistics of the refitted model.
summary(hs.lm)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea +
## LotShape + LandContour + LandSlope + Neighborhood + Condition1 +
## Condition2 + BldgType + HouseStyle + OverallQual + OverallCond +
## ExterQual + GrLivArea + GarageArea + AgeSold + AgeRemod,
## data = trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -361183 -13393 -789 12437 263162
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.280e+05 2.798e+04 4.575 5.19e-06 ***
## MSSubClass -1.506e+02 1.008e+02 -1.494 0.135450
## LotFrontage -1.080e+02 4.996e+01 -2.161 0.030850 *
## LotArea 6.735e-01 1.187e-01 5.676 1.68e-08 ***
## LotShapeIR2 2.441e+03 5.309e+03 0.460 0.645772
## LotShapeIR3 -3.355e+04 1.078e+04 -3.111 0.001900 **
## LotShapeReg -8.843e+02 1.971e+03 -0.449 0.653681
## LandContourHLS 2.487e+04 6.372e+03 3.903 9.96e-05 ***
## LandContourLow 1.866e+04 7.677e+03 2.430 0.015212 *
## LandContourLvl 1.675e+04 4.541e+03 3.690 0.000233 ***
## LandSlopeMod 1.187e+04 4.830e+03 2.457 0.014116 *
## LandSlopeSev -1.011e+04 1.149e+04 -0.880 0.379247
## NeighborhoodBlueste 6.673e+02 2.360e+04 0.028 0.977442
## NeighborhoodBrDale 2.141e+03 1.231e+04 0.174 0.861962
## NeighborhoodBrkSide -7.324e+03 1.036e+04 -0.707 0.479832
## NeighborhoodClearCr -3.685e+03 1.097e+04 -0.336 0.737013
## NeighborhoodCollgCr -1.258e+04 8.714e+03 -1.443 0.149200
## NeighborhoodCrawfor 8.742e+03 1.015e+04 0.861 0.389227
## NeighborhoodEdwards -2.188e+04 9.390e+03 -2.330 0.019949 *
## NeighborhoodGilbert -1.710e+04 9.456e+03 -1.809 0.070699 .
## NeighborhoodIDOTRR -1.771e+04 1.088e+04 -1.628 0.103790
## NeighborhoodMeadowV 1.165e+03 1.165e+04 0.100 0.920326
## NeighborhoodMitchel -2.073e+04 9.702e+03 -2.137 0.032812 *
## NeighborhoodNAmes -1.879e+04 9.125e+03 -2.059 0.039720 *
## NeighborhoodNoRidge 4.124e+04 9.853e+03 4.185 3.03e-05 ***
## NeighborhoodNPkVill 8.540e+03 1.333e+04 0.641 0.521880
## NeighborhoodNridgHt 3.833e+04 9.007e+03 4.255 2.23e-05 ***
## NeighborhoodNWAmes -2.529e+04 9.500e+03 -2.662 0.007852 **
## NeighborhoodOldTown -2.184e+04 1.001e+04 -2.182 0.029273 *
## NeighborhoodSawyer -1.610e+04 9.666e+03 -1.665 0.096094 .
## NeighborhoodSawyerW -1.048e+04 9.276e+03 -1.129 0.258975
## NeighborhoodSomerst 4.495e+03 8.783e+03 0.512 0.608872
## NeighborhoodStoneBr 4.992e+04 1.012e+04 4.934 9.03e-07 ***
## NeighborhoodSWISU -1.367e+04 1.162e+04 -1.176 0.239641
## NeighborhoodTimber -6.213e+03 1.001e+04 -0.621 0.534987
## NeighborhoodVeenker 2.484e+04 1.250e+04 1.987 0.047109 *
## Condition1Feedr -7.698e+03 6.169e+03 -1.248 0.212343
## Condition1Norm 5.347e+03 5.117e+03 1.045 0.296206
## Condition1PosA 5.150e+03 1.241e+04 0.415 0.678253
## Condition1PosN 1.452e+04 9.311e+03 1.560 0.119059
## Condition1RRAe -2.206e+04 1.105e+04 -1.995 0.046205 *
## Condition1RRAn 9.903e+03 8.522e+03 1.162 0.245408
## Condition1RRNe -5.888e+03 2.295e+04 -0.257 0.797597
## Condition1RRNn 8.569e+03 1.538e+04 0.557 0.577405
## Condition2Feedr -3.342e+04 2.777e+04 -1.203 0.229090
## Condition2Norm -1.638e+04 2.388e+04 -0.686 0.492907
## Condition2PosA -2.236e+04 4.083e+04 -0.548 0.584061
## Condition2PosN -1.950e+05 3.379e+04 -5.771 9.70e-09 ***
## Condition2RRAe -2.816e+04 3.987e+04 -0.706 0.480090
## Condition2RRAn -2.156e+04 3.968e+04 -0.543 0.586896
## Condition2RRNn -1.082e+04 3.283e+04 -0.330 0.741767
## BldgType2fmCon 1.340e+04 1.489e+04 0.900 0.368439
## BldgTypeDuplex -1.285e+04 6.942e+03 -1.851 0.064341 .
## BldgTypeTwnhs -2.683e+04 1.204e+04 -2.229 0.025981 *
## BldgTypeTwnhsE -1.523e+04 1.072e+04 -1.421 0.155495
## HouseStyle1.5Unf 1.142e+04 9.125e+03 1.251 0.211085
## HouseStyle1Story 1.338e+04 4.188e+03 3.196 0.001423 **
## HouseStyle2.5Fin -1.217e+04 1.241e+04 -0.980 0.327259
## HouseStyle2.5Unf -9.918e+03 1.075e+04 -0.922 0.356463
## HouseStyle2Story -2.868e+03 3.717e+03 -0.772 0.440526
## HouseStyleSFoyer 2.816e+04 7.073e+03 3.981 7.23e-05 ***
## HouseStyleSLvl 1.475e+04 6.047e+03 2.439 0.014853 *
## OverallQual 1.341e+04 1.148e+03 11.675 < 2e-16 ***
## OverallCond 5.451e+03 9.442e+02 5.773 9.60e-09 ***
## ExterQualFa -4.729e+04 1.141e+04 -4.145 3.61e-05 ***
## ExterQualGd -5.166e+04 5.310e+03 -9.729 < 2e-16 ***
## ExterQualTA -5.387e+04 6.016e+03 -8.955 < 2e-16 ***
## GrLivArea 3.465e+04 1.607e+03 21.562 < 2e-16 ***
## GarageArea 5.470e+03 1.144e+03 4.783 1.92e-06 ***
## AgeSold -1.260e+04 2.119e+03 -5.947 3.46e-09 ***
## AgeRemod -2.265e+03 1.274e+03 -1.778 0.075659 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30890 on 1389 degrees of freedom
## Multiple R-squared: 0.856, Adjusted R-squared: 0.8488
## F-statistic: 118 on 70 and 1389 DF, p-value: < 2.2e-16
# Backward-elimination step: remove the MSSubClass term from the model formula
# and refit on trn. Its p-value in the summary printed above is about 0.135.
hs.lm <- update(hs.lm, formula. = . ~ . - MSSubClass, data = trn)
# Print the coefficient table and fit statistics of the refitted model.
summary(hs.lm)
##
## Call:
## lm(formula = SalePrice ~ LotFrontage + LotArea + LotShape + LandContour +
## LandSlope + Neighborhood + Condition1 + Condition2 + BldgType +
## HouseStyle + OverallQual + OverallCond + ExterQual + GrLivArea +
## GarageArea + AgeSold + AgeRemod, data = trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -362056 -13274 -649 12645 263281
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.203e+05 2.751e+04 4.373 1.32e-05 ***
## LotFrontage -1.075e+02 4.998e+01 -2.151 0.031628 *
## LotArea 6.808e-01 1.186e-01 5.740 1.16e-08 ***
## LotShapeIR2 2.513e+03 5.311e+03 0.473 0.636140
## LotShapeIR3 -3.357e+04 1.079e+04 -3.113 0.001892 **
## LotShapeReg -9.131e+02 1.971e+03 -0.463 0.643322
## LandContourHLS 2.436e+04 6.366e+03 3.827 0.000135 ***
## LandContourLow 1.742e+04 7.635e+03 2.281 0.022691 *
## LandContourLvl 1.657e+04 4.541e+03 3.648 0.000274 ***
## LandSlopeMod 1.210e+04 4.829e+03 2.505 0.012356 *
## LandSlopeSev -9.685e+03 1.150e+04 -0.843 0.399642
## NeighborhoodBlueste 6.686e+02 2.361e+04 0.028 0.977407
## NeighborhoodBrDale 2.335e+03 1.231e+04 0.190 0.849616
## NeighborhoodBrkSide -7.579e+03 1.037e+04 -0.731 0.464806
## NeighborhoodClearCr -4.482e+03 1.096e+04 -0.409 0.682711
## NeighborhoodCollgCr -1.248e+04 8.718e+03 -1.432 0.152402
## NeighborhoodCrawfor 8.656e+03 1.015e+04 0.852 0.394133
## NeighborhoodEdwards -2.175e+04 9.394e+03 -2.315 0.020764 *
## NeighborhoodGilbert -1.719e+04 9.460e+03 -1.817 0.069454 .
## NeighborhoodIDOTRR -1.796e+04 1.088e+04 -1.650 0.099131 .
## NeighborhoodMeadowV 8.678e+02 1.165e+04 0.074 0.940634
## NeighborhoodMitchel -2.066e+04 9.706e+03 -2.129 0.033461 *
## NeighborhoodNAmes -1.834e+04 9.125e+03 -2.010 0.044655 *
## NeighborhoodNoRidge 4.130e+04 9.858e+03 4.190 2.97e-05 ***
## NeighborhoodNPkVill 8.687e+03 1.334e+04 0.651 0.514922
## NeighborhoodNridgHt 3.836e+04 9.011e+03 4.257 2.21e-05 ***
## NeighborhoodNWAmes -2.506e+04 9.503e+03 -2.637 0.008457 **
## NeighborhoodOldTown -2.144e+04 1.001e+04 -2.142 0.032385 *
## NeighborhoodSawyer -1.593e+04 9.669e+03 -1.647 0.099741 .
## NeighborhoodSawyerW -1.039e+04 9.280e+03 -1.120 0.262891
## NeighborhoodSomerst 4.473e+03 8.787e+03 0.509 0.610763
## NeighborhoodStoneBr 4.952e+04 1.012e+04 4.894 1.10e-06 ***
## NeighborhoodSWISU -1.369e+04 1.163e+04 -1.178 0.239084
## NeighborhoodTimber -5.886e+03 1.001e+04 -0.588 0.556824
## NeighborhoodVeenker 2.501e+04 1.251e+04 2.000 0.045725 *
## Condition1Feedr -7.065e+03 6.157e+03 -1.147 0.251394
## Condition1Norm 5.715e+03 5.113e+03 1.118 0.263865
## Condition1PosA 5.685e+03 1.241e+04 0.458 0.647012
## Condition1PosN 1.508e+04 9.307e+03 1.620 0.105403
## Condition1RRAe -2.193e+04 1.106e+04 -1.983 0.047571 *
## Condition1RRAn 1.009e+04 8.525e+03 1.184 0.236631
## Condition1RRNe -5.579e+03 2.296e+04 -0.243 0.808066
## Condition1RRNn 8.043e+03 1.538e+04 0.523 0.601070
## Condition2Feedr -3.412e+04 2.778e+04 -1.228 0.219548
## Condition2Norm -1.652e+04 2.389e+04 -0.692 0.489346
## Condition2PosA -2.241e+04 4.085e+04 -0.549 0.583296
## Condition2PosN -1.955e+05 3.381e+04 -5.784 8.99e-09 ***
## Condition2RRAe -2.784e+04 3.989e+04 -0.698 0.485254
## Condition2RRAn -2.492e+04 3.963e+04 -0.629 0.529507
## Condition2RRNn -1.105e+04 3.284e+04 -0.336 0.736695
## BldgType2fmCon -6.818e+03 6.213e+03 -1.097 0.272679
## BldgTypeDuplex -2.016e+04 4.925e+03 -4.094 4.48e-05 ***
## BldgTypeTwnhs -4.186e+04 6.621e+03 -6.322 3.47e-10 ***
## BldgTypeTwnhsE -2.998e+04 4.177e+03 -7.176 1.16e-12 ***
## HouseStyle1.5Unf 1.230e+04 9.110e+03 1.350 0.177253
## HouseStyle1Story 1.690e+04 3.463e+03 4.881 1.18e-06 ***
## HouseStyle2.5Fin -1.556e+04 1.221e+04 -1.275 0.202608
## HouseStyle2.5Unf -1.329e+04 1.052e+04 -1.264 0.206566
## HouseStyle2Story -4.996e+03 3.434e+03 -1.455 0.146024
## HouseStyleSFoyer 2.419e+04 6.560e+03 3.688 0.000235 ***
## HouseStyleSLvl 9.915e+03 5.111e+03 1.940 0.052579 .
## OverallQual 1.342e+04 1.149e+03 11.686 < 2e-16 ***
## OverallCond 5.496e+03 9.441e+02 5.821 7.26e-09 ***
## ExterQualFa -4.498e+04 1.131e+04 -3.977 7.34e-05 ***
## ExterQualGd -5.165e+04 5.312e+03 -9.723 < 2e-16 ***
## ExterQualTA -5.378e+04 6.018e+03 -8.936 < 2e-16 ***
## GrLivArea 3.458e+04 1.607e+03 21.517 < 2e-16 ***
## GarageArea 5.507e+03 1.144e+03 4.814 1.64e-06 ***
## AgeSold -1.298e+04 2.105e+03 -6.165 9.22e-10 ***
## AgeRemod -2.271e+03 1.274e+03 -1.782 0.075041 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30910 on 1390 degrees of freedom
## Multiple R-squared: 0.8558, Adjusted R-squared: 0.8486
## F-statistic: 119.6 on 69 and 1390 DF, p-value: < 2.2e-16
# Residual diagnostics for the current hs.lm fit.
# Scatter of residuals against fitted values. The axis labels are spelled out
# so they stay identical to the text R derives from the call expressions
# `fitted(hs.lm)` and `resid(hs.lm)`.
plot(
  x = fitted(hs.lm),
  y = residuals(hs.lm),
  xlab = "fitted(hs.lm)",
  ylab = "resid(hs.lm)"
)
# Horizontal reference line at a residual of zero.
abline(h = 0)
# Normal Q-Q plot of the same residuals, followed by its reference line.
qqnorm(y = residuals(hs.lm))
qqline(y = residuals(hs.lm))
# Read the test set, then keep only the columns whose names appear in the
# candidate-predictor list below. `%in%` simply yields FALSE for any listed
# name that is absent from df.test, so such a column is skipped without error.
df.test <- read.csv(file = "test.csv")
tst <- df.test[, names(df.test) %in% c(
  "MSSubClass", "MSZoning", "LotFrontage", "LotArea", "LotShape",
  "LandContour", "LotConfig", "LandSlope", "Neighborhood", "Condition1",
  "Condition2", "BldgType", "HouseStyle", "OverallQual", "OverallCond",
  "Exterior1st", "Exterior2nd", "ExterQual", "ExterCond", "Foundation",
  "HeatingQC", "CentralAir", "GrLivArea", "TotRmsAbvGrd", "GarageArea"
)]
# Impute missing data
# Each NA is replaced by the column mean truncated to a whole number
# (as.integer() drops the fractional part). The mean is computed directly with
# mean(..., na.rm = TRUE) instead of being looked up by name in the display
# object returned by summary().
# NOTE(review): the means are taken from the test set itself; confirm this
# matches how the corresponding training columns in trn were imputed.
mean_LotFrontage <- as.integer(mean(tst$LotFrontage, na.rm = TRUE))
tst$LotFrontage[is.na(tst$LotFrontage)] <- mean_LotFrontage
mean_GarageArea <- as.integer(mean(tst$GarageArea, na.rm = TRUE))
tst$GarageArea[is.na(tst$GarageArea)] <- mean_GarageArea
# Derive/Calculate additional features
# AgeSold:  years from construction to sale, counting the sale year itself (+1).
# AgeRemod: years from the recorded remodel year to sale, counted the same way.
# Both are computed from the untouched df.test columns and stored on tst.
tst$AgeSold <- with(df.test, YrSold - YearBuilt + 1)
tst$AgeRemod <- with(df.test, YrSold - YearRemodAdd + 1)
# Rescale numeric data
# Each of the four features below is passed through standardScaler(), a helper
# defined outside this section (its body is not visible here), and the result
# overwrites the column in tst.
# NOTE(review): the earlier wording here said "subtract the mean and divide by
# variance" while also promising features with variance one. Unit variance
# requires dividing by the standard deviation -- confirm which of the two
# standardScaler() actually implements.
# NOTE(review): every call receives only the test-set column, so any centre or
# spread the helper derives presumably comes from the test data rather than
# from trn -- confirm this is consistent with how the training features that
# hs.lm was fitted on were scaled.
tst$GrLivArea <- standardScaler(tst$GrLivArea)
tst$GarageArea <- standardScaler(tst$GarageArea)
tst$AgeSold <- standardScaler(tst$AgeSold)
tst$AgeRemod <- standardScaler(tst$AgeRemod)
# Predicting the House Prices
# Build the submission table: one row per test observation with its identifier
# and the price predicted by the current hs.lm fit.
# The identifier is taken from the test file's own Id column when it exists, so
# it stays correct even if the test ids do not simply continue from the number
# of training rows. If there is no Id column, fall back to the original
# behaviour of numbering rows consecutively from nrow(trn) + 1.
hs.pd <- data.frame(
  Id = if ("Id" %in% names(df.test)) {
    df.test[["Id"]]
  } else {
    seq(nrow(trn) + 1, length.out = nrow(tst))
  },
  SalePrice = predict(hs.lm, newdata = tst)
)
# Show the first rows of the prediction table.
head(hs.pd)
## Id SalePrice
## 1 1461 113017.6
## 2 1462 160456.4
## 3 1463 167846.7
## 4 1464 182056.0
## 5 1465 246980.3
## 6 1466 177207.0
# Save the predictions as an unquoted CSV without a row-name column.
write.csv(
  x = hs.pd,
  file = "submission.csv",
  quote = FALSE,
  row.names = FALSE
)
# Distribution summary of the predicted prices.
# NOTE(review): the recorded output below shows a negative minimum prediction;
# check whether that is acceptable for the intended use of submission.csv.
summary(hs.pd[["SalePrice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3935 128784 165085 180770 218227 572421
# Draw the predicted and the training price distributions side by side.
# par() returns the previous values of the settings it changes; they are saved
# and restored afterwards so the 1 x 2 layout does not leak into later plots.
old_par <- par(mfrow = c(1, 2))
hist(hs.pd$SalePrice, main = "Predicted Prices")
hist(trn$SalePrice, main = "Training Prices")
par(old_par)