X = Lot Area; Y = Sale Price
# Penalise scientific notation so large values and tiny p-values print in full.
options(scipen = 999)

# X is lot area and Y is sale price, both taken from the training data.
X <- train[["LotArea"]]
Y <- train[["SalePrice"]]

# Scatterplot of sale price against lot area.
plot(x = X, y = Y, xlab = "Lot Area", ylab = "Sales Price")
###Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.
## [1] "First Quantile of Lot Area: "
## 25%
## 7553.5
## [1] "First Quantile Sales Price: "
## 25%
## 129975
# Probabilities a-c, with x and y the 1st quartiles of X and Y.
# In P(X > x | Y > y) the bar means "given", not logical OR, so a conditional
# probability is the joint count divided by the count of the conditioning
# event (Y > y), not the share of rows where either condition holds.

# a. P(X > x | Y > y): among homes priced above the 1st-quartile price, the
#    share whose lot area is above the 1st-quartile lot area (898 / 1095).
P1 <- sum(X > x & Y > y, na.rm = TRUE)
P_1 <- P1 / sum(Y > y, na.rm = TRUE)
P_1
## [1] 0.8200913
# b. P(X > x, Y > y): share of all homes above both 1st quartiles (898 / 1460).
P2 <- sum(X > x & Y > y, na.rm = TRUE)
P_2 <- P2 / nrow(train)
P_2
## [1] 0.6150685
# c. P(X < x | Y > y): among homes priced above the 1st-quartile price, the
#    share whose lot area is below the 1st-quartile lot area (197 / 1095).
P3 <- sum(X < x & Y > y, na.rm = TRUE)
P_3 <- P3 / sum(Y > y, na.rm = TRUE)
P_3
## [1] 0.1799087
# Cell counts for the 2 x 2 split of the data at the 1st quartiles of X and Y.
p1 <- sum(X <= x & Y <= y, na.rm = TRUE)
p2 <- sum(X <= x & Y > y, na.rm = TRUE)
p3 <- sum(X > x & Y <= y, na.rm = TRUE)
p4 <- sum(X > x & Y > y, na.rm = TRUE)

# Assemble the table column by column: row labels, the two X groups, and the
# row totals. Mixing labels with counts makes this a character matrix.
t <- cbind(
  c('Y <= 1st Q', " Y > 1st Q", "Total"),
  c(p1, p2, p1 + p2),
  c(p3, p4, p3 + p4),
  c(p1 + p3, p2 + p4, p1 + p2 + p3 + p4)
)
t1 <- data.frame(t)
names(t1) <- c("x/y", "X <= 1st Q", "X > 1st Q", "Total")
t1
## x/y X <= 1st Q X > 1st Q Total
## 1 Y <= 1st Q 168 197 365
## 2 Y > 1st Q 197 898 1095
## 3 Total 365 1095 1460
Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 1st quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does P(AB)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.
# Marginal counts: A = homes with lot area above its 1st quartile,
# B = homes with sale price above its 1st quartile.
A <- sum(X > x, na.rm = TRUE)
B <- sum(Y > y, na.rm = TRUE)
rows <- nrow(train)

# Joint probability P(AB), computed earlier as P_2.
paste("P(AB) = ")
## [1] "P(AB) = "
P_2
## [1] 0.6150685

# Product of the marginal probabilities; it equals P(AB) only if A and B are
# independent.
paste("P(A) * P(B) = ")
## [1] "P(A) * P(B) = "
(A / rows) * (B / rows)
## [1] 0.5625
# Flag each home as above (TRUE) or not above (FALSE) the 1st-quartile cut-offs.
train[["Xgreaterx"]] <- train[["LotArea"]] > x
train[["Ygreatery"]] <- train[["SalePrice"]] > y

# Chi-squared test of association on the 2 x 2 table of the two flags.
# The table expression stays inline because chisq.test() echoes it verbatim
# on the "data:" line of its printed result.
c <- chisq.test(table(train$Xgreaterx, train$Ygreatery))
c
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(train$Xgreaterx, train$Ygreatery)
## X-squared = 113.27, df = 1, p-value < 0.00000000000000022
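The p-value from our chi-squared test is far below the 0.05 significance level, so we reject the null hypothesis of independence. This suggests that Lot Area is not independent of Sale Price, which agrees with the mathematical check above: P(AB) = 0.615 is not equal to P(A)P(B) = 0.5625.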
Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
Quick analysis of the variables from the training data:
# Univariate summary of lot area.
# NOTE(review): describe() is presumably psych::describe here, since Hmisc
# (which masks it) is attached only further down - confirm.
paste("Lot Area Overview")
## [1] "Lot Area Overview"
describe(train[["LotArea"]])
## vars n mean sd median trimmed mad min max range
## X1 1 1460 10516.83 9981.26 9478.5 9563.28 2962.23 1300 215245 213945
## skew kurtosis se
## X1 12.18 202.26 261.22

# Univariate summary of sale price.
paste("Sales Price Overview")
## [1] "Sales Price Overview"
describe(train[["SalePrice"]])
## vars n mean sd median trimmed mad min max range
## X1 1 1460 180921.2 79442.5 163000 170783.3 56338.8 34900 755000 720100
## skew kurtosis se
## X1 1.88 6.5 2079.11

# Scatterplot of the two study variables.
plot(
  x = X,
  y = Y,
  main = "Lot Area vs Sales Price",
  xlab = "Lot Area",
  ylab = "Sales Price"
)
H0: the correlation between each pair of variables is 0.
HA: the correlation between each pair of variables is NOT 0.
# Univariate summary of the third study variable, overall quality.
paste("Overall Quality Overview")
## [1] "Overall Quality Overview"
describe(train[["OverallQual"]])
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 1460 6.1 1.38 6 6.08 1.48 1 10 9 0.22 0.09
## se
## X1 0.04
library(Hmisc)
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:plyr':
##
## is.discrete, summarize
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, units
# Bind the three study variables into a named matrix (coercing them to a
# common type) and convert it to a data frame.
data <- data.frame(cbind(
  SalesPrice = train$SalePrice,
  LotArea = train$LotArea,
  OverallQual = train$OverallQual
))

# Pairwise Pearson correlations via rcorr(); the `r` component holds the
# correlation matrix.
correlation <- rcorr(as.matrix(data), type = "pearson")
paste("Correlation Matrix")
## [1] "Correlation Matrix"

# Keep the full-precision matrix in cor1 and print it rounded to 3 digits.
cor1 <- correlation[["r"]]
print(cor1, digits = 3)
## SalesPrice LotArea OverallQual
## SalesPrice 1.000 0.264 0.791
## LotArea 0.264 1.000 0.106
## OverallQual 0.791 0.106 1.000
I chose Overall Quality as my third variable. The correlation matrix shows a strong correlation between Overall Quality and Sale Price (r ≈ 0.79), whereas Lot Area is only weakly correlated with Sale Price (r ≈ 0.26) and with Overall Quality (r ≈ 0.11).
Let's look at the tests at a 0.92 confidence level:
# Test H0: correlation = 0 for each pair of variables with cor.test()
# (Pearson's product-moment correlation) and request a 92% confidence
# interval for the correlation itself.
# A two-sample t.test() is the wrong tool here: it tests whether the two
# variables have equal means, so its interval is for a difference in means
# and says nothing about correlation.
# The previously rendered t-test output has been removed; re-knit the
# document to regenerate the output of the calls below. The sample
# correlations are the ones already shown in the correlation matrix
# (0.106, 0.791 and 0.264 respectively).
paste('Lot Area vs Overall Quality')
## [1] "Lot Area vs Overall Quality"
cor.test(data$LotArea, data$OverallQual, method = "pearson", conf.level = 0.92)
paste('Sales Price vs Overall Quality')
## [1] "Sales Price vs Overall Quality"
cor.test(data$SalesPrice, data$OverallQual, method = "pearson", conf.level = 0.92)
paste('Sales Price vs Lot Area')
## [1] "Sales Price vs Lot Area"
cor.test(data$SalesPrice, data$LotArea, method = "pearson", conf.level = 0.92)
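From the analysis above we reject the null hypothesis that the correlation between each pair of variables is 0. I would not be worried about familywise error: none of the 92% confidence intervals contains 0, and although running three tests at the 8% level raises the chance of at least one false positive to as much as 1 − 0.92³ ≈ 22%, the p-values are so small that the conclusions would survive a Bonferroni correction.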
Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
Inverse:
# Precision matrix: the inverse of the 3 x 3 correlation matrix cor1.
inv_cor <- solve(cor1)
inv_cor
## SalesPrice LotArea OverallQual
## SalesPrice 2.9280384 -0.5334669 -2.2595806
## LotArea -0.5334669 1.1085153 0.3046752
## OverallQual -2.2595806 0.3046752 2.7550503
Correlation Matrix * Precision Matrix
# Correlation matrix times its inverse; the product should be the identity
# matrix up to floating-point rounding error.
cor_inv_mult <- cor1 %*% inv_cor
cor_inv_mult
## SalesPrice LotArea
## SalesPrice 1.00000000000000022204460 0.00000000000000005551115
## LotArea 0.00000000000000008326673 1.00000000000000000000000
## OverallQual 0.00000000000000044408921 0.00000000000000011102230
## OverallQual
## SalesPrice 0
## LotArea 0
## OverallQual 1
Precision Matrix * Correlation Matrix
# Inverse times the correlation matrix (the product in the other order);
# again expected to be the identity up to floating-point rounding error.
inv_cor_mult <- inv_cor %*% cor1
inv_cor_mult
## SalesPrice LotArea
## SalesPrice 1.00000000000000022204460 0.000000000000000194289
## LotArea -0.00000000000000005551115 1.000000000000000000000
## OverallQual 0.00000000000000000000000 0.000000000000000000000
## OverallQual
## SalesPrice 0.0000000000000004440892
## LotArea 0.0000000000000000000000
## OverallQual 1.0000000000000000000000
LU Decomposition of Correlation Matrix
# Factor the correlation matrix into a lower (L) and an upper (U) triangular
# matrix, then confirm that L %*% U reproduces it.
# NOTE(review): lu.decomposition() is presumably matrixcalc's - confirm.
luA <- lu.decomposition(cor1)
L <- luA[["L"]]
U <- luA[["U"]]
paste("The numeric lower triangular matrix")
## [1] "The numeric lower triangular matrix"
print(L)
## [,1] [,2] [,3]
## [1,] 1.0000000 0.0000000 0
## [2,] 0.2638434 1.0000000 0
## [3,] 0.7909816 -0.1105879 1
paste("The number upper triangular matrix")
## [1] "The number upper triangular matrix"
print(U)
## [,1] [,2] [,3]
## [1,] 1 0.2638434 0.7909816
## [2,] 0 0.9303867 -0.1028895
## [3,] 0 0.0000000 0.3629698
paste("L * U")
## [1] "L * U"
print(L %*% U)
## [,1] [,2] [,3]
## [1,] 1.0000000 0.2638434 0.7909816
## [2,] 0.2638434 1.0000000 0.1058057
## [3,] 0.7909816 0.1058057 1.0000000
LU Decomposition of Precision Matrix
# Factor the precision matrix into a lower (L) and an upper (U) triangular
# matrix, then confirm that L %*% U reproduces it. This overwrites luA, L
# and U from the correlation-matrix decomposition.
# NOTE(review): lu.decomposition() is presumably matrixcalc's - confirm.
luA <- lu.decomposition(inv_cor)
L <- luA[["L"]]
U <- luA[["U"]]
paste("The numeric lower triangular matrix")
## [1] "The numeric lower triangular matrix"
print(L)
## [,1] [,2] [,3]
## [1,] 1.0000000 0.0000000 0
## [2,] -0.1821926 1.0000000 0
## [3,] -0.7717046 -0.1058057 1
paste("The number upper triangular matrix")
## [1] "The number upper triangular matrix"
print(U)
## [,1] [,2] [,3]
## [1,] 2.928038 -0.5334669 -2.2595806
## [2,] 0.000000 1.0113216 -0.1070036
## [3,] 0.000000 0.0000000 1.0000000
paste("L * U")
## [1] "L * U"
print(L %*% U)
## [,1] [,2] [,3]
## [1,] 2.9280384 -0.5334669 -2.2595806
## [2,] -0.5334669 1.1085153 0.3046752
## [3,] -2.2595806 0.3046752 2.7550503
Many times, it makes sense to fit a closed form distribution to data. For the first variable that you selected which is skewed to the right, shift it so that the minimum value is above zero as necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).
Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
# Smallest observed lot area; if it is already above zero the variable does
# not need shifting before the exponential fit.
min_x <- min(X)
paste("Minimum value of Lot area is", min_x, sep = " ")
## [1] "Minimum value of Lot area is 1300"
Optimal value of λ:
# Fit an exponential density to lot area with fitdistr(); the fitted rate
# is stored in l.
# NOTE: `exp` and `sample` share their names with base functions. Calls such
# as exp(1) still work because R skips non-function bindings when it looks
# up a function, and `sample` is kept because the histogram code uses it.
exp <- fitdistr(X, densfun = "exponential")
l <- exp$estimate
# Fix the RNG seed so the simulated sample, and the histogram drawn from it,
# is the same on every run.
set.seed(605)
sample <- rexp(1000, l)
paste('optimal value of ??:',round(l,8))
## [1] "optimal value of ??: 0.00009509"
sample
# Draw the two histograms side by side: the simulated exponential sample on
# the left and the observed lot areas on the right.
par(mfrow = c(1, 2))
hist(x = sample, main = "Histogram of Sample Lot Area")
hist(x = X, main = "Histogram of Lot Area")
# 5th and 95th percentiles of the fitted exponential distribution, taken
# from its quantile function at the fitted rate l.
cdf5 <- qexp(p = 0.05, rate = l)
paste("The 5th percentile is", round(cdf5, digits = 2))
## [1] "The 5th percentile is 539.44"
cdf95 <- qexp(p = 0.95, rate = l)
paste("The 95th percentile is", round(cdf95, digits = 2))
## [1] "The 95th percentile is 31505.6"
# 95% confidence interval computed from the observed lot areas.
# NOTE(review): CI() is presumably Rmisc::CI, whose printed result has the
# upper / mean / lower layout shown below - confirm.
ci95 <- CI(X, 0.95)
paste("95% confidence interval from empiracal data assuming normality")
## [1] "95% confidence interval from empiracal data assuming normality"
ci95
## upper mean lower
## 11029.24 10516.83 10004.42

# Empirical 5th and 95th percentiles of the observed lot areas.
e5 <- quantile(X, probs = c(0.05, 0.95))
paste("The 5th percentile of Lot Area", e5[[1]])
## [1] "The 5th percentile of Lot Area 3311.7"
paste("The 95th percentile of Lot Area", e5[[2]])
## [1] "The 95th percentile of Lot Area 17401.15"
Since the minimum value was 1300, there was no need to shift the data to the right. The data were still skewed, so using the optimal value of λ from the fitted exponential probability density function, we were able to generate sample values. The simulated exponential sample gives a more spread-out histogram than the empirical data, but there is a significant difference between the exponential and empirical 5th and 95th percentiles (539.44 and 31505.6 versus 3311.7 and 17401.15).
Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
I wanted to take some of the numerical data I thought was most relevant to predicting the sale price and then add some categorical data to improve the model.
Step 1 Observe numerical values:
# Numeric columns considered as possible predictors of house sale price,
# kept together with the response (SalePrice) as the first column.
n_values <- train %>%
  select(c(
    "SalePrice", "BedroomAbvGr", "BsmtFinSF1", "BsmtUnfSF", "EnclosedPorch",
    "GarageArea", "GrLivArea", "LotArea", "LotFrontage", "MasVnrArea",
    "MiscVal", "MoSold", "MSSubClass", "OpenPorchSF", "TotalBsmtSF",
    "TotRmsAbvGrd", "OverallQual", "WoodDeckSF", "X1stFlrSF", "X2ndFlrSF",
    "OverallCond", "GarageYrBlt", "YearBuilt", "YearRemodAdd", "YrSold"
  ))

# One histogram per predictor: stack the chosen columns into key/value pairs
# and facet on the key, giving each panel its own axis scales.
n_values %>%
  select(c(
    "BedroomAbvGr", "BsmtFinSF1", "BsmtUnfSF", "EnclosedPorch", "GarageArea",
    "GrLivArea", "LotArea", "LotFrontage", "MasVnrArea", "MiscVal",
    "MoSold", "MSSubClass", "OpenPorchSF", "TotalBsmtSF", "TotRmsAbvGrd",
    "OverallQual", "WoodDeckSF", "X1stFlrSF", "X2ndFlrSF", "OverallCond"
  )) %>%
  gather() %>%
  ggplot(mapping = aes(x = value)) +
  geom_histogram() +
  facet_wrap(~ key, scales = "free")
Model with just numerical values:
# Full numeric model: regress sale price on every other column of n_values.
n_model <- lm(formula = SalePrice ~ ., data = n_values)
n_model_sum <- summary(object = n_model)
n_model_sum
##
## Call:
## lm(formula = SalePrice ~ ., data = n_values)
##
## Residuals:
## Min 1Q Median 3Q Max
## -517713 -17067 -2564 13872 285879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -462656.86006 1728293.34532 -0.268 0.788984
## BedroomAbvGr -9712.00720 2166.42458 -4.483 0.00000813497929 ***
## BsmtFinSF1 8.42276 7.70004 1.094 0.274257
## BsmtUnfSF -9.07850 7.46526 -1.216 0.224209
## EnclosedPorch 1.48723 20.87798 0.071 0.943224
## GarageArea 44.42931 8.98165 4.947 0.00000087309373 ***
## GrLivArea 39.63955 28.25052 1.403 0.160857
## LotArea 0.64563 0.16051 4.022 0.00006156965145 ***
## LotFrontage -157.68040 61.74859 -2.554 0.010796 *
## MasVnrArea 33.03158 7.12640 4.635 0.00000399632929 ***
## MiscVal -1.40886 6.91865 -0.204 0.838678
## MoSold -149.16539 431.47826 -0.346 0.729629
## MSSubClass -229.95134 32.91701 -6.986 0.00000000000491 ***
## OpenPorchSF 0.03912 19.74854 0.002 0.998420
## TotalBsmtSF 12.44746 8.66939 1.436 0.151346
## TotRmsAbvGrd 5131.88005 1456.11245 3.524 0.000442 ***
## OverallQual 21507.70083 1448.03879 14.853 < 0.0000000000000002 ***
## WoodDeckSF 22.58721 10.09019 2.239 0.025387 *
## X1stFlrSF 9.17077 28.85105 0.318 0.750647
## X2ndFlrSF 9.79769 28.33629 0.346 0.729586
## OverallCond 5188.78756 1384.53898 3.748 0.000188 ***
## GarageYrBlt -66.11328 92.22636 -0.717 0.473614
## YearBuilt 391.76876 84.40576 4.641 0.00000387694675 ***
## YearRemodAdd 175.56884 87.11665 2.015 0.044114 *
## YrSold -302.28437 860.29120 -0.351 0.725375
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37760 on 1096 degrees of freedom
## (339 observations deleted due to missingness)
## Multiple R-squared: 0.7974, Adjusted R-squared: 0.793
## F-statistic: 179.7 on 24 and 1096 DF, p-value: < 0.00000000000000022
The adjusted R-squared is 79%, but there are a lot of variables that are not significant in our model. Backward elimination is performed to improve our model.
# Reduced numeric model: a subset of the numeric predictors from n_values.
n_model2 <- lm(
  formula = SalePrice ~ BedroomAbvGr + GarageArea + LotArea + LotFrontage +
    MasVnrArea + MSSubClass + TotRmsAbvGrd + OverallQual + WoodDeckSF +
    OverallCond + YearBuilt + YearRemodAdd,
  data = n_values
)
n_model_sum2 <- summary(object = n_model2)
n_model_sum2
##
## Call:
## lm(formula = SalePrice ~ BedroomAbvGr + GarageArea + LotArea +
## LotFrontage + MasVnrArea + MSSubClass + TotRmsAbvGrd + OverallQual +
## WoodDeckSF + OverallCond + YearBuilt + YearRemodAdd, data = n_values)
##
## Residuals:
## Min 1Q Median 3Q Max
## -361429 -21393 -3658 15833 398239
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1093205.7564 140657.6554 -7.772 0.0000000000000167 ***
## BedroomAbvGr -9322.3098 2057.3118 -4.531 0.0000064565922659 ***
## GarageArea 45.9535 7.1360 6.440 0.0000000001737618 ***
## LotArea 1.0858 0.1654 6.564 0.0000000000782179 ***
## LotFrontage 32.6109 61.4556 0.531 0.59577
## MasVnrArea 56.4118 7.2806 7.748 0.0000000000000200 ***
## MSSubClass -148.7649 30.4232 -4.890 0.0000011482742454 ***
## TotRmsAbvGrd 13598.1608 1204.2464 11.292 < 0.0000000000000002 ***
## OverallQual 25768.3289 1303.4936 19.769 < 0.0000000000000002 ***
## WoodDeckSF 46.5424 10.3277 4.507 0.0000072419246356 ***
## OverallCond 3735.9077 1301.7480 2.870 0.00418 **
## YearBuilt 268.6362 62.2756 4.314 0.0000173964086604 ***
## YearRemodAdd 235.5410 80.3354 2.932 0.00343 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40320 on 1182 degrees of freedom
## (265 observations deleted due to missingness)
## Multiple R-squared: 0.7673, Adjusted R-squared: 0.765
## F-statistic: 324.9 on 12 and 1182 DF, p-value: < 0.00000000000000022
Model 1: Numerical variables only.
# Model 1: the reduced numeric model without LotFrontage.
model1 <- lm(
  formula = SalePrice ~ BedroomAbvGr + GarageArea + LotArea + MasVnrArea +
    MSSubClass + TotRmsAbvGrd + OverallQual + WoodDeckSF + OverallCond +
    YearBuilt + YearRemodAdd,
  data = n_values
)
summary(object = model1)
##
## Call:
## lm(formula = SalePrice ~ BedroomAbvGr + GarageArea + LotArea +
## MasVnrArea + MSSubClass + TotRmsAbvGrd + OverallQual + WoodDeckSF +
## OverallCond + YearBuilt + YearRemodAdd, data = n_values)
##
## Residuals:
## Min 1Q Median 3Q Max
## -342898 -20933 -3203 15311 401339
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1061769.7525 124070.7838 -8.558 < 0.0000000000000002 ***
## BedroomAbvGr -9815.3094 1794.5857 -5.469 0.00000005317046417 ***
## GarageArea 48.3784 6.2232 7.774 0.00000000000001440 ***
## LotArea 0.8925 0.1065 8.376 < 0.0000000000000002 ***
## MasVnrArea 51.5658 6.3807 8.082 0.00000000000000134 ***
## MSSubClass -165.5541 24.6467 -6.717 0.00000000002661331 ***
## TotRmsAbvGrd 14096.5574 1034.8157 13.622 < 0.0000000000000002 ***
## OverallQual 25361.8019 1129.0649 22.463 < 0.0000000000000002 ***
## WoodDeckSF 44.9242 8.5735 5.240 0.00000018453810178 ***
## OverallCond 3301.0216 1092.0307 3.023 0.00255 **
## YearBuilt 273.7409 55.4733 4.935 0.00000089638723246 ***
## YearRemodAdd 218.5333 70.5734 3.097 0.00200 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38640 on 1440 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.7643, Adjusted R-squared: 0.7625
## F-statistic: 424.5 on 11 and 1440 DF, p-value: < 0.00000000000000022
Model 2: adding categorical variables to the numeric variables.
# Categorical columns considered as possible predictors of house sale price,
# kept together with the response (SalePrice). The model below is fitted on
# `train` directly, so c_values is not used by it.
c_values <- train %>%
  select(c("SalePrice", "BldgType", "BsmtCond", "BsmtQual", "CentralAir", "Condition1", "Condition2", "Electrical", "Exterior1st", "Exterior2nd", "ExterQual", "Fence", "Foundation", "Functional", "GarageQual", "Heating", "HeatingQC", "HouseStyle", "KitchenQual", "MSZoning", "Neighborhood", "PavedDrive", "RoofMatl", "RoofStyle", "SaleCondition", "SaleType"))

# Model 2: the Model 1 numeric predictors plus the categorical predictors.
# SalePrice is the response, so it must not also be listed as a predictor;
# when it was, lm() warned that the response appeared on the right-hand side
# and dropped it. The fitted coefficients are unchanged by removing it.
# NOTE(review): the rendered summary reports 1199 observations deleted due
# to missingness for this model - worth checking which columns cause that.
c_model <- lm(
  SalePrice ~ BedroomAbvGr + GarageArea + LotArea + MasVnrArea + MSSubClass +
    TotRmsAbvGrd + OverallQual + WoodDeckSF + OverallCond + YearBuilt +
    YearRemodAdd + BldgType + BsmtCond + BsmtQual + CentralAir + Condition1 +
    Condition2 + Electrical + Exterior1st + Exterior2nd + ExterQual + Fence +
    Foundation + Functional + GarageQual + Heating + HeatingQC + HouseStyle +
    KitchenQual + MSZoning + Neighborhood + PavedDrive + RoofMatl +
    RoofStyle + SaleCondition + SaleType,
  data = train
)
summary(c_model)
##
## Call:
## lm(formula = SalePrice ~ BedroomAbvGr + GarageArea + LotArea +
## MasVnrArea + MSSubClass + TotRmsAbvGrd + OverallQual + WoodDeckSF +
## OverallCond + YearBuilt + YearRemodAdd + BldgType + BsmtCond +
## BsmtQual + CentralAir + Condition1 + Condition2 + Electrical +
## Exterior1st + Exterior2nd + ExterQual + Fence + Foundation +
## Functional + GarageQual + Heating + HeatingQC + HouseStyle +
## KitchenQual + MSZoning + Neighborhood + PavedDrive + RoofMatl +
## RoofStyle + SaleCondition + SaleType, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48983 -8817 0 7501 55856
##
## Coefficients: (5 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -969444.1004 421735.5976 -2.299 0.022971
## BedroomAbvGr 4822.9137 3483.9079 1.384 0.168411
## GarageArea 36.6264 11.9278 3.071 0.002557
## LotArea 4.1238 0.6418 6.426 0.000000001825381
## MasVnrArea 32.1231 12.3539 2.600 0.010294
## MSSubClass -119.0365 683.8900 -0.174 0.862066
## TotRmsAbvGrd 7029.3113 2170.9198 3.238 0.001497
## OverallQual 11440.8670 2498.3262 4.579 0.000010052994650
## WoodDeckSF 27.6732 11.3253 2.443 0.015764
## OverallCond 4377.7991 1627.6347 2.690 0.008003
## YearBuilt 504.4812 168.6033 2.992 0.003264
## YearRemodAdd 64.2865 123.8007 0.519 0.604372
## BldgType2fmCon 6617.3557 91991.1512 0.072 0.942754
## BldgTypeDuplex 11988.8150 16729.2494 0.717 0.474766
## BldgTypeTwnhs 8550.6322 72773.9723 0.117 0.906632
## BldgTypeTwnhsE -11749.9638 70437.2286 -0.167 0.867752
## BsmtCondGd -13379.1301 17922.0045 -0.747 0.456579
## BsmtCondTA -4432.6580 16106.1892 -0.275 0.783549
## BsmtQualFa -2142.3206 27981.9463 -0.077 0.939080
## BsmtQualGd 6049.6211 20964.3316 0.289 0.773330
## BsmtQualTA 3664.5879 21642.9970 0.169 0.865784
## CentralAirY 20547.3522 11357.7110 1.809 0.072534
## Condition1Feedr 11978.3935 9433.1344 1.270 0.206211
## Condition1Norm 6901.3231 8024.2031 0.860 0.391194
## Condition1PosN 44096.3155 23280.5696 1.894 0.060228
## Condition1RRAe 2295.6171 21581.0002 0.106 0.915436
## Condition1RRAn -1147.3431 15939.8669 -0.072 0.942719
## Condition1RRNn 7267.6767 29494.6404 0.246 0.805721
## Condition2Norm 18753.3899 22387.5123 0.838 0.403613
## Condition2RRNn 8420.3959 31213.8136 0.270 0.787730
## ElectricalFuseF 11995.2832 32370.3050 0.371 0.711510
## ElectricalSBrkr 13371.1318 7339.3824 1.822 0.070570
## Exterior1stBrkFace -6592.9011 34154.4682 -0.193 0.847208
## Exterior1stCemntBd -61669.7765 27353.9493 -2.255 0.025684
## Exterior1stHdBoard -33681.2846 33609.4141 -1.002 0.317969
## Exterior1stImStucc -410087.4617 51492.7483 -7.964 0.000000000000472
## Exterior1stMetalSd -29012.2733 35277.9979 -0.822 0.412224
## Exterior1stPlywood -21498.8463 33405.4866 -0.644 0.520884
## Exterior1stStucco -33659.2707 37604.3043 -0.895 0.372243
## Exterior1stVinylSd -10631.0356 41093.8735 -0.259 0.796238
## Exterior1stWd Sdng -46753.6043 34717.0641 -1.347 0.180206
## Exterior1stWdShing -24055.5677 35330.0621 -0.681 0.497048
## Exterior2ndAsphShn -16771.1156 43837.8355 -0.383 0.702605
## Exterior2ndBrkFace 3373.9999 36829.8353 0.092 0.927136
## Exterior2ndCmentBd NA NA NA NA
## Exterior2ndHdBoard 1386.5865 35237.4945 0.039 0.968666
## Exterior2ndImStucc 386209.9840 57808.2982 6.681 0.000000000490852
## Exterior2ndMetalSd -2650.3075 36953.9167 -0.072 0.942926
## Exterior2ndPlywood -3766.9258 34449.6063 -0.109 0.913081
## Exterior2ndStucco 18293.6126 42285.9614 0.433 0.665945
## Exterior2ndVinylSd -16618.3618 43204.7259 -0.385 0.701074
## Exterior2ndWd Sdng 20758.6547 36574.0598 0.568 0.571211
## Exterior2ndWd Shng 19540.7221 35695.6179 0.547 0.584939
## ExterQualFa -141359.8740 61802.8463 -2.287 0.023647
## ExterQualGd -136158.6722 40535.0428 -3.359 0.001003
## ExterQualTA -146724.1147 42160.4931 -3.480 0.000665
## FenceGdWo 8777.6878 5066.5092 1.732 0.085342
## FenceMnPrv 7553.7294 4177.1299 1.808 0.072653
## FenceMnWw 5401.4939 7829.9387 0.690 0.491406
## FoundationCBlock -1708.9255 7883.4508 -0.217 0.828694
## FoundationPConc -5023.3596 7796.7441 -0.644 0.520420
## FoundationStone -56981.2915 23837.9147 -2.390 0.018134
## FoundationWood -26160.8164 24300.9587 -1.077 0.283501
## FunctionalMin1 19401.6798 14856.1605 1.306 0.193660
## FunctionalMin2 15322.0112 16398.7331 0.934 0.351703
## FunctionalMod 38828.6794 25402.0775 1.529 0.128582
## FunctionalTyp 10957.6304 14286.1988 0.767 0.444341
## GarageQualFa -131121.6547 42008.3080 -3.121 0.002179
## GarageQualGd -103778.1606 44470.9342 -2.334 0.021009
## GarageQualPo -165639.2430 63268.0007 -2.618 0.009795
## GarageQualTA -134703.1010 40926.0919 -3.291 0.001256
## HeatingGasW 440.6257 17051.6042 0.026 0.979420
## HeatingGrav 7334.9425 35895.2652 0.204 0.838376
## HeatingQCFa 2398.5767 8948.7137 0.268 0.789058
## HeatingQCGd -5671.7493 4645.3228 -1.221 0.224111
## HeatingQCTA -2242.6157 3862.9867 -0.581 0.562465
## HouseStyle1.5Unf -2610.5618 23968.3805 -0.109 0.913421
## HouseStyle1Story -8240.1453 18772.5946 -0.439 0.661364
## HouseStyle2.5Fin NA NA NA NA
## HouseStyle2.5Unf -3964.3068 26617.0731 -0.149 0.881812
## HouseStyle2Story -1831.0116 11773.1117 -0.156 0.876627
## HouseStyleSFoyer -8391.9427 27793.1485 -0.302 0.763134
## HouseStyleSLvl -11746.8942 24737.7084 -0.475 0.635613
## KitchenQualFa -45510.9789 18555.1138 -2.453 0.015380
## KitchenQualGd -42077.6051 13027.3810 -3.230 0.001536
## KitchenQualTA -50128.5545 12823.2891 -3.909 0.000143
## MSZoningFV 67535.3491 46250.4615 1.460 0.146426
## MSZoningRH 91059.1678 36450.5299 2.498 0.013616
## MSZoningRL 99328.2316 32192.6765 3.085 0.002441
## MSZoningRM 83881.5531 31570.5972 2.657 0.008782
## NeighborhoodBrkSide -23576.9602 30090.8436 -0.784 0.434614
## NeighborhoodClearCr -18237.4600 33500.2461 -0.544 0.587016
## NeighborhoodCollgCr -50695.5925 30307.5364 -1.673 0.096572
## NeighborhoodCrawfor -16241.5395 32036.5045 -0.507 0.612957
## NeighborhoodEdwards -44828.2352 30709.0835 -1.460 0.146546
## NeighborhoodGilbert -44739.4853 35717.0849 -1.253 0.212393
## NeighborhoodIDOTRR -22149.1877 30135.1331 -0.735 0.463546
## NeighborhoodMeadowV 17119.3334 31656.3851 0.541 0.589496
## NeighborhoodMitchel -46229.2539 30742.9893 -1.504 0.134855
## NeighborhoodNAmes -50905.1783 30364.0016 -1.676 0.095826
## NeighborhoodNoRidge NA NA NA NA
## NeighborhoodNWAmes -52229.3562 30089.6480 -1.736 0.084755
## NeighborhoodOldTown -28957.0667 29369.3755 -0.986 0.325817
## NeighborhoodSawyer -50431.3146 30613.9371 -1.647 0.101686
## NeighborhoodSawyerW -36217.4504 28496.0161 -1.271 0.205805
## NeighborhoodSomerst NA NA NA NA
## NeighborhoodSWISU -59129.8107 33205.1023 -1.781 0.077077
## NeighborhoodTimber 24541.4211 41260.3311 0.595 0.552921
## NeighborhoodVeenker -12737.8979 37108.3491 -0.343 0.731905
## PavedDriveP -23076.9493 14218.6503 -1.623 0.106791
## PavedDriveY 7061.7827 10340.1463 0.683 0.495745
## RoofMatlTar&Grv 53044.1068 35731.3797 1.485 0.139871
## RoofMatlWdShngl -8642.8551 20556.7269 -0.420 0.674796
## RoofStyleGable 23732.3204 28278.6878 0.839 0.402741
## RoofStyleGambrel 48710.5440 31370.0235 1.553 0.122688
## RoofStyleHip 19940.1765 28741.0765 0.694 0.488941
## RoofStyleMansard NA NA NA NA
## SaleConditionAlloca -26618.4696 23290.1662 -1.143 0.254987
## SaleConditionFamily -1654.0488 9994.0119 -0.166 0.868781
## SaleConditionNormal 10341.2536 5233.8839 1.976 0.050099
## SaleTypeConLI -64066.4768 28180.7565 -2.273 0.024491
## SaleTypeCWD -13110.2459 20715.4994 -0.633 0.527828
## SaleTypeWD -215.0227 7439.3828 -0.029 0.976982
##
## (Intercept) *
## BedroomAbvGr
## GarageArea **
## LotArea ***
## MasVnrArea *
## MSSubClass
## TotRmsAbvGrd **
## OverallQual ***
## WoodDeckSF *
## OverallCond **
## YearBuilt **
## YearRemodAdd
## BldgType2fmCon
## BldgTypeDuplex
## BldgTypeTwnhs
## BldgTypeTwnhsE
## BsmtCondGd
## BsmtCondTA
## BsmtQualFa
## BsmtQualGd
## BsmtQualTA
## CentralAirY .
## Condition1Feedr
## Condition1Norm
## Condition1PosN .
## Condition1RRAe
## Condition1RRAn
## Condition1RRNn
## Condition2Norm
## Condition2RRNn
## ElectricalFuseF
## ElectricalSBrkr .
## Exterior1stBrkFace
## Exterior1stCemntBd *
## Exterior1stHdBoard
## Exterior1stImStucc ***
## Exterior1stMetalSd
## Exterior1stPlywood
## Exterior1stStucco
## Exterior1stVinylSd
## Exterior1stWd Sdng
## Exterior1stWdShing
## Exterior2ndAsphShn
## Exterior2ndBrkFace
## Exterior2ndCmentBd
## Exterior2ndHdBoard
## Exterior2ndImStucc ***
## Exterior2ndMetalSd
## Exterior2ndPlywood
## Exterior2ndStucco
## Exterior2ndVinylSd
## Exterior2ndWd Sdng
## Exterior2ndWd Shng
## ExterQualFa *
## ExterQualGd **
## ExterQualTA ***
## FenceGdWo .
## FenceMnPrv .
## FenceMnWw
## FoundationCBlock
## FoundationPConc
## FoundationStone *
## FoundationWood
## FunctionalMin1
## FunctionalMin2
## FunctionalMod
## FunctionalTyp
## GarageQualFa **
## GarageQualGd *
## GarageQualPo **
## GarageQualTA **
## HeatingGasW
## HeatingGrav
## HeatingQCFa
## HeatingQCGd
## HeatingQCTA
## HouseStyle1.5Unf
## HouseStyle1Story
## HouseStyle2.5Fin
## HouseStyle2.5Unf
## HouseStyle2Story
## HouseStyleSFoyer
## HouseStyleSLvl
## KitchenQualFa *
## KitchenQualGd **
## KitchenQualTA ***
## MSZoningFV
## MSZoningRH *
## MSZoningRL **
## MSZoningRM **
## NeighborhoodBrkSide
## NeighborhoodClearCr
## NeighborhoodCollgCr .
## NeighborhoodCrawfor
## NeighborhoodEdwards
## NeighborhoodGilbert
## NeighborhoodIDOTRR
## NeighborhoodMeadowV
## NeighborhoodMitchel
## NeighborhoodNAmes .
## NeighborhoodNoRidge
## NeighborhoodNWAmes .
## NeighborhoodOldTown
## NeighborhoodSawyer
## NeighborhoodSawyerW
## NeighborhoodSomerst
## NeighborhoodSWISU .
## NeighborhoodTimber
## NeighborhoodVeenker
## PavedDriveP
## PavedDriveY
## RoofMatlTar&Grv
## RoofMatlWdShngl
## RoofStyleGable
## RoofStyleGambrel
## RoofStyleHip
## RoofStyleMansard
## SaleConditionAlloca
## SaleConditionFamily
## SaleConditionNormal .
## SaleTypeConLI *
## SaleTypeCWD
## SaleTypeWD
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18470 on 143 degrees of freedom
## (1199 observations deleted due to missingness)
## Multiple R-squared: 0.9506, Adjusted R-squared: 0.9102
## F-statistic: 23.53 on 117 and 143 DF, p-value: < 0.00000000000000022
Goal: Perform backward elimination in multiple ways to build multiple models and select the best fit.
# Starting point for backward elimination: sale price regressed on the
# numeric columns of train listed below. This overwrites the earlier n_model.
# The summary reports two coefficients (TotalBsmtSF and X2ndFlrSF) as not
# defined because of singularities.
n_model <- lm(
  formula = SalePrice ~ BedroomAbvGr + BsmtFinSF1 + BsmtFinSF2 +
    BsmtFullBath + BsmtHalfBath + BsmtUnfSF + EnclosedPorch + Fireplaces +
    FullBath + GarageArea + GarageCars + GarageYrBlt + GrLivArea + HalfBath +
    KitchenAbvGr + LotArea + LotFrontage + LowQualFinSF + MasVnrArea +
    MiscVal + MoSold + MSSubClass + OpenPorchSF + OverallCond + OverallQual +
    PoolArea + ScreenPorch + TotalBsmtSF + TotRmsAbvGrd + WoodDeckSF +
    X1stFlrSF + X2ndFlrSF + X3SsnPorch + YearBuilt + YearRemodAdd + YrSold,
  data = train
)
summary(object = n_model)
##
## Call:
## lm(formula = SalePrice ~ BedroomAbvGr + BsmtFinSF1 + BsmtFinSF2 +
## BsmtFullBath + BsmtHalfBath + BsmtUnfSF + EnclosedPorch +
## Fireplaces + FullBath + GarageArea + GarageCars + GarageYrBlt +
## GrLivArea + HalfBath + KitchenAbvGr + LotArea + LotFrontage +
## LowQualFinSF + MasVnrArea + MiscVal + MoSold + MSSubClass +
## OpenPorchSF + OverallCond + OverallQual + PoolArea + ScreenPorch +
## TotalBsmtSF + TotRmsAbvGrd + WoodDeckSF + X1stFlrSF + X2ndFlrSF +
## X3SsnPorch + YearBuilt + YearRemodAdd + YrSold, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -442865 -16873 -2581 14998 318042
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -323175.5898 1700608.6185 -0.190 0.849317
## BedroomAbvGr -10232.5007 2154.0411 -4.750 0.0000023024581154 ***
## BsmtFinSF1 17.3892 5.8353 2.980 0.002947 **
## BsmtFinSF2 8.3616 8.7631 0.954 0.340205
## BsmtFullBath 8979.9920 3193.9015 2.812 0.005018 **
## BsmtHalfBath 2490.1194 5070.9578 0.491 0.623487
## BsmtUnfSF 5.0056 5.2753 0.949 0.342890
## EnclosedPorch 7.2332 20.6133 0.351 0.725733
## Fireplaces 4374.8147 2187.8569 2.000 0.045793 *
## FullBath 5389.6373 3528.5219 1.527 0.126941
## GarageArea 6.4882 12.1138 0.536 0.592338
## GarageCars 16788.4001 3486.6828 4.815 0.0000016803185415 ***
## GarageYrBlt -49.1431 90.9333 -0.540 0.589011
## GrLivArea 46.6797 6.0986 7.654 0.0000000000000428 ***
## HalfBath -1118.5234 3319.8706 -0.337 0.736244
## KitchenAbvGr -21931.2788 6704.4025 -3.271 0.001105 **
## LotArea 0.5454 0.1573 3.466 0.000548 ***
## LotFrontage -116.1232 61.2411 -1.896 0.058203 .
## LowQualFinSF -12.5260 27.9855 -0.448 0.654539
## MasVnrArea 31.6049 7.0060 4.511 0.0000071502458891 ***
## MiscVal -3.8501 6.9549 -0.554 0.579980
## MoSold -224.0209 422.6730 -0.530 0.596213
## MSSubClass -200.4890 34.4859 -5.814 0.0000000080292914 ***
## OpenPorchSF -2.3153 19.4782 -0.119 0.905404
## OverallCond 5227.2069 1367.0842 3.824 0.000139 ***
## OverallQual 18696.5040 1478.4163 12.646 < 0.0000000000000002 ***
## PoolArea -61.2618 29.8422 -2.053 0.040326 *
## ScreenPorch 57.9661 20.3986 2.842 0.004572 **
## TotalBsmtSF NA NA NA NA
## TotRmsAbvGrd 5439.7807 1485.7761 3.661 0.000263 ***
## WoodDeckSF 21.5457 10.0176 2.151 0.031713 *
## X1stFlrSF -0.7679 6.7133 -0.114 0.908948
## X2ndFlrSF NA NA NA NA
## X3SsnPorch 34.5789 37.4933 0.922 0.356593
## YearBuilt 316.9684 87.6223 3.617 0.000311 ***
## YearRemodAdd 120.5742 86.6125 1.392 0.164174
## YrSold -253.6384 845.3939 -0.300 0.764216
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36790 on 1086 degrees of freedom
## (339 observations deleted due to missingness)
## Multiple R-squared: 0.8095, Adjusted R-squared: 0.8036
## F-statistic: 135.7 on 34 and 1086 DF, p-value: < 0.00000000000000022
# Combined fit: numeric and categorical predictors together. Rows with a
# missing value in any term are dropped (na.action = na.omit).
c_model <- lm(
  SalePrice ~ BedroomAbvGr + BldgType + BsmtCond + BsmtFinSF1 + BsmtFinSF2 +
    BsmtFullBath + BsmtHalfBath + BsmtQual + BsmtUnfSF + CentralAir +
    Condition1 + Condition2 + Electrical + EnclosedPorch + Exterior1st +
    Exterior2nd + ExterQual + Fence + Fireplaces + Foundation + FullBath +
    Functional + MiscVal + MoSold + MSSubClass + MSZoning + Neighborhood +
    OpenPorchSF + OverallCond + OverallQual + PavedDrive + PoolArea +
    RoofMatl + RoofStyle + SaleCondition + SaleType + ScreenPorch +
    TotalBsmtSF + GarageArea + GarageCars + GarageQual + GarageYrBlt +
    GrLivArea + HalfBath + Heating + HeatingQC + HouseStyle + KitchenAbvGr +
    KitchenQual + LotArea + LotFrontage + LowQualFinSF + MasVnrArea +
    TotRmsAbvGrd + WoodDeckSF + X1stFlrSF + X2ndFlrSF + X3SsnPorch +
    YearBuilt + YearRemodAdd + YrSold,
  data = train,
  na.action = na.omit
)
summary(c_model)

# Exploratory fit: every numeric predictor plus a hand-picked subset of the
# categorical ones.
summary(
  lm(
    SalePrice ~ BedroomAbvGr + BsmtFinSF1 + BsmtFinSF2 + BsmtFullBath +
      BsmtHalfBath + BsmtUnfSF + EnclosedPorch + Fireplaces + FullBath +
      GarageArea + GarageCars + GarageYrBlt + GrLivArea + HalfBath +
      KitchenAbvGr + LotArea + LotFrontage + LowQualFinSF + MasVnrArea +
      MiscVal + MoSold + MSSubClass + OpenPorchSF + OverallCond +
      OverallQual + PoolArea + ScreenPorch + TotalBsmtSF + TotRmsAbvGrd +
      WoodDeckSF + X1stFlrSF + X2ndFlrSF + X3SsnPorch + YearBuilt +
      YearRemodAdd + YrSold + Exterior1st + Exterior2nd + Functional +
      MSZoning + GarageQual + KitchenQual + SaleType,
    data = train
  )
)

# Exploratory fit: reduced set of numeric and categorical predictors.
summary(
  lm(
    SalePrice ~ BsmtFinSF1 + BsmtFullBath + Fireplaces + FullBath +
      GarageCars + GrLivArea + KitchenAbvGr + LotArea + LotFrontage +
      MasVnrArea + MSSubClass + OverallCond + SaleType + Exterior2nd +
      Functional + GarageQual + KitchenQual,
    data = train
  )
)
Model 2: Backward elimination of all non-significant variables
# Model 2: SalePrice on the numeric and categorical terms listed below.
model2 <- lm(
  SalePrice ~ BsmtFullBath + Fireplaces + FullBath + KitchenAbvGr + LotArea +
    LotFrontage + MasVnrArea + MSSubClass + OverallCond + SaleType +
    Exterior2nd + GarageQual + KitchenQual,
  data = train
)
summary(model2)
##
## Call:
## lm(formula = SalePrice ~ BsmtFullBath + Fireplaces + FullBath +
## KitchenAbvGr + LotArea + LotFrontage + MasVnrArea + MSSubClass +
## OverallCond + SaleType + Exterior2nd + GarageQual + KitchenQual,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -360851 -21889 -2026 18049 315254
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 196307.9116 32682.9563 6.006 0.00000000258758
## BsmtFullBath 19722.4609 2755.7302 7.157 0.00000000000152
## Fireplaces 22060.9194 2388.2352 9.237 < 0.0000000000000002
## FullBath 44203.9477 3197.8832 13.823 < 0.0000000000000002
## KitchenAbvGr -22490.8825 7249.7334 -3.102 0.00197
## LotArea 1.0795 0.1868 5.778 0.00000000989532
## LotFrontage 158.0544 68.6271 2.303 0.02146
## MasVnrArea 87.5928 7.8314 11.185 < 0.0000000000000002
## MSSubClass -116.2123 37.9872 -3.059 0.00227
## OverallCond 3276.5224 1355.6991 2.417 0.01582
## SaleTypeCon 85658.0360 32572.7264 2.630 0.00867
## SaleTypeConLD 5211.8180 19774.4104 0.264 0.79217
## SaleTypeConLI 16077.7086 26795.4970 0.600 0.54862
## SaleTypeConLw 882.8105 23468.0835 0.038 0.97000
## SaleTypeCWD 37722.8259 23620.1161 1.597 0.11054
## SaleTypeNew 30508.3753 9323.4361 3.272 0.00110
## SaleTypeOth 17743.3237 44675.6724 0.397 0.69133
## SaleTypeWD 7578.4084 8026.8175 0.944 0.34531
## Exterior2ndAsphShn 16562.5367 33250.3649 0.498 0.61850
## Exterior2ndBrk Cmn -10380.9470 20585.6034 -0.504 0.61417
## Exterior2ndBrkFace -1247.3837 15357.7758 -0.081 0.93528
## Exterior2ndCBlock -943.1211 45473.1691 -0.021 0.98346
## Exterior2ndCmentBd 16089.2134 13779.6971 1.168 0.24322
## Exterior2ndHdBoard -2346.8733 12288.4871 -0.191 0.84858
## Exterior2ndImStucc 51960.2210 18881.3255 2.752 0.00602
## Exterior2ndMetalSd -1532.2999 12115.0201 -0.126 0.89938
## Exterior2ndOther 53166.4987 45791.2098 1.161 0.24587
## Exterior2ndPlywood -9725.1747 12576.6196 -0.773 0.43953
## Exterior2ndStone -382.0658 24855.6420 -0.015 0.98774
## Exterior2ndStucco -23453.4449 15282.4708 -1.535 0.12516
## Exterior2ndVinylSd 9388.7911 12116.9011 0.775 0.43860
## Exterior2ndWd Sdng -1277.2133 12164.5490 -0.105 0.91640
## Exterior2ndWd Shng -5948.0868 14260.2391 -0.417 0.67668
## GarageQualFa -62863.8946 26324.4598 -2.388 0.01711
## GarageQualGd -37397.9552 28853.8964 -1.296 0.19521
## GarageQualPo -54655.3857 36379.7852 -1.502 0.13330
## GarageQualTA -51629.9449 25614.9996 -2.016 0.04409
## KitchenQualFa -105796.7897 11364.2023 -9.310 < 0.0000000000000002
## KitchenQualGd -75065.4808 5598.6672 -13.408 < 0.0000000000000002
## KitchenQualTA -101929.2843 6135.7426 -16.612 < 0.0000000000000002
##
## (Intercept) ***
## BsmtFullBath ***
## Fireplaces ***
## FullBath ***
## KitchenAbvGr **
## LotArea ***
## LotFrontage *
## MasVnrArea ***
## MSSubClass **
## OverallCond *
## SaleTypeCon **
## SaleTypeConLD
## SaleTypeConLI
## SaleTypeConLw
## SaleTypeCWD
## SaleTypeNew **
## SaleTypeOth
## SaleTypeWD
## Exterior2ndAsphShn
## Exterior2ndBrk Cmn
## Exterior2ndBrkFace
## Exterior2ndCBlock
## Exterior2ndCmentBd
## Exterior2ndHdBoard
## Exterior2ndImStucc **
## Exterior2ndMetalSd
## Exterior2ndOther
## Exterior2ndPlywood
## Exterior2ndStone
## Exterior2ndStucco
## Exterior2ndVinylSd
## Exterior2ndWd Sdng
## Exterior2ndWd Shng
## GarageQualFa *
## GarageQualGd
## GarageQualPo
## GarageQualTA *
## KitchenQualFa ***
## KitchenQualGd ***
## KitchenQualTA ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 43850 on 1081 degrees of freedom
## (339 observations deleted due to missingness)
## Multiple R-squared: 0.7306, Adjusted R-squared: 0.7209
## F-statistic: 75.16 on 39 and 1081 DF, p-value: < 0.00000000000000022
# All remaining numerical + significant categorical predictors
summary(
  lm(
    SalePrice ~ BedroomAbvGr + BsmtFinSF1 + BsmtFinSF2 + BsmtFullBath +
      BsmtHalfBath + BsmtUnfSF + EnclosedPorch + Fireplaces + FullBath +
      MiscVal + MoSold + MSSubClass + OpenPorchSF + OverallCond +
      OverallQual + PoolArea + GarageArea + GarageCars + GarageYrBlt +
      GrLivArea + HalfBath + KitchenAbvGr + LotArea + LotFrontage +
      LowQualFinSF + MasVnrArea + Exterior1st + Exterior2nd + Functional +
      MSZoning + GarageQual + KitchenQual,
    data = train
  )
)
Model 3: Backward elimination of categorical variables, then numerical variables.
# Model 3: SalePrice on the numeric terms below plus MSZoning and KitchenQual.
model3 <- lm(
  SalePrice ~ BedroomAbvGr + BsmtFinSF1 + BsmtFullBath + Fireplaces +
    FullBath + MSSubClass + OverallCond + OverallQual + GarageCars +
    GrLivArea + KitchenAbvGr + LotArea + LotFrontage + MasVnrArea +
    MSZoning + KitchenQual,
  data = train
)
summary(model3)
##
## Call:
## lm(formula = SalePrice ~ BedroomAbvGr + BsmtFinSF1 + BsmtFullBath +
## Fireplaces + FullBath + MSSubClass + OverallCond + OverallQual +
## GarageCars + GrLivArea + KitchenAbvGr + LotArea + LotFrontage +
## MasVnrArea + MSZoning + KitchenQual, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -440976 -16444 -1296 13950 264960
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3879.6031 16271.6219 -0.238 0.811591
## BedroomAbvGr -4806.3796 1717.0656 -2.799 0.005207 **
## BsmtFinSF1 11.5493 3.2629 3.540 0.000416 ***
## BsmtFullBath 10827.2228 2715.9942 3.986 0.00007120658 ***
## Fireplaces 4183.6519 1992.7963 2.099 0.035996 *
## FullBath 13706.5597 2818.1929 4.864 0.00000130945 ***
## MSSubClass -180.4528 29.6369 -6.089 0.00000000154 ***
## OverallCond 3190.9905 1011.4604 3.155 0.001647 **
## OverallQual 17293.2606 1306.0582 13.241 < 0.0000000000000002 ***
## GarageCars 15387.7318 1810.9282 8.497 < 0.0000000000000002 ***
## GrLivArea 44.9502 3.7216 12.078 < 0.0000000000000002 ***
## KitchenAbvGr -12162.1167 5135.7184 -2.368 0.018039 *
## LotArea 0.6130 0.1495 4.099 0.00004431207 ***
## LotFrontage -159.2464 55.8814 -2.850 0.004452 **
## MasVnrArea 39.7027 6.5322 6.078 0.00000000164 ***
## MSZoningFV 25024.4371 12719.6049 1.967 0.049373 *
## MSZoningRH 14339.1452 15172.3420 0.945 0.344810
## MSZoningRL 21267.8337 11610.8950 1.832 0.067248 .
## MSZoningRM 8125.3207 11782.5295 0.690 0.490577
## KitchenQualFa -47039.5329 8294.8440 -5.671 0.00000001787 ***
## KitchenQualGd -47024.0937 4481.5969 -10.493 < 0.0000000000000002 ***
## KitchenQualTA -57289.6461 5179.4156 -11.061 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35500 on 1173 degrees of freedom
## (265 observations deleted due to missingness)
## Multiple R-squared: 0.821, Adjusted R-squared: 0.8178
## F-statistic: 256.2 on 21 and 1173 DF, p-value: < 0.00000000000000022
Model 4: Backward elimination of numerical variables
# Model 4: reduced numeric terms combined with a wider set of categorical
# predictors.
# NOTE(review): the summary reports 1199 observations deleted due to
# missingness, so this fit rests on 261 rows. Presumably a sparsely populated
# factor such as Fence is responsible -- confirm which column drives the loss
# before comparing this model's fit statistics with the other models.
model4 <- lm(
  SalePrice ~ BsmtFinSF1 + BsmtFullBath + FullBath + OverallCond +
    OverallQual + GarageCars + GrLivArea + LotArea + LowQualFinSF +
    BldgType + BsmtQual + CentralAir + ExterQual + Fence + MSZoning +
    PavedDrive + GarageQual + KitchenQual,
  data = train
)
summary(model4)
##
## Call:
## lm(formula = SalePrice ~ BsmtFinSF1 + BsmtFullBath + FullBath +
## OverallCond + OverallQual + GarageCars + GrLivArea + LotArea +
## LowQualFinSF + BldgType + BsmtQual + CentralAir + ExterQual +
## Fence + MSZoning + PavedDrive + GarageQual + KitchenQual,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -117542 -10286 1353 9303 117542
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 232416.8187 51582.3562 4.506 0.000010652096255 ***
## BsmtFinSF1 29.8631 4.9124 6.079 0.000000005154209 ***
## BsmtFullBath 3319.0743 3310.0505 1.003 0.317075
## FullBath 5266.0252 4292.6824 1.227 0.221206
## OverallCond 5621.0401 1333.0414 4.217 0.000035984393215 ***
## OverallQual 13513.1748 2138.7685 6.318 0.000000001410005 ***
## GarageCars 5890.7078 2903.4117 2.029 0.043652 *
## GrLivArea 49.0802 4.7324 10.371 < 0.0000000000000002 ***
## LotArea 2.1263 0.6166 3.448 0.000674 ***
## LowQualFinSF -146.4405 40.9690 -3.574 0.000430 ***
## BldgType2fmCon -11602.7199 14263.8116 -0.813 0.416831
## BldgTypeDuplex -6239.9808 15992.7275 -0.390 0.696777
## BldgTypeTwnhs 14077.4161 17150.4634 0.821 0.412622
## BldgTypeTwnhsE -6790.1434 12224.4039 -0.555 0.579136
## BsmtQualFa -136193.4729 20313.0941 -6.705 0.000000000162060 ***
## BsmtQualGd -132630.4809 16969.8083 -7.816 0.000000000000211 ***
## BsmtQualTA -134184.9244 17418.8530 -7.703 0.000000000000424 ***
## CentralAirY 31826.3091 8218.7009 3.872 0.000141 ***
## ExterQualFa -41882.0580 35780.0257 -1.171 0.243026
## ExterQualGd -29207.9931 22534.4009 -1.296 0.196257
## ExterQualTA -45282.4288 22333.9747 -2.028 0.043794 *
## FenceGdWo 8986.4784 4748.7041 1.892 0.059726 .
## FenceMnPrv 11498.5675 3834.8631 2.998 0.003020 **
## FenceMnWw 1726.9122 7810.1918 0.221 0.825208
## MSZoningFV 64036.7263 32978.2128 1.942 0.053417 .
## MSZoningRH 18795.8988 28495.0453 0.660 0.510175
## MSZoningRL 26968.1600 24293.2575 1.110 0.268142
## MSZoningRM 18144.3943 24154.9149 0.751 0.453340
## PavedDriveP -7951.5849 11661.3980 -0.682 0.496024
## PavedDriveY -35.1946 7606.1544 -0.005 0.996312
## GarageQualFa -165401.3850 34063.2151 -4.856 0.000002250613782 ***
## GarageQualGd -123624.4199 35461.1682 -3.486 0.000589 ***
## GarageQualPo -129634.6776 40257.0505 -3.220 0.001471 **
## GarageQualTA -161529.4237 33170.2442 -4.870 0.000002111226286 ***
## KitchenQualFa -31529.6181 15423.4588 -2.044 0.042097 *
## KitchenQualGd -32851.8356 10141.0914 -3.239 0.001379 **
## KitchenQualTA -33324.1276 10277.1951 -3.243 0.001365 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21310 on 224 degrees of freedom
## (1199 observations deleted due to missingness)
## Multiple R-squared: 0.897, Adjusted R-squared: 0.8805
## F-statistic: 54.19 on 36 and 224 DF, p-value: < 0.00000000000000022
# In-sample accuracy of the four candidate models: predict on the training
# frame, take residuals against the observed SalePrice, and summarise each
# model by its root-mean-squared error.

# RMSE of a residual vector; NA residuals are skipped.
calc_rmse <- function(resid) {
  sqrt(mean(resid^2, na.rm = TRUE))
}

# Model 1
train_pred1 <- predict(model1, train)
res1 <- train$SalePrice - train_pred1
rmse1 <- calc_rmse(res1)

# Model 2
train_pred2 <- predict(model2, train)
res2 <- train$SalePrice - train_pred2
rmse2 <- calc_rmse(res2)

# Model 3
train_pred3 <- predict(model3, train)
res3 <- train$SalePrice - train_pred3
rmse3 <- calc_rmse(res3)

# Model 4
train_pred4 <- predict(model4, train)
res4 <- train$SalePrice - train_pred4
rmse4 <- calc_rmse(res4)

paste("")
## [1] ""
paste("RMSE for Model 1 :", round(rmse1, 2))
## [1] "RMSE for Model 1 : 38477.99"
paste("RMSE for Model 2:", round(rmse2, 2))
## [1] "RMSE for Model 2: 43062.57"
paste("RMSE for Model 3:", round(rmse3, 2))
## [1] "RMSE for Model 3: 35170.76"
paste("RMSE for Model 4:", round(rmse4, 2))
## [1] "RMSE for Model 4: 19742.29"

# Mean observed sale price, printed for scale next to the RMSE values.
trainmean <- mean(train$SalePrice)
paste("Mean of Sales Price from Training Data is", round(trainmean, 2))
## [1] "Mean of Sales Price from Training Data is 180921.2"
# Compare the distribution of each model's in-sample predictions with the
# observed sale prices. Five histograms are drawn, so a 2 x 3 grid is used;
# a 2 x 2 grid only holds four and pushes the last plot onto a separate page.
old_par <- par(mfrow = c(2, 3))
hist(train_pred1)
hist(train_pred2)
hist(train_pred3)
hist(train_pred4)
hist(train$SalePrice)
par(old_par)  # restore the graphics layout that was active before
# Score the Kaggle test set with each of the four models and write one
# submission file (columns: Id, SalePrice) per model.
# NOTE(review): the input path is machine-specific; adjust it when running
# elsewhere.
test <- read.csv("C:/Users/User/Desktop/MSDS/DATA605/test.csv",
                 stringsAsFactors = TRUE, na.strings = "NA")

# Build a two-column submission frame from row ids and raw predictions.
# Missing predictions are replaced by the mean of the non-missing ones so
# that every Id receives a SalePrice.
make_submission <- function(ids, pred) {
  pred <- unname(pred)
  pred[is.na(pred)] <- mean(pred, na.rm = TRUE)
  data.frame(Id = ids, SalePrice = pred)
}

# predict() has no `na.rm` argument (it was silently ignored), so it is not
# passed here. The raw predictions, including any NA, are kept on `test`.
test$pred1 <- predict(model1, newdata = test)
test$pred2 <- predict(model2, newdata = test)
test$pred3 <- predict(model3, newdata = test)
test$pred4 <- predict(model4, newdata = test)

kaggle1 <- make_submission(test$Id, test$pred1)
kaggle2 <- make_submission(test$Id, test$pred2)
kaggle3 <- make_submission(test$Id, test$pred3)
kaggle4 <- make_submission(test$Id, test$pred4)

# row.names = FALSE keeps the files to exactly the Id and SalePrice columns;
# without it write.csv() prepends an unnamed column of row indices.
write.csv(kaggle1, file = "kaggle1.csv", row.names = FALSE)
write.csv(kaggle2, file = "kaggle2.csv", row.names = FALSE)
write.csv(kaggle3, file = "kaggle3.csv", row.names = FALSE)
write.csv(kaggle4, file = "kaggle4.csv", row.names = FALSE)
Caption for the picture.