X = Lot Area; Y = Sale Price

# Print numbers in fixed notation rather than scientific notation.
options(scipen = 999)

# Study variables pulled from the training set: lot area (X) and sale price (Y).
X <- train[["LotArea"]]
Y <- train[["SalePrice"]]
plot(x = X, y = Y, xlab = 'Lot Area', ylab = 'Sales Price')

###Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

## [1] "First Quantile of Lot Area: "
##    25% 
## 7553.5
## [1] "First Quantile Sales Price: "
##    25% 
## 129975
  1. P(X>x | Y>y)
# P(X > x | Y > y): conditional probability that lot area exceeds its 1st
# quartile GIVEN that sale price exceeds its 1st quartile. In probability
# notation "|" means "given", not logical OR, so the numerator is the joint
# count and the denominator is the number of rows with Y > y (not all rows).
P1 <- nrow(subset(train, X > x & Y > y))
P_1 <- P1 / nrow(subset(train, Y > y))
P_1
## [1] 0.8200913
  1. P(X>x, Y>y)
# Joint probability: share of all rows where both variables exceed their
# thresholds. Rows where the comparison is NA are not counted.
both_above <- X > x & Y > y
P2 <- sum(both_above, na.rm = TRUE)
P_2 <- P2 / nrow(train)
P_2
## [1] 0.6150685
  1. P(X<x | Y>y)
# P(X < x | Y > y): conditional probability that lot area is below its 1st
# quartile GIVEN that sale price exceeds its 1st quartile. "|" is "given",
# not logical OR: divide the joint count by the number of rows with Y > y.
P3 <- nrow(subset(train, X < x & Y > y))
P_3 <- P3 / nrow(subset(train, Y > y))
P_3
## [1] 0.1799087
# Cell counts of the 2x2 split of the data at the thresholds x and y.
x_low <- X <= x
y_low <- Y <= y
p1 <- sum(x_low & y_low, na.rm = TRUE)
p2 <- sum(x_low & Y > y, na.rm = TRUE)
p3 <- sum(X > x & y_low, na.rm = TRUE)
p4 <- sum(X > x & Y > y, na.rm = TRUE)

# Assemble the table column by column: row labels, X <= x, X > x, row totals.
# Mixing labels with counts makes every cell a character string.
t <- cbind(
  c('Y <= 1st Q', " Y > 1st Q", "Total"),
  c(p1, p2, p1 + p2),
  c(p3, p4, p3 + p4),
  c(p1 + p3, p2 + p4, p1 + p2 + p3 + p4)
)

t1 <- data.frame(t)
colnames(t1) <- c("x/y", "X <= 1st Q", "X > 1st Q", "Total")
t1
##          x/y X <= 1st Q X > 1st Q Total
## 1 Y <= 1st Q        168       197   365
## 2  Y > 1st Q        197       898  1095
## 3      Total        365      1095  1460

Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 1st quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does P(AB)=P(A)P(B)? Check mathematically, and then evaluate by running a Chi Square test for association.

# A and B count the rows above the X threshold and above the Y threshold.
A <- sum(X > x, na.rm = TRUE)
B <- sum(Y > y, na.rm = TRUE)

rows <- nrow(train)

# Compare the joint probability with the product of the two marginals.
paste('P(AB) = ')
## [1] "P(AB) = "
P_2
## [1] 0.6150685
paste('P(A) * P(B) = ')
## [1] "P(A) * P(B) = "
(A / rows) * (B / rows)
## [1] 0.5625
# Add two logical columns to train: whether each row's lot area exceeds x and
# whether its sale price exceeds y.
train$Xgreaterx <- train$LotArea > x
train$Ygreatery <- train$SalePrice > y


# Chi-squared test of association on the 2x2 cross-tabulation of the two flags.
# NOTE(review): the result is stored in a variable named `c`, which shadows
# base::c as an object name.
c <- chisq.test(table(train$Xgreaterx, train$Ygreatery))
c
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(train$Xgreaterx, train$Ygreatery)
## X-squared = 113.27, df = 1, p-value < 0.00000000000000022

The p-value from our chi-squared test is less than the 0.05 significance level, so we reject the null hypothesis of independence. This suggests that Lot Area is not independent of Sales Price.

Descriptive and Inferential Statistics.

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

Quick analysis of the variables from the training data:

# Univariate summaries for the two study variables. describe() is called via
# psych:: because Hmisc (attached further down in this document) exports its
# own describe(), which masks psych's and prints a different report when these
# lines are re-run in the same session.
paste('Lot Area Overview')
## [1] "Lot Area Overview"
psych::describe(train$LotArea)
##    vars    n     mean      sd median trimmed     mad  min    max  range
## X1    1 1460 10516.83 9981.26 9478.5 9563.28 2962.23 1300 215245 213945
##     skew kurtosis     se
## X1 12.18   202.26 261.22
paste('Sales Price Overview')
## [1] "Sales Price Overview"
psych::describe(train$SalePrice)
##    vars    n     mean      sd median  trimmed     mad   min    max  range
## X1    1 1460 180921.2 79442.5 163000 170783.3 56338.8 34900 755000 720100
##    skew kurtosis      se
## X1 1.88      6.5 2079.11
# Scatterplot of lot area (X) against sale price (Y).
plot(X, Y, xlab = 'Lot Area', ylab = 'Sales Price', main = 'Lot Area vs Sales Price')

H0: correlations between each pairwise set of variables is 0.

HA: correlations between each pairwise set of variables is NOT 0.

# Summary of the third variable. psych:: is spelled out because Hmisc is
# attached immediately after this and masks describe() with its own version.
paste('Overall Quality Overview')
## [1] "Overall Quality Overview"
psych::describe(train$OverallQual)
##    vars    n mean   sd median trimmed  mad min max range skew kurtosis
## X1    1 1460  6.1 1.38      6    6.08 1.48   1  10     9 0.22     0.09
##      se
## X1 0.04
library(Hmisc)
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:plyr':
## 
##     is.discrete, summarize
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Collect the three quantitative variables under the labels used in the report.
data <- data.frame(
  SalesPrice = train$SalePrice,
  LotArea = train$LotArea,
  OverallQual = train$OverallQual
)

# Pairwise Pearson correlations; cor1 keeps the full-precision matrix and only
# the printed copy is rounded to three significant digits.
correlation <- rcorr(as.matrix(data), type = 'pearson')
paste('Correlation Matrix')
## [1] "Correlation Matrix"
cor1 <- correlation$r
print(cor1, digits = 3)
##             SalesPrice LotArea OverallQual
## SalesPrice       1.000   0.264       0.791
## LotArea          0.264   1.000       0.106
## OverallQual      0.791   0.106       1.000

I chose Overall Quality as my third variable. From the correlation matrix, we can see that the correlation between Overall Quality and Sales Price is very high (0.79), whereas Lot Area correlates only weakly with Sales Price (0.26) and with Overall Quality (0.11).

Let's look at the tests at a 0.92 confidence level:

# Test H0: correlation = 0 for each pair of variables with Pearson's
# product-moment test, which also reports a confidence interval for the
# correlation (92% here). A two-sample t.test() on the raw columns compares
# their MEANS (e.g. dollars vs. square feet) and says nothing about how the
# variables are correlated, so cor.test() is the appropriate call.
paste('Lot Area vs Overall Quality')
## [1] "Lot Area vs Overall Quality"
cor.test(data$LotArea, data$OverallQual, method = "pearson", conf.level = 0.92)
paste('Sales Price vs Overall Quality')
## [1] "Sales Price vs Overall Quality"
cor.test(data$SalesPrice, data$OverallQual, method = "pearson", conf.level = 0.92)
paste('Sales Price vs Lot Area')
## [1] "Sales Price vs Lot Area"
cor.test(data$SalesPrice, data$LotArea, method = "pearson", conf.level = 0.92)
# NOTE: test output is not reproduced here; re-run / re-knit to regenerate it.
# Output from a Welch two-sample t-test does not correspond to these calls.

From the analysis above, we reject the null hypothesis that the correlation between each pair of variables is 0. I would not be worried about familywise error because, from the 92% confidence intervals, we can see that none of the intervals includes 0.

Linear Algebra and Correlation.

Invert your 3 x 3 correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

Inverse:

# Precision matrix: invert the 3x3 correlation matrix cor1 with solve().
inv_cor <- solve(cor1)
# Display the inverse.
inv_cor
##             SalesPrice    LotArea OverallQual
## SalesPrice   2.9280384 -0.5334669  -2.2595806
## LotArea     -0.5334669  1.1085153   0.3046752
## OverallQual -2.2595806  0.3046752   2.7550503

Correlation Matrix * Precision Matrix

# Correlation matrix times its inverse; the printed result is the identity
# matrix up to floating-point rounding error.
cor_inv_mult <- cor1 %*% inv_cor
cor_inv_mult
##                            SalesPrice                   LotArea
## SalesPrice  1.00000000000000022204460 0.00000000000000005551115
## LotArea     0.00000000000000008326673 1.00000000000000000000000
## OverallQual 0.00000000000000044408921 0.00000000000000011102230
##             OverallQual
## SalesPrice            0
## LotArea               0
## OverallQual           1

Precision Matrix * Correlation Matrix

# Same product in the opposite order (inverse times correlation matrix); again
# the identity matrix up to floating-point rounding error.
inv_cor_mult <- inv_cor %*% cor1
inv_cor_mult
##                             SalesPrice                 LotArea
## SalesPrice   1.00000000000000022204460 0.000000000000000194289
## LotArea     -0.00000000000000005551115 1.000000000000000000000
## OverallQual  0.00000000000000000000000 0.000000000000000000000
##                          OverallQual
## SalesPrice  0.0000000000000004440892
## LotArea     0.0000000000000000000000
## OverallQual 1.0000000000000000000000

LU Decomposition of Correlation Matrix

# Factor the correlation matrix into lower (L) and upper (U) triangular parts,
# then multiply them back together to confirm they reproduce cor1.
luA <- lu.decomposition(cor1)
L <- luA[["L"]]
U <- luA[["U"]]

paste('The numeric lower triangular matrix')
## [1] "The numeric lower triangular matrix"
print(L)
##           [,1]       [,2] [,3]
## [1,] 1.0000000  0.0000000    0
## [2,] 0.2638434  1.0000000    0
## [3,] 0.7909816 -0.1105879    1
paste('The number upper triangular matrix')
## [1] "The number upper triangular matrix"
print(U)
##      [,1]      [,2]       [,3]
## [1,]    1 0.2638434  0.7909816
## [2,]    0 0.9303867 -0.1028895
## [3,]    0 0.0000000  0.3629698
paste('L * U')
## [1] "L * U"
print(L %*% U)
##           [,1]      [,2]      [,3]
## [1,] 1.0000000 0.2638434 0.7909816
## [2,] 0.2638434 1.0000000 0.1058057
## [3,] 0.7909816 0.1058057 1.0000000

LU Decomposition of Precision Matrix

# Factor the precision matrix (inverse correlation matrix) into L and U, then
# multiply them back together to confirm they reproduce inv_cor.
luA <- lu.decomposition(inv_cor)
L <- luA[["L"]]
U <- luA[["U"]]

paste('The numeric lower triangular matrix')
## [1] "The numeric lower triangular matrix"
print(L)
##            [,1]       [,2] [,3]
## [1,]  1.0000000  0.0000000    0
## [2,] -0.1821926  1.0000000    0
## [3,] -0.7717046 -0.1058057    1
paste('The number upper triangular matrix')
## [1] "The number upper triangular matrix"
print(U)
##          [,1]       [,2]       [,3]
## [1,] 2.928038 -0.5334669 -2.2595806
## [2,] 0.000000  1.0113216 -0.1070036
## [3,] 0.000000  0.0000000  1.0000000
paste('L * U')
## [1] "L * U"
print(L %*% U)
##            [,1]       [,2]       [,3]
## [1,]  2.9280384 -0.5334669 -2.2595806
## [2,] -0.5334669  1.1085153  0.3046752
## [3,] -2.2595806  0.3046752  2.7550503

Calculus-Based Probability & Statistics.

Many times, it makes sense to fit a closed form distribution to data. For the first variable that you selected which is skewed to the right, shift it so that the minimum value is above zero as necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).

Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

# Smallest lot area; used to decide whether the variable needs shifting above zero.
min_x <- range(X)[1]
paste("Minimum value of Lot area is", min_x)
## [1] "Minimum value of Lot area is 1300"

Optimal value of λ:

# Fit an exponential distribution to lot area by maximum likelihood; the fitted
# rate is the estimate of lambda.
# NOTE(review): `exp` and `sample` shadow base function names; the names are
# kept unchanged in case other chunks refer to them.
exp <- fitdistr(X, densfun = "exponential")
l <- exp$estimate

# Seed the RNG so the simulated sample and its histogram are reproducible.
set.seed(1234)
sample <- rexp(1000, l)
paste('optimal value of lambda:', round(l, 8))
## [1] "optimal value of lambda: 0.00009509"

# Summarise the 1000 simulated draws rather than printing every value.
summary(sample)

# Side-by-side histograms: simulated exponential sample vs. observed lot area.
# The previous graphics layout is restored afterwards.
old_par <- par(mfrow = c(1, 2))
hist(sample, main = 'Histogram of Sample Lot Area')
hist(X, main = 'Histogram of Lot Area')
par(old_par)

# 5th and 95th percentiles of the fitted exponential distribution.
cdf5 <- qexp(0.05, rate = l)
paste('The 5th percentile is', round(cdf5, 2))
## [1] "The 5th percentile is 539.44"
cdf95 <- qexp(0.95, rate = l)
paste('The 95th percentile is', round(cdf95, 2))
## [1] "The 95th percentile is 31505.6"

# 95% confidence interval for the mean of the observed data.
ci95 <- CI(X, 0.95)
paste('95% confidence interval from empiracal data assuming normality')
## [1] "95% confidence interval from empiracal data assuming normality"
ci95
##    upper     mean    lower 
## 11029.24 10516.83 10004.42

# Empirical 5th and 95th percentiles of lot area.
e5 <- quantile(X, c(0.05, 0.95))
paste('The 5th percentile of Lot Area', e5[1])
## [1] "The 5th percentile of Lot Area 3311.7"
paste('The 95th percentile of Lot Area', e5[2])
## [1] "The 95th percentile of Lot Area 17401.15"

Since the minimum value was 1300, there was no need to shift the data to the right. The data was still skewed, so using the optimal value of λ from the exponential probability density function, we were able to create sample values. The histogram of the λ-based sample is a better fit than the empirical one because it is more spread out, but there is a significant difference in the 5th and 95th percentiles.

Modeling.

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

I wanted to take some of the numerical data I thought was most relevant to predicting the sale price and then add some categorical data to improve the model.

Step 1 Observe numerical values:

# Numeric columns considered as possible predictors of SalePrice; the response
# is carried along so the subset can be handed straight to lm().
n_values <- select(
  train,
  c(
    "SalePrice", "BedroomAbvGr", "BsmtFinSF1", "BsmtUnfSF", "EnclosedPorch",
    "GarageArea", "GrLivArea", "LotArea", "LotFrontage", "MasVnrArea",
    "MiscVal", "MoSold", "MSSubClass", "OpenPorchSF", "TotalBsmtSF",
    "TotRmsAbvGrd", "OverallQual", "WoodDeckSF", "X1stFlrSF", "X2ndFlrSF",
    "OverallCond", "GarageYrBlt", "YearBuilt", "YearRemodAdd", "YrSold"
  )
)

# One histogram per candidate predictor (the response and the four year
# columns are left out), each facet on its own axis scales.
ggplot(
  gather(
    select(
      n_values,
      c(
        "BedroomAbvGr", "BsmtFinSF1", "BsmtUnfSF", "EnclosedPorch",
        "GarageArea", "GrLivArea", "LotArea", "LotFrontage", "MasVnrArea",
        "MiscVal", "MoSold", "MSSubClass", "OpenPorchSF", "TotalBsmtSF",
        "TotRmsAbvGrd", "OverallQual", "WoodDeckSF", "X1stFlrSF", "X2ndFlrSF",
        "OverallCond"
      )
    )
  ),
  aes(value)
) +
  facet_wrap(~ key, scales = "free") +
  geom_histogram()

Model with just numerical values:

# Baseline model: regress SalePrice on every other column of n_values.
n_model <- lm(SalePrice ~ ., data= n_values)
# Keep the summary object and print it.
n_model_sum <- summary(n_model)
n_model_sum
## 
## Call:
## lm(formula = SalePrice ~ ., data = n_values)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -517713  -17067   -2564   13872  285879 
## 
## Coefficients:
##                    Estimate    Std. Error t value             Pr(>|t|)    
## (Intercept)   -462656.86006 1728293.34532  -0.268             0.788984    
## BedroomAbvGr    -9712.00720    2166.42458  -4.483     0.00000813497929 ***
## BsmtFinSF1          8.42276       7.70004   1.094             0.274257    
## BsmtUnfSF          -9.07850       7.46526  -1.216             0.224209    
## EnclosedPorch       1.48723      20.87798   0.071             0.943224    
## GarageArea         44.42931       8.98165   4.947     0.00000087309373 ***
## GrLivArea          39.63955      28.25052   1.403             0.160857    
## LotArea             0.64563       0.16051   4.022     0.00006156965145 ***
## LotFrontage      -157.68040      61.74859  -2.554             0.010796 *  
## MasVnrArea         33.03158       7.12640   4.635     0.00000399632929 ***
## MiscVal            -1.40886       6.91865  -0.204             0.838678    
## MoSold           -149.16539     431.47826  -0.346             0.729629    
## MSSubClass       -229.95134      32.91701  -6.986     0.00000000000491 ***
## OpenPorchSF         0.03912      19.74854   0.002             0.998420    
## TotalBsmtSF        12.44746       8.66939   1.436             0.151346    
## TotRmsAbvGrd     5131.88005    1456.11245   3.524             0.000442 ***
## OverallQual     21507.70083    1448.03879  14.853 < 0.0000000000000002 ***
## WoodDeckSF         22.58721      10.09019   2.239             0.025387 *  
## X1stFlrSF           9.17077      28.85105   0.318             0.750647    
## X2ndFlrSF           9.79769      28.33629   0.346             0.729586    
## OverallCond      5188.78756    1384.53898   3.748             0.000188 ***
## GarageYrBlt       -66.11328      92.22636  -0.717             0.473614    
## YearBuilt         391.76876      84.40576   4.641     0.00000387694675 ***
## YearRemodAdd      175.56884      87.11665   2.015             0.044114 *  
## YrSold           -302.28437     860.29120  -0.351             0.725375    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37760 on 1096 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.7974, Adjusted R-squared:  0.793 
## F-statistic: 179.7 on 24 and 1096 DF,  p-value: < 0.00000000000000022

Adjusted R-squared is 79%, but there are a lot of variables that are not significant in our model. Backward elimination is performed to improve our model.

# Reduced model: keeps only the twelve predictors that were starred (p < 0.05)
# in the n_model summary.
n_model2 <- lm(SalePrice ~  BedroomAbvGr + GarageArea + LotArea + LotFrontage + MasVnrArea + MSSubClass + TotRmsAbvGrd + OverallQual + WoodDeckSF + OverallCond + YearBuilt + YearRemodAdd, data= n_values)
# Keep the summary object and print it.
n_model_sum2 <- summary(n_model2)
n_model_sum2
## 
## Call:
## lm(formula = SalePrice ~ BedroomAbvGr + GarageArea + LotArea + 
##     LotFrontage + MasVnrArea + MSSubClass + TotRmsAbvGrd + OverallQual + 
##     WoodDeckSF + OverallCond + YearBuilt + YearRemodAdd, data = n_values)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -361429  -21393   -3658   15833  398239 
## 
## Coefficients:
##                   Estimate    Std. Error t value             Pr(>|t|)    
## (Intercept)  -1093205.7564   140657.6554  -7.772   0.0000000000000167 ***
## BedroomAbvGr    -9322.3098     2057.3118  -4.531   0.0000064565922659 ***
## GarageArea         45.9535        7.1360   6.440   0.0000000001737618 ***
## LotArea             1.0858        0.1654   6.564   0.0000000000782179 ***
## LotFrontage        32.6109       61.4556   0.531              0.59577    
## MasVnrArea         56.4118        7.2806   7.748   0.0000000000000200 ***
## MSSubClass       -148.7649       30.4232  -4.890   0.0000011482742454 ***
## TotRmsAbvGrd    13598.1608     1204.2464  11.292 < 0.0000000000000002 ***
## OverallQual     25768.3289     1303.4936  19.769 < 0.0000000000000002 ***
## WoodDeckSF         46.5424       10.3277   4.507   0.0000072419246356 ***
## OverallCond      3735.9077     1301.7480   2.870              0.00418 ** 
## YearBuilt         268.6362       62.2756   4.314   0.0000173964086604 ***
## YearRemodAdd      235.5410       80.3354   2.932              0.00343 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40320 on 1182 degrees of freedom
##   (265 observations deleted due to missingness)
## Multiple R-squared:  0.7673, Adjusted R-squared:  0.765 
## F-statistic: 324.9 on 12 and 1182 DF,  p-value: < 0.00000000000000022

Model 1: Numerical variables only.

# Model 1: same predictors as n_model2 except LotFrontage
# (presumably dropped because it was not significant in n_model2 - confirm).
model1 <- lm(SalePrice ~  BedroomAbvGr + GarageArea + LotArea + MasVnrArea + MSSubClass + TotRmsAbvGrd + OverallQual + WoodDeckSF + OverallCond + YearBuilt + YearRemodAdd, data= n_values)
summary(model1)
## 
## Call:
## lm(formula = SalePrice ~ BedroomAbvGr + GarageArea + LotArea + 
##     MasVnrArea + MSSubClass + TotRmsAbvGrd + OverallQual + WoodDeckSF + 
##     OverallCond + YearBuilt + YearRemodAdd, data = n_values)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -342898  -20933   -3203   15311  401339 
## 
## Coefficients:
##                   Estimate    Std. Error t value             Pr(>|t|)    
## (Intercept)  -1061769.7525   124070.7838  -8.558 < 0.0000000000000002 ***
## BedroomAbvGr    -9815.3094     1794.5857  -5.469  0.00000005317046417 ***
## GarageArea         48.3784        6.2232   7.774  0.00000000000001440 ***
## LotArea             0.8925        0.1065   8.376 < 0.0000000000000002 ***
## MasVnrArea         51.5658        6.3807   8.082  0.00000000000000134 ***
## MSSubClass       -165.5541       24.6467  -6.717  0.00000000002661331 ***
## TotRmsAbvGrd    14096.5574     1034.8157  13.622 < 0.0000000000000002 ***
## OverallQual     25361.8019     1129.0649  22.463 < 0.0000000000000002 ***
## WoodDeckSF         44.9242        8.5735   5.240  0.00000018453810178 ***
## OverallCond      3301.0216     1092.0307   3.023              0.00255 ** 
## YearBuilt         273.7409       55.4733   4.935  0.00000089638723246 ***
## YearRemodAdd      218.5333       70.5734   3.097              0.00200 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38640 on 1440 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.7643, Adjusted R-squared:  0.7625 
## F-statistic: 424.5 on 11 and 1440 DF,  p-value: < 0.00000000000000022

Model 2: adding categorical values to the numeric values.

# Categorical columns considered as possible predictors of SalePrice, together
# with the response. The model below reads its columns from train directly.
c_values <- train %>%
  select(c("SalePrice", "BldgType", "BsmtCond", "BsmtQual", "CentralAir", "Condition1", "Condition2", "Electrical", "Exterior1st", "Exterior2nd", "ExterQual", "Fence", "Foundation", "Functional", "GarageQual", "Heating", "HeatingQC", "HouseStyle", "KitchenQual", "MSZoning", "Neighborhood", "PavedDrive", "RoofMatl", "RoofStyle", "SaleCondition", "SaleType"))

# Model 2: the numeric predictors of model1 plus the categorical predictors.
# SalePrice is the response and must not also be listed as a predictor: on the
# right-hand side it is dropped by model.matrix() with a warning, so it has
# been removed from the formula.
c_model <- lm(
  SalePrice ~ BedroomAbvGr + GarageArea + LotArea + MasVnrArea + MSSubClass +
    TotRmsAbvGrd + OverallQual + WoodDeckSF + OverallCond + YearBuilt +
    YearRemodAdd + BldgType + BsmtCond + BsmtQual + CentralAir + Condition1 +
    Condition2 + Electrical + Exterior1st + Exterior2nd + ExterQual + Fence +
    Foundation + Functional + GarageQual + Heating + HeatingQC + HouseStyle +
    KitchenQual + MSZoning + Neighborhood + PavedDrive + RoofMatl + RoofStyle +
    SaleCondition + SaleType,
  data = train
)
summary(c_model)
## 
## Call:
## lm(formula = SalePrice ~ BedroomAbvGr + GarageArea + LotArea + 
##     MasVnrArea + MSSubClass + TotRmsAbvGrd + OverallQual + WoodDeckSF + 
##     OverallCond + YearBuilt + YearRemodAdd + SalePrice + BldgType + 
##     BsmtCond + BsmtQual + CentralAir + Condition1 + Condition2 + 
##     Electrical + Exterior1st + Exterior2nd + ExterQual + Fence + 
##     Foundation + Functional + GarageQual + Heating + HeatingQC + 
##     HouseStyle + KitchenQual + MSZoning + Neighborhood + PavedDrive + 
##     RoofMatl + RoofStyle + SaleCondition + SaleType, data = train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -48983  -8817      0   7501  55856 
## 
## Coefficients: (5 not defined because of singularities)
##                         Estimate   Std. Error t value          Pr(>|t|)
## (Intercept)         -969444.1004  421735.5976  -2.299          0.022971
## BedroomAbvGr           4822.9137    3483.9079   1.384          0.168411
## GarageArea               36.6264      11.9278   3.071          0.002557
## LotArea                   4.1238       0.6418   6.426 0.000000001825381
## MasVnrArea               32.1231      12.3539   2.600          0.010294
## MSSubClass             -119.0365     683.8900  -0.174          0.862066
## TotRmsAbvGrd           7029.3113    2170.9198   3.238          0.001497
## OverallQual           11440.8670    2498.3262   4.579 0.000010052994650
## WoodDeckSF               27.6732      11.3253   2.443          0.015764
## OverallCond            4377.7991    1627.6347   2.690          0.008003
## YearBuilt               504.4812     168.6033   2.992          0.003264
## YearRemodAdd             64.2865     123.8007   0.519          0.604372
## BldgType2fmCon         6617.3557   91991.1512   0.072          0.942754
## BldgTypeDuplex        11988.8150   16729.2494   0.717          0.474766
## BldgTypeTwnhs          8550.6322   72773.9723   0.117          0.906632
## BldgTypeTwnhsE       -11749.9638   70437.2286  -0.167          0.867752
## BsmtCondGd           -13379.1301   17922.0045  -0.747          0.456579
## BsmtCondTA            -4432.6580   16106.1892  -0.275          0.783549
## BsmtQualFa            -2142.3206   27981.9463  -0.077          0.939080
## BsmtQualGd             6049.6211   20964.3316   0.289          0.773330
## BsmtQualTA             3664.5879   21642.9970   0.169          0.865784
## CentralAirY           20547.3522   11357.7110   1.809          0.072534
## Condition1Feedr       11978.3935    9433.1344   1.270          0.206211
## Condition1Norm         6901.3231    8024.2031   0.860          0.391194
## Condition1PosN        44096.3155   23280.5696   1.894          0.060228
## Condition1RRAe         2295.6171   21581.0002   0.106          0.915436
## Condition1RRAn        -1147.3431   15939.8669  -0.072          0.942719
## Condition1RRNn         7267.6767   29494.6404   0.246          0.805721
## Condition2Norm        18753.3899   22387.5123   0.838          0.403613
## Condition2RRNn         8420.3959   31213.8136   0.270          0.787730
## ElectricalFuseF       11995.2832   32370.3050   0.371          0.711510
## ElectricalSBrkr       13371.1318    7339.3824   1.822          0.070570
## Exterior1stBrkFace    -6592.9011   34154.4682  -0.193          0.847208
## Exterior1stCemntBd   -61669.7765   27353.9493  -2.255          0.025684
## Exterior1stHdBoard   -33681.2846   33609.4141  -1.002          0.317969
## Exterior1stImStucc  -410087.4617   51492.7483  -7.964 0.000000000000472
## Exterior1stMetalSd   -29012.2733   35277.9979  -0.822          0.412224
## Exterior1stPlywood   -21498.8463   33405.4866  -0.644          0.520884
## Exterior1stStucco    -33659.2707   37604.3043  -0.895          0.372243
## Exterior1stVinylSd   -10631.0356   41093.8735  -0.259          0.796238
## Exterior1stWd Sdng   -46753.6043   34717.0641  -1.347          0.180206
## Exterior1stWdShing   -24055.5677   35330.0621  -0.681          0.497048
## Exterior2ndAsphShn   -16771.1156   43837.8355  -0.383          0.702605
## Exterior2ndBrkFace     3373.9999   36829.8353   0.092          0.927136
## Exterior2ndCmentBd            NA           NA      NA                NA
## Exterior2ndHdBoard     1386.5865   35237.4945   0.039          0.968666
## Exterior2ndImStucc   386209.9840   57808.2982   6.681 0.000000000490852
## Exterior2ndMetalSd    -2650.3075   36953.9167  -0.072          0.942926
## Exterior2ndPlywood    -3766.9258   34449.6063  -0.109          0.913081
## Exterior2ndStucco     18293.6126   42285.9614   0.433          0.665945
## Exterior2ndVinylSd   -16618.3618   43204.7259  -0.385          0.701074
## Exterior2ndWd Sdng    20758.6547   36574.0598   0.568          0.571211
## Exterior2ndWd Shng    19540.7221   35695.6179   0.547          0.584939
## ExterQualFa         -141359.8740   61802.8463  -2.287          0.023647
## ExterQualGd         -136158.6722   40535.0428  -3.359          0.001003
## ExterQualTA         -146724.1147   42160.4931  -3.480          0.000665
## FenceGdWo              8777.6878    5066.5092   1.732          0.085342
## FenceMnPrv             7553.7294    4177.1299   1.808          0.072653
## FenceMnWw              5401.4939    7829.9387   0.690          0.491406
## FoundationCBlock      -1708.9255    7883.4508  -0.217          0.828694
## FoundationPConc       -5023.3596    7796.7441  -0.644          0.520420
## FoundationStone      -56981.2915   23837.9147  -2.390          0.018134
## FoundationWood       -26160.8164   24300.9587  -1.077          0.283501
## FunctionalMin1        19401.6798   14856.1605   1.306          0.193660
## FunctionalMin2        15322.0112   16398.7331   0.934          0.351703
## FunctionalMod         38828.6794   25402.0775   1.529          0.128582
## FunctionalTyp         10957.6304   14286.1988   0.767          0.444341
## GarageQualFa        -131121.6547   42008.3080  -3.121          0.002179
## GarageQualGd        -103778.1606   44470.9342  -2.334          0.021009
## GarageQualPo        -165639.2430   63268.0007  -2.618          0.009795
## GarageQualTA        -134703.1010   40926.0919  -3.291          0.001256
## HeatingGasW             440.6257   17051.6042   0.026          0.979420
## HeatingGrav            7334.9425   35895.2652   0.204          0.838376
## HeatingQCFa            2398.5767    8948.7137   0.268          0.789058
## HeatingQCGd           -5671.7493    4645.3228  -1.221          0.224111
## HeatingQCTA           -2242.6157    3862.9867  -0.581          0.562465
## HouseStyle1.5Unf      -2610.5618   23968.3805  -0.109          0.913421
## HouseStyle1Story      -8240.1453   18772.5946  -0.439          0.661364
## HouseStyle2.5Fin              NA           NA      NA                NA
## HouseStyle2.5Unf      -3964.3068   26617.0731  -0.149          0.881812
## HouseStyle2Story      -1831.0116   11773.1117  -0.156          0.876627
## HouseStyleSFoyer      -8391.9427   27793.1485  -0.302          0.763134
## HouseStyleSLvl       -11746.8942   24737.7084  -0.475          0.635613
## KitchenQualFa        -45510.9789   18555.1138  -2.453          0.015380
## KitchenQualGd        -42077.6051   13027.3810  -3.230          0.001536
## KitchenQualTA        -50128.5545   12823.2891  -3.909          0.000143
## MSZoningFV            67535.3491   46250.4615   1.460          0.146426
## MSZoningRH            91059.1678   36450.5299   2.498          0.013616
## MSZoningRL            99328.2316   32192.6765   3.085          0.002441
## MSZoningRM            83881.5531   31570.5972   2.657          0.008782
## NeighborhoodBrkSide  -23576.9602   30090.8436  -0.784          0.434614
## NeighborhoodClearCr  -18237.4600   33500.2461  -0.544          0.587016
## NeighborhoodCollgCr  -50695.5925   30307.5364  -1.673          0.096572
## NeighborhoodCrawfor  -16241.5395   32036.5045  -0.507          0.612957
## NeighborhoodEdwards  -44828.2352   30709.0835  -1.460          0.146546
## NeighborhoodGilbert  -44739.4853   35717.0849  -1.253          0.212393
## NeighborhoodIDOTRR   -22149.1877   30135.1331  -0.735          0.463546
## NeighborhoodMeadowV   17119.3334   31656.3851   0.541          0.589496
## NeighborhoodMitchel  -46229.2539   30742.9893  -1.504          0.134855
## NeighborhoodNAmes    -50905.1783   30364.0016  -1.676          0.095826
## NeighborhoodNoRidge           NA           NA      NA                NA
## NeighborhoodNWAmes   -52229.3562   30089.6480  -1.736          0.084755
## NeighborhoodOldTown  -28957.0667   29369.3755  -0.986          0.325817
## NeighborhoodSawyer   -50431.3146   30613.9371  -1.647          0.101686
## NeighborhoodSawyerW  -36217.4504   28496.0161  -1.271          0.205805
## NeighborhoodSomerst           NA           NA      NA                NA
## NeighborhoodSWISU    -59129.8107   33205.1023  -1.781          0.077077
## NeighborhoodTimber    24541.4211   41260.3311   0.595          0.552921
## NeighborhoodVeenker  -12737.8979   37108.3491  -0.343          0.731905
## PavedDriveP          -23076.9493   14218.6503  -1.623          0.106791
## PavedDriveY            7061.7827   10340.1463   0.683          0.495745
## RoofMatlTar&Grv       53044.1068   35731.3797   1.485          0.139871
## RoofMatlWdShngl       -8642.8551   20556.7269  -0.420          0.674796
## RoofStyleGable        23732.3204   28278.6878   0.839          0.402741
## RoofStyleGambrel      48710.5440   31370.0235   1.553          0.122688
## RoofStyleHip          19940.1765   28741.0765   0.694          0.488941
## RoofStyleMansard              NA           NA      NA                NA
## SaleConditionAlloca  -26618.4696   23290.1662  -1.143          0.254987
## SaleConditionFamily   -1654.0488    9994.0119  -0.166          0.868781
## SaleConditionNormal   10341.2536    5233.8839   1.976          0.050099
## SaleTypeConLI        -64066.4768   28180.7565  -2.273          0.024491
## SaleTypeCWD          -13110.2459   20715.4994  -0.633          0.527828
## SaleTypeWD             -215.0227    7439.3828  -0.029          0.976982
##                        
## (Intercept)         *  
## BedroomAbvGr           
## GarageArea          ** 
## LotArea             ***
## MasVnrArea          *  
## MSSubClass             
## TotRmsAbvGrd        ** 
## OverallQual         ***
## WoodDeckSF          *  
## OverallCond         ** 
## YearBuilt           ** 
## YearRemodAdd           
## BldgType2fmCon         
## BldgTypeDuplex         
## BldgTypeTwnhs          
## BldgTypeTwnhsE         
## BsmtCondGd             
## BsmtCondTA             
## BsmtQualFa             
## BsmtQualGd             
## BsmtQualTA             
## CentralAirY         .  
## Condition1Feedr        
## Condition1Norm         
## Condition1PosN      .  
## Condition1RRAe         
## Condition1RRAn         
## Condition1RRNn         
## Condition2Norm         
## Condition2RRNn         
## ElectricalFuseF        
## ElectricalSBrkr     .  
## Exterior1stBrkFace     
## Exterior1stCemntBd  *  
## Exterior1stHdBoard     
## Exterior1stImStucc  ***
## Exterior1stMetalSd     
## Exterior1stPlywood     
## Exterior1stStucco      
## Exterior1stVinylSd     
## Exterior1stWd Sdng     
## Exterior1stWdShing     
## Exterior2ndAsphShn     
## Exterior2ndBrkFace     
## Exterior2ndCmentBd     
## Exterior2ndHdBoard     
## Exterior2ndImStucc  ***
## Exterior2ndMetalSd     
## Exterior2ndPlywood     
## Exterior2ndStucco      
## Exterior2ndVinylSd     
## Exterior2ndWd Sdng     
## Exterior2ndWd Shng     
## ExterQualFa         *  
## ExterQualGd         ** 
## ExterQualTA         ***
## FenceGdWo           .  
## FenceMnPrv          .  
## FenceMnWw              
## FoundationCBlock       
## FoundationPConc        
## FoundationStone     *  
## FoundationWood         
## FunctionalMin1         
## FunctionalMin2         
## FunctionalMod          
## FunctionalTyp          
## GarageQualFa        ** 
## GarageQualGd        *  
## GarageQualPo        ** 
## GarageQualTA        ** 
## HeatingGasW            
## HeatingGrav            
## HeatingQCFa            
## HeatingQCGd            
## HeatingQCTA            
## HouseStyle1.5Unf       
## HouseStyle1Story       
## HouseStyle2.5Fin       
## HouseStyle2.5Unf       
## HouseStyle2Story       
## HouseStyleSFoyer       
## HouseStyleSLvl         
## KitchenQualFa       *  
## KitchenQualGd       ** 
## KitchenQualTA       ***
## MSZoningFV             
## MSZoningRH          *  
## MSZoningRL          ** 
## MSZoningRM          ** 
## NeighborhoodBrkSide    
## NeighborhoodClearCr    
## NeighborhoodCollgCr .  
## NeighborhoodCrawfor    
## NeighborhoodEdwards    
## NeighborhoodGilbert    
## NeighborhoodIDOTRR     
## NeighborhoodMeadowV    
## NeighborhoodMitchel    
## NeighborhoodNAmes   .  
## NeighborhoodNoRidge    
## NeighborhoodNWAmes  .  
## NeighborhoodOldTown    
## NeighborhoodSawyer     
## NeighborhoodSawyerW    
## NeighborhoodSomerst    
## NeighborhoodSWISU   .  
## NeighborhoodTimber     
## NeighborhoodVeenker    
## PavedDriveP            
## PavedDriveY            
## RoofMatlTar&Grv        
## RoofMatlWdShngl        
## RoofStyleGable         
## RoofStyleGambrel       
## RoofStyleHip           
## RoofStyleMansard       
## SaleConditionAlloca    
## SaleConditionFamily    
## SaleConditionNormal .  
## SaleTypeConLI       *  
## SaleTypeCWD            
## SaleTypeWD             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18470 on 143 degrees of freedom
##   (1199 observations deleted due to missingness)
## Multiple R-squared:  0.9506, Adjusted R-squared:  0.9102 
## F-statistic: 23.53 on 117 and 143 DF,  p-value: < 0.00000000000000022

Goal: Perform backward elimination in multiple ways to build multiple models and select the best fit.

# Baseline model: SalePrice regressed on the predictors listed below, each of
# which enters as a single numeric term. Rows with missing values in any model
# variable are dropped by lm()'s default NA handling.
n_model <- lm(
  SalePrice ~ BedroomAbvGr + BsmtFinSF1 + BsmtFinSF2 + BsmtFullBath +
    BsmtHalfBath + BsmtUnfSF + EnclosedPorch + Fireplaces + FullBath +
    GarageArea + GarageCars + GarageYrBlt + GrLivArea + HalfBath +
    KitchenAbvGr + LotArea + LotFrontage + LowQualFinSF + MasVnrArea +
    MiscVal + MoSold + MSSubClass + OpenPorchSF + OverallCond +
    OverallQual + PoolArea + ScreenPorch + TotalBsmtSF + TotRmsAbvGrd +
    WoodDeckSF + X1stFlrSF + X2ndFlrSF + X3SsnPorch + YearBuilt +
    YearRemodAdd + YrSold,
  data = train
)

# Print the coefficient table and fit statistics.
summary(n_model)
## 
## Call:
## lm(formula = SalePrice ~ BedroomAbvGr + BsmtFinSF1 + BsmtFinSF2 + 
##     BsmtFullBath + BsmtHalfBath + BsmtUnfSF + EnclosedPorch + 
##     Fireplaces + FullBath + GarageArea + GarageCars + GarageYrBlt + 
##     GrLivArea + HalfBath + KitchenAbvGr + LotArea + LotFrontage + 
##     LowQualFinSF + MasVnrArea + MiscVal + MoSold + MSSubClass + 
##     OpenPorchSF + OverallCond + OverallQual + PoolArea + ScreenPorch + 
##     TotalBsmtSF + TotRmsAbvGrd + WoodDeckSF + X1stFlrSF + X2ndFlrSF + 
##     X3SsnPorch + YearBuilt + YearRemodAdd + YrSold, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -442865  -16873   -2581   14998  318042 
## 
## Coefficients: (2 not defined because of singularities)
##                   Estimate   Std. Error t value             Pr(>|t|)    
## (Intercept)   -323175.5898 1700608.6185  -0.190             0.849317    
## BedroomAbvGr   -10232.5007    2154.0411  -4.750   0.0000023024581154 ***
## BsmtFinSF1         17.3892       5.8353   2.980             0.002947 ** 
## BsmtFinSF2          8.3616       8.7631   0.954             0.340205    
## BsmtFullBath     8979.9920    3193.9015   2.812             0.005018 ** 
## BsmtHalfBath     2490.1194    5070.9578   0.491             0.623487    
## BsmtUnfSF           5.0056       5.2753   0.949             0.342890    
## EnclosedPorch       7.2332      20.6133   0.351             0.725733    
## Fireplaces       4374.8147    2187.8569   2.000             0.045793 *  
## FullBath         5389.6373    3528.5219   1.527             0.126941    
## GarageArea          6.4882      12.1138   0.536             0.592338    
## GarageCars      16788.4001    3486.6828   4.815   0.0000016803185415 ***
## GarageYrBlt       -49.1431      90.9333  -0.540             0.589011    
## GrLivArea          46.6797       6.0986   7.654   0.0000000000000428 ***
## HalfBath        -1118.5234    3319.8706  -0.337             0.736244    
## KitchenAbvGr   -21931.2788    6704.4025  -3.271             0.001105 ** 
## LotArea             0.5454       0.1573   3.466             0.000548 ***
## LotFrontage      -116.1232      61.2411  -1.896             0.058203 .  
## LowQualFinSF      -12.5260      27.9855  -0.448             0.654539    
## MasVnrArea         31.6049       7.0060   4.511   0.0000071502458891 ***
## MiscVal            -3.8501       6.9549  -0.554             0.579980    
## MoSold           -224.0209     422.6730  -0.530             0.596213    
## MSSubClass       -200.4890      34.4859  -5.814   0.0000000080292914 ***
## OpenPorchSF        -2.3153      19.4782  -0.119             0.905404    
## OverallCond      5227.2069    1367.0842   3.824             0.000139 ***
## OverallQual     18696.5040    1478.4163  12.646 < 0.0000000000000002 ***
## PoolArea          -61.2618      29.8422  -2.053             0.040326 *  
## ScreenPorch        57.9661      20.3986   2.842             0.004572 ** 
## TotalBsmtSF             NA           NA      NA                   NA    
## TotRmsAbvGrd     5439.7807    1485.7761   3.661             0.000263 ***
## WoodDeckSF         21.5457      10.0176   2.151             0.031713 *  
## X1stFlrSF          -0.7679       6.7133  -0.114             0.908948    
## X2ndFlrSF               NA           NA      NA                   NA    
## X3SsnPorch         34.5789      37.4933   0.922             0.356593    
## YearBuilt         316.9684      87.6223   3.617             0.000311 ***
## YearRemodAdd      120.5742      86.6125   1.392             0.164174    
## YrSold           -253.6384     845.3939  -0.300             0.764216    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36790 on 1086 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.8095, Adjusted R-squared:  0.8036 
## F-statistic: 135.7 on 34 and 1086 DF,  p-value: < 0.00000000000000022
# Combined model: the numeric predictors together with the categorical ones.
# na.action = na.omit drops every row that has a missing value in any of the
# variables used by the formula.
c_model <- lm(
  SalePrice ~ BedroomAbvGr + BldgType + BsmtCond + BsmtFinSF1 + BsmtFinSF2 +
    BsmtFullBath + BsmtHalfBath + BsmtQual + BsmtUnfSF + CentralAir +
    Condition1 + Condition2 + Electrical + EnclosedPorch + Exterior1st +
    Exterior2nd + ExterQual + Fence + Fireplaces + Foundation +
    FullBath + Functional + MiscVal + MoSold + MSSubClass +
    MSZoning + Neighborhood + OpenPorchSF + OverallCond + OverallQual +
    PavedDrive + PoolArea + RoofMatl + RoofStyle + SaleCondition +
    SaleType + ScreenPorch + TotalBsmtSF + GarageArea + GarageCars +
    GarageQual + GarageYrBlt + GrLivArea + HalfBath + Heating +
    HeatingQC + HouseStyle + KitchenAbvGr + KitchenQual + LotArea +
    LotFrontage + LowQualFinSF + MasVnrArea + TotRmsAbvGrd + WoodDeckSF +
    X1stFlrSF + X2ndFlrSF + X3SsnPorch + YearBuilt + YearRemodAdd +
    YrSold,
  data = train,
  na.action = na.omit
)

summary(c_model)
# Intermediate elimination steps: these fits are only summarised for
# inspection and are not stored.

# Step 1: the numeric predictors plus a subset of the categorical ones.
summary(lm(
  SalePrice ~ BedroomAbvGr + BsmtFinSF1 + BsmtFinSF2 + BsmtFullBath +
    BsmtHalfBath + BsmtUnfSF + EnclosedPorch + Fireplaces + FullBath +
    GarageArea + GarageCars + GarageYrBlt + GrLivArea + HalfBath +
    KitchenAbvGr + LotArea + LotFrontage + LowQualFinSF + MasVnrArea +
    MiscVal + MoSold + MSSubClass + OpenPorchSF + OverallCond +
    OverallQual + PoolArea + ScreenPorch + TotalBsmtSF + TotRmsAbvGrd +
    WoodDeckSF + X1stFlrSF + X2ndFlrSF + X3SsnPorch + YearBuilt +
    YearRemodAdd + YrSold + Exterior1st + Exterior2nd + Functional +
    MSZoning + GarageQual + KitchenQual + SaleType,
  data = train
))

# Step 2: a reduced set of predictors.
summary(lm(
  SalePrice ~ BsmtFinSF1 + BsmtFullBath + Fireplaces + FullBath +
    GarageCars + GrLivArea + KitchenAbvGr + LotArea + LotFrontage +
    MasVnrArea + MSSubClass + OverallCond + SaleType + Exterior2nd +
    Functional + GarageQual + KitchenQual,
  data = train
))

Model 2: Backward elimination of all non-significant variables

# Model 2: fit kept after the elimination steps above; used later for
# prediction.
model2 <- lm(
  SalePrice ~ BsmtFullBath + Fireplaces + FullBath + KitchenAbvGr +
    LotArea + LotFrontage + MasVnrArea + MSSubClass + OverallCond +
    SaleType + Exterior2nd + GarageQual + KitchenQual,
  data = train
)
summary(model2)
## 
## Call:
## lm(formula = SalePrice ~ BsmtFullBath + Fireplaces + FullBath + 
##     KitchenAbvGr + LotArea + LotFrontage + MasVnrArea + MSSubClass + 
##     OverallCond + SaleType + Exterior2nd + GarageQual + KitchenQual, 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -360851  -21889   -2026   18049  315254 
## 
## Coefficients:
##                        Estimate   Std. Error t value             Pr(>|t|)
## (Intercept)         196307.9116   32682.9563   6.006     0.00000000258758
## BsmtFullBath         19722.4609    2755.7302   7.157     0.00000000000152
## Fireplaces           22060.9194    2388.2352   9.237 < 0.0000000000000002
## FullBath             44203.9477    3197.8832  13.823 < 0.0000000000000002
## KitchenAbvGr        -22490.8825    7249.7334  -3.102              0.00197
## LotArea                  1.0795       0.1868   5.778     0.00000000989532
## LotFrontage            158.0544      68.6271   2.303              0.02146
## MasVnrArea              87.5928       7.8314  11.185 < 0.0000000000000002
## MSSubClass            -116.2123      37.9872  -3.059              0.00227
## OverallCond           3276.5224    1355.6991   2.417              0.01582
## SaleTypeCon          85658.0360   32572.7264   2.630              0.00867
## SaleTypeConLD         5211.8180   19774.4104   0.264              0.79217
## SaleTypeConLI        16077.7086   26795.4970   0.600              0.54862
## SaleTypeConLw          882.8105   23468.0835   0.038              0.97000
## SaleTypeCWD          37722.8259   23620.1161   1.597              0.11054
## SaleTypeNew          30508.3753    9323.4361   3.272              0.00110
## SaleTypeOth          17743.3237   44675.6724   0.397              0.69133
## SaleTypeWD            7578.4084    8026.8175   0.944              0.34531
## Exterior2ndAsphShn   16562.5367   33250.3649   0.498              0.61850
## Exterior2ndBrk Cmn  -10380.9470   20585.6034  -0.504              0.61417
## Exterior2ndBrkFace   -1247.3837   15357.7758  -0.081              0.93528
## Exterior2ndCBlock     -943.1211   45473.1691  -0.021              0.98346
## Exterior2ndCmentBd   16089.2134   13779.6971   1.168              0.24322
## Exterior2ndHdBoard   -2346.8733   12288.4871  -0.191              0.84858
## Exterior2ndImStucc   51960.2210   18881.3255   2.752              0.00602
## Exterior2ndMetalSd   -1532.2999   12115.0201  -0.126              0.89938
## Exterior2ndOther     53166.4987   45791.2098   1.161              0.24587
## Exterior2ndPlywood   -9725.1747   12576.6196  -0.773              0.43953
## Exterior2ndStone      -382.0658   24855.6420  -0.015              0.98774
## Exterior2ndStucco   -23453.4449   15282.4708  -1.535              0.12516
## Exterior2ndVinylSd    9388.7911   12116.9011   0.775              0.43860
## Exterior2ndWd Sdng   -1277.2133   12164.5490  -0.105              0.91640
## Exterior2ndWd Shng   -5948.0868   14260.2391  -0.417              0.67668
## GarageQualFa        -62863.8946   26324.4598  -2.388              0.01711
## GarageQualGd        -37397.9552   28853.8964  -1.296              0.19521
## GarageQualPo        -54655.3857   36379.7852  -1.502              0.13330
## GarageQualTA        -51629.9449   25614.9996  -2.016              0.04409
## KitchenQualFa      -105796.7897   11364.2023  -9.310 < 0.0000000000000002
## KitchenQualGd       -75065.4808    5598.6672 -13.408 < 0.0000000000000002
## KitchenQualTA      -101929.2843    6135.7426 -16.612 < 0.0000000000000002
##                       
## (Intercept)        ***
## BsmtFullBath       ***
## Fireplaces         ***
## FullBath           ***
## KitchenAbvGr       ** 
## LotArea            ***
## LotFrontage        *  
## MasVnrArea         ***
## MSSubClass         ** 
## OverallCond        *  
## SaleTypeCon        ** 
## SaleTypeConLD         
## SaleTypeConLI         
## SaleTypeConLw         
## SaleTypeCWD           
## SaleTypeNew        ** 
## SaleTypeOth           
## SaleTypeWD            
## Exterior2ndAsphShn    
## Exterior2ndBrk Cmn    
## Exterior2ndBrkFace    
## Exterior2ndCBlock     
## Exterior2ndCmentBd    
## Exterior2ndHdBoard    
## Exterior2ndImStucc ** 
## Exterior2ndMetalSd    
## Exterior2ndOther      
## Exterior2ndPlywood    
## Exterior2ndStone      
## Exterior2ndStucco     
## Exterior2ndVinylSd    
## Exterior2ndWd Sdng    
## Exterior2ndWd Shng    
## GarageQualFa       *  
## GarageQualGd          
## GarageQualPo          
## GarageQualTA       *  
## KitchenQualFa      ***
## KitchenQualGd      ***
## KitchenQualTA      ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43850 on 1081 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.7306, Adjusted R-squared:  0.7209 
## F-statistic: 75.16 on 39 and 1081 DF,  p-value: < 0.00000000000000022
# All remaining numerical predictors plus the significant categorical ones.
# The fit is only summarised for inspection and is not stored.
summary(lm(
  SalePrice ~ BedroomAbvGr + BsmtFinSF1 + BsmtFinSF2 + BsmtFullBath +
    BsmtHalfBath + BsmtUnfSF + EnclosedPorch + Fireplaces + FullBath +
    MiscVal + MoSold + MSSubClass + OpenPorchSF + OverallCond +
    OverallQual + PoolArea + GarageArea + GarageCars + GarageYrBlt +
    GrLivArea + HalfBath + KitchenAbvGr + LotArea + LotFrontage +
    LowQualFinSF + MasVnrArea + Exterior1st + Exterior2nd + Functional +
    MSZoning + GarageQual + KitchenQual,
  data = train
))

Model 3: Backward elimination of categorical variables and then numerical variables.

# Model 3: numeric predictors plus the two categorical predictors MSZoning and
# KitchenQual; used later for prediction.
model3 <- lm(
  SalePrice ~ BedroomAbvGr + BsmtFinSF1 + BsmtFullBath + Fireplaces +
    FullBath + MSSubClass + OverallCond + OverallQual + GarageCars +
    GrLivArea + KitchenAbvGr + LotArea + LotFrontage + MasVnrArea +
    MSZoning + KitchenQual,
  data = train
)

summary(model3)
## 
## Call:
## lm(formula = SalePrice ~ BedroomAbvGr + BsmtFinSF1 + BsmtFullBath + 
##     Fireplaces + FullBath + MSSubClass + OverallCond + OverallQual + 
##     GarageCars + GrLivArea + KitchenAbvGr + LotArea + LotFrontage + 
##     MasVnrArea + MSZoning + KitchenQual, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -440976  -16444   -1296   13950  264960 
## 
## Coefficients:
##                  Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)    -3879.6031  16271.6219  -0.238             0.811591    
## BedroomAbvGr   -4806.3796   1717.0656  -2.799             0.005207 ** 
## BsmtFinSF1        11.5493      3.2629   3.540             0.000416 ***
## BsmtFullBath   10827.2228   2715.9942   3.986        0.00007120658 ***
## Fireplaces      4183.6519   1992.7963   2.099             0.035996 *  
## FullBath       13706.5597   2818.1929   4.864        0.00000130945 ***
## MSSubClass      -180.4528     29.6369  -6.089        0.00000000154 ***
## OverallCond     3190.9905   1011.4604   3.155             0.001647 ** 
## OverallQual    17293.2606   1306.0582  13.241 < 0.0000000000000002 ***
## GarageCars     15387.7318   1810.9282   8.497 < 0.0000000000000002 ***
## GrLivArea         44.9502      3.7216  12.078 < 0.0000000000000002 ***
## KitchenAbvGr  -12162.1167   5135.7184  -2.368             0.018039 *  
## LotArea            0.6130      0.1495   4.099        0.00004431207 ***
## LotFrontage     -159.2464     55.8814  -2.850             0.004452 ** 
## MasVnrArea        39.7027      6.5322   6.078        0.00000000164 ***
## MSZoningFV     25024.4371  12719.6049   1.967             0.049373 *  
## MSZoningRH     14339.1452  15172.3420   0.945             0.344810    
## MSZoningRL     21267.8337  11610.8950   1.832             0.067248 .  
## MSZoningRM      8125.3207  11782.5295   0.690             0.490577    
## KitchenQualFa -47039.5329   8294.8440  -5.671        0.00000001787 ***
## KitchenQualGd -47024.0937   4481.5969 -10.493 < 0.0000000000000002 ***
## KitchenQualTA -57289.6461   5179.4156 -11.061 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35500 on 1173 degrees of freedom
##   (265 observations deleted due to missingness)
## Multiple R-squared:  0.821,  Adjusted R-squared:  0.8178 
## F-statistic: 256.2 on 21 and 1173 DF,  p-value: < 0.00000000000000022

Model 4: Backward elimination of numerical variables

# Model 4: reduced numeric predictors combined with categorical predictors;
# used later for prediction.
# NOTE(review): the summary reports 1199 observations deleted due to
# missingness, so this fit rests on only 261 of the 1460 training rows.
# Presumably one of the categorical predictors is NA for most rows -- confirm
# which one before comparing this model with the others.
model4 <- lm(
  SalePrice ~ BsmtFinSF1 + BsmtFullBath + FullBath + OverallCond +
    OverallQual + GarageCars + GrLivArea + LotArea + LowQualFinSF +
    BldgType + BsmtQual + CentralAir + ExterQual + Fence +
    MSZoning + PavedDrive + GarageQual + KitchenQual,
  data = train
)

summary(model4)
## 
## Call:
## lm(formula = SalePrice ~ BsmtFinSF1 + BsmtFullBath + FullBath + 
##     OverallCond + OverallQual + GarageCars + GrLivArea + LotArea + 
##     LowQualFinSF + BldgType + BsmtQual + CentralAir + ExterQual + 
##     Fence + MSZoning + PavedDrive + GarageQual + KitchenQual, 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -117542  -10286    1353    9303  117542 
## 
## Coefficients:
##                    Estimate   Std. Error t value             Pr(>|t|)    
## (Intercept)     232416.8187   51582.3562   4.506    0.000010652096255 ***
## BsmtFinSF1          29.8631       4.9124   6.079    0.000000005154209 ***
## BsmtFullBath      3319.0743    3310.0505   1.003             0.317075    
## FullBath          5266.0252    4292.6824   1.227             0.221206    
## OverallCond       5621.0401    1333.0414   4.217    0.000035984393215 ***
## OverallQual      13513.1748    2138.7685   6.318    0.000000001410005 ***
## GarageCars        5890.7078    2903.4117   2.029             0.043652 *  
## GrLivArea           49.0802       4.7324  10.371 < 0.0000000000000002 ***
## LotArea              2.1263       0.6166   3.448             0.000674 ***
## LowQualFinSF      -146.4405      40.9690  -3.574             0.000430 ***
## BldgType2fmCon  -11602.7199   14263.8116  -0.813             0.416831    
## BldgTypeDuplex   -6239.9808   15992.7275  -0.390             0.696777    
## BldgTypeTwnhs    14077.4161   17150.4634   0.821             0.412622    
## BldgTypeTwnhsE   -6790.1434   12224.4039  -0.555             0.579136    
## BsmtQualFa     -136193.4729   20313.0941  -6.705    0.000000000162060 ***
## BsmtQualGd     -132630.4809   16969.8083  -7.816    0.000000000000211 ***
## BsmtQualTA     -134184.9244   17418.8530  -7.703    0.000000000000424 ***
## CentralAirY      31826.3091    8218.7009   3.872             0.000141 ***
## ExterQualFa     -41882.0580   35780.0257  -1.171             0.243026    
## ExterQualGd     -29207.9931   22534.4009  -1.296             0.196257    
## ExterQualTA     -45282.4288   22333.9747  -2.028             0.043794 *  
## FenceGdWo         8986.4784    4748.7041   1.892             0.059726 .  
## FenceMnPrv       11498.5675    3834.8631   2.998             0.003020 ** 
## FenceMnWw         1726.9122    7810.1918   0.221             0.825208    
## MSZoningFV       64036.7263   32978.2128   1.942             0.053417 .  
## MSZoningRH       18795.8988   28495.0453   0.660             0.510175    
## MSZoningRL       26968.1600   24293.2575   1.110             0.268142    
## MSZoningRM       18144.3943   24154.9149   0.751             0.453340    
## PavedDriveP      -7951.5849   11661.3980  -0.682             0.496024    
## PavedDriveY        -35.1946    7606.1544  -0.005             0.996312    
## GarageQualFa   -165401.3850   34063.2151  -4.856    0.000002250613782 ***
## GarageQualGd   -123624.4199   35461.1682  -3.486             0.000589 ***
## GarageQualPo   -129634.6776   40257.0505  -3.220             0.001471 ** 
## GarageQualTA   -161529.4237   33170.2442  -4.870    0.000002111226286 ***
## KitchenQualFa   -31529.6181   15423.4588  -2.044             0.042097 *  
## KitchenQualGd   -32851.8356   10141.0914  -3.239             0.001379 ** 
## KitchenQualTA   -33324.1276   10277.1951  -3.243             0.001365 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21310 on 224 degrees of freedom
##   (1199 observations deleted due to missingness)
## Multiple R-squared:  0.897,  Adjusted R-squared:  0.8805 
## F-statistic: 54.19 on 36 and 224 DF,  p-value: < 0.00000000000000022

Predict values on the training data using each model

# In-sample evaluation of the four candidate models. For each model:
#   train_predN : predictions on the training data
#   resN        : observed SalePrice minus the prediction
#   rmseN       : root mean squared residual, ignoring NA residuals
# NOTE(review): NA residuals are dropped per model, and the model summaries
# report different numbers of rows lost to missingness, so each RMSE is
# presumably computed over a different subset of rows and the four values are
# not directly comparable -- confirm before ranking the models.

# Model 1
train_pred1 <- predict(model1, train)
res1 <- train$SalePrice - train_pred1
rmse1 <- sqrt(mean(res1^2, na.rm = TRUE))

# Model 2
train_pred2 <- predict(model2, train)
res2 <- train$SalePrice - train_pred2
rmse2 <- sqrt(mean(res2^2, na.rm = TRUE))

# Model 3
train_pred3 <- predict(model3, train)
res3 <- train$SalePrice - train_pred3
rmse3 <- sqrt(mean(res3^2, na.rm = TRUE))

# Model 4
train_pred4 <- predict(model4, train)
res4 <- train$SalePrice - train_pred4
rmse4 <- sqrt(mean(res4^2, na.rm = TRUE))
# Report the in-sample RMSE of each model, rounded to two decimals, followed
# by the mean training sale price for scale.
paste("")
## [1] ""
paste("RMSE for Model 1 :", round(rmse1, digits = 2))
## [1] "RMSE for Model 1 : 38477.99"
paste("RMSE for Model 2:", round(rmse2, digits = 2))
## [1] "RMSE for Model 2: 43062.57"
paste("RMSE for Model 3:", round(rmse3, digits = 2))
## [1] "RMSE for Model 3: 35170.76"
paste("RMSE for Model 4:", round(rmse4, digits = 2))
## [1] "RMSE for Model 4: 19742.29"
trainmean <- mean(train$SalePrice)
paste(
  "Mean of Sales Price from Training Data is",
  round(trainmean, digits = 2)
)
## [1] "Mean of Sales Price from Training Data is 180921.2"
# Histograms of the training-set predictions of the four models, laid out in
# a 2 x 2 grid. par() returns the previous settings; they are restored after
# the grid is drawn so the layout does not leak into later plots.
old_par <- par(mfrow = c(2, 2))
hist(train_pred1)
hist(train_pred2)
hist(train_pred3)
hist(train_pred4)
par(old_par)

# Reference plot: distribution of the observed sale prices.
hist(train$SalePrice)

# Load the test set. Strings are read as factors and the literal "NA" is
# treated as missing.
# NOTE(review): the path is absolute and machine-specific; consider a
# project-relative path so the script runs elsewhere.
test <- read.csv(
  "C:/Users/User/Desktop/MSDS/DATA605/test.csv",
  stringsAsFactors = TRUE,
  na.strings = "NA"
)

# Score the test set with each model and store the predictions as new columns.
# The previous `na.rm = TRUE` argument was removed: predict() for lm fits has
# no such parameter, so it was silently ignored and only suggested NA handling
# that never happened (assumes model1 is an lm fit like models 2-4 -- confirm).
# Rows the model cannot score are handled further down in the script.
test$pred1 <- predict(model1, test)
test$pred2 <- predict(model2, test)
test$pred3 <- predict(model3, test)
test$pred4 <- predict(model4, test)
# Build one two-column submission table (Id, SalePrice) per model.
# as.numeric() strips names and other attributes, so both columns are plain
# numeric vectors and the data frames get default integer row names.
build_submission <- function(ids, prices) {
  data.frame(Id = as.numeric(ids), SalePrice = as.numeric(prices))
}

kaggle1 <- build_submission(test$Id, test$pred1)
kaggle2 <- build_submission(test$Id, test$pred2)
kaggle3 <- build_submission(test$Id, test$pred3)
kaggle4 <- build_submission(test$Id, test$pred4)


# Replace every NA in a data frame with the mean of the non-missing values of
# the same column.
#
# df: a data frame whose columns are numeric.
# Returns the data frame with missing entries filled in.
#
# seq_along() is used instead of 1:ncol() so a zero-column input is a no-op
# rather than an out-of-range index.
impute_column_means <- function(df) {
  for (j in seq_along(df)) {
    missing <- is.na(df[[j]])
    if (any(missing)) {
      df[missing, j] <- mean(df[[j]], na.rm = TRUE)
    }
  }
  df
}

# NOTE(review): this fills unscored rows with the average prediction of the
# same model; for a model that leaves many rows unscored, a large share of the
# submission would be that constant -- confirm this is acceptable.
kaggle1 <- impute_column_means(kaggle1)
kaggle2 <- impute_column_means(kaggle2)
kaggle3 <- impute_column_means(kaggle3)
kaggle4 <- impute_column_means(kaggle4)
# Write one submission file per model into the working directory.
# row.names = FALSE keeps the files to exactly the two data columns; the
# default would prepend an unnamed column of row numbers.
write.csv(kaggle1, file = "kaggle1.csv", row.names = FALSE)
write.csv(kaggle2, file = "kaggle2.csv", row.names = FALSE)
write.csv(kaggle3, file = "kaggle3.csv", row.names = FALSE)
write.csv(kaggle4, file = "kaggle4.csv", row.names = FALSE)
Caption for the picture.

Caption for the picture.