Goal: Predict the housing prices in the test data set using the train data set.

Load libraries and data sets

library(tidyverse)
library(psych)
library(pastecs)
library(imputeMissings)
library(ggplot2)
library(reshape2)

# Read the test and train CSV files from the author's Desktop
data_test <- read_csv(file = "/Users/aarontomat/Desktop/test.csv")
data_train <- read_csv(file = "/Users/aarontomat/Desktop/train.csv")

Looking at the data and separating it into numeric and categorical variables

# First look at the training data: descriptive statistics for the leading variables
data_train %>%
  describe() %>%
  head()
# Summary of the response variable, SalePrice
summary(data_train[["SalePrice"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
# Show the class of the first few columns to see which variables are numeric
# and which are categorical
data_train %>%
  lapply(class) %>%
  head()
## $Id
## [1] "numeric"
## 
## $MSSubClass
## [1] "numeric"
## 
## $MSZoning
## [1] "character"
## 
## $LotFrontage
## [1] "numeric"
## 
## $LotArea
## [1] "numeric"
## 
## $Street
## [1] "character"
# Create a separate numeric data set.
# Predicates must be wrapped in where(): passing a bare predicate such as
# `is.numeric` to select() is deprecated by tidyselect and raises a warning.
N_data_train <- select(data_train, where(is.numeric))
head(N_data_train)
# Turn every NA in the numeric data set into 0
N_data_train[is.na(N_data_train)] <- 0
# Give `1stFlrSF` a syntactic name so it can be written in model formulas
# without backticks
N_data_train <- N_data_train %>% rename(fstFlrSF = "1stFlrSF")

# Create a separate categorical data set; Id and SalePrice are kept alongside
# the character columns
C_data_train <- select(data_train, Id, where(is.character), SalePrice)
head(C_data_train)
# Correlation of every numeric variable with SalePrice
x <- N_data_train %>%
  cor() %>%
  data.frame() %>%
  select(SalePrice)
# Variables whose correlation with SalePrice is below -0.5
xn <- filter(x, SalePrice < -.5)
xn
# Variables whose correlation with SalePrice is above 0.5
xp <- filter(x, SalePrice > .5)
xp
# Variables that have a strong correlation related to Sales Price 

#OverallQual    0.7909816
#GrLivArea  0.7086245
#GarageCars 0.6404092
#GarageArea 0.6234314
#TotalBsmtSF    0.6135806
#fstFlrSF   0.6058522
#FullBath   0.5606638
#TotRmsAbvGrd   0.5337232
#YearBuilt  0.5228973
#YearRemodAdd   0.5071010

# Training data restricted to the ten predictors listed above as strongly
# correlated with SalePrice
Cor_data_train <- select(
  N_data_train,
  OverallQual, GrLivArea, GarageCars, GarageArea, TotalBsmtSF,
  fstFlrSF, FullBath, TotRmsAbvGrd, YearBuilt, YearRemodAdd
)

Comparing SalePrice to the highly correlated variables

# SalePrice against OverallQual, the variable most strongly correlated with it
with(N_data_train, cor(SalePrice, OverallQual))
## [1] 0.7909816
with(
  N_data_train,
  plot(
    OverallQual, SalePrice,
    main = "Comparing SalePrice and OverallQual",
    xlab = "OverallQual", ylab = "SalePrice"
  )
)

# Simple linear regression of SalePrice on OverallQual
model1 <- lm(SalePrice ~ OverallQual, data = N_data_train)
model1
## 
## Call:
## lm(formula = SalePrice ~ OverallQual, data = N_data_train)
## 
## Coefficients:
## (Intercept)  OverallQual  
##      -96206        45436
summary(model1)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual, data = N_data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -198152  -29409   -1845   21463  396848 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -96206.1     5756.4  -16.71   <2e-16 ***
## OverallQual  45435.8      920.4   49.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48620 on 1458 degrees of freedom
## Multiple R-squared:  0.6257, Adjusted R-squared:  0.6254 
## F-statistic:  2437 on 1 and 1458 DF,  p-value: < 2.2e-16
# Check GrLivArea against SalePrice, then add it to the model as a second predictor
with(N_data_train, cor(SalePrice, GrLivArea))
## [1] 0.7086245
with(
  N_data_train,
  plot(
    GrLivArea, SalePrice,
    main = "Comparing SalePrice and GrLivArea",
    xlab = "GrLivArea", ylab = "SalePrice"
  )
)

model2 <- lm(SalePrice ~ OverallQual + GrLivArea, data = N_data_train)
summary(model2)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea, data = N_data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -379572  -22266    -386   19895  289501 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -104092.67    5045.37  -20.63   <2e-16 ***
## OverallQual   32849.05     999.20   32.88   <2e-16 ***
## GrLivArea        55.86       2.63   21.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42500 on 1457 degrees of freedom
## Multiple R-squared:  0.7142, Adjusted R-squared:  0.7138 
## F-statistic:  1820 on 2 and 1457 DF,  p-value: < 2.2e-16
# Check GarageCars against SalePrice, then add it to the model as a third predictor
with(N_data_train, cor(SalePrice, GarageCars))
## [1] 0.6404092
with(
  N_data_train,
  plot(
    GarageCars, SalePrice,
    main = "Comparing SalePrice and GarageCars",
    xlab = "GarageCars", ylab = "SalePrice"
  )
)

model3 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars,
  data = N_data_train
)
summary(model3)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars, 
##     data = N_data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -340718  -21675   -2085   19500  300177 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -98832.493   4842.897  -20.41   <2e-16 ***
## OverallQual  27104.826   1072.182   25.28   <2e-16 ***
## GrLivArea       50.674      2.552   19.86   <2e-16 ***
## GarageCars   21298.960   1807.065   11.79   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40620 on 1456 degrees of freedom
## Multiple R-squared:  0.7391, Adjusted R-squared:  0.7385 
## F-statistic:  1375 on 3 and 1456 DF,  p-value: < 2.2e-16
# Check GarageArea against SalePrice, then add it to the model as a fourth predictor
with(N_data_train, cor(SalePrice, GarageArea))
## [1] 0.6234314
with(
  N_data_train,
  plot(
    GarageArea, SalePrice,
    main = "Comparing SalePrice and GarageArea",
    xlab = "GarageArea", ylab = "SalePrice"
  )
)

model4 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea,
  data = N_data_train
)
summary(model4)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + 
##     GarageArea, data = N_data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -372594  -21236   -1594   18625  301129 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -98436.050   4820.467 -20.420  < 2e-16 ***
## OverallQual  26988.854   1067.393  25.285  < 2e-16 ***
## GrLivArea       49.573      2.555  19.402  < 2e-16 ***
## GarageCars   11317.522   3126.297   3.620 0.000305 ***
## GarageArea      41.478     10.627   3.903 9.93e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40420 on 1455 degrees of freedom
## Multiple R-squared:  0.7418, Adjusted R-squared:  0.7411 
## F-statistic:  1045 on 4 and 1455 DF,  p-value: < 2.2e-16
# Check TotalBsmtSF against SalePrice, then add it to the model as a fifth predictor
with(N_data_train, cor(SalePrice, TotalBsmtSF))
## [1] 0.6135806
with(
  N_data_train,
  plot(
    TotalBsmtSF, SalePrice,
    main = "Comparing SalePrice and TotalBsmtSF",
    xlab = "TotalBsmtSF", ylab = "SalePrice"
  )
)

model5 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
    TotalBsmtSF,
  data = N_data_train
)
summary(model5)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + 
##     GarageArea + TotalBsmtSF, data = N_data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -478977  -19915   -1503   16701  287132 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -99072.050   4638.450 -21.359  < 2e-16 ***
## OverallQual  23635.007   1072.532  22.037  < 2e-16 ***
## GrLivArea       45.346      2.489  18.218  < 2e-16 ***
## GarageCars   14544.315   3022.681   4.812 1.65e-06 ***
## GarageArea      17.133     10.468   1.637    0.102    
## TotalBsmtSF     31.501      2.904  10.848  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38900 on 1454 degrees of freedom
## Multiple R-squared:  0.7611, Adjusted R-squared:  0.7603 
## F-statistic: 926.5 on 5 and 1454 DF,  p-value: < 2.2e-16
# Check fstFlrSF against SalePrice, then add it to the model as a sixth predictor
with(N_data_train, cor(SalePrice, fstFlrSF))
## [1] 0.6058522
with(
  N_data_train,
  plot(
    fstFlrSF, SalePrice,
    main = "Comparing SalePrice and fstFlrSF",
    xlab = "fstFlrSF", ylab = "SalePrice"
  )
)

model6 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
    TotalBsmtSF + fstFlrSF,
  data = N_data_train
)
summary(model6)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + 
##     GarageArea + TotalBsmtSF + fstFlrSF, data = N_data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -473373  -19732   -1080   16922  288035 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.027e+05  4.904e+03 -20.932  < 2e-16 ***
## OverallQual  2.400e+04  1.083e+03  22.150  < 2e-16 ***
## GrLivArea    4.312e+01  2.679e+00  16.095  < 2e-16 ***
## GarageCars   1.452e+04  3.019e+03   4.809 1.68e-06 ***
## GarageArea   1.566e+01  1.047e+01   1.495   0.1350    
## TotalBsmtSF  2.439e+01  4.318e+00   5.649 1.94e-08 ***
## fstFlrSF     1.119e+01  5.032e+00   2.223   0.0264 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38840 on 1453 degrees of freedom
## Multiple R-squared:  0.7619, Adjusted R-squared:  0.7609 
## F-statistic:   775 on 6 and 1453 DF,  p-value: < 2.2e-16
# Check FullBath against SalePrice, then add it to the model as a seventh predictor
with(N_data_train, cor(SalePrice, FullBath))
## [1] 0.5606638
with(
  N_data_train,
  plot(
    FullBath, SalePrice,
    main = "Comparing SalePrice and FullBath",
    xlab = "FullBath", ylab = "SalePrice"
  )
)

model7 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
    TotalBsmtSF + fstFlrSF + FullBath,
  data = N_data_train
)
summary(model7)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + 
##     GarageArea + TotalBsmtSF + fstFlrSF + FullBath, data = N_data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -471605  -19887   -1264   16874  288458 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.029e+05  4.923e+03 -20.901  < 2e-16 ***
## OverallQual  2.386e+04  1.108e+03  21.546  < 2e-16 ***
## GrLivArea    4.243e+01  2.940e+00  14.432  < 2e-16 ***
## GarageCars   1.421e+04  3.067e+03   4.632 3.94e-06 ***
## GarageArea   1.630e+01  1.054e+01   1.547   0.1220    
## TotalBsmtSF  2.453e+01  4.325e+00   5.671 1.71e-08 ***
## fstFlrSF     1.112e+01  5.035e+00   2.208   0.0274 *  
## FullBath     1.457e+03  2.529e+03   0.576   0.5646    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38850 on 1452 degrees of freedom
## Multiple R-squared:  0.762,  Adjusted R-squared:  0.7608 
## F-statistic:   664 on 7 and 1452 DF,  p-value: < 2.2e-16
# Check TotRmsAbvGrd against SalePrice, then add it to the model as an eighth predictor
with(N_data_train, cor(SalePrice, TotRmsAbvGrd))
## [1] 0.5337232
with(
  N_data_train,
  plot(
    TotRmsAbvGrd, SalePrice,
    main = "Comparing SalePrice and TotRmsAbvGrd",
    xlab = "TotRmsAbvGrd", ylab = "SalePrice"
  )
)

model8 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
    TotalBsmtSF + fstFlrSF + FullBath + TotRmsAbvGrd,
  data = N_data_train
)
summary(model8)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + 
##     GarageArea + TotalBsmtSF + fstFlrSF + FullBath + TotRmsAbvGrd, 
##     data = N_data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -474264  -19688   -1330   17195  285481 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.003e+05  6.007e+03 -16.704  < 2e-16 ***
## OverallQual   2.378e+04  1.114e+03  21.345  < 2e-16 ***
## GrLivArea     4.468e+01  4.233e+00  10.555  < 2e-16 ***
## GarageCars    1.436e+04  3.074e+03   4.672 3.27e-06 ***
## GarageArea    1.574e+01  1.056e+01   1.490   0.1364    
## TotalBsmtSF   2.425e+01  4.342e+00   5.584 2.80e-08 ***
## fstFlrSF      1.116e+01  5.036e+00   2.216   0.0269 *  
## FullBath      1.655e+03  2.543e+03   0.651   0.5152    
## TotRmsAbvGrd -8.461e+02  1.141e+03  -0.741   0.4586    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38860 on 1451 degrees of freedom
## Multiple R-squared:  0.7621, Adjusted R-squared:  0.7608 
## F-statistic: 580.9 on 8 and 1451 DF,  p-value: < 2.2e-16
# Check YearBuilt against SalePrice, then add it to the model as a ninth predictor
with(N_data_train, cor(SalePrice, YearBuilt))
## [1] 0.5228973
with(
  N_data_train,
  plot(
    YearBuilt, SalePrice,
    main = "Comparing SalePrice and YearBuilt",
    xlab = "YearBuilt", ylab = "SalePrice"
  )
)

model9 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
    TotalBsmtSF + fstFlrSF + FullBath + TotRmsAbvGrd + YearBuilt,
  data = N_data_train
)
summary(model9)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + 
##     GarageArea + TotalBsmtSF + fstFlrSF + FullBath + TotRmsAbvGrd + 
##     YearBuilt, data = N_data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -487024  -19856   -2161   16304  285943 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -7.585e+05  9.145e+04  -8.294 2.46e-16 ***
## OverallQual   2.092e+04  1.164e+03  17.971  < 2e-16 ***
## GrLivArea     5.137e+01  4.263e+00  12.051  < 2e-16 ***
## GarageCars    1.063e+04  3.066e+03   3.469 0.000538 ***
## GarageArea    1.451e+01  1.038e+01   1.397 0.162572    
## TotalBsmtSF   1.925e+01  4.324e+00   4.451 9.19e-06 ***
## fstFlrSF      1.395e+01  4.965e+00   2.810 0.005028 ** 
## FullBath     -5.410e+03  2.685e+03  -2.015 0.044081 *  
## TotRmsAbvGrd -8.664e+01  1.127e+03  -0.077 0.938718    
## YearBuilt     3.454e+02  4.789e+01   7.212 8.83e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38190 on 1450 degrees of freedom
## Multiple R-squared:  0.7703, Adjusted R-squared:  0.7689 
## F-statistic: 540.3 on 9 and 1450 DF,  p-value: < 2.2e-16
# Check YearRemodAdd against SalePrice, then add it to the model as the tenth
# and last predictor
with(N_data_train, cor(SalePrice, YearRemodAdd))
## [1] 0.507101
with(
  N_data_train,
  plot(
    YearRemodAdd, SalePrice,
    main = "Comparing SalePrice and YearRemodAdd",
    xlab = "YearRemodAdd", ylab = "SalePrice"
  )
)

model10 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
    TotalBsmtSF + fstFlrSF + FullBath + TotRmsAbvGrd + YearBuilt +
    YearRemodAdd,
  data = N_data_train
)
summary(model10)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + 
##     GarageArea + TotalBsmtSF + fstFlrSF + FullBath + TotRmsAbvGrd + 
##     YearBuilt + YearRemodAdd, data = N_data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -489958  -19316   -1948   16020  290558 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.186e+06  1.291e+05  -9.187  < 2e-16 ***
## OverallQual   1.960e+04  1.190e+03  16.472  < 2e-16 ***
## GrLivArea     5.130e+01  4.233e+00  12.119  < 2e-16 ***
## GarageCars    1.042e+04  3.044e+03   3.422 0.000639 ***
## GarageArea    1.495e+01  1.031e+01   1.450 0.147384    
## TotalBsmtSF   1.986e+01  4.295e+00   4.625 4.09e-06 ***
## fstFlrSF      1.417e+01  4.930e+00   2.875 0.004097 ** 
## FullBath     -6.791e+03  2.682e+03  -2.532 0.011457 *  
## TotRmsAbvGrd  3.310e+01  1.119e+03   0.030 0.976404    
## YearBuilt     2.682e+02  5.035e+01   5.328 1.15e-07 ***
## YearRemodAdd  2.965e+02  6.363e+01   4.659 3.47e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37920 on 1449 degrees of freedom
## Multiple R-squared:  0.7737, Adjusted R-squared:  0.7721 
## F-statistic: 495.4 on 10 and 1449 DF,  p-value: < 2.2e-16

Prepare the test data set

# Numeric columns of the test set.
# Predicates must be wrapped in where(): passing a bare predicate such as
# `is.numeric` to select() is deprecated by tidyselect and raises a warning.
N_data_test <- select(data_test, where(is.numeric))
head(N_data_test)
# Turn every NA in the numeric data set into 0
N_data_test[is.na(N_data_test)] <- 0
# Same renaming as in the training data so the model formulas find the column
N_data_test <- N_data_test %>% rename(fstFlrSF = "1stFlrSF")

# Test data restricted to the ten predictors used by the regression models
Cor_data_test <- N_data_test %>%
  select(
    OverallQual, GrLivArea, GarageCars, GarageArea, TotalBsmtSF,
    fstFlrSF, FullBath, TotRmsAbvGrd, YearBuilt, YearRemodAdd
  )

# One single-column data frame per predictor, taken from the test data
OverallQual <- select(Cor_data_test, OverallQual)
GrLivArea <- select(Cor_data_test, GrLivArea)
GarageCars <- select(Cor_data_test, GarageCars)
GarageArea <- select(Cor_data_test, GarageArea)
TotalBsmtSF <- select(Cor_data_test, TotalBsmtSF)
fstFlrSF <- select(Cor_data_test, fstFlrSF)
FullBath <- select(Cor_data_test, FullBath)
TotRmsAbvGrd <- select(Cor_data_test, TotRmsAbvGrd)
YearBuilt <- select(Cor_data_test, YearBuilt)
YearRemodAdd <- select(Cor_data_test, YearRemodAdd)

Plug the values from the test data set into the regression models to obtain the predicted housing prices

# Predicted SalePrice for the first test rows under each model.
# predict() evaluates each model's own formula against the test columns, so
# the intercept and slopes no longer have to be matched to columns by hand
# through positional coef(model)[k] indexing.
numeric_models <- list(
  model1 = model1, model2 = model2, model3 = model3, model4 = model4,
  model5 = model5, model6 = model6, model7 = model7, model8 = model8,
  model9 = model9, model10 = model10
)
lapply(
  numeric_models,
  function(model) head(predict(model, newdata = Cor_data_test))
)

# Final predictions come from model10, the model with all ten predictors.
# They are stored as a one-column data frame whose column is named
# `OverallQual` because the step that follows renames that column to SalePrice.
Predictions <- data.frame(
  OverallQual = unname(predict(model10, newdata = Cor_data_test))
)

# Residuals of model10 on the training data, in row order
plot(model10$residuals)

Prediction table for SalePrice using only the numeric variables of the test data set

# Table of predicted prices keyed by house Id.
# The ids come from the test data itself rather than a hard-coded 1461:2919,
# so the table stays aligned with whatever test file was loaded.
Predictions <- Predictions %>%
  rename(SalePrice = OverallQual) %>%
  mutate(Id = data_test$Id) %>%
  select(Id, SalePrice)
# Same table with SalePrice expressed in thousands of dollars, for plotting
PredictionsG <- Predictions %>%
  mutate(SalePrice = SalePrice / 1000)

hist(
  PredictionsG$SalePrice,
  main = "Numeric SalePrice Predictions",
  xlab = "SalePrice in Thousands of Dollars"
)

# Residuals of model10 on the training data, in row order
plot(model10$residuals)

summary(Predictions$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -9996  126204  169288  178277  222010  614315

Changing all the categorical variables to numeric ones

# Recode every categorical variable as a numeric code stored as a string
# ("0", "1", ...). The codes follow the order in which the labels are listed.
#
# NOTE(review): a few labels are spelled here as they appear in the CSV files
# rather than as previously written, because the old spellings never matched
# and left those values un-encoded: MSZoning "C (all)" (was "c"),
# Neighborhood "NAmes" (was "Names") and "NWAmes" (was missing),
# BldgType "Twnhs" (was "TwnhsI"), Exterior2nd "Brk Cmn" / "CmentBd"
# (were "BrkComm" / "CemntBd"). Confirm with unique() on each column.

# Label sets shared by several variables
quality_labels <- c("Ex", "Gd", "TA", "Fa", "Po")
proximity_labels <- c(
  "Artery", "Feedr", "Norm", "RRNn", "RRAn", "PosN", "PosA", "RRNe", "RRAe"
)
basement_finish_labels <- c("GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf")

# Describe how one variable is encoded.
#   labels      character vector of labels, in code order
#   na_as_zero  if TRUE, NA becomes "0" and the labels are numbered from 1;
#               if FALSE, the labels are numbered from 0 and NA stays NA
level_spec <- function(labels, na_as_zero = FALSE) {
  list(labels = labels, na_as_zero = na_as_zero)
}

categorical_level_specs <- list(
  MSZoning = level_spec(c("A", "C (all)", "FV", "I", "RH", "RL", "RP", "RM")),
  Street = level_spec(c("Grvl", "Pave")),
  Alley = level_spec(c("Grvl", "Pave"), na_as_zero = TRUE),
  LotShape = level_spec(c("Reg", "IR1", "IR2", "IR3")),
  LandContour = level_spec(c("Lvl", "Bnk", "HLS", "Low")),
  Utilities = level_spec(c("AllPub", "NoSewr", "NoSeWa", "ELO")),
  LotConfig = level_spec(c("Inside", "Corner", "CulDSac", "FR2", "FR3")),
  LandSlope = level_spec(c("Gtl", "Mod", "Sev")),
  Neighborhood = level_spec(c(
    "Blmngtn", "Blueste", "BrDale", "BrkSide", "ClearCr", "CollgCr",
    "Crawfor", "Edwards", "Gilbert", "IDOTRR", "MeadowV", "Mitchel",
    "NAmes", "NoRidge", "NPkVill", "NridgHt", "NWAmes", "OldTown",
    "SWISU", "Sawyer", "SawyerW", "Somerst", "StoneBr", "Timber",
    "Veenker"
  )),
  Condition1 = level_spec(proximity_labels),
  Condition2 = level_spec(proximity_labels),
  BldgType = level_spec(c("1Fam", "2fmCon", "Duplex", "TwnhsE", "Twnhs")),
  HouseStyle = level_spec(c(
    "1Story", "1.5Fin", "1.5Unf", "2Story", "2.5Fin", "2.5Unf", "SFoyer",
    "SLvl"
  )),
  RoofStyle = level_spec(c(
    "Flat", "Gable", "Gambrel", "Hip", "Mansard", "Shed"
  )),
  RoofMatl = level_spec(c(
    "ClyTile", "CompShg", "Membran", "Metal", "Roll", "Tar&Grv", "WdShake",
    "WdShngl"
  )),
  Exterior1st = level_spec(c(
    "AsbShng", "AsphShn", "BrkComm", "BrkFace", "CBlock", "CemntBd",
    "HdBoard", "ImStucc", "MetalSd", "Other", "Plywood", "PreCast",
    "Stone", "Stucco", "VinylSd", "Wd Sdng", "WdShing"
  )),
  Exterior2nd = level_spec(c(
    "AsbShng", "AsphShn", "Brk Cmn", "BrkFace", "CBlock", "CmentBd",
    "HdBoard", "ImStucc", "MetalSd", "Other", "Plywood", "PreCast",
    "Stone", "Stucco", "VinylSd", "Wd Sdng", "Wd Shng"
  )),
  MasVnrType = level_spec(c("BrkCmn", "BrkFace", "CBlock", "None", "Stone")),
  ExterQual = level_spec(quality_labels),
  ExterCond = level_spec(quality_labels),
  Foundation = level_spec(c(
    "BrkTil", "CBlock", "PConc", "Slab", "Stone", "Wood"
  )),
  BsmtQual = level_spec(quality_labels, na_as_zero = TRUE),
  BsmtCond = level_spec(quality_labels, na_as_zero = TRUE),
  BsmtExposure = level_spec(c("Gd", "Av", "Mn", "No"), na_as_zero = TRUE),
  BsmtFinType1 = level_spec(basement_finish_labels, na_as_zero = TRUE),
  BsmtFinType2 = level_spec(basement_finish_labels, na_as_zero = TRUE),
  Heating = level_spec(c("Floor", "GasA", "GasW", "Grav", "OthW", "Wall")),
  HeatingQC = level_spec(quality_labels),
  CentralAir = level_spec(c("N", "Y")),
  Electrical = level_spec(c("SBrkr", "FuseA", "FuseF", "FuseP", "Mix")),
  KitchenQual = level_spec(quality_labels),
  Functional = level_spec(c(
    "Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal"
  )),
  FireplaceQu = level_spec(quality_labels, na_as_zero = TRUE),
  GarageType = level_spec(
    c("2Types", "Attchd", "Basment", "BuiltIn", "CarPort", "Detchd"),
    na_as_zero = TRUE
  ),
  GarageFinish = level_spec(c("Fin", "RFn", "Unf"), na_as_zero = TRUE),
  GarageQual = level_spec(quality_labels, na_as_zero = TRUE),
  GarageCond = level_spec(quality_labels, na_as_zero = TRUE),
  PavedDrive = level_spec(c("Y", "P", "N")),
  PoolQC = level_spec(c("Ex", "Gd", "TA", "Fa"), na_as_zero = TRUE),
  Fence = level_spec(
    c("GdPrv", "MnPrv", "GdWo", "MnWw"),
    na_as_zero = TRUE
  ),
  MiscFeature = level_spec(
    c("Elev", "Gar2", "Othr", "Shed", "TenC"),
    na_as_zero = TRUE
  ),
  SaleType = level_spec(c(
    "WD", "CWD", "VWD", "New", "COD", "Con", "ConLw", "ConLI", "ConLD", "Oth"
  )),
  SaleCondition = level_spec(c(
    "Normal", "Abnorml", "AdjLand", "Alloca", "Family", "Partial"
  ))
)

# Replace each label in `x` with its code.
#   x           vector of labels (coerced to character)
#   labels      labels in code order
#   na_as_zero  see level_spec()
# Returns a character vector the same length as `x`. Values that are not in
# `labels` are returned unchanged so that unexpected labels stay visible.
encode_levels <- function(x, labels, na_as_zero = FALSE) {
  first_code <- if (na_as_zero) 1L else 0L
  codes <- as.character(seq_along(labels) - 1L + first_code)
  encoded <- as.character(x)
  position <- match(encoded, labels)
  matched <- !is.na(position)
  encoded[matched] <- codes[position[matched]]
  if (na_as_zero) {
    encoded[is.na(x)] <- "0"
  }
  encoded
}

# Apply encode_levels() to every column of `data` named in `specs`.
#   data   data frame holding the categorical columns
#   specs  named list of level_spec() entries, one per column
# Returns `data` with those columns recoded. A column that is missing from
# `data` is skipped with a warning.
encode_categoricals <- function(data, specs = categorical_level_specs) {
  for (column in names(specs)) {
    if (!column %in% names(data)) {
      warning("Column `", column, "` not found; skipping.", call. = FALSE)
      next
    }
    spec <- specs[[column]]
    data[[column]] <- encode_levels(
      data[[column]],
      labels = spec$labels,
      na_as_zero = spec$na_as_zero
    )
  }
  data
}

test <- encode_categoricals(C_data_train)

Turning the categorical data set numeric and looking for variables with a strong enough correlation to be used in the regression model

#Turns entire data set numeric
# Convert column by column. apply() on a data frame first collapses it into a
# single character matrix (numeric columns are formatted as text and parsed
# back), which is avoided here. Values that are not numeric strings become NA
# with a coercion warning, exactly as before.
test = as.data.frame(test)
test[] = lapply(test, as.numeric)
C_data_train2 = test

# Correlation of every column with SalePrice, kept as a one-column data frame.
# Rows whose correlation is NA fail both comparisons below and are dropped by
# filter().
y = data.frame(cor(C_data_train2)) %>% select(SalePrice)
# Strong negative correlations
yc = y %>% filter(SalePrice < -.5)
yc
# Strong positive correlations
yp = y %>% filter(SalePrice > .5)
yp
# Variables with strong enough correlation to SalePrice
#ExterQual  -0.6826392          
#KitchenQual    -0.6595997
# Comparing SalePrice and ExterQual 
cor(C_data_train2$SalePrice,C_data_train2$ExterQual)
## [1] -0.6826392
# Draw the scatter plot, then annotate it. mtext() is a separate annotation
# call that writes into the margin of the current plot; it is not an argument
# of plot(), so it is issued as its own statement once the plot exists.
plot(C_data_train2$ExterQual, C_data_train2$SalePrice,
     main = "Comparing SalePrice and ExterQual",
     xlab = "ExterQual", ylab = "SalePrice")
mtext("EX=0,GD=1,TA=2,FA=3", side = 1)

# Single-predictor regression of SalePrice on the coded ExterQual column.
model1.1 = lm(SalePrice~ExterQual,C_data_train2)
model1.1
## 
## Call:
## lm(formula = SalePrice ~ ExterQual, data = C_data_train2)
## 
## Coefficients:
## (Intercept)    ExterQual  
##      332401       -94432
summary(model1.1)
## 
## Call:
## lm(formula = SalePrice ~ ExterQual, data = C_data_train2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -185969  -33636   -6536   26213  507031 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   332401       4510   73.69   <2e-16 ***
## ExterQual     -94432       2647  -35.67   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 58070 on 1458 degrees of freedom
## Multiple R-squared:  0.466,  Adjusted R-squared:  0.4656 
## F-statistic:  1272 on 1 and 1458 DF,  p-value: < 2.2e-16
# Comparing SalePrice and KitchenQual 
cor(C_data_train2$SalePrice,C_data_train2$KitchenQual)
## [1] -0.6595997
# Draw the scatter plot, then annotate it. mtext() is a separate annotation
# call that writes into the margin of the current plot; it is not an argument
# of plot(), so it is issued as its own statement once the plot exists.
plot(C_data_train2$KitchenQual, C_data_train2$SalePrice,
     main = "Comparing SalePrice and KitchenQual",
     xlab = "KitchenQual", ylab = "SalePrice")
mtext("EX=0,GD=1,TA=2,FA=3", side = 1)

# Two-predictor regression: SalePrice on the coded ExterQual and KitchenQual.
model1.2 = lm(SalePrice~ExterQual+KitchenQual,C_data_train2)
model1.2
## 
## Call:
## lm(formula = SalePrice ~ ExterQual + KitchenQual, data = C_data_train2)
## 
## Coefficients:
## (Intercept)    ExterQual  KitchenQual  
##      339139       -59712       -41948
summary(model1.2)
## 
## Call:
## lm(formula = SalePrice ~ ExterQual + KitchenQual, data = C_data_train2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -179139  -32589   -4345   24181  465572 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   339139       4281   79.23   <2e-16 ***
## ExterQual     -59712       3575  -16.70   <2e-16 ***
## KitchenQual   -41948       3093  -13.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54740 on 1457 degrees of freedom
## Multiple R-squared:  0.5258, Adjusted R-squared:  0.5252 
## F-statistic: 807.9 on 2 and 1457 DF,  p-value: < 2.2e-16
# Residuals of the two-predictor model, plotted in row order.
plot(model1.2$residuals)

Prepare Dataset

# Categorical view of the test set: Id plus every character column.
# where() is the supported way to select columns by predicate.
C_data_test = select(data_test, Id, where(is.character))

# Label -> code table for the two quality columns (Ex=0 ... Po=4).
quality_codes = c(Ex = 0, Gd = 1, TA = 2, Fa = 3, Po = 4)

# Keep only what the categorical model needs. Converting the whole frame with
# as.numeric would turn every other categorical column into NA and raise a
# coercion warning per column.
test = as.data.frame(C_data_test[, c("Id", "ExterQual", "KitchenQual")])

#ExterQual
# Missing or unlisted labels look up as NA.
test$ExterQual = unname(quality_codes[test$ExterQual])

#KitchenQual
test$KitchenQual = unname(quality_codes[test$KitchenQual])

Regression with Categorical Variables

# One-column data frames of the coded test-set predictors. They are kept as
# globals because the combined-model prediction further down reads them.
ExterQual = test %>% select(ExterQual)
KitchenQual = test %>% select(KitchenQual)
#model
# Let predict() apply the fitted coefficients instead of rebuilding the
# regression equation by hand. Rows with a missing predictor come back as NA.
predicted_price = unname(predict(model1.2, newdata = test))
head(predicted_price)

Prediction data table for SalePrice of test data set

# Take the Ids from the test data itself rather than a hard-coded 1461:2919,
# so each prediction stays paired with the row it came from.
Predictions2 = data.frame(Id = as.integer(test$Id), SalePrice = predicted_price)

# Predictions2$SalePrice is in dollars; divide by 1000 so the histogram really
# shows thousands of dollars, as its axis label says.
hist(Predictions2$SalePrice / 1000,main="Categorical SalePrice Predictions",xlab = "Price in Thousand of Dollars")

# Five-number summary (in dollars) of the categorical-model predictions.
summary(Predictions2$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   34160  135819  135819  180979  237479  339139       1

Combination of both Categorical and Numerical Models: preparing the training data

# Training frame for the combined model: the full training set with the two
# quality columns coded Ex=0 ... Po=4 and every column made numeric.
test = as.data.frame(data_train)

quality_codes = c(Ex = 0, Gd = 1, TA = 2, Fa = 3, Po = 4)

#ExterQual
# Missing or unlisted labels look up as NA.
test$ExterQual = unname(quality_codes[test$ExterQual])

#KitchenQual
test$KitchenQual = unname(quality_codes[test$KitchenQual])

# Convert column by column. apply() on a data frame first collapses it into a
# single character matrix (numeric columns are formatted as text and parsed
# back), which is avoided here. The remaining character columns still become
# NA with a coercion warning, so `test` keeps its all-numeric shape.
test[] = lapply(test, as.numeric)
# `1stFlrSF` is not a syntactic R name; give it one so it can go in a formula.
test = test %>% rename(fstFlrSF = `1stFlrSF`)

Combination of both Categorical and Numerical Models

# Combined model: the ten numeric predictors plus the two coded quality
# columns, fitted on the prepared training frame `test`.
modelF <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
    TotalBsmtSF + fstFlrSF + FullBath + TotRmsAbvGrd +
    YearBuilt + YearRemodAdd + ExterQual + KitchenQual,
  data = test
)
summary(modelF)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + 
##     GarageArea + TotalBsmtSF + fstFlrSF + FullBath + TotRmsAbvGrd + 
##     YearBuilt + YearRemodAdd + ExterQual + KitchenQual, data = test)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -483263  -18230   -1211   15661  280443 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -5.734e+05  1.435e+05  -3.996 6.76e-05 ***
## OverallQual   1.508e+04  1.273e+03  11.841  < 2e-16 ***
## GrLivArea     4.971e+01  4.128e+00  12.042  < 2e-16 ***
## GarageCars    1.011e+04  2.967e+03   3.409  0.00067 ***
## GarageArea    1.036e+01  1.006e+01   1.030  0.30334    
## TotalBsmtSF   1.735e+01  4.200e+00   4.131 3.81e-05 ***
## fstFlrSF      1.412e+01  4.807e+00   2.937  0.00336 ** 
## FullBath     -6.895e+03  2.617e+03  -2.635  0.00851 ** 
## TotRmsAbvGrd  2.797e+02  1.091e+03   0.256  0.79769    
## YearBuilt     2.109e+02  4.977e+01   4.237 2.41e-05 ***
## YearRemodAdd  8.225e+01  6.687e+01   1.230  0.21888    
## ExterQual    -1.307e+04  2.860e+03  -4.571 5.27e-06 ***
## KitchenQual  -1.316e+04  2.346e+03  -5.611 2.41e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36950 on 1447 degrees of freedom
## Multiple R-squared:  0.7854, Adjusted R-squared:  0.7836 
## F-statistic: 441.3 on 12 and 1447 DF,  p-value: < 2.2e-16
# Residuals of the combined model, plotted in row order.
plot(modelF$residuals)

# Apply the fitted coefficients to the test-set predictors term by term.
# NOTE(review): OverallQual ... YearRemodAdd are read from the global
# environment and are presumably one-column data frames built from the test
# set earlier in the file, like ExterQual and KitchenQual -- confirm.
betas = coef(modelF)
PredictionsF = betas[1] +
  betas[2] * OverallQual +
  betas[3] * GrLivArea +
  betas[4] * GarageCars +
  betas[5] * GarageArea +
  betas[6] * TotalBsmtSF +
  betas[7] * fstFlrSF +
  betas[8] * FullBath +
  betas[9] * TotRmsAbvGrd +
  betas[10] * YearBuilt +
  betas[11] * YearRemodAdd +
  betas[12] * ExterQual +
  betas[13] * KitchenQual
# The result keeps the name of the first predictor's column; rename it, and
# take the Ids from the test data instead of a hard-coded 1461:2919.
PredictionsF = PredictionsF %>%
  rename(SalePrice = OverallQual) %>%
  mutate(Id = as.integer(data_test$Id)) %>%
  select(Id, SalePrice)
# Same table with prices in thousands of dollars, for plotting.
PredictionsFG = PredictionsF %>% mutate(SalePrice = SalePrice/1000)

hist(PredictionsFG$SalePrice,main="Final SalePrice Predictions",xlab = "Price in Thousand of Dollars")

# NOTE(review): the recorded output below contains one NA and a negative
# minimum; both need handling before these predictions are used.
summary(PredictionsF$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   -1212  128892  166731  178720  221565  612448       1
head(PredictionsF)