Goal: Predict the housing prices in the test data set using the training data set.
Load libraries and data sets
library(tidyverse)
library(psych)
library(pastecs)
library(imputeMissings)
library(ggplot2)
library(reshape2)
# Read the test and training CSV files from the local Desktop folder
data_test <- read_csv("/Users/aarontomat/Desktop/test.csv")
data_train <- read_csv("/Users/aarontomat/Desktop/train.csv")
Looking at and sorting data
# First rows of the per-variable descriptive statistics for the training set
data_train %>%
  describe() %>%
  head()
# Summary of the response variable, SalePrice
summary(data_train[["SalePrice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
# Look at which variables are numeric and which are categorical
data_train %>%
  lapply(class) %>%
  head()
## $Id
## [1] "numeric"
##
## $MSSubClass
## [1] "numeric"
##
## $MSZoning
## [1] "character"
##
## $LotFrontage
## [1] "numeric"
##
## $LotArea
## [1] "numeric"
##
## $Street
## [1] "character"
# Create a separate numeric data set.
# Predicate functions must be wrapped in where(); passing a bare predicate
# such as is.numeric to select() is deprecated in tidyselect.
N_data_train <- select(data_train, where(is.numeric))
head(N_data_train)
# Turn every NA in the numeric data set into 0
N_data_train[is.na(N_data_train)] <- 0
# `1stFlrSF` is not a syntactic R name; rename it so it can be used directly
# in model formulas
N_data_train <- N_data_train %>% rename(fstFlrSF = `1stFlrSF`)
# Create a separate categorical data set; Id and SalePrice are kept alongside
# the character columns
C_data_train <- select(data_train, Id, where(is.character), SalePrice)
head(C_data_train)
# Correlation of every numeric variable with SalePrice
x <- N_data_train %>%
  cor() %>%
  data.frame() %>%
  select(SalePrice)
# Strong negative correlations (below -0.5)
xn <- filter(x, SalePrice < -.5)
xn
# Strong positive correlations (above 0.5)
xp <- filter(x, SalePrice > .5)
xp
# Variables that have a strong correlation with SalePrice
#OverallQual 0.7909816
#GrLivArea 0.7086245
#GarageCars 0.6404092
#GarageArea 0.6234314
#TotalBsmtSF 0.6135806
#fstFlrSF 0.6058522
#FullBath 0.5606638
#TotRmsAbvGrd 0.5337232
#YearBuilt 0.5228973
#YearRemodAdd 0.5071010
# Data set containing only the strongly correlated variables listed above
Cor_data_train <- select(
  N_data_train,
  OverallQual, GrLivArea, GarageCars, GarageArea, TotalBsmtSF,
  fstFlrSF, FullBath, TotRmsAbvGrd, YearBuilt, YearRemodAdd
)
Comparing SalePrice to highly correlated variables
# OverallQual: correlation with SalePrice, scatter plot, one-predictor model
with(N_data_train, cor(SalePrice, OverallQual))
## [1] 0.7909816
with(N_data_train, plot(
  OverallQual, SalePrice,
  main = "Comparing SalePrice and OverallQual",
  xlab = "OverallQual", ylab = "SalePrice"
))

model1 <- lm(SalePrice ~ OverallQual, data = N_data_train)
model1
##
## Call:
## lm(formula = SalePrice ~ OverallQual, data = N_data_train)
##
## Coefficients:
## (Intercept) OverallQual
## -96206 45436
summary(model1)
##
## Call:
## lm(formula = SalePrice ~ OverallQual, data = N_data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -198152 -29409 -1845 21463 396848
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -96206.1 5756.4 -16.71 <2e-16 ***
## OverallQual 45435.8 920.4 49.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48620 on 1458 degrees of freedom
## Multiple R-squared: 0.6257, Adjusted R-squared: 0.6254
## F-statistic: 2437 on 1 and 1458 DF, p-value: < 2.2e-16
# GrLivArea: correlation with SalePrice, scatter plot, then add it to the model
with(N_data_train, cor(SalePrice, GrLivArea))
## [1] 0.7086245
with(N_data_train, plot(
  GrLivArea, SalePrice,
  main = "Comparing SalePrice and GrLivArea",
  xlab = "GrLivArea", ylab = "SalePrice"
))

model2 <- lm(SalePrice ~ OverallQual + GrLivArea, data = N_data_train)
summary(model2)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea, data = N_data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -379572 -22266 -386 19895 289501
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -104092.67 5045.37 -20.63 <2e-16 ***
## OverallQual 32849.05 999.20 32.88 <2e-16 ***
## GrLivArea 55.86 2.63 21.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42500 on 1457 degrees of freedom
## Multiple R-squared: 0.7142, Adjusted R-squared: 0.7138
## F-statistic: 1820 on 2 and 1457 DF, p-value: < 2.2e-16
# GarageCars: correlation with SalePrice, scatter plot, then add it to the model
with(N_data_train, cor(SalePrice, GarageCars))
## [1] 0.6404092
with(N_data_train, plot(
  GarageCars, SalePrice,
  main = "Comparing SalePrice and GarageCars",
  xlab = "GarageCars", ylab = "SalePrice"
))

model3 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars,
  data = N_data_train
)
summary(model3)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars,
## data = N_data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -340718 -21675 -2085 19500 300177
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -98832.493 4842.897 -20.41 <2e-16 ***
## OverallQual 27104.826 1072.182 25.28 <2e-16 ***
## GrLivArea 50.674 2.552 19.86 <2e-16 ***
## GarageCars 21298.960 1807.065 11.79 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40620 on 1456 degrees of freedom
## Multiple R-squared: 0.7391, Adjusted R-squared: 0.7385
## F-statistic: 1375 on 3 and 1456 DF, p-value: < 2.2e-16
# GarageArea: correlation with SalePrice, scatter plot, then add it to the model
with(N_data_train, cor(SalePrice, GarageArea))
## [1] 0.6234314
with(N_data_train, plot(
  GarageArea, SalePrice,
  main = "Comparing SalePrice and GarageArea",
  xlab = "GarageArea", ylab = "SalePrice"
))

model4 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea,
  data = N_data_train
)
summary(model4)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars +
## GarageArea, data = N_data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -372594 -21236 -1594 18625 301129
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -98436.050 4820.467 -20.420 < 2e-16 ***
## OverallQual 26988.854 1067.393 25.285 < 2e-16 ***
## GrLivArea 49.573 2.555 19.402 < 2e-16 ***
## GarageCars 11317.522 3126.297 3.620 0.000305 ***
## GarageArea 41.478 10.627 3.903 9.93e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40420 on 1455 degrees of freedom
## Multiple R-squared: 0.7418, Adjusted R-squared: 0.7411
## F-statistic: 1045 on 4 and 1455 DF, p-value: < 2.2e-16
# TotalBsmtSF: correlation with SalePrice, scatter plot, then add it to the model
with(N_data_train, cor(SalePrice, TotalBsmtSF))
## [1] 0.6135806
with(N_data_train, plot(
  TotalBsmtSF, SalePrice,
  main = "Comparing SalePrice and TotalBsmtSF",
  xlab = "TotalBsmtSF", ylab = "SalePrice"
))

model5 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
    TotalBsmtSF,
  data = N_data_train
)
summary(model5)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars +
## GarageArea + TotalBsmtSF, data = N_data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -478977 -19915 -1503 16701 287132
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -99072.050 4638.450 -21.359 < 2e-16 ***
## OverallQual 23635.007 1072.532 22.037 < 2e-16 ***
## GrLivArea 45.346 2.489 18.218 < 2e-16 ***
## GarageCars 14544.315 3022.681 4.812 1.65e-06 ***
## GarageArea 17.133 10.468 1.637 0.102
## TotalBsmtSF 31.501 2.904 10.848 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38900 on 1454 degrees of freedom
## Multiple R-squared: 0.7611, Adjusted R-squared: 0.7603
## F-statistic: 926.5 on 5 and 1454 DF, p-value: < 2.2e-16
# fstFlrSF: correlation with SalePrice, scatter plot, then add it to the model
with(N_data_train, cor(SalePrice, fstFlrSF))
## [1] 0.6058522
with(N_data_train, plot(
  fstFlrSF, SalePrice,
  main = "Comparing SalePrice and fstFlrSF",
  xlab = "fstFlrSF", ylab = "SalePrice"
))

model6 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
    TotalBsmtSF + fstFlrSF,
  data = N_data_train
)
summary(model6)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars +
## GarageArea + TotalBsmtSF + fstFlrSF, data = N_data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -473373 -19732 -1080 16922 288035
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.027e+05 4.904e+03 -20.932 < 2e-16 ***
## OverallQual 2.400e+04 1.083e+03 22.150 < 2e-16 ***
## GrLivArea 4.312e+01 2.679e+00 16.095 < 2e-16 ***
## GarageCars 1.452e+04 3.019e+03 4.809 1.68e-06 ***
## GarageArea 1.566e+01 1.047e+01 1.495 0.1350
## TotalBsmtSF 2.439e+01 4.318e+00 5.649 1.94e-08 ***
## fstFlrSF 1.119e+01 5.032e+00 2.223 0.0264 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38840 on 1453 degrees of freedom
## Multiple R-squared: 0.7619, Adjusted R-squared: 0.7609
## F-statistic: 775 on 6 and 1453 DF, p-value: < 2.2e-16
# FullBath: correlation with SalePrice, scatter plot, then add it to the model
with(N_data_train, cor(SalePrice, FullBath))
## [1] 0.5606638
with(N_data_train, plot(
  FullBath, SalePrice,
  main = "Comparing SalePrice and FullBath",
  xlab = "FullBath", ylab = "SalePrice"
))

model7 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
    TotalBsmtSF + fstFlrSF + FullBath,
  data = N_data_train
)
summary(model7)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars +
## GarageArea + TotalBsmtSF + fstFlrSF + FullBath, data = N_data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -471605 -19887 -1264 16874 288458
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.029e+05 4.923e+03 -20.901 < 2e-16 ***
## OverallQual 2.386e+04 1.108e+03 21.546 < 2e-16 ***
## GrLivArea 4.243e+01 2.940e+00 14.432 < 2e-16 ***
## GarageCars 1.421e+04 3.067e+03 4.632 3.94e-06 ***
## GarageArea 1.630e+01 1.054e+01 1.547 0.1220
## TotalBsmtSF 2.453e+01 4.325e+00 5.671 1.71e-08 ***
## fstFlrSF 1.112e+01 5.035e+00 2.208 0.0274 *
## FullBath 1.457e+03 2.529e+03 0.576 0.5646
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38850 on 1452 degrees of freedom
## Multiple R-squared: 0.762, Adjusted R-squared: 0.7608
## F-statistic: 664 on 7 and 1452 DF, p-value: < 2.2e-16
# TotRmsAbvGrd: correlation with SalePrice, scatter plot, then add it to the model
with(N_data_train, cor(SalePrice, TotRmsAbvGrd))
## [1] 0.5337232
with(N_data_train, plot(
  TotRmsAbvGrd, SalePrice,
  main = "Comparing SalePrice and TotRmsAbvGrd",
  xlab = "TotRmsAbvGrd", ylab = "SalePrice"
))

model8 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
    TotalBsmtSF + fstFlrSF + FullBath + TotRmsAbvGrd,
  data = N_data_train
)
summary(model8)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars +
## GarageArea + TotalBsmtSF + fstFlrSF + FullBath + TotRmsAbvGrd,
## data = N_data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -474264 -19688 -1330 17195 285481
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.003e+05 6.007e+03 -16.704 < 2e-16 ***
## OverallQual 2.378e+04 1.114e+03 21.345 < 2e-16 ***
## GrLivArea 4.468e+01 4.233e+00 10.555 < 2e-16 ***
## GarageCars 1.436e+04 3.074e+03 4.672 3.27e-06 ***
## GarageArea 1.574e+01 1.056e+01 1.490 0.1364
## TotalBsmtSF 2.425e+01 4.342e+00 5.584 2.80e-08 ***
## fstFlrSF 1.116e+01 5.036e+00 2.216 0.0269 *
## FullBath 1.655e+03 2.543e+03 0.651 0.5152
## TotRmsAbvGrd -8.461e+02 1.141e+03 -0.741 0.4586
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38860 on 1451 degrees of freedom
## Multiple R-squared: 0.7621, Adjusted R-squared: 0.7608
## F-statistic: 580.9 on 8 and 1451 DF, p-value: < 2.2e-16
# YearBuilt: correlation with SalePrice, scatter plot, then add it to the model
with(N_data_train, cor(SalePrice, YearBuilt))
## [1] 0.5228973
with(N_data_train, plot(
  YearBuilt, SalePrice,
  main = "Comparing SalePrice and YearBuilt",
  xlab = "YearBuilt", ylab = "SalePrice"
))

model9 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
    TotalBsmtSF + fstFlrSF + FullBath + TotRmsAbvGrd + YearBuilt,
  data = N_data_train
)
summary(model9)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars +
## GarageArea + TotalBsmtSF + fstFlrSF + FullBath + TotRmsAbvGrd +
## YearBuilt, data = N_data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -487024 -19856 -2161 16304 285943
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.585e+05 9.145e+04 -8.294 2.46e-16 ***
## OverallQual 2.092e+04 1.164e+03 17.971 < 2e-16 ***
## GrLivArea 5.137e+01 4.263e+00 12.051 < 2e-16 ***
## GarageCars 1.063e+04 3.066e+03 3.469 0.000538 ***
## GarageArea 1.451e+01 1.038e+01 1.397 0.162572
## TotalBsmtSF 1.925e+01 4.324e+00 4.451 9.19e-06 ***
## fstFlrSF 1.395e+01 4.965e+00 2.810 0.005028 **
## FullBath -5.410e+03 2.685e+03 -2.015 0.044081 *
## TotRmsAbvGrd -8.664e+01 1.127e+03 -0.077 0.938718
## YearBuilt 3.454e+02 4.789e+01 7.212 8.83e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38190 on 1450 degrees of freedom
## Multiple R-squared: 0.7703, Adjusted R-squared: 0.7689
## F-statistic: 540.3 on 9 and 1450 DF, p-value: < 2.2e-16
# YearRemodAdd: correlation with SalePrice, scatter plot, then add it to the model
with(N_data_train, cor(SalePrice, YearRemodAdd))
## [1] 0.507101
with(N_data_train, plot(
  YearRemodAdd, SalePrice,
  main = "Comparing SalePrice and YearRemodAdd",
  xlab = "YearRemodAdd", ylab = "SalePrice"
))

model10 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
    TotalBsmtSF + fstFlrSF + FullBath + TotRmsAbvGrd + YearBuilt +
    YearRemodAdd,
  data = N_data_train
)
summary(model10)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars +
## GarageArea + TotalBsmtSF + fstFlrSF + FullBath + TotRmsAbvGrd +
## YearBuilt + YearRemodAdd, data = N_data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -489958 -19316 -1948 16020 290558
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.186e+06 1.291e+05 -9.187 < 2e-16 ***
## OverallQual 1.960e+04 1.190e+03 16.472 < 2e-16 ***
## GrLivArea 5.130e+01 4.233e+00 12.119 < 2e-16 ***
## GarageCars 1.042e+04 3.044e+03 3.422 0.000639 ***
## GarageArea 1.495e+01 1.031e+01 1.450 0.147384
## TotalBsmtSF 1.986e+01 4.295e+00 4.625 4.09e-06 ***
## fstFlrSF 1.417e+01 4.930e+00 2.875 0.004097 **
## FullBath -6.791e+03 2.682e+03 -2.532 0.011457 *
## TotRmsAbvGrd 3.310e+01 1.119e+03 0.030 0.976404
## YearBuilt 2.682e+02 5.035e+01 5.328 1.15e-07 ***
## YearRemodAdd 2.965e+02 6.363e+01 4.659 3.47e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37920 on 1449 degrees of freedom
## Multiple R-squared: 0.7737, Adjusted R-squared: 0.7721
## F-statistic: 495.4 on 10 and 1449 DF, p-value: < 2.2e-16
Prepare test data set
# Keep only the numeric columns of the test set.
# Predicate functions must be wrapped in where(); passing a bare predicate
# such as is.numeric to select() is deprecated in tidyselect.
N_data_test <- select(data_test, where(is.numeric))
head(N_data_test)
# Turn every NA in the numeric data set into 0
N_data_test[is.na(N_data_test)] <- 0
# Same rename as for the training set so the model variable names match
N_data_test <- N_data_test %>% rename(fstFlrSF = `1stFlrSF`)
# Only the predictors that appear in the regression models
Cor_data_test <- N_data_test %>%
  select(
    OverallQual, GrLivArea, GarageCars, GarageArea, TotalBsmtSF,
    fstFlrSF, FullBath, TotRmsAbvGrd, YearBuilt, YearRemodAdd
  )
# One single-column data frame per predictor, named after the predictor
OverallQual <- Cor_data_test %>% select(OverallQual)
GrLivArea <- Cor_data_test %>% select(GrLivArea)
GarageCars <- Cor_data_test %>% select(GarageCars)
GarageArea <- Cor_data_test %>% select(GarageArea)
TotalBsmtSF <- Cor_data_test %>% select(TotalBsmtSF)
fstFlrSF <- Cor_data_test %>% select(fstFlrSF)
FullBath <- Cor_data_test %>% select(FullBath)
TotRmsAbvGrd <- Cor_data_test %>% select(TotRmsAbvGrd)
YearBuilt <- Cor_data_test %>% select(YearBuilt)
YearRemodAdd <- Cor_data_test %>% select(YearRemodAdd)
Plug in values from test data set into regression models to determine the predicted housing prices
# Predicted prices for the test set from a fitted model.
#
# Uses predict() so each coefficient is matched to its variable by name,
# instead of re-typing the regression equation with positional coef() indexes.
#
# model: a fitted lm object whose predictors are columns of Cor_data_test.
# Returns a one-column data frame with one row per test-set row. The column is
# named OverallQual because that is the name the hand-written data-frame
# arithmetic used to produce, and the step that builds the prediction table
# renames OverallQual to SalePrice.
predict_test_prices <- function(model) {
  data.frame(OverallQual = unname(predict(model, newdata = Cor_data_test)))
}
#model1
head(predict_test_prices(model1))
#model2
head(predict_test_prices(model2))
#model3
head(predict_test_prices(model3))
#model4
head(predict_test_prices(model4))
#model5
head(predict_test_prices(model5))
#model6
head(predict_test_prices(model6))
#model7
head(predict_test_prices(model7))
#model8
head(predict_test_prices(model8))
#model9
head(predict_test_prices(model9))
#model10
head(predict_test_prices(model10))
# The full ten-predictor model supplies the final predictions
Predictions <- predict_test_prices(model10)
# Training residuals of the final model, plotted in row order
plot(model10$residuals)

Prediction table for SalePrice in the test data set, using only numeric variables
# Build the Id / SalePrice prediction table.
# The Ids are taken from the test set itself rather than the hard-coded range
# 1461:2919, so the table stays correct if the test file changes.
# NOTE(review): assumes data_test has an Id column like data_train - confirm.
Predictions <- Predictions %>%
  rename(SalePrice = OverallQual) %>%
  mutate(Id = data_test$Id) %>%
  select(Id, SalePrice)
# Same table with prices in thousands of dollars, used only for the histogram
PredictionsG <- Predictions %>% mutate(SalePrice = SalePrice / 1000)
hist(
  PredictionsG$SalePrice,
  main = "Numeric SalePrice Predictions",
  xlab = "SalePrice in Thousands of Dollars"
)

plot(model10$residuals)

summary(Predictions$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -9996 126204 169288 178277 222010 614315
Changing all the categorical variables to numeric ones
# Work on a copy of the categorical training data. Every known label is
# replaced by a digit string so the table can later be converted to numeric.
test <- C_data_train

# Replace each value found in `labels` by its position in `labels`, counted
# from `first_code`, as a string. Values not listed in `labels` (including NA)
# are left unchanged.
recode_labels <- function(values, labels, first_code) {
  position <- match(values, labels)
  known <- !is.na(position)
  values[known] <- as.character(position[known] - 1L + first_code)
  values
}

# Apply recode_labels() to every column named in `spec` (a named list of label
# vectors). When `na_to_zero` is TRUE, missing values are set to 0 first.
recode_columns <- function(data, spec, first_code, na_to_zero) {
  for (column in names(spec)) {
    values <- data[[column]]
    if (na_to_zero) {
      values[is.na(values)] <- 0
    }
    data[[column]] <- recode_labels(values, spec[[column]], first_code)
  }
  data
}

# Label sets shared by several columns
quality_labels <- c("Ex", "Gd", "TA", "Fa", "Po")
condition_labels <- c(
  "Artery", "Feedr", "Norm", "RRNn", "RRAn", "PosN", "PosA", "RRNe", "RRAe"
)
exterior_labels <- c(
  "AsbShng", "AsphShn", "BrkComm", "BrkFace", "CBlock", "CemntBd", "HdBoard",
  "ImStucc", "MetalSd", "Other", "Plywood", "PreCast", "Stone", "Stucco",
  "VinylSd", "Wd Sdng"
)
bsmt_finish_labels <- c("GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf")

# Columns whose labels are coded 0, 1, 2, ... in the order listed; missing
# values are left as NA.
zero_based <- list(
  # NOTE(review): "c" is lower-case while every other label is upper-case -
  # confirm it matches the label actually used in the data.
  MSZoning = c("A", "c", "FV", "I", "RH", "RL", "RP", "RM"),
  Street = c("Grvl", "Pave"),
  LotShape = c("Reg", "IR1", "IR2", "IR3"),
  LandContour = c("Lvl", "Bnk", "HLS", "Low"),
  Utilities = c("AllPub", "NoSewr", "NoSeWa", "ELO"),
  LotConfig = c("Inside", "Corner", "CulDSac", "FR2", "FR3"),
  LandSlope = c("Gtl", "Mod", "Sev"),
  # NOTE(review): both "Names" (12) and "NAmes" (16) are listed, as in the
  # original mapping - confirm which spelling the data uses.
  Neighborhood = c(
    "Blmngtn", "Blueste", "BrDale", "BrkSide", "ClearCr", "CollgCr",
    "Crawfor", "Edwards", "Gilbert", "IDOTRR", "MeadowV", "Mitchel", "Names",
    "NoRidge", "NPkVill", "NridgHt", "NAmes", "OldTown", "SWISU", "Sawyer",
    "SawyerW", "Somerst", "StoneBr", "Timber", "Veenker"
  ),
  Condition1 = condition_labels,
  Condition2 = condition_labels,
  BldgType = c("1Fam", "2fmCon", "Duplex", "TwnhsE", "TwnhsI"),
  HouseStyle = c(
    "1Story", "1.5Fin", "1.5Unf", "2Story", "2.5Fin", "2.5Unf", "SFoyer",
    "SLvl"
  ),
  RoofStyle = c("Flat", "Gable", "Gambrel", "Hip", "Mansard", "Shed"),
  RoofMatl = c(
    "ClyTile", "CompShg", "Membran", "Metal", "Roll", "Tar&Grv", "WdShake",
    "WdShngl"
  ),
  # The two exterior columns differ only in the spelling of the last label
  Exterior1st = c(exterior_labels, "WdShing"),
  Exterior2nd = c(exterior_labels, "Wd Shng"),
  MasVnrType = c("BrkCmn", "BrkFace", "CBlock", "None", "Stone"),
  ExterQual = quality_labels,
  ExterCond = quality_labels,
  Foundation = c("BrkTil", "CBlock", "PConc", "Slab", "Stone", "Wood"),
  Heating = c("Floor", "GasA", "GasW", "Grav", "OthW", "Wall"),
  HeatingQC = quality_labels,
  CentralAir = c("N", "Y"),
  Electrical = c("SBrkr", "FuseA", "FuseF", "FuseP", "Mix"),
  KitchenQual = quality_labels,
  Functional = c(
    "Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal"
  ),
  PavedDrive = c("Y", "P", "N"),
  SaleType = c(
    "WD", "CWD", "VWD", "New", "COD", "Con", "ConLw", "ConLI", "ConLD", "Oth"
  ),
  SaleCondition = c(
    "Normal", "Abnorml", "AdjLand", "Alloca", "Family", "Partial"
  )
)

# Columns where a missing value is coded 0 and the labels are coded 1, 2, 3,
# ... in the order listed.
one_based <- list(
  Alley = c("Grvl", "Pave"),
  BsmtQual = quality_labels,
  BsmtCond = quality_labels,
  BsmtExposure = c("Gd", "Av", "Mn", "No"),
  BsmtFinType1 = bsmt_finish_labels,
  BsmtFinType2 = bsmt_finish_labels,
  FireplaceQu = quality_labels,
  GarageType = c(
    "2Types", "Attchd", "Basment", "BuiltIn", "CarPort", "Detchd"
  ),
  GarageFinish = c("Fin", "RFn", "Unf"),
  GarageQual = quality_labels,
  GarageCond = quality_labels,
  # PoolQC has no "Po" entry, so it does not use quality_labels
  PoolQC = c("Ex", "Gd", "TA", "Fa"),
  Fence = c("GdPrv", "MnPrv", "GdWo", "MnWw"),
  MiscFeature = c("Elev", "Gar2", "Othr", "Shed", "TenC")
)

test <- recode_columns(test, zero_based, first_code = 0L, na_to_zero = FALSE)
test <- recode_columns(test, one_based, first_code = 1L, na_to_zero = TRUE)
Turning the categorical data set numeric and looking for variables whose correlation with SalePrice is strong enough to be used in the regression model
# Turn every column numeric, one column at a time. Converting column-wise
# avoids apply(), which first coerces the whole data frame to a character
# matrix (numeric columns would be passed through format() on the way).
# Entries that are not number-like strings become NA, with a coercion warning.
test <- as.data.frame(test)
test[] <- lapply(test, as.numeric)
C_data_train2 <- test
# Correlation of every recoded variable with SalePrice
y <- data.frame(cor(C_data_train2)) %>% select(SalePrice)
# Variables with a strong negative correlation
yc <- y %>% filter(SalePrice < -.5)
yc
# Variables with a strong positive correlation
yp <- y %>% filter(SalePrice > .5)
yp
# Variables with strong enough correlation to SalePrice
#ExterQual -0.6826392
#KitchenQual -0.6595997
# Comparing SalePrice and ExterQual
cor(C_data_train2$SalePrice, C_data_train2$ExterQual)
## [1] -0.6826392
# Scatter plot of price against the recoded exterior quality.
# mtext() annotates an existing plot, so it is called after plot() instead of
# being passed to plot() as an extra positional argument.
plot(
  C_data_train2$ExterQual, C_data_train2$SalePrice,
  main = "Comparing SalePrice and ExterQual",
  xlab = "ExterQual", ylab = "SalePrice"
)
mtext("EX=0,GD=1,TA=2,FA=3", side = 1)

# Simple linear regression of SalePrice on the recoded ExterQual
model1.1 <- lm(SalePrice ~ ExterQual, data = C_data_train2)
model1.1
##
## Call:
## lm(formula = SalePrice ~ ExterQual, data = C_data_train2)
##
## Coefficients:
## (Intercept) ExterQual
## 332401 -94432
summary(model1.1)
##
## Call:
## lm(formula = SalePrice ~ ExterQual, data = C_data_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185969 -33636 -6536 26213 507031
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 332401 4510 73.69 <2e-16 ***
## ExterQual -94432 2647 -35.67 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 58070 on 1458 degrees of freedom
## Multiple R-squared: 0.466, Adjusted R-squared: 0.4656
## F-statistic: 1272 on 1 and 1458 DF, p-value: < 2.2e-16
# Comparing SalePrice and KitchenQual
cor(C_data_train2$SalePrice, C_data_train2$KitchenQual)
## [1] -0.6595997
# Scatter plot of price against the recoded kitchen quality.
# mtext() annotates an existing plot, so it is called after plot() instead of
# being passed to plot() as an extra positional argument.
plot(
  C_data_train2$KitchenQual, C_data_train2$SalePrice,
  main = "Comparing SalePrice and KitchenQual",
  xlab = "KitchenQual", ylab = "SalePrice"
)
mtext("EX=0,GD=1,TA=2,FA=3", side = 1)

# Linear regression of SalePrice on both recoded quality ratings
model1.2 <- lm(SalePrice ~ ExterQual + KitchenQual, data = C_data_train2)
model1.2
##
## Call:
## lm(formula = SalePrice ~ ExterQual + KitchenQual, data = C_data_train2)
##
## Coefficients:
## (Intercept) ExterQual KitchenQual
## 339139 -59712 -41948
summary(model1.2)
##
## Call:
## lm(formula = SalePrice ~ ExterQual + KitchenQual, data = C_data_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -179139 -32589 -4345 24181 465572
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 339139 4281 79.23 <2e-16 ***
## ExterQual -59712 3575 -16.70 <2e-16 ***
## KitchenQual -41948 3093 -13.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54740 on 1457 degrees of freedom
## Multiple R-squared: 0.5258, Adjusted R-squared: 0.5252
## F-statistic: 807.9 on 2 and 1457 DF, p-value: < 2.2e-16
# Residuals of the two-predictor model against observation index
plot(
  x = seq_along(model1.2$residuals),
  y = model1.2$residuals,
  xlab = "Index",
  ylab = "model1.2$residuals"
)

Prepare Dataset
# Keep Id plus the character columns of the test set. Predicate functions
# must be wrapped in where() inside select(); the bare form is deprecated.
C_data_test <- select(data_test, Id, where(is.character))
test <- C_data_test
# ExterQual and KitchenQual share one coding: Ex=0, Gd=1, TA=2, Fa=3, Po=4.
# Missing ratings are not filled in here, so they remain NA.
quality_codes <- c(Ex = "0", Gd = "1", TA = "2", Fa = "3", Po = "4")
for (quality_col in c("ExterQual", "KitchenQual")) {
  for (label in names(quality_codes)) {
    test[[quality_col]][test[[quality_col]] == label] <- quality_codes[[label]]
  }
}
# Convert column-wise instead of apply(), which would coerce the whole data
# frame to a character matrix first. Character columns that were not recoded
# become NA, with a coercion warning.
test <- as.data.frame(test)
test[] <- lapply(test, as.numeric)
Regression with Categorical Variables
# One-column data frames with the recoded predictors of the test set
ExterQual <- test["ExterQual"]
KitchenQual <- test["KitchenQual"]
#model
# Apply model1.2 by hand: intercept + slope * predictor for each predictor.
# The result is a one-column data frame that keeps the name ExterQual.
fit_coefs <- coef(model1.2)
Predictions2 <- fit_coefs[1] + fit_coefs[2] * ExterQual +
  fit_coefs[3] * KitchenQual
head(Predictions2)
Prediction data table for SalePrice of test data set
# Build the Id / SalePrice table. Ids are taken from the test data itself
# instead of a hard-coded 1461:2919 range, so the table stays aligned with
# whatever test file was loaded.
Predictions2 <- Predictions2 %>%
  rename(SalePrice = ExterQual) %>%
  mutate(Id = data_test$Id) %>%
  select(Id, SalePrice)
# SalePrice is in dollars; divide by 1000 so the plotted values match the
# axis label.
hist(
  Predictions2$SalePrice / 1000,
  main = "Categorical SalePrice Predictions",
  xlab = "Price in Thousand of Dollars"
)

summary(Predictions2$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 34160 135819 135819 180979 237479 339139 1
Combination of both Categorical and Numerical Models
# Start from the full training data so that numeric and categorical
# predictors sit in one data frame.
test <- data_train
# ExterQual and KitchenQual share one coding: Ex=0, Gd=1, TA=2, Fa=3, Po=4
quality_codes <- c(Ex = "0", Gd = "1", TA = "2", Fa = "3", Po = "4")
for (quality_col in c("ExterQual", "KitchenQual")) {
  for (label in names(quality_codes)) {
    test[[quality_col]][test[[quality_col]] == label] <- quality_codes[[label]]
  }
}
# Convert column-wise instead of apply(): apply() coerces the data frame to a
# character matrix, which sends the numeric columns through format() before
# they are parsed back. Character columns that were not recoded become NA,
# with a coercion warning.
test <- as.data.frame(test)
test[] <- lapply(test, as.numeric)
# Give 1stFlrSF a syntactic name so it can be used in a model formula
test <- test %>% rename(fstFlrSF = `1stFlrSF`)
Combination of both Categorical and Numerical Models
# Combined model: the numeric predictors together with the two recoded
# quality ratings (ExterQual, KitchenQual)
modelF <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
    TotalBsmtSF + fstFlrSF + FullBath + TotRmsAbvGrd + YearBuilt +
    YearRemodAdd + ExterQual + KitchenQual,
  data = test
)
summary(modelF)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars +
## GarageArea + TotalBsmtSF + fstFlrSF + FullBath + TotRmsAbvGrd +
## YearBuilt + YearRemodAdd + ExterQual + KitchenQual, data = test)
##
## Residuals:
## Min 1Q Median 3Q Max
## -483263 -18230 -1211 15661 280443
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.734e+05 1.435e+05 -3.996 6.76e-05 ***
## OverallQual 1.508e+04 1.273e+03 11.841 < 2e-16 ***
## GrLivArea 4.971e+01 4.128e+00 12.042 < 2e-16 ***
## GarageCars 1.011e+04 2.967e+03 3.409 0.00067 ***
## GarageArea 1.036e+01 1.006e+01 1.030 0.30334
## TotalBsmtSF 1.735e+01 4.200e+00 4.131 3.81e-05 ***
## fstFlrSF 1.412e+01 4.807e+00 2.937 0.00336 **
## FullBath -6.895e+03 2.617e+03 -2.635 0.00851 **
## TotRmsAbvGrd 2.797e+02 1.091e+03 0.256 0.79769
## YearBuilt 2.109e+02 4.977e+01 4.237 2.41e-05 ***
## YearRemodAdd 8.225e+01 6.687e+01 1.230 0.21888
## ExterQual -1.307e+04 2.860e+03 -4.571 5.27e-06 ***
## KitchenQual -1.316e+04 2.346e+03 -5.611 2.41e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36950 on 1447 degrees of freedom
## Multiple R-squared: 0.7854, Adjusted R-squared: 0.7836
## F-statistic: 441.3 on 12 and 1447 DF, p-value: < 2.2e-16
# Residuals of the combined model against observation index
plot(
  x = seq_along(modelF$residuals),
  y = modelF$residuals,
  xlab = "Index",
  ylab = "modelF$residuals"
)

# Apply the combined model by hand: intercept + sum(coefficient * predictor).
# Coefficients are looked up by term name rather than by position, so a
# change in the formula's term order cannot pair a slope with the wrong
# predictor.
# NOTE(review): OverallQual ... YearRemodAdd are defined outside this section;
# presumably one-column data frames built from the test set - confirm.
coef_f <- coef(modelF)
PredictionsF <- coef_f[["(Intercept)"]] +
  coef_f[["OverallQual"]] * OverallQual +
  coef_f[["GrLivArea"]] * GrLivArea +
  coef_f[["GarageCars"]] * GarageCars +
  coef_f[["GarageArea"]] * GarageArea +
  coef_f[["TotalBsmtSF"]] * TotalBsmtSF +
  coef_f[["fstFlrSF"]] * fstFlrSF +
  coef_f[["FullBath"]] * FullBath +
  coef_f[["TotRmsAbvGrd"]] * TotRmsAbvGrd +
  coef_f[["YearBuilt"]] * YearBuilt +
  coef_f[["YearRemodAdd"]] * YearRemodAdd +
  coef_f[["ExterQual"]] * ExterQual +
  coef_f[["KitchenQual"]] * KitchenQual
# Build the Id / SalePrice table. Ids come from the test data itself instead
# of a hard-coded 1461:2919 range.
PredictionsF <- PredictionsF %>%
  rename(SalePrice = OverallQual) %>%
  mutate(Id = data_test$Id) %>%
  select(Id, SalePrice)
# Copy with prices in thousands of dollars, used only for the histogram
PredictionsFG <- PredictionsF %>% mutate(SalePrice = SalePrice / 1000)
hist(
  PredictionsFG$SalePrice,
  main = "Final SalePrice Predictions",
  xlab = "Price in Thousand of Dollars"
)

# The linear model is unconstrained, so it can return negative prices (see
# the minimum in the summary below).
summary(PredictionsF$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -1212 128892 166731 178720 221565 612448 1
head(PredictionsF)