library('lsr')
library('ggplot2')
## Warning: package 'ggplot2' was built under R version 4.0.4
library('ggcorrplot')
## Warning: package 'ggcorrplot' was built under R version 4.0.4
library('randomForest')
## Warning: package 'randomForest' was built under R version 4.0.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library('readxl')
library('tidyverse')
## Warning: package 'tidyverse' was built under R version 4.0.4
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.6     v dplyr   1.0.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## v purrr   0.3.4
## Warning: package 'readr' was built under R version 4.0.4
## Warning: package 'forcats' was built under R version 4.0.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::combine()       masks randomForest::combine()
## x dplyr::filter()        masks stats::filter()
## x dplyr::lag()           masks stats::lag()
## x randomForest::margin() masks ggplot2::margin()
library('tidyr')
library('tibble')
# Load the training and test sets from the Excel workbooks in the working
# directory.
train <- read_excel("train.xlsx")
test <- read_excel("test.xlsx")

# Tabulate the primary class of every column. vapply() pins the result to
# exactly one string per column, so a column that carries several classes
# cannot silently turn the result into a list the way sapply() would.
table(vapply(train, function(column) class(column)[1L], character(1)))
## 
## character   numeric 
##        43        38
table(vapply(test, function(column) class(column)[1L], character(1)))
## 
## character   numeric 
##        43        37

To see how many columns are numeric and how many are not, tabulate the column classes with `table(sapply(train, class))`.

Create a numeric data set.

# Flag the numeric columns of each data set and keep only those columns.
nums <- vapply(train, is.numeric, logical(1))
numeric.train <- train[, nums]

nums <- vapply(test, is.numeric, logical(1))
numeric.test <- test[, nums]

# Flag the character columns of the training set and print them.
factors <- vapply(train, is.character, logical(1))
train[, factors]
## # A tibble: 1,460 x 43
##    MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope
##    <chr>    <chr>  <chr> <chr>    <chr>       <chr>     <chr>     <chr>    
##  1 RL       Pave   0     Reg      Lvl         AllPub    Inside    Gtl      
##  2 RL       Pave   0     Reg      Lvl         AllPub    FR2       Gtl      
##  3 RL       Pave   0     IR1      Lvl         AllPub    Inside    Gtl      
##  4 RL       Pave   0     IR1      Lvl         AllPub    Corner    Gtl      
##  5 RL       Pave   0     IR1      Lvl         AllPub    FR2       Gtl      
##  6 RL       Pave   0     IR1      Lvl         AllPub    Inside    Gtl      
##  7 RL       Pave   0     Reg      Lvl         AllPub    Inside    Gtl      
##  8 RL       Pave   0     IR1      Lvl         AllPub    Corner    Gtl      
##  9 RM       Pave   0     Reg      Lvl         AllPub    Inside    Gtl      
## 10 RL       Pave   0     Reg      Lvl         AllPub    Corner    Gtl      
## # ... with 1,450 more rows, and 35 more variables: Neighborhood <chr>,
## #   Condition1 <chr>, Condition2 <chr>, BldgType <chr>, HouseStyle <chr>,
## #   RoofStyle <chr>, RoofMatl <chr>, Exterior1st <chr>, Exterior2nd <chr>,
## #   MasVnrType <chr>, ExterQual <chr>, ExterCond <chr>, Foundation <chr>,
## #   BsmtQual <chr>, BsmtCond <chr>, BsmtExposure <chr>, BsmtFinType1 <chr>,
## #   BsmtFinType2 <chr>, Heating <chr>, HeatingQC <chr>, CentralAir <chr>,
## #   Electrical <chr>, KitchenQual <chr>, Functionl <chr>, FireplaceQu <chr>,
## #   GarageType <chr>, GarageFinish <chr>, GarageQual <chr>, GarageCond <chr>,
## #   PavedDrive <chr>, PoolQC <chr>, Fence <chr>, MiscFeature <chr>,
## #   SaleType <chr>, SaleCondition <chr>
# Keep the character columns (selected by the logical mask `factors`).
factor.train <- train[factors]

#Data Cleaning

I replaced all NA values with 0. I did this while importing the data set. While I could have replaced NA with “None” for the categorical data points, I decided to use 0 there as well and just explain that it is not actually numeric; rather, it means none. I also transformed Sale Price into log Sale Price using a log transformation.

# Density and normal Q-Q plot of the raw sale price.
plot(density(train$SalePrice))

qqnorm(train$SalePrice, frame.plot = FALSE, pch = 1)

# The same two diagnostics after taking the natural log of the sale price.
logSalePrice <- log(train$SalePrice)
plot(density(logSalePrice))

qqnorm(logSalePrice, frame.plot = FALSE, pch = 1)

# Engineered columns, added to both data sets:
#   TotalArea  = GrLivArea + TotalBsmtSF
#   TotalBaths = full baths + half of each half bath (basement and main)
# The training set additionally gets the log of SalePrice as logSalePrice.
train <- train %>%
  mutate(
    TotalArea = GrLivArea + TotalBsmtSF,
    TotalBaths = BsmtFullBath + BsmtHalfBath / 2 + FullBath + HalfBath / 2,
    logSalePrice = log(SalePrice)
  )

test <- test %>%
  mutate(
    TotalArea = GrLivArea + TotalBsmtSF,
    TotalBaths = BsmtFullBath + BsmtHalfBath / 2 + FullBath + HalfBath / 2
  )

# Re-flag the numeric columns now that the new ones exist, then print them.
nums <- vapply(train, is.numeric, logical(1))
train[, nums]
## # A tibble: 1,460 x 41
##       Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt
##    <dbl>      <dbl>       <dbl>   <dbl>       <dbl>       <dbl>     <dbl>
##  1     1         60          65    8450           7           5      2003
##  2     2         20          80    9600           6           8      1976
##  3     3         60          68   11250           7           5      2001
##  4     4         70          60    9550           7           5      1915
##  5     5         60          84   14260           8           5      2000
##  6     6         50          85   14115           5           5      1993
##  7     7         20          75   10084           8           5      2004
##  8     8         60           0   10382           7           6      1973
##  9     9         50          51    6120           7           5      1931
## 10    10        190          50    7420           5           6      1939
## # ... with 1,450 more rows, and 34 more variables: YearRemodAdd <dbl>,
## #   MasVnrArea <dbl>, BsmtFinSF1 <dbl>, BsmtFinSF2 <dbl>, BsmtUnfSF <dbl>,
## #   TotalBsmtSF <dbl>, 1stFlrSF <dbl>, 2ndFlrSF <dbl>, LowQualFinSF <dbl>,
## #   GrLivArea <dbl>, BsmtFullBath <dbl>, BsmtHalfBath <dbl>, FullBath <dbl>,
## #   HalfBath <dbl>, BedroomAbvGr <dbl>, Kitchen AbvGr <dbl>,
## #   TotRmsAbvGrd <dbl>, Fireplaces <dbl>, GarageYrBlt <dbl>, GarageCars <dbl>,
## #   GarageArea <dbl>, WoodDeckSF <dbl>, OpenPorchSF <dbl>, EnclosedPorch <dbl>,
## #   3SsnPorch <dbl>, ScreenPorch <dbl>, PoolArea <dbl>, MiscVal <dbl>,
## #   MoSold <dbl>, YrSold <dbl>, SalePrice <dbl>, TotalArea <dbl>,
## #   TotalBaths <dbl>, logSalePrice <dbl>
# Pearson correlation matrix of all numeric columns, rounded to 4 decimals,
# drawn as a square-tile correlation plot.
numeric.train <- train[, nums]
correlation <- round(cor(numeric.train, method = "pearson"), digits = 4)
ggcorrplot(correlation, method = "square")

#Make a Table With Sales Price

# Correlation of every numeric column with logSalePrice, sorted in decreasing
# order, keeping only rows whose absolute correlation exceeds 0.5.
x <- data.frame(
  Variables = rownames(correlation),
  Cor = correlation[, "logSalePrice"]
)
x <- x[order(x$Cor, decreasing = TRUE), ]
x <- x[which(abs(x$Cor) > 0.5), ]
rownames(x) <- NULL
print(x)
##       Variables    Cor
## 1  logSalePrice 1.0000
## 2     SalePrice 0.9484
## 3   OverallQual 0.8172
## 4     TotalArea 0.7733
## 5     GrLivArea 0.7009
## 6    GarageCars 0.6806
## 7    TotalBaths 0.6730
## 8    GarageArea 0.6509
## 9   TotalBsmtSF 0.6121
## 10     1stFlrSF 0.5970
## 11     FullBath 0.5948
## 12    YearBuilt 0.5866
## 13 YearRemodAdd 0.5656
## 14 TotRmsAbvGrd 0.5344

#Now we trim the list down. We can take away 1stFlrSF, FullBath, YearRemodAdd, GarageArea, SalePrice, and TotRmsAbvGrd.

#Factor List

# Build the modelling frame for the categorical predictors: the Id column,
# every character column of train, and logSalePrice as the response.
# (A leftover call that fitted SalePrice on `fulldt.fac.train` was removed:
# that object is never created in this script, so the call stopped with an
# error, and its result was overwritten by the fit below anyway.)
factors.train <- train %>%
  select(Id, which(vapply(., is.character, logical(1)))) %>%
  mutate(logSalePrice = train$logSalePrice)

# Run RF algorithm with all factor variables
# NOTE(review): Id is a column of factors.train, so `~ .` also offers it to the
# forest as a predictor - confirm that is intended.
rf <- randomForest(logSalePrice ~ ., data = factors.train, importance = TRUE)
rf
## 
## Call:
##  randomForest(formula = logSalePrice ~ ., data = factors.train,      importance = T) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 14
## 
##           Mean of squared residuals: 0.037802
##                     % Var explained: 76.29
# Rank the predictors by the first column of the importance matrix (labelled
# %IncMSE here) and show the ten highest.
rf.importance <- importance(rf)
importance.table <- data.frame(
  Names = rownames(rf.importance),
  '%IncMSE' = rf.importance[, 1]
)
ranking <- order(importance.table[, 2], decreasing = TRUE)
importance.table <- importance.table[ranking, ]
rownames(importance.table) <- NULL
importance.table[1:10, ]
##           Names X.IncMSE
## 1   FireplaceQu 44.36213
## 2  Neighborhood 34.92391
## 3     ExterQual 32.93325
## 4      MSZoning 29.96128
## 5      BldgType 29.15379
## 6    HouseStyle 25.91260
## 7      BsmtQual 24.61693
## 8   KitchenQual 24.35134
## 9    GarageType 19.57114
## 10   MasVnrType 17.75098

Looking at the data

Looks good! Now I am going to run P-tests on each of the factors.

#Take out 1stFlrSF because that is captured in total, garagearea isn't needed because we have garagecars

# Scatter plot of one numeric predictor against logSalePrice with a red
# least-squares trend line.
#   predictor: name of the column to put on the x axis, as a string.
#   data:      data frame holding that column and logSalePrice (default: train).
# Returns the ggplot object; called at top level it is printed automatically.
# The title goes through labs() because `title` is not an aesthetic, so
# supplying it inside aes() never put a title on the plot. Using one helper
# also keeps the x variable and the title in step for every plot.
plot_against_log_price <- function(predictor, data = train) {
  ggplot(data, aes(x = .data[[predictor]], y = logSalePrice)) +
    geom_point() +
    stat_smooth(method = "lm", colour = "red") +
    labs(title = predictor, x = predictor)
}

#OverallQual
plot_against_log_price("OverallQual")
## `geom_smooth()` using formula 'y ~ x'

#TotalArea
plot_against_log_price("TotalArea")
## `geom_smooth()` using formula 'y ~ x'

#GrLivArea (previously plotted OverallQual on the x axis by mistake)
plot_against_log_price("GrLivArea")
## `geom_smooth()` using formula 'y ~ x'

#GarageCars
plot_against_log_price("GarageCars")
## `geom_smooth()` using formula 'y ~ x'

#TotalBaths
plot_against_log_price("TotalBaths")
## `geom_smooth()` using formula 'y ~ x'

#TotalBsmtSF
plot_against_log_price("TotalBsmtSF")
## `geom_smooth()` using formula 'y ~ x'

#YearBuilt
plot_against_log_price("YearBuilt")
## `geom_smooth()` using formula 'y ~ x'

We can use either backward elimination or forward selection. These techniques are often referred to as stepwise model selection strategies. They are used to remove variables that aren’t actually good fits for the model.

Looking at the p-values shows that all the selected variables are statistically significant.

#FireplaceQu
#ExterQual  
#Neighborhood
#MSZoning   
#BldgType           
#KitchenQual            
#GarageType
#BsmtQual
#HouseStyle
#Foundation
#FireplaceQu
# Notched boxplots of logSalePrice by level of each categorical predictor.
# Both sides of every formula are looked up in `train` (data = train), so the
# response and the grouping column always come from the same rows and the
# axes are labelled with the plain column names.
boxplot(logSalePrice ~ FireplaceQu, data = train, notch = TRUE)

#ExterQual
boxplot(logSalePrice ~ ExterQual, data = train, notch = TRUE)

#Neighborhood
boxplot(logSalePrice ~ Neighborhood, data = train, notch = TRUE)
## Warning in bxp(list(stats = structure(c(11.4392789248019, 11.7558716435806, :
## some notches went outside hinges ('box'): maybe set notch=FALSE

#MSZoning
boxplot(logSalePrice ~ MSZoning, data = train, notch = TRUE)
## Warning in bxp(list(stats = structure(c(10.4602421081905, 10.5966347330961, :
## some notches went outside hinges ('box'): maybe set notch=FALSE

#BldgType
boxplot(logSalePrice ~ BldgType, data = train, notch = TRUE)

#KitchenQual
boxplot(logSalePrice ~ KitchenQual, data = train, notch = TRUE)

#GarageType
boxplot(logSalePrice ~ GarageType, data = train, notch = TRUE)
## Warning in bxp(list(stats = structure(c(10.8589989975636, 11.3205535723228, :
## some notches went outside hinges ('box'): maybe set notch=FALSE

#BsmtQual
boxplot(logSalePrice ~ BsmtQual, data = train, notch = TRUE)

#HouseStyle
boxplot(logSalePrice ~ HouseStyle, data = train, notch = TRUE)
## Warning in bxp(list(stats = structure(c(11.2772031314492, 11.6483301019764, :
## some notches went outside hinges ('box'): maybe set notch=FALSE

#Foundation
boxplot(logSalePrice ~ Foundation, data = train, notch = TRUE)
## Warning in bxp(list(stats = structure(c(10.9767820332199, 11.5327280922664, :
## some notches went outside hinges ('box'): maybe set notch=FALSE

#Linear Models to see how they will fit
# TotalBsmtSF is left out of both formulas. TotalArea is computed as
# GrLivArea + TotalBsmtSF, so with all three terms present the design matrix
# is rank-deficient and lm() reports TotalBsmtSF as NA ("not defined because
# of singularities"). Dropping the aliased term leaves the fitted values and
# the remaining coefficient estimates unchanged.
modelnumeric <- lm(
  logSalePrice ~ OverallQual + TotalArea + GrLivArea + GarageCars +
    TotalBaths + YearBuilt,
  data = train
)

# Same numeric terms plus the categorical predictors picked from the random
# forest importance ranking.
modelall <- lm(
  logSalePrice ~ OverallQual + TotalArea + GrLivArea + GarageCars +
    TotalBaths + YearBuilt + FireplaceQu + ExterQual + Neighborhood +
    MSZoning + BldgType + KitchenQual + GarageType + BsmtQual + HouseStyle +
    Foundation,
  data = train
)
summary(modelall)
## 
## Call:
## lm(formula = logSalePrice ~ OverallQual + TotalArea + GrLivArea + 
##     GarageCars + TotalBaths + TotalBsmtSF + YearBuilt + FireplaceQu + 
##     ExterQual + Neighborhood + MSZoning + BldgType + KitchenQual + 
##     GarageType + BsmtQual + HouseStyle + Foundation, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.44775 -0.06306  0.00861  0.07631  0.46669 
## 
## Coefficients: (1 not defined because of singularities)
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          8.780e+00  7.029e-01  12.491  < 2e-16 ***
## OverallQual          6.483e-02  5.501e-03  11.783  < 2e-16 ***
## TotalArea           -4.286e-06  1.919e-05  -0.223 0.823292    
## GrLivArea            2.188e-04  3.325e-05   6.580 6.65e-11 ***
## GarageCars           6.266e-02  9.087e-03   6.895 8.16e-12 ***
## TotalBaths           7.036e-02  7.461e-03   9.431  < 2e-16 ***
## TotalBsmtSF                 NA         NA      NA       NA    
## YearBuilt            8.458e-04  3.588e-04   2.357 0.018555 *  
## FireplaceQuEx        7.383e-02  3.247e-02   2.274 0.023135 *  
## FireplaceQuFa        5.201e-02  2.603e-02   1.998 0.045897 *  
## FireplaceQuGd        5.403e-02  1.145e-02   4.718 2.62e-06 ***
## FireplaceQuPo        1.627e-02  3.264e-02   0.499 0.618210    
## FireplaceQuTA        3.771e-02  1.212e-02   3.112 0.001894 ** 
## ExterQualFa         -7.981e-02  5.363e-02  -1.488 0.136885    
## ExterQualGd         -9.493e-03  2.681e-02  -0.354 0.723344    
## ExterQualTA         -7.780e-03  2.987e-02  -0.260 0.794566    
## NeighborhoodBlmngtn  3.153e-02  4.298e-02   0.733 0.463394    
## NeighborhoodBlueste -2.731e-02  1.045e-01  -0.261 0.793942    
## NeighborhoodBrDale  -3.280e-02  4.716e-02  -0.695 0.486899    
## NeighborhoodBrkSide  3.416e-02  2.745e-02   1.244 0.213616    
## NeighborhoodClearCr  1.345e-01  2.934e-02   4.583 5.00e-06 ***
## NeighborhoodCollgCr  3.332e-02  2.093e-02   1.592 0.111705    
## NeighborhoodCrawfor  1.706e-01  2.474e-02   6.898 7.97e-12 ***
## NeighborhoodEdwards -7.062e-02  1.882e-02  -3.752 0.000183 ***
## NeighborhoodGilbert -3.107e-03  2.511e-02  -0.124 0.901557    
## NeighborhoodIDOTRR  -3.652e-02  3.778e-02  -0.967 0.333911    
## NeighborhoodMeadowV -9.586e-02  4.449e-02  -2.155 0.031360 *  
## NeighborhoodMitchel  5.552e-03  2.400e-02   0.231 0.817123    
## NeighborhoodNoRidge  1.314e-01  3.052e-02   4.305 1.79e-05 ***
## NeighborhoodNPkVill -1.258e-02  5.332e-02  -0.236 0.813582    
## NeighborhoodNridgHt  1.441e-01  2.766e-02   5.208 2.20e-07 ***
## NeighborhoodNWAmes  -2.015e-02  2.105e-02  -0.957 0.338500    
## NeighborhoodOldTown -4.866e-02  2.816e-02  -1.728 0.084228 .  
## NeighborhoodSawyer  -1.341e-02  1.920e-02  -0.698 0.485162    
## NeighborhoodSawyerW -3.638e-03  2.442e-02  -0.149 0.881582    
## NeighborhoodSomerst  7.947e-02  3.723e-02   2.135 0.032971 *  
## NeighborhoodStoneBr  1.820e-01  3.540e-02   5.141 3.12e-07 ***
## NeighborhoodSWISU   -1.929e-02  3.561e-02  -0.542 0.588120    
## NeighborhoodTimber   5.549e-02  2.828e-02   1.962 0.049951 *  
## NeighborhoodVeenker  1.584e-01  4.545e-02   3.486 0.000506 ***
## MSZoningFV           4.250e-01  6.717e-02   6.327 3.37e-10 ***
## MSZoningRH           4.310e-01  6.768e-02   6.369 2.58e-10 ***
## MSZoningRL           4.163e-01  5.632e-02   7.392 2.49e-13 ***
## MSZoningRM           3.839e-01  5.257e-02   7.304 4.69e-13 ***
## BldgType2fmCon       6.435e-03  2.775e-02   0.232 0.816662    
## BldgTypeDuplex      -8.032e-02  2.415e-02  -3.326 0.000903 ***
## BldgTypeTwnhs       -1.531e-01  3.023e-02  -5.066 4.61e-07 ***
## BldgTypeTwnhsE      -1.040e-01  1.999e-02  -5.205 2.23e-07 ***
## KitchenQualFa       -1.810e-01  3.379e-02  -5.355 1.00e-07 ***
## KitchenQualGd       -6.595e-02  2.001e-02  -3.296 0.001005 ** 
## KitchenQualTA       -1.094e-01  2.230e-02  -4.908 1.03e-06 ***
## GarageType2Types    -8.828e-02  6.621e-02  -1.333 0.182659    
## GarageTypeAttchd     6.396e-02  2.345e-02   2.728 0.006460 ** 
## GarageTypeBasment    4.879e-02  4.075e-02   1.197 0.231393    
## GarageTypeBuiltIn    3.629e-02  2.925e-02   1.241 0.214962    
## GarageTypeCarPort   -2.059e-02  5.325e-02  -0.387 0.699118    
## GarageTypeDetchd     3.690e-02  2.318e-02   1.592 0.111673    
## BsmtQualEx           1.945e-01  5.087e-02   3.824 0.000137 ***
## BsmtQualFa           8.093e-02  5.087e-02   1.591 0.111868    
## BsmtQualGd           1.510e-01  4.712e-02   3.206 0.001379 ** 
## BsmtQualTA           1.318e-01  4.523e-02   2.915 0.003616 ** 
## HouseStyle1.5Unf    -3.025e-02  4.162e-02  -0.727 0.467476    
## HouseStyle1Story     2.900e-02  1.733e-02   1.673 0.094471 .  
## HouseStyle2.5Fin    -2.693e-03  5.686e-02  -0.047 0.962238    
## HouseStyle2.5Unf     1.513e-02  4.630e-02   0.327 0.743872    
## HouseStyle2Story    -3.947e-02  1.626e-02  -2.428 0.015320 *  
## HouseStyleSFoyer     4.622e-02  3.100e-02   1.491 0.136116    
## HouseStyleSLvl       1.625e-02  2.378e-02   0.683 0.494531    
## FoundationCBlock     6.179e-02  1.754e-02   3.523 0.000440 ***
## FoundationPConc      5.953e-02  1.962e-02   3.034 0.002456 ** 
## FoundationSlab       2.329e-02  5.240e-02   0.444 0.656792    
## FoundationStone      1.563e-01  6.082e-02   2.571 0.010252 *  
## FoundationWood       4.207e-03  8.641e-02   0.049 0.961173    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1411 on 1388 degrees of freedom
## Multiple R-squared:  0.8813, Adjusted R-squared:  0.8753 
## F-statistic: 145.2 on 71 and 1388 DF,  p-value: < 2.2e-16
# Training frame for the final model: the chosen predictor columns plus the
# log sale price under the name finalSalePrice.
final.train <- train %>%
  select(all_of(c(
    "Id", "OverallQual", "TotalArea", "GrLivArea", "GarageCars", "TotalBaths",
    "TotalBsmtSF", "YearBuilt", "FireplaceQu", "ExterQual", "Neighborhood",
    "KitchenQual", "GarageType", "BsmtQual", "HouseStyle"
  ))) %>%
  mutate(finalSalePrice = logSalePrice)

# Fit the prediction forest on every column of final.train.
# NOTE(review): Id is one of those columns, so `~ .` also uses it as a
# predictor - confirm that is intended before relying on the model.
# importance = TRUE rather than T: T is an ordinary variable that can be
# reassigned, TRUE cannot.
random.forest.prediction.model <- randomForest(finalSalePrice ~ ., data = final.train, importance = TRUE)
random.forest.prediction.model
## 
## Call:
##  randomForest(formula = finalSalePrice ~ ., data = final.train,      importance = T) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##           Mean of squared residuals: 0.02224355
##                     % Var explained: 86.05
# Reduce the test set to the same predictor columns the model was trained on.
final.test <- test %>%
  select(all_of(c(
    "Id", "OverallQual", "TotalArea", "GrLivArea", "GarageCars", "TotalBaths",
    "TotalBsmtSF", "YearBuilt", "FireplaceQu", "ExterQual", "Neighborhood",
    "KitchenQual", "GarageType", "BsmtQual", "HouseStyle"
  )))

# Predict on that reduced frame (it was built but previously left unused) and
# undo the log transform with exp() to get prices on the original scale.
pred <- exp(predict(random.forest.prediction.model, newdata = final.test))

# Guard the export: every test row must have received a prediction, otherwise
# the Id and SalePrice columns below would not line up.
stopifnot(length(pred) == nrow(test))

#Finally create a csv file for test
# This call used to sit inside the comment line above (with curly quotes), so
# it never ran and no file was produced.
write.csv(x = data.frame(Id = test$Id, SalePrice = pred), row.names = FALSE, file = "predictions.csv")