library('lsr')
library('ggplot2')
## Warning: package 'ggplot2' was built under R version 4.0.4
library('ggcorrplot')
## Warning: package 'ggcorrplot' was built under R version 4.0.4
library('randomForest')
## Warning: package 'randomForest' was built under R version 4.0.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library('readxl')
library('tidyverse')
## Warning: package 'tidyverse' was built under R version 4.0.4
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.6 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## v purrr 0.3.4
## Warning: package 'readr' was built under R version 4.0.4
## Warning: package 'forcats' was built under R version 4.0.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::combine() masks randomForest::combine()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x randomForest::margin() masks ggplot2::margin()
library('tidyr')
library('tibble')
# Load the training and test sets from the Excel workbooks (paths are relative
# to the working directory).
train <- readxl::read_excel(path = "train.xlsx")
test <- readxl::read_excel(path = "test.xlsx")

# Count how many columns of each class the training set holds.
table(sapply(X = train, FUN = class))
##
## character numeric
## 43 38
# Same tally for the test set.
table(sapply(X = test, FUN = class))
##
## character numeric
## 43 37
To see how much of the data is numeric and how much is not, use table(sapply(train, class)).
Create a numeric data set.
# Flag the numeric columns of each data set and keep them as numeric-only
# subsets. `nums` is reused, so after this block it holds the mask for `test`.
nums <- vapply(train, is.numeric, logical(1))
numeric.train <- train[, nums]
nums <- vapply(test, is.numeric, logical(1))
numeric.test <- test[, nums]

# Flag the character (categorical) columns of the training set and print them.
factors <- vapply(train, is.character, logical(1))
train[, factors]
## # A tibble: 1,460 x 43
## MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 RL Pave 0 Reg Lvl AllPub Inside Gtl
## 2 RL Pave 0 Reg Lvl AllPub FR2 Gtl
## 3 RL Pave 0 IR1 Lvl AllPub Inside Gtl
## 4 RL Pave 0 IR1 Lvl AllPub Corner Gtl
## 5 RL Pave 0 IR1 Lvl AllPub FR2 Gtl
## 6 RL Pave 0 IR1 Lvl AllPub Inside Gtl
## 7 RL Pave 0 Reg Lvl AllPub Inside Gtl
## 8 RL Pave 0 IR1 Lvl AllPub Corner Gtl
## 9 RM Pave 0 Reg Lvl AllPub Inside Gtl
## 10 RL Pave 0 Reg Lvl AllPub Corner Gtl
## # ... with 1,450 more rows, and 35 more variables: Neighborhood <chr>,
## # Condition1 <chr>, Condition2 <chr>, BldgType <chr>, HouseStyle <chr>,
## # RoofStyle <chr>, RoofMatl <chr>, Exterior1st <chr>, Exterior2nd <chr>,
## # MasVnrType <chr>, ExterQual <chr>, ExterCond <chr>, Foundation <chr>,
## # BsmtQual <chr>, BsmtCond <chr>, BsmtExposure <chr>, BsmtFinType1 <chr>,
## # BsmtFinType2 <chr>, Heating <chr>, HeatingQC <chr>, CentralAir <chr>,
## # Electrical <chr>, KitchenQual <chr>, Functionl <chr>, FireplaceQu <chr>,
## # GarageType <chr>, GarageFinish <chr>, GarageQual <chr>, GarageCond <chr>,
## # PavedDrive <chr>, PoolQC <chr>, Fence <chr>, MiscFeature <chr>,
## # SaleType <chr>, SaleCondition <chr>
# Keep the character-only columns of the training set as their own table.
factor.train <- train[factors]
#Data Cleaning
I replaced all NA values with 0. I did this while importing the data set. While I could have replaced NA with “None” for the categorical data points, I decided to put a 0 there as well and just explain that it’s not actually numeric; rather, it means none. I also transformed Sale Price into log Sale Price using a log transformation.
# Inspect the raw sale price: kernel density and normal Q-Q plot.
plot(density(train$SalePrice))
qqnorm(train$SalePrice, pch = 1, frame = FALSE)

# Log-transform the sale price and repeat the same two diagnostics.
logSalePrice <- log(train$SalePrice)
plot(density(logSalePrice))
qqnorm(logSalePrice, pch = 1, frame = FALSE)

# Engineered features: total living + basement area, and a bathroom count in
# which each half bath counts as 0.5. The training set also gets the log
# sale price as a column.
train <- train %>%
  mutate(
    TotalArea = GrLivArea + TotalBsmtSF,
    TotalBaths = BsmtFullBath + BsmtHalfBath / 2 + FullBath + HalfBath / 2,
    logSalePrice = log(SalePrice)
  )
test <- test %>%
  mutate(
    TotalArea = GrLivArea + TotalBsmtSF,
    TotalBaths = BsmtFullBath + BsmtHalfBath / 2 + FullBath + HalfBath / 2
  )

# Recompute the numeric-column mask now that new columns exist, and print the
# numeric part of the training set.
nums <- vapply(train, is.numeric, logical(1))
train[, nums]
## # A tibble: 1,460 x 41
## Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 60 65 8450 7 5 2003
## 2 2 20 80 9600 6 8 1976
## 3 3 60 68 11250 7 5 2001
## 4 4 70 60 9550 7 5 1915
## 5 5 60 84 14260 8 5 2000
## 6 6 50 85 14115 5 5 1993
## 7 7 20 75 10084 8 5 2004
## 8 8 60 0 10382 7 6 1973
## 9 9 50 51 6120 7 5 1931
## 10 10 190 50 7420 5 6 1939
## # ... with 1,450 more rows, and 34 more variables: YearRemodAdd <dbl>,
## # MasVnrArea <dbl>, BsmtFinSF1 <dbl>, BsmtFinSF2 <dbl>, BsmtUnfSF <dbl>,
## # TotalBsmtSF <dbl>, 1stFlrSF <dbl>, 2ndFlrSF <dbl>, LowQualFinSF <dbl>,
## # GrLivArea <dbl>, BsmtFullBath <dbl>, BsmtHalfBath <dbl>, FullBath <dbl>,
## # HalfBath <dbl>, BedroomAbvGr <dbl>, Kitchen AbvGr <dbl>,
## # TotRmsAbvGrd <dbl>, Fireplaces <dbl>, GarageYrBlt <dbl>, GarageCars <dbl>,
## # GarageArea <dbl>, WoodDeckSF <dbl>, OpenPorchSF <dbl>, EnclosedPorch <dbl>,
## # 3SsnPorch <dbl>, ScreenPorch <dbl>, PoolArea <dbl>, MiscVal <dbl>,
## # MoSold <dbl>, YrSold <dbl>, SalePrice <dbl>, TotalArea <dbl>,
## # TotalBaths <dbl>, logSalePrice <dbl>
# Refresh the numeric-only training subset so it includes the engineered
# columns.
numeric.train <- train[, nums]

# Pairwise Pearson correlations between all numeric columns, rounded to four
# decimals, shown as a square-tile correlation plot.
correlation <- round(cor(numeric.train, method = "pearson"), digits = 4)
ggcorrplot(correlation, method = "square")
# Tabulate the variables whose correlation with logSalePrice is above 0.5 in
# absolute value, strongest positive correlation first.
x <- data.frame(
  Variables = rownames(correlation),
  Cor = correlation[, "logSalePrice"]
)
x <- x[order(x$Cor, decreasing = TRUE), ]
x <- x[which(abs(x$Cor) > 0.5), ]
rownames(x) <- NULL
print(x)
## Variables Cor
## 1 logSalePrice 1.0000
## 2 SalePrice 0.9484
## 3 OverallQual 0.8172
## 4 TotalArea 0.7733
## 5 GrLivArea 0.7009
## 6 GarageCars 0.6806
## 7 TotalBaths 0.6730
## 8 GarageArea 0.6509
## 9 TotalBsmtSF 0.6121
## 10 1stFlrSF 0.5970
## 11 FullBath 0.5948
## 12 YearBuilt 0.5866
## 13 YearRemodAdd 0.5656
## 14 TotRmsAbvGrd 0.5344
#Now we trim the list down. We can take away 1stFlrSF, FullBath, YearRemodAdd, GarageArea, SalePrice, and TotRmsAbvGrd.
#Factor List
# Collect the row Id, every character (categorical) column and the log sale
# price; this frame feeds a random forest used to rank the categorical
# variables.
# A previous first fit on `fulldt.fac.train` was dropped: that object is never
# created in this script, so the call stopped with "object not found", and its
# result was overwritten by the fit below anyway.
factors.train <- train %>%
  select(Id, where(is.character), logSalePrice)

# Run RF algorithm with all factor variables
rf <- randomForest(logSalePrice ~ ., data = factors.train, importance = TRUE)
rf
##
## Call:
## randomForest(formula = logSalePrice ~ ., data = factors.train, importance = T)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 14
##
## Mean of squared residuals: 0.037802
## % Var explained: 76.29
# Rank the predictors of `rf` by the first column of importance(rf) (the
# %IncMSE measure) and show the ten highest. data.frame() rewrites the column
# name "%IncMSE" to the syntactic name X.IncMSE.
rf.importance <- importance(rf)
importance.table <- data.frame(
  Names = rownames(rf.importance),
  "%IncMSE" = rf.importance[, 1]
)
ranking <- order(importance.table[, 2], decreasing = TRUE)
importance.table <- importance.table[ranking, ]
rownames(importance.table) <- NULL
importance.table[seq_len(10), ]
## Names X.IncMSE
## 1 FireplaceQu 44.36213
## 2 Neighborhood 34.92391
## 3 ExterQual 32.93325
## 4 MSZoning 29.96128
## 5 BldgType 29.15379
## 6 HouseStyle 25.91260
## 7 BsmtQual 24.61693
## 8 KitchenQual 24.35134
## 9 GarageType 19.57114
## 10 MasVnrType 17.75098
Looking at the data
Looks good! Now I am going to run P-tests on each of the factors.
#Take out 1stFlrSF because it is already captured in TotalArea; GarageArea isn't needed because we have GarageCars.
# Scatter plots of each shortlisted numeric predictor against logSalePrice,
# with a fitted linear trend in red. Titles are set with labs(): `title` is not
# a ggplot2 aesthetic, so mapping it inside aes() does not draw a title.
#OverallQual
ggplot(train, aes(x = OverallQual, y = logSalePrice)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  labs(title = "OverallQual")
## `geom_smooth()` using formula 'y ~ x'
#TotalArea
ggplot(train, aes(x = TotalArea, y = logSalePrice)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  labs(title = "TotalArea")
## `geom_smooth()` using formula 'y ~ x'
#GrLivArea
# The x axis is GrLivArea here; it previously repeated OverallQual.
ggplot(train, aes(x = GrLivArea, y = logSalePrice)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  labs(title = "GrLivArea")
## `geom_smooth()` using formula 'y ~ x'
#GarageCars
ggplot(train, aes(x = GarageCars, y = logSalePrice)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  labs(title = "GarageCars")
## `geom_smooth()` using formula 'y ~ x'
#TotalBaths
ggplot(train, aes(x = TotalBaths, y = logSalePrice)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  labs(title = "TotalBaths")
## `geom_smooth()` using formula 'y ~ x'
#TotalBsmtSF
ggplot(train, aes(x = TotalBsmtSF, y = logSalePrice)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  labs(title = "TotalBsmtSF")
## `geom_smooth()` using formula 'y ~ x'
#YearBuilt
ggplot(train, aes(x = YearBuilt, y = logSalePrice)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  labs(title = "YearBuilt")
## `geom_smooth()` using formula 'y ~ x'
We can use either backward elimination or forward selection. These techniques are often referred to as stepwise model selection strategies. They are used to remove variables that aren’t actually good fits for the model.
Looking at the p-values, they show that all the selected variables are statistically significant.
#FireplaceQu
#ExterQual
#Neighborhood
#MSZoning
#BldgType
#KitchenQual
#GarageType
#BsmtQual
#HouseStyle
#Foundation
# Notched boxplots of log sale price by level of each shortlisted categorical
# variable. Both sides of every formula are looked up in `train` (data = train),
# so the response and the grouping column always come from the same rows and
# the axis labels show the plain column names.
#FireplaceQu
boxplot(logSalePrice ~ FireplaceQu, data = train, notch = TRUE)
#ExterQual
boxplot(logSalePrice ~ ExterQual, data = train, notch = TRUE)
#Neighborhood
boxplot(logSalePrice ~ Neighborhood, data = train, notch = TRUE)
## Warning in bxp(list(stats = structure(c(11.4392789248019, 11.7558716435806, :
## some notches went outside hinges ('box'): maybe set notch=FALSE
#MSZoning
boxplot(logSalePrice ~ MSZoning, data = train, notch = TRUE)
## Warning in bxp(list(stats = structure(c(10.4602421081905, 10.5966347330961, :
## some notches went outside hinges ('box'): maybe set notch=FALSE
#BldgType
boxplot(logSalePrice ~ BldgType, data = train, notch = TRUE)
#KitchenQual
boxplot(logSalePrice ~ KitchenQual, data = train, notch = TRUE)
#GarageType
boxplot(logSalePrice ~ GarageType, data = train, notch = TRUE)
## Warning in bxp(list(stats = structure(c(10.8589989975636, 11.3205535723228, :
## some notches went outside hinges ('box'): maybe set notch=FALSE
#BsmtQual
boxplot(logSalePrice ~ BsmtQual, data = train, notch = TRUE)
#HouseStyle
boxplot(logSalePrice ~ HouseStyle, data = train, notch = TRUE)
## Warning in bxp(list(stats = structure(c(11.2772031314492, 11.6483301019764, :
## some notches went outside hinges ('box'): maybe set notch=FALSE
#Foundation
boxplot(logSalePrice ~ Foundation, data = train, notch = TRUE)
## Warning in bxp(list(stats = structure(c(10.9767820332199, 11.5327280922664, :
## some notches went outside hinges ('box'): maybe set notch=FALSE
# Linear models to see how well the shortlisted predictors fit logSalePrice.

# Numeric predictors only.
modelnumeric <- lm(
  logSalePrice ~ OverallQual + TotalArea + GrLivArea + GarageCars +
    TotalBaths + TotalBsmtSF + YearBuilt,
  data = train
)

# Numeric plus categorical predictors. TotalArea was built as
# GrLivArea + TotalBsmtSF, so the three terms are linearly dependent and the
# summary reports the TotalBsmtSF coefficient as NA (singularity).
modelall <- lm(
  logSalePrice ~ OverallQual + TotalArea + GrLivArea + GarageCars +
    TotalBaths + TotalBsmtSF + YearBuilt + FireplaceQu + ExterQual +
    Neighborhood + MSZoning + BldgType + KitchenQual + GarageType +
    BsmtQual + HouseStyle + Foundation,
  data = train
)
summary(modelall)
##
## Call:
## lm(formula = logSalePrice ~ OverallQual + TotalArea + GrLivArea +
## GarageCars + TotalBaths + TotalBsmtSF + YearBuilt + FireplaceQu +
## ExterQual + Neighborhood + MSZoning + BldgType + KitchenQual +
## GarageType + BsmtQual + HouseStyle + Foundation, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.44775 -0.06306 0.00861 0.07631 0.46669
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.780e+00 7.029e-01 12.491 < 2e-16 ***
## OverallQual 6.483e-02 5.501e-03 11.783 < 2e-16 ***
## TotalArea -4.286e-06 1.919e-05 -0.223 0.823292
## GrLivArea 2.188e-04 3.325e-05 6.580 6.65e-11 ***
## GarageCars 6.266e-02 9.087e-03 6.895 8.16e-12 ***
## TotalBaths 7.036e-02 7.461e-03 9.431 < 2e-16 ***
## TotalBsmtSF NA NA NA NA
## YearBuilt 8.458e-04 3.588e-04 2.357 0.018555 *
## FireplaceQuEx 7.383e-02 3.247e-02 2.274 0.023135 *
## FireplaceQuFa 5.201e-02 2.603e-02 1.998 0.045897 *
## FireplaceQuGd 5.403e-02 1.145e-02 4.718 2.62e-06 ***
## FireplaceQuPo 1.627e-02 3.264e-02 0.499 0.618210
## FireplaceQuTA 3.771e-02 1.212e-02 3.112 0.001894 **
## ExterQualFa -7.981e-02 5.363e-02 -1.488 0.136885
## ExterQualGd -9.493e-03 2.681e-02 -0.354 0.723344
## ExterQualTA -7.780e-03 2.987e-02 -0.260 0.794566
## NeighborhoodBlmngtn 3.153e-02 4.298e-02 0.733 0.463394
## NeighborhoodBlueste -2.731e-02 1.045e-01 -0.261 0.793942
## NeighborhoodBrDale -3.280e-02 4.716e-02 -0.695 0.486899
## NeighborhoodBrkSide 3.416e-02 2.745e-02 1.244 0.213616
## NeighborhoodClearCr 1.345e-01 2.934e-02 4.583 5.00e-06 ***
## NeighborhoodCollgCr 3.332e-02 2.093e-02 1.592 0.111705
## NeighborhoodCrawfor 1.706e-01 2.474e-02 6.898 7.97e-12 ***
## NeighborhoodEdwards -7.062e-02 1.882e-02 -3.752 0.000183 ***
## NeighborhoodGilbert -3.107e-03 2.511e-02 -0.124 0.901557
## NeighborhoodIDOTRR -3.652e-02 3.778e-02 -0.967 0.333911
## NeighborhoodMeadowV -9.586e-02 4.449e-02 -2.155 0.031360 *
## NeighborhoodMitchel 5.552e-03 2.400e-02 0.231 0.817123
## NeighborhoodNoRidge 1.314e-01 3.052e-02 4.305 1.79e-05 ***
## NeighborhoodNPkVill -1.258e-02 5.332e-02 -0.236 0.813582
## NeighborhoodNridgHt 1.441e-01 2.766e-02 5.208 2.20e-07 ***
## NeighborhoodNWAmes -2.015e-02 2.105e-02 -0.957 0.338500
## NeighborhoodOldTown -4.866e-02 2.816e-02 -1.728 0.084228 .
## NeighborhoodSawyer -1.341e-02 1.920e-02 -0.698 0.485162
## NeighborhoodSawyerW -3.638e-03 2.442e-02 -0.149 0.881582
## NeighborhoodSomerst 7.947e-02 3.723e-02 2.135 0.032971 *
## NeighborhoodStoneBr 1.820e-01 3.540e-02 5.141 3.12e-07 ***
## NeighborhoodSWISU -1.929e-02 3.561e-02 -0.542 0.588120
## NeighborhoodTimber 5.549e-02 2.828e-02 1.962 0.049951 *
## NeighborhoodVeenker 1.584e-01 4.545e-02 3.486 0.000506 ***
## MSZoningFV 4.250e-01 6.717e-02 6.327 3.37e-10 ***
## MSZoningRH 4.310e-01 6.768e-02 6.369 2.58e-10 ***
## MSZoningRL 4.163e-01 5.632e-02 7.392 2.49e-13 ***
## MSZoningRM 3.839e-01 5.257e-02 7.304 4.69e-13 ***
## BldgType2fmCon 6.435e-03 2.775e-02 0.232 0.816662
## BldgTypeDuplex -8.032e-02 2.415e-02 -3.326 0.000903 ***
## BldgTypeTwnhs -1.531e-01 3.023e-02 -5.066 4.61e-07 ***
## BldgTypeTwnhsE -1.040e-01 1.999e-02 -5.205 2.23e-07 ***
## KitchenQualFa -1.810e-01 3.379e-02 -5.355 1.00e-07 ***
## KitchenQualGd -6.595e-02 2.001e-02 -3.296 0.001005 **
## KitchenQualTA -1.094e-01 2.230e-02 -4.908 1.03e-06 ***
## GarageType2Types -8.828e-02 6.621e-02 -1.333 0.182659
## GarageTypeAttchd 6.396e-02 2.345e-02 2.728 0.006460 **
## GarageTypeBasment 4.879e-02 4.075e-02 1.197 0.231393
## GarageTypeBuiltIn 3.629e-02 2.925e-02 1.241 0.214962
## GarageTypeCarPort -2.059e-02 5.325e-02 -0.387 0.699118
## GarageTypeDetchd 3.690e-02 2.318e-02 1.592 0.111673
## BsmtQualEx 1.945e-01 5.087e-02 3.824 0.000137 ***
## BsmtQualFa 8.093e-02 5.087e-02 1.591 0.111868
## BsmtQualGd 1.510e-01 4.712e-02 3.206 0.001379 **
## BsmtQualTA 1.318e-01 4.523e-02 2.915 0.003616 **
## HouseStyle1.5Unf -3.025e-02 4.162e-02 -0.727 0.467476
## HouseStyle1Story 2.900e-02 1.733e-02 1.673 0.094471 .
## HouseStyle2.5Fin -2.693e-03 5.686e-02 -0.047 0.962238
## HouseStyle2.5Unf 1.513e-02 4.630e-02 0.327 0.743872
## HouseStyle2Story -3.947e-02 1.626e-02 -2.428 0.015320 *
## HouseStyleSFoyer 4.622e-02 3.100e-02 1.491 0.136116
## HouseStyleSLvl 1.625e-02 2.378e-02 0.683 0.494531
## FoundationCBlock 6.179e-02 1.754e-02 3.523 0.000440 ***
## FoundationPConc 5.953e-02 1.962e-02 3.034 0.002456 **
## FoundationSlab 2.329e-02 5.240e-02 0.444 0.656792
## FoundationStone 1.563e-01 6.082e-02 2.571 0.010252 *
## FoundationWood 4.207e-03 8.641e-02 0.049 0.961173
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1411 on 1388 degrees of freedom
## Multiple R-squared: 0.8813, Adjusted R-squared: 0.8753
## F-statistic: 145.2 on 71 and 1388 DF, p-value: < 2.2e-16
# Assemble the modelling frame: Id, the retained numeric and categorical
# predictors, and the log sale price stored as the response `finalSalePrice`.
final.train <- train %>%
  select(
    Id, OverallQual, TotalArea, GrLivArea, GarageCars, TotalBaths,
    TotalBsmtSF, YearBuilt, FireplaceQu, ExterQual, Neighborhood,
    KitchenQual, GarageType, BsmtQual, HouseStyle
  ) %>%
  mutate(finalSalePrice = logSalePrice)

# Fit the prediction forest on every column of final.train and print its
# summary. `importance = T` is left as written so the echoed Call is unchanged.
random.forest.prediction.model <- randomForest(
  finalSalePrice ~ .,
  data = final.train,
  importance = T
)
random.forest.prediction.model
##
## Call:
## randomForest(formula = finalSalePrice ~ ., data = final.train, importance = T)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 5
##
## Mean of squared residuals: 0.02224355
## % Var explained: 86.05
# Keep the same predictor columns from the test set that the forest was
# trained on.
final.test <- test %>%
  select(
    Id, OverallQual, TotalArea, GrLivArea, GarageCars, TotalBaths,
    TotalBsmtSF, YearBuilt, FireplaceQu, ExterQual, Neighborhood,
    KitchenQual, GarageType, BsmtQual, HouseStyle
  )

# Predict on the prepared test columns (final.test was previously built but
# never used). The model was fitted on the log of the sale price, so exp()
# converts the predictions back to the price scale.
pred <- exp(predict(random.forest.prediction.model, newdata = final.test))

# Finally create a csv file for test
# (This call used to be fused into the comment above with curly quotes, so the
# file was never written.)
write.csv(
  x = data.frame(Id = final.test$Id, SalePrice = pred),
  row.names = FALSE,
  file = "predictions.csv"
)