You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition.
Load the relevant libraries:
library( tidyverse )
library( tidymodels )
library( ggplot2 )
library( correlationfunnel )
library(reshape2)
library( hrbrthemes )
library( matlib )
library( matrixcalc )
library( MASS )
library( outliers )
library( ggfortify )
Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlation between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
Begin by importing the data:
train_url <- 'https://raw.githubusercontent.com/SmilodonCub/DATA605/master/house-prices-advanced-regression-techniques/train.csv'
test_url <- 'https://raw.githubusercontent.com/SmilodonCub/DATA605/master/house-prices-advanced-regression-techniques/test.csv'
train_df <- read.csv( train_url )
test_df <- read.csv( test_url )
Basic size of dataset:
sizedf <- dim( train_df )
mes <- paste( 'This dataset holds', sizedf[1], 'records\nEach record has data on', sizedf[2], 'features' )
cat( mes, sep = '\n' )
## This dataset holds 1460 records
## Each record has data on 81 features
This is not a particularly large dataset, as there are only 1460 records. However, it is quite high dimensional, with each record holding 81 features.
Explore the training dataset with descriptive statistics and appropriate visualizations:
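The overview below was presumably produced with glimpse() (a reconstruction; the chunk itself is not shown):
#inspect column types and example values for every feature
glimpse( train_df )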
## Rows: 1,460
## Columns: 81
## $ Id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ MSSubClass <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20…
## $ MSZoning <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM", "…
## $ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91,…
## $ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 61…
## $ Street <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave",…
## $ Alley <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ LotShape <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1",…
## $ LandContour <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl",…
## $ Utilities <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllP…
## $ LotConfig <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Inside", "…
## $ LandSlope <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl",…
## $ Neighborhood <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoRidge", …
## $ Condition1 <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm"…
## $ Condition2 <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm",…
## $ BldgType <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam",…
## $ HouseStyle <chr> "2Story", "1Story", "2Story", "2Story", "2Story", "1.5F…
## $ OverallQual <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5…
## $ OverallCond <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5…
## $ YearBuilt <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 1…
## $ YearRemodAdd <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 1…
## $ RoofStyle <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Gable", "…
## $ RoofMatl <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg", …
## $ Exterior1st <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "VinylSd", …
## $ Exterior2nd <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "VinylSd", …
## $ MasVnrType <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace", "None"…
## $ MasVnrArea <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306,…
## $ ExterQual <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", "TA", "…
## $ ExterCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "…
## $ Foundation <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "Wood", …
## $ BsmtQual <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", "TA", "…
## $ BsmtCond <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", "TA", "…
## $ BsmtExposure <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", "No", "…
## $ BsmtFinType1 <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ",…
## $ BsmtFinSF1 <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 9…
## $ BsmtFinType2 <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "BLQ",…
## $ BsmtFinSF2 <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ BsmtUnfSF <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 1…
## $ TotalBsmtSF <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 1…
## $ Heating <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA",…
## $ HeatingQC <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", "Gd", "…
## $ CentralAir <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", …
## $ Electrical <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "…
## $ X1stFlrSF <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077,…
## $ X2ndFlrSF <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0…
## $ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ GrLivArea <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 1…
## $ BsmtFullBath <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1…
## $ BsmtHalfBath <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ FullBath <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1…
## $ HalfBath <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1…
## $ BedroomAbvGr <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3…
## $ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1…
## $ KitchenQual <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", "TA", "…
## $ TotRmsAbvGrd <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, …
## $ Functional <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ",…
## $ Fireplaces <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0…
## $ FireplaceQu <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA", "TA",…
## $ GarageType <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd", "Attc…
## $ GarageYrBlt <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 1…
## $ GarageFinish <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn", "RFn",…
## $ GarageCars <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2…
## $ GarageArea <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, …
## $ GarageQual <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "Fa", "…
## $ GarageCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "…
## $ PavedDrive <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", …
## $ WoodDeckSF <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 16…
## $ OpenPorchSF <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0,…
## $ X3SsnPorch <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ ScreenPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0,…
## $ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ PoolQC <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Fence <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, NA…
## $ MiscFeature <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, NA, NA,…
## $ MiscVal <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700…
## $ MoSold <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 1…
## $ YrSold <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 2…
## $ SaleType <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "…
## $ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal", "Nor…
## $ SalePrice <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000,…
A quick scan of the data with the glimpse() function shows that the data is a mix of categorical (e.g. ‘Street’ and ‘Heating’) and quantitative (e.g. ‘LotArea’ and ‘TotRmsAbvGrd’) variables.
We would like to use the features to predict SalePrice. It is worth taking a moment to explore SalePrice:
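The summary statistics that follow come from a chunk that is not shown; presumably:
#five-number summary plus mean for the target variable
summary( train_df$SalePrice )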
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
SP_histo <- ggplot( train_df, aes( x=SalePrice ) ) + geom_histogram( fill="#69b3a2", color="#e9ecef", alpha=0.9 ) + ggtitle( 'Sale Price' )
SP_histo
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Judging by the envelope of the histogram and the fact that the distribution mean is greater than the median, we can see that the target distribution is right-skewed. To account for this, we will log-transform SalePrice to improve linearity.
l10t_SalePrice <- data.frame( 'SalePrice' = log10( train_df$SalePrice + 1 ) )
SP_histo <- ggplot( l10t_SalePrice, aes( x=SalePrice ) ) + geom_histogram( fill="#69b3a2", color="#e9ecef", alpha=0.9 ) + ggtitle( 'log10 transform Sale Price' )
SP_histo
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Restricting the training data to its numeric columns leaves 38 features:
## [1] 1460 38
Find the numeric features with the highest correlation to the target:
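The ranking below appears to come from correlationfunnel (loaded above); a plausible reconstruction of the hidden chunk, with the exact NA handling assumed:
#keep only the numeric columns; dim( numtrain ) prints the 1460 x 38 shown above
numtrain <- train_df %>% dplyr::select_if( is.numeric )
#rank features by correlation with the target; 'numcorrs' is reused further down
numcorrs <- numtrain %>% correlationfunnel::correlate( target = SalePrice )
head( numcorrs, 15 )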
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 38 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## # A tibble: 15 x 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 SalePrice <NA> 1
## 2 OverallQual <NA> 0.791
## 3 GrLivArea <NA> 0.709
## 4 GarageCars <NA> 0.640
## 5 GarageArea <NA> 0.623
## 6 TotalBsmtSF <NA> 0.614
## 7 X1stFlrSF <NA> 0.606
## 8 FullBath <NA> 0.561
## 9 TotRmsAbvGrd <NA> 0.534
## 10 YearBuilt <NA> 0.523
## 11 YearRemodAdd <NA> 0.507
## 12 Fireplaces <NA> 0.467
## 13 BsmtFinSF1 <NA> 0.386
## 14 WoodDeckSF <NA> 0.324
## 15 X2ndFlrSF <NA> 0.319
The features listed after SalePrice (which, of course, correlates perfectly with itself) have the highest correlations to the target. We can also visualize these relationships with a correlation heatmap:
top12names = c( 'OverallQual','GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', 'X1stFlrSF','FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd', 'Fireplaces', 'BsmtFinSF1')
top12 <- train_df %>%
 dplyr::select( all_of( top12names ) ) %>%
 drop_na()
top12_cor <- round( cor( top12 ), 2 )
top12_melted <- melt( top12_cor )
ggplot( data = top12_melted, aes(x=Var1, y=Var2, fill=value) ) +
 geom_tile() +
 scale_fill_distiller(palette = "RdBu") +
 theme_ipsum() +
 theme(axis.text.x = element_text(angle = 45, hjust=1)) +
 ggtitle( 'Correlation Heat Map' )
The heat map is very informative because it reveals that several of the features with the highest correlation to SalePrice are also highly correlated with one another. For instance, GarageArea is highly correlated with GarageCars; this makes sense, because garages with a higher holding capacity will necessarily be larger. Additionally, TotalBsmtSF (total basement square feet) is highly correlated with X1stFlrSF (first-floor square feet); again, this makes intuitive sense. In the interest of producing the most efficient model, these high correlations between features will be considered in future steps.
Correlation is a strong indication that a feature will contribute to how well the model we develop accounts for the target variable’s variance. This point can be illustrated by plotting the target against individual features.
#scatterplot of feature with highest correlation to SalePrice
ggplot(train_df, aes(y=SalePrice, x=OverallQual)) +
geom_point(size=3, color="#69b3a2") +
theme_ipsum() +
scale_y_log10() +
ggtitle( 'SalePrice (log10) ~ Overall Quality' )
#find the feature with the minimal absolute value correlation to SalePrice
numcorrs$correlation <- abs( numcorrs$correlation )
head( numcorrs[ order( numcorrs$correlation ), ], 1 )
## # A tibble: 1 x 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 BsmtFinSF2 <NA> 0.0114
The feature OverallQual has the highest correlation to SalePrice, and when SalePrice is plotted against it, the data exhibit a clear positive trend. This is not the case with BsmtFinSF2, which shows no clear trend; it was the least correlated numeric feature in the dataset. In fact, it has a slightly lower correlation than Id, which is a completely arbitrary feature label.
Next we will take the top three correlated features plotted previously in the heatmap and test the hypotheses that the correlation between each pairwise set of variables is 0.
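The test output below is consistent with cor.test() at an 80% confidence level (a reconstruction; the chunk is not shown):
#pairwise Pearson correlation tests with 80% confidence intervals
cor.test( train_df$OverallQual, train_df$GrLivArea, conf.level = 0.80 )
cor.test( train_df$OverallQual, train_df$GarageCars, conf.level = 0.80 )
cor.test( train_df$GarageCars, train_df$GrLivArea, conf.level = 0.80 )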
##
## Pearson's product-moment correlation
##
## data: train_df$OverallQual and train_df$GrLivArea
## t = 28.121, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5708061 0.6143422
## sample estimates:
## cor
## 0.5930074
##
## Pearson's product-moment correlation
##
## data: train_df$OverallQual and train_df$GarageCars
## t = 28.688, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5787769 0.6216992
## sample estimates:
## cor
## 0.6006707
##
## Pearson's product-moment correlation
##
## data: train_df$GarageCars and train_df$GrLivArea
## t = 20.18, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4405950 0.4930768
## sample estimates:
## cor
## 0.4672474
The p-values for all three correlation tests are vanishingly small and well below conventional criteria. Therefore, we can conclude that each pair of features is significantly correlated. Familywise error is a legitimate concern whenever several hypothesis tests are run together, because the probability of at least one false positive grows with the number of tests. Here, however, the p-values are so small (< 2.2e-16) that the conclusions would survive even a conservative Bonferroni correction, so familywise error is not a practical worry for this analysis.
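To make the inflation concrete (a quick sketch, assuming three independent tests at the conventional \(\alpha = 0.05\)):
#probability of at least one false positive across 3 independent tests
alpha <- 0.05
1 - (1 - alpha)^3 #about 0.14, versus 0.05 for a single test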
Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
#precision matrix = invert the correlation matrix
top12_inv <- inv( top12_cor )
#multiply correlation by precision
corpre <- top12_cor %*% top12_inv
print( round( corpre, 7 ) )
##
## OverallQual 1 0 0 0 0 0 0 0 0 0 0 0
## GrLivArea 0 1 0 0 0 0 0 0 0 0 0 0
## GarageCars 0 0 1 0 0 0 0 0 0 0 0 0
## GarageArea 0 0 0 1 0 0 0 0 0 0 0 0
## TotalBsmtSF 0 0 0 0 1 0 0 0 0 0 0 0
## X1stFlrSF 0 0 0 0 0 1 0 0 0 0 0 0
## FullBath 0 0 0 0 0 0 1 0 0 0 0 0
## TotRmsAbvGrd 0 0 0 0 0 0 0 1 0 0 0 0
## YearBuilt 0 0 0 0 0 0 0 0 1 0 0 0
## YearRemodAdd 0 0 0 0 0 0 0 0 0 1 0 0
## Fireplaces 0 0 0 0 0 0 0 0 0 0 1 0
## BsmtFinSF1 0 0 0 0 0 0 0 0 0 0 0 1
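The second identity block below presumably comes from multiplying in the opposite order (a reconstruction):
#multiply precision by correlation
precor <- top12_inv %*% top12_cor
print( round( precor, 7 ) )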
## OverallQual GrLivArea GarageCars GarageArea TotalBsmtSF X1stFlrSF
## [1,] 1 0 0 0 0 0
## [2,] 0 1 0 0 0 0
## [3,] 0 0 1 0 0 0
## [4,] 0 0 0 1 0 0
## [5,] 0 0 0 0 1 0
## [6,] 0 0 0 0 0 1
## [7,] 0 0 0 0 0 0
## [8,] 0 0 0 0 0 0
## [9,] 0 0 0 0 0 0
## [10,] 0 0 0 0 0 0
## [11,] 0 0 0 0 0 0
## [12,] 0 0 0 0 0 0
## FullBath TotRmsAbvGrd YearBuilt YearRemodAdd Fireplaces BsmtFinSF1
## [1,] 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0
## [7,] 1 0 0 0 0 0
## [8,] 0 1 0 0 0 0
## [9,] 0 0 1 0 0 0
## [10,] 0 0 0 1 0 0
## [11,] 0 0 0 0 1 0
## [12,] 0 0 0 0 0 1
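The $L and $U factors printed next are consistent with lu.decomposition() from the matrixcalc package loaded above (a reconstruction of the hidden chunk):
#LU decomposition of the correlation matrix: top12_cor = L %*% U
lu.decomposition( top12_cor )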
## $L
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.00 0.00000000 0.0000000000 0.00000000 0.00000000 0.00000000
## [2,] 0.59 1.00000000 0.0000000000 0.00000000 0.00000000 0.00000000
## [3,] 0.60 0.17794140 1.0000000000 0.00000000 0.00000000 0.00000000
## [4,] 0.56 0.21414327 0.8382207252 1.00000000 0.00000000 0.00000000
## [5,] 0.54 0.20156466 0.1333936002 0.40756705 1.00000000 0.00000000
## [6,] 0.48 0.43994478 0.1630176342 0.33953701 0.72366165 1.00000000
## [7,] 0.55 0.46863016 0.1382702596 -0.15905754 -0.05622412 0.02001210
## [8,] 0.43 0.88403129 -0.0008841886 -0.10731103 -0.07665374 -0.02454074
## [9,] 0.57 -0.20908115 0.3588443630 0.01667293 0.12382690 -0.10373743
## [10,] 0.55 -0.05292223 0.1552234000 -0.05059172 -0.01309912 -0.05450283
## [11,] 0.40 0.34361098 0.0325193184 -0.08516847 0.13222230 0.22977669
## [12,] 0.24 0.10492407 0.1030562711 0.44028181 0.51784741 0.08753130
## [,7] [,8] [,9] [,10] [,11] [,12]
## [1,] 0.00000000 0.00000000 0.00000000 0.00000000 0.0000000 0
## [2,] 0.00000000 0.00000000 0.00000000 0.00000000 0.0000000 0
## [3,] 0.00000000 0.00000000 0.00000000 0.00000000 0.0000000 0
## [4,] 0.00000000 0.00000000 0.00000000 0.00000000 0.0000000 0
## [5,] 0.00000000 0.00000000 0.00000000 0.00000000 0.0000000 0
## [6,] 0.00000000 0.00000000 0.00000000 0.00000000 0.0000000 0
## [7,] 1.00000000 0.00000000 0.00000000 0.00000000 0.0000000 0
## [8,] 0.06942218 1.00000000 0.00000000 0.00000000 0.0000000 0
## [9,] 0.36501781 -0.10866591 1.00000000 0.00000000 0.0000000 0
## [10,] 0.25883586 -0.09346707 0.37742636 1.00000000 0.0000000 0
## [11,] -0.16320896 -0.09447462 -0.02758978 -0.12712891 1.0000000 0
## [12,] -0.14845106 -0.27608667 0.17695672 -0.02950854 0.1091514 1
##
## $U
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1 5.900000e-01 6.000000e-01 0.5600000 5.400000e-01 0.48000000
## [2,] 0 6.519000e-01 1.160000e-01 0.1396000 1.314000e-01 0.28680000
## [3,] 0 0.000000e+00 6.193588e-01 0.5191594 8.261850e-02 0.10096641
## [4,] 0 0.000000e+00 0.000000e+00 0.2213354 9.020904e-02 0.07515158
## [5,] 0 0.000000e+00 0.000000e+00 0.0000000 6.341274e-01 0.45889368
## [6,] 0 0.000000e+00 0.000000e+00 0.0000000 0.000000e+00 0.26936404
## [7,] 0 0.000000e+00 1.387779e-17 0.0000000 0.000000e+00 0.00000000
## [8,] 0 0.000000e+00 -9.634263e-19 0.0000000 -6.938894e-18 0.00000000
## [9,] 0 0.000000e+00 -5.170331e-18 0.0000000 -7.540212e-19 0.00000000
## [10,] 0 0.000000e+00 -1.730698e-18 0.0000000 -3.639706e-19 0.00000000
## [11,] 0 2.775558e-17 1.811290e-18 0.0000000 -7.226238e-19 0.00000000
## [12,] 0 -3.029559e-18 2.460333e-18 0.0000000 -1.714172e-18 0.00000000
## [,7] [,8] [,9] [,10] [,11]
## [1,] 5.500000e-01 0.430000000 0.570000000 5.500000e-01 0.40000000
## [2,] 3.055000e-01 0.576300000 -0.136300000 -3.450000e-02 0.22400000
## [3,] 8.563890e-02 -0.000547630 0.222253413 9.613898e-02 0.02014113
## [4,] -3.520507e-02 -0.023751734 0.003690311 -1.119774e-02 -0.01885080
## [5,] -3.565326e-02 -0.048608238 0.078522031 -8.306513e-03 0.08384579
## [6,] 5.390539e-03 -0.006610391 -0.027943132 -1.468110e-02 0.06189358
## [7,] 5.347801e-01 0.037125601 0.195204258 1.384203e-01 -0.08728090
## [8,] 0.000000e+00 0.296617891 -0.032232253 -2.772400e-02 -0.02802286
## [9,] 0.000000e+00 0.000000000 0.479408860 1.809415e-01 -0.01322678
## [10,] 2.775558e-17 0.000000000 0.000000000 5.725642e-01 -0.07278946
## [11,] 3.528536e-18 0.000000000 0.000000000 -1.387779e-17 0.70895164
## [12,] 4.338821e-19 0.000000000 0.000000000 1.514779e-18 0.00000000
## [,12]
## [1,] 0.24000000
## [2,] 0.06840000
## [3,] 0.06382881
## [4,] 0.09744997
## [5,] 0.32838123
## [6,] 0.02357778
## [7,] -0.07938867
## [8,] -0.08189225
## [9,] 0.08483462
## [10,] -0.01689553
## [11,] 0.07738303
## [12,] 0.65527285
Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
As demonstrated in the first visual exploration of the dataset above, the target variable, SalePrice, is right-skewed:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
The mean of SalePrice is greater than the median, indicating skew, and the minimum value is well above zero; \(\therefore\) this variable is a good candidate for this exercise.
#Find the optimal value of lambda
optlam <- fitdistr( train_df$SalePrice, "exponential" )
optrate<- optlam$estimate
#take 1000 samples from this exponential distribution
n <- 1000
optlam_sample <- rexp( n, rate = optrate )
#prep data for visualization
sp_df <- data.frame( 'SalePrice' = train_df$SalePrice ) %>% mutate( 'origin' = 'data' )
sim_df <- data.frame( 'SalePrice' = optlam_sample ) %>% mutate( 'origin' = 'simulated' )
histo_df <- rbind( sp_df, sim_df )
SP_histo <- ggplot( histo_df, aes( x=SalePrice, fill=origin ) ) +
geom_histogram( color="#e9ecef", alpha=0.6, position = 'identity' ) +
scale_fill_manual(values=c("#69b3a2", "#404080")) +
theme_ipsum() +
labs(fill="") +
ggtitle( 'Sale Price' )
SP_histo
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#find the 5th and 95th percentiles of the CDF from the exponential
cdfs <- ggplot( histo_df, aes( x=SalePrice, color=origin ) ) +
stat_ecdf( geom = "step", pad = FALSE ) +
ggtitle( 'CDF' )
cdfs
cdfs_histo <- layer_data( cdfs )
cdfs_data <- cdfs_histo %>% filter( group == 1 ) %>% dplyr::select( c( 'x', 'y' ) )
cdfs_sim <- cdfs_histo %>% filter( group == 2 ) %>% dplyr::select( c( 'x', 'y' ) )
simp5 <- cdfs_sim$x[ which.min(abs(cdfs_sim$y-0.05)) ]
simp95 <- cdfs_sim$x[ which.min(abs(cdfs_sim$y-0.95)) ]
datp025 <- cdfs_data$x[ which.min(abs(cdfs_data$y-0.025)) ]
datp975 <- cdfs_data$x[ which.min(abs(cdfs_data$y-0.975)) ]
res <- paste( '90% Confidence interval for simulated data\n5th percentile:', simp5,
 '\n95th percentile:', simp95, '\n\n95% Confidence interval for empirical data:',
 '\n2.5th percentile:', datp025, '\n97.5th percentile:', datp975 )
cat( res, sep = '\n' )
## 90% Confidence interval for simulated data
## 5th percentile: 10769.5728417215
## 95th percentile: 567807.937132202
##
## 95% Confidence interval for empirical data:
## 2.5th percentile: 79900
## 97.5th percentile: 383970
In general, the exponential fit to SalePrice does not do a satisfactory job of describing the envelope of the data, as the overlaid histograms show. Additionally, the simulated data’s 90% interval is much broader than the empirical 95% interval.
Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
For the first pass at modeling, we will take seven of the numeric features most highly correlated with the target variable:
## # A tibble: 12 x 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 SalePrice <NA> 1
## 2 OverallQual <NA> 0.791
## 3 GrLivArea <NA> 0.709
## 4 GarageCars <NA> 0.640
## 5 GarageArea <NA> 0.623
## 6 TotalBsmtSF <NA> 0.614
## 7 X1stFlrSF <NA> 0.606
## 8 FullBath <NA> 0.561
## 9 TotRmsAbvGrd <NA> 0.534
## 10 YearBuilt <NA> 0.523
## 11 YearRemodAdd <NA> 0.507
## 12 Fireplaces <NA> 0.467
The seven are chosen greedily: add the feature with the next-highest correlation to SalePrice; if it shows high cross-correlation with a feature already on the list, skip it and evaluate the next. This yields the list sketched below.
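Reconstructed from the filter() calls in the outlier-removal chunk below, the selection (stored as pick7, the name the later chunks use) is:
#the seven selected numeric features
pick7 <- c( 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF',
 'FullBath', 'YearBuilt', 'YearRemodAdd' )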
Now to further clean the numeric data for these 7 features.
Visual inspection of some features shows that there are outliers in the data. For example, look at the SalePrice scatter plot for Ground Living Area:
#scatterplot of feature with SalePrice ~ GrLivArea
ggplot(train_df, aes(y=SalePrice, x=GrLivArea)) +
geom_point(size=3, color="#69b3a2") +
theme_ipsum() +
scale_y_log10() +
ggtitle( 'SalePrice (log10) ~ Ground Living Area' )
There are some outliers, particularly towards the extreme right of the data distribution. We will achieve a better fit for the training data if we remove outliers from the set:
#remove the most extreme outlier values (per outliers::outlier()) from each selected numeric feature
print( dim( train_df ) )
## [1] 1460 81
colselect <- c( c( 'Id', 'SalePrice' ), pick7 )
train_df_out <- train_df %>% dplyr::select( all_of( colselect ) ) %>%
filter( !OverallQual %in% c( outlier( OverallQual ) ),
!GrLivArea %in% c( outlier( GrLivArea ) ),
!GarageCars %in% c( outlier( GarageCars ) ),
!TotalBsmtSF %in% c( outlier( TotalBsmtSF ) ),
!FullBath %in% c( outlier( FullBath ) ),
!YearBuilt %in% c( outlier( YearBuilt ) ),
!YearRemodAdd %in% c( outlier( YearRemodAdd ) ))
print( dim( train_df_out ) )
## [1] 1268 9
Next, we will evaluate and deal with null values:
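The counts below were presumably generated along these lines (a reconstruction):
#tally NA values in each remaining column
colSums( is.na( train_df_out ) )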
## Id SalePrice OverallQual GrLivArea GarageCars TotalBsmtSF
## 0 0 0 0 0 0
## FullBath YearBuilt YearRemodAdd
## 0 0 0
We got lucky (how rare!): there are no null values for these data fields.
Now to train a multiple linear regression model using just the selected numeric features:
cleantrain <- train_df_out %>% dplyr::select( -Id )
cleantrain_lm <- lm( SalePrice ~ ., data = cleantrain )
summary( cleantrain_lm )
##
## Call:
## lm(formula = SalePrice ~ ., data = cleantrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -378823 -19029 -1719 16671 252834
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.305e+06 1.626e+05 -8.028 2.25e-15 ***
## OverallQual 1.884e+04 1.310e+03 14.382 < 2e-16 ***
## GrLivArea 6.539e+01 3.111e+00 21.015 < 2e-16 ***
## GarageCars 1.194e+04 2.014e+03 5.928 3.94e-09 ***
## TotalBsmtSF 4.296e+01 2.920e+00 14.716 < 2e-16 ***
## FullBath -1.301e+04 2.845e+03 -4.575 5.23e-06 ***
## YearBuilt 2.514e+02 5.298e+01 4.745 2.33e-06 ***
## YearRemodAdd 3.679e+02 7.175e+01 5.128 3.38e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35970 on 1260 degrees of freedom
## Multiple R-squared: 0.7974, Adjusted R-squared: 0.7963
## F-statistic: 708.5 on 7 and 1260 DF, p-value: < 2.2e-16
Visualize model performance:
modpredict <- cleantrain_lm %>% predict( cleantrain )
modresdat <- data.frame( 'Empirical' = cleantrain$SalePrice, 'Predicted' = modpredict )
ggplot( modresdat, aes( y = Empirical, x = Predicted ) ) +
geom_point() +
geom_line( aes( y = modpredict ) ) +
ggtitle( 'Multiple Lin. Reg. fit for key Numeric Features' )
Evaluate the model with some diagnostics:
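The diagnostic plots (and the deprecation warning that follows) are consistent with ggfortify’s autoplot() method for lm objects (a reconstruction; the chunk is not shown):
#residuals vs fitted, Normal Q-Q, scale-location, and leverage plots
autoplot( cleantrain_lm )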
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
A quick glance at the model diagnostics shows that, for most of the range of the data, the multiple linear regression model does a reasonable job of capturing the variance. However, there are some departures from linearity for more extreme values towards the upper limit (e.g. see the deviations in the Normal Q-Q plot). The adjusted R-squared value for the model is 0.7963. This suggests that, using just the 7 most highly correlated numeric features (excluding those with high cross-correlations), our model can describe approximately 80% of the variance in the data. Additionally, the model statistics suggest this is a statistically significant relationship.
Now to generate predictions for the test data and format them for submission:
colselect <- c( c( 'Id' ), pick7 )
test_df_out <- test_df %>%
dplyr::select( all_of( colselect ) ) %>%
replace_na( list( TotalBsmtSF = 0, GarageCars = 0) )
testpredict <- cleantrain_lm %>% predict( test_df_out )
submission <- data.frame( 'Id' = test_df$Id, 'SalePrice' = testpredict )
savestr <- '/home/bonzilla/Desktop/MSDS2020/DATA605/submission.csv'
write.csv( submission, savestr, row.names=FALSE )
Well, as you can see by the ranking below, this did not perform very well on Kaggle. There is only room for improvement!
Let’s improve on our model by incorporating more features, including categorical ones via one-hot encoding.
train_df %>% summarise_all( ~ sum( is.na(.) ) ) %>%
 pivot_longer( cols = tidyr::everything(), names_to = 'feature', values_to = 'na_count' ) %>%
 filter( na_count > 0 )
## # A tibble: 19 x 2
## feature na_count
## <chr> <int>
## 1 LotFrontage 259
## 2 Alley 1369
## 3 MasVnrType 8
## 4 MasVnrArea 8
## 5 BsmtQual 37
## 6 BsmtCond 37
## 7 BsmtExposure 38
## 8 BsmtFinType1 37
## 9 BsmtFinType2 38
## 10 Electrical 1
## 11 FireplaceQu 690
## 12 GarageType 81
## 13 GarageYrBlt 81
## 14 GarageFinish 81
## 15 GarageQual 81
## 16 GarageCond 81
## 17 PoolQC 1453
## 18 Fence 1179
## 19 MiscFeature 1406
From reading the documentation, many of the NAs indicate that the record does not possess the structure in question (e.g. the property does not have a basement). For our purposes, numeric features with NA values can be changed to 0, and categorical features can take a new level to indicate the absence of the structure.
train_df_out <- train_df %>%
replace_na( list( LotFrontage = 0, Alley = 'No Access', MasVnrType = 'No Vnr',
MasVnrArea = 0, BsmtQual = 'No Bsmt', BsmtCond = 'No Bsmt',
BsmtExposure = 'No Bsmt', BsmtFinType1 = 'No Bsmt',
BsmtFinType2 = 'No Bsmt', Electrical = 'No sys', TotalBsmtSF = 0,
FireplaceQu = 'No FP', GarageQual = 'No Garage', GarageType = 'No Garage',
GarageYrBlt = 'No Garage', GarageFinish = 'No Garage', GarageCars = 0,
GarageCond = 'No Garage', GarageArea = 0, PoolQC = 'No Pool',
Fence = 'No Fence', MiscFeature = 'None' ) )
#treat the test data the same
test_df_out <- test_df %>%
replace_na( list( LotFrontage = 0, Alley = 'No Access', MasVnrType = 'No Vnr',
MasVnrArea = 0, BsmtQual = 'No Bsmt', BsmtCond = 'No Bsmt',
BsmtExposure = 'No Bsmt', BsmtFinType1 = 'No Bsmt',
BsmtFinType2 = 'No Bsmt', Electrical = 'No sys', TotalBsmtSF = 0,
FireplaceQu = 'No FP', GarageQual = 'No Garage', GarageType = 'No Garage',
GarageYrBlt = 'No Garage', GarageFinish = 'No Garage', GarageCars = 0,
GarageCond = 'No Garage', GarageArea = 0, PoolQC = 'No Pool',
Fence = 'No Fence', MiscFeature = 'None' ) )
We will now apply a very simple tidymodels recipe to the data.
#simple model recipe for housing dataset
simple_ames <-
recipe(SalePrice ~ Neighborhood + GrLivArea + YearBuilt + BldgType,
data = train_df_out) %>%
step_log(GrLivArea, base = 10) %>%
step_dummy(all_nominal())
# now to prep the model
simple_ames_prepped <- prep(simple_ames )
train_prep <- bake(simple_ames_prepped, new_data = NULL)
#the next step is to 'bake' the test data with the trained recipe
test_prep <- bake(simple_ames_prepped, new_data = test_df_out)
#fit linear model
lm_fit <- lm( SalePrice ~ ., data = train_prep )
summary( lm_fit )
##
## Call:
## lm(formula = SalePrice ~ ., data = train_prep)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185675 -19872 -2738 14497 357691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.197e+06 1.512e+05 -14.534 < 2e-16 ***
## GrLivArea 2.570e+05 8.707e+03 29.518 < 2e-16 ***
## YearBuilt 8.069e+02 7.475e+01 10.794 < 2e-16 ***
## Neighborhood_Blueste -1.936e+04 3.000e+04 -0.645 0.518785
## Neighborhood_BrDale -1.708e+04 1.526e+04 -1.119 0.263323
## Neighborhood_BrkSide -2.101e+04 1.295e+04 -1.622 0.104929
## Neighborhood_ClearCr -9.018e+03 1.325e+04 -0.681 0.496266
## Neighborhood_CollgCr -2.396e+04 1.106e+04 -2.167 0.030411 *
## Neighborhood_Crawfor 1.466e+04 1.265e+04 1.159 0.246649
## Neighborhood_Edwards -4.126e+04 1.179e+04 -3.500 0.000479 ***
## Neighborhood_Gilbert -4.642e+04 1.158e+04 -4.009 6.41e-05 ***
## Neighborhood_IDOTRR -3.836e+04 1.364e+04 -2.812 0.004987 **
## Neighborhood_MeadowV -1.601e+04 1.427e+04 -1.123 0.261821
## Neighborhood_Mitchel -2.992e+04 1.213e+04 -2.466 0.013779 *
## Neighborhood_NAmes -3.113e+04 1.141e+04 -2.728 0.006446 **
## Neighborhood_NoRidge 5.157e+04 1.240e+04 4.158 3.40e-05 ***
## Neighborhood_NPkVill -2.553e+03 1.672e+04 -0.153 0.878678
## Neighborhood_NridgHt 6.758e+04 1.122e+04 6.023 2.18e-09 ***
## Neighborhood_NWAmes -3.552e+04 1.176e+04 -3.021 0.002562 **
## Neighborhood_OldTown -3.018e+04 1.263e+04 -2.390 0.016966 *
## Neighborhood_Sawyer -3.556e+04 1.201e+04 -2.961 0.003117 **
## Neighborhood_SawyerW -2.994e+04 1.169e+04 -2.561 0.010551 *
## Neighborhood_Somerst -1.859e+03 1.109e+04 -0.168 0.866986
## Neighborhood_StoneBr 7.683e+04 1.270e+04 6.050 1.84e-09 ***
## Neighborhood_SWISU -4.076e+04 1.435e+04 -2.840 0.004572 **
## Neighborhood_Timber 1.140e+03 1.247e+04 0.091 0.927166
## Neighborhood_Veenker 2.989e+04 1.577e+04 1.896 0.058210 .
## BldgType_X2fmCon -1.377e+04 7.489e+03 -1.838 0.066211 .
## BldgType_Duplex -4.148e+04 5.799e+03 -7.153 1.36e-12 ***
## BldgType_Twnhs -6.287e+04 7.918e+03 -7.941 4.03e-15 ***
## BldgType_TwnhsE -3.820e+04 5.019e+03 -7.611 4.90e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39720 on 1429 degrees of freedom
## Multiple R-squared: 0.7552, Adjusted R-squared: 0.75
## F-statistic: 146.9 on 30 and 1429 DF, p-value: < 2.2e-16
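The chunk that generated the predictions is not shown; fullycooked (the name used in the submission below) was presumably created along these lines:
#predict SalePrice for the baked test data
fullycooked <- predict( lm_fit, newdata = test_prep )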
submission2 <- data.frame( 'Id' = test_df$Id, 'SalePrice' = fullycooked )
savestr <- '/home/bonzilla/Desktop/MSDS2020/DATA605/submission2.csv'
write.csv( submission2, savestr, row.names=FALSE )
Working through this simple demo greatly improved the ranking (although it did not improve the \(R^2\) value of the model fit?):
This is a step in the right direction. Let us see if we can improve on the latest attempt by creating a more complex model recipe, starting with several more features: a few of the numeric features with high correlations to SalePrice (from the first attempt) and a few more categorical features that make intuitive sense.
extended_ames <-
recipe(SalePrice ~ Neighborhood + GrLivArea + YearBuilt + BldgType + OverallQual + GarageArea + TotRmsAbvGrd + BedroomAbvGr,
data = train_df_out) %>%
step_log(GrLivArea, base = 10) %>%
step_dummy(all_nominal())
# now to prep the model
extended_ames_prepped <- prep(extended_ames )
train_prep <- bake(extended_ames_prepped, new_data = NULL)
#the next step is the 'bake' the trained recipe
test_prep <- bake(extended_ames_prepped, new_data = test_df_out)
#fit linear model
lm_fit <- lm( SalePrice ~ ., data = train_prep )
summary( lm_fit )
##
## Call:
## lm(formula = SalePrice ~ ., data = train_prep)
##
## Residuals:
## Min 1Q Median 3Q Max
## -255633 -17777 -1476 15004 336412
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.256e+06 1.453e+05 -8.649 < 2e-16 ***
## GrLivArea 1.575e+05 1.433e+04 10.991 < 2e-16 ***
## YearBuilt 4.142e+02 7.044e+01 5.880 5.10e-09 ***
## OverallQual 1.799e+04 1.196e+03 15.040 < 2e-16 ***
## GarageArea 3.576e+01 5.989e+00 5.971 2.97e-09 ***
## TotRmsAbvGrd 5.380e+03 1.230e+03 4.375 1.30e-05 ***
## BedroomAbvGr -1.191e+04 1.752e+03 -6.801 1.53e-11 ***
## Neighborhood_Blueste 1.402e+03 2.680e+04 0.052 0.95828
## Neighborhood_BrDale 2.477e+02 1.366e+04 0.018 0.98554
## Neighborhood_BrkSide -5.987e+02 1.170e+04 -0.051 0.95919
## Neighborhood_ClearCr 1.915e+04 1.202e+04 1.594 0.11122
## Neighborhood_CollgCr -6.603e+03 1.000e+04 -0.660 0.50932
## Neighborhood_Crawfor 2.614e+04 1.140e+04 2.294 0.02194 *
## Neighborhood_Edwards -9.828e+03 1.071e+04 -0.918 0.35878
## Neighborhood_Gilbert -1.904e+04 1.048e+04 -1.816 0.06951 .
## Neighborhood_IDOTRR -1.857e+04 1.226e+04 -1.514 0.13016
## Neighborhood_MeadowV 2.448e+04 1.292e+04 1.894 0.05839 .
## Neighborhood_Mitchel -5.073e+03 1.100e+04 -0.461 0.64460
## Neighborhood_NAmes -5.092e+03 1.037e+04 -0.491 0.62355
## Neighborhood_NoRidge 6.088e+04 1.122e+04 5.426 6.75e-08 ***
## Neighborhood_NPkVill 1.291e+04 1.499e+04 0.861 0.38924
## Neighborhood_NridgHt 5.467e+04 1.008e+04 5.423 6.89e-08 ***
## Neighborhood_NWAmes -1.291e+04 1.065e+04 -1.213 0.22532
## Neighborhood_OldTown -1.779e+04 1.138e+04 -1.563 0.11827
## Neighborhood_Sawyer -3.242e+03 1.092e+04 -0.297 0.76656
## Neighborhood_SawyerW -7.594e+03 1.057e+04 -0.718 0.47260
## Neighborhood_Somerst 4.453e+03 1.003e+04 0.444 0.65701
## Neighborhood_StoneBr 6.556e+04 1.140e+04 5.752 1.08e-08 ***
## Neighborhood_SWISU -9.307e+03 1.300e+04 -0.716 0.47430
## Neighborhood_Timber 9.615e+03 1.122e+04 0.857 0.39163
## Neighborhood_Veenker 3.685e+04 1.415e+04 2.603 0.00933 **
## BldgType_X2fmCon -1.688e+03 6.723e+03 -0.251 0.80185
## BldgType_Duplex -2.273e+04 5.384e+03 -4.221 2.59e-05 ***
## BldgType_Twnhs -5.471e+04 7.234e+03 -7.564 6.99e-14 ***
## BldgType_TwnhsE -3.780e+04 4.723e+03 -8.004 2.49e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35370 on 1425 degrees of freedom
## Multiple R-squared: 0.8064, Adjusted R-squared: 0.8018
## F-statistic: 174.5 on 34 and 1425 DF, p-value: < 2.2e-16
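As before, the prediction chunk is not shown; fullycooked2 was presumably:
#predict SalePrice with the extended model
fullycooked2 <- predict( lm_fit, newdata = test_prep )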
submission3 <- data.frame( 'Id' = test_df$Id, 'SalePrice' = fullycooked2 )
savestr <- '/home/bonzilla/Desktop/MSDS2020/DATA605/submission3.csv'
write.csv( submission3, savestr, row.names=FALSE )
Adding these features led to a modest improvement to both the \(R^2\) score of the model fit and the Kaggle evaluation of the model predictions:
However, making real progress on the model’s predictive power would require detailed attention to grooming individual features, as well as adding more steps to the tidymodels recipe. Perhaps if I find time on another weekend…