library(dplyr)
library(tidyr)
library(ggplot2)
library(naniar)
library(randomForest)
library(purrr)

You are to compete in the House Prices: Advanced Regression Techniques competition (https://www.kaggle.com/c/house-prices-advanced-regression-techniques). I want you to do the following.
Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not? 5 points
# define url for data
urlTrain <- "https://raw.githubusercontent.com/esteban-data-enthusiast/data605/main/shared-data/house-prices-advanced-regression-techniques/train.csv"
# load the training data
rawTrain <- read.csv(urlTrain, stringsAsFactors = TRUE)

summary(rawTrain)

#> Id MSSubClass MSZoning LotFrontage
#> Min. : 1.0 Min. : 20.0 C (all): 10 Min. : 21.00
#> 1st Qu.: 365.8 1st Qu.: 20.0 FV : 65 1st Qu.: 59.00
#> Median : 730.5 Median : 50.0 RH : 16 Median : 69.00
#> Mean : 730.5 Mean : 56.9 RL :1151 Mean : 70.05
#> 3rd Qu.:1095.2 3rd Qu.: 70.0 RM : 218 3rd Qu.: 80.00
#> Max. :1460.0 Max. :190.0 Max. :313.00
#> NA's :259
#> LotArea Street Alley LotShape LandContour Utilities
#> Min. : 1300 Grvl: 6 Grvl: 50 IR1:484 Bnk: 63 AllPub:1459
#> 1st Qu.: 7554 Pave:1454 Pave: 41 IR2: 41 HLS: 50 NoSeWa: 1
#> Median : 9478 NA's:1369 IR3: 10 Low: 36
#> Mean : 10517 Reg:925 Lvl:1311
#> 3rd Qu.: 11602
#> Max. :215245
#>
#> LotConfig LandSlope Neighborhood Condition1 Condition2
#> Corner : 263 Gtl:1382 NAmes :225 Norm :1260 Norm :1445
#> CulDSac: 94 Mod: 65 CollgCr:150 Feedr : 81 Feedr : 6
#> FR2 : 47 Sev: 13 OldTown:113 Artery : 48 Artery : 2
#> FR3 : 4 Edwards:100 RRAn : 26 PosN : 2
#> Inside :1052 Somerst: 86 PosN : 19 RRNn : 2
#> Gilbert: 79 RRAe : 11 PosA : 1
#> (Other):707 (Other): 15 (Other): 2
#> BldgType HouseStyle OverallQual OverallCond YearBuilt
#> 1Fam :1220 1Story :726 Min. : 1.000 Min. :1.000 Min. :1872
#> 2fmCon: 31 2Story :445 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
#> Duplex: 52 1.5Fin :154 Median : 6.000 Median :5.000 Median :1973
#> Twnhs : 43 SLvl : 65 Mean : 6.099 Mean :5.575 Mean :1971
#> TwnhsE: 114 SFoyer : 37 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
#> 1.5Unf : 14 Max. :10.000 Max. :9.000 Max. :2010
#> (Other): 19
#> YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
#> Min. :1950 Flat : 13 CompShg:1434 VinylSd:515 VinylSd:504
#> 1st Qu.:1967 Gable :1141 Tar&Grv: 11 HdBoard:222 MetalSd:214
#> Median :1994 Gambrel: 11 WdShngl: 6 MetalSd:220 HdBoard:207
#> Mean :1985 Hip : 286 WdShake: 5 Wd Sdng:206 Wd Sdng:197
#> 3rd Qu.:2004 Mansard: 7 ClyTile: 1 Plywood:108 Plywood:142
#> Max. :2010 Shed : 2 Membran: 1 CemntBd: 61 CmentBd: 60
#> (Other): 2 (Other):128 (Other):136
#> MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual
#> BrkCmn : 15 Min. : 0.0 Ex: 52 Ex: 3 BrkTil:146 Ex :121
#> BrkFace:445 1st Qu.: 0.0 Fa: 14 Fa: 28 CBlock:634 Fa : 35
#> None :864 Median : 0.0 Gd:488 Gd: 146 PConc :647 Gd :618
#> Stone :128 Mean : 103.7 TA:906 Po: 1 Slab : 24 TA :649
#> NA's : 8 3rd Qu.: 166.0 TA:1282 Stone : 6 NA's: 37
#> Max. :1600.0 Wood : 3
#> NA's :8
#> BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
#> Fa : 45 Av :221 ALQ :220 Min. : 0.0 ALQ : 19
#> Gd : 65 Gd :134 BLQ :148 1st Qu.: 0.0 BLQ : 33
#> Po : 2 Mn :114 GLQ :418 Median : 383.5 GLQ : 14
#> TA :1311 No :953 LwQ : 74 Mean : 443.6 LwQ : 46
#> NA's: 37 NA's: 38 Rec :133 3rd Qu.: 712.2 Rec : 54
#> Unf :430 Max. :5644.0 Unf :1256
#> NA's: 37 NA's: 38
#> BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
#> Min. : 0.00 Min. : 0.0 Min. : 0.0 Floor: 1 Ex:741
#> 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 GasA :1428 Fa: 49
#> Median : 0.00 Median : 477.5 Median : 991.5 GasW : 18 Gd:241
#> Mean : 46.55 Mean : 567.2 Mean :1057.4 Grav : 7 Po: 1
#> 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2 OthW : 2 TA:428
#> Max. :1474.00 Max. :2336.0 Max. :6110.0 Wall : 4
#>
#> CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
#> N: 95 FuseA: 94 Min. : 334 Min. : 0 Min. : 0.000
#> Y:1365 FuseF: 27 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
#> FuseP: 3 Median :1087 Median : 0 Median : 0.000
#> Mix : 1 Mean :1163 Mean : 347 Mean : 5.845
#> SBrkr:1334 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
#> NA's : 1 Max. :4692 Max. :2065 Max. :572.000
#>
#> GrLivArea BsmtFullBath BsmtHalfBath FullBath
#> Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
#> 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
#> Median :1464 Median :0.0000 Median :0.00000 Median :2.000
#> Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
#> 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
#> Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
#>
#> HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
#> Min. :0.0000 Min. :0.000 Min. :0.000 Ex:100 Min. : 2.000
#> 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 Fa: 39 1st Qu.: 5.000
#> Median :0.0000 Median :3.000 Median :1.000 Gd:586 Median : 6.000
#> Mean :0.3829 Mean :2.866 Mean :1.047 TA:735 Mean : 6.518
#> 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.000
#> Max. :2.0000 Max. :8.000 Max. :3.000 Max. :14.000
#>
#> Functional Fireplaces FireplaceQu GarageType GarageYrBlt
#> Maj1: 14 Min. :0.000 Ex : 24 2Types : 6 Min. :1900
#> Maj2: 5 1st Qu.:0.000 Fa : 33 Attchd :870 1st Qu.:1961
#> Min1: 31 Median :1.000 Gd :380 Basment: 19 Median :1980
#> Min2: 34 Mean :0.613 Po : 20 BuiltIn: 88 Mean :1979
#> Mod : 15 3rd Qu.:1.000 TA :313 CarPort: 9 3rd Qu.:2002
#> Sev : 1 Max. :3.000 NA's:690 Detchd :387 Max. :2010
#> Typ :1360 NA's : 81 NA's :81
#> GarageFinish GarageCars GarageArea GarageQual GarageCond
#> Fin :352 Min. :0.000 Min. : 0.0 Ex : 3 Ex : 2
#> RFn :422 1st Qu.:1.000 1st Qu.: 334.5 Fa : 48 Fa : 35
#> Unf :605 Median :2.000 Median : 480.0 Gd : 14 Gd : 9
#> NA's: 81 Mean :1.767 Mean : 473.0 Po : 3 Po : 7
#> 3rd Qu.:2.000 3rd Qu.: 576.0 TA :1311 TA :1326
#> Max. :4.000 Max. :1418.0 NA's: 81 NA's: 81
#>
#> PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
#> N: 90 Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
#> P: 30 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
#> Y:1340 Median : 0.00 Median : 25.00 Median : 0.00 Median : 0.00
#> Mean : 94.24 Mean : 46.66 Mean : 21.95 Mean : 3.41
#> 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00
#> Max. :857.00 Max. :547.00 Max. :552.00 Max. :508.00
#>
#> ScreenPorch PoolArea PoolQC Fence MiscFeature
#> Min. : 0.00 Min. : 0.000 Ex : 2 GdPrv: 59 Gar2: 2
#> 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2 GdWo : 54 Othr: 2
#> Median : 0.00 Median : 0.000 Gd : 3 MnPrv: 157 Shed: 49
#> Mean : 15.06 Mean : 2.759 NA's:1453 MnWw : 11 TenC: 1
#> 3rd Qu.: 0.00 3rd Qu.: 0.000 NA's :1179 NA's:1406
#> Max. :480.00 Max. :738.000
#>
#> MiscVal MoSold YrSold SaleType
#> Min. : 0.00 Min. : 1.000 Min. :2006 WD :1267
#> 1st Qu.: 0.00 1st Qu.: 5.000 1st Qu.:2007 New : 122
#> Median : 0.00 Median : 6.000 Median :2008 COD : 43
#> Mean : 43.49 Mean : 6.322 Mean :2008 ConLD : 9
#> 3rd Qu.: 0.00 3rd Qu.: 8.000 3rd Qu.:2009 ConLI : 5
#> Max. :15500.00 Max. :12.000 Max. :2010 ConLw : 5
#> (Other): 9
#> SaleCondition SalePrice
#> Abnorml: 101 Min. : 34900
#> AdjLand: 4 1st Qu.:129975
#> Alloca : 12 Median :163000
#> Family : 20 Mean :180921
#> Normal :1198 3rd Qu.:214000
#> Partial: 125 Max. :755000
#>
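The summary above gives the univariate descriptive statistics; for the accompanying plots, a univariate view of the dependent variable is a natural start. A minimal sketch using the ggplot2 package loaded above:
# histogram of the dependent variable; SalePrice is visibly right-skewed
ggplot(rawTrain, aes(x = SalePrice)) +
  geom_histogram(bins = 40) +
  labs(title = "Distribution of SalePrice", x = "Sale price (USD)", y = "Count")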
Provide a scatterplot matrix for at least two of the independent variables and the dependent variable.
par(mfrow = c(1,3))
plot(rawTrain$OverallQual, rawTrain$SalePrice,
xlab = 'Overall Quality', ylab = 'Sale Price')
plot(rawTrain$GarageCars, rawTrain$SalePrice,
xlab = 'Garage Cars', ylab = 'Sale Price')
plot(rawTrain$FullBath, rawTrain$SalePrice,
xlab = 'Full Baths', ylab = 'Sale Price')
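The panels above plot each predictor against SalePrice separately; a scatterplot matrix proper can be produced in one call with base R's pairs() (GGally::ggpairs() is an alternative if that package is installed). A minimal sketch:
# scatterplot matrix of the dependent variable and the three predictors
pairs(~ SalePrice + OverallQual + GarageCars + FullBath,
      data = rawTrain,
      main = "SalePrice vs. selected predictors")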
Derive a correlation matrix for any three quantitative variables in the dataset.

# Subset the numeric variables from the "train" set
num.train <- rawTrain %>%
  select(where(is.numeric))
# select 3 specific numeric variables
num.train.three <- select(num.train, OverallQual, GarageCars, FullBath)
# generate a correlation matrix for the 3 numeric variables
corr_matrix <- round(cor(num.train.three),
digits = 2)
corr_matrix

#> OverallQual GarageCars FullBath
#> OverallQual 1.00 0.60 0.55
#> GarageCars 0.60 1.00 0.47
#> FullBath 0.55 0.47 1.00
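For a quick visual check of the matrix, the corrplot package renders it directly (this assumes corrplot is installed; it is not loaded above):
# visualize the correlation matrix with coefficients printed in each cell
corrplot::corrplot(corr_matrix, method = "number")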
Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
The null and alternative hypotheses for each correlation test are as follows:
\(H_0: \rho = 0\) (there is no linear relationship between the two variables)
\(H_1: \rho \neq 0\) (there is a linear relationship between the two variables)
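For reference, cor.test evaluates these hypotheses with the test statistic
\[ t = \frac{r\sqrt{n-2}}{\sqrt{1-r^2}}, \]
which follows a \(t\) distribution with \(n-2\) degrees of freedom under \(H_0\); with \(n = 1460\) observations this gives the df = 1458 seen in the output below.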
# test the correlation between 1st and 2nd variables
cor_test1 <- cor.test(formula = ~ OverallQual + GarageCars,
data = num.train.three,
method = "pearson",
conf.level = 0.80)
# test the correlation between 1st and 3rd variables
cor_test2 <- cor.test(formula = ~ OverallQual + FullBath,
data = num.train.three,
method = "pearson",
conf.level = 0.80)
# test the correlation between 2nd and 3rd variables
cor_test3 <- cor.test(formula = ~ GarageCars + FullBath,
data = num.train.three,
method = "pearson",
conf.level = 0.80)

Analyze the results.
cor_test1

#>
#> Pearson's product-moment correlation
#>
#> data: OverallQual and GarageCars
#> t = 28.688, df = 1458, p-value < 2.2e-16
#> alternative hypothesis: true correlation is not equal to 0
#> 80 percent confidence interval:
#> 0.5787769 0.6216992
#> sample estimates:
#> cor
#> 0.6006707
cor_test2

#>
#> Pearson's product-moment correlation
#>
#> data: OverallQual and FullBath
#> t = 25.185, df = 1458, p-value < 2.2e-16
#> alternative hypothesis: true correlation is not equal to 0
#> 80 percent confidence interval:
#> 0.5267723 0.5735625
#> sample estimates:
#> cor
#> 0.5505997
cor_test3

#>
#> Pearson's product-moment correlation
#>
#> data: GarageCars and FullBath
#> t = 20.314, df = 1458, p-value < 2.2e-16
#> alternative hypothesis: true correlation is not equal to 0
#> 80 percent confidence interval:
#> 0.4430949 0.4954243
#> sample estimates:
#> cor
#> 0.469672
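The 80% confidence intervals reported above come from Fisher's z transform: with \(z' = \operatorname{artanh}(r)\) and standard error \(1/\sqrt{n-3}\), the interval is
\[ \tanh\left(z' \pm \frac{z_{0.90}}{\sqrt{n-3}}\right), \]
where \(z_{0.90} \approx 1.2816\) is the 90th percentile of the standard normal distribution; plugging in \(r = 0.60\) and \(n = 1460\) reproduces the interval reported for the first test.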
In all three tests the p-value is < 2.2e-16, far below the significance level \(\alpha = 0.05\), so we reject the null hypothesis in each case: every pair of variables is significantly (and positively) correlated. Consistent with this, none of the 80% confidence intervals comes anywhere near 0.
Calculate the familywise error rate (FWE):
\(FWE \leq 1 - ( 1 - \alpha ) ^ c\)
Where:
\(\alpha =\) alpha level for an individual test (e.g. .05)
\(c =\) Number of comparisons
FWE <- 1 - (1 - .05)^3
FWE

#> [1] 0.142625
This means the probability of at least one Type I error across the three tests is about 14.3%. That is not negligible in general, but because each individual p-value is many orders of magnitude below 0.05, even a Bonferroni-corrected threshold of \(0.05 / 3 \approx 0.017\) would leave every conclusion unchanged, so familywise error is not a practical concern here.
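Had the raw p-values been anywhere near the threshold, a standard remedy would be a multiplicity correction; a sketch with base R's p.adjust():
# Bonferroni-adjusted p-values for the three correlation tests
p.adjust(c(cor_test1$p.value, cor_test2$p.value, cor_test3$p.value),
         method = "bonferroni")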
Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix. 5 points
precision_matrix <- solve(corr_matrix)

corr_matrix

#> OverallQual GarageCars FullBath
#> OverallQual 1.00 0.60 0.55
#> GarageCars 0.60 1.00 0.47
#> FullBath 0.55 0.47 1.00
precision_matrix

#> OverallQual GarageCars FullBath
#> OverallQual 1.8254452 -0.8001406 -0.6279288
#> GarageCars -0.8001406 1.6342549 -0.3280225
#> FullBath -0.6279288 -0.3280225 1.4995314
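As a check on the claim that the diagonal contains variance inflation factors: each diagonal entry should equal \(1/(1-R^2)\) from regressing that variable on the other two. A sketch for OverallQual (a small discrepancy is expected because corr_matrix was rounded to two digits):
# VIF of OverallQual from its regression R-squared; compare with
# precision_matrix["OverallQual", "OverallQual"]
r2 <- summary(lm(OverallQual ~ GarageCars + FullBath, data = num.train.three))$r.squared
1 / (1 - r2)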
Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.
cor_x_precis <- corr_matrix %*% precision_matrix
cor_x_precis

#> OverallQual GarageCars FullBath
#> OverallQual 1.000000e+00 5.551115e-17 0.000000e+00
#> GarageCars -5.551115e-17 1.000000e+00 1.110223e-16
#> FullBath -1.110223e-16 0.000000e+00 1.000000e+00
precis_x_cor <- precision_matrix %*% corr_matrix
precis_x_cor

#> OverallQual GarageCars FullBath
#> OverallQual 1.000000e+00 1.665335e-16 0.000000e+00
#> GarageCars -1.665335e-16 1.000000e+00 -1.110223e-16
#> FullBath 0.000000e+00 1.110223e-16 1.000000e+00
library(matrixcalc)
lu_cor_x_precis <- matrixcalc::lu.decomposition(cor_x_precis)
L1 <- lu_cor_x_precis$L
U1 <- lu_cor_x_precis$U
print(L1)

#> [,1] [,2] [,3]
#> [1,] 1.000000e+00 0.000000e+00 0
#> [2,] -5.551115e-17 1.000000e+00 0
#> [3,] -1.110223e-16 6.162976e-33 1
print(U1)

#> [,1] [,2] [,3]
#> [1,] 1 5.551115e-17 0.000000e+00
#> [2,] 0 1.000000e+00 1.110223e-16
#> [3,] 0 0.000000e+00 1.000000e+00
print(L1 %*% U1)

#> [,1] [,2] [,3]
#> [1,] 1.000000e+00 5.551115e-17 0.000000e+00
#> [2,] -5.551115e-17 1.000000e+00 1.110223e-16
#> [3,] -1.110223e-16 0.000000e+00 1.000000e+00
print(cor_x_precis)

#> OverallQual GarageCars FullBath
#> OverallQual 1.000000e+00 5.551115e-17 0.000000e+00
#> GarageCars -5.551115e-17 1.000000e+00 1.110223e-16
#> FullBath -1.110223e-16 0.000000e+00 1.000000e+00
lu_precis_x_cor <- matrixcalc::lu.decomposition(precis_x_cor)
L2 <- lu_precis_x_cor$L
U2 <- lu_precis_x_cor$U
print(L2)

#> [,1] [,2] [,3]
#> [1,] 1.000000e+00 0.000000e+00 0
#> [2,] -1.665335e-16 1.000000e+00 0
#> [3,] 0.000000e+00 1.110223e-16 1
print(U2)

#> [,1] [,2] [,3]
#> [1,] 1 1.665335e-16 0.000000e+00
#> [2,] 0 1.000000e+00 -1.110223e-16
#> [3,] 0 0.000000e+00 1.000000e+00
print(L2 %*% U2)

#> [,1] [,2] [,3]
#> [1,] 1.000000e+00 1.665335e-16 0.000000e+00
#> [2,] -1.665335e-16 1.000000e+00 -1.110223e-16
#> [3,] 0.000000e+00 1.110223e-16 1.000000e+00
print(precis_x_cor)

#> OverallQual GarageCars FullBath
#> OverallQual 1.000000e+00 1.665335e-16 0.000000e+00
#> GarageCars -1.665335e-16 1.000000e+00 -1.110223e-16
#> FullBath 0.000000e+00 1.110223e-16 1.000000e+00
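Because both products are numerically the identity matrix, their LU factors are trivial. A more informative factorization (arguably what the prompt intends) is of the correlation matrix itself; a sketch:
# LU decomposition of the correlation matrix and a reconstruction check
lu_corr <- matrixcalc::lu.decomposition(corr_matrix)
lu_corr$L
lu_corr$U
all.equal(lu_corr$L %*% lu_corr$U, corr_matrix, check.attributes = FALSE)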
Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss. 10 points
# histograms of integer variables
rawTrain %>%
keep(is.numeric) %>%
select(BsmtFinSF1, BsmtUnfSF, GrLivArea,
LotFrontage, MasVnrArea,
OpenPorchSF, X1stFlrSF) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_histogram()

From the histograms above, the variable "BsmtUnfSF" (unfinished square feet of basement area) appears markedly right-skewed.
summary(rawTrain$BsmtUnfSF)

#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.0 223.0 477.5 567.2 808.0 2336.0
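The skew can also be quantified; a one-line sketch assuming the moments package is installed (it is not loaded above):
# sample skewness; a value well above 0 confirms the right skew
moments::skewness(rawTrain$BsmtUnfSF)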
X <- rawTrain$BsmtUnfSF

# fit an exponential probability density function
fit_dist <- MASS::fitdistr(X, "exponential")

We know that the probability density function (pdf) of an exponential distribution is
\[ f(x; \lambda)= \begin{cases} \lambda e^{-\lambda x} & \quad \text{when $x \geq 0$,}\\ 0 & \quad \text{when $x < 0$} \end{cases} \]
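The write-up stops at the pdf, so here is a sketch of the remaining steps the prompt asks for, under the reading that the "95% confidence interval from the empirical data, assuming normality" means a normal-theory interval for the mean; the seed is an arbitrary choice for reproducibility:
# optimal (maximum-likelihood) lambda: for an exponential fit this is 1 / mean(X)
lambda <- fit_dist$estimate[["rate"]]

# draw 1000 samples from the fitted exponential and compare histograms
set.seed(605)
sim <- rexp(1000, rate = lambda)
par(mfrow = c(1, 2))
hist(X, breaks = 30, main = "Observed BsmtUnfSF", xlab = "Square feet")
hist(sim, breaks = 30, main = "Simulated Exp(lambda)", xlab = "Square feet")

# 5th and 95th percentiles from the exponential CDF
qexp(c(0.05, 0.95), rate = lambda)

# 95% confidence interval for the mean, assuming normality
mean(X) + qnorm(c(0.025, 0.975)) * sd(X) / sqrt(length(X))

# empirical 5th and 95th percentiles
quantile(X, probs = c(0.05, 0.95))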
Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score. 10 points
urlTest <-
"https://raw.githubusercontent.com/esteban-data-enthusiast/data605/main/shared-data/house-prices-advanced-regression-techniques/test.csv"
# load the test data
rawTest <- read.csv(urlTest, stringsAsFactors = TRUE)

# In order to preserve the original data, store the raw Train and Test datasets
# in new variables
train <- rawTrain
test <- rawTest
# Before merging the two data sets, let's remove the extra columns from both
# sides, so that they have the same structure
# Remove the ID columns because we do not need them until we are ready
# to submit our Prediction dataset
trainId <- train$Id
testId <- test$Id
train$Id <- NULL
test$Id <- NULL
# Remove the SalePrice columns because we do not need them until we are ready
# to submit our Prediction dataset
trainSalePrice <- train$SalePrice
train$SalePrice <- NULL
# Now that both datasets have the same number of variables
# let's merge both datasets
allData <- rbind(train, test)
allData <- cbind(allData, SourceSet = c(rep("Train", times = dim(train)[1]),
rep("Test", times = dim(test)[1])
))

allData %>%
naniar::miss_var_summary() %>%
arrange(desc(pct_miss)) %>%
filter(pct_miss > 20)

#> # A tibble: 5 x 3
#> variable n_miss pct_miss
#> <chr> <int> <dbl>
#> 1 PoolQC 2909 99.7
#> 2 MiscFeature 2814 96.4
#> 3 Alley 2721 93.2
#> 4 Fence 2348 80.4
#> 5 FireplaceQu 1420 48.6
# From full dataset remove variables with high percentage of missing values
allData$PoolQC <- NULL
allData$MiscFeature <- NULL
allData$Alley <- NULL
allData$Fence <- NULL
allData$FireplaceQu <- NULL
allData %>%
naniar::miss_var_summary() %>%
arrange(desc(pct_miss)) %>%
filter(pct_miss < 21)

#> # A tibble: 75 x 3
#> variable n_miss pct_miss
#> <chr> <int> <dbl>
#> 1 LotFrontage 486 16.6
#> 2 GarageYrBlt 159 5.45
#> 3 GarageFinish 159 5.45
#> 4 GarageQual 159 5.45
#> 5 GarageCond 159 5.45
#> 6 GarageType 157 5.38
#> 7 BsmtCond 82 2.81
#> 8 BsmtExposure 82 2.81
#> 9 BsmtQual 81 2.77
#> 10 BsmtFinType2 80 2.74
#> # ... with 65 more rows
# Perform data imputation for NAs
# Replacing missing values by “0” for specific numeric variables where an
# integer value is expected
y <- c("LotFrontage", "MasVnrArea", "BsmtFinSF2", "BsmtUnfSF",
"TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath")
allData[,y] <- apply(allData[,y], 2,
function(x) {
replace(x, is.na(x), 0)
})
# For specific factor variables, replace NAs with "None"
y <- c("BsmtQual", "BsmtExposure", "BsmtFinType1",
"BsmtFinType2", "GarageType",
"GarageFinish", "GarageQual", "GarageCond", "BsmtCond")
allData[,y] <- apply(allData[,y], 2,
function(x) {
replace(x, is.na(x), "None")
})
# For specific non-numeric variables, replace NAs with the most common value
y <- c("MSZoning", "Utilities", "Exterior1st", "Exterior2nd",
"MasVnrType", "Electrical", "KitchenQual", "Functional", "SaleType")
allData[,y] <- apply(allData[,y], 2,
function(x) {
replace(x, is.na(x), names(which.max(table(x))))
})
# For specific numeric variables, replace NAs with the median value
y <- c("GarageCars", "GarageArea", "BsmtFinSF1")
allData[,y] <- apply(allData[,y], 2,
function(x) {
replace(x, is.na(x), median(x, na.rm = T))
}
)
# For GarageYrBlt, we will assume that the Garage was built in the
# same year as the house
allData$GarageYrBlt[is.na(allData$GarageYrBlt)] <-
allData$YearBuilt[is.na(allData$GarageYrBlt)]

Ensure that there are no more variables with NAs.
allData %>%
naniar::miss_var_summary() %>%
arrange(desc(pct_miss)) %>%
filter(pct_miss > 0)

#> # A tibble: 0 x 3
#> # ... with 3 variables: variable <chr>, n_miss <int>, pct_miss <dbl>
# Collect the names of variables that are character
class.list <- sapply(allData, class)
class.list.character <- names(class.list[which(class.list=="character")])
class.list.character

#> [1] "MSZoning" "Utilities" "Exterior1st" "Exterior2nd" "MasVnrType"
#> [6] "BsmtQual" "BsmtCond" "BsmtExposure" "BsmtFinType1" "BsmtFinType2"
#> [11] "Electrical" "KitchenQual" "Functional" "GarageType" "GarageFinish"
#> [16] "GarageQual" "GarageCond" "SaleType" "SourceSet"
# Convert character variables to factors
allData[class.list.character] <- lapply(allData[class.list.character], factor)

Since MSSubClass contains 16 unique values that are categorical codes rather than quantities, convert it from numeric to factor.
allData$MSSubClass <- factor(allData$MSSubClass)

Ensure that all variables are properly classified.
table(sapply(allData, class))

#>
#> factor integer numeric
#> 40 25 10
Generate final “Train” and “Test” data sets.
# from the full data set, filter for the "Train" records and the selected variables
final_train <- allData %>%
filter(SourceSet == "Train") %>%
select("OverallQual", "GarageArea",
"YearBuilt", "Neighborhood", "MSSubClass",
"ExterQual", "KitchenQual", "BsmtQual",
"HouseStyle") %>%
mutate(Id = rawTrain$Id,
SalePrice = log(rawTrain$SalePrice))
#final_train
# from the full data set, filter for the "Test records and the selected variables
final_test <- allData %>%
filter(SourceSet == "Test") %>%
select("OverallQual", "GarageArea",
"YearBuilt", "Neighborhood", "MSSubClass",
"ExterQual", "KitchenQual", "BsmtQual",
"HouseStyle") %>%
mutate(Id = rawTest$Id)
#final_test

Use the selected features to fit a Random Forest model to our training dataset.
# exclude Id from the formula so it is not used as a predictor
fit_model <- randomForest(SalePrice ~ . - Id, data = final_train, importance = TRUE)
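Since importance = TRUE was set, the forest's variable-importance measures can be inspected before predicting; a quick sketch:
# which predictors the forest relies on most
importance(fit_model)
varImpPlot(fit_model)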
Use the model to predict the Sale Price on the test data set, back-transforming from the log scale.

pred_test <- exp(predict(fit_model, newdata = final_test))

Export the prediction results to a CSV file to be used for submission to Kaggle.
write.csv(x = data.frame(Id = rawTest$Id, SalePrice = pred_test),
row.names = FALSE,
file = "submission.csv")