##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
The purpose of this project is to use different techniques in either R or Python programming language to analyse the relationship between the Sale Price of the houses in Ames, Iowa and different explanatory variables. Then, using the results of the analysis, predict the prices of the houses.
Let’s get started; first, we’ll load the data into R:
# Load the training set from GitHub and store it as Train_DataFrame
Train_DataFrame <- read.csv(
  file = "https://raw.githubusercontent.com/SalouaDaouki/Data605/main/train.csv"
)
# Load the test set from the same repository and store it as Test_DataFrame
Test_DataFrame <- read.csv(
  file = "https://raw.githubusercontent.com/SalouaDaouki/Data605/main/test.csv"
)
Let’s look at the data and its structure and see if we need to perform any tidying:
# Preview the first six rows of the training set
head(x = Train_DataFrame, n = 6L)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1 1 60 RL 65 8450 Pave <NA> Reg Lvl
## 2 2 20 RL 80 9600 Pave <NA> Reg Lvl
## 3 3 60 RL 68 11250 Pave <NA> IR1 Lvl
## 4 4 70 RL 60 9550 Pave <NA> IR1 Lvl
## 5 5 60 RL 84 14260 Pave <NA> IR1 Lvl
## 6 6 50 RL 85 14115 Pave <NA> IR1 Lvl
## Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 2 AllPub FR2 Gtl Veenker Feedr Norm 1Fam
## 3 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 4 AllPub Corner Gtl Crawfor Norm Norm 1Fam
## 5 AllPub FR2 Gtl NoRidge Norm Norm 1Fam
## 6 AllPub Inside Gtl Mitchel Norm Norm 1Fam
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1 2Story 7 5 2003 2003 Gable CompShg
## 2 1Story 6 8 1976 1976 Gable CompShg
## 3 2Story 7 5 2001 2002 Gable CompShg
## 4 2Story 7 5 1915 1970 Gable CompShg
## 5 2Story 8 5 2000 2000 Gable CompShg
## 6 1.5Fin 5 5 1993 1995 Gable CompShg
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1 VinylSd VinylSd BrkFace 196 Gd TA PConc
## 2 MetalSd MetalSd None 0 TA TA CBlock
## 3 VinylSd VinylSd BrkFace 162 Gd TA PConc
## 4 Wd Sdng Wd Shng None 0 TA TA BrkTil
## 5 VinylSd VinylSd BrkFace 350 Gd TA PConc
## 6 VinylSd VinylSd None 0 TA TA Wood
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1 Gd TA No GLQ 706 Unf
## 2 Gd TA Gd ALQ 978 Unf
## 3 Gd TA Mn GLQ 486 Unf
## 4 TA Gd No ALQ 216 Unf
## 5 Gd TA Av GLQ 655 Unf
## 6 Gd TA No GLQ 732 Unf
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1 0 150 856 GasA Ex Y SBrkr
## 2 0 284 1262 GasA Ex Y SBrkr
## 3 0 434 920 GasA Ex Y SBrkr
## 4 0 540 756 GasA Gd Y SBrkr
## 5 0 490 1145 GasA Ex Y SBrkr
## 6 0 64 796 GasA Ex Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1 856 854 0 1710 1 0 2
## 2 1262 0 0 1262 0 1 2
## 3 920 866 0 1786 1 0 2
## 4 961 756 0 1717 1 0 1
## 5 1145 1053 0 2198 1 0 2
## 6 796 566 0 1362 1 0 1
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1 1 3 1 Gd 8 Typ
## 2 0 3 1 TA 6 Typ
## 3 1 3 1 Gd 6 Typ
## 4 0 3 1 Gd 7 Typ
## 5 1 4 1 Gd 9 Typ
## 6 1 1 1 TA 5 Typ
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1 0 <NA> Attchd 2003 RFn 2
## 2 1 TA Attchd 1976 RFn 2
## 3 1 TA Attchd 2001 RFn 2
## 4 1 Gd Detchd 1998 Unf 3
## 5 1 TA Attchd 2000 RFn 3
## 6 0 <NA> Attchd 1993 Unf 2
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1 548 TA TA Y 0 61
## 2 460 TA TA Y 298 0
## 3 608 TA TA Y 0 42
## 4 642 TA TA Y 0 35
## 5 836 TA TA Y 192 84
## 6 480 TA TA Y 40 30
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1 0 0 0 0 <NA> <NA> <NA>
## 2 0 0 0 0 <NA> <NA> <NA>
## 3 0 0 0 0 <NA> <NA> <NA>
## 4 272 0 0 0 <NA> <NA> <NA>
## 5 0 0 0 0 <NA> <NA> <NA>
## 6 0 320 0 0 <NA> MnPrv Shed
## MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1 0 2 2008 WD Normal 208500
## 2 0 5 2007 WD Normal 181500
## 3 0 9 2008 WD Normal 223500
## 4 0 2 2006 WD Abnorml 140000
## 5 0 12 2008 WD Normal 250000
## 6 700 10 2009 WD Normal 143000
# Per-column summary statistics for the training set
summary(object = Train_DataFrame)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
Based on the summary statistics of the SalePrice variable, its distribution appears to be right-skewed, as the mean (180,921) is greater than the median (163,000). We can see this more clearly in the following plots:
# Index plot: SalePrice of each house against its row position in the data
plot(Train_DataFrame$SalePrice)
# Histogram of SalePrice. `bins` is given explicitly (30 is the value
# stat_bin() falls back to when none is supplied), so the figure is unchanged
# but ggplot2 no longer emits its "Pick better value with `binwidth`" message.
ggplot(data = Train_DataFrame, aes(x = SalePrice)) +
  geom_histogram(bins = 30, color = "white", fill = "blue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The majority of properties seem to have lower sale prices, as indicated by the clustering of points along the lower half of the graph. The histogram shows that the SalePrice is right skewed.
# Inspect the type and the first few values of every column
str(object = Train_DataFrame)
## 'data.frame': 1460 obs. of 81 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr "RL" "RL" "RL" "RL" ...
## $ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ LotShape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr NA "TA" "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr NA NA NA NA ...
## $ Fence : chr NA NA NA NA ...
## $ MiscFeature : chr NA NA NA NA ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
The data contain a mix of numerical and categorical variables, so we may need to create subsets that separate the two types for further analysis.
First, let’s identify any missing values:
# Count the NA entries in every column of the training set
missing_values <- vapply(
  Train_DataFrame,
  function(column) sum(is.na(column)),
  numeric(1)
)
# Keep the names of the columns that contain at least one NA
missing_variables <- names(which(missing_values > 0))
# Tabulate each affected variable next to its NA count
missing_info <- data.frame(
  Variable = missing_variables,
  Missing_Count = missing_values[missing_variables]
)
# Sort the table so the variables with the most NAs come first
missing_info <- missing_info[order(-missing_info[["Missing_Count"]]), ]
# Show the sorted table
print(missing_info)
## Variable Missing_Count
## PoolQC PoolQC 1453
## MiscFeature MiscFeature 1406
## Alley Alley 1369
## Fence Fence 1179
## FireplaceQu FireplaceQu 690
## LotFrontage LotFrontage 259
## GarageType GarageType 81
## GarageYrBlt GarageYrBlt 81
## GarageFinish GarageFinish 81
## GarageQual GarageQual 81
## GarageCond GarageCond 81
## BsmtExposure BsmtExposure 38
## BsmtFinType2 BsmtFinType2 38
## BsmtQual BsmtQual 37
## BsmtCond BsmtCond 37
## BsmtFinType1 BsmtFinType1 37
## MasVnrType MasVnrType 8
## MasVnrArea MasVnrArea 8
## Electrical Electrical 1
Let’s visualize the missing values to see which variables have the most:
# Bar chart of the NA count per variable; las = 2 rotates the axis labels
# and cex.names shrinks them so the variable names fit under the bars
barplot(
  height = missing_info$Missing_Count,
  names.arg = missing_info$Variable,
  main = "Count of Missing Data by Variable",
  xlab = "Variable",
  ylab = "Missing Count",
  col = "skyblue",
  las = 2,
  cex.names = 0.8
)
By calculating the percentage of missing values for each variable, we can confidently remove the variables with a very high share of missing values (more than 80%), since they would contribute little to further analysis.
# Total number of rows (observations) in the training set
numb_obs <- nrow(Train_DataFrame)
# Express each variable's NA count as a percentage of all observations
missing_info <- mutate(
  missing_info,
  missing_percentage = (Missing_Count / numb_obs) * 100
)
# Names of the variables whose share of missing values exceeds 80%
vars_to_remove <- missing_info %>%
  filter(missing_percentage > 80) %>%
  pull(Variable)
# Drop those variables from the training set. all_of() replaces the superseded
# one_of(); it is strict, which is appropriate here because the names were
# derived from Train_DataFrame itself and must therefore all exist.
Train_df_subset <- Train_DataFrame %>%
  select(-all_of(vars_to_remove))
Now let’s do the same for the test data: count the missing values for each variable and remove the variables that are mostly missing from the data set.
# Count the NA entries in every column of the test set
missing_valuesTest <- vapply(
  Test_DataFrame,
  function(column) sum(is.na(column)),
  numeric(1)
)
# Keep the names of the columns that contain at least one NA
missing_variablesTest <- names(which(missing_valuesTest > 0))
# Tabulate each affected variable next to its NA count
missing_infoTest <- data.frame(
  Variable = missing_variablesTest,
  Missing_Count = missing_valuesTest[missing_variablesTest]
)
# Sort the table so the variables with the most NAs come first
missing_infoTest <-
  missing_infoTest[order(-missing_infoTest[["Missing_Count"]]), ]
# Show the sorted table
print(missing_infoTest)
## Variable Missing_Count
## PoolQC PoolQC 1456
## MiscFeature MiscFeature 1408
## Alley Alley 1352
## Fence Fence 1169
## FireplaceQu FireplaceQu 730
## LotFrontage LotFrontage 227
## GarageYrBlt GarageYrBlt 78
## GarageFinish GarageFinish 78
## GarageQual GarageQual 78
## GarageCond GarageCond 78
## GarageType GarageType 76
## BsmtCond BsmtCond 45
## BsmtQual BsmtQual 44
## BsmtExposure BsmtExposure 44
## BsmtFinType1 BsmtFinType1 42
## BsmtFinType2 BsmtFinType2 42
## MasVnrType MasVnrType 16
## MasVnrArea MasVnrArea 15
## MSZoning MSZoning 4
## Utilities Utilities 2
## BsmtFullBath BsmtFullBath 2
## BsmtHalfBath BsmtHalfBath 2
## Functional Functional 2
## Exterior1st Exterior1st 1
## Exterior2nd Exterior2nd 1
## BsmtFinSF1 BsmtFinSF1 1
## BsmtFinSF2 BsmtFinSF2 1
## BsmtUnfSF BsmtUnfSF 1
## TotalBsmtSF TotalBsmtSF 1
## KitchenQual KitchenQual 1
## GarageCars GarageCars 1
## GarageArea GarageArea 1
## SaleType SaleType 1
Let’s visualize the missing values to see which variables have the most:
# Bar chart of the NA count per variable in the test set. The title names the
# test data explicitly so this figure cannot be confused with the otherwise
# identical chart drawn for the training data.
barplot(
  height = missing_infoTest$Missing_Count,
  names.arg = missing_infoTest$Variable,
  main = "Count of Missing Data by Variable (Test Data)",
  xlab = "Variable",
  ylab = "Missing Count",
  col = "skyblue",
  las = 2,
  cex.names = 0.8
)
# Total number of rows (observations) in the test set
numb_obsTest <- nrow(Test_DataFrame)
# Express each variable's NA count as a percentage of all observations
missing_infoTest <- mutate(
  missing_infoTest,
  missing_percentage = (Missing_Count / numb_obsTest) * 100
)
# Variables whose share of missing values exceeds 80% in the test set.
# Kept for inspection; it is no longer what decides which columns are dropped.
vars_to_removeTest <- missing_infoTest %>%
  filter(missing_percentage > 80) %>%
  pull(Variable)
# Drop from the test set exactly the variables that were dropped from the
# training set (vars_to_remove), so both sets keep the same predictors even if
# their missing-value percentages fall on different sides of the 80% cut-off.
# any_of() replaces the superseded one_of() and tolerates names that are
# absent from the test set.
Test_df_subset <- Test_DataFrame %>%
  select(-any_of(vars_to_remove))
# Impute the training set with predictive mean matching (pmm), producing
# m = 5 imputed data sets; the seed fixes the random draws.
# NOTE(review): the mice summary printed for this model lists an empty method
# for every character column and logs them as "constant", so categorical NAs
# appear to be left untouched — confirm this is intended.
# NOTE(review): the printed predictor matrix uses Id and SalePrice as
# predictors; SalePrice is not available in the test set — confirm.
set.seed(124)
mice_mod <- mice(data = Train_df_subset, m = 5, method = "pmm")
##
## iter imp variable
## 1 1 LotFrontage* MasVnrArea* GarageYrBlt*
## 1 2 LotFrontage* MasVnrArea* GarageYrBlt*
## 1 3 LotFrontage* MasVnrArea* GarageYrBlt*
## 1 4 LotFrontage* MasVnrArea* GarageYrBlt*
## 1 5 LotFrontage* MasVnrArea* GarageYrBlt*
## 2 1 LotFrontage* MasVnrArea* GarageYrBlt*
## 2 2 LotFrontage* MasVnrArea* GarageYrBlt*
## 2 3 LotFrontage* MasVnrArea* GarageYrBlt*
## 2 4 LotFrontage* MasVnrArea* GarageYrBlt*
## 2 5 LotFrontage* MasVnrArea* GarageYrBlt*
## 3 1 LotFrontage* MasVnrArea* GarageYrBlt*
## 3 2 LotFrontage* MasVnrArea* GarageYrBlt*
## 3 3 LotFrontage* MasVnrArea* GarageYrBlt*
## 3 4 LotFrontage* MasVnrArea* GarageYrBlt*
## 3 5 LotFrontage* MasVnrArea* GarageYrBlt*
## 4 1 LotFrontage* MasVnrArea* GarageYrBlt*
## 4 2 LotFrontage* MasVnrArea* GarageYrBlt*
## 4 3 LotFrontage* MasVnrArea* GarageYrBlt*
## 4 4 LotFrontage* MasVnrArea* GarageYrBlt*
## 4 5 LotFrontage* MasVnrArea* GarageYrBlt*
## 5 1 LotFrontage* MasVnrArea* GarageYrBlt*
## 5 2 LotFrontage* MasVnrArea* GarageYrBlt*
## 5 3 LotFrontage* MasVnrArea* GarageYrBlt*
## 5 4 LotFrontage* MasVnrArea* GarageYrBlt*
## 5 5 LotFrontage* MasVnrArea* GarageYrBlt*
## Warning: Number of logged events: 189
# Extract one completed training set from the mids object: action = 1L (the
# default) returns the first of the m = 5 imputed data sets
imputed_data <- complete(mice_mod, action = 1L)
# Impute the test set with predictive mean matching (pmm), m = 5 data sets.
# The seed is set here as well (same value as for the training imputation) so
# the result is reproducible when this chunk is run on its own, instead of
# depending on the random-number state left behind by earlier chunks.
set.seed(124)
mice_modTest <- mice(data = Test_df_subset, m = 5, method = "pmm")
##
## iter imp variable
## 1 1 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 1 2 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 1 3 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 1 4 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 1 5 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 2 1 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 2 2 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 2 3 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 2 4 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 2 5 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 3 1 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 3 2 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 3 3 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 3 4 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 3 5 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 4 1 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 4 2 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 4 3 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 4 4 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 4 5 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 5 1 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 5 2 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 5 3 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 5 4 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## 5 5 LotFrontage* MasVnrArea* BsmtFinSF1* BsmtFinSF2* BsmtUnfSF* TotalBsmtSF* BsmtFullBath* BsmtHalfBath* GarageYrBlt* GarageCars* GarageArea*
## Warning: Number of logged events: 589
# Extract one completed test set from the mids object: action = 1L (the
# default) returns the first of the m = 5 imputed data sets
imputed_dataTest <- complete(mice_modTest, action = 1L)
After imputation of both data sets (train and test), let’s assess the quality of the imputation:
# Diagnostic plots for the training-set imputation model
plot(x = mice_mod)
# Print the model summary: imputation method per variable, predictor matrix
# and logged events
summary(object = mice_mod)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## Id MSSubClass MSZoning LotFrontage LotArea
## "" "" "" "pmm" ""
## Street LotShape LandContour Utilities LotConfig
## "" "" "" "" ""
## LandSlope Neighborhood Condition1 Condition2 BldgType
## "" "" "" "" ""
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd
## "" "" "" "" ""
## RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## "" "" "" "" ""
## MasVnrArea ExterQual ExterCond Foundation BsmtQual
## "pmm" "" "" "" ""
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## "" "" "" "" ""
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## "" "" "" "" ""
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## "" "" "" "" ""
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## "" "" "" "" ""
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## "" "" "" "" ""
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## "" "" "" "pmm" ""
## GarageCars GarageArea GarageQual GarageCond PavedDrive
## "" "" "" "" ""
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
## "" "" "" "" ""
## PoolArea MiscVal MoSold YrSold SaleType
## "" "" "" "" ""
## SaleCondition SalePrice
## "" ""
## PredictorMatrix:
## Id MSSubClass MSZoning LotFrontage LotArea Street LotShape
## Id 0 1 0 1 1 0 0
## MSSubClass 1 0 0 1 1 0 0
## MSZoning 1 1 0 1 1 0 0
## LotFrontage 1 1 0 0 1 0 0
## LotArea 1 1 0 1 0 0 0
## Street 1 1 0 1 1 0 0
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## Id 0 0 0 0 0 0
## MSSubClass 0 0 0 0 0 0
## MSZoning 0 0 0 0 0 0
## LotFrontage 0 0 0 0 0 0
## LotArea 0 0 0 0 0 0
## Street 0 0 0 0 0 0
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## Id 0 0 0 1 1 1
## MSSubClass 0 0 0 1 1 1
## MSZoning 0 0 0 1 1 1
## LotFrontage 0 0 0 1 1 1
## LotArea 0 0 0 1 1 1
## Street 0 0 0 1 1 1
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## Id 1 0 0 0 0 0
## MSSubClass 1 0 0 0 0 0
## MSZoning 1 0 0 0 0 0
## LotFrontage 1 0 0 0 0 0
## LotArea 1 0 0 0 0 0
## Street 1 0 0 0 0 0
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond
## Id 1 0 0 0 0 0
## MSSubClass 1 0 0 0 0 0
## MSZoning 1 0 0 0 0 0
## LotFrontage 1 0 0 0 0 0
## LotArea 1 0 0 0 0 0
## Street 1 0 0 0 0 0
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## Id 0 0 1 0 1
## MSSubClass 0 0 1 0 1
## MSZoning 0 0 1 0 1
## LotFrontage 0 0 1 0 1
## LotArea 0 0 1 0 1
## Street 0 0 1 0 1
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## Id 1 1 0 0 0 0
## MSSubClass 1 1 0 0 0 0
## MSZoning 1 1 0 0 0 0
## LotFrontage 1 1 0 0 0 0
## LotArea 1 1 0 0 0 0
## Street 1 1 0 0 0 0
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Id 1 1 1 1 1
## MSSubClass 1 1 1 1 1
## MSZoning 1 1 1 1 1
## LotFrontage 1 1 1 1 1
## LotArea 1 1 1 1 1
## Street 1 1 1 1 1
## BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr
## Id 1 1 1 1 1
## MSSubClass 1 1 1 1 1
## MSZoning 1 1 1 1 1
## LotFrontage 1 1 1 1 1
## LotArea 1 1 1 1 1
## Street 1 1 1 1 1
## KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## Id 0 1 0 1 0
## MSSubClass 0 1 0 1 0
## MSZoning 0 1 0 1 0
## LotFrontage 0 1 0 1 0
## LotArea 0 1 0 1 0
## Street 0 1 0 1 0
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea
## Id 0 1 0 1 1
## MSSubClass 0 1 0 1 1
## MSZoning 0 1 0 1 1
## LotFrontage 0 1 0 1 1
## LotArea 0 1 0 1 1
## Street 0 1 0 1 1
## GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Id 0 0 0 1 1
## MSSubClass 0 0 0 1 1
## MSZoning 0 0 0 1 1
## LotFrontage 0 0 0 1 1
## LotArea 0 0 0 1 1
## Street 0 0 0 1 1
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold
## Id 1 1 1 1 1 1 1
## MSSubClass 1 1 1 1 1 1 1
## MSZoning 1 1 1 1 1 1 1
## LotFrontage 1 1 1 1 1 1 1
## LotArea 1 1 1 1 1 1 1
## Street 1 1 1 1 1 1 1
## SaleType SaleCondition SalePrice
## Id 0 0 1
## MSSubClass 0 0 1
## MSZoning 0 0 1
## LotFrontage 0 0 1
## LotArea 0 0 1
## Street 0 0 1
## Number of logged events: 189
## it im dep meth out
## 1 0 0 constant MSZoning
## 2 0 0 constant Street
## 3 0 0 constant LotShape
## 4 0 0 constant LandContour
## 5 0 0 constant Utilities
## 6 0 0 constant LotConfig
# Diagnostic plots for the test-set imputation model
plot(x = mice_modTest)
# Print the model summary: imputation method per variable and predictor matrix
summary(object = mice_modTest)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## Id MSSubClass MSZoning LotFrontage LotArea
## "" "" "" "pmm" ""
## Street LotShape LandContour Utilities LotConfig
## "" "" "" "" ""
## LandSlope Neighborhood Condition1 Condition2 BldgType
## "" "" "" "" ""
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd
## "" "" "" "" ""
## RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## "" "" "" "" ""
## MasVnrArea ExterQual ExterCond Foundation BsmtQual
## "pmm" "" "" "" ""
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## "" "" "" "pmm" ""
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## "pmm" "pmm" "pmm" "" ""
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## "" "" "" "" ""
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## "" "pmm" "pmm" "" ""
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## "" "" "" "" ""
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## "" "" "" "pmm" ""
## GarageCars GarageArea GarageQual GarageCond PavedDrive
## "pmm" "pmm" "" "" ""
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
## "" "" "" "" ""
## PoolArea MiscVal MoSold YrSold SaleType
## "" "" "" "" ""
## SaleCondition
## ""
## PredictorMatrix:
## Id MSSubClass MSZoning LotFrontage LotArea Street LotShape
## Id 0 1 0 1 1 0 0
## MSSubClass 1 0 0 1 1 0 0
## MSZoning 0 0 0 0 0 0 0
## LotFrontage 1 1 0 0 1 0 0
## LotArea 1 1 0 1 0 0 0
## Street 1 1 0 1 1 0 0
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## Id 0 0 0 0 0 0
## MSSubClass 0 0 0 0 0 0
## MSZoning 0 0 0 0 0 0
## LotFrontage 0 0 0 0 0 0
## LotArea 0 0 0 0 0 0
## Street 0 0 0 0 0 0
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## Id 0 0 0 1 1 1
## MSSubClass 0 0 0 1 1 1
## MSZoning 0 0 0 0 0 0
## LotFrontage 0 0 0 1 1 1
## LotArea 0 0 0 1 1 1
## Street 0 0 0 1 1 1
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## Id 1 0 0 0 0 0
## MSSubClass 1 0 0 0 0 0
## MSZoning 0 0 0 0 0 0
## LotFrontage 1 0 0 0 0 0
## LotArea 1 0 0 0 0 0
## Street 1 0 0 0 0 0
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond
## Id 1 0 0 0 0 0
## MSSubClass 1 0 0 0 0 0
## MSZoning 0 0 0 0 0 0
## LotFrontage 1 0 0 0 0 0
## LotArea 1 0 0 0 0 0
## Street 1 0 0 0 0 0
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## Id 0 0 1 0 1
## MSSubClass 0 0 1 0 1
## MSZoning 0 0 0 0 0
## LotFrontage 0 0 1 0 1
## LotArea 0 0 1 0 1
## Street 0 0 1 0 1
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## Id 1 1 0 0 0 0
## MSSubClass 1 1 0 0 0 0
## MSZoning 0 0 0 0 0 0
## LotFrontage 1 1 0 0 0 0
## LotArea 1 1 0 0 0 0
## Street 1 1 0 0 0 0
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Id 1 1 1 1 1
## MSSubClass 1 1 1 1 1
## MSZoning 0 0 0 0 0
## LotFrontage 1 1 1 1 1
## LotArea 1 1 1 1 1
## Street 1 1 1 1 1
## BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr
## Id 1 1 1 1 1
## MSSubClass 1 1 1 1 1
## MSZoning 0 0 0 0 0
## LotFrontage 1 1 1 1 1
## LotArea 1 1 1 1 1
## Street 1 1 1 1 1
## KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## Id 0 1 0 1 0
## MSSubClass 0 1 0 1 0
## MSZoning 0 0 0 0 0
## LotFrontage 0 1 0 1 0
## LotArea 0 1 0 1 0
## Street 0 1 0 1 0
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea
## Id 0 1 0 1 1
## MSSubClass 0 1 0 1 1
## MSZoning 0 0 0 0 0
## LotFrontage 0 1 0 1 1
## LotArea 0 1 0 1 1
## Street 0 1 0 1 1
## GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Id 0 0 0 1 1
## MSSubClass 0 0 0 1 1
## MSZoning 0 0 0 0 0
## LotFrontage 0 0 0 1 1
## LotArea 0 0 0 1 1
## Street 0 0 0 1 1
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold
## Id 1 1 1 1 1 1 1
## MSSubClass 1 1 1 1 1 1 1
## MSZoning 0 0 0 0 0 0 0
## LotFrontage 1 1 1 1 1 1 1
## LotArea 1 1 1 1 1 1 1
## Street 1 1 1 1 1 1 1
## SaleType SaleCondition
## Id 0 0
## MSSubClass 0 0
## MSZoning 0 0
## LotFrontage 0 0
## LotArea 0 0
## Street 0 0
## Number of logged events: 589
## it im dep meth out
## 1 0 0 constant MSZoning
## 2 0 0 constant Street
## 3 0 0 constant LotShape
## 4 0 0 constant LandContour
## 5 0 0 constant Utilities
## 6 0 0 constant LotConfig
Now let’s take a look at SalePrice in both datasets: Train_DataFrame (before tidying) and Train_df_subset (after tidying):
# SalePrice distribution in the original training data (before tidying)
hist(
  x = Train_DataFrame[["SalePrice"]],
  xlab = "Sale Price",
  main = "Train Data Frame before tidying"
)
# SalePrice distribution in the tidied subset
hist(
  x = Train_df_subset[["SalePrice"]],
  xlab = "Sale Price",
  main = "Train Subset after tidying"
)
Before visualizing the numerical variables in the data, let’s check their skewness, then pick only the variables with strong positive skewness (greater than 2).
# Flag every column of the tidied subset as numeric and/or character
numerical_vars <- vapply(Train_df_subset, is.numeric, logical(1))
Categorical_vars <- vapply(Train_df_subset, is.character, logical(1))
# Split the subset into its numeric columns and its character columns
numerical_data <- Train_df_subset[, numerical_vars]
Categorical_data <- Train_df_subset[, Categorical_vars]
# Skewness of each numeric column (a column containing NA yields NA, as the
# LotFrontage value in the output below shows)
skewness_values <- sapply(numerical_data, skewness)
head(skewness_values)
## Id MSSubClass LotFrontage LotArea OverallQual OverallCond
## 0.0000000 1.4047656 NA 12.1826150 0.2164984 0.6916440
summary(numerical_data)
## Id MSSubClass LotFrontage LotArea
## Min. : 1.0 Min. : 20.0 Min. : 21.00 Min. : 1300
## 1st Qu.: 365.8 1st Qu.: 20.0 1st Qu.: 59.00 1st Qu.: 7554
## Median : 730.5 Median : 50.0 Median : 69.00 Median : 9478
## Mean : 730.5 Mean : 56.9 Mean : 70.05 Mean : 10517
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00 3rd Qu.: 11602
## Max. :1460.0 Max. :190.0 Max. :313.00 Max. :215245
## NA's :259
## OverallQual OverallCond YearBuilt YearRemodAdd
## Min. : 1.000 Min. :1.000 Min. :1872 Min. :1950
## 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954 1st Qu.:1967
## Median : 6.000 Median :5.000 Median :1973 Median :1994
## Mean : 6.099 Mean :5.575 Mean :1971 Mean :1985
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000 3rd Qu.:2004
## Max. :10.000 Max. :9.000 Max. :2010 Max. :2010
##
## MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF
## Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 223.0
## Median : 0.0 Median : 383.5 Median : 0.00 Median : 477.5
## Mean : 103.7 Mean : 443.6 Mean : 46.55 Mean : 567.2
## 3rd Qu.: 166.0 3rd Qu.: 712.2 3rd Qu.: 0.00 3rd Qu.: 808.0
## Max. :1600.0 Max. :5644.0 Max. :1474.00 Max. :2336.0
## NA's :8
## TotalBsmtSF X1stFlrSF X2ndFlrSF LowQualFinSF
## Min. : 0.0 Min. : 334 Min. : 0 Min. : 0.000
## 1st Qu.: 795.8 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
## Median : 991.5 Median :1087 Median : 0 Median : 0.000
## Mean :1057.4 Mean :1163 Mean : 347 Mean : 5.845
## 3rd Qu.:1298.2 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
## Max. :6110.0 Max. :4692 Max. :2065 Max. :572.000
##
## GrLivArea BsmtFullBath BsmtHalfBath FullBath
## Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
## Median :1464 Median :0.0000 Median :0.00000 Median :2.000
## Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
## 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
## Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
##
## HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. : 2.000
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.: 5.000
## Median :0.0000 Median :3.000 Median :1.000 Median : 6.000
## Mean :0.3829 Mean :2.866 Mean :1.047 Mean : 6.518
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :2.0000 Max. :8.000 Max. :3.000 Max. :14.000
##
## Fireplaces GarageYrBlt GarageCars GarageArea
## Min. :0.000 Min. :1900 Min. :0.000 Min. : 0.0
## 1st Qu.:0.000 1st Qu.:1961 1st Qu.:1.000 1st Qu.: 334.5
## Median :1.000 Median :1980 Median :2.000 Median : 480.0
## Mean :0.613 Mean :1979 Mean :1.767 Mean : 473.0
## 3rd Qu.:1.000 3rd Qu.:2002 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :3.000 Max. :2010 Max. :4.000 Max. :1418.0
## NA's :81
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 25.00 Median : 0.00 Median : 0.00
## Mean : 94.24 Mean : 46.66 Mean : 21.95 Mean : 3.41
## 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :857.00 Max. :547.00 Max. :552.00 Max. :508.00
##
## ScreenPorch PoolArea MiscVal MoSold
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 1.000
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 5.000
## Median : 0.00 Median : 0.000 Median : 0.00 Median : 6.000
## Mean : 15.06 Mean : 2.759 Mean : 43.49 Mean : 6.322
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 8.000
## Max. :480.00 Max. :738.000 Max. :15500.00 Max. :12.000
##
## YrSold SalePrice
## Min. :2006 Min. : 34900
## 1st Qu.:2007 1st Qu.:129975
## Median :2008 Median :163000
## Mean :2008 Mean :180921
## 3rd Qu.:2009 3rd Qu.:214000
## Max. :2010 Max. :755000
##
# Faceted histograms of the numeric variables whose skewness exceeds 2
numerical_data %>%
  keep(skewness_values > 2) %>%
  gather() %>%
  ggplot(mapping = aes(x = value)) +
  geom_histogram() +
  facet_wrap(~ key, scales = "free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# One-way ANOVA of SalePrice against each categorical variable; keep the
# p-value of the grouping term (first row of the ANOVA table)
anova_results <- lapply(Categorical_data, function(x) {
  aov_result <- aov(Train_df_subset$SalePrice ~ x)
  summary(aov_result)[[1]][["Pr(>F)"]][1]
})
# Combine variable names and p-values into a data frame
anova_results_df <- data.frame(
  variable = names(anova_results),
  p_value = unlist(anova_results)
)
# Keep variables with significant p-values (p < 0.05); which() drops NA
# p-values instead of turning them into NA column names
significant_vars <-
  anova_results_df$variable[which(anova_results_df$p_value < 0.05)]
# Subset the categorical data to the significant variables; drop = FALSE keeps
# a data frame even when only one variable qualifies
significant_categorical_data <-
  Categorical_data[, significant_vars, drop = FALSE]
# Bar charts of the level counts of each significant categorical variable
significant_categorical_data %>%
  gather() %>%
  ggplot(aes(value)) +
  facet_wrap(~ key, scales = "free") +
  geom_bar()
# Add natural-log versions of LotArea and SalePrice to the training subset
Train_df_subset <- Train_df_subset %>%
  mutate(log_LotArea = log(LotArea), log_SalePrice = log(SalePrice))
# Histograms of each variable on the original scale and on the log scale
ggplot(data = Train_df_subset, mapping = aes(x = LotArea)) +
  geom_histogram(binwidth = 100) +
  ggtitle("Histogram of LotArea (Original)")
ggplot(data = Train_df_subset, mapping = aes(x = log_LotArea)) +
  geom_histogram(binwidth = 0.1) +
  ggtitle("Histogram of Log(LotArea)")
ggplot(data = Train_df_subset, mapping = aes(x = SalePrice)) +
  geom_histogram(binwidth = 10000) +
  ggtitle("Histogram of SalePrice (Original)")
ggplot(data = Train_df_subset, mapping = aes(x = log_SalePrice)) +
  geom_histogram(binwidth = 0.1) +
  ggtitle("Histogram of Log(SalePrice)")
As we can see, the original variables have right-skewed distributions, whereas the log-transformed variables are approximately normally distributed.
Now, in order to have a better model and better predictions, let’s calculate the correlation between all numerical variables and Sale Price to identify potential relationships.
# Exclude SalePrice from numerical_data
numerical_data_without_saleprice <- numerical_data[, !colnames(numerical_data) %in% "SalePrice"]
# Calculate correlations between all numerical variables (excluding SalePrice) and SalePrice
correlation_with_saleprice <- cor(numerical_data_without_saleprice, Train_df_subset$SalePrice, use = "complete.obs")
# Find the variable with the strongest correlation with SalePrice.
# cor() returns a one-column matrix here, so the variable names live in its
# row names; names() of which.max() on a matrix is NULL.
strongest_correlation <- rownames(correlation_with_saleprice)[which.max(abs(correlation_with_saleprice))]
# Print the correlation values
print(correlation_with_saleprice)
## [,1]
## Id -0.047121850
## MSSubClass -0.088031702
## LotFrontage 0.344269772
## LotArea 0.299962206
## OverallQual 0.797880680
## OverallCond -0.124391232
## YearBuilt 0.525393598
## YearRemodAdd 0.521253270
## MasVnrArea 0.488658155
## BsmtFinSF1 0.390300523
## BsmtFinSF2 -0.028021366
## BsmtUnfSF 0.213128680
## TotalBsmtSF 0.615612237
## X1stFlrSF 0.607969106
## X2ndFlrSF 0.306879002
## LowQualFinSF -0.001481983
## GrLivArea 0.705153567
## BsmtFullBath 0.236737407
## BsmtHalfBath -0.036512665
## FullBath 0.566627442
## HalfBath 0.268560303
## BedroomAbvGr 0.166813894
## KitchenAbvGr -0.140497445
## TotRmsAbvGrd 0.547067360
## Fireplaces 0.461872689
## GarageYrBlt 0.504753018
## GarageCars 0.647033611
## GarageArea 0.619329622
## WoodDeckSF 0.336855121
## OpenPorchSF 0.343353812
## EnclosedPorch -0.154843204
## X3SsnPorch 0.030776594
## ScreenPorch 0.110426815
## PoolArea 0.092488120
## MiscVal -0.036041237
## MoSold 0.051568064
## YrSold -0.011868823
# Print the variable with the strongest correlation
print(strongest_correlation)
## [1] "OverallQual"
Based on the correlations calculated above, we can see that Sale Price has a strong correlation (absolute value of about 0.3 or above) with the following variables: “LotArea”, “OverallQual”, “YearBuilt”, “YearRemodAdd”, “TotalBsmtSF”, “X1stFlrSF”, “GrLivArea”, “GarageCars”, “GarageArea”
# Names of the predictors chosen for their correlation with SalePrice
strongest_correlation_variables <- c(
  "LotArea", "OverallQual", "YearBuilt", "YearRemodAdd", "TotalBsmtSF",
  "X1stFlrSF", "GrLivArea", "GarageCars", "GarageArea"
)
# Keep only those predictor columns
strongest_correlation_data <-
  numerical_data_without_saleprice[, strongest_correlation_variables]
# Pairwise correlations among the chosen predictors
correlation_matrix_strongest <- cor(x = strongest_correlation_data)
# Show the correlation matrix
print(correlation_matrix_strongest)
## LotArea OverallQual YearBuilt YearRemodAdd TotalBsmtSF
## LotArea 1.00000000 0.1058057 0.01422765 0.01378843 0.2608331
## OverallQual 0.10580574 1.0000000 0.57232277 0.55068392 0.5378085
## YearBuilt 0.01422765 0.5723228 1.00000000 0.59285498 0.3914520
## YearRemodAdd 0.01378843 0.5506839 0.59285498 1.00000000 0.2910656
## TotalBsmtSF 0.26083313 0.5378085 0.39145200 0.29106558 1.0000000
## X1stFlrSF 0.29947458 0.4762238 0.28198586 0.24037927 0.8195300
## GrLivArea 0.26311617 0.5930074 0.19900971 0.28738852 0.4548682
## GarageCars 0.15487074 0.6006707 0.53785009 0.42062215 0.4345848
## GarageArea 0.18040276 0.5620218 0.47895382 0.37159981 0.4866655
## X1stFlrSF GrLivArea GarageCars GarageArea
## LotArea 0.2994746 0.2631162 0.1548707 0.1804028
## OverallQual 0.4762238 0.5930074 0.6006707 0.5620218
## YearBuilt 0.2819859 0.1990097 0.5378501 0.4789538
## YearRemodAdd 0.2403793 0.2873885 0.4206222 0.3715998
## TotalBsmtSF 0.8195300 0.4548682 0.4345848 0.4866655
## X1stFlrSF 1.0000000 0.5660240 0.4393168 0.4897817
## GrLivArea 0.5660240 1.0000000 0.4672474 0.4689975
## GarageCars 0.4393168 0.4672474 1.0000000 0.8824754
## GarageArea 0.4897817 0.4689975 0.8824754 1.0000000
# Predictors that have |correlation| >= 0.3 with at least one other selected
# predictor. which(arr.ind = TRUE) keeps the matrix row names; plain which()
# on a matrix returns unnamed indices, so names() of it is NULL.
# This vector is informational only: the model below uses all nine predictors.
strong_correlation_variables <- unique(rownames(which(abs(correlation_matrix_strongest) >= 0.3 & row(correlation_matrix_strongest) != col(correlation_matrix_strongest), arr.ind = TRUE)))
# Add SalePrice to strongest_correlation_data
strongest_correlation_data <- cbind(strongest_correlation_data, SalePrice = Train_df_subset$SalePrice)
# Fit linear model of SalePrice on all nine selected predictors
lm_model <- lm(SalePrice ~ ., data = strongest_correlation_data)
# Print the summary of the linear model
summary(lm_model)
##
## Call:
## lm(formula = SalePrice ~ ., data = strongest_correlation_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -489636 -19500 -1834 14990 295150
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.125e+06 1.204e+05 -9.344 < 2e-16 ***
## LotArea 6.386e-01 1.051e-01 6.076 1.57e-09 ***
## OverallQual 2.017e+04 1.180e+03 17.092 < 2e-16 ***
## YearBuilt 2.411e+02 4.741e+01 5.086 4.14e-07 ***
## YearRemodAdd 2.897e+02 6.260e+01 4.628 4.01e-06 ***
## TotalBsmtSF 1.911e+01 4.226e+00 4.522 6.62e-06 ***
## X1stFlrSF 1.132e+01 4.885e+00 2.317 0.02063 *
## GrLivArea 4.515e+01 2.707e+00 16.677 < 2e-16 ***
## GarageCars 9.335e+03 2.986e+03 3.126 0.00181 **
## GarageArea 1.629e+01 1.012e+01 1.609 0.10774
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37520 on 1450 degrees of freedom
## Multiple R-squared: 0.7783, Adjusted R-squared: 0.7769
## F-statistic: 565.7 on 9 and 1450 DF, p-value: < 2.2e-16
We can see that the p-value of GarageArea is 0.10774, which is greater than 0.05. To improve the linear model above, let’s “backward_eliminate” the least significant variables:
# Fit the model on all predictors except GarageArea
lm_model_initial <- lm(SalePrice ~ . - GarageArea, data = strongest_correlation_data)
# Assess model fit
summary(lm_model_initial)
##
## Call:
## lm(formula = SalePrice ~ . - GarageArea, data = strongest_correlation_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -480779 -19678 -1749 15393 294396
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.121e+06 1.205e+05 -9.306 < 2e-16 ***
## LotArea 6.424e-01 1.051e-01 6.111 1.27e-09 ***
## OverallQual 2.015e+04 1.180e+03 17.067 < 2e-16 ***
## YearBuilt 2.406e+02 4.744e+01 5.073 4.43e-07 ***
## YearRemodAdd 2.879e+02 6.262e+01 4.597 4.66e-06 ***
## TotalBsmtSF 1.976e+01 4.210e+00 4.694 2.93e-06 ***
## X1stFlrSF 1.179e+01 4.879e+00 2.417 0.0158 *
## GrLivArea 4.531e+01 2.707e+00 16.736 < 2e-16 ***
## GarageCars 1.317e+04 1.800e+03 7.316 4.22e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37540 on 1451 degrees of freedom
## Multiple R-squared: 0.7779, Adjusted R-squared: 0.7767
## F-statistic: 635.4 on 8 and 1451 DF, p-value: < 2.2e-16
Removing GarageArea from the model has a very small impact on the model’s fit. Let’s remove X1stFlrSF instead and see the impact:
# Fit the model on all predictors except X1stFlrSF
# (GarageArea is still included in this formula)
lm_model_initial1 <- lm(SalePrice ~ . - X1stFlrSF, data = strongest_correlation_data)
# Assess model fit
summary(lm_model_initial1)
##
## Call:
## lm(formula = SalePrice ~ . - X1stFlrSF, data = strongest_correlation_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -495369 -19133 -2244 15155 294325
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.109e+06 1.204e+05 -9.214 < 2e-16 ***
## LotArea 6.568e-01 1.050e-01 6.258 5.12e-10 ***
## OverallQual 1.989e+04 1.176e+03 16.920 < 2e-16 ***
## YearBuilt 2.351e+02 4.741e+01 4.958 7.98e-07 ***
## YearRemodAdd 2.894e+02 6.269e+01 4.616 4.25e-06 ***
## TotalBsmtSF 2.627e+01 2.890e+00 9.090 < 2e-16 ***
## GrLivArea 4.721e+01 2.561e+00 18.431 < 2e-16 ***
## GarageCars 9.453e+03 2.991e+03 3.161 0.0016 **
## GarageArea 1.770e+01 1.012e+01 1.749 0.0805 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37580 on 1451 degrees of freedom
## Multiple R-squared: 0.7775, Adjusted R-squared: 0.7763
## F-statistic: 633.8 on 8 and 1451 DF, p-value: < 2.2e-16
Removing X1stFlrSF has a slightly larger impact on the model’s fit. We can see that by comparing the adjusted R-squared of the three models: it was 0.7769 for the full model, 0.7767 without GarageArea (a very slight difference), and 0.7763 without X1stFlrSF. So let’s perform residual analysis on the initial model (before removing any of the variables):
# Residuals of the full linear model against its fitted values
plot(
  x = fitted.values(lm_model),
  y = resid(lm_model),
  main = "Residuals vs Fitted",
  xlab = "Fitted values",
  ylab = "Residuals"
)
# Horizontal reference line at zero residual
abline(h = 0, col = "red")
The points are not randomly scattered around 0, so the linear model might not be suitable for the data. Let’s confirm that with a normal Q-Q plot:
# Normal Q-Q plot of the full model's residuals, plus its reference line
qqnorm(y = resid(lm_model))
qqline(y = resid(lm_model))
The points do not all fall on the reference line, which suggests that the residuals do not follow a normal distribution perfectly. For further investigation of the linear model, let’s plot the actual sale prices against the fitted values to see how closely the predictions track the response.
# Fitted values of the linear model on the data it was trained on
predicted <- predict(lm_model)
# Actual sale price against the fitted value for every house
plot(
  x = predicted,
  y = strongest_correlation_data[["SalePrice"]],
  main = "Response vs. Fitted (Predicted) Values",
  xlab = "Fitted (Predicted) Values",
  ylab = "Actual Sale Price"
)
# Reference line y = x (intercept 0, slope 1)
abline(a = 0, b = 1, col = "red")
The red reference line is y = x (intercept 0, slope 1); the points do not all lie on it, so we may need more data processing before we fit the model again. One of the options is a log transformation of the predictors.
# Append natural-log versions of seven of the selected predictors
strongest_correlation_data <- mutate(
  strongest_correlation_data,
  log_LotArea = log(LotArea),
  log_OverallQual = log(OverallQual),
  log_YearBuilt = log(YearBuilt),
  log_YearRemodAdd = log(YearRemodAdd),
  log_TotalBsmtSF = log(TotalBsmtSF),
  log_X1stFlrSF = log(X1stFlrSF),
  log_GrLivArea = log(GrLivArea)
)
Before fitting the linear model on the log-transformed predictors, let’s check for values that would break the log transformation:
# Count the rows whose raw value is zero or negative (log() is not finite there)
sum(strongest_correlation_data[["TotalBsmtSF"]] <= 0)
## [1] 37
sum(strongest_correlation_data[["GrLivArea"]] <= 0)
## [1] 0
After checking for zero or negative values, the variable TotalBsmtSF has 37 values that are \(\leq0\); let’s add a small constant value to it to avoid any issue with the log transformation:
# Small positive offset so that log() stays finite where TotalBsmtSF is zero
epsilon <- 1e-6
# Recompute the log column from the shifted values
strongest_correlation_data[["log_TotalBsmtSF"]] <-
  log(epsilon + strongest_correlation_data[["TotalBsmtSF"]])
Let’s check again for zero and/or negative values:
# Confirm the shifted log column has no non-finite values (-Inf, NaN or NA).
# The column is `log_TotalBsmtSF` (lower-case "l"); a misspelled name yields
# NULL, which makes the sum 0 regardless of the data.
sum(!is.finite(strongest_correlation_data$log_TotalBsmtSF))
## [1] 0
# Fit a linear model of SalePrice on the seven log-transformed predictors
# (GarageCars and GarageArea are not part of this formula)
lm_model_log <- lm(SalePrice ~ log_LotArea + log_OverallQual + log_YearBuilt + log_YearRemodAdd +
log_TotalBsmtSF + log_X1stFlrSF + log_GrLivArea, data = strongest_correlation_data)
# Print summary of the linear model
summary(lm_model_log)
##
## Call:
## lm(formula = SalePrice ~ log_LotArea + log_OverallQual + log_YearBuilt +
## log_YearRemodAdd + log_TotalBsmtSF + log_X1stFlrSF + log_GrLivArea,
## data = strongest_correlation_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -296387 -22172 -3695 15019 378800
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.374e+07 9.862e+05 -13.930 < 2e-16 ***
## log_LotArea 2.216e+04 2.467e+03 8.984 < 2e-16 ***
## log_OverallQual 1.012e+05 7.127e+03 14.193 < 2e-16 ***
## log_YearBuilt 8.804e+05 9.606e+04 9.165 < 2e-16 ***
## log_YearRemodAdd 7.997e+05 1.372e+05 5.829 6.86e-09 ***
## log_TotalBsmtSF 6.252e+02 3.495e+02 1.789 0.0738 .
## log_X1stFlrSF 3.894e+04 4.487e+03 8.679 < 2e-16 ***
## log_GrLivArea 7.002e+04 4.679e+03 14.964 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41730 on 1452 degrees of freedom
## Multiple R-squared: 0.7254, Adjusted R-squared: 0.7241
## F-statistic: 547.9 on 7 and 1452 DF, p-value: < 2.2e-16
Let’s remove the log_TotalBsmtSF since it has the highest p-value (\(0.0738 > 0.05\))
# Refit the log-predictor model without log_TotalBsmtSF
# (the six remaining log-transformed predictors are kept)
lm_model_log1 <- lm(SalePrice ~ log_LotArea + log_OverallQual + log_YearBuilt + log_YearRemodAdd +
log_X1stFlrSF + log_GrLivArea, data = strongest_correlation_data)
# Print summary of the linear model
summary(lm_model_log1)
##
## Call:
## lm(formula = SalePrice ~ log_LotArea + log_OverallQual + log_YearBuilt +
## log_YearRemodAdd + log_X1stFlrSF + log_GrLivArea, data = strongest_correlation_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -295942 -22115 -3957 15135 378681
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -13716909 986854 -13.900 < 2e-16 ***
## log_LotArea 22234 2469 9.006 < 2e-16 ***
## log_OverallQual 104217 6924 15.052 < 2e-16 ***
## log_YearBuilt 868123 95889 9.053 < 2e-16 ***
## log_YearRemodAdd 809329 137187 5.899 4.53e-09 ***
## log_X1stFlrSF 39152 4489 8.722 < 2e-16 ***
## log_GrLivArea 69350 4668 14.858 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41760 on 1453 degrees of freedom
## Multiple R-squared: 0.7248, Adjusted R-squared: 0.7236
## F-statistic: 637.7 on 6 and 1453 DF, p-value: < 2.2e-16
Let’s do residual analysis and see if there are any improvements:
# Residuals of the reduced log-predictor model against its fitted values
plot(
  x = fitted.values(lm_model_log1),
  y = resid(lm_model_log1),
  main = "Residuals vs Fitted",
  xlab = "Fitted values",
  ylab = "Residuals"
)
# Horizontal reference line at zero residual
abline(h = 0, col = "red")
Even after the log transformation, the residual plot shows that the linear model is not a better fit for the data; the points form an upward curve, which indicates that a polynomial regression model might be more appropriate.
# Fit a polynomial regression model: one degree-2 poly() term for each of the
# six log-transformed predictors used in lm_model_log1
poly_model <- lm(SalePrice ~ poly(log_LotArea, degree = 2) +
poly(log_OverallQual, degree = 2) +
poly(log_YearBuilt, degree = 2) +
poly(log_YearRemodAdd, degree = 2) +
poly(log_X1stFlrSF, degree = 2) +
poly(log_GrLivArea, degree = 2),
data = strongest_correlation_data)
# Print summary of the polynomial regression model
summary(poly_model)
##
## Call:
## lm(formula = SalePrice ~ poly(log_LotArea, degree = 2) + poly(log_OverallQual,
## degree = 2) + poly(log_YearBuilt, degree = 2) + poly(log_YearRemodAdd,
## degree = 2) + poly(log_X1stFlrSF, degree = 2) + poly(log_GrLivArea,
## degree = 2), data = strongest_correlation_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -430487 -17419 41 15975 278478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 180921.2 955.5 189.338 < 2e-16 ***
## poly(log_LotArea, degree = 2)1 376822.9 43564.3 8.650 < 2e-16 ***
## poly(log_LotArea, degree = 2)2 76127.1 39018.1 1.951 0.051241 .
## poly(log_OverallQual, degree = 2)1 1161292.8 59583.4 19.490 < 2e-16 ***
## poly(log_OverallQual, degree = 2)2 501306.7 44565.2 11.249 < 2e-16 ***
## poly(log_YearBuilt, degree = 2)1 436176.6 55388.6 7.875 6.65e-15 ***
## poly(log_YearBuilt, degree = 2)2 -89905.9 50126.0 -1.794 0.073086 .
## poly(log_YearRemodAdd, degree = 2)1 288994.8 52800.0 5.473 5.20e-08 ***
## poly(log_YearRemodAdd, degree = 2)2 159474.0 40914.2 3.898 0.000102 ***
## poly(log_X1stFlrSF, degree = 2)1 442762.2 48407.2 9.147 < 2e-16 ***
## poly(log_X1stFlrSF, degree = 2)2 -74080.8 43827.9 -1.690 0.091193 .
## poly(log_GrLivArea, degree = 2)1 806661.4 53667.2 15.031 < 2e-16 ***
## poly(log_GrLivArea, degree = 2)2 454717.3 43148.5 10.538 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36510 on 1447 degrees of freedom
## Multiple R-squared: 0.7905, Adjusted R-squared: 0.7888
## F-statistic: 455 on 12 and 1447 DF, p-value: < 2.2e-16
Let’s remove the predictors that have higher p-value (>0.05); log_X1stFlrSF and log_YearBuilt and see if there is any improvement in the model:
# Fit a reduced polynomial regression model: same as poly_model but without
# the log_YearBuilt and log_X1stFlrSF terms
poly_model1 <- lm(SalePrice ~ poly(log_LotArea, degree = 2) +
poly(log_OverallQual, degree = 2) +
poly(log_YearRemodAdd, degree = 2) +
poly(log_GrLivArea, degree = 2),
data = strongest_correlation_data)
# Print summary of the polynomial regression model
summary(poly_model1)
##
## Call:
## lm(formula = SalePrice ~ poly(log_LotArea, degree = 2) + poly(log_OverallQual,
## degree = 2) + poly(log_YearRemodAdd, degree = 2) + poly(log_GrLivArea,
## degree = 2), data = strongest_correlation_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -402637 -19644 -318 18278 291902
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 180921 1020 177.414 < 2e-16 ***
## poly(log_LotArea, degree = 2)1 541973 42668 12.702 < 2e-16 ***
## poly(log_LotArea, degree = 2)2 50852 39410 1.290 0.197
## poly(log_OverallQual, degree = 2)1 1433359 55972 25.609 < 2e-16 ***
## poly(log_OverallQual, degree = 2)2 606174 44284 13.688 < 2e-16 ***
## poly(log_YearRemodAdd, degree = 2)1 433681 47064 9.215 < 2e-16 ***
## poly(log_YearRemodAdd, degree = 2)2 52160 40332 1.293 0.196
## poly(log_GrLivArea, degree = 2)1 831695 52975 15.700 < 2e-16 ***
## poly(log_GrLivArea, degree = 2)2 352859 42857 8.233 4.01e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38970 on 1451 degrees of freedom
## Multiple R-squared: 0.7607, Adjusted R-squared: 0.7594
## F-statistic: 576.7 on 8 and 1451 DF, p-value: < 2.2e-16
# Fit an alternative reduced polynomial regression model: same as poly_model
# but without the log_LotArea and log_YearRemodAdd terms
poly_model2 <- lm(SalePrice ~poly(log_OverallQual, degree = 2) +
poly(log_YearBuilt, degree = 2) +
poly(log_X1stFlrSF, degree = 2) +
poly(log_GrLivArea, degree = 2),
data = strongest_correlation_data)
# Print summary of the polynomial regression model
summary(poly_model2)
##
## Call:
## lm(formula = SalePrice ~ poly(log_OverallQual, degree = 2) +
## poly(log_YearBuilt, degree = 2) + poly(log_X1stFlrSF, degree = 2) +
## poly(log_GrLivArea, degree = 2), data = strongest_correlation_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -408809 -18322 -457 16576 270578
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 180921.2 993.7 182.062 <2e-16 ***
## poly(log_OverallQual, degree = 2)1 1186683.5 61507.6 19.293 <2e-16 ***
## poly(log_OverallQual, degree = 2)2 524430.8 45549.9 11.513 <2e-16 ***
## poly(log_YearBuilt, degree = 2)1 537237.2 49726.7 10.804 <2e-16 ***
## poly(log_YearBuilt, degree = 2)2 15367.3 44359.3 0.346 0.7291
## poly(log_X1stFlrSF, degree = 2)1 584356.1 47366.3 12.337 <2e-16 ***
## poly(log_X1stFlrSF, degree = 2)2 -91936.2 42986.0 -2.139 0.0326 *
## poly(log_GrLivArea, degree = 2)1 893221.3 53662.9 16.645 <2e-16 ***
## poly(log_GrLivArea, degree = 2)2 492443.7 44408.4 11.089 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37970 on 1451 degrees of freedom
## Multiple R-squared: 0.7728, Adjusted R-squared: 0.7716
## F-statistic: 616.9 on 8 and 1451 DF, p-value: < 2.2e-16
Based on the adjusted R-squared, the original polynomial model using all predictors is better; around 79% of the variance in SalePrice is explained by those predictors. Let’s check the model by plotting its residuals:
# Residuals of the full polynomial model
residuals <- resid(poly_model)
# Residuals against fitted values, with a zero reference line
plot(
  x = fitted.values(poly_model),
  y = residuals,
  pch = 20,
  col = "blue",
  main = "Residuals vs. Fitted",
  xlab = "Fitted Values",
  ylab = "Residuals"
)
abline(h = 0, col = "red")
# Normal Q-Q plot of the same residuals, with its reference line
qqnorm(y = residuals, main = "QQ Plot of Residuals")
qqline(y = residuals)
So let’s use it to predict the sale prices of the houses.
# Build the model's log-transformed predictors from the TEST set, so that the
# predictions belong to the houses whose Ids are reported (predicting on the
# training data would only reproduce fitted values for the wrong houses).
# NOTE(review): assumes Test_DataFrame still holds these six raw columns with
# no missing or non-positive values — confirm before submitting.
test_predictors <- Test_DataFrame %>%
  transmute(
    log_LotArea = log(LotArea),
    log_OverallQual = log(OverallQual),
    log_YearBuilt = log(YearBuilt),
    log_YearRemodAdd = log(YearRemodAdd),
    log_X1stFlrSF = log(X1stFlrSF),
    log_GrLivArea = log(GrLivArea)
  )
# Create a data frame keyed by the test-set Ids
Predictions_df <- data.frame(Id = Test_DataFrame$Id)
# Predict sale prices using the fitted polynomial regression model
Predictions_df$Predicted_SalePrice <- predict(poly_model, newdata = test_predictors)
# Print the first few rows of the new data frame
# (re-run the document to refresh the printed values shown below)
head(Predictions_df)
## Id Predicted_SalePrice
## 1 1461 215078.8
## 2 1462 162700.1
## 3 1463 226393.9
## 4 1464 169629.1
## 5 1465 286207.5
## 6 1466 147025.6
Now let’s write the Predictions_df as .csv file and save it on the directory:
# Save the predictions to a CSV file without a row-name column
write.csv(
  x = Predictions_df,
  file = "predicted_sale_prices.csv",
  row.names = FALSE
)
In conclusion, let’s take a look at the most important variables:
# Unscaled variable importance for every term of the polynomial model
importance <- varImp(object = poly_model, scale = FALSE)
print(importance)
## Overall
## poly(log_LotArea, degree = 2)1 8.649806
## poly(log_LotArea, degree = 2)2 1.951072
## poly(log_OverallQual, degree = 2)1 19.490211
## poly(log_OverallQual, degree = 2)2 11.248829
## poly(log_YearBuilt, degree = 2)1 7.874847
## poly(log_YearBuilt, degree = 2)2 1.793597
## poly(log_YearRemodAdd, degree = 2)1 5.473382
## poly(log_YearRemodAdd, degree = 2)2 3.897771
## poly(log_X1stFlrSF, degree = 2)1 9.146617
## poly(log_X1stFlrSF, degree = 2)2 1.690264
## poly(log_GrLivArea, degree = 2)1 15.030809
## poly(log_GrLivArea, degree = 2)2 10.538427
# Arrange the data frame by its `Overall` importance column, descending.
# Sort on the column itself: `importance` inside arrange() is not a column and
# would resolve to the whole data frame.
importance_ordered <- importance %>%
  arrange(desc(Overall))
# Print the ordered variable importance data frame
print(importance_ordered)
## Overall
## poly(log_OverallQual, degree = 2)1 19.490211
## poly(log_GrLivArea, degree = 2)1 15.030809
## poly(log_OverallQual, degree = 2)2 11.248829
## poly(log_GrLivArea, degree = 2)2 10.538427
## poly(log_X1stFlrSF, degree = 2)1 9.146617
## poly(log_LotArea, degree = 2)1 8.649806
## poly(log_YearBuilt, degree = 2)1 7.874847
## poly(log_YearRemodAdd, degree = 2)1 5.473382
## poly(log_YearRemodAdd, degree = 2)2 3.897771
## poly(log_LotArea, degree = 2)2 1.951072
## poly(log_YearBuilt, degree = 2)2 1.793597
## poly(log_X1stFlrSF, degree = 2)2 1.690264
We can see that OverallQual is the most important variable affecting the sale prices, followed by GrLivArea. The least important term is the quadratic term of X1stFlrSF.