##########################################################
# Load the data
# set current directory
setwd("/Users/whinton/src/rstudio/tim8521")
train_data <- read.csv("train.csv", header = TRUE, sep= ",", stringsAsFactors = TRUE)
test_data <- read.csv("test.csv", header = TRUE, sep= ",",stringsAsFactors = TRUE)
##########################################################
###########################################################
# Exploration of the Data and Correlation Analysis
###########################################################
# Preview data
str(train_data)
## 'data.frame': 1460 obs. of 81 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
## $ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
## $ Alley : Factor w/ 2 levels "Grvl","Pave": NA NA NA NA NA NA NA NA NA NA ...
## $ LotShape : Factor w/ 4 levels "IR1","IR2","IR3",..: 4 4 1 1 1 1 4 1 4 4 ...
## $ LandContour : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Utilities : Factor w/ 2 levels "AllPub","NoSeWa": 1 1 1 1 1 1 1 1 1 1 ...
## $ LotConfig : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
## $ LandSlope : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
## $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 14 12 21 17 18 4 ...
## $ Condition1 : Factor w/ 9 levels "Artery","Feedr",..: 3 2 3 3 3 3 3 5 1 1 ...
## $ Condition2 : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 1 ...
## $ BldgType : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ HouseStyle : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 3 6 6 6 1 3 6 1 2 ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ RoofMatl : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Exterior1st : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
## $ Exterior2nd : Factor w/ 16 levels "AsbShng","AsphShn",..: 14 9 14 16 14 14 14 7 16 9 ...
## $ MasVnrType : Factor w/ 4 levels "BrkCmn","BrkFace",..: 2 3 2 3 2 3 4 4 3 3 ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 4 3 4 3 4 4 4 ...
## $ ExterCond : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ Foundation : Factor w/ 6 levels "BrkTil","CBlock",..: 3 2 3 1 3 6 3 2 1 1 ...
## $ BsmtQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 3 4 3 3 1 3 4 4 ...
## $ BsmtCond : Factor w/ 4 levels "Fa","Gd","Po",..: 4 4 4 2 4 4 4 4 4 4 ...
## $ BsmtExposure : Factor w/ 4 levels "Av","Gd","Mn",..: 4 2 3 4 1 4 1 3 4 4 ...
## $ BsmtFinType1 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 3 1 3 1 3 3 3 1 6 3 ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 6 6 6 6 6 6 6 2 6 6 ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ HeatingQC : Factor w/ 5 levels "Ex","Fa","Gd",..: 1 1 1 3 1 1 1 1 3 1 ...
## $ CentralAir : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
## $ Electrical : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 3 3 4 3 4 4 4 ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 3 7 ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : Factor w/ 5 levels "Ex","Fa","Gd",..: NA 5 5 3 5 NA 3 5 5 5 ...
## $ GarageType : Factor w/ 6 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 6 2 ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : Factor w/ 3 levels "Fin","RFn","Unf": 2 2 2 3 2 3 2 2 3 2 ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 2 3 ...
## $ GarageCond : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ PavedDrive : Factor w/ 3 levels "N","P","Y": 3 3 3 3 3 3 3 3 3 3 ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : Factor w/ 3 levels "Ex","Fa","Gd": NA NA NA NA NA NA NA NA NA NA ...
## $ Fence : Factor w/ 4 levels "GdPrv","GdWo",..: NA NA NA NA NA 3 NA NA NA NA ...
## $ MiscFeature : Factor w/ 4 levels "Gar2","Othr",..: NA NA NA NA NA 3 NA 3 NA NA ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : Factor w/ 9 levels "COD","Con","ConLD",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
summary(train_data)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 C (all): 10 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 FV : 65 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 RH : 16 Median : 69.00
## Mean : 730.5 Mean : 56.9 RL :1151 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 RM : 218 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape LandContour Utilities
## Min. : 1300 Grvl: 6 Grvl: 50 IR1:484 Bnk: 63 AllPub:1459
## 1st Qu.: 7554 Pave:1454 Pave: 41 IR2: 41 HLS: 50 NoSeWa: 1
## Median : 9478 NA's:1369 IR3: 10 Low: 36
## Mean : 10517 Reg:925 Lvl:1311
## 3rd Qu.: 11602
## Max. :215245
##
## LotConfig LandSlope Neighborhood Condition1 Condition2
## Corner : 263 Gtl:1382 NAmes :225 Norm :1260 Norm :1445
## CulDSac: 94 Mod: 65 CollgCr:150 Feedr : 81 Feedr : 6
## FR2 : 47 Sev: 13 OldTown:113 Artery : 48 Artery : 2
## FR3 : 4 Edwards:100 RRAn : 26 PosN : 2
## Inside :1052 Somerst: 86 PosN : 19 RRNn : 2
## Gilbert: 79 RRAe : 11 PosA : 1
## (Other):707 (Other): 15 (Other): 2
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1Fam :1220 1Story :726 Min. : 1.000 Min. :1.000 Min. :1872
## 2fmCon: 31 2Story :445 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Duplex: 52 1.5Fin :154 Median : 6.000 Median :5.000 Median :1973
## Twnhs : 43 SLvl : 65 Mean : 6.099 Mean :5.575 Mean :1971
## TwnhsE: 114 SFoyer : 37 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## 1.5Unf : 14 Max. :10.000 Max. :9.000 Max. :2010
## (Other): 19
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## Min. :1950 Flat : 13 CompShg:1434 VinylSd:515 VinylSd:504
## 1st Qu.:1967 Gable :1141 Tar&Grv: 11 HdBoard:222 MetalSd:214
## Median :1994 Gambrel: 11 WdShngl: 6 MetalSd:220 HdBoard:207
## Mean :1985 Hip : 286 WdShake: 5 Wd Sdng:206 Wd Sdng:197
## 3rd Qu.:2004 Mansard: 7 ClyTile: 1 Plywood:108 Plywood:142
## Max. :2010 Shed : 2 Membran: 1 CemntBd: 61 CmentBd: 60
## (Other): 2 (Other):128 (Other):136
## MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual
## BrkCmn : 15 Min. : 0.0 Ex: 52 Ex: 3 BrkTil:146 Ex :121
## BrkFace:445 1st Qu.: 0.0 Fa: 14 Fa: 28 CBlock:634 Fa : 35
## None :864 Median : 0.0 Gd:488 Gd: 146 PConc :647 Gd :618
## Stone :128 Mean : 103.7 TA:906 Po: 1 Slab : 24 TA :649
## NA's : 8 3rd Qu.: 166.0 TA:1282 Stone : 6 NA's: 37
## Max. :1600.0 Wood : 3
## NA's :8
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Fa : 45 Av :221 ALQ :220 Min. : 0.0 ALQ : 19
## Gd : 65 Gd :134 BLQ :148 1st Qu.: 0.0 BLQ : 33
## Po : 2 Mn :114 GLQ :418 Median : 383.5 GLQ : 14
## TA :1311 No :953 LwQ : 74 Mean : 443.6 LwQ : 46
## NA's: 37 NA's: 38 Rec :133 3rd Qu.: 712.2 Rec : 54
## Unf :430 Max. :5644.0 Unf :1256
## NA's: 37 NA's: 38
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Floor: 1 Ex:741
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 GasA :1428 Fa: 49
## Median : 0.00 Median : 477.5 Median : 991.5 GasW : 18 Gd:241
## Mean : 46.55 Mean : 567.2 Mean :1057.4 Grav : 7 Po: 1
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2 OthW : 2 TA:428
## Max. :1474.00 Max. :2336.0 Max. :6110.0 Wall : 4
##
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## N: 95 FuseA: 94 Min. : 334 Min. : 0 Min. : 0.000
## Y:1365 FuseF: 27 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
## FuseP: 3 Median :1087 Median : 0 Median : 0.000
## Mix : 1 Mean :1163 Mean : 347 Mean : 5.845
## SBrkr:1334 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
## NA's : 1 Max. :4692 Max. :2065 Max. :572.000
##
## GrLivArea BsmtFullBath BsmtHalfBath FullBath
## Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
## Median :1464 Median :0.0000 Median :0.00000 Median :2.000
## Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
## 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
## Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
##
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## Min. :0.0000 Min. :0.000 Min. :0.000 Ex:100 Min. : 2.000
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 Fa: 39 1st Qu.: 5.000
## Median :0.0000 Median :3.000 Median :1.000 Gd:586 Median : 6.000
## Mean :0.3829 Mean :2.866 Mean :1.047 TA:735 Mean : 6.518
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :2.0000 Max. :8.000 Max. :3.000 Max. :14.000
##
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## Maj1: 14 Min. :0.000 Ex : 24 2Types : 6 Min. :1900
## Maj2: 5 1st Qu.:0.000 Fa : 33 Attchd :870 1st Qu.:1961
## Min1: 31 Median :1.000 Gd :380 Basment: 19 Median :1980
## Min2: 34 Mean :0.613 Po : 20 BuiltIn: 88 Mean :1979
## Mod : 15 3rd Qu.:1.000 TA :313 CarPort: 9 3rd Qu.:2002
## Sev : 1 Max. :3.000 NA's:690 Detchd :387 Max. :2010
## Typ :1360 NA's : 81 NA's :81
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## Fin :352 Min. :0.000 Min. : 0.0 Ex : 3 Ex : 2
## RFn :422 1st Qu.:1.000 1st Qu.: 334.5 Fa : 48 Fa : 35
## Unf :605 Median :2.000 Median : 480.0 Gd : 14 Gd : 9
## NA's: 81 Mean :1.767 Mean : 473.0 Po : 3 Po : 7
## 3rd Qu.:2.000 3rd Qu.: 576.0 TA :1311 TA :1326
## Max. :4.000 Max. :1418.0 NA's: 81 NA's: 81
##
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## N: 90 Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## P: 30 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Y:1340 Median : 0.00 Median : 25.00 Median : 0.00 Median : 0.00
## Mean : 94.24 Mean : 46.66 Mean : 21.95 Mean : 3.41
## 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :857.00 Max. :547.00 Max. :552.00 Max. :508.00
##
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## Min. : 0.00 Min. : 0.000 Ex : 2 GdPrv: 59 Gar2: 2
## 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2 GdWo : 54 Othr: 2
## Median : 0.00 Median : 0.000 Gd : 3 MnPrv: 157 Shed: 49
## Mean : 15.06 Mean : 2.759 NA's:1453 MnWw : 11 TenC: 1
## 3rd Qu.: 0.00 3rd Qu.: 0.000 NA's :1179 NA's:1406
## Max. :480.00 Max. :738.000
##
## MiscVal MoSold YrSold SaleType
## Min. : 0.00 Min. : 1.000 Min. :2006 WD :1267
## 1st Qu.: 0.00 1st Qu.: 5.000 1st Qu.:2007 New : 122
## Median : 0.00 Median : 6.000 Median :2008 COD : 43
## Mean : 43.49 Mean : 6.322 Mean :2008 ConLD : 9
## 3rd Qu.: 0.00 3rd Qu.: 8.000 3rd Qu.:2009 ConLI : 5
## Max. :15500.00 Max. :12.000 Max. :2010 ConLw : 5
## (Other): 9
## SaleCondition SalePrice
## Abnorml: 101 Min. : 34900
## AdjLand: 4 1st Qu.:129975
## Alloca : 12 Median :163000
## Family : 20 Mean :180921
## Normal :1198 3rd Qu.:214000
## Partial: 125 Max. :755000
##
##########################################################
# Preprocess Data
##########################################################
# Check for Erroneous Data Before Handling Missing Values
# List of numeric columns where negative could be invalid
invalid_negative_columns <- c("LotFrontage", "LotArea", "YearBuilt", "YearRemodAdd",
"MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
"TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
"GrLivArea", "GarageYrBlt", "GarageArea", "WoodDeckSF",
"OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch",
"PoolArea", "MiscVal", "SalePrice")
# Detect erroneous data in train and test datasets
erroneous_train <- detect_erroneous_data(train_data, invalid_negative_columns)
erroneous_test <- detect_erroneous_data(test_data, invalid_negative_columns)
# Print detected erroneous data
if (length(erroneous_train) > 0) {
cat("\n### Erroneous Data Found in Train Dataset ###\n")
print(erroneous_train)
} else {
cat("\nNo erroneous values found in Train Dataset.\n")
}
##
## No erroneous values found in Train Dataset.
if (length(erroneous_test) > 0) {
cat("\n### Erroneous Data Found in Test Dataset ###\n")
print(erroneous_test)
} else {
cat("\nNo erroneous values found in Test Dataset.\n")
}
##
## No erroneous values found in Test Dataset.
# Identify columns with missing values
missing_values <- colSums(is.na(train_data))
missing_cols <- names(missing_values[missing_values > 0])
missing_cols_train <- missing_cols
print(missing_cols)
## [1] "LotFrontage" "Alley" "MasVnrType" "MasVnrArea" "BsmtQual"
## [6] "BsmtCond" "BsmtExposure" "BsmtFinType1" "BsmtFinType2" "Electrical"
## [11] "FireplaceQu" "GarageType" "GarageYrBlt" "GarageFinish" "GarageQual"
## [16] "GarageCond" "PoolQC" "Fence" "MiscFeature"
# Imputation of missing values
for (col in missing_cols) {
if (is.numeric(train_data[[col]])) {
train_data[[col]][is.na(train_data[[col]])] <- mean(train_data[[col]], na.rm = TRUE)
} else {
train_data[[col]][is.na(train_data[[col]])] <- as.character(names(sort(table(train_data[[col]]), decreasing = TRUE)[1]))
}
}
# Apply the same imputation for test_data
missing_vals_test <- colSums(is.na(test_data))
missing_cols_test <- names(missing_vals_test[missing_vals_test > 0])
for (col in missing_cols_test) {
if (col %in% missing_cols_train) {
if (is.numeric(test_data[[col]])) {
test_data[[col]][is.na(test_data[[col]])] <- mean(train_data[[col]], na.rm = TRUE) # Use train_data mean
} else {
test_data[[col]][is.na(test_data[[col]])] <- as.character(names(sort(table(train_data[[col]]), decreasing = TRUE)[1])) # Use train_data mode
}
}
}
# Show Missingness Map After Imputation
missmap(train_data)
# Encode categorical variables
train_data <- train_data %>% mutate_if(is.factor, as.numeric)
##########################################################
# Build Initial Multiple Regression Model (Top 5 Features)
##########################################################
# Select only numeric features
numeric_features <- select_if(train_data, is.numeric)
# Compute correlation matrix
cor_matrix <- cor(numeric_features, use = "complete.obs")
# Identify the top 5 features most correlated with SalePrice (including SalePrice itself)
cor_target <- sort(cor_matrix["SalePrice",], decreasing = TRUE)
top_features <- names(head(cor_target, 5)) # Select top predictors + SalePrice
# Subset correlation matrix to include only top 40 correlated features
cor_top_5 <- cor_matrix[top_features, top_features]
# Generate the correlation heatmap for the top 5 correlated features
pheatmap(cor_top_5,
display_numbers = TRUE,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "complete",
color = colorRampPalette(c("blue", "white", "red"))(200),
main = "Top 5 Features Correlated with SalePrice")
# Select only the top 5 highly correlated features, including SalePrice
train_data_top5 <- train_data[, c(top_features)]
# Construct formula using only these features
formula_top5 <- as.formula(paste("SalePrice ~", paste(setdiff(top_features, "SalePrice"), collapse = " + ")))
# Build multiple regression model
lm_model_top5 <- lm(formula_top5, data = train_data_top5)
# Print model summary
summary(lm_model_top5)
##
## Call:
## lm(formula = formula_top5, data = train_data_top5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -372594 -21236 -1594 18625 301129
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -98436.050 4820.467 -20.420 < 2e-16 ***
## OverallQual 26988.854 1067.393 25.285 < 2e-16 ***
## GrLivArea 49.573 2.555 19.402 < 2e-16 ***
## GarageCars 11317.522 3126.297 3.620 0.000305 ***
## GarageArea 41.478 10.627 3.903 9.93e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40420 on 1455 degrees of freedom
## Multiple R-squared: 0.7418, Adjusted R-squared: 0.7411
## F-statistic: 1045 on 4 and 1455 DF, p-value: < 2.2e-16
# Validate Model Using Cross-Validation
set.seed(123)
cv_control <- trainControl(method = "cv", number = 10)
cv_model_top5 <- train(formula_top5, data = train_data_top5, method = "lm", trControl = cv_control, metric = "RMSE")
# Print cross-validation results
print(cv_model_top5$results)
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 40239.58 0.7447829 27143.76 6808.893 0.06748995 2476.471
# Check Residuals After Optimization
par(mfrow = c(2,2)) # Arrange plots in a 2x2 grid
plot(lm_model_top5) # Residual plots
# Features with a high number of outliers may be a candidate for removal
##########################################################
# Print outliers for each of the top correlated features
outlier_list <- list()
for (feature in top_features[-1]) { # Exclude SalePrice
outlier_data <- detect_outliers(train_data, feature)
outlier_list[[feature]] <- outlier_data
if (nrow(outlier_data) > 0) {
print(paste("Outliers detected in:", feature))
print(outlier_data)
}
}
## [1] "Outliers detected in: OverallQual"
## Id OverallQual
## 1 376 1
## 2 534 1
## [1] "Outliers detected in: GrLivArea"
## Id GrLivArea
## 1 59 2945
## 2 119 3222
## 3 186 3608
## 4 198 3112
## 5 232 2794
## 6 305 3493
## 7 325 2978
## 8 497 3228
## 9 524 4676
## 10 584 2775
## 11 609 3194
## 12 636 3395
## 13 692 4316
## 14 770 3279
## 15 799 3140
## 16 804 2822
## 17 962 2872
## 18 1025 2898
## 19 1032 3082
## 20 1047 2868
## 21 1143 2828
## 22 1170 3627
## 23 1174 3086
## 24 1176 2872
## 25 1183 4476
## 26 1269 3447
## 27 1299 5642
## 28 1313 2810
## 29 1329 2792
## 30 1354 3238
## 31 1387 2784
## [1] "Outliers detected in: GarageCars"
## Id GarageCars
## 1 421 4
## 2 748 4
## 3 1191 4
## 4 1341 4
## 5 1351 4
## [1] "Outliers detected in: GarageArea"
## Id GarageArea
## 1 179 1166
## 2 225 968
## 3 271 1053
## 4 318 1025
## 5 409 947
## 6 582 1390
## 7 665 1134
## 8 719 983
## 9 804 1020
## 10 826 1220
## 11 1062 1248
## 12 1088 1043
## 13 1143 1052
## 14 1185 995
## 15 1191 1356
## 16 1229 1052
## 17 1242 954
## 18 1269 1014
## 19 1299 1418
## 20 1351 968
## 21 1418 1069
##########################################################
# Visualize Outliers Using Boxplots
##########################################################
# Generate boxplots for each of the top features
boxplot_list <- list()
for (feature in top_features[-1]) {
p <- ggplot(train_data, aes_string(x = "1", y = feature)) +
geom_boxplot(fill = "lightblue", outlier.color = "red", outlier.shape = 16) +
labs(title = paste(" ", feature), x = "", y = feature) +
theme_minimal()
boxplot_list[[feature]] <- p
}
# Display boxplots in a grid format
grid.arrange(grobs = boxplot_list, ncol = 4)
##########################################################
# Correlation Analysis with Selecte Features
##########################################################
# Convert categorical variables to numeric
train_data$Neighborhood <- as.numeric(factor(train_data$Neighborhood))
train_data$ExterQual <- as.numeric(factor(train_data$ExterQual))
train_data$BsmtQual <- as.numeric(factor(train_data$BsmtQual))
# Select numeric features for correlation analysis
numeric_features <- select_if(train_data, is.numeric)
# Compute correlation matrix
cor_matrix <- cor(numeric_features, use = "complete.obs")
# Identify top 10 features most correlated with SalePrice
cor_target <- sort(cor_matrix["SalePrice",], decreasing = TRUE)
top_features <- names(head(cor_target, 10))
# Manually add additional recommended features
additional_features <- c("Neighborhood", "LotFrontage", "ExterQual", "BsmtQual")
# Ensure the additional features exist in train_data
additional_features <- additional_features[additional_features %in% names(train_data)]
# Remove high-outlier features before finalizing features list (Exclude SalePrice)
features_to_remove <- c("GrLivArea", "GarageArea")
final_features <- setdiff(unique(c(top_features, additional_features, "SalePrice")), features_to_remove)
# Subset correlation matrix for selected features
cor_selected <- cor_matrix[final_features, final_features]
# Generate correlation heatmap
pheatmap(cor_selected,
display_numbers = TRUE,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "complete",
color = colorRampPalette(c("blue", "white", "red"))(200),
main = "Correlation Matrix for Selected Features")
##########################################################
# Optimize and Rebuild Model (Removing High VIF Features)
##########################################################
# Step 1: Subset Data with Selected Features
train_data_top <- train_data[, final_features] # Ensure SalePrice is included
# Step 2: Identify High VIF Features
lm_model_top <- lm(SalePrice ~ ., data = train_data_top)
vif_values <- vif(lm_model_top)
# Identify the top features with the highest VIF
top_vif_features <- names(vif_values[vif_values > 5])
# Step 3: Remove High VIF Features (Excluding SalePrice)
features_to_remove_vif <- setdiff(top_vif_features, "SalePrice")
train_data_filtered <- train_data_top %>% dplyr::select(-all_of(features_to_remove_vif))
# Step 4: Rebuild Model with Optimized Features
optimized_features <- setdiff(names(train_data_filtered), "SalePrice")
formula_optimized <- as.formula(paste("SalePrice ~", paste(optimized_features, collapse = " + ")))
lm_model_optimized <- lm(formula_optimized, data = train_data_filtered)
# Step 5: Print Model Summary After Optimization
summary(lm_model_optimized)
##
## Call:
## lm(formula = formula_optimized, data = train_data_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -405663 -19851 -1088 15772 339206
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.789e+05 9.645e+04 -1.855 0.06379 .
## OverallQual 2.003e+04 1.208e+03 16.585 < 2e-16 ***
## GarageCars 1.352e+04 1.857e+03 7.282 5.39e-13 ***
## TotalBsmtSF 1.429e+01 4.363e+00 3.274 0.00108 **
## X1stFlrSF 2.605e+01 4.925e+00 5.288 1.43e-07 ***
## FullBath 1.365e+03 2.606e+03 0.524 0.60061
## TotRmsAbvGrd 8.805e+03 8.464e+02 10.402 < 2e-16 ***
## YearBuilt 9.432e+01 4.839e+01 1.949 0.05150 .
## Neighborhood 1.440e+02 1.744e+02 0.826 0.40913
## LotFrontage 1.090e+02 5.141e+01 2.121 0.03409 *
## ExterQual -1.386e+04 2.015e+03 -6.878 9.00e-12 ***
## BsmtQual -1.144e+04 1.607e+03 -7.116 1.75e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38360 on 1448 degrees of freedom
## Multiple R-squared: 0.7686, Adjusted R-squared: 0.7669
## F-statistic: 437.3 on 11 and 1448 DF, p-value: < 2.2e-16
# Step 6: Validate with Cross-Validation
set.seed(123)
cv_control <- trainControl(method = "cv", number = 10)
cv_model_optimized <- train(formula_optimized, data = train_data_filtered, method = "lm", trControl = cv_control, metric = "RMSE")
# Print updated cross-validation results
print(cv_model_optimized$results)
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 38334.16 0.7688749 25291.06 8515.063 0.08354885 2874.598
# Step 7: Check Residuals After Optimization
par(mfrow = c(2,2))
plot(lm_model_optimized)
This study performed by Will Hinton