Introduction
# Read the house-price training CSV directly from its raw GitHub URL.
df <- read.csv(
  file = "https://raw.githubusercontent.com/engine2031/Data-Sets/main/house-price_train.csv"
)
Libraries
library(tidyverse)
library(GGally)
library(matrixcalc)
library(MASS)
library(scales)
#Note: The select function in MASS conflicts with dplyr
Descriptive and Inferential Statistics
Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
# Compact overview of the training data: one line per column with type and leading values.
dplyr::glimpse(df)
## Rows: 1,460
## Columns: 81
## $ Id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1~
## $ MSSubClass <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,~
## $ MSZoning <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM", "R~
## $ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, ~
## $ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612~
## $ Street <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", ~
## $ Alley <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ LotShape <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1", ~
## $ LandContour <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", ~
## $ Utilities <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllPu~
## $ LotConfig <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Inside", "I~
## $ LandSlope <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", ~
## $ Neighborhood <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoRidge", "~
## $ Condition1 <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm",~
## $ Condition2 <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", ~
## $ BldgType <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", ~
## $ HouseStyle <chr> "2Story", "1Story", "2Story", "2Story", "2Story", "1.5Fi~
## $ OverallQual <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,~
## $ OverallCond <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,~
## $ YearBuilt <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19~
## $ YearRemodAdd <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19~
## $ RoofStyle <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Gable", "G~
## $ RoofMatl <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg", "~
## $ Exterior1st <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "VinylSd", "~
## $ Exterior2nd <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "VinylSd", "~
## $ MasVnrType <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace", "None",~
## $ MasVnrArea <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, ~
## $ ExterQual <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", "TA", "T~
## $ ExterCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T~
## $ Foundation <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "Wood", "~
## $ BsmtQual <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", "TA", "T~
## $ BsmtCond <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", "TA", "T~
## $ BsmtExposure <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", "No", "N~
## $ BsmtFinType1 <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ", ~
## $ BsmtFinSF1 <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99~
## $ BsmtFinType2 <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "BLQ", ~
## $ BsmtFinSF2 <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ BsmtUnfSF <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17~
## $ TotalBsmtSF <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10~
## $ Heating <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", ~
## $ HeatingQC <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", "Gd", "E~
## $ CentralAir <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "~
## $ Electrical <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "S~
## $ X1stFlrSF <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, ~
## $ X2ndFlrSF <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,~
## $ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ GrLivArea <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10~
## $ BsmtFullBath <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,~
## $ BsmtHalfBath <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ FullBath <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,~
## $ HalfBath <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,~
## $ BedroomAbvGr <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,~
## $ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,~
## $ KitchenQual <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", "TA", "T~
## $ TotRmsAbvGrd <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6~
## $ Functional <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", ~
## $ Fireplaces <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,~
## $ FireplaceQu <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA", "TA", ~
## $ GarageType <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd", "Attch~
## $ GarageYrBlt <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19~
## $ GarageFinish <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn", "RFn", ~
## $ GarageCars <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,~
## $ GarageArea <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7~
## $ GarageQual <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "Fa", "G~
## $ GarageCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T~
## $ PavedDrive <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "~
## $ WoodDeckSF <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160~
## $ OpenPorchSF <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,~
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, ~
## $ X3SsnPorch <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ ScreenPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, ~
## $ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ PoolQC <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ Fence <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, NA,~
## $ MiscFeature <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, NA, NA, ~
## $ MiscVal <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,~
## $ MoSold <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10~
## $ YrSold <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20~
## $ SaleType <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "W~
## $ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal", "Norm~
## $ SalePrice <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, ~
# Rebuild the data frame with character columns converted to factors.
df <- as.data.frame(unclass(df), stringsAsFactors = TRUE)

# Month and year of sale are read in as integers; treat them as categories too.
df[c("MoSold", "YrSold")] <- lapply(df[c("MoSold", "YrSold")], as.factor)

# Univariate descriptive statistics for every column.
summary(df)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 C (all): 10 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 FV : 65 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 RH : 16 Median : 69.00
## Mean : 730.5 Mean : 56.9 RL :1151 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 RM : 218 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape LandContour Utilities
## Min. : 1300 Grvl: 6 Grvl: 50 IR1:484 Bnk: 63 AllPub:1459
## 1st Qu.: 7554 Pave:1454 Pave: 41 IR2: 41 HLS: 50 NoSeWa: 1
## Median : 9478 NA's:1369 IR3: 10 Low: 36
## Mean : 10517 Reg:925 Lvl:1311
## 3rd Qu.: 11602
## Max. :215245
##
## LotConfig LandSlope Neighborhood Condition1 Condition2
## Corner : 263 Gtl:1382 NAmes :225 Norm :1260 Norm :1445
## CulDSac: 94 Mod: 65 CollgCr:150 Feedr : 81 Feedr : 6
## FR2 : 47 Sev: 13 OldTown:113 Artery : 48 Artery : 2
## FR3 : 4 Edwards:100 RRAn : 26 PosN : 2
## Inside :1052 Somerst: 86 PosN : 19 RRNn : 2
## Gilbert: 79 RRAe : 11 PosA : 1
## (Other):707 (Other): 15 (Other): 2
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1Fam :1220 1Story :726 Min. : 1.000 Min. :1.000 Min. :1872
## 2fmCon: 31 2Story :445 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Duplex: 52 1.5Fin :154 Median : 6.000 Median :5.000 Median :1973
## Twnhs : 43 SLvl : 65 Mean : 6.099 Mean :5.575 Mean :1971
## TwnhsE: 114 SFoyer : 37 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## 1.5Unf : 14 Max. :10.000 Max. :9.000 Max. :2010
## (Other): 19
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## Min. :1950 Flat : 13 CompShg:1434 VinylSd:515 VinylSd:504
## 1st Qu.:1967 Gable :1141 Tar&Grv: 11 HdBoard:222 MetalSd:214
## Median :1994 Gambrel: 11 WdShngl: 6 MetalSd:220 HdBoard:207
## Mean :1985 Hip : 286 WdShake: 5 Wd Sdng:206 Wd Sdng:197
## 3rd Qu.:2004 Mansard: 7 ClyTile: 1 Plywood:108 Plywood:142
## Max. :2010 Shed : 2 Membran: 1 CemntBd: 61 CmentBd: 60
## (Other): 2 (Other):128 (Other):136
## MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual
## BrkCmn : 15 Min. : 0.0 Ex: 52 Ex: 3 BrkTil:146 Ex :121
## BrkFace:445 1st Qu.: 0.0 Fa: 14 Fa: 28 CBlock:634 Fa : 35
## None :864 Median : 0.0 Gd:488 Gd: 146 PConc :647 Gd :618
## Stone :128 Mean : 103.7 TA:906 Po: 1 Slab : 24 TA :649
## NA's : 8 3rd Qu.: 166.0 TA:1282 Stone : 6 NA's: 37
## Max. :1600.0 Wood : 3
## NA's :8
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Fa : 45 Av :221 ALQ :220 Min. : 0.0 ALQ : 19
## Gd : 65 Gd :134 BLQ :148 1st Qu.: 0.0 BLQ : 33
## Po : 2 Mn :114 GLQ :418 Median : 383.5 GLQ : 14
## TA :1311 No :953 LwQ : 74 Mean : 443.6 LwQ : 46
## NA's: 37 NA's: 38 Rec :133 3rd Qu.: 712.2 Rec : 54
## Unf :430 Max. :5644.0 Unf :1256
## NA's: 37 NA's: 38
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Floor: 1 Ex:741
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 GasA :1428 Fa: 49
## Median : 0.00 Median : 477.5 Median : 991.5 GasW : 18 Gd:241
## Mean : 46.55 Mean : 567.2 Mean :1057.4 Grav : 7 Po: 1
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2 OthW : 2 TA:428
## Max. :1474.00 Max. :2336.0 Max. :6110.0 Wall : 4
##
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## N: 95 FuseA: 94 Min. : 334 Min. : 0 Min. : 0.000
## Y:1365 FuseF: 27 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
## FuseP: 3 Median :1087 Median : 0 Median : 0.000
## Mix : 1 Mean :1163 Mean : 347 Mean : 5.845
## SBrkr:1334 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
## NA's : 1 Max. :4692 Max. :2065 Max. :572.000
##
## GrLivArea BsmtFullBath BsmtHalfBath FullBath
## Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
## Median :1464 Median :0.0000 Median :0.00000 Median :2.000
## Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
## 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
## Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
##
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## Min. :0.0000 Min. :0.000 Min. :0.000 Ex:100 Min. : 2.000
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 Fa: 39 1st Qu.: 5.000
## Median :0.0000 Median :3.000 Median :1.000 Gd:586 Median : 6.000
## Mean :0.3829 Mean :2.866 Mean :1.047 TA:735 Mean : 6.518
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :2.0000 Max. :8.000 Max. :3.000 Max. :14.000
##
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## Maj1: 14 Min. :0.000 Ex : 24 2Types : 6 Min. :1900
## Maj2: 5 1st Qu.:0.000 Fa : 33 Attchd :870 1st Qu.:1961
## Min1: 31 Median :1.000 Gd :380 Basment: 19 Median :1980
## Min2: 34 Mean :0.613 Po : 20 BuiltIn: 88 Mean :1979
## Mod : 15 3rd Qu.:1.000 TA :313 CarPort: 9 3rd Qu.:2002
## Sev : 1 Max. :3.000 NA's:690 Detchd :387 Max. :2010
## Typ :1360 NA's : 81 NA's :81
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## Fin :352 Min. :0.000 Min. : 0.0 Ex : 3 Ex : 2
## RFn :422 1st Qu.:1.000 1st Qu.: 334.5 Fa : 48 Fa : 35
## Unf :605 Median :2.000 Median : 480.0 Gd : 14 Gd : 9
## NA's: 81 Mean :1.767 Mean : 473.0 Po : 3 Po : 7
## 3rd Qu.:2.000 3rd Qu.: 576.0 TA :1311 TA :1326
## Max. :4.000 Max. :1418.0 NA's: 81 NA's: 81
##
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## N: 90 Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## P: 30 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Y:1340 Median : 0.00 Median : 25.00 Median : 0.00 Median : 0.00
## Mean : 94.24 Mean : 46.66 Mean : 21.95 Mean : 3.41
## 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :857.00 Max. :547.00 Max. :552.00 Max. :508.00
##
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## Min. : 0.00 Min. : 0.000 Ex : 2 GdPrv: 59 Gar2: 2
## 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2 GdWo : 54 Othr: 2
## Median : 0.00 Median : 0.000 Gd : 3 MnPrv: 157 Shed: 49
## Mean : 15.06 Mean : 2.759 NA's:1453 MnWw : 11 TenC: 1
## 3rd Qu.: 0.00 3rd Qu.: 0.000 NA's :1179 NA's:1406
## Max. :480.00 Max. :738.000
##
## MiscVal MoSold YrSold SaleType SaleCondition
## Min. : 0.00 6 :253 2006:314 WD :1267 Abnorml: 101
## 1st Qu.: 0.00 7 :234 2007:329 New : 122 AdjLand: 4
## Median : 0.00 5 :204 2008:304 COD : 43 Alloca : 12
## Mean : 43.49 4 :141 2009:338 ConLD : 9 Family : 20
## 3rd Qu.: 0.00 8 :122 2010:175 ConLI : 5 Normal :1198
## Max. :15500.00 3 :106 ConLw : 5 Partial: 125
## (Other):400 (Other): 9
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
# Bar chart counting the sales recorded under each YrSold level.
ggplot(df, aes(x = YrSold)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Houses Sold Per Year") +
  theme(plot.title = element_text(hjust = 0.5))
# Bar chart counting the sales recorded under each MoSold level.
# The x aesthetic is MoSold so that the bars agree with the
# "Houses Sold Per Month" title (mapping YrSold here would just repeat
# the per-year chart).
ggplot(data = df, aes(x = MoSold)) +
  geom_bar(stat = "count", fill = "steelblue") +
  ggtitle("Houses Sold Per Month") +
  theme(plot.title = element_text(hjust = 0.5))
# Histogram of construction year, in 5-year bins.
ggplot(df, aes(x = YearBuilt)) +
  geom_histogram(binwidth = 5, color = "black", fill = "steelblue") +
  labs(title = "Original Construction Year") +
  theme(plot.title = element_text(hjust = 0.5))
# Histogram of above-grade living area, in bins 100 units wide.
ggplot(df, aes(x = GrLivArea)) +
  geom_histogram(binwidth = 100, color = "black", fill = "steelblue") +
  labs(title = "Above Grade Living Area (sq. ft.)") +
  theme(plot.title = element_text(hjust = 0.5))
# Histogram of sale price, in bins 25,000 wide, with comma-formatted axis labels.
ggplot(df, aes(x = SalePrice)) +
  geom_histogram(binwidth = 25000, color = "black", fill = "steelblue") +
  labs(title = "House Sale Price") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_continuous(labels = scales::comma)
# Scatterplot matrix of five predictors together with SalePrice.
# dplyr::select is namespaced because MASS also exports a select().
GGally::ggpairs(
  dplyr::select(
    df,
    GrLivArea, LotArea, YearBuilt, TotalBsmtSF, GarageArea, SalePrice
  )
)
Computing the Correlation Matrix & Test
# Subset of quantitative columns used for the correlation analysis.
# dplyr::select is namespaced because MASS also exports a select().
house_select <- dplyr::select(
  df,
  GrLivArea, LotArea, YearBuilt, TotalBsmtSF, GarageArea, SalePrice
)

# Pairwise correlation matrix of the subset, displayed to two decimals.
house_corr <- cor(house_select)
round(house_corr, digits = 2)
## GrLivArea LotArea YearBuilt TotalBsmtSF GarageArea SalePrice
## GrLivArea 1.00 0.26 0.20 0.45 0.47 0.71
## LotArea 0.26 1.00 0.01 0.26 0.18 0.26
## YearBuilt 0.20 0.01 1.00 0.39 0.48 0.52
## TotalBsmtSF 0.45 0.26 0.39 1.00 0.49 0.61
## GarageArea 0.47 0.18 0.48 0.49 1.00 0.62
## SalePrice 0.71 0.26 0.52 0.61 0.62 1.00
# Labelled correlation heat map of the same six columns.
# midpoint = NULL lets ggcorr pick the gradient midpoint itself (it reports
# the chosen value in a message); the colour scale is limited to [0, 1].
ggcorr(
  data = dplyr::select(
    df,
    GrLivArea, LotArea, YearBuilt, TotalBsmtSF, GarageArea, SalePrice
  ),
  label = TRUE,
  midpoint = NULL,
  limits = c(0, 1)
)
## Color gradient midpoint set at median correlation to 0.45
# Test H0: correlation between GrLivArea and SalePrice is 0; report an 80% CI.
cor.test(
  house_select$GrLivArea,
  house_select$SalePrice,
  conf.level = 0.8
)
##
## Pearson's product-moment correlation
##
## data: house_select$GrLivArea and house_select$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6915087 0.7249450
## sample estimates:
## cor
## 0.7086245
# Test H0: correlation between LotArea and SalePrice is 0; report an 80% CI.
cor.test(
  house_select$LotArea,
  house_select$SalePrice,
  conf.level = 0.8
)
##
## Pearson's product-moment correlation
##
## data: house_select$LotArea and house_select$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2323391 0.2947946
## sample estimates:
## cor
## 0.2638434
# Test H0: correlation between TotalBsmtSF and SalePrice is 0; report an 80% CI.
cor.test(
  house_select$TotalBsmtSF,
  house_select$SalePrice,
  conf.level = 0.8
)
##
## Pearson's product-moment correlation
##
## data: house_select$TotalBsmtSF and house_select$SalePrice
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5922142 0.6340846
## sample estimates:
## cor
## 0.6135806
Resources & References
http://www.sthda.com/english/wiki/correlation-test-between-two-variables-in-r
Linear Algebra and Correlation
Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
Precision Matrix
# Invert the correlation matrix to obtain the precision matrix, then display it.
precision_mat <- matrixcalc::matrix.inverse(house_corr)
precision_mat
## GrLivArea LotArea YearBuilt TotalBsmtSF GarageArea
## GrLivArea 2.23400269 -0.09476284 0.5666501 -0.08353876 -0.22455087
## LotArea -0.09476284 1.12994186 0.1975152 -0.19161366 -0.04197554
## YearBuilt 0.56665007 0.19751516 1.6471193 -0.16695955 -0.41405940
## TotalBsmtSF -0.08353876 -0.19161366 -0.1669595 1.69561538 -0.23836356
## GarageArea -0.22455087 -0.04197554 -0.4140594 -0.23836356 1.79106740
## SalePrice -1.66311643 -0.19051725 -0.9543487 -0.69473698 -0.58364472
## SalePrice
## GrLivArea -1.6631164
## LotArea -0.1905173
## YearBuilt -0.9543487
## TotalBsmtSF -0.6947370
## GarageArea -0.5836447
## SalePrice 3.5179577
\(A*A^{-1}*A\)
# Correlation matrix times its inverse times the correlation matrix again:
# the first product is the identity (up to rounding), so the result
# reproduces the correlation matrix.
(house_corr %*% precision_mat) %*% house_corr
## GrLivArea LotArea YearBuilt TotalBsmtSF GarageArea SalePrice
## GrLivArea 1.0000000 0.26311617 0.19900971 0.4548682 0.4689975 0.7086245
## LotArea 0.2631162 1.00000000 0.01422765 0.2608331 0.1804028 0.2638434
## YearBuilt 0.1990097 0.01422765 1.00000000 0.3914520 0.4789538 0.5228973
## TotalBsmtSF 0.4548682 0.26083313 0.39145200 1.0000000 0.4866655 0.6135806
## GarageArea 0.4689975 0.18040276 0.47895382 0.4866655 1.0000000 0.6234314
## SalePrice 0.7086245 0.26384335 0.52289733 0.6135806 0.6234314 1.0000000
LU Decomposition
# LU-decompose the correlation matrix and show the lower-triangular factor.
lu_house_corr <- matrixcalc::lu.decomposition(house_corr)
lu_house_corr[["L"]]
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.0000000 0.00000000 0.0000000 0.0000000 0.0000000 0
## [2,] 0.2631162 1.00000000 0.0000000 0.0000000 0.0000000 0
## [3,] 0.1990097 -0.04097148 1.0000000 0.0000000 0.0000000 0
## [4,] 0.4548682 0.15164861 0.3198806 1.0000000 0.0000000 0
## [5,] 0.4689975 0.06124171 0.4046110 0.2087212 1.0000000 0
## [6,] 0.7086245 0.08314923 0.4015769 0.2321108 0.1659044 1
# Upper-triangular factor of the LU decomposition.
lu_house_corr[["U"]]
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1 0.2631162 0.19900971 0.4548682 0.46899748 0.70862448
## [2,] 0 0.9307699 -0.03813502 0.1411500 0.05700194 0.07739280
## [3,] 0 0.0000000 0.95883269 0.3067119 0.38795422 0.38504508
## [4,] 0 0.0000000 0.00000000 0.6735785 0.14059015 0.15634487
## [5,] 0 0.0000000 0.00000000 0.0000000 0.59023579 0.09792272
## [6,] 0 0.0000000 0.00000000 0.0000000 0.00000000 0.28425584
Calculus-Based Probability & Statistics
Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data.
The house sale price data is used for this exercise. Based on the summary statistics in the prior chunks, the minimum value (34,900) is already above zero, so no shift is needed.
# Maximum-likelihood fit of an exponential distribution to SalePrice;
# printing the fit shows the estimated rate with its standard error.
fit_exp <- MASS::fitdistr(x = df$SalePrice, densfun = "exponential")
fit_exp
## rate
## 5.527268e-06
## (1.446552e-07)
# Simulate sale prices from the fitted exponential distribution.
# The sample size equals the number of training rows (1460) so the simulated
# histogram is directly comparable with the original data.
# The rate is taken from fit_exp instead of a hand-copied, rounded literal so
# the simulation always agrees with the fit, and the seed is fixed so the
# draw (and the histogram built from it) is reproducible between runs.
set.seed(1460)
SalePrice_Sim <- rexp(nrow(df), rate = fit_exp$estimate[["rate"]])
df2 <- data.frame(SalePrice_Sim)
# Histogram of the observed sale prices (bins 25,000 wide) for comparison
# with the simulated sample.
ggplot(df, aes(x = SalePrice)) +
  geom_histogram(binwidth = 25000, color = "black", fill = "steelblue") +
  labs(title = "Actual House Sale Price") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_continuous(labels = scales::comma)
# Histogram of the exponential simulation, using the same bin width as the
# histogram of the observed prices.
ggplot(df2, aes(x = SalePrice_Sim)) +
  geom_histogram(binwidth = 25000, color = "black", fill = "steelblue") +
  labs(title = "Simulation House Sale Price") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_continuous(labels = scales::comma)
The 5th and 95th percentile of the exponential distribution
# 5th percentile of the fitted exponential distribution.
qexp(p = 0.05, rate = fit_exp$estimate)
## [1] 9280.044
# 95th percentile of the fitted exponential distribution.
qexp(p = 0.95, rate = fit_exp$estimate)
## [1] 541991.5
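The normal distribution 95% Confidence Interval. Based on the calculations below, which take the 5th and 95th percentiles of the fitted normal distribution, the lower and upper limits are $50,294 and $311,547. Note that these percentiles enclose the central 90% of the fitted distribution; the 2.5th and 97.5th percentiles would be needed for a 95% interval.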
# Maximum-likelihood fit of a normal distribution to SalePrice;
# printing the fit shows the estimated mean and sd with their standard errors.
fit_normal <- MASS::fitdistr(x = df$SalePrice, densfun = "normal")
fit_normal
## mean sd
## 180921.196 79415.292
## ( 2078.393) ( 1469.646)
# Lower and upper limits taken as the 0.05 and 0.95 quantiles of the fitted
# normal distribution (estimate[1] is the mean, estimate[2] the sd).
# NOTE(review): quantiles 0.05/0.95 enclose the central 90% of the
# distribution; a 95% interval would use 0.025/0.975 — confirm which is intended.
CI95_Lower <- qnorm(
  p = 0.05,
  mean = fit_normal$estimate[1],
  sd = fit_normal$estimate[2]
)
CI95_Upper <- qnorm(
  p = 0.95,
  mean = fit_normal$estimate[1],
  sd = fit_normal$estimate[2]
)
CI95_Lower
## [1] 50294.66
# Display the upper limit computed from the fitted normal distribution.
print(CI95_Upper)
## [1] 311547.7
Resources & References
Modeling
Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com username and score.
1st Model Iteration: In this first iteration, we see that Overall Condition Rating, Full Bath, Half Bath, Garage Type, Garage Area, Month Sold and Year Sold were not significant factors in the sale price. These variables will be removed from the model.
# First model iteration: regress SalePrice on the full candidate predictor set.
# The term order is kept as-is because anova() on a single model tests the
# terms sequentially, in the order they appear in the formula.
houseprices_lm1 <- lm(
  SalePrice ~ LotFrontage + LotArea + OverallQual + OverallCond +
    YearBuilt + YearRemodAdd + Foundation + TotalBsmtSF + GrLivArea +
    FullBath + HalfBath + BedroomAbvGr + KitchenQual + GarageType +
    GarageCars + GarageArea + MoSold + YrSold,
  data = df
)
anova(houseprices_lm1)
## Analysis of Variance Table
##
## Response: SalePrice
## Df Sum Sq Mean Sq F value Pr(>F)
## LotFrontage 1 9.3275e+11 9.3275e+11 668.7165 < 2.2e-16 ***
## LotArea 1 2.2368e+11 2.2368e+11 160.3654 < 2.2e-16 ***
## OverallQual 1 4.1163e+12 4.1163e+12 2951.1102 < 2.2e-16 ***
## OverallCond 1 4.8377e+08 4.8377e+08 0.3468 0.5560372
## YearBuilt 1 6.8292e+10 6.8292e+10 48.9602 4.564e-12 ***
## YearRemodAdd 1 2.1950e+10 2.1950e+10 15.7363 7.759e-05 ***
## Foundation 5 3.3075e+10 6.6150e+09 4.7425 0.0002721 ***
## TotalBsmtSF 1 1.6386e+11 1.6386e+11 117.4737 < 2.2e-16 ***
## GrLivArea 1 4.1225e+11 4.1225e+11 295.5532 < 2.2e-16 ***
## FullBath 1 3.3993e+09 3.3993e+09 2.4371 0.1187878
## HalfBath 1 1.6863e+07 1.6863e+07 0.0121 0.9124683
## BedroomAbvGr 1 2.4867e+10 2.4867e+10 17.8278 2.621e-05 ***
## KitchenQual 3 1.6954e+11 5.6514e+10 40.5167 < 2.2e-16 ***
## GarageType 5 3.3100e+09 6.6200e+08 0.4746 0.7953842
## GarageCars 1 8.5930e+10 8.5930e+10 61.6055 1.002e-14 ***
## GarageArea 1 2.2821e+08 2.2821e+08 0.1636 0.6859327
## MoSold 11 1.6685e+10 1.5168e+09 1.0875 0.3677355
## YrSold 4 6.7595e+09 1.6899e+09 1.2115 0.3041389
## Residuals 1085 1.5134e+12 1.3948e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
2nd Model Iteration: In this model we see that the majority of the variables were significant factors. The foundation type of the house was the only variable with a significance code of ** rather than ***, so this variable will be removed.
# Second model iteration: a reduced predictor set that still includes Foundation.
# The term order is kept as-is because anova() on a single model tests the
# terms sequentially, in the order they appear in the formula.
houseprices_lm2 <- lm(
  SalePrice ~ LotFrontage + LotArea + OverallQual + YearBuilt +
    YearRemodAdd + Foundation + TotalBsmtSF + GrLivArea + BedroomAbvGr +
    KitchenQual + GarageCars,
  data = df
)
anova(houseprices_lm2)
## Analysis of Variance Table
##
## Response: SalePrice
## Df Sum Sq Mean Sq F value Pr(>F)
## LotFrontage 1 1.0327e+12 1.0327e+12 735.3910 < 2.2e-16 ***
## LotArea 1 2.6598e+11 2.6598e+11 189.3958 < 2.2e-16 ***
## OverallQual 1 4.3846e+12 4.3846e+12 3122.1243 < 2.2e-16 ***
## YearBuilt 1 7.5909e+10 7.5909e+10 54.0526 3.634e-13 ***
## YearRemodAdd 1 3.8964e+10 3.8964e+10 27.7452 1.643e-07 ***
## Foundation 5 2.8651e+10 5.7302e+09 4.0804 0.00112 **
## TotalBsmtSF 1 1.7724e+11 1.7724e+11 126.2088 < 2.2e-16 ***
## GrLivArea 1 4.0061e+11 4.0061e+11 285.2656 < 2.2e-16 ***
## BedroomAbvGr 1 3.6477e+10 3.6477e+10 25.9740 4.025e-07 ***
## KitchenQual 3 1.7159e+11 5.7198e+10 40.7290 < 2.2e-16 ***
## GarageCars 1 7.0501e+10 7.0501e+10 50.2021 2.377e-12 ***
## Residuals 1183 1.6613e+12 1.4043e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Final Model
# Final model: the second iteration's predictors without Foundation.
# The term order is kept as-is because anova() on a single model tests the
# terms sequentially, in the order they appear in the formula.
houseprices_lm3 <- lm(
  SalePrice ~ LotFrontage + LotArea + OverallQual + YearBuilt +
    YearRemodAdd + TotalBsmtSF + GrLivArea + BedroomAbvGr + KitchenQual +
    GarageCars,
  data = df
)
anova(houseprices_lm3)
## Analysis of Variance Table
##
## Response: SalePrice
## Df Sum Sq Mean Sq F value Pr(>F)
## LotFrontage 1 1.0327e+12 1.0327e+12 736.155 < 2.2e-16 ***
## LotArea 1 2.6598e+11 2.6598e+11 189.593 < 2.2e-16 ***
## OverallQual 1 4.3846e+12 4.3846e+12 3125.367 < 2.2e-16 ***
## YearBuilt 1 7.5909e+10 7.5909e+10 54.109 3.527e-13 ***
## YearRemodAdd 1 3.8964e+10 3.8964e+10 27.774 1.618e-07 ***
## TotalBsmtSF 1 1.6404e+11 1.6404e+11 116.929 < 2.2e-16 ***
## GrLivArea 1 4.3299e+11 4.3299e+11 308.640 < 2.2e-16 ***
## BedroomAbvGr 1 3.7235e+10 3.7235e+10 26.542 3.017e-07 ***
## KitchenQual 3 1.7524e+11 5.8413e+10 41.637 < 2.2e-16 ***
## GarageCars 1 7.0285e+10 7.0285e+10 50.100 2.493e-12 ***
## Residuals 1188 1.6666e+12 1.4029e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Diagnostic plots for the final linear model.
plot(x = houseprices_lm3)
Testing the Model
# Importing Test Data
df_test <- read.csv(
  file = "https://raw.githubusercontent.com/engine2031/Data-Sets/main/house-price_test.csv"
)

# Selecting variables to match linear model
# (dplyr::select is namespaced because MASS also exports a select()).
df_test2 <- dplyr::select(
  df_test,
  LotFrontage, LotArea, OverallQual, YearBuilt, YearRemodAdd,
  TotalBsmtSF, GrLivArea, BedroomAbvGr, KitchenQual, GarageCars
)

# Prediction Results
house_price_predict <- predict(houseprices_lm3, newdata = df_test2)

# Formatting Results
# The vector is converted under its own name, so the resulting single column
# is called house_price_predict (see the head() output).
house_price_predict <- as.data.frame(house_price_predict)
head(house_price_predict)
## house_price_predict
## 1 108629.4
## 2 157646.6
## 3 174636.7
## 4 193167.7
## 5 213208.0
## 6 184054.3
# Shape the predictions into an Id / SalePrice table:
# rename the prediction column, then insert the test-set Id column before it.
house_price_predict <- house_price_predict %>%
  rename(SalePrice = house_price_predict) %>%
  add_column(Id = df_test$Id, .before = "SalePrice")
head(house_price_predict)
## Id SalePrice
## 1 1461 108629.4
## 2 1462 157646.6
## 3 1463 174636.7
## 4 1464 193167.7
## 5 1465 213208.0
## 6 1466 184054.3
# Write the predictions to a CSV file.
# The prediction set included NA values. These are replaced with 0 in the
# written file via na = "0".
# row.names = FALSE keeps the file to exactly the Id and SalePrice columns;
# the write.csv default would prepend an extra, unnamed row-number column.
# NOTE(review): a 0 sale price is a placeholder rather than an estimate —
# consider imputing the missing predictors before predicting instead.
write.csv(
  house_price_predict,
  file = "Ames-House-Prediction_ER.csv",
  na = "0",
  row.names = FALSE
)
Results
Results Image