You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following.
Pick one of the quantitative independent variables from the training data set (train.csv), and define that variable as X. Make sure this variable is skewed to the right! Pick the dependent variable and define it as Y.
Loading packages and data
library(readr)
library(dplyr)
library(ggplot2)
# Read the Kaggle training set from the local download location
train_path <- 'C:/Users/aleja/Downloads/train.csv'
data <- read_csv(train_path)
# First look at the data: show the first rows of the tibble
head(data)
## # A tibble: 6 × 81
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 1 60 RL 65 8450 Pave <NA> Reg
## 2 2 20 RL 80 9600 Pave <NA> Reg
## 3 3 60 RL 68 11250 Pave <NA> IR1
## 4 4 70 RL 60 9550 Pave <NA> IR1
## 5 5 60 RL 84 14260 Pave <NA> IR1
## 6 6 50 RL 85 14115 Pave <NA> IR1
## # ℹ 73 more variables: LandContour <chr>, Utilities <chr>, LotConfig <chr>,
## # LandSlope <chr>, Neighborhood <chr>, Condition1 <chr>, Condition2 <chr>,
## # BldgType <chr>, HouseStyle <chr>, OverallQual <dbl>, OverallCond <dbl>,
## # YearBuilt <dbl>, YearRemodAdd <dbl>, RoofStyle <chr>, RoofMatl <chr>,
## # Exterior1st <chr>, Exterior2nd <chr>, MasVnrType <chr>, MasVnrArea <dbl>,
## # ExterQual <chr>, ExterCond <chr>, Foundation <chr>, BsmtQual <chr>,
## # BsmtCond <chr>, BsmtExposure <chr>, BsmtFinType1 <chr>, BsmtFinSF1 <dbl>, …
## spc_tbl_ [1,460 × 81] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Id : num [1:1460] 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : num [1:1460] 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr [1:1460] "RL" "RL" "RL" "RL" ...
## $ LotFrontage : num [1:1460] 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : num [1:1460] 8450 9600 11250 9550 14260 ...
## $ Street : chr [1:1460] "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr [1:1460] NA NA NA NA ...
## $ LotShape : chr [1:1460] "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr [1:1460] "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr [1:1460] "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr [1:1460] "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr [1:1460] "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr [1:1460] "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr [1:1460] "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr [1:1460] "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr [1:1460] "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr [1:1460] "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : num [1:1460] 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : num [1:1460] 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : num [1:1460] 2003 1976 2001 1915 2000 ...
## $ YearRemodAdd : num [1:1460] 2003 1976 2002 1970 2000 ...
## $ RoofStyle : chr [1:1460] "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr [1:1460] "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr [1:1460] "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : num [1:1460] 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr [1:1460] "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ Foundation : chr [1:1460] "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr [1:1460] "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr [1:1460] "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr [1:1460] "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr [1:1460] "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : num [1:1460] 706 978 486 216 655 ...
## $ BsmtFinType2 : chr [1:1460] "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : num [1:1460] 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : num [1:1460] 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : num [1:1460] 856 1262 920 756 1145 ...
## $ Heating : chr [1:1460] "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr [1:1460] "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr [1:1460] "Y" "Y" "Y" "Y" ...
## $ Electrical : chr [1:1460] "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ 1stFlrSF : num [1:1460] 856 1262 920 961 1145 ...
## $ 2ndFlrSF : num [1:1460] 854 0 866 756 1053 ...
## $ LowQualFinSF : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : num [1:1460] 1710 1262 1786 1717 2198 ...
## $ BsmtFullBath : num [1:1460] 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : num [1:1460] 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : num [1:1460] 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : num [1:1460] 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : num [1:1460] 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : num [1:1460] 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr [1:1460] "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : num [1:1460] 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr [1:1460] "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : num [1:1460] 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr [1:1460] NA "TA" "TA" "Gd" ...
## $ GarageType : chr [1:1460] "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : num [1:1460] 2003 1976 2001 1998 2000 ...
## $ GarageFinish : chr [1:1460] "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : num [1:1460] 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : num [1:1460] 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr [1:1460] "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : num [1:1460] 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : num [1:1460] 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: num [1:1460] 0 0 0 272 0 0 0 228 205 0 ...
## $ 3SsnPorch : num [1:1460] 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr [1:1460] NA NA NA NA ...
## $ Fence : chr [1:1460] NA NA NA NA ...
## $ MiscFeature : chr [1:1460] NA NA NA NA ...
## $ MiscVal : num [1:1460] 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : num [1:1460] 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : num [1:1460] 2008 2007 2008 2006 2008 ...
## $ SaleType : chr [1:1460] "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr [1:1460] "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : num [1:1460] 208500 181500 223500 140000 250000 ...
## - attr(*, "spec")=
## .. cols(
## .. Id = col_double(),
## .. MSSubClass = col_double(),
## .. MSZoning = col_character(),
## .. LotFrontage = col_double(),
## .. LotArea = col_double(),
## .. Street = col_character(),
## .. Alley = col_character(),
## .. LotShape = col_character(),
## .. LandContour = col_character(),
## .. Utilities = col_character(),
## .. LotConfig = col_character(),
## .. LandSlope = col_character(),
## .. Neighborhood = col_character(),
## .. Condition1 = col_character(),
## .. Condition2 = col_character(),
## .. BldgType = col_character(),
## .. HouseStyle = col_character(),
## .. OverallQual = col_double(),
## .. OverallCond = col_double(),
## .. YearBuilt = col_double(),
## .. YearRemodAdd = col_double(),
## .. RoofStyle = col_character(),
## .. RoofMatl = col_character(),
## .. Exterior1st = col_character(),
## .. Exterior2nd = col_character(),
## .. MasVnrType = col_character(),
## .. MasVnrArea = col_double(),
## .. ExterQual = col_character(),
## .. ExterCond = col_character(),
## .. Foundation = col_character(),
## .. BsmtQual = col_character(),
## .. BsmtCond = col_character(),
## .. BsmtExposure = col_character(),
## .. BsmtFinType1 = col_character(),
## .. BsmtFinSF1 = col_double(),
## .. BsmtFinType2 = col_character(),
## .. BsmtFinSF2 = col_double(),
## .. BsmtUnfSF = col_double(),
## .. TotalBsmtSF = col_double(),
## .. Heating = col_character(),
## .. HeatingQC = col_character(),
## .. CentralAir = col_character(),
## .. Electrical = col_character(),
## .. `1stFlrSF` = col_double(),
## .. `2ndFlrSF` = col_double(),
## .. LowQualFinSF = col_double(),
## .. GrLivArea = col_double(),
## .. BsmtFullBath = col_double(),
## .. BsmtHalfBath = col_double(),
## .. FullBath = col_double(),
## .. HalfBath = col_double(),
## .. BedroomAbvGr = col_double(),
## .. KitchenAbvGr = col_double(),
## .. KitchenQual = col_character(),
## .. TotRmsAbvGrd = col_double(),
## .. Functional = col_character(),
## .. Fireplaces = col_double(),
## .. FireplaceQu = col_character(),
## .. GarageType = col_character(),
## .. GarageYrBlt = col_double(),
## .. GarageFinish = col_character(),
## .. GarageCars = col_double(),
## .. GarageArea = col_double(),
## .. GarageQual = col_character(),
## .. GarageCond = col_character(),
## .. PavedDrive = col_character(),
## .. WoodDeckSF = col_double(),
## .. OpenPorchSF = col_double(),
## .. EnclosedPorch = col_double(),
## .. `3SsnPorch` = col_double(),
## .. ScreenPorch = col_double(),
## .. PoolArea = col_double(),
## .. PoolQC = col_character(),
## .. Fence = col_character(),
## .. MiscFeature = col_character(),
## .. MiscVal = col_double(),
## .. MoSold = col_double(),
## .. YrSold = col_double(),
## .. SaleType = col_character(),
## .. SaleCondition = col_character(),
## .. SalePrice = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical 1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch 3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical 1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch 3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
Looking for NA in Train data
#Check for missing values in the dataset
# Count the NA entries in every column (named numeric vector, one entry per column)
missing_values <- vapply(data, function(column) sum(is.na(column)), numeric(1))
# Show only the columns that contain at least one NA
missing_values[which(missing_values > 0)]
## LotFrontage Alley MasVnrType MasVnrArea BsmtQual BsmtCond
## 259 1369 8 8 37 37
## BsmtExposure BsmtFinType1 BsmtFinType2 Electrical FireplaceQu GarageType
## 38 37 38 1 690 81
## GarageYrBlt GarageFinish GarageQual GarageCond PoolQC Fence
## 81 81 81 81 1453 1179
## MiscFeature
## 1406
#Replacing missing values with Mean/Mode
#Numerical variables: fill NAs with the mean of the observed values
numeric_na_cols <- c("LotFrontage", "MasVnrArea", "GarageYrBlt")
for (col in numeric_na_cols) {
  data[[col]][is.na(data[[col]])] <- mean(data[[col]], na.rm = TRUE)
}

#Categorical variables: fill NAs with the most frequent observed level.
# table() drops NA by default, so the mode is taken over observed values only.
# names() is required here: indexing the sorted table directly yields the
# *count* of the top level, and as.character() of that is a number-string
# (e.g. "50"), not the level itself.
most_frequent_level <- function(values) {
  counts <- table(values)
  names(counts)[which.max(counts)]
}

# NOTE(review): for several of these columns NA may encode "feature absent"
# rather than an unrecorded value - confirm against the data description
# before relying on mode imputation.
categorical_na_cols <- c(
  "Alley", "MasVnrType", "BsmtQual", "BsmtCond", "BsmtExposure",
  "BsmtFinType1", "BsmtFinType2", "Electrical", "FireplaceQu",
  "GarageType", "GarageFinish", "GarageQual", "GarageCond",
  "PoolQC", "Fence", "MiscFeature"
)
for (col in categorical_na_cols) {
  is_missing <- is.na(data[[col]])
  # Skip columns with nothing to fill, or with no observed level to fill from
  if (any(is_missing) && !all(is_missing)) {
    data[[col]][is_missing] <- most_frequent_level(data[[col]])
  }
}
## [1] FALSE
#Extracting quantitative variables
# dplyr:: is spelled out because MASS, attached later in this document,
# masks select().
quantitative_vars <- dplyr::select(data, where(is.numeric))
#Plotting histograms for quantitative variables
# One ggplot per numeric column. A fixed number of bins is used instead of a
# fixed bin width because the columns span very different scales (ratings of
# 1-10 up to prices in the hundreds of thousands); a single width of 50 gives
# one bin for some variables and thousands for others.
plots <- lapply(names(quantitative_vars), function(var) {
  ggplot(data, aes(x = .data[[var]])) +
    geom_histogram(bins = 30, fill = "skyblue", color = "black") +
    labs(title = paste("Histogram of", var), x = var, y = "Frequency")
})
#Printing histograms
print(plots)
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
##
## [[28]]
##
## [[29]]
##
## [[30]]
##
## [[31]]
##
## [[32]]
##
## [[33]]
##
## [[34]]
##
## [[35]]
##
## [[36]]
##
## [[37]]
##
## [[38]]
Probability. Calculate as a minimum the below probabilities a through
c. Assume the small letter “x” is estimated as the 3rd quartile of the X
variable, and the small letter “y” is estimated as the 2nd quartile of
the Y variable. Interpret the meaning of all probabilities. In addition,
make a table of counts as shown below.
a. P(X>x | Y>y) b. P(X>x, Y>y) c. P(X<x | Y>y)
| x/y | <=2nd quartile | >2nd quartile | Total |
|---|---|---|---|
| <=3rd quartile | | | |
| >3rd quartile | | | |
| Total | | | |
# Thresholds: x is the 3rd quartile of LotArea (X), y is the median of SalePrice (Y)
x <- quantile(data$LotArea, probs = 0.75, na.rm = TRUE)
y <- quantile(data$SalePrice, probs = 0.5, na.rm = TRUE)

# Copy of the data with two 0/1 indicator columns: X above x, Y above y
data_binary <- mutate(
  data,
  X_greater_than_x = as.numeric(LotArea > x),
  Y_greater_than_y = as.numeric(SalePrice > y)
)

# Logical views of the indicators and the shared denominator (rows with Y > y)
x_high <- data_binary$X_greater_than_x == 1
y_high <- data_binary$Y_greater_than_y == 1
n_y_high <- sum(y_high)

# a. P(X > x | Y > y): share of above-median-price rows that also have a large lot
P_X_greater_x_given_Y_greater_y <- sum(x_high & y_high) / n_y_high
# b. P(X > x, Y > y): share of all rows that are above both thresholds
P_X_greater_x_and_Y_greater_y <- sum(x_high & y_high) / nrow(data_binary)
# c. Complement of (a): computed from rows with X not greater than x
P_X_less_x_given_Y_greater_y <- sum(!x_high & y_high) / n_y_high

# 2x2 table of counts: rows are the X indicator, columns are the Y indicator
contingency_table <- table(data_binary$X_greater_than_x, data_binary$Y_greater_than_y)

# Report the first probability
print(paste("P(X > x | Y > y):", P_X_greater_x_given_Y_greater_y))
## [1] "P(X > x | Y > y): 0.379120879120879"
## [1] "P(X > x, Y > y): 0.189041095890411"
## [1] "P(X < x | Y > y): 0.620879120879121"
##
## 0 1
## 0 643 452
## 1 89 276
# Chi-square test of independence between the two indicators in the 2x2 table
# (correct = TRUE is the default and applies Yates' continuity correction)
chi_square_test <- chisq.test(x = contingency_table, correct = TRUE)
# Show the test statistic, degrees of freedom and p-value
print(chi_square_test)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: contingency_table
## X-squared = 127.74, df = 1, p-value < 2.2e-16
Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y. Provide a 95% CI for the difference in the mean of the variables. Derive a correlation matrix for two of the quantitative variables you selected. Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval. Discuss the meaning of your analysis.
Summary Statistics for LotArea and SalePrice:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1300 7554 9478 10517 11602 215245
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
Histograms for LotArea and SalePrice:
# Histogram for LotArea
hist(data$LotArea, main = "Histogram of LotArea", xlab = "LotArea", col = "blue")
# Histogram for SalePrice
hist(data$SalePrice, main = "Histogram of SalePrice", xlab = "SalePrice", col = "green")
Scatterplot of LotArea and SalePrice:
# SalePrice (vertical axis) against LotArea (horizontal axis), solid points
plot(data$LotArea, data$SalePrice, main = "Scatterplot of LotArea vs SalePrice", xlab = "LotArea", ylab = "SalePrice", pch = 19)
95% Confidence Interval for the Difference in Means:
##
## Welch Two Sample t-test
##
## data: data$LotArea and data$SalePrice
## t = -81.321, df = 1505.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -174514.7 -166294.1
## sample estimates:
## mean of x mean of y
## 10516.83 180921.20
Correlation Matrix:
# Correlation matrix of LotArea and SalePrice, using only rows where both are observed
correlation_matrix <- cor(data[c("LotArea", "SalePrice")], use = "complete.obs")
correlation_matrix
## LotArea SalePrice
## LotArea 1.0000000 0.2638434
## SalePrice 0.2638434 1.0000000
Hypothesis Test for Zero Correlation:
##
## Pearson's product-moment correlation
##
## data: data$LotArea and data$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2154574 0.3109369
## sample estimates:
## cor
## 0.2638434
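This analysis indicates a statistically significant but weak positive linear association between LotArea and SalePrice (r ≈ 0.26, p < 2.2e-16); correlation alone does not establish that lot area drives price. Note that the interval printed above is at the 95% level; the requested 99% interval is obtained with cor.test(..., conf.level = 0.99) and is approximately 0.20 to 0.33, which also excludes 0.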
Invert the Correlation Matrix:
#With correlation_matrix already calculated
# solve() with a single matrix argument returns its inverse; the inverse of a
# correlation matrix is the precision matrix.
precision_matrix <- solve(correlation_matrix)
# Auto-print the result
precision_matrix## LotArea SalePrice
## LotArea 1.0748219 -0.2835846
## SalePrice -0.2835846 1.0748219
Multiply Correlation Matrix by Precision Matrix:
## LotArea SalePrice
## LotArea 1 0
## SalePrice 0 1
Principal Component Analysis (PCA):
## Importance of components:
## PC1 PC2
## Standard deviation 1.1242 0.8580
## Proportion of Variance 0.6319 0.3681
## Cumulative Proportion 0.6319 1.0000
PC1, with its larger standard deviation and higher proportion of variance explained, is the dominant principal component and contains the most important information about the original variables.
PC2 captures additional variance not explained by PC1 but to a lesser extent.
Together, PC1 and PC2 capture all the variability in the data, indicating that they are the most important components for representing the relationships among the original variables.
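Because only two variables (LotArea and SalePrice) enter the PCA, two components necessarily reproduce all of the variance; the informative result is that PC1 alone accounts for about 63% of it, which reflects the modest positive correlation between the two variables.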
Fit Exponential Distribution to LotArea:
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Translate LotArea so that its smallest value becomes 1
shifted_lotarea <- data$LotArea - min(data$LotArea) + 1
# Maximum-likelihood fit of an exponential distribution to the shifted values
fit <- fitdistr(shifted_lotarea, densfun = "exponential")
# Keep the estimated rate parameter and show it
lambda <- fit[["estimate"]]
lambda
## rate
## 0.0001084854
Generate 1000 Samples from Exponential Distribution:
# Fix the RNG seed so the simulated sample and its histograms are reproducible
set.seed(123)
# Draw 1000 values from the exponential distribution with the fitted rate
samples <- rexp(1000, rate = lambda)
hist(samples, main = "Histogram of Exponential Samples", xlab = "Value", col = "red")
Compare Histogram of Samples with Original Data:
# Two panels side by side; par() returns the previous settings so the
# layout can be restored once both histograms are drawn.
old_par <- par(mfrow = c(1, 2))
hist(shifted_lotarea, main = "Histogram of Shifted LotArea", xlab = "Shifted LotArea", col = "blue")
hist(samples, main = "Histogram of Exponential Samples", xlab = "Value", col = "red")
par(old_par)
Calculate 5th and 95th Percentiles using the Exponential PDF:
## [1] 472.8128 27614.1451
95% Confidence Interval from Empirical Data:
# t-based 95% confidence interval for the MEAN of the shifted LotArea values
# (normality assumed for the sample mean; this is not a percentile range).
mean_lotarea <- mean(shifted_lotarea)
sd_lotarea <- sd(shifted_lotarea)
n <- length(shifted_lotarea)
# Half-width: two-sided 97.5% t quantile times the standard error sd / sqrt(n)
error <- qt(0.975, df = n-1) * sd_lotarea / sqrt(n)
ci_lower <- mean_lotarea - error
ci_upper <- mean_lotarea + error
# Auto-print the lower bound
ci_lower## [1] 8705.418
## [1] 9730.238
Empirical 5th and 95th Percentiles:
## 5% 95%
## 2012.70 16102.15
The wide gap between the empirical 5th and 95th percentiles (about 2,013 and 16,102 for the shifted LotArea) shows that lot sizes are spread over a broad range.
The fitted exponential distribution places its 5th and 95th percentiles at about 473 and 27,614, far outside the empirical values on both sides, so the exponential model puts too much probability in both tails and is a poor description of LotArea. The 95% confidence interval above (about 8,705 to 9,730) is an interval for the mean and is not comparable to these percentile ranges.
Build and Submit a Regression Model:
# Convert the tibble to a base data.frame before modelling
data <- as.data.frame(data)

# Fit an ordinary least-squares model of SalePrice on every other column.
# NOTE(review): "~ ." also pulls in Id, which is a row identifier - confirm
# that it is meant to be a predictor.
model <- lm(formula = SalePrice ~ ., data = data)

# Evaluate on the same rows the model was fitted to.
# NOTE(review): these are in-sample metrics, not a hold-out estimate.
predictions <- predict(model, newdata = data)
squared_errors <- (data$SalePrice - predictions)^2
mse <- mean(squared_errors)
rmse <- sqrt(mse)
rsquared <- summary(model)[["r.squared"]]

# Report the mean squared error
cat("Mean Squared Error (MSE):", mse, "\n")
## Mean Squared Error (MSE): 420675612
## Root Mean Squared Error (RMSE): 20510.38
## R-squared: 0.933298
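These results suggest that the linear regression model fits the training data well: an R-squared of about 0.93 means it explains roughly 93% of the variance in SalePrice, with an RMSE of about 20,510. However, these metrics were computed on the same data that was used to fit the model, so they are likely to overstate how accurately it will predict prices for unseen houses. The model should be validated on held-out data (or with cross-validation) and refined if needed.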
## Warning: not plotting observations with leverage one:
## 121, 186, 272, 326, 333, 347, 376, 399, 584, 596, 667, 811, 945, 949, 1004, 1012, 1188, 1231, 1271, 1276, 1299, 1322, 1371, 1380, 1387
# Count the NA entries in every column of the train data
missing_values <- colSums(is.na(data))
# Names of the columns that contain at least one NA
vars_with_missing <- names(which(missing_values > 0))
# Report the result, handling the "nothing missing" case first
if (length(vars_with_missing) == 0) {
  cat("No missing values found in the train data.\n")
} else {
  cat("Variables with missing values:\n")
  print(vars_with_missing)
}
## No missing values found in the train data.
Trying to improve model and reduce Multicollinearity
# Collect the column names of train_data and display them
column_names <- names(train_data)
print(column_names)
## [1] "Id" "MSSubClass" "MSZoning" "LotFrontage"
## [5] "LotArea" "Street" "Alley" "LotShape"
## [9] "LandContour" "Utilities" "LotConfig" "LandSlope"
## [13] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [17] "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
## [21] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior1st"
## [25] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
## [29] "ExterCond" "Foundation" "BsmtQual" "BsmtCond"
## [33] "BsmtExposure" "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2"
## [37] "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "Heating"
## [41] "HeatingQC" "CentralAir" "Electrical" "1stFlrSF"
## [45] "2ndFlrSF" "LowQualFinSF" "GrLivArea" "BsmtFullBath"
## [49] "BsmtHalfBath" "FullBath" "HalfBath" "BedroomAbvGr"
## [53] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Functional"
## [57] "Fireplaces" "FireplaceQu" "GarageType" "GarageYrBlt"
## [61] "GarageFinish" "GarageCars" "GarageArea" "GarageQual"
## [65] "GarageCond" "PavedDrive" "WoodDeckSF" "OpenPorchSF"
## [69] "EnclosedPorch" "3SsnPorch" "ScreenPorch" "PoolArea"
## [73] "PoolQC" "Fence" "MiscFeature" "MiscVal"
## [77] "MoSold" "YrSold" "SaleType" "SaleCondition"
## [81] "SalePrice"
# Selecting only numeric columns. vapply() always yields a logical mask with
# one entry per column (sapply() can change its return type on unusual
# input), and drop = FALSE keeps a data frame even if only one column matches.
numeric_columns <- train_data[
  ,
  vapply(train_data, is.numeric, logical(1)),
  drop = FALSE
]
# Calculating pairwise correlations and extracting the SalePrice column
correlations <- cor(numeric_columns, use = "pairwise.complete.obs")
saleprice_correlations <- correlations[, "SalePrice"]
# Sorting correlations by absolute value in descending order, excluding
# SalePrice itself
sorted_correlations <- sort(abs(saleprice_correlations), decreasing = TRUE)
sorted_correlations <- sorted_correlations[
  names(sorted_correlations) != "SalePrice"
]
# Selecting top N features (top 10). head() returns at most top_n names, so a
# data set with fewer candidate features does not produce NA column names
# (which `[1:top_n]` would).
top_n <- 10
top_features <- head(names(sorted_correlations), top_n)
# Creating a new data frame with the selected top features and SalePrice
selected_columns <- c(top_features, "SalePrice")
filtered_data <- train_data[, selected_columns, drop = FALSE]
# Calculating the correlation matrix for the filtered data
correlation_matrix_filtered <- cor(filtered_data, use = "pairwise.complete.obs")
# Printing the filtered correlation matrix
print(correlation_matrix_filtered)
## OverallQual GrLivArea GarageCars GarageArea TotalBsmtSF 1stFlrSF
## OverallQual 1.0000000 0.5930074 0.6006707 0.5620218 0.5378085 0.4762238
## GrLivArea 0.5930074 1.0000000 0.4672474 0.4689975 0.4548682 0.5660240
## GarageCars 0.6006707 0.4672474 1.0000000 0.8824754 0.4345848 0.4393168
## GarageArea 0.5620218 0.4689975 0.8824754 1.0000000 0.4866655 0.4897817
## TotalBsmtSF 0.5378085 0.4548682 0.4345848 0.4866655 1.0000000 0.8195300
## 1stFlrSF 0.4762238 0.5660240 0.4393168 0.4897817 0.8195300 1.0000000
## FullBath 0.5505997 0.6300116 0.4696720 0.4056562 0.3237224 0.3806375
## TotRmsAbvGrd 0.4274523 0.8254894 0.3622886 0.3378221 0.2855726 0.4095160
## YearBuilt 0.5723228 0.1990097 0.5378501 0.4789538 0.3914520 0.2819859
## YearRemodAdd 0.5506839 0.2873885 0.4206222 0.3715998 0.2910656 0.2403793
## SalePrice 0.7909816 0.7086245 0.6404092 0.6234314 0.6135806 0.6058522
## FullBath TotRmsAbvGrd YearBuilt YearRemodAdd SalePrice
## OverallQual 0.5505997 0.42745234 0.57232277 0.5506839 0.7909816
## GrLivArea 0.6300116 0.82548937 0.19900971 0.2873885 0.7086245
## GarageCars 0.4696720 0.36228857 0.53785009 0.4206222 0.6404092
## GarageArea 0.4056562 0.33782212 0.47895382 0.3715998 0.6234314
## TotalBsmtSF 0.3237224 0.28557256 0.39145200 0.2910656 0.6135806
## 1stFlrSF 0.3806375 0.40951598 0.28198586 0.2403793 0.6058522
## FullBath 1.0000000 0.55478425 0.46827079 0.4390465 0.5606638
## TotRmsAbvGrd 0.5547843 1.00000000 0.09558913 0.1917398 0.5337232
## YearBuilt 0.4682708 0.09558913 1.00000000 0.5928550 0.5228973
## YearRemodAdd 0.4390465 0.19173982 0.59285498 1.0000000 0.5071010
## SalePrice 0.5606638 0.53372316 0.52289733 0.5071010 1.0000000
# Model formula: SalePrice against every column of the data except the ones
# subtracted below.
formula <- SalePrice ~ . -
  Id - Alley - LotShape - LotFrontage - LandContour - HouseStyle -
  FireplaceQu - OpenPorchSF - EnclosedPorch - GarageFinish - GarageYrBlt -
  BsmtHalfBath - ExterCond - CentralAir - PavedDrive - Electrical -
  Heating - MiscFeature - MiscVal - YrSold
# Fit the linear regression on the training data
lm_model <- lm(formula = formula, data = train_data)
# Coefficient table and overall fit statistics
summary(lm_model)
##
## Call:
## lm(formula = formula, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -180511 -9338 0 9567 180511
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.434e+06 1.729e+05 -8.296 2.74e-16 ***
## MSSubClass -1.143e+02 4.802e+01 -2.381 0.017411 *
## MSZoningFV 3.480e+04 1.157e+04 3.007 0.002690 **
## MSZoningRH 2.677e+04 1.146e+04 2.337 0.019610 *
## MSZoningRL 2.764e+04 9.784e+03 2.825 0.004809 **
## MSZoningRM 2.446e+04 9.143e+03 2.675 0.007566 **
## LotArea 6.781e-01 9.722e-02 6.975 4.95e-12 ***
## StreetPave 3.088e+04 1.161e+04 2.661 0.007897 **
## UtilitiesNoSeWa -4.346e+04 2.514e+04 -1.728 0.084174 .
## LotConfigCulDSac 7.280e+03 3.045e+03 2.391 0.016949 *
## LotConfigFR2 -7.166e+03 3.906e+03 -1.835 0.066800 .
## LotConfigFR3 -1.477e+04 1.238e+04 -1.193 0.233154
## LotConfigInside -1.479e+03 1.698e+03 -0.871 0.383959
## LandSlopeMod 2.748e+03 3.476e+03 0.790 0.429452
## LandSlopeSev -4.161e+04 1.065e+04 -3.906 9.87e-05 ***
## NeighborhoodBlueste 3.935e+03 1.899e+04 0.207 0.835844
## NeighborhoodBrDale -4.398e+03 1.065e+04 -0.413 0.679663
## NeighborhoodBrkSide -5.246e+03 8.999e+03 -0.583 0.560042
## NeighborhoodClearCr -1.591e+04 8.898e+03 -1.788 0.073940 .
## NeighborhoodCollgCr -1.114e+04 6.974e+03 -1.597 0.110566
## NeighborhoodCrawfor 1.186e+04 8.185e+03 1.449 0.147479
## NeighborhoodEdwards -2.170e+04 7.752e+03 -2.799 0.005205 **
## NeighborhoodGilbert -1.326e+04 7.417e+03 -1.788 0.074069 .
## NeighborhoodIDOTRR -1.072e+04 1.028e+04 -1.043 0.297195
## NeighborhoodMeadowV -8.893e+03 1.081e+04 -0.822 0.410957
## NeighborhoodMitchel -2.298e+04 7.886e+03 -2.914 0.003631 **
## NeighborhoodNAmes -1.657e+04 7.533e+03 -2.200 0.028006 *
## NeighborhoodNoRidge 2.548e+04 8.175e+03 3.117 0.001868 **
## NeighborhoodNPkVill 1.019e+04 1.364e+04 0.747 0.454994
## NeighborhoodNridgHt 1.576e+04 7.166e+03 2.200 0.028000 *
## NeighborhoodNWAmes -1.805e+04 7.745e+03 -2.331 0.019907 *
## NeighborhoodOldTown -1.524e+04 9.202e+03 -1.656 0.097881 .
## NeighborhoodSawyer -1.163e+04 7.845e+03 -1.483 0.138438
## NeighborhoodSawyerW -5.560e+03 7.543e+03 -0.737 0.461228
## NeighborhoodSomerst -3.161e+03 8.711e+03 -0.363 0.716772
## NeighborhoodStoneBr 3.636e+04 8.011e+03 4.539 6.19e-06 ***
## NeighborhoodSWISU -1.068e+04 9.301e+03 -1.149 0.250936
## NeighborhoodTimber -1.015e+04 7.793e+03 -1.303 0.192824
## NeighborhoodVeenker -1.688e+03 1.020e+04 -0.166 0.868547
## Condition1Feedr 6.921e+03 4.790e+03 1.445 0.148717
## Condition1Norm 1.529e+04 3.953e+03 3.868 0.000115 ***
## Condition1PosA 8.513e+03 9.688e+03 0.879 0.379700
## Condition1PosN 1.190e+04 7.107e+03 1.675 0.094245 .
## Condition1RRAe -1.553e+04 8.868e+03 -1.752 0.080087 .
## Condition1RRAn 1.222e+04 6.600e+03 1.851 0.064411 .
## Condition1RRNe -4.418e+03 1.725e+04 -0.256 0.797872
## Condition1RRNn 7.493e+03 1.252e+04 0.599 0.549597
## Condition2Feedr -7.568e+03 2.236e+04 -0.338 0.735128
## Condition2Norm -1.018e+04 1.916e+04 -0.531 0.595332
## Condition2PosA 4.702e+04 3.078e+04 1.528 0.126869
## Condition2PosN -2.385e+05 2.668e+04 -8.938 < 2e-16 ***
## Condition2RRAe -1.144e+05 4.275e+04 -2.676 0.007549 **
## Condition2RRAn -1.990e+04 3.053e+04 -0.652 0.514528
## Condition2RRNn -3.741e+03 2.581e+04 -0.145 0.884784
## BldgType2fmCon 4.860e+03 8.648e+03 0.562 0.574211
## BldgTypeDuplex -4.012e+03 6.586e+03 -0.609 0.542522
## BldgTypeTwnhs -1.418e+04 7.200e+03 -1.970 0.049101 *
## BldgTypeTwnhsE -1.015e+04 5.738e+03 -1.769 0.077154 .
## OverallQual 6.541e+03 9.692e+02 6.749 2.27e-11 ***
## OverallCond 5.489e+03 8.100e+02 6.777 1.88e-11 ***
## YearBuilt 3.190e+02 6.945e+01 4.592 4.82e-06 ***
## YearRemodAdd 8.116e+01 5.251e+01 1.545 0.122485
## RoofStyleGable 7.084e+03 1.812e+04 0.391 0.695854
## RoofStyleGambrel 1.108e+04 1.967e+04 0.563 0.573457
## RoofStyleHip 7.109e+03 1.819e+04 0.391 0.695986
## RoofStyleMansard 1.574e+04 2.095e+04 0.751 0.452621
## RoofStyleShed 8.476e+04 3.377e+04 2.510 0.012207 *
## RoofMatlCompShg 5.916e+05 4.421e+04 13.379 < 2e-16 ***
## RoofMatlMembran 6.677e+05 5.535e+04 12.064 < 2e-16 ***
## RoofMatlMetal 6.436e+05 5.493e+04 11.718 < 2e-16 ***
## RoofMatlRoll 5.813e+05 5.087e+04 11.425 < 2e-16 ***
## RoofMatlTar&Grv 5.899e+05 4.822e+04 12.233 < 2e-16 ***
## RoofMatlWdShake 5.854e+05 4.683e+04 12.502 < 2e-16 ***
## RoofMatlWdShngl 6.440e+05 4.497e+04 14.323 < 2e-16 ***
## Exterior1stAsphShn -2.335e+04 3.193e+04 -0.731 0.464674
## Exterior1stBrkComm 1.087e+03 2.650e+04 0.041 0.967278
## Exterior1stBrkFace 1.192e+04 1.197e+04 0.996 0.319483
## Exterior1stCBlock -1.083e+04 2.634e+04 -0.411 0.681075
## Exterior1stCemntBd -4.288e+03 1.814e+04 -0.236 0.813198
## Exterior1stHdBoard -9.244e+03 1.218e+04 -0.759 0.448068
## Exterior1stImStucc -2.707e+04 2.726e+04 -0.993 0.321021
## Exterior1stMetalSd -5.160e+03 1.386e+04 -0.372 0.709765
## Exterior1stPlywood -1.064e+04 1.199e+04 -0.888 0.374777
## Exterior1stStone -3.427e+03 2.334e+04 -0.147 0.883301
## Exterior1stStucco -3.665e+03 1.313e+04 -0.279 0.780149
## Exterior1stVinylSd -1.006e+04 1.264e+04 -0.796 0.426431
## Exterior1stWd Sdng -1.123e+04 1.167e+04 -0.962 0.336044
## Exterior1stWdShing -6.660e+03 1.249e+04 -0.533 0.594035
## Exterior2ndAsphShn 1.237e+04 2.143e+04 0.577 0.563858
## Exterior2ndBrk Cmn 1.815e+03 1.947e+04 0.093 0.925728
## Exterior2ndBrkFace 6.713e+02 1.278e+04 0.053 0.958110
## Exterior2ndCBlock NA NA NA NA
## Exterior2ndCmentBd 6.357e+03 1.808e+04 0.352 0.725176
## Exterior2ndHdBoard 6.085e+03 1.196e+04 0.509 0.611029
## Exterior2ndImStucc 1.798e+04 1.389e+04 1.294 0.195754
## Exterior2ndMetalSd 5.748e+03 1.370e+04 0.420 0.674820
## Exterior2ndOther -1.868e+04 2.672e+04 -0.699 0.484627
## Exterior2ndPlywood 4.124e+03 1.156e+04 0.357 0.721203
## Exterior2ndStone -1.006e+04 1.645e+04 -0.611 0.541134
## Exterior2ndStucco 5.492e+03 1.291e+04 0.425 0.670685
## Exterior2ndVinylSd 1.009e+04 1.240e+04 0.814 0.415906
## Exterior2ndWd Sdng 1.070e+04 1.147e+04 0.933 0.351047
## Exterior2ndWd Shng 2.767e+03 1.192e+04 0.232 0.816401
## MasVnrTypeBrkCmn -3.952e+03 1.064e+04 -0.371 0.710414
## MasVnrTypeBrkFace 2.467e+03 8.595e+03 0.287 0.774170
## MasVnrTypeNone 6.156e+03 8.511e+03 0.723 0.469655
## MasVnrTypeStone 6.749e+03 8.687e+03 0.777 0.437349
## MasVnrArea 2.264e+01 5.665e+00 3.997 6.78e-05 ***
## ExterQualFa -9.178e+03 1.027e+04 -0.894 0.371719
## ExterQualGd -2.151e+04 4.686e+03 -4.591 4.85e-06 ***
## ExterQualTA -2.059e+04 5.194e+03 -3.964 7.80e-05 ***
## FoundationCBlock 2.708e+03 3.042e+03 0.890 0.373446
## FoundationPConc 4.021e+03 3.301e+03 1.218 0.223417
## FoundationSlab -2.650e+03 9.140e+03 -0.290 0.771940
## FoundationStone 1.221e+04 1.053e+04 1.159 0.246677
## FoundationWood -2.140e+04 1.439e+04 -1.487 0.137212
## BsmtQualEx -2.730e+04 3.550e+04 -0.769 0.442009
## BsmtQualFa -3.959e+04 3.529e+04 -1.122 0.262214
## BsmtQualGd -4.619e+04 3.525e+04 -1.310 0.190393
## BsmtQualTA -4.277e+04 3.516e+04 -1.216 0.224065
## BsmtCondFa -3.212e+03 4.066e+03 -0.790 0.429679
## BsmtCondGd -3.122e+03 3.094e+03 -1.009 0.313166
## BsmtCondPo 3.752e+04 2.152e+04 1.744 0.081475 .
## BsmtCondTA NA NA NA NA
## BsmtExposureAv 1.270e+04 2.280e+04 0.557 0.577511
## BsmtExposureGd 2.723e+04 2.290e+04 1.189 0.234555
## BsmtExposureMn 9.946e+03 2.287e+04 0.435 0.663640
## BsmtExposureNo 7.633e+03 2.276e+04 0.335 0.737380
## BsmtFinType1ALQ -2.614e+03 2.831e+03 -0.924 0.355879
## BsmtFinType1BLQ -4.376e+02 3.040e+03 -0.144 0.885554
## BsmtFinType1GLQ 3.027e+03 2.654e+03 1.141 0.254228
## BsmtFinType1LwQ -6.062e+03 3.661e+03 -1.656 0.098003 .
## BsmtFinType1Rec -2.639e+03 3.062e+03 -0.862 0.388867
## BsmtFinType1Unf NA NA NA NA
## BsmtFinSF1 3.610e+01 5.011e+00 7.203 1.01e-12 ***
## BsmtFinType2ALQ 2.823e+04 2.467e+04 1.144 0.252700
## BsmtFinType2BLQ 1.587e+04 2.443e+04 0.650 0.515962
## BsmtFinType2GLQ 2.622e+04 2.516e+04 1.042 0.297669
## BsmtFinType2LwQ 1.361e+04 2.440e+04 0.558 0.577139
## BsmtFinType2Rec 1.784e+04 2.434e+04 0.733 0.463698
## BsmtFinType2Unf 1.888e+04 2.431e+04 0.776 0.437632
## BsmtFinSF2 2.807e+01 8.835e+00 3.177 0.001526 **
## BsmtUnfSF 1.889e+01 4.599e+00 4.108 4.26e-05 ***
## TotalBsmtSF NA NA NA NA
## HeatingQCFa -2.276e+03 4.060e+03 -0.561 0.575213
## HeatingQCGd -3.788e+03 2.016e+03 -1.879 0.060458 .
## HeatingQCPo 1.161e+04 2.481e+04 0.468 0.639876
## HeatingQCTA -3.724e+03 2.004e+03 -1.859 0.063318 .
## `1stFlrSF` 4.702e+01 5.268e+00 8.925 < 2e-16 ***
## `2ndFlrSF` 5.391e+01 4.095e+00 13.165 < 2e-16 ***
## LowQualFinSF -1.449e+01 1.521e+01 -0.952 0.341081
## GrLivArea NA NA NA NA
## BsmtFullBath 1.744e+03 1.822e+03 0.957 0.338786
## FullBath 3.950e+03 2.110e+03 1.872 0.061380 .
## HalfBath 9.678e+02 2.001e+03 0.484 0.628692
## BedroomAbvGr -3.777e+03 1.317e+03 -2.867 0.004208 **
## KitchenAbvGr -1.337e+04 5.396e+03 -2.477 0.013379 *
## KitchenQualFa -2.015e+04 5.851e+03 -3.444 0.000593 ***
## KitchenQualGd -2.416e+04 3.385e+03 -7.138 1.60e-12 ***
## KitchenQualTA -2.292e+04 3.821e+03 -5.998 2.60e-09 ***
## TotRmsAbvGrd 1.745e+03 9.292e+02 1.878 0.060584 .
## FunctionalMaj2 -4.331e+03 1.335e+04 -0.324 0.745652
## FunctionalMin1 4.115e+03 8.284e+03 0.497 0.619479
## FunctionalMin2 6.221e+03 8.217e+03 0.757 0.449159
## FunctionalMod -1.561e+03 9.735e+03 -0.160 0.872600
## FunctionalSev -3.488e+04 2.843e+04 -1.227 0.220191
## FunctionalTyp 1.608e+04 7.152e+03 2.248 0.024777 *
## Fireplaces 2.540e+03 1.292e+03 1.966 0.049507 *
## GarageType870 2.710e+04 1.171e+04 2.314 0.020808 *
## GarageTypeAttchd 1.756e+04 1.076e+04 1.632 0.102993
## GarageTypeBasment 2.245e+04 1.245e+04 1.802 0.071719 .
## GarageTypeBuiltIn 2.021e+04 1.116e+04 1.811 0.070453 .
## GarageTypeCarPort 2.167e+04 1.424e+04 1.522 0.128274
## GarageTypeDetchd 2.049e+04 1.074e+04 1.909 0.056473 .
## GarageCars 4.982e+03 2.208e+03 2.256 0.024230 *
## GarageArea 1.364e+01 7.419e+00 1.838 0.066225 .
## GarageQualEx 1.275e+05 2.797e+04 4.559 5.65e-06 ***
## GarageQualFa -4.133e+03 4.612e+03 -0.896 0.370324
## GarageQualGd -2.102e+02 7.402e+03 -0.028 0.977351
## GarageQualPo -2.290e+04 2.034e+04 -1.126 0.260481
## GarageQualTA NA NA NA NA
## GarageCondEx -1.217e+05 3.260e+04 -3.732 0.000199 ***
## GarageCondFa -2.436e+03 5.123e+03 -0.476 0.634474
## GarageCondGd -4.600e+03 8.948e+03 -0.514 0.607310
## GarageCondPo 6.644e+02 1.285e+04 0.052 0.958788
## GarageCondTA NA NA NA NA
## WoodDeckSF 1.321e+01 5.712e+00 2.312 0.020947 *
## `3SsnPorch` 2.781e+01 2.193e+01 1.268 0.204906
## ScreenPorch 3.527e+01 1.211e+01 2.912 0.003654 **
## PoolArea 5.435e+02 1.675e+02 3.244 0.001209 **
## PoolQCEx -1.717e+05 9.111e+04 -1.885 0.059691 .
## PoolQCFa -3.109e+05 9.968e+04 -3.119 0.001857 **
## PoolQCGd -2.801e+05 1.089e+05 -2.573 0.010205 *
## FenceGdPrv -8.584e+03 3.552e+03 -2.417 0.015800 *
## FenceGdWo -1.932e+02 3.461e+03 -0.056 0.955499
## FenceMnPrv 1.084e+03 2.196e+03 0.494 0.621468
## FenceMnWw -3.907e+03 7.307e+03 -0.535 0.592991
## MoSold -4.070e+02 2.367e+02 -1.719 0.085779 .
## SaleTypeCon 2.735e+04 1.734e+04 1.578 0.114906
## SaleTypeConLD 1.554e+04 9.355e+03 1.661 0.097023 .
## SaleTypeConLI 3.918e+03 1.128e+04 0.347 0.728438
## SaleTypeConLw 5.383e+02 1.164e+04 0.046 0.963119
## SaleTypeCWD 1.531e+04 1.261e+04 1.214 0.224846
## SaleTypeNew 2.606e+04 1.518e+04 1.717 0.086264 .
## SaleTypeOth 8.746e+03 1.418e+04 0.617 0.537471
## SaleTypeWD -5.808e+02 4.097e+03 -0.142 0.887299
## SaleConditionAdjLand 1.105e+04 1.344e+04 0.822 0.411194
## SaleConditionAlloca 4.864e+02 8.451e+03 0.058 0.954117
## SaleConditionFamily 2.121e+03 5.975e+03 0.355 0.722721
## SaleConditionNormal 6.931e+03 2.826e+03 2.453 0.014316 *
## SaleConditionPartial -4.397e+03 1.464e+04 -0.300 0.763979
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22510 on 1256 degrees of freedom
## Multiple R-squared: 0.9309, Adjusted R-squared: 0.9197
## F-statistic: 83.36 on 203 and 1256 DF, p-value: < 2.2e-16
## Rows: 1459 Columns: 80
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (37): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 6 × 80
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 1461 20 RH 80 11622 Pave <NA> Reg
## 2 1462 20 RL 81 14267 Pave <NA> IR1
## 3 1463 60 RL 74 13830 Pave <NA> IR1
## 4 1464 60 RL 78 9978 Pave <NA> IR1
## 5 1465 120 RL 43 5005 Pave <NA> IR1
## 6 1466 60 RL 75 10000 Pave <NA> IR1
## # ℹ 72 more variables: LandContour <chr>, Utilities <chr>, LotConfig <chr>,
## # LandSlope <chr>, Neighborhood <chr>, Condition1 <chr>, Condition2 <chr>,
## # BldgType <chr>, HouseStyle <chr>, OverallQual <dbl>, OverallCond <dbl>,
## # YearBuilt <dbl>, YearRemodAdd <dbl>, RoofStyle <chr>, RoofMatl <chr>,
## # Exterior1st <chr>, Exterior2nd <chr>, MasVnrType <chr>, MasVnrArea <dbl>,
## # ExterQual <chr>, ExterCond <chr>, Foundation <chr>, BsmtQual <chr>,
## # BsmtCond <chr>, BsmtExposure <chr>, BsmtFinType1 <chr>, BsmtFinSF1 <dbl>, …
## spc_tbl_ [1,459 × 80] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Id : num [1:1459] 1461 1462 1463 1464 1465 ...
## $ MSSubClass : num [1:1459] 20 20 60 60 120 60 20 60 20 20 ...
## $ MSZoning : chr [1:1459] "RH" "RL" "RL" "RL" ...
## $ LotFrontage : num [1:1459] 80 81 74 78 43 75 NA 63 85 70 ...
## $ LotArea : num [1:1459] 11622 14267 13830 9978 5005 ...
## $ Street : chr [1:1459] "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr [1:1459] NA NA NA NA ...
## $ LotShape : chr [1:1459] "Reg" "IR1" "IR1" "IR1" ...
## $ LandContour : chr [1:1459] "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr [1:1459] "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr [1:1459] "Inside" "Corner" "Inside" "Inside" ...
## $ LandSlope : chr [1:1459] "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr [1:1459] "NAmes" "NAmes" "Gilbert" "Gilbert" ...
## $ Condition1 : chr [1:1459] "Feedr" "Norm" "Norm" "Norm" ...
## $ Condition2 : chr [1:1459] "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr [1:1459] "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr [1:1459] "1Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : num [1:1459] 5 6 5 6 8 6 6 6 7 4 ...
## $ OverallCond : num [1:1459] 6 6 5 6 5 5 7 5 5 5 ...
## $ YearBuilt : num [1:1459] 1961 1958 1997 1998 1992 ...
## $ YearRemodAdd : num [1:1459] 1961 1958 1998 1998 1992 ...
## $ RoofStyle : chr [1:1459] "Gable" "Hip" "Gable" "Gable" ...
## $ RoofMatl : chr [1:1459] "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr [1:1459] "VinylSd" "Wd Sdng" "VinylSd" "VinylSd" ...
## $ Exterior2nd : chr [1:1459] "VinylSd" "Wd Sdng" "VinylSd" "VinylSd" ...
## $ MasVnrType : chr [1:1459] "None" "BrkFace" "None" "BrkFace" ...
## $ MasVnrArea : num [1:1459] 0 108 0 20 0 0 0 0 0 0 ...
## $ ExterQual : chr [1:1459] "TA" "TA" "TA" "TA" ...
## $ ExterCond : chr [1:1459] "TA" "TA" "TA" "TA" ...
## $ Foundation : chr [1:1459] "CBlock" "CBlock" "PConc" "PConc" ...
## $ BsmtQual : chr [1:1459] "TA" "TA" "Gd" "TA" ...
## $ BsmtCond : chr [1:1459] "TA" "TA" "TA" "TA" ...
## $ BsmtExposure : chr [1:1459] "No" "No" "No" "No" ...
## $ BsmtFinType1 : chr [1:1459] "Rec" "ALQ" "GLQ" "GLQ" ...
## $ BsmtFinSF1 : num [1:1459] 468 923 791 602 263 0 935 0 637 804 ...
## $ BsmtFinType2 : chr [1:1459] "LwQ" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : num [1:1459] 144 0 0 0 0 0 0 0 0 78 ...
## $ BsmtUnfSF : num [1:1459] 270 406 137 324 1017 ...
## $ TotalBsmtSF : num [1:1459] 882 1329 928 926 1280 ...
## $ Heating : chr [1:1459] "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr [1:1459] "TA" "TA" "Gd" "Ex" ...
## $ CentralAir : chr [1:1459] "Y" "Y" "Y" "Y" ...
## $ Electrical : chr [1:1459] "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ 1stFlrSF : num [1:1459] 896 1329 928 926 1280 ...
## $ 2ndFlrSF : num [1:1459] 0 0 701 678 0 892 0 676 0 0 ...
## $ LowQualFinSF : num [1:1459] 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : num [1:1459] 896 1329 1629 1604 1280 ...
## $ BsmtFullBath : num [1:1459] 0 0 0 0 0 0 1 0 1 1 ...
## $ BsmtHalfBath : num [1:1459] 0 0 0 0 0 0 0 0 0 0 ...
## $ FullBath : num [1:1459] 1 1 2 2 2 2 2 2 1 1 ...
## $ HalfBath : num [1:1459] 0 1 1 1 0 1 0 1 1 0 ...
## $ BedroomAbvGr : num [1:1459] 2 3 3 3 2 3 3 3 2 2 ...
## $ KitchenAbvGr : num [1:1459] 1 1 1 1 1 1 1 1 1 1 ...
## $ KitchenQual : chr [1:1459] "TA" "Gd" "TA" "Gd" ...
## $ TotRmsAbvGrd : num [1:1459] 5 6 6 7 5 7 6 7 5 4 ...
## $ Functional : chr [1:1459] "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : num [1:1459] 0 0 1 1 0 1 0 1 1 0 ...
## $ FireplaceQu : chr [1:1459] NA NA "TA" "Gd" ...
## $ GarageType : chr [1:1459] "Attchd" "Attchd" "Attchd" "Attchd" ...
## $ GarageYrBlt : num [1:1459] 1961 1958 1997 1998 1992 ...
## $ GarageFinish : chr [1:1459] "Unf" "Unf" "Fin" "Fin" ...
## $ GarageCars : num [1:1459] 1 1 2 2 2 2 2 2 2 2 ...
## $ GarageArea : num [1:1459] 730 312 482 470 506 440 420 393 506 525 ...
## $ GarageQual : chr [1:1459] "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr [1:1459] "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr [1:1459] "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : num [1:1459] 140 393 212 360 0 157 483 0 192 240 ...
## $ OpenPorchSF : num [1:1459] 0 36 34 36 82 84 21 75 0 0 ...
## $ EnclosedPorch: num [1:1459] 0 0 0 0 0 0 0 0 0 0 ...
## $ 3SsnPorch : num [1:1459] 0 0 0 0 0 0 0 0 0 0 ...
## $ ScreenPorch : num [1:1459] 120 0 0 0 144 0 0 0 0 0 ...
## $ PoolArea : num [1:1459] 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr [1:1459] NA NA NA NA ...
## $ Fence : chr [1:1459] "MnPrv" NA "MnPrv" NA ...
## $ MiscFeature : chr [1:1459] NA "Gar2" NA NA ...
## $ MiscVal : num [1:1459] 0 12500 0 0 0 0 500 0 0 0 ...
## $ MoSold : num [1:1459] 6 6 3 6 1 4 3 5 2 4 ...
## $ YrSold : num [1:1459] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ SaleType : chr [1:1459] "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr [1:1459] "Normal" "Normal" "Normal" "Normal" ...
## - attr(*, "spec")=
## .. cols(
## .. Id = col_double(),
## .. MSSubClass = col_double(),
## .. MSZoning = col_character(),
## .. LotFrontage = col_double(),
## .. LotArea = col_double(),
## .. Street = col_character(),
## .. Alley = col_character(),
## .. LotShape = col_character(),
## .. LandContour = col_character(),
## .. Utilities = col_character(),
## .. LotConfig = col_character(),
## .. LandSlope = col_character(),
## .. Neighborhood = col_character(),
## .. Condition1 = col_character(),
## .. Condition2 = col_character(),
## .. BldgType = col_character(),
## .. HouseStyle = col_character(),
## .. OverallQual = col_double(),
## .. OverallCond = col_double(),
## .. YearBuilt = col_double(),
## .. YearRemodAdd = col_double(),
## .. RoofStyle = col_character(),
## .. RoofMatl = col_character(),
## .. Exterior1st = col_character(),
## .. Exterior2nd = col_character(),
## .. MasVnrType = col_character(),
## .. MasVnrArea = col_double(),
## .. ExterQual = col_character(),
## .. ExterCond = col_character(),
## .. Foundation = col_character(),
## .. BsmtQual = col_character(),
## .. BsmtCond = col_character(),
## .. BsmtExposure = col_character(),
## .. BsmtFinType1 = col_character(),
## .. BsmtFinSF1 = col_double(),
## .. BsmtFinType2 = col_character(),
## .. BsmtFinSF2 = col_double(),
## .. BsmtUnfSF = col_double(),
## .. TotalBsmtSF = col_double(),
## .. Heating = col_character(),
## .. HeatingQC = col_character(),
## .. CentralAir = col_character(),
## .. Electrical = col_character(),
## .. `1stFlrSF` = col_double(),
## .. `2ndFlrSF` = col_double(),
## .. LowQualFinSF = col_double(),
## .. GrLivArea = col_double(),
## .. BsmtFullBath = col_double(),
## .. BsmtHalfBath = col_double(),
## .. FullBath = col_double(),
## .. HalfBath = col_double(),
## .. BedroomAbvGr = col_double(),
## .. KitchenAbvGr = col_double(),
## .. KitchenQual = col_character(),
## .. TotRmsAbvGrd = col_double(),
## .. Functional = col_character(),
## .. Fireplaces = col_double(),
## .. FireplaceQu = col_character(),
## .. GarageType = col_character(),
## .. GarageYrBlt = col_double(),
## .. GarageFinish = col_character(),
## .. GarageCars = col_double(),
## .. GarageArea = col_double(),
## .. GarageQual = col_character(),
## .. GarageCond = col_character(),
## .. PavedDrive = col_character(),
## .. WoodDeckSF = col_double(),
## .. OpenPorchSF = col_double(),
## .. EnclosedPorch = col_double(),
## .. `3SsnPorch` = col_double(),
## .. ScreenPorch = col_double(),
## .. PoolArea = col_double(),
## .. PoolQC = col_character(),
## .. Fence = col_character(),
## .. MiscFeature = col_character(),
## .. MiscVal = col_double(),
## .. MoSold = col_double(),
## .. YrSold = col_double(),
## .. SaleType = col_character(),
## .. SaleCondition = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
## Id MSSubClass MSZoning LotFrontage
## Min. :1461 Min. : 20.00 Length:1459 Min. : 21.00
## 1st Qu.:1826 1st Qu.: 20.00 Class :character 1st Qu.: 58.00
## Median :2190 Median : 50.00 Mode :character Median : 67.00
## Mean :2190 Mean : 57.38 Mean : 68.58
## 3rd Qu.:2554 3rd Qu.: 70.00 3rd Qu.: 80.00
## Max. :2919 Max. :190.00 Max. :200.00
## NA's :227
## LotArea Street Alley LotShape
## Min. : 1470 Length:1459 Length:1459 Length:1459
## 1st Qu.: 7391 Class :character Class :character Class :character
## Median : 9399 Mode :character Mode :character Mode :character
## Mean : 9819
## 3rd Qu.:11518
## Max. :56600
##
## LandContour Utilities LotConfig LandSlope
## Length:1459 Length:1459 Length:1459 Length:1459
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1459 Length:1459 Length:1459 Length:1459
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1459 Min. : 1.000 Min. :1.000 Min. :1879
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1953
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.079 Mean :5.554 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2001
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1459 Length:1459 Length:1459
## 1st Qu.:1963 Class :character Class :character Class :character
## Median :1992 Mode :character Mode :character Mode :character
## Mean :1984
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1459 Length:1459 Min. : 0.0 Length:1459
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 100.7
## 3rd Qu.: 164.0
## Max. :1290.0
## NA's :15
## ExterCond Foundation BsmtQual BsmtCond
## Length:1459 Length:1459 Length:1459 Length:1459
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1459 Length:1459 Min. : 0.0 Length:1459
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 350.5 Mode :character
## Mean : 439.2
## 3rd Qu.: 753.5
## Max. :4010.0
## NA's :1
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0 Length:1459
## 1st Qu.: 0.00 1st Qu.: 219.2 1st Qu.: 784 Class :character
## Median : 0.00 Median : 460.0 Median : 988 Mode :character
## Mean : 52.62 Mean : 554.3 Mean :1046
## 3rd Qu.: 0.00 3rd Qu.: 797.8 3rd Qu.:1305
## Max. :1526.00 Max. :2140.0 Max. :5095
## NA's :1 NA's :1 NA's :1
## HeatingQC CentralAir Electrical 1stFlrSF
## Length:1459 Length:1459 Length:1459 Min. : 407.0
## Class :character Class :character Class :character 1st Qu.: 873.5
## Mode :character Mode :character Mode :character Median :1079.0
## Mean :1156.5
## 3rd Qu.:1382.5
## Max. :5095.0
##
## 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 407 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1118 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1432 Median :0.0000
## Mean : 326 Mean : 3.543 Mean :1486 Mean :0.4345
## 3rd Qu.: 676 3rd Qu.: 0.000 3rd Qu.:1721 3rd Qu.:1.0000
## Max. :1862 Max. :1064.000 Max. :5095 Max. :3.0000
## NA's :2
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.0000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.0652 Mean :1.571 Mean :0.3777 Mean :2.854
## 3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.0000 Max. :4.000 Max. :2.0000 Max. :6.000
## NA's :2
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1459 Min. : 3.000 Length:1459
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.042 Mean : 6.385
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :2.000 Max. :15.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.0000 Length:1459 Length:1459 Min. :1895
## 1st Qu.:0.0000 Class :character Class :character 1st Qu.:1959
## Median :0.0000 Mode :character Mode :character Median :1979
## Mean :0.5812 Mean :1978
## 3rd Qu.:1.0000 3rd Qu.:2002
## Max. :4.0000 Max. :2207
## NA's :78
## GarageFinish GarageCars GarageArea GarageQual
## Length:1459 Min. :0.000 Min. : 0.0 Length:1459
## Class :character 1st Qu.:1.000 1st Qu.: 318.0 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.766 Mean : 472.8
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :5.000 Max. :1488.0
## NA's :1 NA's :1
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1459 Length:1459 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 28.00
## Mean : 93.17 Mean : 48.31
## 3rd Qu.: 168.00 3rd Qu.: 72.00
## Max. :1424.00 Max. :742.00
##
## EnclosedPorch 3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.000 Median : 0.00 Median : 0.000
## Mean : 24.24 Mean : 1.794 Mean : 17.06 Mean : 1.744
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :1012.00 Max. :360.000 Max. :576.00 Max. :800.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1459 Length:1459 Length:1459 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 58.17
## 3rd Qu.: 0.00
## Max. :17000.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1459 Length:1459
## 1st Qu.: 4.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.104 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
#Count the NA entries in every column of test_data
missing_test <- vapply(
  test_data,
  function(column) sum(is.na(column)),
  numeric(1)
)
#Show only the columns that have at least one NA
print(missing_test[which(missing_test > 0)])## MSZoning LotFrontage Alley Utilities Exterior1st Exterior2nd
## 4 227 1352 2 1 1
## MasVnrType MasVnrArea BsmtQual BsmtCond BsmtExposure BsmtFinType1
## 16 15 44 45 44 42
## BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath
## 1 42 1 1 1 2
## BsmtHalfBath KitchenQual Functional FireplaceQu GarageType GarageYrBlt
## 2 1 2 730 76 78
## GarageFinish GarageCars GarageArea GarageQual GarageCond PoolQC
## 78 1 1 78 78 1456
## Fence MiscFeature SaleType
## 1169 1408 1
#Fill the NAs of each listed numeric column with that column's median
for (col in c("LotFrontage", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
              "BsmtUnfSF", "TotalBsmtSF", "GarageYrBlt", "GarageCars",
              "GarageArea")) {
  test_data[[col]] <- replace(
    test_data[[col]],
    is.na(test_data[[col]]),
    median(test_data[[col]], na.rm = TRUE)
  )
}
#Imputing missing values for categorical variables with mode
#Most frequent non-missing value of x.
#  x: an atomic vector (character, numeric, factor, ...).
#  Returns a length-one vector of the same type as x. Ties go to the value
#  that appears first in x. If x has no non-missing values the result is NA.
#NA is dropped before counting: otherwise a column that is mostly NA would
#report NA as its mode and imputing with it would change nothing.
mode <- function(x) {
  observed <- x[!is.na(x)]
  if (length(observed) == 0L) {
    #Nothing to count; hand back a single NA of x's own type
    return(x[NA_integer_])
  }
  ux <- unique(observed)
  ux[which.max(tabulate(match(observed, ux)))]
}
#Fill the NAs of each listed categorical column with the value mode() returns
for (col in c("MSZoning", "Alley", "Utilities", "Exterior1st", "Exterior2nd",
              "MasVnrType", "BsmtQual", "BsmtCond", "BsmtExposure",
              "BsmtFinType1", "BsmtFinType2", "KitchenQual", "Functional",
              "FireplaceQu", "GarageType", "GarageFinish", "GarageQual",
              "GarageCond", "PoolQC", "Fence", "MiscFeature", "SaleType")) {
  test_data[[col]] <- replace(
    test_data[[col]],
    is.na(test_data[[col]]),
    mode(test_data[[col]])
  )
}
#List the columns of test_data that still contain NA
missing_test <- names(which(colSums(is.na(test_data)) > 0))
missing_test## [1] "Alley" "BsmtFullBath" "BsmtHalfBath" "FireplaceQu" "PoolQC"
## [6] "Fence" "MiscFeature"
#Flag which columns of test_data are numeric
numeric_cols <- vapply(test_data, is.numeric, logical(1))
#Keep a copy of just the numeric columns
test_data_numeric <- test_data[, numeric_cols]
#Replace the NAs of every numeric column with that column's mean
test_data[, numeric_cols] <- lapply(
  test_data_numeric,
  function(column) replace(column, is.na(column), mean(column, na.rm = TRUE))
)
#Function to calculate mode
#Most frequent non-missing value of x.
#  x: an atomic vector (character, numeric, factor, ...).
#  Returns a length-one vector of the same type as x. Ties go to the value
#  that appears first in x. If x has no non-missing values the result is NA.
#NA is excluded from the count: if it were a candidate, a mostly-missing
#column would have NA as its mode and filling NAs with it would do nothing.
Mode <- function(x) {
  observed <- x[!is.na(x)]
  if (length(observed) == 0L) {
    #Nothing to count; return a single NA of x's own type
    return(x[NA_integer_])
  }
  unique_x <- unique(observed)
  unique_x[which.max(tabulate(match(observed, unique_x)))]
}
#Flag the factor and character columns of test_data
categorical_cols <- vapply(
  test_data,
  function(column) is.factor(column) || is.character(column),
  logical(1)
)
#Keep a copy of just the categorical columns
test_data_categorical <- test_data[, categorical_cols]
#Replace the NAs of every categorical column with the value Mode() returns
for (col in colnames(test_data_categorical)) {
  test_data[[col]] <- replace(
    test_data[[col]],
    is.na(test_data[[col]]),
    Mode(test_data[[col]])
  )
}
#Filling missing values for numeric columns with mean
test_data$BsmtFullBath[is.na(test_data$BsmtFullBath)] <- mean(test_data$BsmtFullBath, na.rm = TRUE)
test_data$BsmtHalfBath[is.na(test_data$BsmtHalfBath)] <- mean(test_data$BsmtHalfBath, na.rm = TRUE)
#Filling missing values for categorical columns with mode
#table() is called without useNA so that NA is not a candidate: with
#useNA = "ifany" the NA bucket is the largest one in these mostly-missing
#columns, mode_val becomes NA and the assignment below fills nothing.
#NOTE(review): the knitted output recorded after this loop appears to predate
#this change; re-knit to refresh it.
for (col in c("Alley", "FireplaceQu", "PoolQC", "Fence", "MiscFeature")) {
  mode_val <- names(sort(table(test_data[[col]]), decreasing = TRUE)[1])
  test_data[[col]][is.na(test_data[[col]])] <- mode_val
}## [1] "Alley" "FireplaceQu" "PoolQC" "Fence" "MiscFeature"
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
#Identifying numeric and categorical columns
numeric_cols <- sapply(test_data, is.numeric)
categorical_cols <- !numeric_cols
#Splitting test_data into its numeric and its non-numeric columns
test_data_numeric <- test_data[, numeric_cols]
test_data_categorical <- test_data[, categorical_cols]
#Replacing missing values with mean for numeric columns
test_data_numeric_filled <- na.aggregate(test_data_numeric, FUN = mean)
#Replacing missing values with mode for categorical columns
#table() is called without useNA so that NA is never picked as the mode;
#with useNA = "ifany" the NA bucket wins in mostly-missing columns and the
#fill value itself is NA.
for (col in colnames(test_data_categorical)) {
  mode_val <- names(sort(table(test_data_categorical[[col]]), decreasing = TRUE)[1])
  test_data_categorical[[col]][is.na(test_data_categorical[[col]])] <- mode_val
}
#Combining numeric and categorical columns
#(cbind places every numeric column ahead of the categorical ones, so the
#column order of test_data changes from here on)
test_data_filled <- cbind(test_data_numeric_filled, test_data_categorical)
#Replacing test_data with the filled copy
test_data <- test_data_filled
#Identifying numeric and categorical columns
numeric_cols <- sapply(test_data, is.numeric)
categorical_cols <- !numeric_cols
#Replacing missing values with mean for numeric columns and mode for categorical columns
for (col in colnames(test_data)) {
  values <- test_data[[col]]
  if (is.numeric(values)) {
    #A numeric column can never equal "", so only NA has to be handled
    values[is.na(values)] <- mean(values, na.rm = TRUE) # Replace NA with mean
  } else {
    #%in% never yields NA, so existing NAs cannot leak into the index
    values[values %in% ""] <- NA # Convert empty strings to NA
    #No useNA here: counting NA would make NA the "mode" of mostly-missing
    #columns and the replacement below would then fill nothing
    mode_val <- names(sort(table(values), decreasing = TRUE)[1]) # Find mode
    values[is.na(values)] <- mode_val # Replace NA with mode
  }
  test_data[[col]] <- values
}
#Checking for missing values in test_data
NAtest <- vapply(
  test_data,
  function(column) sum(is.na(column)),
  numeric(1)
)
#Print the names of the columns that still contain NA
print(names(which(NAtest > 0)))## [1] "Alley" "FireplaceQu" "PoolQC" "Fence" "MiscFeature"
#Function to replace missing values with mean, median, or mode based on condition
#Fill the missing entries of x, where "missing" means NA or the empty string.
#  x: an atomic vector.
#  Returns x with every missing entry replaced by
#    - the mean of the observed values when x is numeric, or
#    - the most frequent observed value otherwise (ties go to the value that
#      appears first).
#  If x has no observed values at all it is returned unchanged.
#The fill value is computed from observed entries only, so neither NA nor ""
#can be chosen as the replacement.
replace_missing <- function(x) {
  #TRUE | NA is TRUE, so NA entries are flagged even though x == "" is NA there
  is_missing <- is.na(x) | x == ""
  observed <- x[!is_missing]
  if (length(observed) == 0L) {
    return(x)
  }
  if (is.numeric(x)) {
    fill <- mean(observed)
  } else {
    candidates <- unique(observed)
    fill <- candidates[which.max(tabulate(match(observed, candidates)))]
  }
  x[is_missing] <- fill
  x
}
#Start new_test as a copy of test_data, then pass each listed column
#through replace_missing()
new_test <- test_data
for (col in c("Alley", "BsmtFullBath", "BsmtHalfBath", "FireplaceQu",
              "PoolQC", "Fence", "MiscFeature")) {
  new_test[[col]] <- replace_missing(new_test[[col]])
}
#Function to replace missing categorical values with the most common category
#Replace the NA entries of x with x's most frequent observed category.
#  x: a vector of category labels.
#  Returns the ifelse() result: x with each NA swapped for the top category.
replace_missing_categorical <- function(x) {
  #table() leaves NA out by default, so the winner is always an observed label
  top_category <- function(values) {
    frequencies <- table(values)
    names(sort(frequencies, decreasing = TRUE)[1])
  }
  ifelse(is.na(x), top_category(x), x)
}
#Reset new_test to a copy of test_data, then pass each listed column
#through replace_missing_categorical()
new_test <- test_data
for (col in c("Alley", "FireplaceQu", "PoolQC", "Fence", "MiscFeature")) {
  new_test[[col]] <- replace_missing_categorical(new_test[[col]])
}## [1] FALSE
When splitting the data
## Loading required package: lattice
#Columns used for modelling: ten predictors followed by the response
selected_columns <- c(
  "OverallQual", "GrLivArea", "GarageCars", "GarageArea", "TotalBsmtSF",
  "1stFlrSF", "FullBath", "TotRmsAbvGrd", "YearBuilt", "YearRemodAdd",
  "SalePrice"
)
#Keep only those columns of train_data and drop any row containing NA
selected_data <- na.omit(train_data[, selected_columns])
#80/20 split into training and hold-out rows
set.seed(123) # For reproducibility
train_index <- createDataPartition(
  selected_data$SalePrice,
  p = 0.8,
  list = FALSE
)
train_set <- selected_data[train_index, ]
test_set <- selected_data[-train_index, ]
#Regress SalePrice on every other selected column
lm_model <- lm(SalePrice ~ ., data = train_set)
#Show the fitted model
summary(lm_model)##
## Call:
## lm(formula = SalePrice ~ ., data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -458178 -18884 -2126 15473 230147
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.267e+06 1.342e+05 -9.447 < 2e-16 ***
## OverallQual 1.992e+04 1.223e+03 16.282 < 2e-16 ***
## GrLivArea 4.571e+01 4.507e+00 10.141 < 2e-16 ***
## GarageCars 1.051e+04 3.095e+03 3.396 0.000706 ***
## GarageArea 2.119e+01 1.049e+01 2.019 0.043735 *
## TotalBsmtSF 1.561e+01 4.409e+00 3.541 0.000415 ***
## `1stFlrSF` 1.492e+01 5.124e+00 2.911 0.003667 **
## FullBath -6.545e+03 2.845e+03 -2.301 0.021585 *
## TotRmsAbvGrd 5.105e+02 1.180e+03 0.433 0.665410
## YearBuilt 2.781e+02 5.317e+01 5.230 2.01e-07 ***
## YearRemodAdd 3.290e+02 6.622e+01 4.968 7.79e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35300 on 1158 degrees of freedom
## Multiple R-squared: 0.7872, Adjusted R-squared: 0.7853
## F-statistic: 428.3 on 10 and 1158 DF, p-value: < 2.2e-16
#Prediction on the test set
predictions <- predict(lm_model, newdata = test_set)
#Calculating performance metrics
actual <- test_set$SalePrice
#RMSE: square root of the mean squared prediction error
rmse <- sqrt(mean((predictions - actual)^2))
#R-squared: 1 - (residual sum of squares / total sum of squares)
r2 <- 1 - sum((predictions - actual)^2) / sum((actual - mean(actual))^2)
#Printing performance metrics
cat("RMSE:", rmse, "\n")
#r2 was computed but never reported; print it alongside the RMSE
cat("R-squared:", r2, "\n")## RMSE: 47333.26
## R-squared: 0.7304998
The results indicate that the model has an Adjusted R-squared of 0.7853 on the training data, which suggests a good fit. The R-squared on the held-out test data is somewhat lower at 0.7305, and the RMSE is relatively high at 47333.26.
When using test data
#Libraries
library(caret)
#Columns used for modelling: ten predictors followed by the response
selected_columns <- c(
  "OverallQual", "GrLivArea", "GarageCars", "GarageArea", "TotalBsmtSF",
  "1stFlrSF", "FullBath", "TotRmsAbvGrd", "YearBuilt", "YearRemodAdd",
  "SalePrice"
)
#Keep only those columns of train_data and drop any row containing NA
selected_data <- na.omit(train_data[, selected_columns])
#Regress SalePrice on every other selected column, using all remaining rows
lm_model <- lm(SalePrice ~ ., data = selected_data)
#Show the fitted model
summary(lm_model)##
## Call:
## lm(formula = SalePrice ~ ., data = selected_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -489958 -19316 -1948 16020 290558
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.186e+06 1.291e+05 -9.187 < 2e-16 ***
## OverallQual 1.960e+04 1.190e+03 16.472 < 2e-16 ***
## GrLivArea 5.130e+01 4.233e+00 12.119 < 2e-16 ***
## GarageCars 1.042e+04 3.044e+03 3.422 0.000639 ***
## GarageArea 1.495e+01 1.031e+01 1.450 0.147384
## TotalBsmtSF 1.986e+01 4.295e+00 4.625 4.09e-06 ***
## `1stFlrSF` 1.417e+01 4.930e+00 2.875 0.004097 **
## FullBath -6.791e+03 2.682e+03 -2.532 0.011457 *
## TotRmsAbvGrd 3.310e+01 1.119e+03 0.030 0.976404
## YearBuilt 2.682e+02 5.035e+01 5.328 1.15e-07 ***
## YearRemodAdd 2.965e+02 6.363e+01 4.659 3.47e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37920 on 1449 degrees of freedom
## Multiple R-squared: 0.7737, Adjusted R-squared: 0.7721
## F-statistic: 495.4 on 10 and 1449 DF, p-value: < 2.2e-16
#Score new_test with the fitted model
#Only the predictor columns are passed on; SalePrice is left out
test_columns <- setdiff(selected_columns, "SalePrice")
new_test_selected <- new_test[, test_columns]
predictions <- predict(lm_model, newdata = new_test_selected)
#Pair each Id with its predicted SalePrice
submission <- data.frame(Id = new_test$Id, SalePrice = predictions)
#Save the submission without row names
write.csv(x = submission, file = "predictions.csv", row.names = FALSE)
#Tell the user the file has been written
cat("CSV file 'predictions.csv' has been created for submission to Kaggle.\n")## CSV file 'predictions.csv' has been created for submission to Kaggle.