You are to register for Kaggle.com and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques .
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(corrplot)
## corrplot 0.92 loaded
library(moments)
library(Matrix)
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
# Directory holding the Kaggle CSV files. Set the HOUSE_PRICES_DIR environment
# variable to read them from another location; when it is unset, the original
# download directory is used.
data_dir <- Sys.getenv(
  "HOUSE_PRICES_DIR",
  unset = "/Users/Sangeetha/Downloads/house-prices-advanced-regression-techniques"
)
# Text columns are read as plain character vectors (not factors).
train <- read.csv(file.path(data_dir, "train.csv"), stringsAsFactors = FALSE)
test <- read.csv(file.path(data_dir, "test.csv"), stringsAsFactors = FALSE)
# Check the dimensions (rows, columns) of the training set
dim(train)
## [1] 1460 81
# Check the dimensions (rows, columns) of the test set
dim(test)
## [1] 1459 80
# Examine the structure of the training data: column names, types, first values
str(train)
## 'data.frame': 1460 obs. of 81 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr "RL" "RL" "RL" "RL" ...
## $ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ LotShape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr NA "TA" "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr NA NA NA NA ...
## $ Fence : chr NA NA NA NA ...
## $ MiscFeature : chr NA NA NA NA ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
# Examine the structure of the test data: column names, types, first values
str(test)
## 'data.frame': 1459 obs. of 80 variables:
## $ Id : int 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 ...
## $ MSSubClass : int 20 20 60 60 120 60 20 60 20 20 ...
## $ MSZoning : chr "RH" "RL" "RL" "RL" ...
## $ LotFrontage : int 80 81 74 78 43 75 NA 63 85 70 ...
## $ LotArea : int 11622 14267 13830 9978 5005 10000 7980 8402 10176 8400 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ LotShape : chr "Reg" "IR1" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "Corner" "Inside" "Inside" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "NAmes" "NAmes" "Gilbert" "Gilbert" ...
## $ Condition1 : chr "Feedr" "Norm" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "1Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : int 5 6 5 6 8 6 6 6 7 4 ...
## $ OverallCond : int 6 6 5 6 5 5 7 5 5 5 ...
## $ YearBuilt : int 1961 1958 1997 1998 1992 1993 1992 1998 1990 1970 ...
## $ YearRemodAdd : int 1961 1958 1998 1998 1992 1994 2007 1998 1990 1970 ...
## $ RoofStyle : chr "Gable" "Hip" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "Wd Sdng" "VinylSd" "VinylSd" ...
## $ Exterior2nd : chr "VinylSd" "Wd Sdng" "VinylSd" "VinylSd" ...
## $ MasVnrType : chr "None" "BrkFace" "None" "BrkFace" ...
## $ MasVnrArea : int 0 108 0 20 0 0 0 0 0 0 ...
## $ ExterQual : chr "TA" "TA" "TA" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "CBlock" "CBlock" "PConc" "PConc" ...
## $ BsmtQual : chr "TA" "TA" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "TA" ...
## $ BsmtExposure : chr "No" "No" "No" "No" ...
## $ BsmtFinType1 : chr "Rec" "ALQ" "GLQ" "GLQ" ...
## $ BsmtFinSF1 : int 468 923 791 602 263 0 935 0 637 804 ...
## $ BsmtFinType2 : chr "LwQ" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : int 144 0 0 0 0 0 0 0 0 78 ...
## $ BsmtUnfSF : int 270 406 137 324 1017 763 233 789 663 0 ...
## $ TotalBsmtSF : int 882 1329 928 926 1280 763 1168 789 1300 882 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "TA" "TA" "Gd" "Ex" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1stFlrSF : int 896 1329 928 926 1280 763 1187 789 1341 882 ...
## $ X2ndFlrSF : int 0 0 701 678 0 892 0 676 0 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 896 1329 1629 1604 1280 1655 1187 1465 1341 882 ...
## $ BsmtFullBath : int 0 0 0 0 0 0 1 0 1 1 ...
## $ BsmtHalfBath : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 1 1 2 2 2 2 2 2 1 1 ...
## $ HalfBath : int 0 1 1 1 0 1 0 1 1 0 ...
## $ BedroomAbvGr : int 2 3 3 3 2 3 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 1 1 ...
## $ KitchenQual : chr "TA" "Gd" "TA" "Gd" ...
## $ TotRmsAbvGrd : int 5 6 6 7 5 7 6 7 5 4 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 0 0 1 1 0 1 0 1 1 0 ...
## $ FireplaceQu : chr NA NA "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Attchd" ...
## $ GarageYrBlt : int 1961 1958 1997 1998 1992 1993 1992 1998 1990 1970 ...
## $ GarageFinish : chr "Unf" "Unf" "Fin" "Fin" ...
## $ GarageCars : int 1 1 2 2 2 2 2 2 2 2 ...
## $ GarageArea : int 730 312 482 470 506 440 420 393 506 525 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : int 140 393 212 360 0 157 483 0 192 240 ...
## $ OpenPorchSF : int 0 36 34 36 82 84 21 75 0 0 ...
## $ EnclosedPorch: int 0 0 0 0 0 0 0 0 0 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ScreenPorch : int 120 0 0 0 144 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr NA NA NA NA ...
## $ Fence : chr "MnPrv" NA "MnPrv" NA ...
## $ MiscFeature : chr NA "Gar2" NA NA ...
## $ MiscVal : int 0 12500 0 0 0 0 500 0 0 0 ...
## $ MoSold : int 6 6 3 6 1 4 3 5 2 4 ...
## $ YrSold : int 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Normal" ...
Just from the structure output above, we can see many NA values, which may affect the analysis.
# Set the response variable aside and drop it from train, then confirm the
# remaining column layout.
SalePrice <- train[["SalePrice"]]
train[["SalePrice"]] <- NULL
str(train)
## 'data.frame': 1460 obs. of 80 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr "RL" "RL" "RL" "RL" ...
## $ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ LotShape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr NA "TA" "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr NA NA NA NA ...
## $ Fence : chr NA NA NA NA ...
## $ MiscFeature : chr NA NA NA NA ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
The training and test sets are combined, transformed, and then split again to ensure consistent factor levels across both subsets. If a level appears in only one of the original datasets, converting the combined data creates a single, unified set of factor levels shared by both.
# SalePrice has normally been copied out of train and removed already, in
# which case reading train$SalePrice again would only yield NULL. Extract it
# here only if the column is still present, then reuse the saved vector.
if ("SalePrice" %in% names(train)) {
  SalePrice <- train$SalePrice
  train$SalePrice <- NULL
}
salePrice <- SalePrice
full_dataSet <- rbind(train, test) # combines the 2 datasets by row
dim(full_dataSet)
## [1] 2919 80
# Preview the first rows of the combined data
head(full_dataSet)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1 1 60 RL 65 8450 Pave <NA> Reg Lvl
## 2 2 20 RL 80 9600 Pave <NA> Reg Lvl
## 3 3 60 RL 68 11250 Pave <NA> IR1 Lvl
## 4 4 70 RL 60 9550 Pave <NA> IR1 Lvl
## 5 5 60 RL 84 14260 Pave <NA> IR1 Lvl
## 6 6 50 RL 85 14115 Pave <NA> IR1 Lvl
## Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 2 AllPub FR2 Gtl Veenker Feedr Norm 1Fam
## 3 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 4 AllPub Corner Gtl Crawfor Norm Norm 1Fam
## 5 AllPub FR2 Gtl NoRidge Norm Norm 1Fam
## 6 AllPub Inside Gtl Mitchel Norm Norm 1Fam
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1 2Story 7 5 2003 2003 Gable CompShg
## 2 1Story 6 8 1976 1976 Gable CompShg
## 3 2Story 7 5 2001 2002 Gable CompShg
## 4 2Story 7 5 1915 1970 Gable CompShg
## 5 2Story 8 5 2000 2000 Gable CompShg
## 6 1.5Fin 5 5 1993 1995 Gable CompShg
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1 VinylSd VinylSd BrkFace 196 Gd TA PConc
## 2 MetalSd MetalSd None 0 TA TA CBlock
## 3 VinylSd VinylSd BrkFace 162 Gd TA PConc
## 4 Wd Sdng Wd Shng None 0 TA TA BrkTil
## 5 VinylSd VinylSd BrkFace 350 Gd TA PConc
## 6 VinylSd VinylSd None 0 TA TA Wood
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1 Gd TA No GLQ 706 Unf
## 2 Gd TA Gd ALQ 978 Unf
## 3 Gd TA Mn GLQ 486 Unf
## 4 TA Gd No ALQ 216 Unf
## 5 Gd TA Av GLQ 655 Unf
## 6 Gd TA No GLQ 732 Unf
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1 0 150 856 GasA Ex Y SBrkr
## 2 0 284 1262 GasA Ex Y SBrkr
## 3 0 434 920 GasA Ex Y SBrkr
## 4 0 540 756 GasA Gd Y SBrkr
## 5 0 490 1145 GasA Ex Y SBrkr
## 6 0 64 796 GasA Ex Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1 856 854 0 1710 1 0 2
## 2 1262 0 0 1262 0 1 2
## 3 920 866 0 1786 1 0 2
## 4 961 756 0 1717 1 0 1
## 5 1145 1053 0 2198 1 0 2
## 6 796 566 0 1362 1 0 1
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1 1 3 1 Gd 8 Typ
## 2 0 3 1 TA 6 Typ
## 3 1 3 1 Gd 6 Typ
## 4 0 3 1 Gd 7 Typ
## 5 1 4 1 Gd 9 Typ
## 6 1 1 1 TA 5 Typ
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1 0 <NA> Attchd 2003 RFn 2
## 2 1 TA Attchd 1976 RFn 2
## 3 1 TA Attchd 2001 RFn 2
## 4 1 Gd Detchd 1998 Unf 3
## 5 1 TA Attchd 2000 RFn 3
## 6 0 <NA> Attchd 1993 Unf 2
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1 548 TA TA Y 0 61
## 2 460 TA TA Y 298 0
## 3 608 TA TA Y 0 42
## 4 642 TA TA Y 0 35
## 5 836 TA TA Y 192 84
## 6 480 TA TA Y 40 30
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1 0 0 0 0 <NA> <NA> <NA>
## 2 0 0 0 0 <NA> <NA> <NA>
## 3 0 0 0 0 <NA> <NA> <NA>
## 4 272 0 0 0 <NA> <NA> <NA>
## 5 0 0 0 0 <NA> <NA> <NA>
## 6 0 320 0 0 <NA> MnPrv Shed
## MiscVal MoSold YrSold SaleType SaleCondition
## 1 0 2 2008 WD Normal
## 2 0 5 2007 WD Normal
## 3 0 9 2008 WD Normal
## 4 0 2 2006 WD Abnorml
## 5 0 12 2008 WD Normal
## 6 700 10 2009 WD Normal
# Recode every character column of the combined data as a factor, mapping
# missing entries to an explicit "no data" level first.
is_text <- vapply(full_dataSet, is.character, logical(1))
for (text_col in names(full_dataSet)[is_text]) {
  values <- full_dataSet[[text_col]]
  full_dataSet[[text_col]] <- as.factor(
    replace(values, is.na(values), "no data")
  )
}
# Split the combined data back into its two parts: the first n_train rows came
# from train, the remaining rows from test. The row count is captured once,
# before train is overwritten, and a logical mask is used instead of
# 1:nrow(...) so the split stays correct even for an empty training set.
n_train <- nrow(train)
is_train_row <- seq_len(nrow(full_dataSet)) <= n_train
train <- full_dataSet[is_train_row, ]
train$SalePrice <- SalePrice # re-attach the response that was set aside
test <- full_dataSet[!is_train_row, ]
Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set.
# Per-column summary: quantiles for numeric columns, level counts for factors
summary(train)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 C (all): 10 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 FV : 65 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 no data: 0 Median : 69.00
## Mean : 730.5 Mean : 56.9 RH : 16 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 RL :1151 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 RM : 218 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape LandContour
## Min. : 1300 Grvl: 6 Grvl : 50 IR1:484 Bnk: 63
## 1st Qu.: 7554 Pave:1454 no data:1369 IR2: 41 HLS: 50
## Median : 9478 Pave : 41 IR3: 10 Low: 36
## Mean : 10517 Reg:925 Lvl:1311
## 3rd Qu.: 11602
## Max. :215245
##
## Utilities LotConfig LandSlope Neighborhood Condition1
## AllPub :1459 Corner : 263 Gtl:1382 NAmes :225 Norm :1260
## no data: 0 CulDSac: 94 Mod: 65 CollgCr:150 Feedr : 81
## NoSeWa : 1 FR2 : 47 Sev: 13 OldTown:113 Artery : 48
## FR3 : 4 Edwards:100 RRAn : 26
## Inside :1052 Somerst: 86 PosN : 19
## Gilbert: 79 RRAe : 11
## (Other):707 (Other): 15
## Condition2 BldgType HouseStyle OverallQual OverallCond
## Norm :1445 1Fam :1220 1Story :726 Min. : 1.000 Min. :1.000
## Feedr : 6 2fmCon: 31 2Story :445 1st Qu.: 5.000 1st Qu.:5.000
## Artery : 2 Duplex: 52 1.5Fin :154 Median : 6.000 Median :5.000
## PosN : 2 Twnhs : 43 SLvl : 65 Mean : 6.099 Mean :5.575
## RRNn : 2 TwnhsE: 114 SFoyer : 37 3rd Qu.: 7.000 3rd Qu.:6.000
## PosA : 1 1.5Unf : 14 Max. :10.000 Max. :9.000
## (Other): 2 (Other): 19
## YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1872 Min. :1950 Flat : 13 CompShg:1434 VinylSd:515
## 1st Qu.:1954 1st Qu.:1967 Gable :1141 Tar&Grv: 11 HdBoard:222
## Median :1973 Median :1994 Gambrel: 11 WdShngl: 6 MetalSd:220
## Mean :1971 Mean :1985 Hip : 286 WdShake: 5 Wd Sdng:206
## 3rd Qu.:2000 3rd Qu.:2004 Mansard: 7 ClyTile: 1 Plywood:108
## Max. :2010 Max. :2010 Shed : 2 Membran: 1 CemntBd: 61
## (Other): 2 (Other):128
## Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## VinylSd:504 BrkCmn : 15 Min. : 0.0 Ex: 52 Ex: 3 BrkTil:146
## MetalSd:214 BrkFace:445 1st Qu.: 0.0 Fa: 14 Fa: 28 CBlock:634
## HdBoard:207 no data: 8 Median : 0.0 Gd:488 Gd: 146 PConc :647
## Wd Sdng:197 None :864 Mean : 103.7 TA:906 Po: 1 Slab : 24
## Plywood:142 Stone :128 3rd Qu.: 166.0 TA:1282 Stone : 6
## CmentBd: 60 Max. :1600.0 Wood : 3
## (Other):136 NA's :8
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## Ex :121 Fa : 45 Av :221 ALQ :220 Min. : 0.0
## Fa : 35 Gd : 65 Gd :134 BLQ :148 1st Qu.: 0.0
## Gd :618 no data: 37 Mn :114 GLQ :418 Median : 383.5
## no data: 37 Po : 2 No :953 LwQ : 74 Mean : 443.6
## TA :649 TA :1311 no data: 38 no data: 37 3rd Qu.: 712.2
## Rec :133 Max. :5644.0
## Unf :430
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## ALQ : 19 Min. : 0.00 Min. : 0.0 Min. : 0.0
## BLQ : 33 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8
## GLQ : 14 Median : 0.00 Median : 477.5 Median : 991.5
## LwQ : 46 Mean : 46.55 Mean : 567.2 Mean :1057.4
## no data: 38 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Rec : 54 Max. :1474.00 Max. :2336.0 Max. :6110.0
## Unf :1256
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## Floor: 1 Ex:741 N: 95 FuseA : 94 Min. : 334 Min. : 0
## GasA :1428 Fa: 49 Y:1365 FuseF : 27 1st Qu.: 882 1st Qu.: 0
## GasW : 18 Gd:241 FuseP : 3 Median :1087 Median : 0
## Grav : 7 Po: 1 Mix : 1 Mean :1163 Mean : 347
## OthW : 2 TA:428 no data: 1 3rd Qu.:1391 3rd Qu.: 728
## Wall : 4 SBrkr :1334 Max. :4692 Max. :2065
##
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## Min. : 0.000 Min. : 334 Min. :0.0000 Min. :0.00000
## 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 0.000 Median :1464 Median :0.0000 Median :0.00000
## Mean : 5.845 Mean :1515 Mean :0.4253 Mean :0.05753
## 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :572.000 Max. :5642 Max. :3.0000 Max. :2.00000
##
## FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual
## Min. :0.000 Min. :0.0000 Min. :0.000 Min. :0.000 Ex :100
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 Fa : 39
## Median :2.000 Median :0.0000 Median :3.000 Median :1.000 Gd :586
## Mean :1.565 Mean :0.3829 Mean :2.866 Mean :1.047 no data: 0
## 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000 TA :735
## Max. :3.000 Max. :2.0000 Max. :8.000 Max. :3.000
##
## TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType
## Min. : 2.000 Typ :1360 Min. :0.000 Ex : 24 2Types : 6
## 1st Qu.: 5.000 Min2 : 34 1st Qu.:0.000 Fa : 33 Attchd :870
## Median : 6.000 Min1 : 31 Median :1.000 Gd :380 Basment: 19
## Mean : 6.518 Mod : 15 Mean :0.613 no data:690 BuiltIn: 88
## 3rd Qu.: 7.000 Maj1 : 14 3rd Qu.:1.000 Po : 20 CarPort: 9
## Max. :14.000 Maj2 : 5 Max. :3.000 TA :313 Detchd :387
## (Other): 1 no data: 81
## GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## Min. :1900 Fin :352 Min. :0.000 Min. : 0.0 Ex : 3
## 1st Qu.:1961 no data: 81 1st Qu.:1.000 1st Qu.: 334.5 Fa : 48
## Median :1980 RFn :422 Median :2.000 Median : 480.0 Gd : 14
## Mean :1979 Unf :605 Mean :1.767 Mean : 473.0 no data: 81
## 3rd Qu.:2002 3rd Qu.:2.000 3rd Qu.: 576.0 Po : 3
## Max. :2010 Max. :4.000 Max. :1418.0 TA :1311
## NA's :81
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
## Ex : 2 N: 90 Min. : 0.00 Min. : 0.00 Min. : 0.00
## Fa : 35 P: 30 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Gd : 9 Y:1340 Median : 0.00 Median : 25.00 Median : 0.00
## no data: 81 Mean : 94.24 Mean : 46.66 Mean : 21.95
## Po : 7 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00
## TA :1326 Max. :857.00 Max. :547.00 Max. :552.00
##
## X3SsnPorch ScreenPorch PoolArea PoolQC
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Ex : 2
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2
## Median : 0.00 Median : 0.00 Median : 0.000 Gd : 3
## Mean : 3.41 Mean : 15.06 Mean : 2.759 no data:1453
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :508.00 Max. :480.00 Max. :738.000
##
## Fence MiscFeature MiscVal MoSold
## GdPrv : 59 Gar2 : 2 Min. : 0.00 Min. : 1.000
## GdWo : 54 no data:1406 1st Qu.: 0.00 1st Qu.: 5.000
## MnPrv : 157 Othr : 2 Median : 0.00 Median : 6.000
## MnWw : 11 Shed : 49 Mean : 43.49 Mean : 6.322
## no data:1179 TenC : 1 3rd Qu.: 0.00 3rd Qu.: 8.000
## Max. :15500.00 Max. :12.000
##
## YrSold SaleType SaleCondition SalePrice
## Min. :2006 WD :1267 Abnorml: 101 Min. : 34900
## 1st Qu.:2007 New : 122 AdjLand: 4 1st Qu.:129975
## Median :2008 COD : 43 Alloca : 12 Median :163000
## Mean :2008 ConLD : 9 Family : 20 Mean :180921
## 3rd Qu.:2009 ConLI : 5 Normal :1198 3rd Qu.:214000
## Max. :2010 ConLw : 5 Partial: 125 Max. :755000
## (Other): 9
# Replace every NA that is still present with the sentinel value -1, in both
# data sets, then re-check the summary.
# NOTE(review): the sentinel takes part in all later numeric computations;
# the summaries show e.g. the LotFrontage mean moving from 70.05 to 57.45.
train <- replace(train, is.na(train), -1)
test <- replace(test, is.na(test), -1)
summary(train)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 C (all): 10 Min. : -1.00
## 1st Qu.: 365.8 1st Qu.: 20.0 FV : 65 1st Qu.: 42.00
## Median : 730.5 Median : 50.0 no data: 0 Median : 63.00
## Mean : 730.5 Mean : 56.9 RH : 16 Mean : 57.45
## 3rd Qu.:1095.2 3rd Qu.: 70.0 RL :1151 3rd Qu.: 79.00
## Max. :1460.0 Max. :190.0 RM : 218 Max. :313.00
##
## LotArea Street Alley LotShape LandContour
## Min. : 1300 Grvl: 6 Grvl : 50 IR1:484 Bnk: 63
## 1st Qu.: 7554 Pave:1454 no data:1369 IR2: 41 HLS: 50
## Median : 9478 Pave : 41 IR3: 10 Low: 36
## Mean : 10517 Reg:925 Lvl:1311
## 3rd Qu.: 11602
## Max. :215245
##
## Utilities LotConfig LandSlope Neighborhood Condition1
## AllPub :1459 Corner : 263 Gtl:1382 NAmes :225 Norm :1260
## no data: 0 CulDSac: 94 Mod: 65 CollgCr:150 Feedr : 81
## NoSeWa : 1 FR2 : 47 Sev: 13 OldTown:113 Artery : 48
## FR3 : 4 Edwards:100 RRAn : 26
## Inside :1052 Somerst: 86 PosN : 19
## Gilbert: 79 RRAe : 11
## (Other):707 (Other): 15
## Condition2 BldgType HouseStyle OverallQual OverallCond
## Norm :1445 1Fam :1220 1Story :726 Min. : 1.000 Min. :1.000
## Feedr : 6 2fmCon: 31 2Story :445 1st Qu.: 5.000 1st Qu.:5.000
## Artery : 2 Duplex: 52 1.5Fin :154 Median : 6.000 Median :5.000
## PosN : 2 Twnhs : 43 SLvl : 65 Mean : 6.099 Mean :5.575
## RRNn : 2 TwnhsE: 114 SFoyer : 37 3rd Qu.: 7.000 3rd Qu.:6.000
## PosA : 1 1.5Unf : 14 Max. :10.000 Max. :9.000
## (Other): 2 (Other): 19
## YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1872 Min. :1950 Flat : 13 CompShg:1434 VinylSd:515
## 1st Qu.:1954 1st Qu.:1967 Gable :1141 Tar&Grv: 11 HdBoard:222
## Median :1973 Median :1994 Gambrel: 11 WdShngl: 6 MetalSd:220
## Mean :1971 Mean :1985 Hip : 286 WdShake: 5 Wd Sdng:206
## 3rd Qu.:2000 3rd Qu.:2004 Mansard: 7 ClyTile: 1 Plywood:108
## Max. :2010 Max. :2010 Shed : 2 Membran: 1 CemntBd: 61
## (Other): 2 (Other):128
## Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## VinylSd:504 BrkCmn : 15 Min. : -1.0 Ex: 52 Ex: 3 BrkTil:146
## MetalSd:214 BrkFace:445 1st Qu.: 0.0 Fa: 14 Fa: 28 CBlock:634
## HdBoard:207 no data: 8 Median : 0.0 Gd:488 Gd: 146 PConc :647
## Wd Sdng:197 None :864 Mean : 103.1 TA:906 Po: 1 Slab : 24
## Plywood:142 Stone :128 3rd Qu.: 164.2 TA:1282 Stone : 6
## CmentBd: 60 Max. :1600.0 Wood : 3
## (Other):136
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## Ex :121 Fa : 45 Av :221 ALQ :220 Min. : 0.0
## Fa : 35 Gd : 65 Gd :134 BLQ :148 1st Qu.: 0.0
## Gd :618 no data: 37 Mn :114 GLQ :418 Median : 383.5
## no data: 37 Po : 2 No :953 LwQ : 74 Mean : 443.6
## TA :649 TA :1311 no data: 38 no data: 37 3rd Qu.: 712.2
## Rec :133 Max. :5644.0
## Unf :430
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## ALQ : 19 Min. : 0.00 Min. : 0.0 Min. : 0.0
## BLQ : 33 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8
## GLQ : 14 Median : 0.00 Median : 477.5 Median : 991.5
## LwQ : 46 Mean : 46.55 Mean : 567.2 Mean :1057.4
## no data: 38 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Rec : 54 Max. :1474.00 Max. :2336.0 Max. :6110.0
## Unf :1256
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## Floor: 1 Ex:741 N: 95 FuseA : 94 Min. : 334 Min. : 0
## GasA :1428 Fa: 49 Y:1365 FuseF : 27 1st Qu.: 882 1st Qu.: 0
## GasW : 18 Gd:241 FuseP : 3 Median :1087 Median : 0
## Grav : 7 Po: 1 Mix : 1 Mean :1163 Mean : 347
## OthW : 2 TA:428 no data: 1 3rd Qu.:1391 3rd Qu.: 728
## Wall : 4 SBrkr :1334 Max. :4692 Max. :2065
##
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## Min. : 0.000 Min. : 334 Min. :0.0000 Min. :0.00000
## 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 0.000 Median :1464 Median :0.0000 Median :0.00000
## Mean : 5.845 Mean :1515 Mean :0.4253 Mean :0.05753
## 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :572.000 Max. :5642 Max. :3.0000 Max. :2.00000
##
## FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual
## Min. :0.000 Min. :0.0000 Min. :0.000 Min. :0.000 Ex :100
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 Fa : 39
## Median :2.000 Median :0.0000 Median :3.000 Median :1.000 Gd :586
## Mean :1.565 Mean :0.3829 Mean :2.866 Mean :1.047 no data: 0
## 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000 TA :735
## Max. :3.000 Max. :2.0000 Max. :8.000 Max. :3.000
##
## TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType
## Min. : 2.000 Typ :1360 Min. :0.000 Ex : 24 2Types : 6
## 1st Qu.: 5.000 Min2 : 34 1st Qu.:0.000 Fa : 33 Attchd :870
## Median : 6.000 Min1 : 31 Median :1.000 Gd :380 Basment: 19
## Mean : 6.518 Mod : 15 Mean :0.613 no data:690 BuiltIn: 88
## 3rd Qu.: 7.000 Maj1 : 14 3rd Qu.:1.000 Po : 20 CarPort: 9
## Max. :14.000 Maj2 : 5 Max. :3.000 TA :313 Detchd :387
## (Other): 1 no data: 81
## GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## Min. : -1 Fin :352 Min. :0.000 Min. : 0.0 Ex : 3
## 1st Qu.:1958 no data: 81 1st Qu.:1.000 1st Qu.: 334.5 Fa : 48
## Median :1977 RFn :422 Median :2.000 Median : 480.0 Gd : 14
## Mean :1869 Unf :605 Mean :1.767 Mean : 473.0 no data: 81
## 3rd Qu.:2001 3rd Qu.:2.000 3rd Qu.: 576.0 Po : 3
## Max. :2010 Max. :4.000 Max. :1418.0 TA :1311
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
## Ex : 2 N: 90 Min. : 0.00 Min. : 0.00 Min. : 0.00
## Fa : 35 P: 30 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Gd : 9 Y:1340 Median : 0.00 Median : 25.00 Median : 0.00
## no data: 81 Mean : 94.24 Mean : 46.66 Mean : 21.95
## Po : 7 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00
## TA :1326 Max. :857.00 Max. :547.00 Max. :552.00
##
## X3SsnPorch ScreenPorch PoolArea PoolQC
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Ex : 2
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2
## Median : 0.00 Median : 0.00 Median : 0.000 Gd : 3
## Mean : 3.41 Mean : 15.06 Mean : 2.759 no data:1453
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :508.00 Max. :480.00 Max. :738.000
##
## Fence MiscFeature MiscVal MoSold
## GdPrv : 59 Gar2 : 2 Min. : 0.00 Min. : 1.000
## GdWo : 54 no data:1406 1st Qu.: 0.00 1st Qu.: 5.000
## MnPrv : 157 Othr : 2 Median : 0.00 Median : 6.000
## MnWw : 11 Shed : 49 Mean : 43.49 Mean : 6.322
## no data:1179 TenC : 1 3rd Qu.: 0.00 3rd Qu.: 8.000
## Max. :15500.00 Max. :12.000
##
## YrSold SaleType SaleCondition SalePrice
## Min. :2006 WD :1267 Abnorml: 101 Min. : 34900
## 1st Qu.:2007 New : 122 AdjLand: 4 1st Qu.:129975
## Median :2008 COD : 43 Alloca : 12 Median :163000
## Mean :2008 ConLD : 9 Family : 20 Mean :180921
## 3rd Qu.:2009 ConLI : 5 Normal :1198 3rd Qu.:214000
## Max. :2010 ConLw : 5 Partial: 125 Max. :755000
## (Other): 9
# Kernel density estimate of the YearBuilt column
plot(density(train$YearBuilt))
Provide a scatterplot matrix for at least two of the independent variables and the dependent variable.
# Scatterplot matrix of the dependent variable and two independent variables.
# Each variable gets its own panel row/column; adding GrLivArea (square feet)
# to TotRmsAbvGrd (a room count) on a single axis has no meaningful unit.
# Points are coloured by CentralAir.
pairs(
  train[, c("SalePrice", "GrLivArea", "TotRmsAbvGrd")],
  col = ifelse(train$CentralAir == "Y", "steelblue", "tomato"),
  pch = 19,
  cex = 0.5,
  main = "SalePrice, GrLivArea, TotRmsAbvGrd (blue = central air, red = none)"
)
# List every numeric column whose absolute Pearson correlation with SalePrice
# exceeds 0.5, printing the column name followed by the coefficient.
numeric_columns <- names(train)[vapply(train, is.numeric, logical(1))]
for (column_name in numeric_columns) {
  r_with_price <- cor(train[[column_name]], train$SalePrice)
  if (abs(r_with_price) > 0.5) {
    print(column_name)
    print(r_with_price)
  }
}
## [1] "OverallQual"
## [1] 0.7909816
## [1] "YearBuilt"
## [1] 0.5228973
## [1] "YearRemodAdd"
## [1] 0.507101
## [1] "TotalBsmtSF"
## [1] 0.6135806
## [1] "X1stFlrSF"
## [1] 0.6058522
## [1] "GrLivArea"
## [1] 0.7086245
## [1] "FullBath"
## [1] 0.5606638
## [1] "TotRmsAbvGrd"
## [1] 0.5337232
## [1] "GarageCars"
## [1] 0.6404092
## [1] "GarageArea"
## [1] 0.6234314
## [1] "SalePrice"
## [1] 1
# List every numeric column that is nearly uncorrelated with SalePrice
# (absolute Pearson correlation below 0.1): name first, then the coefficient.
for (column_name in colnames(train)) {
  column_values <- train[[column_name]]
  if (!is.numeric(column_values)) {
    next
  }
  r_with_price <- cor(column_values, train$SalePrice)
  if (abs(r_with_price) < 0.1) {
    print(column_name)
    print(r_with_price)
  }
}
## [1] "Id"
## [1] -0.02191672
## [1] "MSSubClass"
## [1] -0.08428414
## [1] "OverallCond"
## [1] -0.07785589
## [1] "BsmtFinSF2"
## [1] -0.01137812
## [1] "LowQualFinSF"
## [1] -0.02560613
## [1] "BsmtHalfBath"
## [1] -0.01684415
## [1] "X3SsnPorch"
## [1] 0.04458367
## [1] "PoolArea"
## [1] 0.09240355
## [1] "MiscVal"
## [1] -0.02118958
## [1] "MoSold"
## [1] 0.04643225
## [1] "YrSold"
## [1] -0.02892259
Derive a correlation matrix for any three quantitative variables in the dataset. Source: http://www.sthda.com/english/wiki/correlation-matrix-a-quick-start-guide-to-analyze-format-and-visualize-a-correlation-matrix-using-r-software
# Pearson correlation matrix for the response and two quantitative predictors.
cor_columns <- c("SalePrice", "GrLivArea", "TotRmsAbvGrd")
corDF <- cor(train[, cor_columns])
print(corDF)
## SalePrice GrLivArea TotRmsAbvGrd
## SalePrice 1.0000000 0.7086245 0.5337232
## GrLivArea 0.7086245 1.0000000 0.8254894
## TotRmsAbvGrd 0.5337232 0.8254894 1.0000000
# Visualise the correlation matrix, showing each coefficient as a number.
corrplot(corr = corDF, method = "number")
Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not? Source: https://bookdown.org/ndphillips/YaRrr/correlation-cor-test.html
# Test H0: correlation = 0 for SalePrice vs. GrLivArea and report an
# 80% confidence interval.
cor1 <- cor.test(
  formula = ~ SalePrice + GrLivArea,
  data = train,
  conf.level = 0.80
)
print(cor1)
##
## Pearson's product-moment correlation
##
## data: SalePrice and GrLivArea
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6915087 0.7249450
## sample estimates:
## cor
## 0.7086245
# Test H0: correlation = 0 for SalePrice vs. TotRmsAbvGrd and report an
# 80% confidence interval.
cor2 <- cor.test(
  formula = ~ SalePrice + TotRmsAbvGrd,
  data = train,
  conf.level = 0.80
)
print(cor2)
##
## Pearson's product-moment correlation
##
## data: SalePrice and TotRmsAbvGrd
## t = 24.099, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5092841 0.5573021
## sample estimates:
## cor
## 0.5337232
My analysis: A high correlation coefficient suggests a strong linear association between the variables. It indicates that the variables tend to move together in a consistent pattern. A low p-value provides evidence against the null hypothesis, indicating that the observed correlation is unlikely to have occurred by chance. It suggests that the relationship observed in the data is statistically significant. In other words, there is a low probability that the observed correlation is due to random variation alone. Here both tests give p-values below 2.2e-16, and neither 80% confidence interval (about 0.69 to 0.72 for GrLivArea and 0.51 to 0.56 for TotRmsAbvGrd) contains 0, so the null hypothesis of zero correlation is rejected in each case. Therefore, I think above-ground living area (in square feet) and total rooms above grade (not including bathrooms) are good indicators of house prices.
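The familywise error rate is the probability of making at least one false positive (Type I) error across a set of tests. In other words, it measures the probability of wrongly concluding that there is a significant relationship or difference between variables when, in fact, there isn't. Because we rejected the null hypothesis in every test, a Type I error is possible in principle. However, the p-values are below 2.2e-16, so even after a Bonferroni correction for the number of pairwise tests the results would remain significant; I would therefore not be very worried about familywise error here.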
Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.)
# Invert the correlation matrix to obtain the precision matrix, then show it.
inverse_matrix <- solve(a = corDF)
print(x = inverse_matrix)
## SalePrice GrLivArea TotRmsAbvGrd
## SalePrice 2.0424418 -1.718505 0.3285093
## GrLivArea -1.7185052 4.585000 -2.8676627
## TotRmsAbvGrd 0.3285093 -2.867663 3.1918921
Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
# Correlation matrix times precision matrix; should be the identity up to
# floating-point error.
multiply1 <- corDF %*% inverse_matrix
print(multiply1)
## SalePrice GrLivArea TotRmsAbvGrd
## SalePrice 1.000000e+00 0 0
## GrLivArea -5.551115e-17 1 0
## TotRmsAbvGrd -1.665335e-16 0 1
# Precision matrix times correlation matrix; should again be the identity up
# to floating-point error.
multiply2 <- inverse_matrix %*% corDF
print(multiply2)
## SalePrice GrLivArea TotRmsAbvGrd
## SalePrice 1.000000e+00 -3.330669e-16 -4.440892e-16
## GrLivArea 2.220446e-16 1.000000e+00 4.440892e-16
## TotRmsAbvGrd -2.220446e-16 -4.440892e-16 1.000000e+00
# LU decomposition (Matrix package) of the first product; the bare name on the
# next line auto-prints the factorization object.
lu1 <- lu(multiply1)
lu1
## 'MatrixFactorization' of Formal class 'denseLU' [package "Matrix"] with 4 slots
## ..@ Dimnames:List of 2
## .. ..$ : chr [1:3] "SalePrice" "GrLivArea" "TotRmsAbvGrd"
## .. ..$ : chr [1:3] "SalePrice" "GrLivArea" "TotRmsAbvGrd"
## ..@ x : num [1:9] 1.00 -5.55e-17 -1.67e-16 0.00 1.00 ...
## ..@ perm : int [1:3] 1 2 3
## ..@ Dim : int [1:2] 3 3
# LU decomposition (Matrix package) of the second product; the bare name on
# the next line auto-prints the factorization object.
lu2 <- lu(multiply2)
lu2
## 'MatrixFactorization' of Formal class 'denseLU' [package "Matrix"] with 4 slots
## ..@ Dimnames:List of 2
## .. ..$ : chr [1:3] "SalePrice" "GrLivArea" "TotRmsAbvGrd"
## .. ..$ : chr [1:3] "SalePrice" "GrLivArea" "TotRmsAbvGrd"
## ..@ x : num [1:9] 1.00 2.22e-16 -2.22e-16 -3.33e-16 1.00 ...
## ..@ perm : int [1:3] 1 2 3
## ..@ Dim : int [1:2] 3 3
Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).
# Inspect several candidate variables for right skew. The `train$...`
# expressions are passed directly so the automatic plot titles are unchanged.
hist(x = train$GrLivArea, breaks = 20)
hist(x = train$TotRmsAbvGrd, breaks = 20)
hist(x = train$LotArea, breaks = 20)
hist(x = train$BsmtFinSF1, breaks = 20)

# Sample skewness (moments package) of the chosen variable; a positive value
# indicates a right-skewed distribution.
skewness_value <- skewness(train$BsmtFinSF1)
print(skewness_value)
## [1] 1.683771
If the skewness value is significantly positive (greater than zero), it suggests that the variable is skewed to the right. For BsmtFinSF1, skewness is 1.68, therefore it's right skewed.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Fit an exponential density to BsmtFinSF1; the result auto-prints the rate
# estimate with its standard error in parentheses underneath.
fitdistr(x = train$BsmtFinSF1, densfun = "exponential")
## rate
## 0.002254081
## (0.000058992)
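The first value printed above (0.002254081) is the maximum-likelihood estimate of the rate λ; the value in parentheses beneath it (0.000058992) is the standard error of that estimate, not an alternative rate. I therefore use 0.002254081 as λ.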
Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)).
# Draw 1000 samples from the fitted exponential distribution.
# The rate is computed from the data instead of being pasted in as a rounded
# constant: the maximum-likelihood rate of an exponential distribution is
# 1 / mean(x), which is the closed-form estimate fitdistr() reports.
# A fixed seed makes the simulated sample (and every plot built on it)
# reproducible from run to run.
set.seed(605)
lambda_hat <- 1 / mean(train$BsmtFinSF1)
exponent <- rexp(1000, rate = lambda_hat)
head(exponent)
## [1] 337.03436 52.95163 1009.60383 101.03644 116.33481 714.47480
Plot a histogram and compare it with a histogram of your original variable.
# Histogram of the simulated exponential sample on its own.
hist(exponent)

# Overlay of the original variable and the simulated sample on one set of
# axes. Semi-transparent fills keep the bars of both histograms visible where
# they overlap.
original_fill <- adjustcolor("blue", alpha.f = 0.5)
simulated_fill <- adjustcolor("red", alpha.f = 0.5)

# Create the first histogram
hist(
  train$BsmtFinSF1,
  col = original_fill,
  main = "Histogram Comparison",
  xlab = "Value",
  ylim = c(0, 350),
  breaks = 30
)
# Add the second histogram on the same plot. `add = TRUE` is all that is
# needed; a preceding `par(new = TRUE)` is not consumed by an added histogram
# and would make the next high-level plot draw over this figure.
hist(exponent, col = simulated_fill, add = TRUE, breaks = 30)
legend(
  "topright",
  legend = c("Original", "Exponential"),
  fill = c(original_fill, simulated_fill)
)
Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality.
# 5th percentile of the fitted exponential distribution (inverse CDF).
qexp(p = 0.05, rate = 0.002254081)
## [1] 22.75575
# 95th percentile of the fitted exponential distribution (inverse CDF).
qexp(p = 0.95, rate = 0.002254081)
## [1] 1329.026
Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
# Empirical 5th and 95th percentiles of the observed variable.
percentile_levels <- c(0.05, 0.95)
quantile(x = train$BsmtFinSF1, probs = percentile_levels)
## 5% 95%
## 0 1274
# Multiple linear regression of SalePrice on above-ground living area and
# total rooms above grade. The predictors are named bare in the formula and
# resolved through `data = train`; writing them as `train$...` would tie the
# model to the training vectors, so predict(lm_model, newdata = ...) could not
# pick the columns up from the new data. The fit itself is unchanged, so the
# echoed output below differs from the previous rendering only in its labels.
(lm_model <- lm(SalePrice ~ GrLivArea + TotRmsAbvGrd, data = train))
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotRmsAbvGrd, data = train)
##
## Coefficients:
## (Intercept) GrLivArea TotRmsAbvGrd
## 39387.8 127.2 -7861.3
summary(lm_model)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotRmsAbvGrd, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -502732 -29360 -1264 20007 358035
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39387.79 6118.10 6.438 1.64e-10 ***
## GrLivArea 127.20 4.91 25.906 < 2e-16 ***
## TotRmsAbvGrd -7861.26 1587.41 -4.952 8.19e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 55630 on 1457 degrees of freedom
## Multiple R-squared: 0.5104, Adjusted R-squared: 0.5097
## F-statistic: 759.4 on 2 and 1457 DF, p-value: < 2.2e-16
# Select the predictor variables for the regression model
predictors <- c("GrLivArea", "TotRmsAbvGrd")
# Create a new data frame with the predictor variables and the response
# variable. All rows are kept: a negative row index (`train[-1, ...]`) removes
# the first observation rather than a column, which silently shrank the
# training set to 1459 rows.
regression_data <- train[, c(predictors, "SalePrice")]
# Remove rows with missing values
regression_data <- na.omit(regression_data)
# Fit the multiple regression model
model <- lm(SalePrice ~ ., data = regression_data)
# Print the model summary
summary(model)
##
## Call:
## lm(formula = SalePrice ~ ., data = regression_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -502732 -29360 -1264 20007 358035
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39387.79 6118.10 6.438 1.64e-10 ***
## GrLivArea 127.20 4.91 25.906 < 2e-16 ***
## TotRmsAbvGrd -7861.26 1587.41 -4.952 8.19e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 55630 on 1457 degrees of freedom
## Multiple R-squared: 0.5104, Adjusted R-squared: 0.5097
## F-statistic: 759.4 on 2 and 1457 DF, p-value: < 2.2e-16
# Read the sample_submission file
sample_submission <- read.csv("/Users/Sangeetha/Downloads/house-prices-advanced-regression-techniques/sample_submission.csv")
# The submission must contain one prediction per test-set house, in the same
# order as the sample submission; stop early if the two files disagree.
stopifnot(
  nrow(test) == nrow(sample_submission),
  all(test$Id == sample_submission$Id)
)
# Create a new data frame with only the "Id" column (Kaggle expects the header
# to be spelled exactly "Id")
prediction_df <- data.frame(Id = sample_submission$Id)
# Predict SalePrice for the TEST set. Passing the training data as `newdata`
# would only reproduce fitted values for training houses and pair them with
# unrelated test Ids.
prediction_df$SalePrice <- predict(model, newdata = test)
# A missing prediction would make the submission invalid
stopifnot(!anyNA(prediction_df$SalePrice))
# Write the predictions to a CSV file
write.csv(prediction_df, file = "predictions.csv", row.names = FALSE)
# Verify the number of rows in the predictions file
num_rows <- nrow(prediction_df)
print(num_rows)
## [1] 1459
Sangeetha's Kaggle Score.