You are to register for Kaggle.com and compete in the House Prices: Advanced Regression Techniques competition: https://www.kaggle.com/c/house-prices-advanced-regression-techniques

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(corrplot)
## corrplot 0.92 loaded
library(moments)
library(Matrix)
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
# Load the Kaggle training and test CSVs, keeping text columns as plain
# character vectors rather than factors
load_raw_csv <- function(csv_path) {
  read.csv(csv_path, stringsAsFactors = FALSE)
}
train <- load_raw_csv("/Users/Sangeetha/Downloads/house-prices-advanced-regression-techniques/train.csv")
test <- load_raw_csv("/Users/Sangeetha/Downloads/house-prices-advanced-regression-techniques/test.csv")
# Rows and columns of each data set
dim(train)
## [1] 1460   81
dim(test)
## [1] 1459   80
# Column types and leading values of the training data
str(train)
## 'data.frame':    1460 obs. of  81 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : chr  "RL" "RL" "RL" "RL" ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley        : chr  NA NA NA NA ...
##  $ LotShape     : chr  "Reg" "Reg" "IR1" "IR1" ...
##  $ LandContour  : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr  "Inside" "FR2" "Inside" "Corner" ...
##  $ LandSlope    : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr  "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ Condition1   : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition2   : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr  "2Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : chr  "Gable" "Gable" "Gable" "Gable" ...
##  $ RoofMatl     : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ Exterior2nd  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ MasVnrType   : chr  "BrkFace" "None" "BrkFace" "None" ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : chr  "Gd" "TA" "Gd" "TA" ...
##  $ ExterCond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr  "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ BsmtQual     : chr  "Gd" "Gd" "Gd" "TA" ...
##  $ BsmtCond     : chr  "TA" "TA" "TA" "Gd" ...
##  $ BsmtExposure : chr  "No" "Gd" "Mn" "No" ...
##  $ BsmtFinType1 : chr  "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : chr  "Unf" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr  "Ex" "Ex" "Ex" "Gd" ...
##  $ CentralAir   : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : chr  "Gd" "TA" "Gd" "Gd" ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : chr  NA "TA" "TA" "Gd" ...
##  $ GarageType   : chr  "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : chr  "RFn" "RFn" "RFn" "Unf" ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : chr  "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr  "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr  "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : chr  NA NA NA NA ...
##  $ Fence        : chr  NA NA NA NA ...
##  $ MiscFeature  : chr  NA NA NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : chr  "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr  "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
# Examine the structure of the test data in the same way
str(test)
## 'data.frame':    1459 obs. of  80 variables:
##  $ Id           : int  1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 ...
##  $ MSSubClass   : int  20 20 60 60 120 60 20 60 20 20 ...
##  $ MSZoning     : chr  "RH" "RL" "RL" "RL" ...
##  $ LotFrontage  : int  80 81 74 78 43 75 NA 63 85 70 ...
##  $ LotArea      : int  11622 14267 13830 9978 5005 10000 7980 8402 10176 8400 ...
##  $ Street       : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley        : chr  NA NA NA NA ...
##  $ LotShape     : chr  "Reg" "IR1" "IR1" "IR1" ...
##  $ LandContour  : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr  "Inside" "Corner" "Inside" "Inside" ...
##  $ LandSlope    : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr  "NAmes" "NAmes" "Gilbert" "Gilbert" ...
##  $ Condition1   : chr  "Feedr" "Norm" "Norm" "Norm" ...
##  $ Condition2   : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr  "1Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : int  5 6 5 6 8 6 6 6 7 4 ...
##  $ OverallCond  : int  6 6 5 6 5 5 7 5 5 5 ...
##  $ YearBuilt    : int  1961 1958 1997 1998 1992 1993 1992 1998 1990 1970 ...
##  $ YearRemodAdd : int  1961 1958 1998 1998 1992 1994 2007 1998 1990 1970 ...
##  $ RoofStyle    : chr  "Gable" "Hip" "Gable" "Gable" ...
##  $ RoofMatl     : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr  "VinylSd" "Wd Sdng" "VinylSd" "VinylSd" ...
##  $ Exterior2nd  : chr  "VinylSd" "Wd Sdng" "VinylSd" "VinylSd" ...
##  $ MasVnrType   : chr  "None" "BrkFace" "None" "BrkFace" ...
##  $ MasVnrArea   : int  0 108 0 20 0 0 0 0 0 0 ...
##  $ ExterQual    : chr  "TA" "TA" "TA" "TA" ...
##  $ ExterCond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr  "CBlock" "CBlock" "PConc" "PConc" ...
##  $ BsmtQual     : chr  "TA" "TA" "Gd" "TA" ...
##  $ BsmtCond     : chr  "TA" "TA" "TA" "TA" ...
##  $ BsmtExposure : chr  "No" "No" "No" "No" ...
##  $ BsmtFinType1 : chr  "Rec" "ALQ" "GLQ" "GLQ" ...
##  $ BsmtFinSF1   : int  468 923 791 602 263 0 935 0 637 804 ...
##  $ BsmtFinType2 : chr  "LwQ" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : int  144 0 0 0 0 0 0 0 0 78 ...
##  $ BsmtUnfSF    : int  270 406 137 324 1017 763 233 789 663 0 ...
##  $ TotalBsmtSF  : int  882 1329 928 926 1280 763 1168 789 1300 882 ...
##  $ Heating      : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr  "TA" "TA" "Gd" "Ex" ...
##  $ CentralAir   : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1stFlrSF    : int  896 1329 928 926 1280 763 1187 789 1341 882 ...
##  $ X2ndFlrSF    : int  0 0 701 678 0 892 0 676 0 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  896 1329 1629 1604 1280 1655 1187 1465 1341 882 ...
##  $ BsmtFullBath : int  0 0 0 0 0 0 1 0 1 1 ...
##  $ BsmtHalfBath : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  1 1 2 2 2 2 2 2 1 1 ...
##  $ HalfBath     : int  0 1 1 1 0 1 0 1 1 0 ...
##  $ BedroomAbvGr : int  2 3 3 3 2 3 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ KitchenQual  : chr  "TA" "Gd" "TA" "Gd" ...
##  $ TotRmsAbvGrd : int  5 6 6 7 5 7 6 7 5 4 ...
##  $ Functional   : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : int  0 0 1 1 0 1 0 1 1 0 ...
##  $ FireplaceQu  : chr  NA NA "TA" "Gd" ...
##  $ GarageType   : chr  "Attchd" "Attchd" "Attchd" "Attchd" ...
##  $ GarageYrBlt  : int  1961 1958 1997 1998 1992 1993 1992 1998 1990 1970 ...
##  $ GarageFinish : chr  "Unf" "Unf" "Fin" "Fin" ...
##  $ GarageCars   : int  1 1 2 2 2 2 2 2 2 2 ...
##  $ GarageArea   : int  730 312 482 470 506 440 420 393 506 525 ...
##  $ GarageQual   : chr  "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr  "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr  "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : int  140 393 212 360 0 157 483 0 192 240 ...
##  $ OpenPorchSF  : int  0 36 34 36 82 84 21 75 0 0 ...
##  $ EnclosedPorch: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ScreenPorch  : int  120 0 0 0 144 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : chr  NA NA NA NA ...
##  $ Fence        : chr  "MnPrv" NA "MnPrv" NA ...
##  $ MiscFeature  : chr  NA "Gar2" NA NA ...
##  $ MiscVal      : int  0 12500 0 0 0 0 500 0 0 0 ...
##  $ MoSold       : int  6 6 3 6 1 4 3 5 2 4 ...
##  $ YrSold       : int  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ SaleType     : chr  "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr  "Normal" "Normal" "Normal" "Normal" ...

Just by observing the data above, we can see many NA values, which may affect the analysis.

# Set the response variable aside and drop it from train, leaving only the
# predictor columns
SalePrice <- train[["SalePrice"]]
train[["SalePrice"]] <- NULL
str(train)
## 'data.frame':    1460 obs. of  80 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : chr  "RL" "RL" "RL" "RL" ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley        : chr  NA NA NA NA ...
##  $ LotShape     : chr  "Reg" "Reg" "IR1" "IR1" ...
##  $ LandContour  : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr  "Inside" "FR2" "Inside" "Corner" ...
##  $ LandSlope    : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr  "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ Condition1   : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition2   : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr  "2Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : chr  "Gable" "Gable" "Gable" "Gable" ...
##  $ RoofMatl     : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ Exterior2nd  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ MasVnrType   : chr  "BrkFace" "None" "BrkFace" "None" ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : chr  "Gd" "TA" "Gd" "TA" ...
##  $ ExterCond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr  "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ BsmtQual     : chr  "Gd" "Gd" "Gd" "TA" ...
##  $ BsmtCond     : chr  "TA" "TA" "TA" "Gd" ...
##  $ BsmtExposure : chr  "No" "Gd" "Mn" "No" ...
##  $ BsmtFinType1 : chr  "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : chr  "Unf" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr  "Ex" "Ex" "Ex" "Gd" ...
##  $ CentralAir   : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : chr  "Gd" "TA" "Gd" "Gd" ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : chr  NA "TA" "TA" "Gd" ...
##  $ GarageType   : chr  "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : chr  "RFn" "RFn" "RFn" "Unf" ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : chr  "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr  "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr  "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : chr  NA NA NA NA ...
##  $ Fence        : chr  NA NA NA NA ...
##  $ MiscFeature  : chr  NA NA NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : chr  "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr  "Normal" "Normal" "Normal" "Abnorml" ...

The training and test sets are combined before the categorical columns are converted to factors, and then split again afterwards, to ensure consistent factor levels across both subsets. If a category appears in only one of the original datasets, converting them separately would produce different level sets; converting the combined data creates a single, unified set of factor levels.

# The target may already have been set aside by an earlier chunk. Only extract
# it while the column is still present, so the stored prices are never replaced
# by NULL (which is what train$SalePrice yields once the column is gone).
if ("SalePrice" %in% names(train)) {
  SalePrice <- train$SalePrice
  train$SalePrice <- NULL
}
# Alias kept for any code that uses the lower-case spelling
salePrice <- SalePrice
# Stack the training rows on top of the test rows; rbind() requires both data
# frames to have the same columns, hence the removal of SalePrice above
full_dataSet <- rbind(train, test)
dim(full_dataSet)
## [1] 2919   80
head(full_dataSet)
##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1  1         60       RL          65    8450   Pave  <NA>      Reg         Lvl
## 2  2         20       RL          80    9600   Pave  <NA>      Reg         Lvl
## 3  3         60       RL          68   11250   Pave  <NA>      IR1         Lvl
## 4  4         70       RL          60    9550   Pave  <NA>      IR1         Lvl
## 5  5         60       RL          84   14260   Pave  <NA>      IR1         Lvl
## 6  6         50       RL          85   14115   Pave  <NA>      IR1         Lvl
##   Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 2    AllPub       FR2       Gtl      Veenker      Feedr       Norm     1Fam
## 3    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 4    AllPub    Corner       Gtl      Crawfor       Norm       Norm     1Fam
## 5    AllPub       FR2       Gtl      NoRidge       Norm       Norm     1Fam
## 6    AllPub    Inside       Gtl      Mitchel       Norm       Norm     1Fam
##   HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1     2Story           7           5      2003         2003     Gable  CompShg
## 2     1Story           6           8      1976         1976     Gable  CompShg
## 3     2Story           7           5      2001         2002     Gable  CompShg
## 4     2Story           7           5      1915         1970     Gable  CompShg
## 5     2Story           8           5      2000         2000     Gable  CompShg
## 6     1.5Fin           5           5      1993         1995     Gable  CompShg
##   Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1     VinylSd     VinylSd    BrkFace        196        Gd        TA      PConc
## 2     MetalSd     MetalSd       None          0        TA        TA     CBlock
## 3     VinylSd     VinylSd    BrkFace        162        Gd        TA      PConc
## 4     Wd Sdng     Wd Shng       None          0        TA        TA     BrkTil
## 5     VinylSd     VinylSd    BrkFace        350        Gd        TA      PConc
## 6     VinylSd     VinylSd       None          0        TA        TA       Wood
##   BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1       Gd       TA           No          GLQ        706          Unf
## 2       Gd       TA           Gd          ALQ        978          Unf
## 3       Gd       TA           Mn          GLQ        486          Unf
## 4       TA       Gd           No          ALQ        216          Unf
## 5       Gd       TA           Av          GLQ        655          Unf
## 6       Gd       TA           No          GLQ        732          Unf
##   BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1          0       150         856    GasA        Ex          Y      SBrkr
## 2          0       284        1262    GasA        Ex          Y      SBrkr
## 3          0       434         920    GasA        Ex          Y      SBrkr
## 4          0       540         756    GasA        Gd          Y      SBrkr
## 5          0       490        1145    GasA        Ex          Y      SBrkr
## 6          0        64         796    GasA        Ex          Y      SBrkr
##   X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1       856       854            0      1710            1            0        2
## 2      1262         0            0      1262            0            1        2
## 3       920       866            0      1786            1            0        2
## 4       961       756            0      1717            1            0        1
## 5      1145      1053            0      2198            1            0        2
## 6       796       566            0      1362            1            0        1
##   HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1        1            3            1          Gd            8        Typ
## 2        0            3            1          TA            6        Typ
## 3        1            3            1          Gd            6        Typ
## 4        0            3            1          Gd            7        Typ
## 5        1            4            1          Gd            9        Typ
## 6        1            1            1          TA            5        Typ
##   Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1          0        <NA>     Attchd        2003          RFn          2
## 2          1          TA     Attchd        1976          RFn          2
## 3          1          TA     Attchd        2001          RFn          2
## 4          1          Gd     Detchd        1998          Unf          3
## 5          1          TA     Attchd        2000          RFn          3
## 6          0        <NA>     Attchd        1993          Unf          2
##   GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1        548         TA         TA          Y          0          61
## 2        460         TA         TA          Y        298           0
## 3        608         TA         TA          Y          0          42
## 4        642         TA         TA          Y          0          35
## 5        836         TA         TA          Y        192          84
## 6        480         TA         TA          Y         40          30
##   EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1             0          0           0        0   <NA>  <NA>        <NA>
## 2             0          0           0        0   <NA>  <NA>        <NA>
## 3             0          0           0        0   <NA>  <NA>        <NA>
## 4           272          0           0        0   <NA>  <NA>        <NA>
## 5             0          0           0        0   <NA>  <NA>        <NA>
## 6             0        320           0        0   <NA> MnPrv        Shed
##   MiscVal MoSold YrSold SaleType SaleCondition
## 1       0      2   2008       WD        Normal
## 2       0      5   2007       WD        Normal
## 3       0      9   2008       WD        Normal
## 4       0      2   2006       WD       Abnorml
## 5       0     12   2008       WD        Normal
## 6     700     10   2009       WD        Normal
# Convert every character column to a factor, recoding NA as the explicit
# level "no data" first. Doing this on the combined data gives train and test
# identical factor levels.
is_text <- vapply(full_dataSet, is.character, logical(1))
for (text_col in names(full_dataSet)[is_text]) {
  values <- full_dataSet[[text_col]]
  values[is.na(values)] <- "no data"
  full_dataSet[[text_col]] <- as.factor(values)
}

# Split back into the original pieces: the first n_train rows are the training
# set, the remaining rows are the test set. The row count is captured before
# train is overwritten, and a logical mask is used instead of 1:n / (n + 1):m
# so that an empty train or test set yields zero rows rather than a bogus
# index range.
n_train <- nrow(train)
is_train_row <- seq_len(nrow(full_dataSet)) <= n_train
train <- full_dataSet[is_train_row, ]
# Re-attach the target that was set aside before combining
train$SalePrice <- SalePrice
test <- full_dataSet[!is_train_row, ]

Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set.

# Univariate summary of every training column: quartiles and mean for numeric
# columns, level counts for factor columns
summary(train)
##        Id           MSSubClass       MSZoning     LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   C (all):  10   Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   FV     :  65   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   no data:   0   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9   RH     :  16   Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   RL     :1151   3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0   RM     : 218   Max.   :313.00  
##                                                  NA's   :259     
##     LotArea        Street         Alley      LotShape  LandContour
##  Min.   :  1300   Grvl:   6   Grvl   :  50   IR1:484   Bnk:  63   
##  1st Qu.:  7554   Pave:1454   no data:1369   IR2: 41   HLS:  50   
##  Median :  9478               Pave   :  41   IR3: 10   Low:  36   
##  Mean   : 10517                              Reg:925   Lvl:1311   
##  3rd Qu.: 11602                                                   
##  Max.   :215245                                                   
##                                                                   
##    Utilities      LotConfig    LandSlope   Neighborhood   Condition1  
##  AllPub :1459   Corner : 263   Gtl:1382   NAmes  :225   Norm   :1260  
##  no data:   0   CulDSac:  94   Mod:  65   CollgCr:150   Feedr  :  81  
##  NoSeWa :   1   FR2    :  47   Sev:  13   OldTown:113   Artery :  48  
##                 FR3    :   4              Edwards:100   RRAn   :  26  
##                 Inside :1052              Somerst: 86   PosN   :  19  
##                                           Gilbert: 79   RRAe   :  11  
##                                           (Other):707   (Other):  15  
##    Condition2     BldgType      HouseStyle   OverallQual      OverallCond   
##  Norm   :1445   1Fam  :1220   1Story :726   Min.   : 1.000   Min.   :1.000  
##  Feedr  :   6   2fmCon:  31   2Story :445   1st Qu.: 5.000   1st Qu.:5.000  
##  Artery :   2   Duplex:  52   1.5Fin :154   Median : 6.000   Median :5.000  
##  PosN   :   2   Twnhs :  43   SLvl   : 65   Mean   : 6.099   Mean   :5.575  
##  RRNn   :   2   TwnhsE: 114   SFoyer : 37   3rd Qu.: 7.000   3rd Qu.:6.000  
##  PosA   :   1                 1.5Unf : 14   Max.   :10.000   Max.   :9.000  
##  (Other):   2                 (Other): 19                                   
##    YearBuilt     YearRemodAdd    RoofStyle       RoofMatl     Exterior1st 
##  Min.   :1872   Min.   :1950   Flat   :  13   CompShg:1434   VinylSd:515  
##  1st Qu.:1954   1st Qu.:1967   Gable  :1141   Tar&Grv:  11   HdBoard:222  
##  Median :1973   Median :1994   Gambrel:  11   WdShngl:   6   MetalSd:220  
##  Mean   :1971   Mean   :1985   Hip    : 286   WdShake:   5   Wd Sdng:206  
##  3rd Qu.:2000   3rd Qu.:2004   Mansard:   7   ClyTile:   1   Plywood:108  
##  Max.   :2010   Max.   :2010   Shed   :   2   Membran:   1   CemntBd: 61  
##                                               (Other):   2   (Other):128  
##   Exterior2nd    MasVnrType    MasVnrArea     ExterQual ExterCond  Foundation 
##  VinylSd:504   BrkCmn : 15   Min.   :   0.0   Ex: 52    Ex:   3   BrkTil:146  
##  MetalSd:214   BrkFace:445   1st Qu.:   0.0   Fa: 14    Fa:  28   CBlock:634  
##  HdBoard:207   no data:  8   Median :   0.0   Gd:488    Gd: 146   PConc :647  
##  Wd Sdng:197   None   :864   Mean   : 103.7   TA:906    Po:   1   Slab  : 24  
##  Plywood:142   Stone  :128   3rd Qu.: 166.0             TA:1282   Stone :  6  
##  CmentBd: 60                 Max.   :1600.0                       Wood  :  3  
##  (Other):136                 NA's   :8                                        
##     BsmtQual      BsmtCond     BsmtExposure  BsmtFinType1   BsmtFinSF1    
##  Ex     :121   Fa     :  45   Av     :221   ALQ    :220   Min.   :   0.0  
##  Fa     : 35   Gd     :  65   Gd     :134   BLQ    :148   1st Qu.:   0.0  
##  Gd     :618   no data:  37   Mn     :114   GLQ    :418   Median : 383.5  
##  no data: 37   Po     :   2   No     :953   LwQ    : 74   Mean   : 443.6  
##  TA     :649   TA     :1311   no data: 38   no data: 37   3rd Qu.: 712.2  
##                                             Rec    :133   Max.   :5644.0  
##                                             Unf    :430                   
##   BsmtFinType2    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF    
##  ALQ    :  19   Min.   :   0.00   Min.   :   0.0   Min.   :   0.0  
##  BLQ    :  33   1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8  
##  GLQ    :  14   Median :   0.00   Median : 477.5   Median : 991.5  
##  LwQ    :  46   Mean   :  46.55   Mean   : 567.2   Mean   :1057.4  
##  no data:  38   3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2  
##  Rec    :  54   Max.   :1474.00   Max.   :2336.0   Max.   :6110.0  
##  Unf    :1256                                                      
##   Heating     HeatingQC CentralAir   Electrical     X1stFlrSF      X2ndFlrSF   
##  Floor:   1   Ex:741    N:  95     FuseA  :  94   Min.   : 334   Min.   :   0  
##  GasA :1428   Fa: 49    Y:1365     FuseF  :  27   1st Qu.: 882   1st Qu.:   0  
##  GasW :  18   Gd:241               FuseP  :   3   Median :1087   Median :   0  
##  Grav :   7   Po:  1               Mix    :   1   Mean   :1163   Mean   : 347  
##  OthW :   2   TA:428               no data:   1   3rd Qu.:1391   3rd Qu.: 728  
##  Wall :   4                        SBrkr  :1334   Max.   :4692   Max.   :2065  
##                                                                                
##   LowQualFinSF       GrLivArea     BsmtFullBath     BsmtHalfBath    
##  Min.   :  0.000   Min.   : 334   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :  0.000   Median :1464   Median :0.0000   Median :0.00000  
##  Mean   :  5.845   Mean   :1515   Mean   :0.4253   Mean   :0.05753  
##  3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :572.000   Max.   :5642   Max.   :3.0000   Max.   :2.00000  
##                                                                     
##     FullBath        HalfBath       BedroomAbvGr    KitchenAbvGr    KitchenQual 
##  Min.   :0.000   Min.   :0.0000   Min.   :0.000   Min.   :0.000   Ex     :100  
##  1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   Fa     : 39  
##  Median :2.000   Median :0.0000   Median :3.000   Median :1.000   Gd     :586  
##  Mean   :1.565   Mean   :0.3829   Mean   :2.866   Mean   :1.047   no data:  0  
##  3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000   TA     :735  
##  Max.   :3.000   Max.   :2.0000   Max.   :8.000   Max.   :3.000                
##                                                                                
##   TotRmsAbvGrd      Functional     Fireplaces     FireplaceQu    GarageType 
##  Min.   : 2.000   Typ    :1360   Min.   :0.000   Ex     : 24   2Types :  6  
##  1st Qu.: 5.000   Min2   :  34   1st Qu.:0.000   Fa     : 33   Attchd :870  
##  Median : 6.000   Min1   :  31   Median :1.000   Gd     :380   Basment: 19  
##  Mean   : 6.518   Mod    :  15   Mean   :0.613   no data:690   BuiltIn: 88  
##  3rd Qu.: 7.000   Maj1   :  14   3rd Qu.:1.000   Po     : 20   CarPort:  9  
##  Max.   :14.000   Maj2   :   5   Max.   :3.000   TA     :313   Detchd :387  
##                   (Other):   1                                 no data: 81  
##   GarageYrBlt    GarageFinish   GarageCars      GarageArea       GarageQual  
##  Min.   :1900   Fin    :352   Min.   :0.000   Min.   :   0.0   Ex     :   3  
##  1st Qu.:1961   no data: 81   1st Qu.:1.000   1st Qu.: 334.5   Fa     :  48  
##  Median :1980   RFn    :422   Median :2.000   Median : 480.0   Gd     :  14  
##  Mean   :1979   Unf    :605   Mean   :1.767   Mean   : 473.0   no data:  81  
##  3rd Qu.:2002                 3rd Qu.:2.000   3rd Qu.: 576.0   Po     :   3  
##  Max.   :2010                 Max.   :4.000   Max.   :1418.0   TA     :1311  
##  NA's   :81                                                                  
##    GarageCond   PavedDrive   WoodDeckSF      OpenPorchSF     EnclosedPorch   
##  Ex     :   2   N:  90     Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  Fa     :  35   P:  30     1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Gd     :   9   Y:1340     Median :  0.00   Median : 25.00   Median :  0.00  
##  no data:  81              Mean   : 94.24   Mean   : 46.66   Mean   : 21.95  
##  Po     :   7              3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00  
##  TA     :1326              Max.   :857.00   Max.   :547.00   Max.   :552.00  
##                                                                              
##    X3SsnPorch      ScreenPorch        PoolArea           PoolQC    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.000   Ex     :   2  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000   Fa     :   2  
##  Median :  0.00   Median :  0.00   Median :  0.000   Gd     :   3  
##  Mean   :  3.41   Mean   : 15.06   Mean   :  2.759   no data:1453  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000                 
##  Max.   :508.00   Max.   :480.00   Max.   :738.000                 
##                                                                    
##      Fence       MiscFeature      MiscVal             MoSold      
##  GdPrv  :  59   Gar2   :   2   Min.   :    0.00   Min.   : 1.000  
##  GdWo   :  54   no data:1406   1st Qu.:    0.00   1st Qu.: 5.000  
##  MnPrv  : 157   Othr   :   2   Median :    0.00   Median : 6.000  
##  MnWw   :  11   Shed   :  49   Mean   :   43.49   Mean   : 6.322  
##  no data:1179   TenC   :   1   3rd Qu.:    0.00   3rd Qu.: 8.000  
##                                Max.   :15500.00   Max.   :12.000  
##                                                                   
##      YrSold        SaleType    SaleCondition    SalePrice     
##  Min.   :2006   WD     :1267   Abnorml: 101   Min.   : 34900  
##  1st Qu.:2007   New    : 122   AdjLand:   4   1st Qu.:129975  
##  Median :2008   COD    :  43   Alloca :  12   Median :163000  
##  Mean   :2008   ConLD  :   9   Family :  20   Mean   :180921  
##  3rd Qu.:2009   ConLI  :   5   Normal :1198   3rd Qu.:214000  
##  Max.   :2010   ConLw  :   5   Partial: 125   Max.   :755000  
##                 (Other):   9
# Replace every NA that is still present with the sentinel value -1,
# in both the training and the test data.
train <- replace(train, is.na(train), -1)
test <- replace(test, is.na(test), -1)
# Print the column summaries again; -1 now appears as the minimum of the
# numeric columns that previously contained NAs.
summary(train)
##        Id           MSSubClass       MSZoning     LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   C (all):  10   Min.   : -1.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   FV     :  65   1st Qu.: 42.00  
##  Median : 730.5   Median : 50.0   no data:   0   Median : 63.00  
##  Mean   : 730.5   Mean   : 56.9   RH     :  16   Mean   : 57.45  
##  3rd Qu.:1095.2   3rd Qu.: 70.0   RL     :1151   3rd Qu.: 79.00  
##  Max.   :1460.0   Max.   :190.0   RM     : 218   Max.   :313.00  
##                                                                  
##     LotArea        Street         Alley      LotShape  LandContour
##  Min.   :  1300   Grvl:   6   Grvl   :  50   IR1:484   Bnk:  63   
##  1st Qu.:  7554   Pave:1454   no data:1369   IR2: 41   HLS:  50   
##  Median :  9478               Pave   :  41   IR3: 10   Low:  36   
##  Mean   : 10517                              Reg:925   Lvl:1311   
##  3rd Qu.: 11602                                                   
##  Max.   :215245                                                   
##                                                                   
##    Utilities      LotConfig    LandSlope   Neighborhood   Condition1  
##  AllPub :1459   Corner : 263   Gtl:1382   NAmes  :225   Norm   :1260  
##  no data:   0   CulDSac:  94   Mod:  65   CollgCr:150   Feedr  :  81  
##  NoSeWa :   1   FR2    :  47   Sev:  13   OldTown:113   Artery :  48  
##                 FR3    :   4              Edwards:100   RRAn   :  26  
##                 Inside :1052              Somerst: 86   PosN   :  19  
##                                           Gilbert: 79   RRAe   :  11  
##                                           (Other):707   (Other):  15  
##    Condition2     BldgType      HouseStyle   OverallQual      OverallCond   
##  Norm   :1445   1Fam  :1220   1Story :726   Min.   : 1.000   Min.   :1.000  
##  Feedr  :   6   2fmCon:  31   2Story :445   1st Qu.: 5.000   1st Qu.:5.000  
##  Artery :   2   Duplex:  52   1.5Fin :154   Median : 6.000   Median :5.000  
##  PosN   :   2   Twnhs :  43   SLvl   : 65   Mean   : 6.099   Mean   :5.575  
##  RRNn   :   2   TwnhsE: 114   SFoyer : 37   3rd Qu.: 7.000   3rd Qu.:6.000  
##  PosA   :   1                 1.5Unf : 14   Max.   :10.000   Max.   :9.000  
##  (Other):   2                 (Other): 19                                   
##    YearBuilt     YearRemodAdd    RoofStyle       RoofMatl     Exterior1st 
##  Min.   :1872   Min.   :1950   Flat   :  13   CompShg:1434   VinylSd:515  
##  1st Qu.:1954   1st Qu.:1967   Gable  :1141   Tar&Grv:  11   HdBoard:222  
##  Median :1973   Median :1994   Gambrel:  11   WdShngl:   6   MetalSd:220  
##  Mean   :1971   Mean   :1985   Hip    : 286   WdShake:   5   Wd Sdng:206  
##  3rd Qu.:2000   3rd Qu.:2004   Mansard:   7   ClyTile:   1   Plywood:108  
##  Max.   :2010   Max.   :2010   Shed   :   2   Membran:   1   CemntBd: 61  
##                                               (Other):   2   (Other):128  
##   Exterior2nd    MasVnrType    MasVnrArea     ExterQual ExterCond  Foundation 
##  VinylSd:504   BrkCmn : 15   Min.   :  -1.0   Ex: 52    Ex:   3   BrkTil:146  
##  MetalSd:214   BrkFace:445   1st Qu.:   0.0   Fa: 14    Fa:  28   CBlock:634  
##  HdBoard:207   no data:  8   Median :   0.0   Gd:488    Gd: 146   PConc :647  
##  Wd Sdng:197   None   :864   Mean   : 103.1   TA:906    Po:   1   Slab  : 24  
##  Plywood:142   Stone  :128   3rd Qu.: 164.2             TA:1282   Stone :  6  
##  CmentBd: 60                 Max.   :1600.0                       Wood  :  3  
##  (Other):136                                                                  
##     BsmtQual      BsmtCond     BsmtExposure  BsmtFinType1   BsmtFinSF1    
##  Ex     :121   Fa     :  45   Av     :221   ALQ    :220   Min.   :   0.0  
##  Fa     : 35   Gd     :  65   Gd     :134   BLQ    :148   1st Qu.:   0.0  
##  Gd     :618   no data:  37   Mn     :114   GLQ    :418   Median : 383.5  
##  no data: 37   Po     :   2   No     :953   LwQ    : 74   Mean   : 443.6  
##  TA     :649   TA     :1311   no data: 38   no data: 37   3rd Qu.: 712.2  
##                                             Rec    :133   Max.   :5644.0  
##                                             Unf    :430                   
##   BsmtFinType2    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF    
##  ALQ    :  19   Min.   :   0.00   Min.   :   0.0   Min.   :   0.0  
##  BLQ    :  33   1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8  
##  GLQ    :  14   Median :   0.00   Median : 477.5   Median : 991.5  
##  LwQ    :  46   Mean   :  46.55   Mean   : 567.2   Mean   :1057.4  
##  no data:  38   3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2  
##  Rec    :  54   Max.   :1474.00   Max.   :2336.0   Max.   :6110.0  
##  Unf    :1256                                                      
##   Heating     HeatingQC CentralAir   Electrical     X1stFlrSF      X2ndFlrSF   
##  Floor:   1   Ex:741    N:  95     FuseA  :  94   Min.   : 334   Min.   :   0  
##  GasA :1428   Fa: 49    Y:1365     FuseF  :  27   1st Qu.: 882   1st Qu.:   0  
##  GasW :  18   Gd:241               FuseP  :   3   Median :1087   Median :   0  
##  Grav :   7   Po:  1               Mix    :   1   Mean   :1163   Mean   : 347  
##  OthW :   2   TA:428               no data:   1   3rd Qu.:1391   3rd Qu.: 728  
##  Wall :   4                        SBrkr  :1334   Max.   :4692   Max.   :2065  
##                                                                                
##   LowQualFinSF       GrLivArea     BsmtFullBath     BsmtHalfBath    
##  Min.   :  0.000   Min.   : 334   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :  0.000   Median :1464   Median :0.0000   Median :0.00000  
##  Mean   :  5.845   Mean   :1515   Mean   :0.4253   Mean   :0.05753  
##  3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :572.000   Max.   :5642   Max.   :3.0000   Max.   :2.00000  
##                                                                     
##     FullBath        HalfBath       BedroomAbvGr    KitchenAbvGr    KitchenQual 
##  Min.   :0.000   Min.   :0.0000   Min.   :0.000   Min.   :0.000   Ex     :100  
##  1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   Fa     : 39  
##  Median :2.000   Median :0.0000   Median :3.000   Median :1.000   Gd     :586  
##  Mean   :1.565   Mean   :0.3829   Mean   :2.866   Mean   :1.047   no data:  0  
##  3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000   TA     :735  
##  Max.   :3.000   Max.   :2.0000   Max.   :8.000   Max.   :3.000                
##                                                                                
##   TotRmsAbvGrd      Functional     Fireplaces     FireplaceQu    GarageType 
##  Min.   : 2.000   Typ    :1360   Min.   :0.000   Ex     : 24   2Types :  6  
##  1st Qu.: 5.000   Min2   :  34   1st Qu.:0.000   Fa     : 33   Attchd :870  
##  Median : 6.000   Min1   :  31   Median :1.000   Gd     :380   Basment: 19  
##  Mean   : 6.518   Mod    :  15   Mean   :0.613   no data:690   BuiltIn: 88  
##  3rd Qu.: 7.000   Maj1   :  14   3rd Qu.:1.000   Po     : 20   CarPort:  9  
##  Max.   :14.000   Maj2   :   5   Max.   :3.000   TA     :313   Detchd :387  
##                   (Other):   1                                 no data: 81  
##   GarageYrBlt    GarageFinish   GarageCars      GarageArea       GarageQual  
##  Min.   :  -1   Fin    :352   Min.   :0.000   Min.   :   0.0   Ex     :   3  
##  1st Qu.:1958   no data: 81   1st Qu.:1.000   1st Qu.: 334.5   Fa     :  48  
##  Median :1977   RFn    :422   Median :2.000   Median : 480.0   Gd     :  14  
##  Mean   :1869   Unf    :605   Mean   :1.767   Mean   : 473.0   no data:  81  
##  3rd Qu.:2001                 3rd Qu.:2.000   3rd Qu.: 576.0   Po     :   3  
##  Max.   :2010                 Max.   :4.000   Max.   :1418.0   TA     :1311  
##                                                                              
##    GarageCond   PavedDrive   WoodDeckSF      OpenPorchSF     EnclosedPorch   
##  Ex     :   2   N:  90     Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  Fa     :  35   P:  30     1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Gd     :   9   Y:1340     Median :  0.00   Median : 25.00   Median :  0.00  
##  no data:  81              Mean   : 94.24   Mean   : 46.66   Mean   : 21.95  
##  Po     :   7              3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00  
##  TA     :1326              Max.   :857.00   Max.   :547.00   Max.   :552.00  
##                                                                              
##    X3SsnPorch      ScreenPorch        PoolArea           PoolQC    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.000   Ex     :   2  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000   Fa     :   2  
##  Median :  0.00   Median :  0.00   Median :  0.000   Gd     :   3  
##  Mean   :  3.41   Mean   : 15.06   Mean   :  2.759   no data:1453  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000                 
##  Max.   :508.00   Max.   :480.00   Max.   :738.000                 
##                                                                    
##      Fence       MiscFeature      MiscVal             MoSold      
##  GdPrv  :  59   Gar2   :   2   Min.   :    0.00   Min.   : 1.000  
##  GdWo   :  54   no data:1406   1st Qu.:    0.00   1st Qu.: 5.000  
##  MnPrv  : 157   Othr   :   2   Median :    0.00   Median : 6.000  
##  MnWw   :  11   Shed   :  49   Mean   :   43.49   Mean   : 6.322  
##  no data:1179   TenC   :   1   3rd Qu.:    0.00   3rd Qu.: 8.000  
##                                Max.   :15500.00   Max.   :12.000  
##                                                                   
##      YrSold        SaleType    SaleCondition    SalePrice     
##  Min.   :2006   WD     :1267   Abnorml: 101   Min.   : 34900  
##  1st Qu.:2007   New    : 122   AdjLand:   4   1st Qu.:129975  
##  Median :2008   COD    :  43   Alloca :  12   Median :163000  
##  Mean   :2008   ConLD  :   9   Family :  20   Mean   :180921  
##  3rd Qu.:2009   ConLI  :   5   Normal :1198   3rd Qu.:214000  
##  Max.   :2010   ConLw  :   5   Partial: 125   Max.   :755000  
##                 (Other):   9
# Kernel density estimate of train$YearBuilt, drawn with base graphics.
plot(density(train$YearBuilt))

Provide a scatterplot matrix for at least two of the independent variables and the dependent variable.

# Scatterplot matrix of the dependent variable (SalePrice) and two independent
# variables. The earlier plot put GrLivArea + TotRmsAbvGrd on one axis, which
# adds square feet to a room count and is not a scatterplot matrix.
# Points are coloured by CentralAir ("Y" = blue, otherwise red).
pairs(
  train[, c("SalePrice", "GrLivArea", "TotRmsAbvGrd")],
  col = ifelse(train$CentralAir == "Y", "steelblue", "tomato"),
  main = "SalePrice, GrLivArea and TotRmsAbvGrd (blue = central air)"
)

# Correlation of every numeric column with SalePrice, computed once up front.
numeric_cols <- names(train)[vapply(train, is.numeric, logical(1))]
price_cor <- vapply(
  numeric_cols,
  function(nm) cor(train[[nm]], train$SalePrice),
  numeric(1)
)

# Report the columns whose absolute correlation with SalePrice exceeds 0.5
# (SalePrice itself is included, with correlation 1).
for (nm in numeric_cols[abs(price_cor) > 0.5]) {
  print(nm)
  print(price_cor[[nm]])
}
## [1] "OverallQual"
## [1] 0.7909816
## [1] "YearBuilt"
## [1] 0.5228973
## [1] "YearRemodAdd"
## [1] 0.507101
## [1] "TotalBsmtSF"
## [1] 0.6135806
## [1] "X1stFlrSF"
## [1] 0.6058522
## [1] "GrLivArea"
## [1] 0.7086245
## [1] "FullBath"
## [1] 0.5606638
## [1] "TotRmsAbvGrd"
## [1] 0.5337232
## [1] "GarageCars"
## [1] 0.6404092
## [1] "GarageArea"
## [1] 0.6234314
## [1] "SalePrice"
## [1] 1
# Report the numeric columns whose absolute correlation with SalePrice is
# below 0.1.
is_num <- vapply(train, is.numeric, logical(1))
for (nm in names(train)[is_num]) {
  r <- cor(train[[nm]], train$SalePrice)
  if (abs(r) >= 0.1) {
    next
  }
  print(nm)
  print(r)
}
## [1] "Id"
## [1] -0.02191672
## [1] "MSSubClass"
## [1] -0.08428414
## [1] "OverallCond"
## [1] -0.07785589
## [1] "BsmtFinSF2"
## [1] -0.01137812
## [1] "LowQualFinSF"
## [1] -0.02560613
## [1] "BsmtHalfBath"
## [1] -0.01684415
## [1] "X3SsnPorch"
## [1] 0.04458367
## [1] "PoolArea"
## [1] 0.09240355
## [1] "MiscVal"
## [1] -0.02118958
## [1] "MoSold"
## [1] 0.04643225
## [1] "YrSold"
## [1] -0.02892259

Derive a correlation matrix for any three quantitative variables in the dataset. Source: http://www.sthda.com/english/wiki/correlation-matrix-a-quick-start-guide-to-analyze-format-and-visualize-a-correlation-matrix-using-r-software

# Pearson correlation matrix of SalePrice, GrLivArea and TotRmsAbvGrd.
cor_columns <- c("SalePrice", "GrLivArea", "TotRmsAbvGrd")
corDF <- cor(train[, cor_columns])
print(corDF)
##              SalePrice GrLivArea TotRmsAbvGrd
## SalePrice    1.0000000 0.7086245    0.5337232
## GrLivArea    0.7086245 1.0000000    0.8254894
## TotRmsAbvGrd 0.5337232 0.8254894    1.0000000
# Draw the correlation matrix with each coefficient shown as a number.
corrplot(corDF, method="number")

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not? Source: https://bookdown.org/ndphillips/YaRrr/correlation-cor-test.html

# Pearson test of H0: correlation = 0 for SalePrice and GrLivArea, reported
# with an 80% confidence interval.
cor1 <- cor.test(~ SalePrice + GrLivArea, data = train, conf.level = 0.80)
cor1
## 
##  Pearson's product-moment correlation
## 
## data:  SalePrice and GrLivArea
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6915087 0.7249450
## sample estimates:
##       cor 
## 0.7086245
# Pearson test of H0: correlation = 0 for SalePrice and TotRmsAbvGrd, reported
# with an 80% confidence interval.
cor2 <- cor.test(~ SalePrice + TotRmsAbvGrd, data = train, conf.level = 0.80)
cor2
## 
##  Pearson's product-moment correlation
## 
## data:  SalePrice and TotRmsAbvGrd
## t = 24.099, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5092841 0.5573021
## sample estimates:
##       cor 
## 0.5337232

My analysis: A high correlation coefficient suggests a strong linear association between the variables; it indicates that the variables tend to move together in a consistent pattern. A low p-value provides evidence against the null hypothesis, indicating that the observed correlation is unlikely to have occurred by chance. In other words, the relationship observed in the data is statistically significant: there is a low probability that the observed correlation is due to random variation alone. Both 80% confidence intervals (0.69 to 0.72 for GrLivArea and 0.51 to 0.56 for TotRmsAbvGrd) exclude zero. Therefore, I think above-ground living area (in square feet) and total rooms above grade (not including bathrooms) are good indicators of house prices.

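The familywise error rate is the chance of making at least one false positive (Type I) error across a set of tests. In other words, it measures the probability of wrongly concluding that there is a significant relationship or difference between variables when, in fact, there isn't. Because we rejected the null hypothesis in each test, a Type I error is possible, and running several tests at an 80% confidence level (α = 0.20 each) raises the chance of at least one such error (roughly 1 − 0.8² = 0.36 for two tests if they were independent). However, both p-values are below 2.2e-16, so the conclusions would still hold after a correction such as Bonferroni; I am therefore not very worried about familywise error here.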

Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.)

# Invert the correlation matrix to obtain the precision matrix, and print it.
(inverse_matrix <- solve(corDF))
##               SalePrice GrLivArea TotRmsAbvGrd
## SalePrice     2.0424418 -1.718505    0.3285093
## GrLivArea    -1.7185052  4.585000   -2.8676627
## TotRmsAbvGrd  0.3285093 -2.867663    3.1918921

Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

# Correlation matrix times its inverse; the printed result is the identity
# matrix up to floating-point rounding.
multiply1 <- corDF %*% inverse_matrix
multiply1
##                  SalePrice GrLivArea TotRmsAbvGrd
## SalePrice     1.000000e+00         0            0
## GrLivArea    -5.551115e-17         1            0
## TotRmsAbvGrd -1.665335e-16         0            1
# Inverse times the correlation matrix (the product in the opposite order);
# again the identity matrix up to floating-point rounding.
multiply2 <- inverse_matrix %*% corDF
multiply2
##                  SalePrice     GrLivArea  TotRmsAbvGrd
## SalePrice     1.000000e+00 -3.330669e-16 -4.440892e-16
## GrLivArea     2.220446e-16  1.000000e+00  4.440892e-16
## TotRmsAbvGrd -2.220446e-16 -4.440892e-16  1.000000e+00
# LU factorisation (lu() from the Matrix package) of the first product matrix.
# NOTE(review): multiply1 is numerically the identity, so this factorisation
# is trivial; factorising corDF itself may have been intended - confirm.
lu1 <- lu(multiply1)
lu1
## 'MatrixFactorization' of Formal class 'denseLU' [package "Matrix"] with 4 slots
##   ..@ Dimnames:List of 2
##   .. ..$ : chr [1:3] "SalePrice" "GrLivArea" "TotRmsAbvGrd"
##   .. ..$ : chr [1:3] "SalePrice" "GrLivArea" "TotRmsAbvGrd"
##   ..@ x       : num [1:9] 1.00 -5.55e-17 -1.67e-16 0.00 1.00 ...
##   ..@ perm    : int [1:3] 1 2 3
##   ..@ Dim     : int [1:2] 3 3
# LU factorisation (lu() from the Matrix package) of the second product matrix.
# NOTE(review): multiply2 is numerically the identity, so this factorisation
# is trivial; factorising corDF itself may have been intended - confirm.
lu2 <- lu(multiply2)
lu2
## 'MatrixFactorization' of Formal class 'denseLU' [package "Matrix"] with 4 slots
##   ..@ Dimnames:List of 2
##   .. ..$ : chr [1:3] "SalePrice" "GrLivArea" "TotRmsAbvGrd"
##   .. ..$ : chr [1:3] "SalePrice" "GrLivArea" "TotRmsAbvGrd"
##   ..@ x       : num [1:9] 1.00 2.22e-16 -2.22e-16 -3.33e-16 1.00 ...
##   ..@ perm    : int [1:3] 1 2 3
##   ..@ Dim     : int [1:2] 3 3

Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).

# Histograms of four candidate variables, to look for one that is skewed to
# the right.
hist(train$GrLivArea, breaks=20)

hist(train$TotRmsAbvGrd, breaks=20)

hist(train$LotArea,breaks=20)

hist(train$BsmtFinSF1,breaks=20)

# Sample skewness of BsmtFinSF1; the value is stored and printed.
(skewness_value <- skewness(train$BsmtFinSF1))
## [1] 1.683771

If the skewness value is significantly positive (greater than zero), it suggests that the variable is skewed to the right. For BsmtFinSF1, skewness is 1.68, therefore it's right skewed.

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Maximum-likelihood fit of an exponential distribution to BsmtFinSF1.
# The printout shows the estimated rate with its standard error in parentheses.
fitdistr(train$BsmtFinSF1, densfun = "exponential")
##       rate    
##   0.002254081 
##  (0.000058992)

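Of the two numbers printed above, the first (0.002254081) is the estimated rate λ and the value in parentheses (0.000058992) is its standard error, which measures the uncertainty of the estimate rather than being an alternative rate. I therefore use the first, larger value as the rate.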

Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)).

# Take the rate from the exponential fit itself instead of a hand-copied,
# rounded constant, so the simulation always matches the fitted distribution.
lambda_hat <- unname(MASS::fitdistr(train$BsmtFinSF1, "exponential")$estimate)

# Draw 1000 samples from the fitted exponential distribution and show the
# first few.
exponent <- rexp(1000, rate = lambda_hat)
head(exponent)
## [1]  337.03436   52.95163 1009.60383  101.03644  116.33481  714.47480

Plot a histogram and compare it with a histogram of your original variable.

hist(exponent)

# Create the first histogram (the original variable)
hist(
  train$BsmtFinSF1,
  col = "blue", main = "Histogram Comparison", xlab = "Value",
  ylim = c(0, 350), breaks = 30
)

# Add the simulated sample on the same plot. add = TRUE already draws onto the
# existing axes; the former par(new = TRUE) was redundant and stayed set, so
# the next high-level plot would have been drawn without clearing the frame.
hist(exponent, col = "red", add = TRUE, breaks = 30)

legend("topright", legend = c("Original", "Exponential"), fill = c("blue", "red"))

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality.

# 5th percentile of the fitted exponential distribution (inverse CDF).
qexp(p = 0.05, rate = 0.002254081)
## [1] 22.75575
# 95th percentile of the fitted exponential distribution (inverse CDF).
qexp(p = 0.95, rate = 0.002254081)
## [1] 1329.026

Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

# Empirical 5th and 95th percentiles of BsmtFinSF1, for comparison with the
# exponential quantiles computed above.
quantile(train$BsmtFinSF1, probs=c(0.05,0.95))
##   5%  95% 
##    0 1274
# Regress SalePrice on GrLivArea and TotRmsAbvGrd.
# The predictors are written as bare column names so they are looked up in
# `data`. With train$... inside the formula, predict(lm_model, newdata = ...)
# would ignore the new data and keep using the training columns.
(lm_model <- lm(SalePrice ~ GrLivArea + TotRmsAbvGrd, data = train))
## 
## Call:
## lm(formula = SalePrice ~ train$GrLivArea + train$TotRmsAbvGrd, 
##     data = train)
## 
## Coefficients:
##        (Intercept)     train$GrLivArea  train$TotRmsAbvGrd  
##            39387.8               127.2             -7861.3
# Full regression table for lm_model: residual quantiles, coefficient
# estimates with standard errors and t-tests, R-squared and the F-statistic.
summary(lm_model)
## 
## Call:
## lm(formula = SalePrice ~ train$GrLivArea + train$TotRmsAbvGrd, 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -502732  -29360   -1264   20007  358035 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        39387.79    6118.10   6.438 1.64e-10 ***
## train$GrLivArea      127.20       4.91  25.906  < 2e-16 ***
## train$TotRmsAbvGrd -7861.26    1587.41  -4.952 8.19e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55630 on 1457 degrees of freedom
## Multiple R-squared:  0.5104, Adjusted R-squared:  0.5097 
## F-statistic: 759.4 on 2 and 1457 DF,  p-value: < 2.2e-16
# Columns used as regressors in the submission model
predictors <- c("GrLivArea", "TotRmsAbvGrd")

# Modelling frame: the regressors plus the response, with incomplete rows
# discarded.
# NOTE(review): the -1 sits in the row position, so it removes the first
# observation of train rather than a column - confirm that this is intended.
regression_data <- na.omit(train[-1, c(predictors, "SalePrice")])

# Regress SalePrice on every other column of regression_data
model <- lm(SalePrice ~ ., data = regression_data)

# Show the coefficient table and fit statistics
summary(model)
## 
## Call:
## lm(formula = SalePrice ~ ., data = regression_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -502752  -29358   -1326   20024  358078 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  39419.302   6121.248   6.440 1.62e-10 ***
## GrLivArea      127.226      4.912  25.899  < 2e-16 ***
## TotRmsAbvGrd -7872.908   1588.547  -4.956 8.04e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55640 on 1456 degrees of freedom
## Multiple R-squared:  0.5104, Adjusted R-squared:  0.5097 
## F-statistic: 758.8 on 2 and 1456 DF,  p-value: < 2.2e-16
# Read the sample_submission file; it supplies the Ids in the expected order
sample_submission <- read.csv("/Users/Sangeetha/Downloads/house-prices-advanced-regression-techniques/sample_submission.csv")

# The submission must describe the test houses, row for row.
stopifnot(
  nrow(test) == nrow(sample_submission),
  all(test$Id == sample_submission$Id)
)

# Predict SalePrice for the TEST set. Predicting on regression_data returned
# fitted values for training rows, which were then paired with test Ids.
# The identifier column is named "Id", as in sample_submission (it was "ID").
prediction_df <- data.frame(
  Id = sample_submission$Id,
  SalePrice = unname(predict(model, newdata = test))
)

# Write the predictions to a CSV file
write.csv(prediction_df, file = "predictions.csv", row.names = FALSE)

# Verify the number of rows in the predictions file
num_rows <- nrow(prediction_df)
print(num_rows)
## [1] 1459
Sangeetha's Kaggle Score.

Sangeetha's Kaggle Score.