# House Price Analysis..
# Loading Required Libraries..
# Packages used throughout the analysis.
libraries <- c('ggplot2','dplyr','e1071','corrplot','gridExtra',
               'caret','data.table','testthat','randomForest')
# Compare against installed package *names* only. installed.packages() returns
# a character matrix, so matching against the whole matrix would also match
# cells from its other columns (library path, version, dependencies, ...).
installlib <- libraries[!libraries %in% rownames(installed.packages())]
for (libs in installlib) install.packages(libs, dependencies = TRUE)
# Attach every package; the printed named logical vector reports which ones
# loaded. `character.only` is spelled out in full instead of relying on
# partial argument matching.
sapply(libraries, require, character.only = TRUE)
## Loading required package: ggplot2
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: e1071
## Loading required package: corrplot
## corrplot 0.84 loaded
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
## Loading required package: caret
## Loading required package: lattice
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## Loading required package: testthat
##
## Attaching package: 'testthat'
## The following object is masked from 'package:dplyr':
##
## matches
## Loading required package: randomForest
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## ggplot2 dplyr e1071 corrplot gridExtra caret
## TRUE TRUE TRUE TRUE TRUE TRUE
## data.table testthat randomForest
## TRUE TRUE TRUE
# Reading the Data into R..
# Directory that holds train.csv and test.csv; change this single value to run
# the script from another location.
data_dir <- "C:/Users/Rewati/Documents/R"
train <- fread(file.path(data_dir, "train.csv"), stringsAsFactors = FALSE)
test <- fread(file.path(data_dir, "test.csv"), stringsAsFactors = FALSE)
# View() opens a data viewer, so only call it when a user is at the console.
if (interactive()) {
  View(train)
  View(test)
}
head(train)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1: 1 60 RL 65 8450 Pave <NA> Reg Lvl
## 2: 2 20 RL 80 9600 Pave <NA> Reg Lvl
## 3: 3 60 RL 68 11250 Pave <NA> IR1 Lvl
## 4: 4 70 RL 60 9550 Pave <NA> IR1 Lvl
## 5: 5 60 RL 84 14260 Pave <NA> IR1 Lvl
## 6: 6 50 RL 85 14115 Pave <NA> IR1 Lvl
## Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1: AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 2: AllPub FR2 Gtl Veenker Feedr Norm 1Fam
## 3: AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 4: AllPub Corner Gtl Crawfor Norm Norm 1Fam
## 5: AllPub FR2 Gtl NoRidge Norm Norm 1Fam
## 6: AllPub Inside Gtl Mitchel Norm Norm 1Fam
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1: 2Story 7 5 2003 2003 Gable CompShg
## 2: 1Story 6 8 1976 1976 Gable CompShg
## 3: 2Story 7 5 2001 2002 Gable CompShg
## 4: 2Story 7 5 1915 1970 Gable CompShg
## 5: 2Story 8 5 2000 2000 Gable CompShg
## 6: 1.5Fin 5 5 1993 1995 Gable CompShg
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1: VinylSd VinylSd BrkFace 196 Gd TA PConc
## 2: MetalSd MetalSd None 0 TA TA CBlock
## 3: VinylSd VinylSd BrkFace 162 Gd TA PConc
## 4: Wd Sdng Wd Shng None 0 TA TA BrkTil
## 5: VinylSd VinylSd BrkFace 350 Gd TA PConc
## 6: VinylSd VinylSd None 0 TA TA Wood
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1: Gd TA No GLQ 706 Unf
## 2: Gd TA Gd ALQ 978 Unf
## 3: Gd TA Mn GLQ 486 Unf
## 4: TA Gd No ALQ 216 Unf
## 5: Gd TA Av GLQ 655 Unf
## 6: Gd TA No GLQ 732 Unf
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1: 0 150 856 GasA Ex Y SBrkr
## 2: 0 284 1262 GasA Ex Y SBrkr
## 3: 0 434 920 GasA Ex Y SBrkr
## 4: 0 540 756 GasA Gd Y SBrkr
## 5: 0 490 1145 GasA Ex Y SBrkr
## 6: 0 64 796 GasA Ex Y SBrkr
## 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1: 856 854 0 1710 1 0 2
## 2: 1262 0 0 1262 0 1 2
## 3: 920 866 0 1786 1 0 2
## 4: 961 756 0 1717 1 0 1
## 5: 1145 1053 0 2198 1 0 2
## 6: 796 566 0 1362 1 0 1
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1: 1 3 1 Gd 8 Typ
## 2: 0 3 1 TA 6 Typ
## 3: 1 3 1 Gd 6 Typ
## 4: 0 3 1 Gd 7 Typ
## 5: 1 4 1 Gd 9 Typ
## 6: 1 1 1 TA 5 Typ
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1: 0 <NA> Attchd 2003 RFn 2
## 2: 1 TA Attchd 1976 RFn 2
## 3: 1 TA Attchd 2001 RFn 2
## 4: 1 Gd Detchd 1998 Unf 3
## 5: 1 TA Attchd 2000 RFn 3
## 6: 0 <NA> Attchd 1993 Unf 2
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1: 548 TA TA Y 0 61
## 2: 460 TA TA Y 298 0
## 3: 608 TA TA Y 0 42
## 4: 642 TA TA Y 0 35
## 5: 836 TA TA Y 192 84
## 6: 480 TA TA Y 40 30
## EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1: 0 0 0 0 <NA> <NA> <NA>
## 2: 0 0 0 0 <NA> <NA> <NA>
## 3: 0 0 0 0 <NA> <NA> <NA>
## 4: 272 0 0 0 <NA> <NA> <NA>
## 5: 0 0 0 0 <NA> <NA> <NA>
## 6: 0 320 0 0 <NA> MnPrv Shed
## MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1: 0 2 2008 WD Normal 208500
## 2: 0 5 2007 WD Normal 181500
## 3: 0 9 2008 WD Normal 223500
## 4: 0 2 2006 WD Abnorml 140000
## 5: 0 12 2008 WD Normal 250000
## 6: 700 10 2009 WD Normal 143000
head(test)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1: 1461 20 RH 80 11622 Pave <NA> Reg
## 2: 1462 20 RL 81 14267 Pave <NA> IR1
## 3: 1463 60 RL 74 13830 Pave <NA> IR1
## 4: 1464 60 RL 78 9978 Pave <NA> IR1
## 5: 1465 120 RL 43 5005 Pave <NA> IR1
## 6: 1466 60 RL 75 10000 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2
## 1: Lvl AllPub Inside Gtl NAmes Feedr Norm
## 2: Lvl AllPub Corner Gtl NAmes Norm Norm
## 3: Lvl AllPub Inside Gtl Gilbert Norm Norm
## 4: Lvl AllPub Inside Gtl Gilbert Norm Norm
## 5: HLS AllPub Inside Gtl StoneBr Norm Norm
## 6: Lvl AllPub Corner Gtl Gilbert Norm Norm
## BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle
## 1: 1Fam 1Story 5 6 1961 1961 Gable
## 2: 1Fam 1Story 6 6 1958 1958 Hip
## 3: 1Fam 2Story 5 5 1997 1998 Gable
## 4: 1Fam 2Story 6 6 1998 1998 Gable
## 5: TwnhsE 1Story 8 5 1992 1992 Gable
## 6: 1Fam 2Story 6 5 1993 1994 Gable
## RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond
## 1: CompShg VinylSd VinylSd None 0 TA TA
## 2: CompShg Wd Sdng Wd Sdng BrkFace 108 TA TA
## 3: CompShg VinylSd VinylSd None 0 TA TA
## 4: CompShg VinylSd VinylSd BrkFace 20 TA TA
## 5: CompShg HdBoard HdBoard None 0 Gd TA
## 6: CompShg HdBoard HdBoard None 0 TA TA
## Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 1: CBlock TA TA No Rec 468
## 2: CBlock TA TA No ALQ 923
## 3: PConc Gd TA No GLQ 791
## 4: PConc TA TA No GLQ 602
## 5: PConc Gd TA No ALQ 263
## 6: PConc Gd TA No Unf 0
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir
## 1: LwQ 144 270 882 GasA TA Y
## 2: Unf 0 406 1329 GasA TA Y
## 3: Unf 0 137 928 GasA Gd Y
## 4: Unf 0 324 926 GasA Ex Y
## 5: Unf 0 1017 1280 GasA Ex Y
## 6: Unf 0 763 763 GasA Gd Y
## Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## 1: SBrkr 896 0 0 896 0
## 2: SBrkr 1329 0 0 1329 0
## 3: SBrkr 928 701 0 1629 0
## 4: SBrkr 926 678 0 1604 0
## 5: SBrkr 1280 0 0 1280 0
## 6: SBrkr 763 892 0 1655 0
## BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual
## 1: 0 1 0 2 1 TA
## 2: 0 1 1 3 1 Gd
## 3: 0 2 1 3 1 TA
## 4: 0 2 1 3 1 Gd
## 5: 0 2 0 2 1 Gd
## 6: 0 2 1 3 1 TA
## TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## 1: 5 Typ 0 <NA> Attchd 1961
## 2: 6 Typ 0 <NA> Attchd 1958
## 3: 6 Typ 1 TA Attchd 1997
## 4: 7 Typ 1 Gd Attchd 1998
## 5: 5 Typ 0 <NA> Attchd 1992
## 6: 7 Typ 1 TA Attchd 1993
## GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive
## 1: Unf 1 730 TA TA Y
## 2: Unf 1 312 TA TA Y
## 3: Fin 2 482 TA TA Y
## 4: Fin 2 470 TA TA Y
## 5: RFn 2 506 TA TA Y
## 6: Fin 2 440 TA TA Y
## WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC
## 1: 140 0 0 0 120 0 <NA>
## 2: 393 36 0 0 0 0 <NA>
## 3: 212 34 0 0 0 0 <NA>
## 4: 360 36 0 0 0 0 <NA>
## 5: 0 82 0 0 144 0 <NA>
## 6: 157 84 0 0 0 0 <NA>
## Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
## 1: MnPrv <NA> 0 6 2010 WD Normal
## 2: <NA> Gar2 12500 6 2010 WD Normal
## 3: MnPrv <NA> 0 3 2010 WD Normal
## 4: <NA> <NA> 0 6 2010 WD Normal
## 5: <NA> <NA> 0 1 2010 WD Normal
## 6: <NA> <NA> 0 4 2010 WD Normal
# Divide character and numeric columns..
# Names of the columns of `train` that are stored as character.
char_var <- names(train)[vapply(train, is.character, logical(1))]
# Character columns plus six integer-coded columns.
# NOTE(review): `char_car` is not referenced again in the visible part of the
# script; it may be a typo for `char_var` -- confirm before relying on it.
char_car <- c(
  char_var,
  'BedroomAbvGr', 'HalfBath', 'KitchenAbvGr',
  'BsmtFullBath', 'BsmtHalfBath', 'MSSubClass'
)
# Names of the columns of `train` that are numeric (integer or double).
numeric_var <- names(train)[vapply(train, is.numeric, logical(1))]
#Data Size and Structure
dim(train)
## [1] 1460 81
str(train)
## Classes 'data.table' and 'data.frame': 1460 obs. of 81 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr "RL" "RL" "RL" "RL" ...
## $ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ LotShape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ 1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ 2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr NA "TA" "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ 3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr NA NA NA NA ...
## $ Fence : chr NA NA NA NA ...
## $ MiscFeature : chr NA NA NA NA ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
## - attr(*, ".internal.selfref")=<externalptr>
#str(train[,c(1:10, 81)]) #Shows Response Variable
#Summarize the Missing Values in the Data
colSums(sapply(train, is.na))
## Id MSSubClass MSZoning LotFrontage LotArea
## 0 0 0 259 0
## Street Alley LotShape LandContour Utilities
## 0 1369 0 0 0
## LotConfig LandSlope Neighborhood Condition1 Condition2
## 0 0 0 0 0
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## 0 0 0 0 0
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## 0 0 0 0 0
## MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 8 8 0 0 0
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 37 37 38 37 0
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## 38 0 0 0 0
## HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF
## 0 0 1 0 0
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 0 0 0 0 0
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 0 0 0 0 0
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## 0 0 690 81 81
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## 81 0 0 81 81
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## 0 0 0 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## 0 0 1453 1179 1406
## MiscVal MoSold YrSold SaleType SaleCondition
## 0 0 0 0 0
## SalePrice
## 0
colSums(sapply(train[,.SD, .SDcols = char_var], is.na))
## MSZoning Street Alley LotShape LandContour
## 0 0 1369 0 0
## Utilities LotConfig LandSlope Neighborhood Condition1
## 0 0 0 0 0
## Condition2 BldgType HouseStyle RoofStyle RoofMatl
## 0 0 0 0 0
## Exterior1st Exterior2nd MasVnrType ExterQual ExterCond
## 0 0 8 0 0
## Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1
## 0 37 37 38 37
## BsmtFinType2 Heating HeatingQC CentralAir Electrical
## 38 0 0 0 1
## KitchenQual Functional FireplaceQu GarageType GarageFinish
## 0 0 690 81 81
## GarageQual GarageCond PavedDrive PoolQC Fence
## 81 81 0 1453 1179
## MiscFeature SaleType SaleCondition
## 1406 0 0
colSums(sapply(train[,.SD, .SDcols = numeric_var], is.na))
## Id MSSubClass LotFrontage LotArea OverallQual
## 0 0 259 0 0
## OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1
## 0 0 0 8 0
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF 1stFlrSF 2ndFlrSF
## 0 0 0 0 0
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 0 0 0 0 0
## HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces
## 0 0 0 0 0
## GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF
## 81 0 0 0 0
## EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal
## 0 0 0 0 0
## MoSold YrSold SalePrice
## 0 0 0
#Visualizing for Missing Data
# Draw a tile map showing which cells of a table are missing.
#
# data_in: table-like object (data.frame / data.table / matrix) whose cells are
#          tested with is.na().
# title:   optional plot title, passed to ggtitle().
#
# Returns a ggplot object with one tile per cell: x is the row index, y is the
# column name, and the fill encodes the indicator m (0 = missing, 1 = present).
plot_Missing <- function(data_in, title = NULL){
  # 0 marks a missing cell, 1 a present one.
  temp_df <- as.data.frame(ifelse(is.na(data_in), 0, 1))
  # Order columns by their count of present values. drop = FALSE keeps a
  # data.frame even when only a single column was supplied.
  temp_df <- temp_df[, order(colSums(temp_df)), drop = FALSE]
  # expand.grid() varies x fastest, which matches the column-major order in
  # which as.matrix() is flattened below. seq_len() is safe for zero rows.
  data_temp <- expand.grid(list(x = seq_len(nrow(temp_df)), y = colnames(temp_df)))
  data_temp$m <- as.vector(as.matrix(temp_df))
  # Fixed factor levels and named colours keep the colour of each indicator
  # value stable even when the input contains only one of the two values.
  ggplot(data_temp) +
    geom_tile(aes(x = x, y = y, fill = factor(m, levels = c(0, 1)))) +
    scale_fill_manual(values = c("0" = "green", "1" = "red"),
                      name = "Missing\n(0=Yes, 1=No)") +
    theme_light() +
    ylab("") +
    xlab("") +
    ggtitle(title)
}
plot_Missing(train[,colSums(is.na(train)) > 0, with = FALSE])

sum(train[,'YearRemodAdd', with = FALSE] != train[,'YearBuilt', with = FALSE])
## [1] 696
# Share of houses whose remodel year differs from the build year. The share is
# scaled by 100 so the printed number really is the percentage the label names.
cat('Percentage of houses remodeled', 100 * mean(train$YearRemodAdd != train$YearBuilt))
## Percentage of houses remodeled 47.67123
# Bar chart counting houses by whether the remodel year differs from the
# build year (0 -> 'No', 1 -> 'Yes').
train %>%
  select(YearBuilt, YearRemodAdd) %>%
  mutate(Remodeled = as.integer(YearBuilt != YearRemodAdd)) %>%
  ggplot(aes(x = factor(x = Remodeled, labels = c('No', 'Yes')))) +
  geom_bar() +
  xlab('Remodeled') +
  theme_light()

#Summarize the numeric values and the structure of the data.
summary(train[,.SD, .SDcols = numeric_var])
## Id MSSubClass LotFrontage LotArea
## Min. : 1.0 Min. : 20.0 Min. : 21.00 Min. : 1300
## 1st Qu.: 365.8 1st Qu.: 20.0 1st Qu.: 59.00 1st Qu.: 7554
## Median : 730.5 Median : 50.0 Median : 69.00 Median : 9478
## Mean : 730.5 Mean : 56.9 Mean : 70.05 Mean : 10517
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00 3rd Qu.: 11602
## Max. :1460.0 Max. :190.0 Max. :313.00 Max. :215245
## NA's :259
## OverallQual OverallCond YearBuilt YearRemodAdd
## Min. : 1.000 Min. :1.000 Min. :1872 Min. :1950
## 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954 1st Qu.:1967
## Median : 6.000 Median :5.000 Median :1973 Median :1994
## Mean : 6.099 Mean :5.575 Mean :1971 Mean :1985
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000 3rd Qu.:2004
## Max. :10.000 Max. :9.000 Max. :2010 Max. :2010
##
## MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF
## Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 223.0
## Median : 0.0 Median : 383.5 Median : 0.00 Median : 477.5
## Mean : 103.7 Mean : 443.6 Mean : 46.55 Mean : 567.2
## 3rd Qu.: 166.0 3rd Qu.: 712.2 3rd Qu.: 0.00 3rd Qu.: 808.0
## Max. :1600.0 Max. :5644.0 Max. :1474.00 Max. :2336.0
## NA's :8
## TotalBsmtSF 1stFlrSF 2ndFlrSF LowQualFinSF
## Min. : 0.0 Min. : 334 Min. : 0 Min. : 0.000
## 1st Qu.: 795.8 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
## Median : 991.5 Median :1087 Median : 0 Median : 0.000
## Mean :1057.4 Mean :1163 Mean : 347 Mean : 5.845
## 3rd Qu.:1298.2 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
## Max. :6110.0 Max. :4692 Max. :2065 Max. :572.000
##
## GrLivArea BsmtFullBath BsmtHalfBath FullBath
## Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
## Median :1464 Median :0.0000 Median :0.00000 Median :2.000
## Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
## 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
## Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
##
## HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. : 2.000
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.: 5.000
## Median :0.0000 Median :3.000 Median :1.000 Median : 6.000
## Mean :0.3829 Mean :2.866 Mean :1.047 Mean : 6.518
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :2.0000 Max. :8.000 Max. :3.000 Max. :14.000
##
## Fireplaces GarageYrBlt GarageCars GarageArea
## Min. :0.000 Min. :1900 Min. :0.000 Min. : 0.0
## 1st Qu.:0.000 1st Qu.:1961 1st Qu.:1.000 1st Qu.: 334.5
## Median :1.000 Median :1980 Median :2.000 Median : 480.0
## Mean :0.613 Mean :1979 Mean :1.767 Mean : 473.0
## 3rd Qu.:1.000 3rd Qu.:2002 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :3.000 Max. :2010 Max. :4.000 Max. :1418.0
## NA's :81
## WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 25.00 Median : 0.00 Median : 0.00
## Mean : 94.24 Mean : 46.66 Mean : 21.95 Mean : 3.41
## 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :857.00 Max. :547.00 Max. :552.00 Max. :508.00
##
## ScreenPorch PoolArea MiscVal MoSold
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 1.000
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 5.000
## Median : 0.00 Median : 0.000 Median : 0.00 Median : 6.000
## Mean : 15.06 Mean : 2.759 Mean : 43.49 Mean : 6.322
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 8.000
## Max. :480.00 Max. :738.000 Max. :15500.00 Max. :12.000
##
## YrSold SalePrice
## Min. :2006 Min. : 34900
## 1st Qu.:2007 1st Qu.:129975
## Median :2008 Median :163000
## Mean :2008 Mean :180921
## 3rd Qu.:2009 3rd Qu.:214000
## Max. :2010 Max. :755000
##
cat('Train has', dim(train)[1], 'rows and', dim(train)[2], 'columns.')
## Train has 1460 rows and 81 columns.
cat('Test has', dim(test)[1], 'rows and', dim(test)[2], 'columns.')
## Test has 1459 rows and 80 columns.
# The fraction of cells missing in train (multiply by 100 for a percentage).
sum(is.na(train)) / (nrow(train) * ncol(train))
## [1] 0.05889565
# The fraction of cells missing in test (multiply by 100 for a percentage).
sum(is.na(test)) / (nrow(test) * ncol(test))
## [1] 0.05997258
#Check for duplicated rows
cat("The number of duplicated rows are", nrow(train) - nrow(unique(train)))
## The number of duplicated rows are 0
#Convert Character to factors
# Turn every character column of `train` into a factor, in place (data.table `:=`).
train[, (char_var) := lapply(.SD, as.factor), .SDcols = char_var]
# Separate tables of the (now factor) categorical columns and the numeric columns.
train_cat <- train[, char_var, with = FALSE]
train_cont <- train[, numeric_var, with = FALSE]
# Bar chart of value counts for a single column.
#
# data_in: table holding the column to plot.
# i:       column index; used with `[[` to fetch the values and with
#          colnames() to build the x-axis label.
#
# Returns a ggplot object with one bar per distinct value (treated as a
# factor) and x-axis tick labels rotated by 90 degrees.
plotHist <- function(data_in, i) {
  values <- data.frame(x = data_in[[i]])
  axis_label <- colnames(data_in)[i]
  ggplot(data = values, aes(x = factor(x))) +
    stat_count() +
    xlab(axis_label) +
    theme_light() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
# Build one plot per index and arrange them on a single page.
#
# data_in: table handed unchanged to `fun`.
# fun:     plotting function called as fun(data_in = data_in, i = i).
# ii:      indices to plot, one call to `fun` per element.
# ncol:    number of grid columns, forwarded to grid.arrange().
#
# Returns whatever grid.arrange() returns.
doPlots <- function(data_in, fun, ii, ncol = 3) {
  # unname() keeps the plot list unnamed, as the arguments are passed
  # positionally to grid.arrange().
  plots <- lapply(unname(ii), function(idx) fun(data_in = data_in, i = idx))
  do.call("grid.arrange", c(plots, ncol = ncol))
}
# Density curve for a single column, labelled with its skewness.
#
# data_in: table holding the column to plot (its SalePrice column is also read).
# i:       column index; used with `[[` to fetch the values and with
#          colnames() to build the x-axis label.
#
# Returns a ggplot object. The x-axis label is the column name followed by the
# skewness of the column (NAs removed), rounded to two decimals.
plotDen <- function(data_in, i){
  values <- data_in[[i]]
  # SalePrice is carried along in the plotting frame but is not mapped to any
  # aesthetic below.
  plot_df <- data.frame(x = values, SalePrice = data_in$SalePrice)
  axis_label <- paste0(
    colnames(data_in)[i], '\n',
    'Skewness: ', round(skewness(values, na.rm = TRUE), 2)
  )
  ggplot(data = plot_df) +
    geom_line(aes(x = x), stat = 'density', size = 1, alpha = 1.0) +
    xlab(axis_label) +
    theme_light()
}
# Bar plots for the categorical features, drawn in pages of a 2-column grid.
# NOTE(review): the index ranges overlap at 4, 8 and 18, so those three
# features are drawn twice; only columns 1-22 of train_cat are plotted here.
doPlots(train_cat, fun = plotHist, ii = 1:4, ncol = 2)

doPlots(train_cat, fun = plotHist, ii = 4:8, ncol = 2)

doPlots(train_cat, fun = plotHist, ii = 8:12, ncol = 2)

doPlots(train_cat, fun = plotHist, ii = 13:18, ncol = 2)

doPlots(train_cat, fun = plotHist, ii = 18:22, ncol = 2)

# Count of houses with a moderate or severe land slope in each neighborhood.
# `%in%` tests each row for membership in the set. `==` against a length-2
# vector would recycle it, comparing alternating rows to 'Sev' and 'Mod' and
# silently dropping matching rows.
train %>%
  select(LandSlope, Neighborhood, SalePrice) %>%
  filter(LandSlope %in% c('Sev', 'Mod')) %>%
  arrange(Neighborhood) %>%
  group_by(Neighborhood, LandSlope) %>%
  summarize(Count = n()) %>%
  ggplot(aes(Neighborhood, Count)) +
  geom_bar(aes(fill = LandSlope), position = 'dodge', stat = 'identity') +
  theme_light() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Box plot of SalePrice for each neighborhood.
train %>%
  select(Neighborhood, SalePrice) %>%
  ggplot(aes(factor(Neighborhood), SalePrice)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab('Neighborhoods')

#Density plots for numeric variables.
doPlots(train_cont, fun = plotDen, ii = 2:6, ncol = 2)
## Warning: Removed 259 rows containing non-finite values (stat_density).

doPlots(train_cont, fun = plotDen, ii = 7:12, ncol = 2)
## Warning: Removed 8 rows containing non-finite values (stat_density).

doPlots(train_cont, fun = plotDen, ii = 13:17, ncol = 2)

doPlots(train_cont, fun = plotHist, ii = 18:23, ncol = 2)

#Explore the correlation
# Pairwise correlations of the numeric columns (first column dropped),
# computed only on rows that have no missing value in any of them.
correlations <- cor(na.omit(train_cont[, -1, with = FALSE]))
# correlations
# Keep a variable when more than one entry in its row exceeds 0.3 in absolute
# value; the diagonal entry (1) always counts, hence the "> 1".
row_indic <- rowSums(abs(correlations) > 0.3) > 1
correlations <- correlations[row_indic, row_indic]
corrplot(correlations, method = "square")

# Box plot of YearBuilt by OverallCond, followed by scatter plots of the variables most correlated with SalePrice.
train %>% select(OverallCond, YearBuilt) %>% ggplot(aes(factor(OverallCond),YearBuilt)) + geom_boxplot() + xlab('Overall Condition')

# Scatter plot of one column against SalePrice with a fitted straight line.
#
# data_in: table holding the column to plot and a SalePrice column.
# i:       column index; used with `[[` to fetch the values and with
#          colnames() to build the x-axis label.
#
# Returns a ggplot object. The x-axis label is the column name followed by the
# R-squared of the straight-line fit, rounded to two decimals.
plotCorr <- function(data_in, i){
  data <- data.frame(x = data_in[[i]], SalePrice = data_in$SalePrice)
  # For a one-predictor linear fit, R-squared is the squared Pearson
  # correlation; the label previously showed the unsquared correlation.
  r_squared <- cor(data$x, data$SalePrice, use = 'complete.obs')^2
  # No suppressWarnings() wrapper: ggplot warnings are raised when the plot is
  # printed, not when the object is built, so wrapping it here had no effect.
  ggplot(data, aes(x = x, y = SalePrice)) +
    geom_point(shape = 1, na.rm = TRUE) +
    geom_smooth(method = lm) +
    xlab(paste0(colnames(data_in)[i], '\n', 'R-Squared: ', round(r_squared, 2))) +
    theme_light()
}
# Variables whose correlation with SalePrice is above 0.5 or below -0.2
# (positive matches first, then negative ones).
# NOTE(review): SalePrice correlates 1 with itself, so it is presumably part
# of this selection too -- confirm that is intended.
highcorr <- local({
  sale_corr <- correlations[, 'SalePrice']
  c(names(sale_corr)[which(sale_corr > 0.5)],
    names(sale_corr)[which(sale_corr < -0.2)])
})
data_corr <- train[, .SD, .SDcols = highcorr]
# First page of scatter plots: selected columns 1 to 6.
doPlots(data_corr, fun = plotCorr, ii = 1:6)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

doPlots(data_corr, fun = plotCorr, ii = 6:11)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 81 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

library(scales)
ggplot(train, aes(x=SalePrice)) + geom_histogram(col = 'white') + theme_light() +scale_x_continuous(labels = comma)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary(train[,.(SalePrice)])
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
# Histogram of log(SalePrice + 1): the log transform makes the distribution more symmetric.
ggplot(train, aes(x=log(SalePrice+1))) + geom_histogram(col = 'white') + theme_light()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
