Advanced Regression Techniques competition

Load the Libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.3.3
library(broom)
## Warning: package 'broom' was built under R version 4.3.3
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.3.3
library(skimr)
## Warning: package 'skimr' was built under R version 4.3.3
library(ggplot2)

Import R Dataset from GitHub

# Raw GitHub locations of the training and test CSV files.
# NOTE(review): the two files are pulled from different repositories --
# confirm that they belong to the same release of the data set.
urlfile1 <- "https://raw.githubusercontent.com/uzmabb182/Data605_Assignment/main/datasets/train.csv"

urlfile2 <- "https://raw.githubusercontent.com/tiwari91/Housing-Prices/master/test.csv"

# Load the training set through a URL connection; column types are guessed
# by read_csv().
train_df <- read_csv(url(urlfile1))
## Rows: 1460 Columns: 81
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (38): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the test set through a URL connection, the same way as the training set.
test_df <- read_csv(url(urlfile2))
## Rows: 1459 Columns: 80
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (37): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(train_df)
## # A tibble: 6 × 81
##      Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
##   <dbl>      <dbl> <chr>          <dbl>   <dbl> <chr>  <chr> <chr>   
## 1     1         60 RL                65    8450 Pave   <NA>  Reg     
## 2     2         20 RL                80    9600 Pave   <NA>  Reg     
## 3     3         60 RL                68   11250 Pave   <NA>  IR1     
## 4     4         70 RL                60    9550 Pave   <NA>  IR1     
## 5     5         60 RL                84   14260 Pave   <NA>  IR1     
## 6     6         50 RL                85   14115 Pave   <NA>  IR1     
## # ℹ 73 more variables: LandContour <chr>, Utilities <chr>, LotConfig <chr>,
## #   LandSlope <chr>, Neighborhood <chr>, Condition1 <chr>, Condition2 <chr>,
## #   BldgType <chr>, HouseStyle <chr>, OverallQual <dbl>, OverallCond <dbl>,
## #   YearBuilt <dbl>, YearRemodAdd <dbl>, RoofStyle <chr>, RoofMatl <chr>,
## #   Exterior1st <chr>, Exterior2nd <chr>, MasVnrType <chr>, MasVnrArea <dbl>,
## #   ExterQual <chr>, ExterCond <chr>, Foundation <chr>, BsmtQual <chr>,
## #   BsmtCond <chr>, BsmtExposure <chr>, BsmtFinType1 <chr>, BsmtFinSF1 <dbl>, …

View and Check Dimensions.

#view(train_df)
dim(train_df)
## [1] 1460   81

Check Column Names.

names(train_df)
##  [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
##  [5] "LotArea"       "Street"        "Alley"         "LotShape"     
##  [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
## [13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
## [21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
## [29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
## [33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
## [37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
## [41] "HeatingQC"     "CentralAir"    "Electrical"    "1stFlrSF"     
## [45] "2ndFlrSF"      "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
## [49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
## [53] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
## [57] "Fireplaces"    "FireplaceQu"   "GarageType"    "GarageYrBlt"  
## [61] "GarageFinish"  "GarageCars"    "GarageArea"    "GarageQual"   
## [65] "GarageCond"    "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"  
## [69] "EnclosedPorch" "3SsnPorch"     "ScreenPorch"   "PoolArea"     
## [73] "PoolQC"        "Fence"         "MiscFeature"   "MiscVal"      
## [77] "MoSold"        "YrSold"        "SaleType"      "SaleCondition"
## [81] "SalePrice"

Check Internal Structure of the Data Frame

glimpse(train_df)
## Rows: 1,460
## Columns: 81
## $ Id            <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ MSSubClass    <dbl> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ MSZoning      <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM", "R…
## $ LotFrontage   <dbl> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
## $ LotArea       <dbl> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ Street        <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", …
## $ Alley         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ LotShape      <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1", …
## $ LandContour   <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", …
## $ Utilities     <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllPu…
## $ LotConfig     <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Inside", "I…
## $ LandSlope     <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", …
## $ Neighborhood  <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoRidge", "…
## $ Condition1    <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm",…
## $ Condition2    <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", …
## $ BldgType      <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", …
## $ HouseStyle    <chr> "2Story", "1Story", "2Story", "2Story", "2Story", "1.5Fi…
## $ OverallQual   <dbl> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond   <dbl> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt     <dbl> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd  <dbl> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ RoofStyle     <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Gable", "G…
## $ RoofMatl      <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg", "…
## $ Exterior1st   <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "VinylSd", "…
## $ Exterior2nd   <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "VinylSd", "…
## $ MasVnrType    <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace", "None",…
## $ MasVnrArea    <dbl> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
## $ ExterQual     <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", "TA", "T…
## $ ExterCond     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
## $ Foundation    <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "Wood", "…
## $ BsmtQual      <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", "TA", "T…
## $ BsmtCond      <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", "TA", "T…
## $ BsmtExposure  <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", "No", "N…
## $ BsmtFinType1  <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ", …
## $ BsmtFinSF1    <dbl> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinType2  <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "BLQ", …
## $ BsmtFinSF2    <dbl> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF     <dbl> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF   <dbl> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ Heating       <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", …
## $ HeatingQC     <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", "Gd", "E…
## $ CentralAir    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
## $ Electrical    <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "S…
## $ `1stFlrSF`    <dbl> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ `2ndFlrSF`    <dbl> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea     <dbl> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath  <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath  <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath      <dbl> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath      <dbl> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr  <dbl> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ KitchenQual   <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", "TA", "T…
## $ TotRmsAbvGrd  <dbl> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Functional    <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", …
## $ Fireplaces    <dbl> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ FireplaceQu   <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA", "TA", …
## $ GarageType    <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd", "Attch…
## $ GarageYrBlt   <dbl> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
## $ GarageFinish  <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn", "RFn", …
## $ GarageCars    <dbl> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea    <dbl> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ GarageQual    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "Fa", "G…
## $ GarageCond    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
## $ PavedDrive    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
## $ WoodDeckSF    <dbl> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF   <dbl> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <dbl> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ `3SsnPorch`   <dbl> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolQC        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Fence         <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, NA,…
## $ MiscFeature   <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, NA, NA, …
## $ MiscVal       <dbl> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold        <dbl> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold        <dbl> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SaleType      <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "W…
## $ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal", "Norm…
## $ SalePrice     <dbl> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …

Getting Useful Summary Statistics

skim(train_df)
Data summary
Name train_df
Number of rows 1460
Number of columns 81
_______________________
Column type frequency:
character 43
numeric 38
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
MSZoning 0 1.00 2 7 0 5 0
Street 0 1.00 4 4 0 2 0
Alley 1369 0.06 4 4 0 2 0
LotShape 0 1.00 3 3 0 4 0
LandContour 0 1.00 3 3 0 4 0
Utilities 0 1.00 6 6 0 2 0
LotConfig 0 1.00 3 7 0 5 0
LandSlope 0 1.00 3 3 0 3 0
Neighborhood 0 1.00 5 7 0 25 0
Condition1 0 1.00 4 6 0 9 0
Condition2 0 1.00 4 6 0 8 0
BldgType 0 1.00 4 6 0 5 0
HouseStyle 0 1.00 4 6 0 8 0
RoofStyle 0 1.00 3 7 0 6 0
RoofMatl 0 1.00 4 7 0 8 0
Exterior1st 0 1.00 5 7 0 15 0
Exterior2nd 0 1.00 5 7 0 16 0
MasVnrType 8 0.99 4 7 0 4 0
ExterQual 0 1.00 2 2 0 4 0
ExterCond 0 1.00 2 2 0 5 0
Foundation 0 1.00 4 6 0 6 0
BsmtQual 37 0.97 2 2 0 4 0
BsmtCond 37 0.97 2 2 0 4 0
BsmtExposure 38 0.97 2 2 0 4 0
BsmtFinType1 37 0.97 3 3 0 6 0
BsmtFinType2 38 0.97 3 3 0 6 0
Heating 0 1.00 4 5 0 6 0
HeatingQC 0 1.00 2 2 0 5 0
CentralAir 0 1.00 1 1 0 2 0
Electrical 1 1.00 3 5 0 5 0
KitchenQual 0 1.00 2 2 0 4 0
Functional 0 1.00 3 4 0 7 0
FireplaceQu 690 0.53 2 2 0 5 0
GarageType 81 0.94 6 7 0 6 0
GarageFinish 81 0.94 3 3 0 3 0
GarageQual 81 0.94 2 2 0 5 0
GarageCond 81 0.94 2 2 0 5 0
PavedDrive 0 1.00 1 1 0 3 0
PoolQC 1453 0.00 2 2 0 3 0
Fence 1179 0.19 4 5 0 4 0
MiscFeature 1406 0.04 4 4 0 4 0
SaleType 0 1.00 2 5 0 9 0
SaleCondition 0 1.00 6 7 0 6 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Id 0 1.00 730.50 421.61 1 365.75 730.5 1095.25 1460 ▇▇▇▇▇
MSSubClass 0 1.00 56.90 42.30 20 20.00 50.0 70.00 190 ▇▅▂▁▁
LotFrontage 259 0.82 70.05 24.28 21 59.00 69.0 80.00 313 ▇▃▁▁▁
LotArea 0 1.00 10516.83 9981.26 1300 7553.50 9478.5 11601.50 215245 ▇▁▁▁▁
OverallQual 0 1.00 6.10 1.38 1 5.00 6.0 7.00 10 ▁▂▇▅▁
OverallCond 0 1.00 5.58 1.11 1 5.00 5.0 6.00 9 ▁▁▇▅▁
YearBuilt 0 1.00 1971.27 30.20 1872 1954.00 1973.0 2000.00 2010 ▁▂▃▆▇
YearRemodAdd 0 1.00 1984.87 20.65 1950 1967.00 1994.0 2004.00 2010 ▅▂▂▃▇
MasVnrArea 8 0.99 103.69 181.07 0 0.00 0.0 166.00 1600 ▇▁▁▁▁
BsmtFinSF1 0 1.00 443.64 456.10 0 0.00 383.5 712.25 5644 ▇▁▁▁▁
BsmtFinSF2 0 1.00 46.55 161.32 0 0.00 0.0 0.00 1474 ▇▁▁▁▁
BsmtUnfSF 0 1.00 567.24 441.87 0 223.00 477.5 808.00 2336 ▇▅▂▁▁
TotalBsmtSF 0 1.00 1057.43 438.71 0 795.75 991.5 1298.25 6110 ▇▃▁▁▁
1stFlrSF 0 1.00 1162.63 386.59 334 882.00 1087.0 1391.25 4692 ▇▅▁▁▁
2ndFlrSF 0 1.00 346.99 436.53 0 0.00 0.0 728.00 2065 ▇▃▂▁▁
LowQualFinSF 0 1.00 5.84 48.62 0 0.00 0.0 0.00 572 ▇▁▁▁▁
GrLivArea 0 1.00 1515.46 525.48 334 1129.50 1464.0 1776.75 5642 ▇▇▁▁▁
BsmtFullBath 0 1.00 0.43 0.52 0 0.00 0.0 1.00 3 ▇▆▁▁▁
BsmtHalfBath 0 1.00 0.06 0.24 0 0.00 0.0 0.00 2 ▇▁▁▁▁
FullBath 0 1.00 1.57 0.55 0 1.00 2.0 2.00 3 ▁▇▁▇▁
HalfBath 0 1.00 0.38 0.50 0 0.00 0.0 1.00 2 ▇▁▅▁▁
BedroomAbvGr 0 1.00 2.87 0.82 0 2.00 3.0 3.00 8 ▁▇▂▁▁
KitchenAbvGr 0 1.00 1.05 0.22 0 1.00 1.0 1.00 3 ▁▇▁▁▁
TotRmsAbvGrd 0 1.00 6.52 1.63 2 5.00 6.0 7.00 14 ▂▇▇▁▁
Fireplaces 0 1.00 0.61 0.64 0 0.00 1.0 1.00 3 ▇▇▁▁▁
GarageYrBlt 81 0.94 1978.51 24.69 1900 1961.00 1980.0 2002.00 2010 ▁▁▅▅▇
GarageCars 0 1.00 1.77 0.75 0 1.00 2.0 2.00 4 ▁▃▇▂▁
GarageArea 0 1.00 472.98 213.80 0 334.50 480.0 576.00 1418 ▂▇▃▁▁
WoodDeckSF 0 1.00 94.24 125.34 0 0.00 0.0 168.00 857 ▇▂▁▁▁
OpenPorchSF 0 1.00 46.66 66.26 0 0.00 25.0 68.00 547 ▇▁▁▁▁
EnclosedPorch 0 1.00 21.95 61.12 0 0.00 0.0 0.00 552 ▇▁▁▁▁
3SsnPorch 0 1.00 3.41 29.32 0 0.00 0.0 0.00 508 ▇▁▁▁▁
ScreenPorch 0 1.00 15.06 55.76 0 0.00 0.0 0.00 480 ▇▁▁▁▁
PoolArea 0 1.00 2.76 40.18 0 0.00 0.0 0.00 738 ▇▁▁▁▁
MiscVal 0 1.00 43.49 496.12 0 0.00 0.0 0.00 15500 ▇▁▁▁▁
MoSold 0 1.00 6.32 2.70 1 5.00 6.0 8.00 12 ▃▆▇▃▃
YrSold 0 1.00 2007.82 1.33 2006 2007.00 2008.0 2009.00 2010 ▇▇▇▇▅
SalePrice 0 1.00 180921.20 79442.50 34900 129975.00 163000.0 214000.00 755000 ▇▅▁▁▁

Investigate the Data Types of the Columns.

str(train_df)
## spc_tbl_ [1,460 × 81] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Id           : num [1:1460] 1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : num [1:1460] 60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : chr [1:1460] "RL" "RL" "RL" "RL" ...
##  $ LotFrontage  : num [1:1460] 65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : num [1:1460] 8450 9600 11250 9550 14260 ...
##  $ Street       : chr [1:1460] "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley        : chr [1:1460] NA NA NA NA ...
##  $ LotShape     : chr [1:1460] "Reg" "Reg" "IR1" "IR1" ...
##  $ LandContour  : chr [1:1460] "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr [1:1460] "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr [1:1460] "Inside" "FR2" "Inside" "Corner" ...
##  $ LandSlope    : chr [1:1460] "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr [1:1460] "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ Condition1   : chr [1:1460] "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition2   : chr [1:1460] "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr [1:1460] "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr [1:1460] "2Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : num [1:1460] 7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : num [1:1460] 5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : num [1:1460] 2003 1976 2001 1915 2000 ...
##  $ YearRemodAdd : num [1:1460] 2003 1976 2002 1970 2000 ...
##  $ RoofStyle    : chr [1:1460] "Gable" "Gable" "Gable" "Gable" ...
##  $ RoofMatl     : chr [1:1460] "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ Exterior2nd  : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ MasVnrType   : chr [1:1460] "BrkFace" "None" "BrkFace" "None" ...
##  $ MasVnrArea   : num [1:1460] 196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : chr [1:1460] "Gd" "TA" "Gd" "TA" ...
##  $ ExterCond    : chr [1:1460] "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr [1:1460] "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ BsmtQual     : chr [1:1460] "Gd" "Gd" "Gd" "TA" ...
##  $ BsmtCond     : chr [1:1460] "TA" "TA" "TA" "Gd" ...
##  $ BsmtExposure : chr [1:1460] "No" "Gd" "Mn" "No" ...
##  $ BsmtFinType1 : chr [1:1460] "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ BsmtFinSF1   : num [1:1460] 706 978 486 216 655 ...
##  $ BsmtFinType2 : chr [1:1460] "Unf" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : num [1:1460] 0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : num [1:1460] 150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : num [1:1460] 856 1262 920 756 1145 ...
##  $ Heating      : chr [1:1460] "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr [1:1460] "Ex" "Ex" "Ex" "Gd" ...
##  $ CentralAir   : chr [1:1460] "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr [1:1460] "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ 1stFlrSF     : num [1:1460] 856 1262 920 961 1145 ...
##  $ 2ndFlrSF     : num [1:1460] 854 0 866 756 1053 ...
##  $ LowQualFinSF : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : num [1:1460] 1710 1262 1786 1717 2198 ...
##  $ BsmtFullBath : num [1:1460] 1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : num [1:1460] 0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : num [1:1460] 2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : num [1:1460] 1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : num [1:1460] 3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : num [1:1460] 1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : chr [1:1460] "Gd" "TA" "Gd" "Gd" ...
##  $ TotRmsAbvGrd : num [1:1460] 8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : chr [1:1460] "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : num [1:1460] 0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : chr [1:1460] NA "TA" "TA" "Gd" ...
##  $ GarageType   : chr [1:1460] "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ GarageYrBlt  : num [1:1460] 2003 1976 2001 1998 2000 ...
##  $ GarageFinish : chr [1:1460] "RFn" "RFn" "RFn" "Unf" ...
##  $ GarageCars   : num [1:1460] 2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : num [1:1460] 548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : chr [1:1460] "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr [1:1460] "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr [1:1460] "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : num [1:1460] 0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : num [1:1460] 61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: num [1:1460] 0 0 0 272 0 0 0 228 205 0 ...
##  $ 3SsnPorch    : num [1:1460] 0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : chr [1:1460] NA NA NA NA ...
##  $ Fence        : chr [1:1460] NA NA NA NA ...
##  $ MiscFeature  : chr [1:1460] NA NA NA NA ...
##  $ MiscVal      : num [1:1460] 0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : num [1:1460] 2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : num [1:1460] 2008 2007 2008 2006 2008 ...
##  $ SaleType     : chr [1:1460] "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr [1:1460] "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice    : num [1:1460] 208500 181500 223500 140000 250000 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Id = col_double(),
##   ..   MSSubClass = col_double(),
##   ..   MSZoning = col_character(),
##   ..   LotFrontage = col_double(),
##   ..   LotArea = col_double(),
##   ..   Street = col_character(),
##   ..   Alley = col_character(),
##   ..   LotShape = col_character(),
##   ..   LandContour = col_character(),
##   ..   Utilities = col_character(),
##   ..   LotConfig = col_character(),
##   ..   LandSlope = col_character(),
##   ..   Neighborhood = col_character(),
##   ..   Condition1 = col_character(),
##   ..   Condition2 = col_character(),
##   ..   BldgType = col_character(),
##   ..   HouseStyle = col_character(),
##   ..   OverallQual = col_double(),
##   ..   OverallCond = col_double(),
##   ..   YearBuilt = col_double(),
##   ..   YearRemodAdd = col_double(),
##   ..   RoofStyle = col_character(),
##   ..   RoofMatl = col_character(),
##   ..   Exterior1st = col_character(),
##   ..   Exterior2nd = col_character(),
##   ..   MasVnrType = col_character(),
##   ..   MasVnrArea = col_double(),
##   ..   ExterQual = col_character(),
##   ..   ExterCond = col_character(),
##   ..   Foundation = col_character(),
##   ..   BsmtQual = col_character(),
##   ..   BsmtCond = col_character(),
##   ..   BsmtExposure = col_character(),
##   ..   BsmtFinType1 = col_character(),
##   ..   BsmtFinSF1 = col_double(),
##   ..   BsmtFinType2 = col_character(),
##   ..   BsmtFinSF2 = col_double(),
##   ..   BsmtUnfSF = col_double(),
##   ..   TotalBsmtSF = col_double(),
##   ..   Heating = col_character(),
##   ..   HeatingQC = col_character(),
##   ..   CentralAir = col_character(),
##   ..   Electrical = col_character(),
##   ..   `1stFlrSF` = col_double(),
##   ..   `2ndFlrSF` = col_double(),
##   ..   LowQualFinSF = col_double(),
##   ..   GrLivArea = col_double(),
##   ..   BsmtFullBath = col_double(),
##   ..   BsmtHalfBath = col_double(),
##   ..   FullBath = col_double(),
##   ..   HalfBath = col_double(),
##   ..   BedroomAbvGr = col_double(),
##   ..   KitchenAbvGr = col_double(),
##   ..   KitchenQual = col_character(),
##   ..   TotRmsAbvGrd = col_double(),
##   ..   Functional = col_character(),
##   ..   Fireplaces = col_double(),
##   ..   FireplaceQu = col_character(),
##   ..   GarageType = col_character(),
##   ..   GarageYrBlt = col_double(),
##   ..   GarageFinish = col_character(),
##   ..   GarageCars = col_double(),
##   ..   GarageArea = col_double(),
##   ..   GarageQual = col_character(),
##   ..   GarageCond = col_character(),
##   ..   PavedDrive = col_character(),
##   ..   WoodDeckSF = col_double(),
##   ..   OpenPorchSF = col_double(),
##   ..   EnclosedPorch = col_double(),
##   ..   `3SsnPorch` = col_double(),
##   ..   ScreenPorch = col_double(),
##   ..   PoolArea = col_double(),
##   ..   PoolQC = col_character(),
##   ..   Fence = col_character(),
##   ..   MiscFeature = col_character(),
##   ..   MiscVal = col_double(),
##   ..   MoSold = col_double(),
##   ..   YrSold = col_double(),
##   ..   SaleType = col_character(),
##   ..   SaleCondition = col_character(),
##   ..   SalePrice = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Drop the Columns with the Most NAs

# Remove the five columns with the most missing values in the skim() summary
# (Alley, PoolQC, Fence, MiscFeature, FireplaceQu), keeping every other column
# in its original order, then re-inspect the structure of what is left.
train_df <- train_df[
  ,
  setdiff(
    names(train_df),
    c("Alley", "PoolQC", "Fence", "MiscFeature", "FireplaceQu")
  )
]
str(train_df)
## tibble [1,460 × 76] (S3: tbl_df/tbl/data.frame)
##  $ Id           : num [1:1460] 1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : num [1:1460] 60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : chr [1:1460] "RL" "RL" "RL" "RL" ...
##  $ LotFrontage  : num [1:1460] 65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : num [1:1460] 8450 9600 11250 9550 14260 ...
##  $ Street       : chr [1:1460] "Pave" "Pave" "Pave" "Pave" ...
##  $ LotShape     : chr [1:1460] "Reg" "Reg" "IR1" "IR1" ...
##  $ LandContour  : chr [1:1460] "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr [1:1460] "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr [1:1460] "Inside" "FR2" "Inside" "Corner" ...
##  $ LandSlope    : chr [1:1460] "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr [1:1460] "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ Condition1   : chr [1:1460] "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition2   : chr [1:1460] "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr [1:1460] "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr [1:1460] "2Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : num [1:1460] 7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : num [1:1460] 5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : num [1:1460] 2003 1976 2001 1915 2000 ...
##  $ YearRemodAdd : num [1:1460] 2003 1976 2002 1970 2000 ...
##  $ RoofStyle    : chr [1:1460] "Gable" "Gable" "Gable" "Gable" ...
##  $ RoofMatl     : chr [1:1460] "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ Exterior2nd  : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ MasVnrType   : chr [1:1460] "BrkFace" "None" "BrkFace" "None" ...
##  $ MasVnrArea   : num [1:1460] 196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : chr [1:1460] "Gd" "TA" "Gd" "TA" ...
##  $ ExterCond    : chr [1:1460] "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr [1:1460] "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ BsmtQual     : chr [1:1460] "Gd" "Gd" "Gd" "TA" ...
##  $ BsmtCond     : chr [1:1460] "TA" "TA" "TA" "Gd" ...
##  $ BsmtExposure : chr [1:1460] "No" "Gd" "Mn" "No" ...
##  $ BsmtFinType1 : chr [1:1460] "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ BsmtFinSF1   : num [1:1460] 706 978 486 216 655 ...
##  $ BsmtFinType2 : chr [1:1460] "Unf" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : num [1:1460] 0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : num [1:1460] 150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : num [1:1460] 856 1262 920 756 1145 ...
##  $ Heating      : chr [1:1460] "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr [1:1460] "Ex" "Ex" "Ex" "Gd" ...
##  $ CentralAir   : chr [1:1460] "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr [1:1460] "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ 1stFlrSF     : num [1:1460] 856 1262 920 961 1145 ...
##  $ 2ndFlrSF     : num [1:1460] 854 0 866 756 1053 ...
##  $ LowQualFinSF : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : num [1:1460] 1710 1262 1786 1717 2198 ...
##  $ BsmtFullBath : num [1:1460] 1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : num [1:1460] 0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : num [1:1460] 2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : num [1:1460] 1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : num [1:1460] 3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : num [1:1460] 1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : chr [1:1460] "Gd" "TA" "Gd" "Gd" ...
##  $ TotRmsAbvGrd : num [1:1460] 8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : chr [1:1460] "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : num [1:1460] 0 1 1 1 1 0 1 2 2 2 ...
##  $ GarageType   : chr [1:1460] "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ GarageYrBlt  : num [1:1460] 2003 1976 2001 1998 2000 ...
##  $ GarageFinish : chr [1:1460] "RFn" "RFn" "RFn" "Unf" ...
##  $ GarageCars   : num [1:1460] 2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : num [1:1460] 548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : chr [1:1460] "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr [1:1460] "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr [1:1460] "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : num [1:1460] 0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : num [1:1460] 61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: num [1:1460] 0 0 0 272 0 0 0 228 205 0 ...
##  $ 3SsnPorch    : num [1:1460] 0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
##  $ MiscVal      : num [1:1460] 0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : num [1:1460] 2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : num [1:1460] 2008 2007 2008 2006 2008 ...
##  $ SaleType     : chr [1:1460] "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr [1:1460] "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice    : num [1:1460] 208500 181500 223500 140000 250000 ...
sum(is.na(train_df))
## [1] 868

Pick an Independent Variable with Right-Skewness.

Pick one of the quantitative independent variables from the training data set (train.csv) , and define that variable as X.

Make sure this variable is skewed to the right! Pick the dependent variable and define it as Y.

# First candidate predictor: the overall quality rating.
x <- train_df$OverallQual
hist(x, main = "Overall Quality")

# Second candidate predictor: the lot area.
x <- train_df$LotArea
hist(train_df$LotArea, main = "Lot Area")

# The LotArea histogram is clearly right-skewed, so LotArea is used as X.
X <- train_df$LotArea

# Y is the target we are trying to predict: SalePrice, the property's sale
# price in dollars.
Y <- train_df$SalePrice

# Histogram of the target (SalePrice).
hist(train_df$SalePrice, main = "Sale Price")

Probability.

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 3d quartile of the X variable, and the small letter “y” is estimated as the 2d quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.

  a. P(X>x | Y>y) b. P(X>x, Y>y) c. P(X<x | Y>y)

Summarize X (independent var.) and Y (target)

Pipe the data frame into the select() function from the dplyr package in R

# Narrow the data to the predictor (LotArea) and the target (SalePrice), then
# print the summary statistics of both columns.
# select() carries its dplyr:: prefix so the dplyr version is used regardless
# of what other attached packages export under the same name.
df <- train_df %>%
  dplyr::select(LotArea, SalePrice)
summary(df)
##     LotArea         SalePrice     
##  Min.   :  1300   Min.   : 34900  
##  1st Qu.:  7554   1st Qu.:129975  
##  Median :  9478   Median :163000  
##  Mean   : 10517   Mean   :180921  
##  3rd Qu.: 11602   3rd Qu.:214000  
##  Max.   :215245   Max.   :755000

The meaning of all these probabilities:

  a. is the probability that LotArea is bigger than its first quartile, given that SalePrice is bigger than its first quartile.

  b. is the probability that both events happen together: LotArea is bigger than its first quartile and SalePrice is bigger than its first quartile.

  c. is the probability that LotArea is smaller than its first quartile, given that SalePrice is bigger than its first quartile.

# First-quartile cut-offs for the predictor (LotArea) and the target
# (SalePrice).
# NOTE(review): the task statement asks for the 3rd quartile of X and the
# 2nd quartile (median) of Y; 0.25 is used for both here -- confirm intent.
XQ1 <- quantile(train_df$LotArea, probs = 0.25)
YQ1 <- quantile(train_df$SalePrice, probs = 0.25)

# Split the rows on the SalePrice cut-off first
# (Yy: above the cut-off, yY: at or below it).
Yy <- dplyr::filter(train_df, SalePrice > YQ1)
yY <- dplyr::filter(train_df, SalePrice <= YQ1)

# Then split each half on the LotArea cut-off. This yields the four cells of
# the 2 x 2 table (Xx: LotArea above the cut-off, xX: at or below it).
Xx_Yy <- dplyr::filter(Yy, LotArea > XQ1)
xX_Yy <- dplyr::filter(Yy, LotArea <= XQ1)
Xx_yY <- dplyr::filter(yY, LotArea > XQ1)
xX_yY <- dplyr::filter(yY, LotArea <= XQ1)

Calculate the required probabilities.

a. P(X>x | Y>y)

# Part a: P(X > x | Y > y).
# `a` is the number of rows with X > x and Y > y; the table of counts
# reuses it.
a <- nrow(Xx_Yy)
# A conditional probability is taken over the conditioning event only, so the
# denominator is the number of rows with Y > y, not the whole sample.
a / nrow(Yy)
## [1] 0.8200913

b. P(X>x, Y>y)

# Part b: joint probability P(X > x, Y > y).
# `b` is the number of rows with X > x and Y <= y. It is not the numerator of
# this probability, but the table of counts needs it.
b <- nrow(Xx_yY)
# Both conditions must hold, so count the rows with X > x and Y > y and
# divide by the whole sample.
nrow(Xx_Yy) / nrow(train_df)
## [1] 0.6150685

c. P(X<x | Y>y)

# Part c: P(X < x | Y > y).
# `c` is the number of rows with X <= x and Y > y; the table of counts reuses
# it. (As a variable it does not stop calls to c() from resolving to the
# base function.)
c <- nrow(xX_Yy)
# Conditional on Y > y: divide by the rows with Y > y, not the whole sample.
c / nrow(Yy)
## [1] 0.1799087
# Remaining cell: joint proportion of rows with X <= x and Y <= y.
# `d` is the fourth count needed by the table of counts.
d <- nrow(xX_yY)
d / nrow(train_df)
## [1] 0.1150685

Creating a table of counts

# 3 x 3 table of counts: rows are the LotArea groups, columns the SalePrice
# groups, and the last row / column hold the marginal totals.
group_labels <- c("<=1Q", ">1Q", "Total")
table <- rbind(
  c(d, c, d + c),
  c(b, a, b + a),
  c(b + d, c + a, a + b + c + d)
)
dimnames(table) <- list(group_labels, group_labels)
result_table <- as.table(table)
result_table
##       <=1Q  >1Q Total
## <=1Q   168  197   365
## >1Q    197  898  1095
## Total  365 1095  1460

Does splitting the training data in this fashion make them independent?

No, independence explains whether there is a relation between X & Y.

Splitting the data doesn’t change the relationship, it just changes the extent of problem domain.

Does P(A|B)=P(A)P(B)?

Let A be the new variable counting those observations above the 3d quartile for X,

let B be the new variable counting those observations for the 2d quartile for Y.

Does P(A|B)=P(A)P(B)?

Check mathematically, and then evaluate by running a Chi Square test for association.

# A: observations whose LotArea is at or above its first quartile.
Xx_A <- subset(train_df, LotArea >= XQ1)

# B: observations whose SalePrice is at or above its first quartile
# (YY_B_2); Yy_B_1 holds the at-or-below side.
YQ1 <- quantile(train_df$SalePrice, 0.25)
Yy_B_1 <- subset(train_df, SalePrice <= YQ1)
YY_B_2 <- subset(train_df, SalePrice >= YQ1)

# Observations where A and B both hold.
YY_XX <- subset(YY_B_2, LotArea >= XQ1)

n_obs <- nrow(train_df)
p_a <- nrow(Xx_A) / n_obs
p_b <- nrow(YY_B_2) / n_obs

# P(A|B) conditions on B, so it is normalised by the size of B. Dividing by
# the sample size would give the joint probability P(A, B) instead.
res1 <- nrow(YY_XX) / nrow(YY_B_2)
# P(A)P(B)
res2 <- p_a * p_b

# Proportions are floating-point numbers, so compare them with a tolerance
# rather than with `==`. (Under independence P(A|B) would equal P(A).)
print(c("P(A|B)=P(A)P(B)?: ", isTRUE(all.equal(res1, res2))))
## [1] "P(A|B)=P(A)P(B)?: " "FALSE"

The variables are not independent.

Description:

Chi-Square test

# Chi-square test of association between the two quartile indicators used
# above (LotArea above its first quartile vs. SalePrice above its first
# quartile). Cross-tabulating the raw LotArea and SalePrice values instead
# creates one cell per distinct value pair (709664 degrees of freedom for
# 1460 rows), so the expected counts are far too small for the chi-square
# approximation and the warnings had to be suppressed.
chi_table <- table(train_df$LotArea > XQ1, train_df$SalePrice > YQ1)
chisq.test(chi_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  chi_table
## X-squared = 113.27, df = 1, p-value < 2.2e-16

A p value is < 2.2e-16.

Therefore, we reject the null hypothesis that X is Independent of Y.

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift

Descriptive and Inferential Statistics

Variable Density Plots for LotArea

# Location and spread of both variables. The mode is taken as the most
# frequent observed value: tabulate, order by descending count, keep the
# first label.
most_frequent <- function(x) as.numeric(names(sort(-table(x))))[1]

LotArea.mean <- mean(train_df$LotArea)
LotArea.median <- median(train_df$LotArea)
LotArea.mode <- most_frequent(train_df$LotArea)
LotArea.sd <- sd(train_df$LotArea)

SalePrice.mean <- mean(train_df$SalePrice)
SalePrice.median <- median(train_df$SalePrice)
SalePrice.mode <- most_frequent(train_df$SalePrice)
SalePrice.sd <- sd(train_df$SalePrice)

# Shaded kernel density estimate of LotArea with vertical markers for the
# median (green), mean (blue) and mode (purple).
d_LotArea <- density(train_df$LotArea)
plot(d_LotArea,
  main = "LotArea Probabilities",
  ylab = "Probability", xlab = "LotArea"
)
polygon(d_LotArea, col = "light blue")
abline(
  v = c(LotArea.median, LotArea.mean, LotArea.mode),
  col = c("green", "blue", "purple"), lwd = 2
)
legend("topright",
  legend = c("median", "mean", "mode"),
  col = c("green", "blue", "purple"), lty = 1, cex = 0.8
)

Variable Density Plots for SalePrice

# Shaded kernel density estimate of SalePrice with vertical markers for the
# median (green), mean (blue) and mode (purple).
SalePrice_for_graph <- density(train_df$SalePrice, na.rm = TRUE)
plot(SalePrice_for_graph,
  main = "SalePrice Probabilities",
  ylab = "Probability", xlab = "SalePrice"
)
polygon(SalePrice_for_graph, col = "light blue")
abline(
  v = c(SalePrice.median, SalePrice.mean, SalePrice.mode),
  col = c("green", "blue", "purple"), lwd = 2
)
legend("topright",
  legend = c("median", "mean", "mode"),
  col = c("green", "blue", "purple"), lty = 1, cex = 0.8
)

### Plotting A Graph

# Scatterplot of the target (SalePrice) against the predictor (LotArea),
# drawn with "+" symbols.
with(train_df, plot(LotArea, SalePrice,
  main = "LotArea vs SalePrice Scatterplot",
  xlab = "LotArea", ylab = "SalePrice", pch = 3
))

### 95% confidence Interval

Provide a 95% Confidence Interval for the difference in the mean of the variables.

# Welch two-sample t-test; its 95% interval is for mean(LotArea) - mean(SalePrice).
t.test(train_df$LotArea, train_df$SalePrice, conf.level = 0.95)
## 
##  Welch Two Sample t-test
## 
## data:  train_df$LotArea and train_df$SalePrice
## t = -81.321, df = 1505.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -174514.7 -166294.1
## sample estimates:
## mean of x mean of y 
##  10516.83 180921.20

Since the p value is too low, we reject the hypothesis that the difference in means is equal to 0.

Correlation matrix

Derive a correlation matrix for THREE of the quantitative variables you selected.

# Pairwise Pearson correlations of the three chosen quantitative variables,
# listed in the order in which they appear in the training data.
cor_vars <- c("LotArea", "GrLivArea", "SalePrice")
corMatrix <- cor(train_df[, cor_vars])
corMatrix
##             LotArea GrLivArea SalePrice
## LotArea   1.0000000 0.2631162 0.2638434
## GrLivArea 0.2631162 1.0000000 0.7086245
## SalePrice 0.2638434 0.7086245 1.0000000

Results show a weak positive correlation of 0.2638 between LotArea and SalePrice.

The correlation between GrLivArea and SalePrice seems to be high at 0.7086245

Test the hypothesis using t-test

Test the hypothesis that the correlation between these variables is 0 and provide a 92% confidence interval.

# H0: the correlation between the two variables is 0. cor.test() tests exactly
# that hypothesis and reports a confidence interval for the correlation
# itself; a two-sample t.test() compares the two means, which is a different
# question.
cor.test(train_df$LotArea, train_df$SalePrice, conf.level = 0.92)
cor.test(train_df$LotArea, train_df$GrLivArea, conf.level = 0.92)
cor.test(train_df$GrLivArea, train_df$SalePrice, conf.level = 0.92)

With n = 1460 and the correlations from the matrix above (0.26, 0.26 and 0.71), the three t statistics are roughly 10.4, 10.4 and 38.3 on 1458 degrees of freedom, so every p-value is far below 0.08 and in all three cases we reject the hypothesis that the correlation is 0.

Would you be worried about familywise error? Why or why not?

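Running three tests at the 8% level raises the chance of at least one false rejection to as much as 1 − 0.92³ ≈ 22%, so familywise error is a legitimate concern in general. Here, however, every p-value is vanishingly small (below 2.2e-16), so the conclusions would survive even a Bonferroni correction (0.08 / 3 ≈ 0.027 per test).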

The meaning of this result is that the variables do not say much about the final sale price.

Linear Algebra and Correlation

Invert your correlation matrix (Precision matrix).

# Precision matrix: the inverse of the correlation matrix. MASS::ginv()
# computes the Moore-Penrose generalised inverse and returns it without
# row or column names.
corMatrixInverse <- MASS::ginv(corMatrix)
print(corMatrixInverse)
##            [,1]       [,2]       [,3]
## [1,]  1.0884485 -0.1664868 -0.1692033
## [2,] -0.1664868  2.0340972 -1.3974846
## [3,] -0.1692033 -1.3974846  2.0349350

The diagonal elements of the precision matrix are the variance inflation factors: each one measures how strongly that variable is linearly explained by the other two variables.

Multiplying the correlation matrix by the precision matrix.

This is the identity Matrix

# Precision matrix times correlation matrix.
matrix1 <- corMatrixInverse %*% corMatrix
print(matrix1)
##           LotArea     GrLivArea     SalePrice
## [1,] 1.000000e+00 -2.498002e-16 -8.326673e-17
## [2,] 1.665335e-16  1.000000e+00  8.881784e-16
## [3,] 1.110223e-16 -4.440892e-16  1.000000e+00

Multiply the precision matrix by the correlation matrix.

Matrix multiplication is not commutative in general, but a matrix and its inverse do commute: this product is also the identity matrix, up to floating-point rounding error in the off-diagonal entries.

# Correlation matrix times precision matrix.
matrix2 <- corMatrix %*% corMatrixInverse
print(matrix2)
##                   [,1]          [,2]         [,3]
## LotArea   1.000000e+00 -4.440892e-16 2.220446e-16
## GrLivArea 2.220446e-16  1.000000e+00 2.220446e-16
## SalePrice 2.498002e-16  0.000000e+00 1.000000e+00

Performing Principal Components Analysis

# Log-transform the three variables on a copy of the training data to
# normalise them before the PCA.
pca_vars <- c("LotArea", "GrLivArea", "SalePrice")
dataCopy <- train_df
dataCopy[pca_vars] <- lapply(dataCopy[pca_vars], log)

# PCA on the centred, unit-variance columns (`scale.` spelled out in full
# instead of relying on partial argument matching).
data.pca <- prcomp(dataCopy[, pca_vars], center = TRUE, scale. = TRUE)
data.pca
## Standard deviations (1, .., p=3):
## [1] 1.4246936 0.8370813 0.5191755
## 
## Rotation (n x k) = (3 x 3):
##                 PC1        PC2         PC3
## LotArea   0.4746802  0.8799375  0.01971454
## GrLivArea 0.6204099 -0.3503987  0.70164972
## SalePrice 0.6243159 -0.3208281 -0.71224926

Summary

# Standard deviation and (cumulative) share of variance of each component.
print(summary(data.pca))
## Importance of components:
##                           PC1    PC2     PC3
## Standard deviation     1.4247 0.8371 0.51918
## Proportion of Variance 0.6766 0.2336 0.08985
## Cumulative Proportion  0.6766 0.9102 1.00000

Plotting PCA

# Biplot of the observations and the variable loadings on the first two
# principal components.
biplot(data.pca)

### Analysis

Vectors that point in the same direction correspond to variables that have similar response profiles,

This can be interpreted as having similar meaning in the context set by the data,

Here SalePrice and GrLivArea have very similar vectors pointing in the same direction, so these are the variables to which we will apply the regression technique.

# Scree plot (line style) of the variance carried by each component.
screeplot(data.pca, type = "lines")

### Calculus-Based Probability & Statistics

We take the LotArea data and fit it to an exponential function.

# Maximum-likelihood fit of an exponential distribution to LotArea; the
# single estimated parameter is the rate.
lambda <- MASS::fitdistr(train_df$LotArea, densfun = "exponential")
lambda$estimate
##        rate 
## 9.50857e-05

# Draw 1000 random values from the fitted exponential distribution.
pdf_distr <- rexp(n = 1000, rate = lambda$estimate)

# Histogram of the simulated values (x-axis cut at their 99th percentile)
# with the fitted exponential density drawn on top.
hist(pdf_distr,
  freq = FALSE, breaks = 100,
  main = "Fitted Exponential PDF with LotArea",
  xlim = c(1, quantile(pdf_distr, 0.99))
)
curve(dexp(x, rate = lambda$estimate), col = "green", add = TRUE)

### Plotting the results as compared to the original data

# Histogram of the observed LotArea values (x-axis cut at their 99th
# percentile) with the fitted exponential density drawn on top.
hist(train_df$LotArea,
  freq = FALSE, breaks = 100,
  main = "Exponential VS original LotArea data",
  xlim = c(1, quantile(train_df$LotArea, 0.99))
)
curve(dexp(x, rate = lambda$estimate), col = "green", add = TRUE)

### With the exponential PDF:

5th and 95th percentiles using the cumulative distribution function (CDF)

# 5th and 95th percentiles of the fitted exponential distribution (the
# defaults lower.tail = TRUE and log.p = FALSE are left implicit).
qexp(0.05, rate = lambda$estimate)
## [1] 539.4428
qexp(0.95, rate = lambda$estimate)
## [1] 31505.6

95% confidence interval from the empirical data, assuming normality

# 95% confidence interval for the mean of LotArea under a normal
# approximation: mean +/- z(0.975) * sd / sqrt(n).
# qnorm(0.95, mean, sd) is the 95th percentile of a normal distribution with
# that mean and sd, not a confidence interval.
LotArea.mean + c(-1, 1) * qnorm(0.975) * LotArea.sd / sqrt(nrow(train_df))
## [1] 10004.84 11028.81

Empirical 5th percentile and 95th percentile of the data

# 5th and 95th percentiles taken directly from the observed LotArea values.
quantile(train_df$LotArea, probs = c(0.05, 0.95))
##       5%      95% 
##  3311.70 17401.15

What do these values mean?

Analyzing the above results, we can see the differences between an exponential distribution and the right-skewed LotArea data.

The approximation can work to fit different models and help explain the data.

Model Selection / Regression

# Print and plot basic diagnostics for a fitted model `m`: the model
# summary, a numeric summary of the residuals, a histogram of the residuals
# and a residuals-versus-fitted scatterplot.
check_model <- function(m) {
  # The name `res` and the expressions passed to plot() are kept as they are
  # because the default plot titles and axis labels are derived from them.
  res <- residuals(m)

  # Textual diagnostics.
  print(summary(m))
  print(summary(res))

  # Graphical diagnostics.
  hist(res)
  plot(fitted(m), resid(m))
}
# Use a single plotting panel for the diagnostic plots.
par(mfrow = c(1, 1))

# Columns used by the regression: the response first, then the predictors.
# Exterior1st, Fence, FireplaceQu, PoolQC and Street were tried and left out.
model_columns <- c(
  "SalePrice",
  "BldgType", "BsmtCond", "BsmtExposure", "BsmtQual", "CentralAir",
  "GarageArea", "GarageCars", "ExterQual", "Fireplaces", "Foundation",
  "HouseStyle", "KitchenQual", "LandContour", "LandSlope", "LotArea",
  "MasVnrArea", "MiscVal", "Neighborhood", "OverallCond", "OverallQual",
  "PoolArea", "RoofStyle", "YearBuilt", "YearRemodAdd"
)
train_df.train <- dplyr::select(train_df, dplyr::all_of(model_columns))

# lm() is given the data frame itself rather than a formula, so the first
# column (SalePrice) is the response and all other columns are predictors.
regr_model <- lm(train_df.train)
check_model(regr_model)
## 
## Call:
## lm(formula = train_df.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -311457  -15589    -831   13665  273845 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -5.193e+05  1.996e+05  -2.601 0.009392 ** 
## BldgType2fmCon      -2.050e+03  6.670e+03  -0.307 0.758600    
## BldgTypeDuplex       5.642e+03  5.997e+03   0.941 0.346961    
## BldgTypeTwnhs       -4.096e+04  6.790e+03  -6.032 2.10e-09 ***
## BldgTypeTwnhsE      -3.821e+04  4.290e+03  -8.907  < 2e-16 ***
## BsmtCondGd           3.527e+03  7.049e+03   0.500 0.616874    
## BsmtCondPo           4.139e+04  2.604e+04   1.589 0.112207    
## BsmtCondTA           5.689e+03  5.541e+03   1.027 0.304756    
## BsmtExposureGd       2.218e+04  4.033e+03   5.500 4.54e-08 ***
## BsmtExposureMn      -9.737e+02  4.124e+03  -0.236 0.813384    
## BsmtExposureNo      -8.296e+03  2.946e+03  -2.816 0.004932 ** 
## BsmtQualFa          -4.094e+04  8.292e+03  -4.937 8.92e-07 ***
## BsmtQualGd          -3.097e+04  4.414e+03  -7.015 3.64e-12 ***
## BsmtQualTA          -3.507e+04  5.407e+03  -6.485 1.24e-10 ***
## CentralAirY          2.649e+03  4.746e+03   0.558 0.576848    
## GarageArea           1.402e+01  9.539e+00   1.470 0.141865    
## GarageCars           1.028e+04  2.842e+03   3.618 0.000308 ***
## ExterQualFa         -2.298e+04  1.291e+04  -1.779 0.075430 .  
## ExterQualGd         -1.854e+04  6.356e+03  -2.917 0.003591 ** 
## ExterQualTA         -2.377e+04  7.011e+03  -3.390 0.000719 ***
## Fireplaces           1.210e+04  1.717e+03   7.046 2.94e-12 ***
## FoundationCBlock     5.228e+03  4.166e+03   1.255 0.209713    
## FoundationPConc      8.146e+03  4.621e+03   1.763 0.078152 .  
## FoundationStone      1.273e+04  1.420e+04   0.896 0.370232    
## FoundationWood       1.071e+04  1.994e+04   0.537 0.591206    
## HouseStyle1.5Unf    -2.081e+04  9.418e+03  -2.210 0.027302 *  
## HouseStyle1Story    -1.128e+04  3.594e+03  -3.139 0.001730 ** 
## HouseStyle2.5Fin     4.422e+04  1.266e+04   3.493 0.000493 ***
## HouseStyle2.5Unf     4.544e+03  1.091e+04   0.417 0.677108    
## HouseStyle2Story    -1.960e+03  3.742e+03  -0.524 0.600613    
## HouseStyleSFoyer    -3.070e+04  7.374e+03  -4.163 3.34e-05 ***
## HouseStyleSLvl      -2.131e+04  5.665e+03  -3.762 0.000176 ***
## KitchenQualFa       -3.274e+04  8.142e+03  -4.021 6.11e-05 ***
## KitchenQualGd       -3.329e+04  4.591e+03  -7.250 7.05e-13 ***
## KitchenQualTA       -3.789e+04  5.204e+03  -7.280 5.69e-13 ***
## LandContourHLS       1.137e+04  6.821e+03   1.667 0.095702 .  
## LandContourLow       2.026e+03  8.488e+03   0.239 0.811375    
## LandContourLvl       1.735e+04  4.886e+03   3.551 0.000396 ***
## LandSlopeMod         1.151e+04  5.243e+03   2.196 0.028268 *  
## LandSlopeSev        -3.051e+04  1.327e+04  -2.299 0.021652 *  
## LotArea              7.985e-01  1.228e-01   6.505 1.10e-10 ***
## MasVnrArea           2.046e+01  6.218e+00   3.290 0.001028 ** 
## MiscVal             -1.009e+00  1.897e+00  -0.532 0.595098    
## NeighborhoodBlueste -7.660e+03  2.517e+04  -0.304 0.760882    
## NeighborhoodBrDale  -1.076e+04  1.353e+04  -0.795 0.426585    
## NeighborhoodBrkSide -1.782e+04  1.112e+04  -1.603 0.109243    
## NeighborhoodClearCr  9.918e+02  1.192e+04   0.083 0.933716    
## NeighborhoodCollgCr -1.167e+04  9.329e+03  -1.251 0.211224    
## NeighborhoodCrawfor  1.446e+04  1.087e+04   1.331 0.183572    
## NeighborhoodEdwards -2.510e+04  1.017e+04  -2.468 0.013695 *  
## NeighborhoodGilbert -2.076e+04  9.948e+03  -2.087 0.037053 *  
## NeighborhoodIDOTRR  -3.351e+04  1.172e+04  -2.860 0.004303 ** 
## NeighborhoodMeadowV  4.281e+03  1.257e+04   0.341 0.733495    
## NeighborhoodMitchel -2.174e+04  1.043e+04  -2.084 0.037361 *  
## NeighborhoodNAmes   -1.759e+04  9.900e+03  -1.777 0.075817 .  
## NeighborhoodNoRidge  6.186e+04  1.061e+04   5.830 6.96e-09 ***
## NeighborhoodNPkVill  6.038e+03  1.434e+04   0.421 0.673799    
## NeighborhoodNridgHt  2.007e+04  9.679e+03   2.073 0.038347 *  
## NeighborhoodNWAmes  -1.315e+04  1.019e+04  -1.290 0.197446    
## NeighborhoodOldTown -2.699e+04  1.076e+04  -2.508 0.012254 *  
## NeighborhoodSawyer  -1.769e+04  1.039e+04  -1.702 0.088988 .  
## NeighborhoodSawyerW -3.910e+03  9.972e+03  -0.392 0.695086    
## NeighborhoodSomerst  1.948e+03  9.461e+03   0.206 0.836855    
## NeighborhoodStoneBr  5.363e+04  1.074e+04   4.993 6.72e-07 ***
## NeighborhoodSWISU   -1.045e+04  1.248e+04  -0.838 0.402405    
## NeighborhoodTimber  -1.369e+04  1.061e+04  -1.290 0.197413    
## NeighborhoodVeenker  1.202e+04  1.327e+04   0.906 0.365254    
## OverallCond          3.752e+03  1.083e+03   3.465 0.000548 ***
## OverallQual          1.381e+04  1.263e+03  10.934  < 2e-16 ***
## PoolArea             6.747e+01  2.246e+01   3.004 0.002710 ** 
## RoofStyleGable      -1.891e+03  1.105e+04  -0.171 0.864087    
## RoofStyleGambrel     3.983e+03  1.507e+04   0.264 0.791594    
## RoofStyleHip         4.768e+03  1.124e+04   0.424 0.671411    
## RoofStyleMansard     1.733e+04  1.698e+04   1.020 0.307775    
## RoofStyleShed        3.800e+04  2.684e+04   1.416 0.157091    
## YearBuilt            1.175e+02  8.670e+01   1.355 0.175574    
## YearRemodAdd         2.018e+02  7.077e+01   2.851 0.004424 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32710 on 1337 degrees of freedom
##   (46 observations deleted due to missingness)
## Multiple R-squared:  0.8389, Adjusted R-squared:  0.8297 
## F-statistic:  91.6 on 76 and 1337 DF,  p-value: < 2.2e-16
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -311457  -15589    -831       0   13665  273845

# Normal Q-Q plot of the regression residuals.
qqnorm(resid(regr_model))

# Column-by-column summary of the data the model was fitted on.
print(summary(train_df.train))
##    SalePrice        BldgType           BsmtCond         BsmtExposure      
##  Min.   : 34900   Length:1460        Length:1460        Length:1460       
##  1st Qu.:129975   Class :character   Class :character   Class :character  
##  Median :163000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :180921                                                           
##  3rd Qu.:214000                                                           
##  Max.   :755000                                                           
##                                                                           
##    BsmtQual          CentralAir          GarageArea       GarageCars   
##  Length:1460        Length:1460        Min.   :   0.0   Min.   :0.000  
##  Class :character   Class :character   1st Qu.: 334.5   1st Qu.:1.000  
##  Mode  :character   Mode  :character   Median : 480.0   Median :2.000  
##                                        Mean   : 473.0   Mean   :1.767  
##                                        3rd Qu.: 576.0   3rd Qu.:2.000  
##                                        Max.   :1418.0   Max.   :4.000  
##                                                                        
##   ExterQual           Fireplaces     Foundation         HouseStyle       
##  Length:1460        Min.   :0.000   Length:1460        Length:1460       
##  Class :character   1st Qu.:0.000   Class :character   Class :character  
##  Mode  :character   Median :1.000   Mode  :character   Mode  :character  
##                     Mean   :0.613                                        
##                     3rd Qu.:1.000                                        
##                     Max.   :3.000                                        
##                                                                          
##  KitchenQual        LandContour         LandSlope            LotArea      
##  Length:1460        Length:1460        Length:1460        Min.   :  1300  
##  Class :character   Class :character   Class :character   1st Qu.:  7554  
##  Mode  :character   Mode  :character   Mode  :character   Median :  9478  
##                                                           Mean   : 10517  
##                                                           3rd Qu.: 11602  
##                                                           Max.   :215245  
##                                                                           
##    MasVnrArea        MiscVal         Neighborhood        OverallCond   
##  Min.   :   0.0   Min.   :    0.00   Length:1460        Min.   :1.000  
##  1st Qu.:   0.0   1st Qu.:    0.00   Class :character   1st Qu.:5.000  
##  Median :   0.0   Median :    0.00   Mode  :character   Median :5.000  
##  Mean   : 103.7   Mean   :   43.49                      Mean   :5.575  
##  3rd Qu.: 166.0   3rd Qu.:    0.00                      3rd Qu.:6.000  
##  Max.   :1600.0   Max.   :15500.00                      Max.   :9.000  
##  NA's   :8                                                             
##   OverallQual        PoolArea        RoofStyle           YearBuilt   
##  Min.   : 1.000   Min.   :  0.000   Length:1460        Min.   :1872  
##  1st Qu.: 5.000   1st Qu.:  0.000   Class :character   1st Qu.:1954  
##  Median : 6.000   Median :  0.000   Mode  :character   Median :1973  
##  Mean   : 6.099   Mean   :  2.759                      Mean   :1971  
##  3rd Qu.: 7.000   3rd Qu.:  0.000                      3rd Qu.:2000  
##  Max.   :10.000   Max.   :738.000                      Max.   :2010  
##                                                                      
##   YearRemodAdd 
##  Min.   :1950  
##  1st Qu.:1967  
##  Median :1994  
##  Mean   :1985  
##  3rd Qu.:2004  
##  Max.   :2010  
## 

Analysis:

Residual standard error: This is an estimate of the standard deviation of the residuals. In this case, it’s 32710, indicating the average difference between the observed values and the values predicted by the model is around 32710 units.

Multiple R-squared: This is the proportion of the variance in the dependent variable that is predictable from the independent variables. Here, it’s 0.8389, meaning approximately 83.89% of the variability in the dependent variable can be explained by the independent variables.

Adjusted R-squared: This is the R-squared value adjusted for the number of predictors in the model. It’s slightly lower at 0.8297 but still suggests a good fit.

F-statistic: This tests the overall significance of the regression model. A larger F-statistic with a small p-value suggests that the overall model is significant. Here, the F-statistic is 91.6 with a p-value less than 2.2e-16, indicating that the overall model is highly significant.

# Drop five columns from the test set, then show the structure of what
# remains.
test_df <- dplyr::select(
  test_df,
  -c(Alley, PoolQC, Fence, MiscFeature, FireplaceQu)
)
str(test_df)
## tibble [1,459 × 75] (S3: tbl_df/tbl/data.frame)
##  $ Id           : num [1:1459] 1461 1462 1463 1464 1465 ...
##  $ MSSubClass   : num [1:1459] 20 20 60 60 120 60 20 60 20 20 ...
##  $ MSZoning     : chr [1:1459] "RH" "RL" "RL" "RL" ...
##  $ LotFrontage  : num [1:1459] 80 81 74 78 43 75 NA 63 85 70 ...
##  $ LotArea      : num [1:1459] 11622 14267 13830 9978 5005 ...
##  $ Street       : chr [1:1459] "Pave" "Pave" "Pave" "Pave" ...
##  $ LotShape     : chr [1:1459] "Reg" "IR1" "IR1" "IR1" ...
##  $ LandContour  : chr [1:1459] "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr [1:1459] "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr [1:1459] "Inside" "Corner" "Inside" "Inside" ...
##  $ LandSlope    : chr [1:1459] "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr [1:1459] "NAmes" "NAmes" "Gilbert" "Gilbert" ...
##  $ Condition1   : chr [1:1459] "Feedr" "Norm" "Norm" "Norm" ...
##  $ Condition2   : chr [1:1459] "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr [1:1459] "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr [1:1459] "1Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : num [1:1459] 5 6 5 6 8 6 6 6 7 4 ...
##  $ OverallCond  : num [1:1459] 6 6 5 6 5 5 7 5 5 5 ...
##  $ YearBuilt    : num [1:1459] 1961 1958 1997 1998 1992 ...
##  $ YearRemodAdd : num [1:1459] 1961 1958 1998 1998 1992 ...
##  $ RoofStyle    : chr [1:1459] "Gable" "Hip" "Gable" "Gable" ...
##  $ RoofMatl     : chr [1:1459] "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr [1:1459] "VinylSd" "Wd Sdng" "VinylSd" "VinylSd" ...
##  $ Exterior2nd  : chr [1:1459] "VinylSd" "Wd Sdng" "VinylSd" "VinylSd" ...
##  $ MasVnrType   : chr [1:1459] "None" "BrkFace" "None" "BrkFace" ...
##  $ MasVnrArea   : num [1:1459] 0 108 0 20 0 0 0 0 0 0 ...
##  $ ExterQual    : chr [1:1459] "TA" "TA" "TA" "TA" ...
##  $ ExterCond    : chr [1:1459] "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr [1:1459] "CBlock" "CBlock" "PConc" "PConc" ...
##  $ BsmtQual     : chr [1:1459] "TA" "TA" "Gd" "TA" ...
##  $ BsmtCond     : chr [1:1459] "TA" "TA" "TA" "TA" ...
##  $ BsmtExposure : chr [1:1459] "No" "No" "No" "No" ...
##  $ BsmtFinType1 : chr [1:1459] "Rec" "ALQ" "GLQ" "GLQ" ...
##  $ BsmtFinSF1   : num [1:1459] 468 923 791 602 263 0 935 0 637 804 ...
##  $ BsmtFinType2 : chr [1:1459] "LwQ" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : num [1:1459] 144 0 0 0 0 0 0 0 0 78 ...
##  $ BsmtUnfSF    : num [1:1459] 270 406 137 324 1017 ...
##  $ TotalBsmtSF  : num [1:1459] 882 1329 928 926 1280 ...
##  $ Heating      : chr [1:1459] "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr [1:1459] "TA" "TA" "Gd" "Ex" ...
##  $ CentralAir   : chr [1:1459] "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr [1:1459] "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ 1stFlrSF     : num [1:1459] 896 1329 928 926 1280 ...
##  $ 2ndFlrSF     : num [1:1459] 0 0 701 678 0 892 0 676 0 0 ...
##  $ LowQualFinSF : num [1:1459] 0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : num [1:1459] 896 1329 1629 1604 1280 ...
##  $ BsmtFullBath : num [1:1459] 0 0 0 0 0 0 1 0 1 1 ...
##  $ BsmtHalfBath : num [1:1459] 0 0 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : num [1:1459] 1 1 2 2 2 2 2 2 1 1 ...
##  $ HalfBath     : num [1:1459] 0 1 1 1 0 1 0 1 1 0 ...
##  $ BedroomAbvGr : num [1:1459] 2 3 3 3 2 3 3 3 2 2 ...
##  $ KitchenAbvGr : num [1:1459] 1 1 1 1 1 1 1 1 1 1 ...
##  $ KitchenQual  : chr [1:1459] "TA" "Gd" "TA" "Gd" ...
##  $ TotRmsAbvGrd : num [1:1459] 5 6 6 7 5 7 6 7 5 4 ...
##  $ Functional   : chr [1:1459] "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : num [1:1459] 0 0 1 1 0 1 0 1 1 0 ...
##  $ GarageType   : chr [1:1459] "Attchd" "Attchd" "Attchd" "Attchd" ...
##  $ GarageYrBlt  : num [1:1459] 1961 1958 1997 1998 1992 ...
##  $ GarageFinish : chr [1:1459] "Unf" "Unf" "Fin" "Fin" ...
##  $ GarageCars   : num [1:1459] 1 1 2 2 2 2 2 2 2 2 ...
##  $ GarageArea   : num [1:1459] 730 312 482 470 506 440 420 393 506 525 ...
##  $ GarageQual   : chr [1:1459] "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr [1:1459] "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr [1:1459] "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : num [1:1459] 140 393 212 360 0 157 483 0 192 240 ...
##  $ OpenPorchSF  : num [1:1459] 0 36 34 36 82 84 21 75 0 0 ...
##  $ EnclosedPorch: num [1:1459] 0 0 0 0 0 0 0 0 0 0 ...
##  $ 3SsnPorch    : num [1:1459] 0 0 0 0 0 0 0 0 0 0 ...
##  $ ScreenPorch  : num [1:1459] 120 0 0 0 144 0 0 0 0 0 ...
##  $ PoolArea     : num [1:1459] 0 0 0 0 0 0 0 0 0 0 ...
##  $ MiscVal      : num [1:1459] 0 12500 0 0 0 0 500 0 0 0 ...
##  $ MoSold       : num [1:1459] 6 6 3 6 1 4 3 5 2 4 ...
##  $ YrSold       : num [1:1459] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ SaleType     : chr [1:1459] "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr [1:1459] "Normal" "Normal" "Normal" "Normal" ...

Second Model Selection / Regression Using Random Forest Model

# Keep only the numeric columns of a data frame and replace every missing
# value in them with 0.
#
# @param train_df A data frame or tibble (the parameter name is historical;
#   the function is applied to the test set as well).
# @return The same kind of object with the non-numeric columns dropped and
#   no NA left; the number of rows is unchanged.
normalize <- function(train_df) {
  is_num <- vapply(train_df, is.numeric, logical(1))
  # drop = FALSE keeps a data frame even when only one numeric column exists.
  numeric_cols <- train_df[, is_num, drop = FALSE]
  # After this replacement no row can be incomplete, so no complete.cases()
  # filter is needed.
  numeric_cols[is.na(numeric_cols)] <- 0
  numeric_cols
}


# Numeric-only, NA-free versions of the training and test sets.
trainMod <- normalize(train_df)
testMod <- normalize(test_df)

# Stepwise search between the intercept-only model and a linear model with
# LotArea and GrLivArea. SalePrice is a continuous response, so the upper
# model is an ordinary lm(); a binomial glm() on as.factor(SalePrice) is not
# a meaningful model for it and only produced a fitting warning.
null <- lm(SalePrice ~ 1, data = trainMod)
all <- lm(SalePrice ~ LotArea + GrLivArea, data = trainMod)
stepResults <- step(
  null,
  scope = list(lower = formula(null), upper = formula(all)),
  direction = "both",
  trace = 0
)

# Random forest on every numeric predictor, tuned by caret with out-of-bag
# error estimates.
oob_control <- trainControl(method = "oob", number = 5)
rfFit <- train(
  SalePrice ~ .,
  data = trainMod,
  method = "rf",
  trControl = oob_control,
  prox = TRUE,
  importance = TRUE,
  allowParallel = TRUE
)

# Show the resampling results and the selected tuning value.
rfFit
## Random Forest 
## 
## 1460 samples
##   37 predictor
## 
## No pre-processing
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared 
##    2    31543.92  0.8422305
##   19    29313.53  0.8637527
##   37    29874.85  0.8584848
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 19.
# Dot plot of the variable importance scores of the fitted random forest.
rf_importance <- varImp(rfFit)
dotPlot(rf_importance, main = "Random Forest Model - Most Relevant Variables")

Prediction

# Score the test set with the fitted forest and pair every prediction with
# its Id.
predicted_price <- predict(rfFit, testMod)
result <- data.frame(Id = testMod$Id, SalePrice = predicted_price)

# A sale price cannot be negative: raise any negative prediction to 0.
result$SalePrice[result$SalePrice < 0] <- 0

# Compare the density of the observed training prices with the density of
# the predicted test prices.
train_density <- density(trainMod$SalePrice)
plot(train_density)

predicted_density <- density(na.omit(result$SalePrice))
plot(predicted_density)

Save the results to send to Kaggle

# Write the Id / SalePrice pairs to a CSV file without a row-name column.
write.csv(
  result,
  file = "results_for_kaggle.csv",
  row.names = FALSE
)