library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.3.3
library(broom)
## Warning: package 'broom' was built under R version 4.3.3
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.3.3
library(skimr)
## Warning: package 'skimr' was built under R version 4.3.3
library(ggplot2)
# Raw GitHub URLs of the training and test CSV files.
urlfile1 <- "https://raw.githubusercontent.com/uzmabb182/Data605_Assignment/main/datasets/train.csv"
urlfile2 <- "https://raw.githubusercontent.com/tiwari91/Housing-Prices/master/test.csv"
# Open a connection to the training URL and parse it with readr.
train_df <- urlfile1 %>%
  url() %>%
  read_csv()
## Rows: 1460 Columns: 81
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (38): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open a connection to the test URL and parse it with readr.
test_df <- urlfile2 %>%
  url() %>%
  read_csv()
## Rows: 1459 Columns: 80
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (37): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the training data.
head(train_df)
## # A tibble: 6 × 81
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 1 60 RL 65 8450 Pave <NA> Reg
## 2 2 20 RL 80 9600 Pave <NA> Reg
## 3 3 60 RL 68 11250 Pave <NA> IR1
## 4 4 70 RL 60 9550 Pave <NA> IR1
## 5 5 60 RL 84 14260 Pave <NA> IR1
## 6 6 50 RL 85 14115 Pave <NA> IR1
## # ℹ 73 more variables: LandContour <chr>, Utilities <chr>, LotConfig <chr>,
## # LandSlope <chr>, Neighborhood <chr>, Condition1 <chr>, Condition2 <chr>,
## # BldgType <chr>, HouseStyle <chr>, OverallQual <dbl>, OverallCond <dbl>,
## # YearBuilt <dbl>, YearRemodAdd <dbl>, RoofStyle <chr>, RoofMatl <chr>,
## # Exterior1st <chr>, Exterior2nd <chr>, MasVnrType <chr>, MasVnrArea <dbl>,
## # ExterQual <chr>, ExterCond <chr>, Foundation <chr>, BsmtQual <chr>,
## # BsmtCond <chr>, BsmtExposure <chr>, BsmtFinType1 <chr>, BsmtFinSF1 <dbl>, …
# Interactive viewer call, left disabled.
#view(train_df)
# Number of rows and columns in the training data.
dim(train_df)
## [1] 1460 81
# List every column name in the training data.
names(train_df)
## [1] "Id" "MSSubClass" "MSZoning" "LotFrontage"
## [5] "LotArea" "Street" "Alley" "LotShape"
## [9] "LandContour" "Utilities" "LotConfig" "LandSlope"
## [13] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [17] "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
## [21] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior1st"
## [25] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
## [29] "ExterCond" "Foundation" "BsmtQual" "BsmtCond"
## [33] "BsmtExposure" "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2"
## [37] "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "Heating"
## [41] "HeatingQC" "CentralAir" "Electrical" "1stFlrSF"
## [45] "2ndFlrSF" "LowQualFinSF" "GrLivArea" "BsmtFullBath"
## [49] "BsmtHalfBath" "FullBath" "HalfBath" "BedroomAbvGr"
## [53] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Functional"
## [57] "Fireplaces" "FireplaceQu" "GarageType" "GarageYrBlt"
## [61] "GarageFinish" "GarageCars" "GarageArea" "GarageQual"
## [65] "GarageCond" "PavedDrive" "WoodDeckSF" "OpenPorchSF"
## [69] "EnclosedPorch" "3SsnPorch" "ScreenPorch" "PoolArea"
## [73] "PoolQC" "Fence" "MiscFeature" "MiscVal"
## [77] "MoSold" "YrSold" "SaleType" "SaleCondition"
## [81] "SalePrice"
# Transposed overview: one line per column with its type and leading values.
glimpse(train_df)
## Rows: 1,460
## Columns: 81
## $ Id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ MSSubClass <dbl> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ MSZoning <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM", "R…
## $ LotFrontage <dbl> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
## $ LotArea <dbl> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ Street <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", …
## $ Alley <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ LotShape <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1", …
## $ LandContour <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", …
## $ Utilities <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllPu…
## $ LotConfig <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Inside", "I…
## $ LandSlope <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", …
## $ Neighborhood <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoRidge", "…
## $ Condition1 <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm",…
## $ Condition2 <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", …
## $ BldgType <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", …
## $ HouseStyle <chr> "2Story", "1Story", "2Story", "2Story", "2Story", "1.5Fi…
## $ OverallQual <dbl> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond <dbl> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt <dbl> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd <dbl> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ RoofStyle <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Gable", "G…
## $ RoofMatl <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg", "…
## $ Exterior1st <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "VinylSd", "…
## $ Exterior2nd <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "VinylSd", "…
## $ MasVnrType <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace", "None",…
## $ MasVnrArea <dbl> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
## $ ExterQual <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", "TA", "T…
## $ ExterCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
## $ Foundation <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "Wood", "…
## $ BsmtQual <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", "TA", "T…
## $ BsmtCond <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", "TA", "T…
## $ BsmtExposure <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", "No", "N…
## $ BsmtFinType1 <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ", …
## $ BsmtFinSF1 <dbl> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinType2 <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "BLQ", …
## $ BsmtFinSF2 <dbl> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF <dbl> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF <dbl> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ Heating <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", …
## $ HeatingQC <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", "Gd", "E…
## $ CentralAir <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
## $ Electrical <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "S…
## $ `1stFlrSF` <dbl> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ `2ndFlrSF` <dbl> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea <dbl> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath <dbl> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath <dbl> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr <dbl> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ KitchenQual <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", "TA", "T…
## $ TotRmsAbvGrd <dbl> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Functional <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", …
## $ Fireplaces <dbl> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ FireplaceQu <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA", "TA", …
## $ GarageType <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd", "Attch…
## $ GarageYrBlt <dbl> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
## $ GarageFinish <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn", "RFn", …
## $ GarageCars <dbl> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea <dbl> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ GarageQual <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "Fa", "G…
## $ GarageCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
## $ PavedDrive <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
## $ WoodDeckSF <dbl> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF <dbl> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <dbl> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ `3SsnPorch` <dbl> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolQC <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Fence <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, NA,…
## $ MiscFeature <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, NA, NA, …
## $ MiscVal <dbl> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold <dbl> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold <dbl> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SaleType <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "W…
## $ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal", "Norm…
## $ SalePrice <dbl> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …
# Per-column summary (missing counts, completeness, distribution statistics),
# reported separately for character and numeric columns.
skim(train_df)
Name | train_df |
Number of rows | 1460 |
Number of columns | 81 |
_______________________ | |
Column type frequency: | |
character | 43 |
numeric | 38 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
MSZoning | 0 | 1.00 | 2 | 7 | 0 | 5 | 0 |
Street | 0 | 1.00 | 4 | 4 | 0 | 2 | 0 |
Alley | 1369 | 0.06 | 4 | 4 | 0 | 2 | 0 |
LotShape | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
LandContour | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
Utilities | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
LotConfig | 0 | 1.00 | 3 | 7 | 0 | 5 | 0 |
LandSlope | 0 | 1.00 | 3 | 3 | 0 | 3 | 0 |
Neighborhood | 0 | 1.00 | 5 | 7 | 0 | 25 | 0 |
Condition1 | 0 | 1.00 | 4 | 6 | 0 | 9 | 0 |
Condition2 | 0 | 1.00 | 4 | 6 | 0 | 8 | 0 |
BldgType | 0 | 1.00 | 4 | 6 | 0 | 5 | 0 |
HouseStyle | 0 | 1.00 | 4 | 6 | 0 | 8 | 0 |
RoofStyle | 0 | 1.00 | 3 | 7 | 0 | 6 | 0 |
RoofMatl | 0 | 1.00 | 4 | 7 | 0 | 8 | 0 |
Exterior1st | 0 | 1.00 | 5 | 7 | 0 | 15 | 0 |
Exterior2nd | 0 | 1.00 | 5 | 7 | 0 | 16 | 0 |
MasVnrType | 8 | 0.99 | 4 | 7 | 0 | 4 | 0 |
ExterQual | 0 | 1.00 | 2 | 2 | 0 | 4 | 0 |
ExterCond | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
Foundation | 0 | 1.00 | 4 | 6 | 0 | 6 | 0 |
BsmtQual | 37 | 0.97 | 2 | 2 | 0 | 4 | 0 |
BsmtCond | 37 | 0.97 | 2 | 2 | 0 | 4 | 0 |
BsmtExposure | 38 | 0.97 | 2 | 2 | 0 | 4 | 0 |
BsmtFinType1 | 37 | 0.97 | 3 | 3 | 0 | 6 | 0 |
BsmtFinType2 | 38 | 0.97 | 3 | 3 | 0 | 6 | 0 |
Heating | 0 | 1.00 | 4 | 5 | 0 | 6 | 0 |
HeatingQC | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
CentralAir | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
Electrical | 1 | 1.00 | 3 | 5 | 0 | 5 | 0 |
KitchenQual | 0 | 1.00 | 2 | 2 | 0 | 4 | 0 |
Functional | 0 | 1.00 | 3 | 4 | 0 | 7 | 0 |
FireplaceQu | 690 | 0.53 | 2 | 2 | 0 | 5 | 0 |
GarageType | 81 | 0.94 | 6 | 7 | 0 | 6 | 0 |
GarageFinish | 81 | 0.94 | 3 | 3 | 0 | 3 | 0 |
GarageQual | 81 | 0.94 | 2 | 2 | 0 | 5 | 0 |
GarageCond | 81 | 0.94 | 2 | 2 | 0 | 5 | 0 |
PavedDrive | 0 | 1.00 | 1 | 1 | 0 | 3 | 0 |
PoolQC | 1453 | 0.00 | 2 | 2 | 0 | 3 | 0 |
Fence | 1179 | 0.19 | 4 | 5 | 0 | 4 | 0 |
MiscFeature | 1406 | 0.04 | 4 | 4 | 0 | 4 | 0 |
SaleType | 0 | 1.00 | 2 | 5 | 0 | 9 | 0 |
SaleCondition | 0 | 1.00 | 6 | 7 | 0 | 6 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Id | 0 | 1.00 | 730.50 | 421.61 | 1 | 365.75 | 730.5 | 1095.25 | 1460 | ▇▇▇▇▇ |
MSSubClass | 0 | 1.00 | 56.90 | 42.30 | 20 | 20.00 | 50.0 | 70.00 | 190 | ▇▅▂▁▁ |
LotFrontage | 259 | 0.82 | 70.05 | 24.28 | 21 | 59.00 | 69.0 | 80.00 | 313 | ▇▃▁▁▁ |
LotArea | 0 | 1.00 | 10516.83 | 9981.26 | 1300 | 7553.50 | 9478.5 | 11601.50 | 215245 | ▇▁▁▁▁ |
OverallQual | 0 | 1.00 | 6.10 | 1.38 | 1 | 5.00 | 6.0 | 7.00 | 10 | ▁▂▇▅▁ |
OverallCond | 0 | 1.00 | 5.58 | 1.11 | 1 | 5.00 | 5.0 | 6.00 | 9 | ▁▁▇▅▁ |
YearBuilt | 0 | 1.00 | 1971.27 | 30.20 | 1872 | 1954.00 | 1973.0 | 2000.00 | 2010 | ▁▂▃▆▇ |
YearRemodAdd | 0 | 1.00 | 1984.87 | 20.65 | 1950 | 1967.00 | 1994.0 | 2004.00 | 2010 | ▅▂▂▃▇ |
MasVnrArea | 8 | 0.99 | 103.69 | 181.07 | 0 | 0.00 | 0.0 | 166.00 | 1600 | ▇▁▁▁▁ |
BsmtFinSF1 | 0 | 1.00 | 443.64 | 456.10 | 0 | 0.00 | 383.5 | 712.25 | 5644 | ▇▁▁▁▁ |
BsmtFinSF2 | 0 | 1.00 | 46.55 | 161.32 | 0 | 0.00 | 0.0 | 0.00 | 1474 | ▇▁▁▁▁ |
BsmtUnfSF | 0 | 1.00 | 567.24 | 441.87 | 0 | 223.00 | 477.5 | 808.00 | 2336 | ▇▅▂▁▁ |
TotalBsmtSF | 0 | 1.00 | 1057.43 | 438.71 | 0 | 795.75 | 991.5 | 1298.25 | 6110 | ▇▃▁▁▁ |
1stFlrSF | 0 | 1.00 | 1162.63 | 386.59 | 334 | 882.00 | 1087.0 | 1391.25 | 4692 | ▇▅▁▁▁ |
2ndFlrSF | 0 | 1.00 | 346.99 | 436.53 | 0 | 0.00 | 0.0 | 728.00 | 2065 | ▇▃▂▁▁ |
LowQualFinSF | 0 | 1.00 | 5.84 | 48.62 | 0 | 0.00 | 0.0 | 0.00 | 572 | ▇▁▁▁▁ |
GrLivArea | 0 | 1.00 | 1515.46 | 525.48 | 334 | 1129.50 | 1464.0 | 1776.75 | 5642 | ▇▇▁▁▁ |
BsmtFullBath | 0 | 1.00 | 0.43 | 0.52 | 0 | 0.00 | 0.0 | 1.00 | 3 | ▇▆▁▁▁ |
BsmtHalfBath | 0 | 1.00 | 0.06 | 0.24 | 0 | 0.00 | 0.0 | 0.00 | 2 | ▇▁▁▁▁ |
FullBath | 0 | 1.00 | 1.57 | 0.55 | 0 | 1.00 | 2.0 | 2.00 | 3 | ▁▇▁▇▁ |
HalfBath | 0 | 1.00 | 0.38 | 0.50 | 0 | 0.00 | 0.0 | 1.00 | 2 | ▇▁▅▁▁ |
BedroomAbvGr | 0 | 1.00 | 2.87 | 0.82 | 0 | 2.00 | 3.0 | 3.00 | 8 | ▁▇▂▁▁ |
KitchenAbvGr | 0 | 1.00 | 1.05 | 0.22 | 0 | 1.00 | 1.0 | 1.00 | 3 | ▁▇▁▁▁ |
TotRmsAbvGrd | 0 | 1.00 | 6.52 | 1.63 | 2 | 5.00 | 6.0 | 7.00 | 14 | ▂▇▇▁▁ |
Fireplaces | 0 | 1.00 | 0.61 | 0.64 | 0 | 0.00 | 1.0 | 1.00 | 3 | ▇▇▁▁▁ |
GarageYrBlt | 81 | 0.94 | 1978.51 | 24.69 | 1900 | 1961.00 | 1980.0 | 2002.00 | 2010 | ▁▁▅▅▇ |
GarageCars | 0 | 1.00 | 1.77 | 0.75 | 0 | 1.00 | 2.0 | 2.00 | 4 | ▁▃▇▂▁ |
GarageArea | 0 | 1.00 | 472.98 | 213.80 | 0 | 334.50 | 480.0 | 576.00 | 1418 | ▂▇▃▁▁ |
WoodDeckSF | 0 | 1.00 | 94.24 | 125.34 | 0 | 0.00 | 0.0 | 168.00 | 857 | ▇▂▁▁▁ |
OpenPorchSF | 0 | 1.00 | 46.66 | 66.26 | 0 | 0.00 | 25.0 | 68.00 | 547 | ▇▁▁▁▁ |
EnclosedPorch | 0 | 1.00 | 21.95 | 61.12 | 0 | 0.00 | 0.0 | 0.00 | 552 | ▇▁▁▁▁ |
3SsnPorch | 0 | 1.00 | 3.41 | 29.32 | 0 | 0.00 | 0.0 | 0.00 | 508 | ▇▁▁▁▁ |
ScreenPorch | 0 | 1.00 | 15.06 | 55.76 | 0 | 0.00 | 0.0 | 0.00 | 480 | ▇▁▁▁▁ |
PoolArea | 0 | 1.00 | 2.76 | 40.18 | 0 | 0.00 | 0.0 | 0.00 | 738 | ▇▁▁▁▁ |
MiscVal | 0 | 1.00 | 43.49 | 496.12 | 0 | 0.00 | 0.0 | 0.00 | 15500 | ▇▁▁▁▁ |
MoSold | 0 | 1.00 | 6.32 | 2.70 | 1 | 5.00 | 6.0 | 8.00 | 12 | ▃▆▇▃▃ |
YrSold | 0 | 1.00 | 2007.82 | 1.33 | 2006 | 2007.00 | 2008.0 | 2009.00 | 2010 | ▇▇▇▇▅ |
SalePrice | 0 | 1.00 | 180921.20 | 79442.50 | 34900 | 129975.00 | 163000.0 | 214000.00 | 755000 | ▇▅▁▁▁ |
# Compact structure dump, including the readr column specification attribute.
str(train_df)
## spc_tbl_ [1,460 × 81] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Id : num [1:1460] 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : num [1:1460] 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr [1:1460] "RL" "RL" "RL" "RL" ...
## $ LotFrontage : num [1:1460] 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : num [1:1460] 8450 9600 11250 9550 14260 ...
## $ Street : chr [1:1460] "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr [1:1460] NA NA NA NA ...
## $ LotShape : chr [1:1460] "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr [1:1460] "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr [1:1460] "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr [1:1460] "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr [1:1460] "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr [1:1460] "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr [1:1460] "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr [1:1460] "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr [1:1460] "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr [1:1460] "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : num [1:1460] 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : num [1:1460] 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : num [1:1460] 2003 1976 2001 1915 2000 ...
## $ YearRemodAdd : num [1:1460] 2003 1976 2002 1970 2000 ...
## $ RoofStyle : chr [1:1460] "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr [1:1460] "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr [1:1460] "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : num [1:1460] 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr [1:1460] "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ Foundation : chr [1:1460] "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr [1:1460] "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr [1:1460] "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr [1:1460] "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr [1:1460] "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : num [1:1460] 706 978 486 216 655 ...
## $ BsmtFinType2 : chr [1:1460] "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : num [1:1460] 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : num [1:1460] 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : num [1:1460] 856 1262 920 756 1145 ...
## $ Heating : chr [1:1460] "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr [1:1460] "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr [1:1460] "Y" "Y" "Y" "Y" ...
## $ Electrical : chr [1:1460] "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ 1stFlrSF : num [1:1460] 856 1262 920 961 1145 ...
## $ 2ndFlrSF : num [1:1460] 854 0 866 756 1053 ...
## $ LowQualFinSF : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : num [1:1460] 1710 1262 1786 1717 2198 ...
## $ BsmtFullBath : num [1:1460] 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : num [1:1460] 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : num [1:1460] 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : num [1:1460] 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : num [1:1460] 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : num [1:1460] 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr [1:1460] "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : num [1:1460] 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr [1:1460] "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : num [1:1460] 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr [1:1460] NA "TA" "TA" "Gd" ...
## $ GarageType : chr [1:1460] "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : num [1:1460] 2003 1976 2001 1998 2000 ...
## $ GarageFinish : chr [1:1460] "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : num [1:1460] 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : num [1:1460] 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr [1:1460] "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : num [1:1460] 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : num [1:1460] 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: num [1:1460] 0 0 0 272 0 0 0 228 205 0 ...
## $ 3SsnPorch : num [1:1460] 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr [1:1460] NA NA NA NA ...
## $ Fence : chr [1:1460] NA NA NA NA ...
## $ MiscFeature : chr [1:1460] NA NA NA NA ...
## $ MiscVal : num [1:1460] 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : num [1:1460] 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : num [1:1460] 2008 2007 2008 2006 2008 ...
## $ SaleType : chr [1:1460] "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr [1:1460] "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : num [1:1460] 208500 181500 223500 140000 250000 ...
## - attr(*, "spec")=
## .. cols(
## .. Id = col_double(),
## .. MSSubClass = col_double(),
## .. MSZoning = col_character(),
## .. LotFrontage = col_double(),
## .. LotArea = col_double(),
## .. Street = col_character(),
## .. Alley = col_character(),
## .. LotShape = col_character(),
## .. LandContour = col_character(),
## .. Utilities = col_character(),
## .. LotConfig = col_character(),
## .. LandSlope = col_character(),
## .. Neighborhood = col_character(),
## .. Condition1 = col_character(),
## .. Condition2 = col_character(),
## .. BldgType = col_character(),
## .. HouseStyle = col_character(),
## .. OverallQual = col_double(),
## .. OverallCond = col_double(),
## .. YearBuilt = col_double(),
## .. YearRemodAdd = col_double(),
## .. RoofStyle = col_character(),
## .. RoofMatl = col_character(),
## .. Exterior1st = col_character(),
## .. Exterior2nd = col_character(),
## .. MasVnrType = col_character(),
## .. MasVnrArea = col_double(),
## .. ExterQual = col_character(),
## .. ExterCond = col_character(),
## .. Foundation = col_character(),
## .. BsmtQual = col_character(),
## .. BsmtCond = col_character(),
## .. BsmtExposure = col_character(),
## .. BsmtFinType1 = col_character(),
## .. BsmtFinSF1 = col_double(),
## .. BsmtFinType2 = col_character(),
## .. BsmtFinSF2 = col_double(),
## .. BsmtUnfSF = col_double(),
## .. TotalBsmtSF = col_double(),
## .. Heating = col_character(),
## .. HeatingQC = col_character(),
## .. CentralAir = col_character(),
## .. Electrical = col_character(),
## .. `1stFlrSF` = col_double(),
## .. `2ndFlrSF` = col_double(),
## .. LowQualFinSF = col_double(),
## .. GrLivArea = col_double(),
## .. BsmtFullBath = col_double(),
## .. BsmtHalfBath = col_double(),
## .. FullBath = col_double(),
## .. HalfBath = col_double(),
## .. BedroomAbvGr = col_double(),
## .. KitchenAbvGr = col_double(),
## .. KitchenQual = col_character(),
## .. TotRmsAbvGrd = col_double(),
## .. Functional = col_character(),
## .. Fireplaces = col_double(),
## .. FireplaceQu = col_character(),
## .. GarageType = col_character(),
## .. GarageYrBlt = col_double(),
## .. GarageFinish = col_character(),
## .. GarageCars = col_double(),
## .. GarageArea = col_double(),
## .. GarageQual = col_character(),
## .. GarageCond = col_character(),
## .. PavedDrive = col_character(),
## .. WoodDeckSF = col_double(),
## .. OpenPorchSF = col_double(),
## .. EnclosedPorch = col_double(),
## .. `3SsnPorch` = col_double(),
## .. ScreenPorch = col_double(),
## .. PoolArea = col_double(),
## .. PoolQC = col_character(),
## .. Fence = col_character(),
## .. MiscFeature = col_character(),
## .. MiscVal = col_double(),
## .. MoSold = col_double(),
## .. YrSold = col_double(),
## .. SaleType = col_character(),
## .. SaleCondition = col_character(),
## .. SalePrice = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Drop the five character columns that skim() reports as mostly missing
# (Alley, PoolQC, Fence, MiscFeature, FireplaceQu), then re-inspect the
# structure of what is left.
train_df <- train_df[
  ,
  !(names(train_df) %in% c("Alley", "PoolQC", "Fence", "MiscFeature", "FireplaceQu"))
]
str(train_df)
## tibble [1,460 × 76] (S3: tbl_df/tbl/data.frame)
## $ Id : num [1:1460] 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : num [1:1460] 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr [1:1460] "RL" "RL" "RL" "RL" ...
## $ LotFrontage : num [1:1460] 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : num [1:1460] 8450 9600 11250 9550 14260 ...
## $ Street : chr [1:1460] "Pave" "Pave" "Pave" "Pave" ...
## $ LotShape : chr [1:1460] "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr [1:1460] "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr [1:1460] "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr [1:1460] "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr [1:1460] "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr [1:1460] "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr [1:1460] "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr [1:1460] "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr [1:1460] "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr [1:1460] "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : num [1:1460] 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : num [1:1460] 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : num [1:1460] 2003 1976 2001 1915 2000 ...
## $ YearRemodAdd : num [1:1460] 2003 1976 2002 1970 2000 ...
## $ RoofStyle : chr [1:1460] "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr [1:1460] "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr [1:1460] "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : num [1:1460] 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr [1:1460] "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ Foundation : chr [1:1460] "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr [1:1460] "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr [1:1460] "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr [1:1460] "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr [1:1460] "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : num [1:1460] 706 978 486 216 655 ...
## $ BsmtFinType2 : chr [1:1460] "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : num [1:1460] 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : num [1:1460] 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : num [1:1460] 856 1262 920 756 1145 ...
## $ Heating : chr [1:1460] "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr [1:1460] "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr [1:1460] "Y" "Y" "Y" "Y" ...
## $ Electrical : chr [1:1460] "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ 1stFlrSF : num [1:1460] 856 1262 920 961 1145 ...
## $ 2ndFlrSF : num [1:1460] 854 0 866 756 1053 ...
## $ LowQualFinSF : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : num [1:1460] 1710 1262 1786 1717 2198 ...
## $ BsmtFullBath : num [1:1460] 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : num [1:1460] 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : num [1:1460] 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : num [1:1460] 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : num [1:1460] 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : num [1:1460] 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr [1:1460] "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : num [1:1460] 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr [1:1460] "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : num [1:1460] 0 1 1 1 1 0 1 2 2 2 ...
## $ GarageType : chr [1:1460] "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : num [1:1460] 2003 1976 2001 1998 2000 ...
## $ GarageFinish : chr [1:1460] "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : num [1:1460] 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : num [1:1460] 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr [1:1460] "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : num [1:1460] 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : num [1:1460] 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: num [1:1460] 0 0 0 272 0 0 0 228 205 0 ...
## $ 3SsnPorch : num [1:1460] 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
## $ MiscVal : num [1:1460] 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : num [1:1460] 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : num [1:1460] 2008 2007 2008 2006 2008 ...
## $ SaleType : chr [1:1460] "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr [1:1460] "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : num [1:1460] 208500 181500 223500 140000 250000 ...
# Total number of missing cells across the whole training data frame.
sum(is.na(train_df))
## [1] 868
Pick one of the quantitative independent variables from the training data set (train.csv) , and define that variable as X.
Make sure this variable is skewed to the right! Pick the dependent variable and define it as Y.
# First candidate predictor: OverallQual (histogram for inspection).
x <- train_df[["OverallQual"]]
hist(x, main = "Overall Quality")
# Second candidate: LotArea. x is reassigned, but the histogram call takes the
# column directly.
x <- train_df[["LotArea"]]
hist(train_df$LotArea, main = "Lot Area")
# LotArea is clearly right-skewed, so it becomes the independent variable X.
X <- train_df[["LotArea"]]
# Dependent variable Y: SalePrice, the property's sale price in dollars.
Y <- train_df[["SalePrice"]]
# Histogram of the target, SalePrice.
hist(train_df$SalePrice, main = "Sale Price")
Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 3rd quartile of the X variable, and the small letter “y” is estimated as the 2nd quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.
Pipe the data frame into the select() function from the dplyr package in R
# Keep only the two variables under study (X = LotArea, Y = SalePrice) and
# print their summary statistics.
df <- dplyr::select(train_df, LotArea, SalePrice)
summary(df)
## LotArea SalePrice
## Min. : 1300 Min. : 34900
## 1st Qu.: 7554 1st Qu.:129975
## Median : 9478 Median :163000
## Mean : 10517 Mean :180921
## 3rd Qu.: 11602 3rd Qu.:214000
## Max. :215245 Max. :755000
The meaning of all these probabilities:
a. $P(X > x \mid Y > y)$ is the probability that LotArea is greater than its first quartile, given that SalePrice is greater than its first quartile.
b. $P(X > x,\ Y > y)$ is the probability that both events happen together: LotArea and SalePrice are each greater than their first quartile.
c. $P(X < x \mid Y > y)$ is the probability that LotArea is smaller than its first quartile, given that SalePrice is greater than its first quartile.
# Quartile cut-offs: x for X (LotArea) and y for Y (SalePrice).
# NOTE(review): the task statement asks for the 3rd quartile of X and the 2nd
# quartile of Y; this analysis uses the 1st quartile of both -- confirm intent.
XQ1 <- quantile(train_df$LotArea, 0.25)
YQ1 <- quantile(train_df$SalePrice, 0.25)
# Split the rows on Y first, then each half on X, giving the four cells of
# the 2x2 partition.
yY <- subset(train_df, SalePrice <= YQ1)
Yy <- subset(train_df, SalePrice > YQ1)
Xx_Yy <- subset(Yy, LotArea > XQ1)
Xx_yY <- subset(yY, LotArea > XQ1)
xX_Yy <- subset(Yy, LotArea <= XQ1)
xX_yY <- subset(yY, LotArea <= XQ1)
# Cell counts (reused by the table of counts that follows).
a <- nrow(Xx_Yy) # X > x and Y > y
b <- nrow(Xx_yY) # X > x and Y <= y
c <- nrow(xX_Yy) # X <= x and Y > y
d <- nrow(xX_yY) # X <= x and Y <= y
# a. P(X>x | Y>y): a conditional probability, so the denominator is the number
# of rows with Y > y, not the total number of rows.
a / nrow(Yy)
## [1] 0.8200913
# b. P(X>x, Y>y): a joint probability, so the denominator is all rows.
a / nrow(train_df)
## [1] 0.6150685
# c. P(X<x | Y>y): conditional on Y > y again.
c / nrow(Yy)
## [1] 0.1799087
# Joint proportions of the remaining three cells, in the order
# P(X>x, Y<=y), P(X<=x, Y>y), P(X<=x, Y<=y).
c(b, c, d) / nrow(train_df)
## [1] 0.1349315 0.1349315 0.1150685
# Table of counts with margins: rows are the LotArea split, columns are the
# SalePrice split, and the last row/column hold the totals.
table <- rbind(
  c(d, c, d + c),
  c(b, a, b + a),
  c(b + d, c + a, a + b + c + d)
)
dimnames(table) <- list(
  c('<=1Q', '>1Q', 'Total'),
  c("<=1Q", ">1Q", "Total")
)
result_table <- as.table(table)
result_table
## <=1Q >1Q Total
## <=1Q 168 197 365
## >1Q 197 898 1095
## Total 365 1095 1460
No. Independence is a property of the relationship between X and Y themselves.
Splitting the data at quartile cut-offs does not change that relationship; it only restricts the part of the problem domain being examined.
Let A be the new variable counting those observations above the 3rd quartile for X,
let B be the new variable counting those observations above the 2nd quartile for Y.
Does P(A|B)=P(A)P(B)?
Check mathematically, and then evaluate by running a Chi Square test for association.
# A: observations at or above the 1st quartile of X (LotArea).
# NOTE(review): the question text above speaks of the 3rd quartile for X and
# the 2nd quartile for Y, while this code cuts both variables at the 1st
# quartile -- confirm which cut points are intended.
Xx_A <- subset(train_df, LotArea >= XQ1)
# B: observations at or above the 1st quartile of Y (SalePrice).
YQ1 <- quantile(train_df$SalePrice, 0.25)
Yy_B_1 <- subset(train_df, SalePrice <= YQ1)
YY_B_2 <- subset(train_df, SalePrice >= YQ1)
# Joint proportion P(A and B): rows in both A and B over all rows. Dividing by
# nrow(train_df) rather than nrow(YY_B_2) makes this the joint proportion,
# not the conditional P(A|B).
YY_XX <- subset(YY_B_2, LotArea >= XQ1)
res1 <- nrow(YY_XX) / nrow(train_df)
# Product of the marginals P(A) * P(B); it equals P(A and B) only when A and B
# are independent.
res2 <- (nrow(Xx_A) / nrow(train_df)) * nrow(YY_B_2) / nrow(train_df)
# Compare with a numeric tolerance: `==` on doubles can report FALSE for values
# that differ only by floating-point rounding.
print(c("P(A|B)=P(A)P(B)?: ", isTRUE(all.equal(res1, res2))))
## [1] "P(A|B)=P(A)P(B)?: " "FALSE"
The variables are not independent.
# Cross-tabulates the raw LotArea and SalePrice values and runs Pearson's
# chi-squared test on that table.
# NOTE(review): the reported df of 709664 indicates one cell per pair of
# distinct values, so expected cell counts are very small and the chi-squared
# approximation is unreliable; suppressWarnings() hides any warning
# chisq.test() raises. A test on the 2 x 2 table of above/below-quartile
# indicators is probably what is wanted -- confirm intent.
chi_table<- table(train_df$LotArea, train_df$SalePrice)
suppressWarnings(chisq.test(chi_table))
##
## Pearson's Chi-squared test
##
## data: chi_table
## X-squared = 735095, df = 709664, p-value < 2.2e-16
The p-value is < 2.2e-16, far below the 0.05 significance level.
Therefore, we reject the null hypothesis that X is independent of Y.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Location and spread of each variable. The mode is the most frequent value;
# when several values tie, the first one in table() order is taken.
LotArea.mean <- mean(train_df$LotArea)
LotArea.median <- median(train_df$LotArea)
LotArea.mode <- as.numeric(names(which.max(table(train_df$LotArea))))[1]
LotArea.sd <- sd(train_df$LotArea)

SalePrice.mean <- mean(train_df$SalePrice)
SalePrice.median <- median(train_df$SalePrice)
SalePrice.mode <- as.numeric(names(which.max(table(train_df$SalePrice))))[1]
SalePrice.sd <- sd(train_df$SalePrice)
# Kernel density estimate of LotArea with the median, mean and mode marked.
d_LotArea <- density(train_df$LotArea)
# The y-axis of a density curve is a density (the area under the curve is 1),
# not a probability, so it is labelled accordingly.
plot(d_LotArea, main = "LotArea Probabilities", ylab = "Density", xlab = "LotArea")
polygon(d_LotArea, col = "light blue")
# One vertical reference line per statistic; colours match the legend order.
abline(
  v = c(LotArea.median, LotArea.mean, LotArea.mode),
  col = c("green", "blue", "purple"),
  lwd = 2
)
# lwd = 2 so the legend keys match the width of the drawn lines.
legend(
  "topright",
  legend = c("median", "mean", "mode"),
  col = c("green", "blue", "purple"),
  lty = 1, lwd = 2, cex = 0.8
)
# Kernel density estimate of SalePrice with the median, mean and mode marked.
SalePrice_for_graph <- density(train_df$SalePrice, na.rm = TRUE)
# The y-axis of a density curve is a density (the area under the curve is 1),
# not a probability, so it is labelled accordingly.
plot(SalePrice_for_graph, main = "SalePrice Probabilities", ylab = "Density", xlab = "SalePrice")
polygon(SalePrice_for_graph, col = "light blue")
# One vertical reference line per statistic; colours match the legend order.
abline(
  v = c(SalePrice.median, SalePrice.mean, SalePrice.mode),
  col = c("green", "blue", "purple"),
  lwd = 2
)
# lwd = 2 so the legend keys match the width of the drawn lines.
legend(
  "topright",
  legend = c("median", "mean", "mode"),
  col = c("green", "blue", "purple"),
  lty = 1, lwd = 2, cex = 0.8
)
### Plotting A Graph
# Scatter of sale price against lot area (plotting symbol 3).
with(
  train_df,
  plot(
    LotArea, SalePrice,
    main = "LotArea vs SalePrice Scatterplot",
    xlab = "LotArea", ylab = "SalePrice", pch = 3
  )
)
### 95% confidence Interval
Provide a 95% Confidence Interval for the difference in the mean of the variables.
# Welch two-sample t-test; the printed 95% interval is for
# mean(LotArea) - mean(SalePrice).
t.test(train_df$LotArea,train_df$SalePrice)
##
## Welch Two Sample t-test
##
## data: train_df$LotArea and train_df$SalePrice
## t = -81.321, df = 1505.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -174514.7 -166294.1
## sample estimates:
## mean of x mean of y
## 10516.83 180921.20
Since the p-value (< 2.2e-16) is far below 0.05, we reject the null hypothesis that the difference in means is equal to 0.
Derive a correlation matrix for THREE of the quantitative variables you selected.
# Pairwise correlations among the three chosen quantitative variables,
# listed in the order they appear in train_df.
corMatrix <- cor(train_df[, c("LotArea", "GrLivArea", "SalePrice")])
corMatrix
## LotArea GrLivArea SalePrice
## LotArea 1.0000000 0.2631162 0.2638434
## GrLivArea 0.2631162 1.0000000 0.7086245
## SalePrice 0.2638434 0.7086245 1.0000000
The results show a weak positive correlation of 0.2638 between LotArea and SalePrice.
The correlation between GrLivArea and SalePrice is considerably stronger, at 0.7086.
Test the hypothesis that the correlation between these variables is 0 and provide a 92% confidence interval.
# NOTE(review): this is a Welch t-test on the difference in means, not a test
# of correlation; the stated task (correlation = 0 with a 92% interval) would
# normally use cor.test(x, y, conf.level = 0.92) -- confirm intent.
t.test(train_df$LotArea,train_df$SalePrice, conf.level=0.92)
##
## Welch Two Sample t-test
##
## data: train_df$LotArea and train_df$SalePrice
## t = -81.321, df = 1505.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 92 percent confidence interval:
## -174075.3 -166733.4
## sample estimates:
## mean of x mean of y
## 10516.83 180921.20
# Welch t-test with a 92% interval for mean(LotArea) - mean(GrLivArea).
# NOTE(review): this compares means, not correlation; cor.test() may be what
# was intended -- confirm.
t.test(train_df$LotArea,train_df$GrLivArea, conf.level=0.92)
##
## Welch Two Sample t-test
##
## data: train_df$LotArea and train_df$GrLivArea
## t = 34.411, df = 1467.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 92 percent confidence interval:
## 8543.097 9459.632
## sample estimates:
## mean of x mean of y
## 10516.828 1515.464
# Welch t-test with a 92% interval for mean(GrLivArea) - mean(SalePrice).
# NOTE(review): this compares means, not correlation; cor.test() may be what
# was intended -- confirm.
t.test(train_df$GrLivArea,train_df$SalePrice, conf.level=0.92)
##
## Welch Two Sample t-test
##
## data: train_df$GrLivArea and train_df$SalePrice
## t = -86.288, df = 1459.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 92 percent confidence interval:
## -183048.2 -175763.3
## sample estimates:
## mean of x mean of y
## 1515.464 180921.196
In all three cases we reject the hypothesis that the difference between the two variables’ means is 0.
The correlation values are not too high so we don’t have to worry about misidentifying the outcomes.
The meaning of this result is that the variables do not say much about the final sale price.
Invert your correlation matrix (Precision matrix).
# Invert the correlation matrix with ginv() (generalized inverse, from MASS).
# The printed result carries no row/column names, only [1,] / [,1] indices.
corMatrixInverse <- ginv(corMatrix)
corMatrixInverse
## [,1] [,2] [,3]
## [1,] 1.0884485 -0.1664868 -0.1692033
## [2,] -0.1664868 2.0340972 -1.3974846
## [3,] -0.1692033 -1.3974846 2.0349350
The diagonal elements of the inverted correlation matrix are the variance inflation factors (VIFs), which measure how strongly each variable is linearly related to the other variables.
This is the identity Matrix
# Inverse times original: the printed product is the identity matrix up to
# floating-point error (off-diagonal entries around 1e-16).
matrix1<- corMatrixInverse %*% corMatrix
matrix1
## LotArea GrLivArea SalePrice
## [1,] 1.000000e+00 -2.498002e-16 -8.326673e-17
## [2,] 1.665335e-16 1.000000e+00 8.881784e-16
## [3,] 1.110223e-16 -4.440892e-16 1.000000e+00
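In general, matrix products depend on the order in which the matrices are multiplied; for a matrix and its inverse, however, multiplying in the reverse order also gives the identity matrix (up to floating-point rounding).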
# Original times inverse (the reverse order): again the identity matrix up to
# floating-point error.
matrix2<- corMatrix %*% corMatrixInverse
matrix2
## [,1] [,2] [,3]
## LotArea 1.000000e+00 -4.440892e-16 2.220446e-16
## GrLivArea 2.220446e-16 1.000000e+00 2.220446e-16
## SalePrice 2.498002e-16 0.000000e+00 1.000000e+00
# Log-transform the three variables on a copy of the data so that train_df
# itself is left untouched.
dataCopy <- train_df
dataCopy$LotArea <- log(dataCopy$LotArea)
dataCopy$SalePrice <- log(dataCopy$SalePrice)
dataCopy$GrLivArea <- log(dataCopy$GrLivArea)
# PCA on the centred, unit-variance columns. The argument is spelled `scale.`
# (with the trailing dot), exactly as prcomp() declares it, instead of relying
# on partial matching of `scale`.
data.pca <- prcomp(
  dataCopy[, names(dataCopy) %in% c("LotArea", "SalePrice", "GrLivArea")],
  center = TRUE,
  scale. = TRUE
)
data.pca
## Standard deviations (1, .., p=3):
## [1] 1.4246936 0.8370813 0.5191755
##
## Rotation (n x k) = (3 x 3):
## PC1 PC2 PC3
## LotArea 0.4746802 0.8799375 0.01971454
## GrLivArea 0.6204099 -0.3503987 0.70164972
## SalePrice 0.6243159 -0.3208281 -0.71224926
# Standard deviation and (cumulative) proportion of variance per component.
summary(data.pca)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.4247 0.8371 0.51918
## Proportion of Variance 0.6766 0.2336 0.08985
## Cumulative Proportion 0.6766 0.9102 1.00000
# Biplot of the PCA: observations and variable loadings on the same axes.
biplot(data.pca)
### Analysis
Vectors that point in the same direction correspond to variables that have similar response profiles.
This can be interpreted as the variables having a similar meaning in the context set by the data.
Here SalePrice and GrLivArea have very similar vectors pointing in the same direction, so this is the pair to which we will apply the regression technique.
# Scree plot of the component variances, drawn as a line chart.
screeplot(data.pca, type="lines")
### Calculus-Based Probability & Statistics
We take the LotArea data and fit it to an exponential function.
# Fit an exponential distribution to LotArea; despite its name, `lambda` holds
# the whole fit object and the rate itself is lambda$estimate.
lambda <- fitdistr(train_df$LotArea, densfun = "exponential")
lambda$estimate
## rate
## 9.50857e-05
# Draw 1000 random values from the fitted exponential distribution.
pdf_distr <- rexp(n = 1000, rate = lambda$estimate)
# Histogram of the simulated draws (x-axis cut at their 99th percentile) with
# the fitted exponential density overlaid.
hist(
  pdf_distr,
  freq = FALSE,
  breaks = 100,
  main = "Fitted Exponential PDF with LotArea",
  xlim = c(1, quantile(pdf_distr, 0.99))
)
curve(dexp(x, rate = lambda$estimate), col = "green", add = TRUE)
# Same overlay on the observed LotArea values, for comparison with the
# simulated data.
hist(
  train_df$LotArea,
  freq = FALSE,
  breaks = 100,
  main = "Exponential VS original LotArea data",
  xlim = c(1, quantile(train_df$LotArea, 0.99))
)
curve(dexp(x, rate = lambda$estimate), col = "green", add = TRUE)
### With the exponential PDF:
5th and 95th percentiles using the quantile function (the inverse of the cumulative distribution function, CDF)
# 5th percentile of the fitted exponential distribution.
qexp(0.05, rate = lambda$estimate, lower.tail = TRUE, log.p = FALSE)
## [1] 539.4428
# 95th percentile of the fitted exponential distribution.
qexp(0.95, rate = lambda$estimate, lower.tail = TRUE, log.p = FALSE)
## [1] 31505.6
# 95th percentile of a normal distribution with LotArea's sample mean and sd.
# NOTE(review): this is a single quantile, not a confidence interval -- confirm
# whether an interval was intended here.
qnorm(0.95,LotArea.mean, LotArea.sd)
## [1] 26934.55
# Empirical 5th and 95th percentiles of the observed LotArea values.
quantile(train_df$LotArea, c(.05, .95))
## 5% 95%
## 3311.70 17401.15
Analyzing the above results, we can see the differences between an exponential distribution and the selected right-skewed data.
The approximation can still be used to fit different models and help explain the data.
# Print diagnostics for a fitted model and draw two residual plots.
#
# m: a fitted model object supporting summary(), residuals() and fitted().
# Prints the model summary and a summary of the residuals, then draws a
# histogram of the residuals and a residuals-versus-fitted scatter plot.
check_model <- function(m) {
  m |> summary() |> print()
  # Keep the name `res`: hist() derives its title and axis label from it.
  res <- residuals(m)
  res |> summary() |> print()
  hist(res)
  plot(fitted(m), resid(m))
}
# Single-panel plotting layout for the diagnostics below.
par(mfrow = c(1, 1))
# Modelling frame: SalePrice (first column, used as the response) plus the
# chosen predictors. Exterior1st, Fence, FireplaceQu, PoolQC and Street are
# excluded from the selection.
train_df.train <- dplyr::select(
  train_df,
  dplyr::all_of(c(
    "SalePrice",
    "BldgType",
    "BsmtCond",
    "BsmtExposure",
    "BsmtQual",
    "CentralAir",
    "GarageArea",
    "GarageCars",
    "ExterQual",
    "Fireplaces",
    "Foundation",
    "HouseStyle",
    "KitchenQual",
    "LandContour",
    "LandSlope",
    "LotArea",
    "MasVnrArea",
    "MiscVal",
    "Neighborhood",
    "OverallCond",
    "OverallQual",
    "PoolArea",
    "RoofStyle",
    "YearBuilt",
    "YearRemodAdd"
  ))
)
# lm() is given the data frame itself, so the first column is regressed on
# all remaining columns.
regr_model <- lm(train_df.train)
check_model(regr_model)
##
## Call:
## lm(formula = train_df.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -311457 -15589 -831 13665 273845
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.193e+05 1.996e+05 -2.601 0.009392 **
## BldgType2fmCon -2.050e+03 6.670e+03 -0.307 0.758600
## BldgTypeDuplex 5.642e+03 5.997e+03 0.941 0.346961
## BldgTypeTwnhs -4.096e+04 6.790e+03 -6.032 2.10e-09 ***
## BldgTypeTwnhsE -3.821e+04 4.290e+03 -8.907 < 2e-16 ***
## BsmtCondGd 3.527e+03 7.049e+03 0.500 0.616874
## BsmtCondPo 4.139e+04 2.604e+04 1.589 0.112207
## BsmtCondTA 5.689e+03 5.541e+03 1.027 0.304756
## BsmtExposureGd 2.218e+04 4.033e+03 5.500 4.54e-08 ***
## BsmtExposureMn -9.737e+02 4.124e+03 -0.236 0.813384
## BsmtExposureNo -8.296e+03 2.946e+03 -2.816 0.004932 **
## BsmtQualFa -4.094e+04 8.292e+03 -4.937 8.92e-07 ***
## BsmtQualGd -3.097e+04 4.414e+03 -7.015 3.64e-12 ***
## BsmtQualTA -3.507e+04 5.407e+03 -6.485 1.24e-10 ***
## CentralAirY 2.649e+03 4.746e+03 0.558 0.576848
## GarageArea 1.402e+01 9.539e+00 1.470 0.141865
## GarageCars 1.028e+04 2.842e+03 3.618 0.000308 ***
## ExterQualFa -2.298e+04 1.291e+04 -1.779 0.075430 .
## ExterQualGd -1.854e+04 6.356e+03 -2.917 0.003591 **
## ExterQualTA -2.377e+04 7.011e+03 -3.390 0.000719 ***
## Fireplaces 1.210e+04 1.717e+03 7.046 2.94e-12 ***
## FoundationCBlock 5.228e+03 4.166e+03 1.255 0.209713
## FoundationPConc 8.146e+03 4.621e+03 1.763 0.078152 .
## FoundationStone 1.273e+04 1.420e+04 0.896 0.370232
## FoundationWood 1.071e+04 1.994e+04 0.537 0.591206
## HouseStyle1.5Unf -2.081e+04 9.418e+03 -2.210 0.027302 *
## HouseStyle1Story -1.128e+04 3.594e+03 -3.139 0.001730 **
## HouseStyle2.5Fin 4.422e+04 1.266e+04 3.493 0.000493 ***
## HouseStyle2.5Unf 4.544e+03 1.091e+04 0.417 0.677108
## HouseStyle2Story -1.960e+03 3.742e+03 -0.524 0.600613
## HouseStyleSFoyer -3.070e+04 7.374e+03 -4.163 3.34e-05 ***
## HouseStyleSLvl -2.131e+04 5.665e+03 -3.762 0.000176 ***
## KitchenQualFa -3.274e+04 8.142e+03 -4.021 6.11e-05 ***
## KitchenQualGd -3.329e+04 4.591e+03 -7.250 7.05e-13 ***
## KitchenQualTA -3.789e+04 5.204e+03 -7.280 5.69e-13 ***
## LandContourHLS 1.137e+04 6.821e+03 1.667 0.095702 .
## LandContourLow 2.026e+03 8.488e+03 0.239 0.811375
## LandContourLvl 1.735e+04 4.886e+03 3.551 0.000396 ***
## LandSlopeMod 1.151e+04 5.243e+03 2.196 0.028268 *
## LandSlopeSev -3.051e+04 1.327e+04 -2.299 0.021652 *
## LotArea 7.985e-01 1.228e-01 6.505 1.10e-10 ***
## MasVnrArea 2.046e+01 6.218e+00 3.290 0.001028 **
## MiscVal -1.009e+00 1.897e+00 -0.532 0.595098
## NeighborhoodBlueste -7.660e+03 2.517e+04 -0.304 0.760882
## NeighborhoodBrDale -1.076e+04 1.353e+04 -0.795 0.426585
## NeighborhoodBrkSide -1.782e+04 1.112e+04 -1.603 0.109243
## NeighborhoodClearCr 9.918e+02 1.192e+04 0.083 0.933716
## NeighborhoodCollgCr -1.167e+04 9.329e+03 -1.251 0.211224
## NeighborhoodCrawfor 1.446e+04 1.087e+04 1.331 0.183572
## NeighborhoodEdwards -2.510e+04 1.017e+04 -2.468 0.013695 *
## NeighborhoodGilbert -2.076e+04 9.948e+03 -2.087 0.037053 *
## NeighborhoodIDOTRR -3.351e+04 1.172e+04 -2.860 0.004303 **
## NeighborhoodMeadowV 4.281e+03 1.257e+04 0.341 0.733495
## NeighborhoodMitchel -2.174e+04 1.043e+04 -2.084 0.037361 *
## NeighborhoodNAmes -1.759e+04 9.900e+03 -1.777 0.075817 .
## NeighborhoodNoRidge 6.186e+04 1.061e+04 5.830 6.96e-09 ***
## NeighborhoodNPkVill 6.038e+03 1.434e+04 0.421 0.673799
## NeighborhoodNridgHt 2.007e+04 9.679e+03 2.073 0.038347 *
## NeighborhoodNWAmes -1.315e+04 1.019e+04 -1.290 0.197446
## NeighborhoodOldTown -2.699e+04 1.076e+04 -2.508 0.012254 *
## NeighborhoodSawyer -1.769e+04 1.039e+04 -1.702 0.088988 .
## NeighborhoodSawyerW -3.910e+03 9.972e+03 -0.392 0.695086
## NeighborhoodSomerst 1.948e+03 9.461e+03 0.206 0.836855
## NeighborhoodStoneBr 5.363e+04 1.074e+04 4.993 6.72e-07 ***
## NeighborhoodSWISU -1.045e+04 1.248e+04 -0.838 0.402405
## NeighborhoodTimber -1.369e+04 1.061e+04 -1.290 0.197413
## NeighborhoodVeenker 1.202e+04 1.327e+04 0.906 0.365254
## OverallCond 3.752e+03 1.083e+03 3.465 0.000548 ***
## OverallQual 1.381e+04 1.263e+03 10.934 < 2e-16 ***
## PoolArea 6.747e+01 2.246e+01 3.004 0.002710 **
## RoofStyleGable -1.891e+03 1.105e+04 -0.171 0.864087
## RoofStyleGambrel 3.983e+03 1.507e+04 0.264 0.791594
## RoofStyleHip 4.768e+03 1.124e+04 0.424 0.671411
## RoofStyleMansard 1.733e+04 1.698e+04 1.020 0.307775
## RoofStyleShed 3.800e+04 2.684e+04 1.416 0.157091
## YearBuilt 1.175e+02 8.670e+01 1.355 0.175574
## YearRemodAdd 2.018e+02 7.077e+01 2.851 0.004424 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32710 on 1337 degrees of freedom
## (46 observations deleted due to missingness)
## Multiple R-squared: 0.8389, Adjusted R-squared: 0.8297
## F-statistic: 91.6 on 76 and 1337 DF, p-value: < 2.2e-16
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -311457 -15589 -831 0 13665 273845
# Normal Q-Q plot of the regression residuals.
qqnorm(residuals(regr_model))
# Per-column summary of the modelling frame (character columns show only
# length/class/mode).
summary(train_df.train)
## SalePrice BldgType BsmtCond BsmtExposure
## Min. : 34900 Length:1460 Length:1460 Length:1460
## 1st Qu.:129975 Class :character Class :character Class :character
## Median :163000 Mode :character Mode :character Mode :character
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
## BsmtQual CentralAir GarageArea GarageCars
## Length:1460 Length:1460 Min. : 0.0 Min. :0.000
## Class :character Class :character 1st Qu.: 334.5 1st Qu.:1.000
## Mode :character Mode :character Median : 480.0 Median :2.000
## Mean : 473.0 Mean :1.767
## 3rd Qu.: 576.0 3rd Qu.:2.000
## Max. :1418.0 Max. :4.000
##
## ExterQual Fireplaces Foundation HouseStyle
## Length:1460 Min. :0.000 Length:1460 Length:1460
## Class :character 1st Qu.:0.000 Class :character Class :character
## Mode :character Median :1.000 Mode :character Mode :character
## Mean :0.613
## 3rd Qu.:1.000
## Max. :3.000
##
## KitchenQual LandContour LandSlope LotArea
## Length:1460 Length:1460 Length:1460 Min. : 1300
## Class :character Class :character Class :character 1st Qu.: 7554
## Mode :character Mode :character Mode :character Median : 9478
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## MasVnrArea MiscVal Neighborhood OverallCond
## Min. : 0.0 Min. : 0.00 Length:1460 Min. :1.000
## 1st Qu.: 0.0 1st Qu.: 0.00 Class :character 1st Qu.:5.000
## Median : 0.0 Median : 0.00 Mode :character Median :5.000
## Mean : 103.7 Mean : 43.49 Mean :5.575
## 3rd Qu.: 166.0 3rd Qu.: 0.00 3rd Qu.:6.000
## Max. :1600.0 Max. :15500.00 Max. :9.000
## NA's :8
## OverallQual PoolArea RoofStyle YearBuilt
## Min. : 1.000 Min. : 0.000 Length:1460 Min. :1872
## 1st Qu.: 5.000 1st Qu.: 0.000 Class :character 1st Qu.:1954
## Median : 6.000 Median : 0.000 Mode :character Median :1973
## Mean : 6.099 Mean : 2.759 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.: 0.000 3rd Qu.:2000
## Max. :10.000 Max. :738.000 Max. :2010
##
## YearRemodAdd
## Min. :1950
## 1st Qu.:1967
## Median :1994
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
Residual standard error: This is an estimate of the standard deviation of the residuals. In this case, it’s 32710, indicating the average difference between the observed values and the values predicted by the model is around 32710 units.
Multiple R-squared: This is the proportion of the variance in the dependent variable that is predictable from the independent variables. Here, it’s 0.8389, meaning approximately 83.89% of the variability in the dependent variable can be explained by the independent variables.
Adjusted R-squared: This is the R-squared value adjusted for the number of predictors in the model. It’s slightly lower at 0.8297 but still suggests a good fit.
F-statistic: This tests the overall significance of the regression model. A larger F-statistic with a small p-value suggests that the overall model is significant. Here, the F-statistic is 91.6 with a p-value less than 2.2e-16, indicating that the overall model is highly significant.
# Remove five columns from the test set by position, then show the structure
# of what remains.
test_df <- test_df[
  ,
  -match(
    c("Alley", "PoolQC", "Fence", "MiscFeature", "FireplaceQu"),
    names(test_df)
  )
]
str(test_df)
## tibble [1,459 × 75] (S3: tbl_df/tbl/data.frame)
## $ Id : num [1:1459] 1461 1462 1463 1464 1465 ...
## $ MSSubClass : num [1:1459] 20 20 60 60 120 60 20 60 20 20 ...
## $ MSZoning : chr [1:1459] "RH" "RL" "RL" "RL" ...
## $ LotFrontage : num [1:1459] 80 81 74 78 43 75 NA 63 85 70 ...
## $ LotArea : num [1:1459] 11622 14267 13830 9978 5005 ...
## $ Street : chr [1:1459] "Pave" "Pave" "Pave" "Pave" ...
## $ LotShape : chr [1:1459] "Reg" "IR1" "IR1" "IR1" ...
## $ LandContour : chr [1:1459] "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr [1:1459] "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr [1:1459] "Inside" "Corner" "Inside" "Inside" ...
## $ LandSlope : chr [1:1459] "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr [1:1459] "NAmes" "NAmes" "Gilbert" "Gilbert" ...
## $ Condition1 : chr [1:1459] "Feedr" "Norm" "Norm" "Norm" ...
## $ Condition2 : chr [1:1459] "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr [1:1459] "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr [1:1459] "1Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : num [1:1459] 5 6 5 6 8 6 6 6 7 4 ...
## $ OverallCond : num [1:1459] 6 6 5 6 5 5 7 5 5 5 ...
## $ YearBuilt : num [1:1459] 1961 1958 1997 1998 1992 ...
## $ YearRemodAdd : num [1:1459] 1961 1958 1998 1998 1992 ...
## $ RoofStyle : chr [1:1459] "Gable" "Hip" "Gable" "Gable" ...
## $ RoofMatl : chr [1:1459] "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr [1:1459] "VinylSd" "Wd Sdng" "VinylSd" "VinylSd" ...
## $ Exterior2nd : chr [1:1459] "VinylSd" "Wd Sdng" "VinylSd" "VinylSd" ...
## $ MasVnrType : chr [1:1459] "None" "BrkFace" "None" "BrkFace" ...
## $ MasVnrArea : num [1:1459] 0 108 0 20 0 0 0 0 0 0 ...
## $ ExterQual : chr [1:1459] "TA" "TA" "TA" "TA" ...
## $ ExterCond : chr [1:1459] "TA" "TA" "TA" "TA" ...
## $ Foundation : chr [1:1459] "CBlock" "CBlock" "PConc" "PConc" ...
## $ BsmtQual : chr [1:1459] "TA" "TA" "Gd" "TA" ...
## $ BsmtCond : chr [1:1459] "TA" "TA" "TA" "TA" ...
## $ BsmtExposure : chr [1:1459] "No" "No" "No" "No" ...
## $ BsmtFinType1 : chr [1:1459] "Rec" "ALQ" "GLQ" "GLQ" ...
## $ BsmtFinSF1 : num [1:1459] 468 923 791 602 263 0 935 0 637 804 ...
## $ BsmtFinType2 : chr [1:1459] "LwQ" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : num [1:1459] 144 0 0 0 0 0 0 0 0 78 ...
## $ BsmtUnfSF : num [1:1459] 270 406 137 324 1017 ...
## $ TotalBsmtSF : num [1:1459] 882 1329 928 926 1280 ...
## $ Heating : chr [1:1459] "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr [1:1459] "TA" "TA" "Gd" "Ex" ...
## $ CentralAir : chr [1:1459] "Y" "Y" "Y" "Y" ...
## $ Electrical : chr [1:1459] "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ 1stFlrSF : num [1:1459] 896 1329 928 926 1280 ...
## $ 2ndFlrSF : num [1:1459] 0 0 701 678 0 892 0 676 0 0 ...
## $ LowQualFinSF : num [1:1459] 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : num [1:1459] 896 1329 1629 1604 1280 ...
## $ BsmtFullBath : num [1:1459] 0 0 0 0 0 0 1 0 1 1 ...
## $ BsmtHalfBath : num [1:1459] 0 0 0 0 0 0 0 0 0 0 ...
## $ FullBath : num [1:1459] 1 1 2 2 2 2 2 2 1 1 ...
## $ HalfBath : num [1:1459] 0 1 1 1 0 1 0 1 1 0 ...
## $ BedroomAbvGr : num [1:1459] 2 3 3 3 2 3 3 3 2 2 ...
## $ KitchenAbvGr : num [1:1459] 1 1 1 1 1 1 1 1 1 1 ...
## $ KitchenQual : chr [1:1459] "TA" "Gd" "TA" "Gd" ...
## $ TotRmsAbvGrd : num [1:1459] 5 6 6 7 5 7 6 7 5 4 ...
## $ Functional : chr [1:1459] "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : num [1:1459] 0 0 1 1 0 1 0 1 1 0 ...
## $ GarageType : chr [1:1459] "Attchd" "Attchd" "Attchd" "Attchd" ...
## $ GarageYrBlt : num [1:1459] 1961 1958 1997 1998 1992 ...
## $ GarageFinish : chr [1:1459] "Unf" "Unf" "Fin" "Fin" ...
## $ GarageCars : num [1:1459] 1 1 2 2 2 2 2 2 2 2 ...
## $ GarageArea : num [1:1459] 730 312 482 470 506 440 420 393 506 525 ...
## $ GarageQual : chr [1:1459] "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr [1:1459] "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr [1:1459] "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : num [1:1459] 140 393 212 360 0 157 483 0 192 240 ...
## $ OpenPorchSF : num [1:1459] 0 36 34 36 82 84 21 75 0 0 ...
## $ EnclosedPorch: num [1:1459] 0 0 0 0 0 0 0 0 0 0 ...
## $ 3SsnPorch : num [1:1459] 0 0 0 0 0 0 0 0 0 0 ...
## $ ScreenPorch : num [1:1459] 120 0 0 0 144 0 0 0 0 0 ...
## $ PoolArea : num [1:1459] 0 0 0 0 0 0 0 0 0 0 ...
## $ MiscVal : num [1:1459] 0 12500 0 0 0 0 500 0 0 0 ...
## $ MoSold : num [1:1459] 6 6 3 6 1 4 3 5 2 4 ...
## $ YrSold : num [1:1459] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ SaleType : chr [1:1459] "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr [1:1459] "Normal" "Normal" "Normal" "Normal" ...
# Keep only the numeric columns of a data frame and replace missing values
# with 0.
#
# train_df: a data frame or tibble (any data set, not only the training set).
# Returns the numeric columns of `train_df`, in their original order, with
# every NA set to 0.
normalize <- function(train_df) {
  # Base-R column filter: avoids the superseded dplyr::select_if() and works
  # whether or not dplyr is attached.
  is_num <- vapply(train_df, is.numeric, logical(1))
  numeric_cols <- train_df[is_num]
  # Zero-fill missing values. No row filter is needed afterwards: once every
  # NA has been replaced, all rows are complete.
  if (ncol(numeric_cols) > 0L) {
    numeric_cols[is.na(numeric_cols)] <- 0
  }
  numeric_cols
}
# Numeric-only, zero-filled versions of the training and test sets.
trainMod <- normalize(train_df)
testMod <- normalize(test_df)
# Stepwise selection between an intercept-only model and a two-predictor model.
# SalePrice is a continuous response, so the upper model is an ordinary linear
# model; a binomial glm on as.factor(SalePrice) would instead fit a logistic
# regression of the lowest price level against all the others.
null <- lm(SalePrice ~ 1, data = trainMod)
all <- lm(SalePrice ~ LotArea + GrLivArea, data = trainMod)
stepResults <- step(
  null,
  scope = list(lower = null, upper = all),
  direction = "both",
  trace = 0
)
# Random forest of SalePrice on every other column of trainMod, fitted through
# caret with out-of-bag resampling.
rfFit <- train(
  SalePrice ~ .,
  data = trainMod,
  method = "rf",
  trControl = trainControl(method = "oob", number = 5),
  prox = TRUE,
  importance = TRUE,
  allowParallel = TRUE
)
# Print the fitted model's resampling summary.
rfFit
## Random Forest
##
## 1460 samples
## 37 predictor
##
## No pre-processing
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared
## 2 31543.92 0.8422305
## 19 29313.53 0.8637527
## 37 29874.85 0.8584848
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 19.
# Variable-importance dot plot for the fitted forest.
dotPlot(varImp(rfFit), main = "Random Forest Model - Most Relevant Variables")
# Score the test set and build the submission frame (Id, predicted SalePrice).
result <- data.frame(Id = testMod$Id, SalePrice = predict(rfFit, testMod))
# Floor any negative prediction at zero.
result$SalePrice[which(result$SalePrice < 0)] <- 0
# Compare the distribution of observed training prices with the predictions.
plot(density(trainMod$SalePrice))
plot(density(na.omit(result$SalePrice)))
# Write the submission file without row names.
write.csv(result, file = "results_for_kaggle.csv", row.names = FALSE)