library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.8 v dplyr 1.0.10
## v tidyr 1.2.1 v stringr 1.4.1
## v readr 2.1.2 v forcats 0.5.2
## Warning: package 'ggplot2' was built under R version 4.1.3
## Warning: package 'tibble' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'readr' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## Warning: package 'stringr' was built under R version 4.1.3
## Warning: package 'forcats' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(matlib)
## Warning: package 'matlib' was built under R version 4.1.3
library(matrixcalc)
## Warning: package 'matrixcalc' was built under R version 4.1.3
##
## Attaching package: 'matrixcalc'
##
## The following object is masked from 'package:matlib':
##
## vec
library(MASS)
## Warning: package 'MASS' was built under R version 4.1.3
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
## Goal: predict SalePrice (the target variable); every other column is a candidate predictor.
Training <- read_csv(
  "https://raw.githubusercontent.com/AldataSci/FinalProject-2-605-/main/train.csv",
  show_col_types = FALSE
)
## Inspect the structure: 81 columns, a mix of numeric and character types.
str(Training)
## spec_tbl_df [1,460 x 81] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Id : num [1:1460] 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : num [1:1460] 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr [1:1460] "RL" "RL" "RL" "RL" ...
## $ LotFrontage : num [1:1460] 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : num [1:1460] 8450 9600 11250 9550 14260 ...
## $ Street : chr [1:1460] "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr [1:1460] NA NA NA NA ...
## $ LotShape : chr [1:1460] "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr [1:1460] "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr [1:1460] "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr [1:1460] "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr [1:1460] "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr [1:1460] "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr [1:1460] "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr [1:1460] "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr [1:1460] "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr [1:1460] "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : num [1:1460] 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : num [1:1460] 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : num [1:1460] 2003 1976 2001 1915 2000 ...
## $ YearRemodAdd : num [1:1460] 2003 1976 2002 1970 2000 ...
## $ RoofStyle : chr [1:1460] "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr [1:1460] "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr [1:1460] "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : num [1:1460] 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr [1:1460] "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ Foundation : chr [1:1460] "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr [1:1460] "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr [1:1460] "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr [1:1460] "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr [1:1460] "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : num [1:1460] 706 978 486 216 655 ...
## $ BsmtFinType2 : chr [1:1460] "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : num [1:1460] 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : num [1:1460] 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : num [1:1460] 856 1262 920 756 1145 ...
## $ Heating : chr [1:1460] "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr [1:1460] "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr [1:1460] "Y" "Y" "Y" "Y" ...
## $ Electrical : chr [1:1460] "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ 1stFlrSF : num [1:1460] 856 1262 920 961 1145 ...
## $ 2ndFlrSF : num [1:1460] 854 0 866 756 1053 ...
## $ LowQualFinSF : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : num [1:1460] 1710 1262 1786 1717 2198 ...
## $ BsmtFullBath : num [1:1460] 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : num [1:1460] 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : num [1:1460] 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : num [1:1460] 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : num [1:1460] 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : num [1:1460] 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr [1:1460] "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : num [1:1460] 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr [1:1460] "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : num [1:1460] 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr [1:1460] NA "TA" "TA" "Gd" ...
## $ GarageType : chr [1:1460] "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : num [1:1460] 2003 1976 2001 1998 2000 ...
## $ GarageFinish : chr [1:1460] "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : num [1:1460] 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : num [1:1460] 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr [1:1460] "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr [1:1460] "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : num [1:1460] 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : num [1:1460] 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: num [1:1460] 0 0 0 272 0 0 0 228 205 0 ...
## $ 3SsnPorch : num [1:1460] 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr [1:1460] NA NA NA NA ...
## $ Fence : chr [1:1460] NA NA NA NA ...
## $ MiscFeature : chr [1:1460] NA NA NA NA ...
## $ MiscVal : num [1:1460] 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : num [1:1460] 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : num [1:1460] 2008 2007 2008 2006 2008 ...
## $ SaleType : chr [1:1460] "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr [1:1460] "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : num [1:1460] 208500 181500 223500 140000 250000 ...
## - attr(*, "spec")=
## .. cols(
## .. Id = col_double(),
## .. MSSubClass = col_double(),
## .. MSZoning = col_character(),
## .. LotFrontage = col_double(),
## .. LotArea = col_double(),
## .. Street = col_character(),
## .. Alley = col_character(),
## .. LotShape = col_character(),
## .. LandContour = col_character(),
## .. Utilities = col_character(),
## .. LotConfig = col_character(),
## .. LandSlope = col_character(),
## .. Neighborhood = col_character(),
## .. Condition1 = col_character(),
## .. Condition2 = col_character(),
## .. BldgType = col_character(),
## .. HouseStyle = col_character(),
## .. OverallQual = col_double(),
## .. OverallCond = col_double(),
## .. YearBuilt = col_double(),
## .. YearRemodAdd = col_double(),
## .. RoofStyle = col_character(),
## .. RoofMatl = col_character(),
## .. Exterior1st = col_character(),
## .. Exterior2nd = col_character(),
## .. MasVnrType = col_character(),
## .. MasVnrArea = col_double(),
## .. ExterQual = col_character(),
## .. ExterCond = col_character(),
## .. Foundation = col_character(),
## .. BsmtQual = col_character(),
## .. BsmtCond = col_character(),
## .. BsmtExposure = col_character(),
## .. BsmtFinType1 = col_character(),
## .. BsmtFinSF1 = col_double(),
## .. BsmtFinType2 = col_character(),
## .. BsmtFinSF2 = col_double(),
## .. BsmtUnfSF = col_double(),
## .. TotalBsmtSF = col_double(),
## .. Heating = col_character(),
## .. HeatingQC = col_character(),
## .. CentralAir = col_character(),
## .. Electrical = col_character(),
## .. `1stFlrSF` = col_double(),
## .. `2ndFlrSF` = col_double(),
## .. LowQualFinSF = col_double(),
## .. GrLivArea = col_double(),
## .. BsmtFullBath = col_double(),
## .. BsmtHalfBath = col_double(),
## .. FullBath = col_double(),
## .. HalfBath = col_double(),
## .. BedroomAbvGr = col_double(),
## .. KitchenAbvGr = col_double(),
## .. KitchenQual = col_character(),
## .. TotRmsAbvGrd = col_double(),
## .. Functional = col_character(),
## .. Fireplaces = col_double(),
## .. FireplaceQu = col_character(),
## .. GarageType = col_character(),
## .. GarageYrBlt = col_double(),
## .. GarageFinish = col_character(),
## .. GarageCars = col_double(),
## .. GarageArea = col_double(),
## .. GarageQual = col_character(),
## .. GarageCond = col_character(),
## .. PavedDrive = col_character(),
## .. WoodDeckSF = col_double(),
## .. OpenPorchSF = col_double(),
## .. EnclosedPorch = col_double(),
## .. `3SsnPorch` = col_double(),
## .. ScreenPorch = col_double(),
## .. PoolArea = col_double(),
## .. PoolQC = col_character(),
## .. Fence = col_character(),
## .. MiscFeature = col_character(),
## .. MiscVal = col_double(),
## .. MoSold = col_double(),
## .. YrSold = col_double(),
## .. SaleType = col_character(),
## .. SaleCondition = col_character(),
## .. SalePrice = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
## Per-column summary: quartiles, mean and NA counts for numeric columns; length/class/mode for character columns.
summary(Training)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical 1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch 3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
## Look at the distributions of a few numeric variables.
hist(Training$LotArea)
## YearBuilt: more houses were built in recent years.
hist(Training$YearBuilt)
## SalePrice, the target variable.
hist(Training$SalePrice)
## Summary of BedroomAbvGr, i.e. bedrooms above grade (above ground level), not "above average".
summary(Training$BedroomAbvGr)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 2.866 3.000 8.000
## Joint proportions of Functional (rows) by SaleCondition (columns):
## most homes are Normal sales with Typ functionality, and every Partial sale is rated Typ.
prop.table(
  table(Training$Functional, Training$SaleCondition)
)
##
## Abnorml AdjLand Alloca Family Normal
## Maj1 0.0000000000 0.0000000000 0.0006849315 0.0000000000 0.0089041096
## Maj2 0.0006849315 0.0000000000 0.0000000000 0.0000000000 0.0027397260
## Min1 0.0020547945 0.0000000000 0.0000000000 0.0006849315 0.0184931507
## Min2 0.0013698630 0.0000000000 0.0000000000 0.0000000000 0.0219178082
## Mod 0.0006849315 0.0000000000 0.0000000000 0.0000000000 0.0095890411
## Sev 0.0006849315 0.0000000000 0.0000000000 0.0000000000 0.0000000000
## Typ 0.0636986301 0.0027397260 0.0075342466 0.0130136986 0.7589041096
##
## Partial
## Maj1 0.0000000000
## Maj2 0.0000000000
## Min1 0.0000000000
## Min2 0.0000000000
## Mod 0.0000000000
## Sev 0.0000000000
## Typ 0.0856164384
## Response (y): SalePrice, the sale price of the house.
## Predictors (x): GrLivArea, OverallQual and LotArea, variables I expect to be correlated with price.
Scatter_Mat <- dplyr::select(Training, GrLivArea, OverallQual, SalePrice, LotArea)
## Scatterplot matrix via pairs(): GrLivArea and OverallQual both show a clear relationship with SalePrice.
pairs(Scatter_Mat, pch = 19)
## Correlation matrix: SalePrice is strongly correlated with GrLivArea and OverallQual,
## but only weakly with LotArea, which I find surprising.
res <- cor(Scatter_Mat)
round(res, 2)
## GrLivArea OverallQual SalePrice LotArea
## GrLivArea 1.00 0.59 0.71 0.26
## OverallQual 0.59 1.00 0.79 0.11
## SalePrice 0.71 0.79 1.00 0.26
## LotArea 0.26 0.11 0.26 1.00
## H0: the true correlation between GrLivArea and SalePrice is 0.
## We reject H0 (p < 2.2e-16); the estimated correlation is about 0.71 (80% confidence interval requested).
corr <- cor.test(
  Scatter_Mat$GrLivArea,
  Scatter_Mat$SalePrice,
  method = "pearson",
  conf.level = 0.80
)
corr
##
## Pearson's product-moment correlation
##
## data: Scatter_Mat$GrLivArea and Scatter_Mat$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6915087 0.7249450
## sample estimates:
## cor
## 0.7086245
## H0: the true correlation between LotArea and SalePrice is 0.
## We reject H0 (p < 2.2e-16), although the estimated correlation is only about 0.26.
corr1 <- cor.test(
  Scatter_Mat$LotArea,
  Scatter_Mat$SalePrice,
  method = "pearson",
  conf.level = 0.80
)
corr1
##
## Pearson's product-moment correlation
##
## data: Scatter_Mat$LotArea and Scatter_Mat$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2323391 0.2947946
## sample estimates:
## cor
## 0.2638434
## H0: the true correlation between OverallQual and SalePrice is 0.
## We reject H0 (p < 2.2e-16); the estimated correlation is about 0.79.
corr2 <- cor.test(
  Scatter_Mat$OverallQual,
  Scatter_Mat$SalePrice,
  method = "pearson",
  conf.level = 0.80
)
corr2
##
## Pearson's product-moment correlation
##
## data: Scatter_Mat$OverallQual and Scatter_Mat$SalePrice
## t = 49.364, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.7780752 0.8032204
## sample estimates:
## cor
## 0.7909816
The family-wise error rate is the probability of making at least one Type I error across a series of hypothesis tests. I do not believe this is a concern for the tests of OverallQual and GrLivArea against SalePrice, since both correlations are high. For LotArea I was more worried, since its correlation with SalePrice is rather low (0.26), so that is the test where a Type I error would seem most plausible. However, a weak correlation is not the same as a false positive: all three p-values are below 2.2e-16, so each null hypothesis would still be rejected even after a Bonferroni correction for the three tests.
## Invert the correlation matrix to get the precision matrix; its diagonal holds the
## variance inflation factors (according to the problem). The result is printed as it is assigned.
(Inverse <- solve(res))
## GrLivArea OverallQual SalePrice LotArea
## GrLivArea 2.0533651 -0.2314884 -1.2208648 -0.1936637
## OverallQual -0.2314884 2.7811474 -2.1219451 0.3265081
## SalePrice -1.2208648 -2.1219451 3.6539253 -0.4183207
## LotArea -0.1936637 0.3265081 -0.4183207 1.1267807
## Precision matrix times correlation matrix: the identity matrix, up to floating-point error.
Inverse %*% res
## GrLivArea OverallQual SalePrice LotArea
## GrLivArea 1.000000e+00 -6.591949e-17 -1.595946e-16 2.775558e-17
## OverallQual 1.110223e-16 1.000000e+00 2.081668e-16 0.000000e+00
## SalePrice -2.220446e-16 -6.245005e-17 1.000000e+00 0.000000e+00
## LotArea -5.551115e-17 -8.326673e-17 -5.551115e-17 1.000000e+00
## Same product in the other order: again the identity, up to floating-point error.
res %*% Inverse
## GrLivArea OverallQual SalePrice LotArea
## GrLivArea 1.000000e+00 -1.110223e-16 2.081668e-16 0.000000e+00
## OverallQual 4.510281e-17 1.000000e+00 -6.938894e-17 -2.775558e-17
## SalePrice -1.595946e-16 -2.359224e-16 1.000000e+00 0.000000e+00
## LotArea -2.775558e-17 -5.551115e-17 5.551115e-17 1.000000e+00
## LU decomposition of the correlation matrix: the result holds the lower ($L) and upper ($U) triangular factors.
correl2 <- matrixcalc::lu.decomposition(res)
correl2
## $L
## [,1] [,2] [,3] [,4]
## [1,] 1.0000000 0.00000000 0.0000000 0
## [2,] 0.5930074 1.00000000 0.0000000 0
## [3,] 0.7086245 0.57186163 1.0000000 0
## [4,] 0.2631162 -0.07746542 0.3712529 1
##
## $U
## [,1] [,2] [,3] [,4]
## [1,] 1 5.930074e-01 0.7086245 0.2631162
## [2,] 0 6.483422e-01 0.3707620 -0.0502241
## [3,] 0 0.000000e+00 0.2858268 0.1061140
## [4,] 0 -6.938894e-18 0.0000000 0.8874841
### First-floor square footage (1stFlrSF) looks right-skewed, so I will use this variable.
hist(Training$`1stFlrSF`)
## Check the class of this column, since fitdistr() takes numeric values; the column has no zero values (its minimum is 334).
class(Training$`1stFlrSF`)
## [1] "numeric"
#### Fit an exponential probability density function to 1stFlrSF with fitdistr() from the MASS package.
epdf <- MASS::fitdistr(
  x = Training$`1stFlrSF`,
  densfun = "exponential"
)
## The fitted rate (lambda) is used as the rate parameter in the steps that follow.
epdf$estimate
## rate
## 0.0008601213
## Draw 1000 samples from an exponential distribution with the fitted rate (seeded for reproducibility).
set.seed(149)
exp_dist <- rexp(n = 1000, rate = epdf$estimate)
### Histogram of the original variable:
hist(Training$`1stFlrSF`)
### Histogram of the simulated exponential sample:
hist(exp_dist)
The histogram of the sample simulated with the fitted rate (lambda) shows a much more pronounced right-tail skew than the original data, and its bins are wider than those of the original histogram.
## 5th percentile of the fitted exponential distribution
qexp(p = 0.05, rate = epdf$estimate)
## [1] 59.63495
## 95th percentile of the fitted exponential distribution
qexp(p = 0.95, rate = epdf$estimate)
## [1] 3482.918
## Confidence interval for the mean of `data`, assuming normality (z-based).
##
## Arguments:
##   data       vector of observations; must be non-empty.
##   variance   variance used for the standard error; defaults to the sample
##              variance of `data`.
##   conf.level confidence level, a single number strictly between 0 and 1.
##
## Returns a length-2 numeric vector c(lower, upper), computed as
##   mean(data) -/+ z * sqrt(variance / length(data)).
norm.interval <- function(data, variance = var(data), conf.level = 0.95) {
  ## Fail early instead of silently returning NaN/NA for unusable input.
  stopifnot(
    length(data) > 0,
    is.numeric(conf.level), length(conf.level) == 1,
    !is.na(conf.level), conf.level > 0, conf.level < 1
  )
  ## Upper-tail critical value of the standard normal for the requested level.
  z <- qnorm((1 - conf.level) / 2, lower.tail = FALSE)
  xbar <- mean(data)
  ## Standard error of the mean.
  sdx <- sqrt(variance / length(data))
  c(xbar - z * sdx, xbar + z * sdx)
}
## 95% normal-theory confidence interval for the mean of 1stFlrSF (variance defaults to var(data)).
norm.interval(data = Training$`1stFlrSF`, conf.level = 0.95)
## [1] 1142.797 1182.457
Citation: https://pages.stat.wisc.edu/~yandell/st571/R/append7.pdf
## Empirical 5th percentile of the observed 1stFlrSF values
quantile(Training$`1stFlrSF`, probs = 0.05)
## 5%
## 672.95
## Empirical 95th percentile of the observed 1stFlrSF values
quantile(Training$`1stFlrSF`, probs = 0.95)
## 95%
## 1831.25
I believe the fitted model properly generated values that follow an exponential distribution, but the first-floor square footage column is not that strongly right-skewed. The fitted exponential places its 5th and 95th percentiles at about 59.6 and 3482.9, a much wider range than the empirical 5th and 95th percentiles (672.95 and 1831.25). The normal-based 95% confidence interval (1142.8 to 1182.5) is far narrower than either, because it is an interval for the mean rather than for individual values, so the empirical 95th percentile lies well above it. The simulated sample also produces many more values near 0 than I would expect, so the exponential fit does not look accurate compared with the original data.
For this part I will handpick a set of predictors that I think make sense for determining the price of a house, put them in a linear regression model, and then eliminate predictors step by step until all remaining predictors are statistically significant (i.e. have small p-values).
From the previous problem I discovered that SalePrice is highly correlated with GrLivArea and OverallQual, so I will include those in my lm model along with other handpicked predictors that I think make sense when pricing a house.
## Recode Neighborhood from character labels to integer codes.
## NOTE(review): Neighborhood is a nominal variable; integer codes make lm() treat it as a
## numeric scale with an arbitrary ordering. Consider keeping it as a factor instead.
Training$Neighborhood <- as.integer(factor(Training$Neighborhood))
head(Training$Neighborhood)
## [1] 6 25 6 7 14 12
## Recode Electrical the same way (the same caveat applies).
Training$Electrical <- as.integer(factor(Training$Electrical))
head(Training$Electrical)
## [1] 5 5 5 5 5 5
## Full model: SalePrice regressed on the twelve handpicked predictors.
lm.model <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageArea + OverallCond + BsmtUnfSF +
    YearBuilt + `1stFlrSF` + Electrical + Neighborhood + OpenPorchSF +
    WoodDeckSF + LotArea,
  data = Training
)
summary(lm.model)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageArea +
## OverallCond + BsmtUnfSF + YearBuilt + `1stFlrSF` + Electrical +
## Neighborhood + OpenPorchSF + WoodDeckSF + LotArea, data = Training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -487504 -18329 -1888 14014 284690
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.045e+06 9.448e+04 -11.057 < 2e-16 ***
## OverallQual 2.235e+04 1.147e+03 19.484 < 2e-16 ***
## GrLivArea 4.497e+01 2.714e+00 16.571 < 2e-16 ***
## GarageArea 4.014e+01 6.085e+00 6.597 5.88e-11 ***
## OverallCond 5.804e+03 9.983e+02 5.814 7.50e-09 ***
## BsmtUnfSF -1.072e+01 2.416e+00 -4.438 9.75e-06 ***
## YearBuilt 4.758e+02 4.859e+01 9.792 < 2e-16 ***
## `1stFlrSF` 2.997e+01 3.390e+00 8.842 < 2e-16 ***
## Electrical -1.706e+03 1.014e+03 -1.682 0.092757 .
## Neighborhood 1.382e+02 1.704e+02 0.811 0.417472
## OpenPorchSF 1.035e+01 1.586e+01 0.653 0.514076
## WoodDeckSF 3.097e+01 8.324e+00 3.720 0.000207 ***
## LotArea 5.603e-01 1.048e-01 5.348 1.03e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37140 on 1446 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.7833, Adjusted R-squared: 0.7815
## F-statistic: 435.6 on 12 and 1446 DF, p-value: < 2.2e-16
I can see that the R-squared value is about 78% (0.7833), which is rather high, but we have to delete some variables from the model since some of them aren't significant. So I will remove OpenPorchSF, Neighborhood and Electrical, which are not significant; the reduced model below also leaves out BsmtUnfSF, WoodDeckSF and LotArea.
## Reduced model: SalePrice regressed on six of the original predictors.
lm.model2 <- lm(
  SalePrice ~ OverallQual + GrLivArea + GarageArea + OverallCond + YearBuilt +
    `1stFlrSF`,
  data = Training
)
summary(lm.model2)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageArea +
## OverallCond + YearBuilt + `1stFlrSF`, data = Training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -470044 -19359 -2336 15443 286637
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.074e+06 9.161e+04 -11.725 < 2e-16 ***
## OverallQual 2.085e+04 1.143e+03 18.248 < 2e-16 ***
## GrLivArea 4.851e+01 2.690e+00 18.034 < 2e-16 ***
## GarageArea 4.400e+01 6.172e+00 7.129 1.58e-12 ***
## OverallCond 6.491e+03 9.900e+02 6.556 7.64e-11 ***
## YearBuilt 4.872e+02 4.685e+01 10.401 < 2e-16 ***
## `1stFlrSF` 3.169e+01 3.315e+00 9.560 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38050 on 1453 degrees of freedom
## Multiple R-squared: 0.7716, Adjusted R-squared: 0.7706
## F-statistic: 818.1 on 6 and 1453 DF, p-value: < 2.2e-16
That looks a lot better and cleaner. Even though the R-squared value is slightly lower, I tried to use predictors that are not strongly related to each other; these predictors explain about 77% of the variability in SalePrice, and all of them are significant.
## Residuals-versus-fitted plot for the reduced model (fitted values on x, residuals on y).
plot(fitted(lm.model2),resid(lm.model2))
These residuals worry me: there seems to be some sort of pattern in the residuals-versus-fitted plot, which is not a good sign that this model will be helpful, and many of the fitted values cluster in the same range.
## Normal Q-Q plot of the reduced model's residuals, with the reference line added.
stats::qqnorm(residuals(lm.model2))
stats::qqline(residuals(lm.model2))
My Kaggle name is Al Haque and my kaggle score is 0.74556
https://www.statisticshowto.com/familywise-error-rate/ https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/cor.test https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html http://www.sthda.com/english/articles/40-regression-analysis/163-regression-with-categorical-variables-dummy-coding-essentials-in-r/ https://quantifyinghealth.com/variables-to-include-in-regression/