library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6      v purrr   0.3.4 
## v tibble  3.1.8      v dplyr   1.0.10
## v tidyr   1.2.1      v stringr 1.4.1 
## v readr   2.1.2      v forcats 0.5.2
## Warning: package 'ggplot2' was built under R version 4.1.3
## Warning: package 'tibble' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'readr' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## Warning: package 'stringr' was built under R version 4.1.3
## Warning: package 'forcats' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(matlib)
## Warning: package 'matlib' was built under R version 4.1.3
library(matrixcalc)
## Warning: package 'matrixcalc' was built under R version 4.1.3
## 
## Attaching package: 'matrixcalc'
## 
## The following object is masked from 'package:matlib':
## 
##     vec
library(MASS)
## Warning: package 'MASS' was built under R version 4.1.3
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## Goal: predict SalePrice (the target variable); every other column is a candidate predictor.
Training <- read_csv(
  file = "https://raw.githubusercontent.com/AldataSci/FinalProject-2-605-/main/train.csv",
  show_col_types = FALSE
)

Descriptive Statistics:

## The data has 81 columns, a mix of numeric (num) and character (chr) types.
str(Training)
## spec_tbl_df [1,460 x 81] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Id           : num [1:1460] 1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : num [1:1460] 60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : chr [1:1460] "RL" "RL" "RL" "RL" ...
##  $ LotFrontage  : num [1:1460] 65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : num [1:1460] 8450 9600 11250 9550 14260 ...
##  $ Street       : chr [1:1460] "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley        : chr [1:1460] NA NA NA NA ...
##  $ LotShape     : chr [1:1460] "Reg" "Reg" "IR1" "IR1" ...
##  $ LandContour  : chr [1:1460] "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr [1:1460] "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr [1:1460] "Inside" "FR2" "Inside" "Corner" ...
##  $ LandSlope    : chr [1:1460] "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr [1:1460] "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ Condition1   : chr [1:1460] "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition2   : chr [1:1460] "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr [1:1460] "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr [1:1460] "2Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : num [1:1460] 7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : num [1:1460] 5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : num [1:1460] 2003 1976 2001 1915 2000 ...
##  $ YearRemodAdd : num [1:1460] 2003 1976 2002 1970 2000 ...
##  $ RoofStyle    : chr [1:1460] "Gable" "Gable" "Gable" "Gable" ...
##  $ RoofMatl     : chr [1:1460] "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ Exterior2nd  : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ MasVnrType   : chr [1:1460] "BrkFace" "None" "BrkFace" "None" ...
##  $ MasVnrArea   : num [1:1460] 196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : chr [1:1460] "Gd" "TA" "Gd" "TA" ...
##  $ ExterCond    : chr [1:1460] "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr [1:1460] "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ BsmtQual     : chr [1:1460] "Gd" "Gd" "Gd" "TA" ...
##  $ BsmtCond     : chr [1:1460] "TA" "TA" "TA" "Gd" ...
##  $ BsmtExposure : chr [1:1460] "No" "Gd" "Mn" "No" ...
##  $ BsmtFinType1 : chr [1:1460] "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ BsmtFinSF1   : num [1:1460] 706 978 486 216 655 ...
##  $ BsmtFinType2 : chr [1:1460] "Unf" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : num [1:1460] 0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : num [1:1460] 150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : num [1:1460] 856 1262 920 756 1145 ...
##  $ Heating      : chr [1:1460] "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr [1:1460] "Ex" "Ex" "Ex" "Gd" ...
##  $ CentralAir   : chr [1:1460] "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr [1:1460] "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ 1stFlrSF     : num [1:1460] 856 1262 920 961 1145 ...
##  $ 2ndFlrSF     : num [1:1460] 854 0 866 756 1053 ...
##  $ LowQualFinSF : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : num [1:1460] 1710 1262 1786 1717 2198 ...
##  $ BsmtFullBath : num [1:1460] 1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : num [1:1460] 0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : num [1:1460] 2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : num [1:1460] 1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : num [1:1460] 3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : num [1:1460] 1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : chr [1:1460] "Gd" "TA" "Gd" "Gd" ...
##  $ TotRmsAbvGrd : num [1:1460] 8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : chr [1:1460] "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : num [1:1460] 0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : chr [1:1460] NA "TA" "TA" "Gd" ...
##  $ GarageType   : chr [1:1460] "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ GarageYrBlt  : num [1:1460] 2003 1976 2001 1998 2000 ...
##  $ GarageFinish : chr [1:1460] "RFn" "RFn" "RFn" "Unf" ...
##  $ GarageCars   : num [1:1460] 2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : num [1:1460] 548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : chr [1:1460] "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr [1:1460] "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr [1:1460] "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : num [1:1460] 0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : num [1:1460] 61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: num [1:1460] 0 0 0 272 0 0 0 228 205 0 ...
##  $ 3SsnPorch    : num [1:1460] 0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : chr [1:1460] NA NA NA NA ...
##  $ Fence        : chr [1:1460] NA NA NA NA ...
##  $ MiscFeature  : chr [1:1460] NA NA NA NA ...
##  $ MiscVal      : num [1:1460] 0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : num [1:1460] 2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : num [1:1460] 2008 2007 2008 2006 2008 ...
##  $ SaleType     : chr [1:1460] "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr [1:1460] "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice    : num [1:1460] 208500 181500 223500 140000 250000 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Id = col_double(),
##   ..   MSSubClass = col_double(),
##   ..   MSZoning = col_character(),
##   ..   LotFrontage = col_double(),
##   ..   LotArea = col_double(),
##   ..   Street = col_character(),
##   ..   Alley = col_character(),
##   ..   LotShape = col_character(),
##   ..   LandContour = col_character(),
##   ..   Utilities = col_character(),
##   ..   LotConfig = col_character(),
##   ..   LandSlope = col_character(),
##   ..   Neighborhood = col_character(),
##   ..   Condition1 = col_character(),
##   ..   Condition2 = col_character(),
##   ..   BldgType = col_character(),
##   ..   HouseStyle = col_character(),
##   ..   OverallQual = col_double(),
##   ..   OverallCond = col_double(),
##   ..   YearBuilt = col_double(),
##   ..   YearRemodAdd = col_double(),
##   ..   RoofStyle = col_character(),
##   ..   RoofMatl = col_character(),
##   ..   Exterior1st = col_character(),
##   ..   Exterior2nd = col_character(),
##   ..   MasVnrType = col_character(),
##   ..   MasVnrArea = col_double(),
##   ..   ExterQual = col_character(),
##   ..   ExterCond = col_character(),
##   ..   Foundation = col_character(),
##   ..   BsmtQual = col_character(),
##   ..   BsmtCond = col_character(),
##   ..   BsmtExposure = col_character(),
##   ..   BsmtFinType1 = col_character(),
##   ..   BsmtFinSF1 = col_double(),
##   ..   BsmtFinType2 = col_character(),
##   ..   BsmtFinSF2 = col_double(),
##   ..   BsmtUnfSF = col_double(),
##   ..   TotalBsmtSF = col_double(),
##   ..   Heating = col_character(),
##   ..   HeatingQC = col_character(),
##   ..   CentralAir = col_character(),
##   ..   Electrical = col_character(),
##   ..   `1stFlrSF` = col_double(),
##   ..   `2ndFlrSF` = col_double(),
##   ..   LowQualFinSF = col_double(),
##   ..   GrLivArea = col_double(),
##   ..   BsmtFullBath = col_double(),
##   ..   BsmtHalfBath = col_double(),
##   ..   FullBath = col_double(),
##   ..   HalfBath = col_double(),
##   ..   BedroomAbvGr = col_double(),
##   ..   KitchenAbvGr = col_double(),
##   ..   KitchenQual = col_character(),
##   ..   TotRmsAbvGrd = col_double(),
##   ..   Functional = col_character(),
##   ..   Fireplaces = col_double(),
##   ..   FireplaceQu = col_character(),
##   ..   GarageType = col_character(),
##   ..   GarageYrBlt = col_double(),
##   ..   GarageFinish = col_character(),
##   ..   GarageCars = col_double(),
##   ..   GarageArea = col_double(),
##   ..   GarageQual = col_character(),
##   ..   GarageCond = col_character(),
##   ..   PavedDrive = col_character(),
##   ..   WoodDeckSF = col_double(),
##   ..   OpenPorchSF = col_double(),
##   ..   EnclosedPorch = col_double(),
##   ..   `3SsnPorch` = col_double(),
##   ..   ScreenPorch = col_double(),
##   ..   PoolArea = col_double(),
##   ..   PoolQC = col_character(),
##   ..   Fence = col_character(),
##   ..   MiscFeature = col_character(),
##   ..   MiscVal = col_double(),
##   ..   MoSold = col_double(),
##   ..   YrSold = col_double(),
##   ..   SaleType = col_character(),
##   ..   SaleCondition = col_character(),
##   ..   SalePrice = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
## Per-column summaries: quantiles, mean and NA counts for numeric columns; length/class/mode for character columns.
summary(Training)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical           1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##     2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      3SsnPorch       ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 
## Looking at the distributions of a few variables.
hist(Training$LotArea)

## More of the houses were built in recent years.
hist(Training$YearBuilt)

## Distribution of the target variable, SalePrice.
hist(Training$SalePrice)

## Number of bedrooms per house (BedroomAbvGr).
## NOTE(review): "AbvGr" presumably means "above grade" (above ground level), not "above average" -- confirm against the data dictionary.
summary(Training$BedroomAbvGr)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.000   3.000   2.866   3.000   8.000
## Joint proportions of the home functionality rating (rows) by sale condition (columns):
## most sales are Normal with a Typ functionality rating, and every Partial sale is rated Typ.
prop.table(table(Training$Functional,Training$SaleCondition))
##       
##             Abnorml      AdjLand       Alloca       Family       Normal
##   Maj1 0.0000000000 0.0000000000 0.0006849315 0.0000000000 0.0089041096
##   Maj2 0.0006849315 0.0000000000 0.0000000000 0.0000000000 0.0027397260
##   Min1 0.0020547945 0.0000000000 0.0000000000 0.0006849315 0.0184931507
##   Min2 0.0013698630 0.0000000000 0.0000000000 0.0000000000 0.0219178082
##   Mod  0.0006849315 0.0000000000 0.0000000000 0.0000000000 0.0095890411
##   Sev  0.0006849315 0.0000000000 0.0000000000 0.0000000000 0.0000000000
##   Typ  0.0636986301 0.0027397260 0.0075342466 0.0130136986 0.7589041096
##       
##             Partial
##   Maj1 0.0000000000
##   Maj2 0.0000000000
##   Min1 0.0000000000
##   Min2 0.0000000000
##   Mod  0.0000000000
##   Sev  0.0000000000
##   Typ  0.0856164384

Making the scatterplot matrix with 3 independent variables.

## Response (y): SalePrice, the sale price of the house.
## Predictors (x): GrLivArea, OverallQual and LotArea, i.e. variables I expect to be correlated with it.

Scatter_Mat <- dplyr::select(
  Training,
  GrLivArea, OverallQual, SalePrice, LotArea
)
## Scatterplot matrix via pairs(): GrLivArea and OverallQual are each clearly correlated with SalePrice.
pairs(Scatter_Mat, pch = 19)

Creating a correlation matrix with the 3 quantitative variables from above

## Looking at the matrix, SalePrice is strongly correlated with GrLivArea and OverallQual
## but only weakly with LotArea, which I find surprising.
res <- cor(x = Scatter_Mat)
round(res, digits = 2)
##             GrLivArea OverallQual SalePrice LotArea
## GrLivArea        1.00        0.59      0.71    0.26
## OverallQual      0.59        1.00      0.79    0.11
## SalePrice        0.71        0.79      1.00    0.26
## LotArea          0.26        0.11      0.26    1.00

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval.

## H0: the true correlation between GrLivArea and SalePrice is 0.
## We reject H0 (p-value < 2.2e-16); the estimated correlation is 0.71.
corr <- cor.test(Scatter_Mat$GrLivArea, Scatter_Mat$SalePrice, method = "pearson",conf.level = 0.80)
corr
## 
##  Pearson's product-moment correlation
## 
## data:  Scatter_Mat$GrLivArea and Scatter_Mat$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6915087 0.7249450
## sample estimates:
##       cor 
## 0.7086245
## H0: the true correlation between LotArea and SalePrice is 0.
## We reject H0 (p-value < 2.2e-16); the estimated correlation is 0.26.
corr1 <- cor.test(Scatter_Mat$LotArea, Scatter_Mat$SalePrice, method = "pearson",conf.level = 0.80)
corr1
## 
##  Pearson's product-moment correlation
## 
## data:  Scatter_Mat$LotArea and Scatter_Mat$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2323391 0.2947946
## sample estimates:
##       cor 
## 0.2638434
## H0: the true correlation between OverallQual and SalePrice is 0.
## We reject H0 (p-value < 2.2e-16); the estimated correlation is 0.79.
corr2 <- cor.test(Scatter_Mat$OverallQual, Scatter_Mat$SalePrice, method = "pearson",conf.level = 0.80)
corr2
## 
##  Pearson's product-moment correlation
## 
## data:  Scatter_Mat$OverallQual and Scatter_Mat$SalePrice
## t = 49.364, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.7780752 0.8032204
## sample estimates:
##       cor 
## 0.7909816

Discuss the meaning of your analysis:

The familywise error rate is the probability of making at least one Type I error in a series of hypothesis tests. I believe that is unlikely to be an issue for the tests involving OverallQual and GrLivArea, since both are highly correlated with SalePrice, but it may be an issue for LotArea, since its correlation with SalePrice is rather low. So that is the hypothesis test in which I may have made a Type I error, since the correlation is only 0.26. Note, however, that all three p-values are below 2.2e-16, so each result would remain significant even after a Bonferroni correction for three tests.

Linear Algebra and Correlation:

Invert our correlation matrix:

## Inverse of the correlation matrix, i.e. the precision matrix; according to the problem
## statement its diagonal holds the variance inflation factors.
Inverse <- solve(res)
Inverse
##              GrLivArea OverallQual  SalePrice    LotArea
## GrLivArea    2.0533651  -0.2314884 -1.2208648 -0.1936637
## OverallQual -0.2314884   2.7811474 -2.1219451  0.3265081
## SalePrice   -1.2208648  -2.1219451  3.6539253 -0.4183207
## LotArea     -0.1936637   0.3265081 -0.4183207  1.1267807
Multiply the precision with the correlation matrix
## Should give the identity matrix, up to floating-point rounding error.
Inverse %*% res
##                 GrLivArea   OverallQual     SalePrice      LotArea
## GrLivArea    1.000000e+00 -6.591949e-17 -1.595946e-16 2.775558e-17
## OverallQual  1.110223e-16  1.000000e+00  2.081668e-16 0.000000e+00
## SalePrice   -2.220446e-16 -6.245005e-17  1.000000e+00 0.000000e+00
## LotArea     -5.551115e-17 -8.326673e-17 -5.551115e-17 1.000000e+00

Multiply the correlation matrix with the precision matrix

## Should also give the identity matrix, up to floating-point rounding error.
res %*% Inverse
##                 GrLivArea   OverallQual     SalePrice       LotArea
## GrLivArea    1.000000e+00 -1.110223e-16  2.081668e-16  0.000000e+00
## OverallQual  4.510281e-17  1.000000e+00 -6.938894e-17 -2.775558e-17
## SalePrice   -1.595946e-16 -2.359224e-16  1.000000e+00  0.000000e+00
## LotArea     -2.775558e-17 -5.551115e-17  5.551115e-17  1.000000e+00

Performing LU decomposition on the correlation matrix

## LU decomposition of the correlation matrix: the result is a list with a
## lower-triangular factor $L and an upper-triangular factor $U.
correl2 <- lu.decomposition(res)
correl2
## $L
##           [,1]        [,2]      [,3] [,4]
## [1,] 1.0000000  0.00000000 0.0000000    0
## [2,] 0.5930074  1.00000000 0.0000000    0
## [3,] 0.7086245  0.57186163 1.0000000    0
## [4,] 0.2631162 -0.07746542 0.3712529    1
## 
## $U
##      [,1]          [,2]      [,3]       [,4]
## [1,]    1  5.930074e-01 0.7086245  0.2631162
## [2,]    0  6.483422e-01 0.3707620 -0.0502241
## [3,]    0  0.000000e+00 0.2858268  0.1061140
## [4,]    0 -6.938894e-18 0.0000000  0.8874841

Calculus-Based Probability and Statistics:

### 1stFlrSF (first-floor square footage) looks right-skewed, so I will use this variable.
hist(Training$`1stFlrSF`)

## Check the class of this column, since fitdistr() takes numeric values; the column also has no zero values.
class(Training$`1stFlrSF`)
## [1] "numeric"
#### Use fitdistr() from the MASS package (loaded above) to fit an exponential probability density function.
epdf <- fitdistr(Training$`1stFlrSF`,densfun = "exponential")
## The fitted estimate is the rate (lambda) we will use.
epdf$estimate
##         rate 
## 0.0008601213
## Draw 1000 samples from an exponential distribution with the fitted lambda (seeded for reproducibility).
set.seed(149)
exp_dist <- rexp(1000,epdf$estimate)
### Histogram of the original variable:
hist(Training$`1stFlrSF`)

### Histogram of the simulated exponential sample (exp_dist):
hist(exp_dist)

The histogram of the simulated exponential sample shows a more pronounced right-tail skew than the original data, and its bins are wider than those of the original.

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF)

## 5th percentile of the fitted exponential distribution
qexp(p = 0.05, rate = epdf$estimate)
## [1] 59.63495
## 95th percentile of the fitted exponential distribution
qexp(p = 0.95, rate = epdf$estimate)
## [1] 3482.918

Also generate a 95% confidence interval from the empirical data, assuming normality

## A function (found online, see the citation below) that computes a two-sided
## confidence interval for the mean, assuming normality (z-based).
##
## data       : numeric vector of observations (must be non-empty).
## variance   : variance used for the standard error; defaults to the sample
##              variance of `data`.
## conf.level : confidence level, a single number between 0 and 1 (default 0.95).
##
## Returns a length-2 numeric vector c(lower, upper).
## Stops with an error on empty/non-numeric data or an out-of-range conf.level,
## instead of silently returning NaN.
norm.interval <- function(data, variance = var(data), conf.level = 0.95) {
  stopifnot(
    is.numeric(data) || is.logical(data),
    length(data) > 0,
    is.numeric(conf.level),
    length(conf.level) == 1,
    conf.level >= 0,
    conf.level <= 1
  )
  # Upper-tail critical value of the standard normal for the requested level.
  z <- qnorm((1 - conf.level) / 2, lower.tail = FALSE)
  xbar <- mean(data)
  # Standard error of the mean.
  sdx <- sqrt(variance / length(data))
  c(xbar - z * sdx, xbar + z * sdx)
}
norm.interval(
  data = Training$`1stFlrSF`,
  variance = var(Training$`1stFlrSF`),
  conf.level = 0.95
)
## [1] 1142.797 1182.457

Citation: https://pages.stat.wisc.edu/~yandell/st571/R/append7.pdf

Finally, provide the empirical 5th percentile and 95th percentile of the data

## Empirical 5th percentile of 1stFlrSF
quantile(Training$`1stFlrSF`, probs = 0.05)
##     5% 
## 672.95
## Empirical 95th percentile of 1stFlrSF
quantile(Training$`1stFlrSF`, probs = 0.95)
##     95% 
## 1831.25

Discuss:

I believe the model properly generated values that follow an exponential distribution, but the first-floor square footage column isn’t that right-skewed. The empirical 95th percentile (1831.25) is far below the exponential model’s 95th percentile (3482.92), and the empirical 5th percentile (672.95) is far above the model’s (59.63); the empirical 95th percentile also lies well above the 95% confidence interval (1142.80 to 1182.46), which is narrow because it describes the mean rather than individual houses. The simulated sample also produced many more values near 0 than I would expect, so the exponential fit doesn’t look accurate compared to the original data.

Part 4 Linear Regression Model

For this part I will handpick a set of predictors that I think make sense in determining the price of a house, put them in a linear regression model, and then remove predictors step by step until all the remaining predictors are significant (i.e. have small p-values)…

Building a Linear Model

From the previous problem I discovered that SalePrice is highly correlated with GrLivArea and OverallQual, so I will include those in my lm model along with other handpicked predictors that I think make sense when pricing a house.

## Convert this categorical column into integer codes.
## NOTE(review): as.integer(as.factor(...)) numbers the levels in sorted order, so lm() below treats
## Neighborhood as a single numeric score rather than a set of categories -- consider keeping it a factor.
Training$Neighborhood <- as.integer(as.factor(Training$Neighborhood))
head(Training$Neighborhood)
## [1]  6 25  6  7 14 12
## Convert this categorical column into integer codes (same caveat as for Neighborhood).
Training$Electrical <- as.integer(as.factor(Training$Electrical))
head(Training$Electrical)
## [1] 5 5 5 5 5 5
## Full model: SalePrice regressed on twelve hand-picked predictors.
lm.model <- lm(SalePrice~OverallQual+GrLivArea+GarageArea+OverallCond+BsmtUnfSF+YearBuilt+`1stFlrSF`+Electrical+Neighborhood+OpenPorchSF+WoodDeckSF+LotArea,data=Training)
summary(lm.model)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageArea + 
##     OverallCond + BsmtUnfSF + YearBuilt + `1stFlrSF` + Electrical + 
##     Neighborhood + OpenPorchSF + WoodDeckSF + LotArea, data = Training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -487504  -18329   -1888   14014  284690 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.045e+06  9.448e+04 -11.057  < 2e-16 ***
## OverallQual   2.235e+04  1.147e+03  19.484  < 2e-16 ***
## GrLivArea     4.497e+01  2.714e+00  16.571  < 2e-16 ***
## GarageArea    4.014e+01  6.085e+00   6.597 5.88e-11 ***
## OverallCond   5.804e+03  9.983e+02   5.814 7.50e-09 ***
## BsmtUnfSF    -1.072e+01  2.416e+00  -4.438 9.75e-06 ***
## YearBuilt     4.758e+02  4.859e+01   9.792  < 2e-16 ***
## `1stFlrSF`    2.997e+01  3.390e+00   8.842  < 2e-16 ***
## Electrical   -1.706e+03  1.014e+03  -1.682 0.092757 .  
## Neighborhood  1.382e+02  1.704e+02   0.811 0.417472    
## OpenPorchSF   1.035e+01  1.586e+01   0.653 0.514076    
## WoodDeckSF    3.097e+01  8.324e+00   3.720 0.000207 ***
## LotArea       5.603e-01  1.048e-01   5.348 1.03e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37140 on 1446 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.7833, Adjusted R-squared:  0.7815 
## F-statistic: 435.6 on 12 and 1446 DF,  p-value: < 2.2e-16

I can see that the R-squared value is 78%, which is rather high, but we have to delete some variables from the model since some of them aren’t significant. So I will remove OpenPorchSF, Neighborhood and Electrical, which are not significant (the next model also drops BsmtUnfSF, WoodDeckSF and LotArea).

## Reduced model: keeps six of the twelve predictors used in lm.model
## (drops BsmtUnfSF, Electrical, Neighborhood, OpenPorchSF, WoodDeckSF and LotArea).
lm.model2 <- lm(SalePrice~OverallQual+GrLivArea+GarageArea+OverallCond+YearBuilt+`1stFlrSF`,data=Training)
summary(lm.model2)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageArea + 
##     OverallCond + YearBuilt + `1stFlrSF`, data = Training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -470044  -19359   -2336   15443  286637 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.074e+06  9.161e+04 -11.725  < 2e-16 ***
## OverallQual  2.085e+04  1.143e+03  18.248  < 2e-16 ***
## GrLivArea    4.851e+01  2.690e+00  18.034  < 2e-16 ***
## GarageArea   4.400e+01  6.172e+00   7.129 1.58e-12 ***
## OverallCond  6.491e+03  9.900e+02   6.556 7.64e-11 ***
## YearBuilt    4.872e+02  4.685e+01  10.401  < 2e-16 ***
## `1stFlrSF`   3.169e+01  3.315e+00   9.560  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38050 on 1453 degrees of freedom
## Multiple R-squared:  0.7716, Adjusted R-squared:  0.7706 
## F-statistic: 818.1 on 6 and 1453 DF,  p-value: < 2.2e-16

That looks a lot better and cleaner. Even though our R-squared value is slightly smaller, I’ve tried to use predictors that weren’t related to each other, and these predictors explain 77% of the variability in our data; all of the predictors are significant.

Residual Analysis:

## Residuals vs. fitted values for lm.model2.
plot(fitted(lm.model2),resid(lm.model2))

These residuals worry me: there seems to be some sort of pattern in them, which isn’t a good sign that this model will be helpful, and the model predicts around the same values.

## Normal Q-Q plot of the lm.model2 residuals, with the reference line added.
qqnorm(residuals(lm.model2))
qqline(residuals(lm.model2))