library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.84 loaded
library(Matrix)
library(ggplot2)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select

You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following:

# Load the Kaggle "House Prices" training and test sets from their GitHub
# mirrors. Character columns are kept as strings, not converted to factors.
# NOTE(review): the two files are read from different repositories -- confirm
# both are copies of the same competition data.
linktrain <- "https://raw.githubusercontent.com/Fyoun123/602Final/master/train.csv"
train <- read.csv(linktrain, header = TRUE, stringsAsFactors = FALSE)
linktest <- "https://raw.githubusercontent.com/sortega7878/DATA605CSV/master/test.csv"
test <- read.csv(linktest, header = TRUE, stringsAsFactors = FALSE)

Data test

# Preview the first six rows of the training data (SalePrice is the last column).
head(train)
##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1  1         60       RL          65    8450   Pave  <NA>      Reg
## 2  2         20       RL          80    9600   Pave  <NA>      Reg
## 3  3         60       RL          68   11250   Pave  <NA>      IR1
## 4  4         70       RL          60    9550   Pave  <NA>      IR1
## 5  5         60       RL          84   14260   Pave  <NA>      IR1
## 6  6         50       RL          85   14115   Pave  <NA>      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 2         Lvl    AllPub       FR2       Gtl      Veenker      Feedr
## 3         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 4         Lvl    AllPub    Corner       Gtl      Crawfor       Norm
## 5         Lvl    AllPub       FR2       Gtl      NoRidge       Norm
## 6         Lvl    AllPub    Inside       Gtl      Mitchel       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     2Story           7           5      2003
## 2       Norm     1Fam     1Story           6           8      1976
## 3       Norm     1Fam     2Story           7           5      2001
## 4       Norm     1Fam     2Story           7           5      1915
## 5       Norm     1Fam     2Story           8           5      2000
## 6       Norm     1Fam     1.5Fin           5           5      1993
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         2003     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 2         1976     Gable  CompShg     MetalSd     MetalSd       None
## 3         2002     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 4         1970     Gable  CompShg     Wd Sdng     Wd Shng       None
## 5         2000     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 6         1995     Gable  CompShg     VinylSd     VinylSd       None
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1        196        Gd        TA      PConc       Gd       TA           No
## 2          0        TA        TA     CBlock       Gd       TA           Gd
## 3        162        Gd        TA      PConc       Gd       TA           Mn
## 4          0        TA        TA     BrkTil       TA       Gd           No
## 5        350        Gd        TA      PConc       Gd       TA           Av
## 6          0        TA        TA       Wood       Gd       TA           No
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          GLQ        706          Unf          0       150         856
## 2          ALQ        978          Unf          0       284        1262
## 3          GLQ        486          Unf          0       434         920
## 4          ALQ        216          Unf          0       540         756
## 5          GLQ        655          Unf          0       490        1145
## 6          GLQ        732          Unf          0        64         796
##   Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1    GasA        Ex          Y      SBrkr       856       854            0
## 2    GasA        Ex          Y      SBrkr      1262         0            0
## 3    GasA        Ex          Y      SBrkr       920       866            0
## 4    GasA        Gd          Y      SBrkr       961       756            0
## 5    GasA        Ex          Y      SBrkr      1145      1053            0
## 6    GasA        Ex          Y      SBrkr       796       566            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1      1710            1            0        2        1            3
## 2      1262            0            1        2        0            3
## 3      1786            1            0        2        1            3
## 4      1717            1            0        1        0            3
## 5      2198            1            0        2        1            4
## 6      1362            1            0        1        1            1
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          Gd            8        Typ          0        <NA>
## 2            1          TA            6        Typ          1          TA
## 3            1          Gd            6        Typ          1          TA
## 4            1          Gd            7        Typ          1          Gd
## 5            1          Gd            9        Typ          1          TA
## 6            1          TA            5        Typ          0        <NA>
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Attchd        2003          RFn          2        548         TA
## 2     Attchd        1976          RFn          2        460         TA
## 3     Attchd        2001          RFn          2        608         TA
## 4     Detchd        1998          Unf          3        642         TA
## 5     Attchd        2000          RFn          3        836         TA
## 6     Attchd        1993          Unf          2        480         TA
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1         TA          Y          0          61             0          0
## 2         TA          Y        298           0             0          0
## 3         TA          Y          0          42             0          0
## 4         TA          Y          0          35           272          0
## 5         TA          Y        192          84             0          0
## 6         TA          Y         40          30             0        320
##   ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1           0        0   <NA>  <NA>        <NA>       0      2   2008
## 2           0        0   <NA>  <NA>        <NA>       0      5   2007
## 3           0        0   <NA>  <NA>        <NA>       0      9   2008
## 4           0        0   <NA>  <NA>        <NA>       0      2   2006
## 5           0        0   <NA>  <NA>        <NA>       0     12   2008
## 6           0        0   <NA> MnPrv        Shed     700     10   2009
##   SaleType SaleCondition SalePrice
## 1       WD        Normal    208500
## 2       WD        Normal    181500
## 3       WD        Normal    223500
## 4       WD       Abnorml    140000
## 5       WD        Normal    250000
## 6       WD        Normal    143000
# Preview the first six rows of the test data (it has no SalePrice column).
head(test)
##     Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1461         20       RH          80   11622   Pave  <NA>      Reg
## 2 1462         20       RL          81   14267   Pave  <NA>      IR1
## 3 1463         60       RL          74   13830   Pave  <NA>      IR1
## 4 1464         60       RL          78    9978   Pave  <NA>      IR1
## 5 1465        120       RL          43    5005   Pave  <NA>      IR1
## 6 1466         60       RL          75   10000   Pave  <NA>      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl        NAmes      Feedr
## 2         Lvl    AllPub    Corner       Gtl        NAmes       Norm
## 3         Lvl    AllPub    Inside       Gtl      Gilbert       Norm
## 4         Lvl    AllPub    Inside       Gtl      Gilbert       Norm
## 5         HLS    AllPub    Inside       Gtl      StoneBr       Norm
## 6         Lvl    AllPub    Corner       Gtl      Gilbert       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     1Story           5           6      1961
## 2       Norm     1Fam     1Story           6           6      1958
## 3       Norm     1Fam     2Story           5           5      1997
## 4       Norm     1Fam     2Story           6           6      1998
## 5       Norm   TwnhsE     1Story           8           5      1992
## 6       Norm     1Fam     2Story           6           5      1993
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         1961     Gable  CompShg     VinylSd     VinylSd       None
## 2         1958       Hip  CompShg     Wd Sdng     Wd Sdng    BrkFace
## 3         1998     Gable  CompShg     VinylSd     VinylSd       None
## 4         1998     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 5         1992     Gable  CompShg     HdBoard     HdBoard       None
## 6         1994     Gable  CompShg     HdBoard     HdBoard       None
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1          0        TA        TA     CBlock       TA       TA           No
## 2        108        TA        TA     CBlock       TA       TA           No
## 3          0        TA        TA      PConc       Gd       TA           No
## 4         20        TA        TA      PConc       TA       TA           No
## 5          0        Gd        TA      PConc       Gd       TA           No
## 6          0        TA        TA      PConc       Gd       TA           No
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          Rec        468          LwQ        144       270         882
## 2          ALQ        923          Unf          0       406        1329
## 3          GLQ        791          Unf          0       137         928
## 4          GLQ        602          Unf          0       324         926
## 5          ALQ        263          Unf          0      1017        1280
## 6          Unf          0          Unf          0       763         763
##   Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1    GasA        TA          Y      SBrkr       896         0            0
## 2    GasA        TA          Y      SBrkr      1329         0            0
## 3    GasA        Gd          Y      SBrkr       928       701            0
## 4    GasA        Ex          Y      SBrkr       926       678            0
## 5    GasA        Ex          Y      SBrkr      1280         0            0
## 6    GasA        Gd          Y      SBrkr       763       892            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1       896            0            0        1        0            2
## 2      1329            0            0        1        1            3
## 3      1629            0            0        2        1            3
## 4      1604            0            0        2        1            3
## 5      1280            0            0        2        0            2
## 6      1655            0            0        2        1            3
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          TA            5        Typ          0        <NA>
## 2            1          Gd            6        Typ          0        <NA>
## 3            1          TA            6        Typ          1          TA
## 4            1          Gd            7        Typ          1          Gd
## 5            1          Gd            5        Typ          0        <NA>
## 6            1          TA            7        Typ          1          TA
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Attchd        1961          Unf          1        730         TA
## 2     Attchd        1958          Unf          1        312         TA
## 3     Attchd        1997          Fin          2        482         TA
## 4     Attchd        1998          Fin          2        470         TA
## 5     Attchd        1992          RFn          2        506         TA
## 6     Attchd        1993          Fin          2        440         TA
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1         TA          Y        140           0             0          0
## 2         TA          Y        393          36             0          0
## 3         TA          Y        212          34             0          0
## 4         TA          Y        360          36             0          0
## 5         TA          Y          0          82             0          0
## 6         TA          Y        157          84             0          0
##   ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1         120        0   <NA> MnPrv        <NA>       0      6   2010
## 2           0        0   <NA>  <NA>        Gar2   12500      6   2010
## 3           0        0   <NA> MnPrv        <NA>       0      3   2010
## 4           0        0   <NA>  <NA>        <NA>       0      6   2010
## 5         144        0   <NA>  <NA>        <NA>       0      1   2010
## 6           0        0   <NA>  <NA>        <NA>       0      4   2010
##   SaleType SaleCondition
## 1       WD        Normal
## 2       WD        Normal
## 3       WD        Normal
## 4       WD        Normal
## 5       WD        Normal
## 6       WD        Normal

Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

# Keep only the numeric columns of the training data, then print summary
# statistics for each of them.
train_n <- train[vapply(train, is.numeric, logical(1))]
psych::describe(train_n)
##               vars    n      mean       sd   median   trimmed      mad
## Id               1 1460    730.50   421.61    730.5    730.50   541.15
## MSSubClass       2 1460     56.90    42.30     50.0     49.15    44.48
## LotFrontage      3 1201     70.05    24.28     69.0     68.94    16.31
## LotArea          4 1460  10516.83  9981.26   9478.5   9563.28  2962.23
## OverallQual      5 1460      6.10     1.38      6.0      6.08     1.48
## OverallCond      6 1460      5.58     1.11      5.0      5.48     0.00
## YearBuilt        7 1460   1971.27    30.20   1973.0   1974.13    37.06
## YearRemodAdd     8 1460   1984.87    20.65   1994.0   1986.37    19.27
## MasVnrArea       9 1452    103.69   181.07      0.0     63.15     0.00
## BsmtFinSF1      10 1460    443.64   456.10    383.5    386.08   568.58
## BsmtFinSF2      11 1460     46.55   161.32      0.0      1.38     0.00
## BsmtUnfSF       12 1460    567.24   441.87    477.5    519.29   426.99
## TotalBsmtSF     13 1460   1057.43   438.71    991.5   1036.70   347.67
## X1stFlrSF       14 1460   1162.63   386.59   1087.0   1129.99   347.67
## X2ndFlrSF       15 1460    346.99   436.53      0.0    285.36     0.00
## LowQualFinSF    16 1460      5.84    48.62      0.0      0.00     0.00
## GrLivArea       17 1460   1515.46   525.48   1464.0   1467.67   483.33
## BsmtFullBath    18 1460      0.43     0.52      0.0      0.39     0.00
## BsmtHalfBath    19 1460      0.06     0.24      0.0      0.00     0.00
## FullBath        20 1460      1.57     0.55      2.0      1.56     0.00
## HalfBath        21 1460      0.38     0.50      0.0      0.34     0.00
## BedroomAbvGr    22 1460      2.87     0.82      3.0      2.85     0.00
## KitchenAbvGr    23 1460      1.05     0.22      1.0      1.00     0.00
## TotRmsAbvGrd    24 1460      6.52     1.63      6.0      6.41     1.48
## Fireplaces      25 1460      0.61     0.64      1.0      0.53     1.48
## GarageYrBlt     26 1379   1978.51    24.69   1980.0   1981.07    31.13
## GarageCars      27 1460      1.77     0.75      2.0      1.77     0.00
## GarageArea      28 1460    472.98   213.80    480.0    469.81   177.91
## WoodDeckSF      29 1460     94.24   125.34      0.0     71.76     0.00
## OpenPorchSF     30 1460     46.66    66.26     25.0     33.23    37.06
## EnclosedPorch   31 1460     21.95    61.12      0.0      3.87     0.00
## X3SsnPorch      32 1460      3.41    29.32      0.0      0.00     0.00
## ScreenPorch     33 1460     15.06    55.76      0.0      0.00     0.00
## PoolArea        34 1460      2.76    40.18      0.0      0.00     0.00
## MiscVal         35 1460     43.49   496.12      0.0      0.00     0.00
## MoSold          36 1460      6.32     2.70      6.0      6.25     2.97
## YrSold          37 1460   2007.82     1.33   2008.0   2007.77     1.48
## SalePrice       38 1460 180921.20 79442.50 163000.0 170783.29 56338.80
##                 min    max  range  skew kurtosis      se
## Id                1   1460   1459  0.00    -1.20   11.03
## MSSubClass       20    190    170  1.40     1.56    1.11
## LotFrontage      21    313    292  2.16    17.34    0.70
## LotArea        1300 215245 213945 12.18   202.26  261.22
## OverallQual       1     10      9  0.22     0.09    0.04
## OverallCond       1      9      8  0.69     1.09    0.03
## YearBuilt      1872   2010    138 -0.61    -0.45    0.79
## YearRemodAdd   1950   2010     60 -0.50    -1.27    0.54
## MasVnrArea        0   1600   1600  2.66    10.03    4.75
## BsmtFinSF1        0   5644   5644  1.68    11.06   11.94
## BsmtFinSF2        0   1474   1474  4.25    20.01    4.22
## BsmtUnfSF         0   2336   2336  0.92     0.46   11.56
## TotalBsmtSF       0   6110   6110  1.52    13.18   11.48
## X1stFlrSF       334   4692   4358  1.37     5.71   10.12
## X2ndFlrSF         0   2065   2065  0.81    -0.56   11.42
## LowQualFinSF      0    572    572  8.99    82.83    1.27
## GrLivArea       334   5642   5308  1.36     4.86   13.75
## BsmtFullBath      0      3      3  0.59    -0.84    0.01
## BsmtHalfBath      0      2      2  4.09    16.31    0.01
## FullBath          0      3      3  0.04    -0.86    0.01
## HalfBath          0      2      2  0.67    -1.08    0.01
## BedroomAbvGr      0      8      8  0.21     2.21    0.02
## KitchenAbvGr      0      3      3  4.48    21.42    0.01
## TotRmsAbvGrd      2     14     12  0.67     0.87    0.04
## Fireplaces        0      3      3  0.65    -0.22    0.02
## GarageYrBlt    1900   2010    110 -0.65    -0.42    0.66
## GarageCars        0      4      4 -0.34     0.21    0.02
## GarageArea        0   1418   1418  0.18     0.90    5.60
## WoodDeckSF        0    857    857  1.54     2.97    3.28
## OpenPorchSF       0    547    547  2.36     8.44    1.73
## EnclosedPorch     0    552    552  3.08    10.37    1.60
## X3SsnPorch        0    508    508 10.28   123.06    0.77
## ScreenPorch       0    480    480  4.11    18.34    1.46
## PoolArea          0    738    738 14.80   222.19    1.05
## MiscVal           0  15500  15500 24.43   697.64   12.98
## MoSold            1     12     11  0.21    -0.41    0.07
## YrSold         2006   2010      4  0.10    -1.19    0.03
## SalePrice     34900 755000 720100  1.88     6.50 2079.11
# Univariate summaries of the three variables examined below.
# X: living area
X <- train[["GrLivArea"]]
psych::describe(X)
##    vars    n    mean     sd median trimmed    mad min  max range skew
## X1    1 1460 1515.46 525.48   1464 1467.67 483.33 334 5642  5308 1.36
##    kurtosis    se
## X1     4.86 13.75
# Y: sale price (the response variable)
Y <- train[["SalePrice"]]
psych::describe(Y)
##    vars    n     mean      sd median  trimmed     mad   min    max  range
## X1    1 1460 180921.2 79442.5 163000 170783.3 56338.8 34900 755000 720100
##    skew kurtosis      se
## X1 1.88      6.5 2079.11
# Z: number of rooms above grade
Z <- train[["TotRmsAbvGrd"]]
psych::describe(Z)
##    vars    n mean   sd median trimmed  mad min max range skew kurtosis
## X1    1 1460 6.52 1.63      6    6.41 1.48   2  14    12 0.67     0.87
##      se
## X1 0.04
# Sale price by number of rooms above grade. The room count is a small
# discrete value, so one box per count is readable.
train %>%
  ggplot(aes(x = factor(TotRmsAbvGrd), y = SalePrice)) +
  geom_boxplot() +
  labs(x = "Rooms AG")

# Sale price by living area. Living area is continuous, so it is binned into
# ten equal-count groups rather than drawing one box per distinct value.
train %>%
  ggplot(aes(x = cut_number(GrLivArea, n = 10), y = SalePrice)) +
  geom_boxplot() +
  labs(x = "Living Area") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Living area by number of rooms above grade.
train %>%
  ggplot(aes(x = factor(TotRmsAbvGrd), y = GrLivArea)) +
  geom_boxplot() +
  labs(x = "Rooms AG")

# Scatterplots against sale price. Continuous predictors stay numeric so the
# x-axis is a real scale; `colour` (not `fill`) is what tints the default
# point shape.
train %>%
  ggplot(aes(x = GrLivArea, y = SalePrice)) +
  geom_point(colour = "blue") +
  labs(x = "Living Area")

train %>%
  ggplot(aes(x = factor(TotRmsAbvGrd), y = SalePrice)) +
  geom_point(colour = "blue") +
  labs(x = "Rooms AG")

train %>%
  ggplot(aes(x = LotArea, y = SalePrice)) +
  geom_point(colour = "blue") +
  labs(x = "Lot Area")

# Scatterplot matrix of sale price and three candidate predictors.
pairs(
  train[, c("LotArea", "GrLivArea", "SalePrice", "TotRmsAbvGrd")],
  main = "Scatter Plot"
)

# Pairwise correlations of the same four variables, drawn as a correlogram.
corrplot(
  cor(
    train[, c("GrLivArea", "TotRmsAbvGrd", "LotArea", "SalePrice")],
    use = "pairwise.complete.obs"
  )
)

There is a correlation between the number of rooms above grade and sale price, which is to be expected. There is a similar correlation between living area and sale price. The correlation between lot area and sale price is not as strong.

H0: ρ = 0 (no association)

H1: ρ ≠ 0 (a correlation may exist)

# Pearson test of H0: rho = 0 for rooms above grade vs. sale price, with an
# 80% confidence interval for the correlation.
cor.test(
  x = train$TotRmsAbvGrd,
  y = train$SalePrice,
  method = "pearson",
  conf.level = 0.80
)
## 
##  Pearson's product-moment correlation
## 
## data:  train$TotRmsAbvGrd and train$SalePrice
## t = 24.099, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5092841 0.5573021
## sample estimates:
##       cor 
## 0.5337232

The p-value is very small (< 2.2e-16). The 80% confidence interval is (0.509, 0.557) and the sample correlation is 0.534.

# Pearson test of H0: rho = 0 for living area vs. sale price, with an 80%
# confidence interval for the correlation.
cor.test(
  x = train$GrLivArea,
  y = train$SalePrice,
  method = "pearson",
  conf.level = 0.80
)
## 
##  Pearson's product-moment correlation
## 
## data:  train$GrLivArea and train$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6915087 0.7249450
## sample estimates:
##       cor 
## 0.7086245

The p-value is very small (< 2.2e-16). The 80% confidence interval is (0.692, 0.725) and the sample correlation is 0.709.

# Pearson test of H0: rho = 0 for lot area vs. sale price, with an 80%
# confidence interval for the correlation.
cor.test(
  x = train$LotArea,
  y = train$SalePrice,
  method = "pearson",
  conf.level = 0.80
)
## 
##  Pearson's product-moment correlation
## 
## data:  train$LotArea and train$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2323391 0.2947946
## sample estimates:
##       cor 
## 0.2638434

The p-value is very small (< 2.2e-16). The 80% confidence interval is (0.232, 0.295) and the sample correlation is 0.264.

The p-value is nearly 0 in every test, so we can reject the null hypothesis in each case: all three variables are correlated with sale price.

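Familywise error risk is low. In statistics, the family-wise error rate is the probability of making one or more false discoveries (Type I errors) when performing multiple hypothesis tests. Here we ran three tests; at the 20% significance level implied by an 80% interval, the familywise rate could be as high as 1 − 0.8³ ≈ 0.49 if the tests were independent and every null hypothesis were true. However, all three p-values are below 2.2e-16, so each result would remain significant even after a Bonferroni correction.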

5 points. Linear Algebra and Correlation. Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

# Correlation matrix of the three chosen variables (listed in their column
# order in `train`), followed by its inverse, the precision matrix.
Lacorr <- cor(train[, c("GrLivArea", "TotRmsAbvGrd", "SalePrice")])
LAmatrix_inv <- solve(Lacorr)
LAmatrix_inv
##              GrLivArea TotRmsAbvGrd  SalePrice
## GrLivArea     4.585000   -2.8676627 -1.7185052
## TotRmsAbvGrd -2.867663    3.1918921  0.3285093
## SalePrice    -1.718505    0.3285093  2.0424418
# Correlation matrix times precision matrix: the product should be the identity
# matrix up to floating-point rounding.
Lacorr %*% LAmatrix_inv # multiply
##                  GrLivArea  TotRmsAbvGrd SalePrice
## GrLivArea     1.000000e+00 -1.665335e-16         0
## TotRmsAbvGrd -4.440892e-16  1.000000e+00         0
## SalePrice    -4.440892e-16 -2.220446e-16         1
# Precision matrix times correlation matrix: again the identity matrix up to
# floating-point rounding.
LAmatrix_inv %*% Lacorr
##                  GrLivArea  TotRmsAbvGrd     SalePrice
## GrLivArea     1.000000e+00 -4.440892e-16 -4.440892e-16
## TotRmsAbvGrd -1.665335e-16  1.000000e+00 -2.220446e-16
## SalePrice     0.000000e+00  0.000000e+00  1.000000e+00
# LU-decompose the precision matrix. Matrix::lu() pivots rows, so the general
# factorisation is A = P %*% L %*% U; expand() returns the factors L, U and P.
lu_matrix <- expand(lu(LAmatrix_inv))
lu_matrix$L
## 3 x 3 Matrix of class "dtrMatrix" (unitriangular)
##      [,1]       [,2]       [,3]      
## [1,]  1.0000000          .          .
## [2,] -0.6254444  1.0000000          .
## [3,] -0.3748103 -0.5337232  1.0000000
lu_matrix$U
## 3 x 3 Matrix of class "dtrMatrix"
##      [,1]       [,2]       [,3]      
## [1,]  4.5850000 -2.8676627 -1.7185052
## [2,]          .  1.3983284 -0.7463202
## [3,]          .          .  1.0000000
# Rebuild the matrix from its factors. The permutation factor P is included so
# the check stays valid when pivoting reorders rows; for this matrix no rows
# were reordered, so the product equals L %*% U.
lu_matrix$P %*% lu_matrix$L %*% lu_matrix$U
## 3 x 3 Matrix of class "dgeMatrix"
##           [,1]       [,2]       [,3]
## [1,]  4.585000 -2.8676627 -1.7185052
## [2,] -2.867663  3.1918921  0.3285093
## [3,] -1.718505  0.3285093  2.0424418

5 points.
Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

# Lot area is the right-skewed variable used for the distribution fit.
Calc <- train[["LotArea"]]
summary(Calc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1300    7554    9478   10517   11602  215245

The minimum value is 1300, which is already above zero, so no shift is needed.

# Histogram of the raw lot areas (frequency scale, default breaks).
hist(Calc)

# Fit an exponential distribution to lot area; the estimate is the rate, lambda.
exponential <- fitdistr(Calc, densfun = "exponential")
L <- exponential[["estimate"]]
# 1 / lambda; the printed label "rate" is carried over from the estimate's name.
Exp_l <- 1 / L
Exp_l
##     rate 
## 10516.83
# Draw 1000 values from the fitted exponential distribution. The seed is fixed
# so the sample, and the histogram drawn from it, are reproducible.
set.seed(605)
exponential_sample <- rexp(1000, rate = L) # taking 1000 samples
# Density-scale histograms of the original data and of the simulated sample.
hist(Calc, freq = FALSE, breaks = 30)

hist(exponential_sample, freq = FALSE, breaks = 30)

# 5th and 95th percentiles of the fitted exponential distribution
qexp(p = c(0.05, 0.95), rate = L)
## [1]   539.4428 31505.6013

For a two-sided 95% interval, the 0.05 tail probability is split between the two tails: 0.05 / 2 = 0.025 in each.

# Central 95% range of a normal distribution that has the sample's mean and
# standard deviation (where individual values would fall under normality).
qnorm(c(.025, .975), mean = mean(Calc), sd = sd(Calc))
## [1] -9046.092 30079.748
# 95% confidence interval for the mean, assuming normality:
# mean +/- z * standard error, with standard error = sd / sqrt(n).
mean(Calc) + qnorm(c(.025, .975)) * sd(Calc) / sqrt(length(Calc))

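The interval from -9046.1 to 30079.7 is the central 95% range of a normal distribution with the sample's mean and standard deviation, i.e. where 95% of individual lot areas would fall if the data were normal. It is not a confidence interval for the mean, which would be far narrower (mean ± 1.96 · sd / √n). The negative lower bound is impossible for an area and shows that a normal model fits this right-skewed variable poorly.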

# Empirical 5th and 95th percentiles of the original data
quantile(Calc, probs = c(0.05, 0.95))
##       5%      95% 
##  3311.70 17401.15

In the empirical data, 5% of lot areas are below 3311.70 and 5% are above 17401.15.

Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

I’m ignoring the non-numeric columns in the model below; rows with missing values in any numeric column are dropped when the model is fitted.

# Baseline gaussian GLM: SalePrice regressed on every other numeric column
# (this includes Id).
Model <- glm(formula = SalePrice ~ ., data = train_n)
summary(Model)
## 
## Call:
## glm(formula = SalePrice ~ ., data = train_n)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -442182   -16955    -2824    15125   318183  
## 
## Coefficients: (2 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -3.351e+05  1.701e+06  -0.197 0.843909    
## Id            -1.205e+00  2.658e+00  -0.453 0.650332    
## MSSubClass    -2.001e+02  3.451e+01  -5.797 8.84e-09 ***
## LotFrontage   -1.160e+02  6.126e+01  -1.894 0.058503 .  
## LotArea        5.422e-01  1.575e-01   3.442 0.000599 ***
## OverallQual    1.866e+04  1.482e+03  12.592  < 2e-16 ***
## OverallCond    5.239e+03  1.368e+03   3.830 0.000135 ***
## YearBuilt      3.164e+02  8.766e+01   3.610 0.000321 ***
## YearRemodAdd   1.194e+02  8.668e+01   1.378 0.168607    
## MasVnrArea     3.141e+01  7.022e+00   4.473 8.54e-06 ***
## BsmtFinSF1     1.736e+01  5.838e+00   2.973 0.003014 ** 
## BsmtFinSF2     8.342e+00  8.766e+00   0.952 0.341532    
## BsmtUnfSF      5.005e+00  5.277e+00   0.948 0.343173    
## TotalBsmtSF           NA         NA      NA       NA    
## X1stFlrSF      4.597e+01  7.360e+00   6.246 6.02e-10 ***
## X2ndFlrSF      4.663e+01  6.102e+00   7.641 4.72e-14 ***
## LowQualFinSF   3.341e+01  2.794e+01   1.196 0.232009    
## GrLivArea             NA         NA      NA       NA    
## BsmtFullBath   9.043e+03  3.198e+03   2.828 0.004776 ** 
## BsmtHalfBath   2.465e+03  5.073e+03   0.486 0.627135    
## FullBath       5.433e+03  3.531e+03   1.539 0.124182    
## HalfBath      -1.098e+03  3.321e+03  -0.331 0.740945    
## BedroomAbvGr  -1.022e+04  2.155e+03  -4.742 2.40e-06 ***
## KitchenAbvGr  -2.202e+04  6.710e+03  -3.282 0.001063 ** 
## TotRmsAbvGrd   5.464e+03  1.487e+03   3.674 0.000251 ***
## Fireplaces     4.372e+03  2.189e+03   1.998 0.046020 *  
## GarageYrBlt   -4.728e+01  9.106e+01  -0.519 0.603742    
## GarageCars     1.685e+04  3.491e+03   4.827 1.58e-06 ***
## GarageArea     6.274e+00  1.213e+01   0.517 0.605002    
## WoodDeckSF     2.144e+01  1.002e+01   2.139 0.032662 *  
## OpenPorchSF   -2.252e+00  1.949e+01  -0.116 0.907998    
## EnclosedPorch  7.295e+00  2.062e+01   0.354 0.723590    
## X3SsnPorch     3.349e+01  3.758e+01   0.891 0.373163    
## ScreenPorch    5.805e+01  2.041e+01   2.844 0.004532 ** 
## PoolArea      -6.052e+01  2.990e+01  -2.024 0.043204 *  
## MiscVal       -3.761e+00  6.960e+00  -0.540 0.589016    
## MoSold        -2.217e+02  4.229e+02  -0.524 0.600188    
## YrSold        -2.474e+02  8.458e+02  -0.293 0.769917    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1354264248)
## 
##     Null deviance: 7.7155e+12  on 1120  degrees of freedom
## Residual deviance: 1.4694e+12  on 1085  degrees of freedom
##   (339 observations deleted due to missingness)
## AIC: 26789
## 
## Number of Fisher Scoring iterations: 2

Taking the highest-ranked predictors, plus others that I deem to be important factors, to create the models below.

# Model A: five predictors.
ModelA <- glm(
  formula = SalePrice ~ MSSubClass + LotArea + OverallQual + OverallCond +
    YearBuilt,
  data = train_n
)
summary(ModelA)
## 
## Call:
## glm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual + 
##     OverallCond + YearBuilt, data = train_n)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -264011   -27126    -3936    19071   392919  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.757e+05  1.023e+05  -7.585 5.89e-14 ***
## MSSubClass  -1.584e+02  2.840e+01  -5.576 2.93e-08 ***
## LotArea      1.402e+00  1.210e-01  11.583  < 2e-16 ***
## OverallQual  4.031e+04  1.069e+03  37.710  < 2e-16 ***
## OverallCond  2.343e+03  1.168e+03   2.006    0.045 *  
## YearBuilt    3.510e+02  5.226e+01   6.717 2.65e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 2052712835)
## 
##     Null deviance: 9.2079e+12  on 1459  degrees of freedom
## Residual deviance: 2.9846e+12  on 1454  degrees of freedom
## AIC: 35457
## 
## Number of Fisher Scoring iterations: 2
# Model B: ten predictors covering quality, age, size and garage capacity.
ModelB <- glm(
  formula = SalePrice ~ OverallQual + YearBuilt + GarageCars + GrLivArea +
    TotalBsmtSF + TotRmsAbvGrd + BedroomAbvGr + LotArea + X2ndFlrSF +
    YearRemodAdd,
  data = train_n
)
summary(ModelB)
## 
## Call:
## glm(formula = SalePrice ~ OverallQual + YearBuilt + GarageCars + 
##     GrLivArea + TotalBsmtSF + TotRmsAbvGrd + BedroomAbvGr + LotArea + 
##     X2ndFlrSF + YearRemodAdd, data = train_n)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -497751   -18752     -808    16057   293856  
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.061e+06  1.202e+05  -8.828  < 2e-16 ***
## OverallQual   1.909e+04  1.185e+03  16.104  < 2e-16 ***
## YearBuilt     2.577e+02  4.723e+01   5.456 5.71e-08 ***
## GarageCars    1.248e+04  1.790e+03   6.969 4.83e-12 ***
## GrLivArea     5.486e+01  5.475e+00  10.020  < 2e-16 ***
## TotalBsmtSF   2.186e+01  4.151e+00   5.266 1.61e-07 ***
## TotRmsAbvGrd  3.185e+03  1.251e+03   2.547    0.011 *  
## BedroomAbvGr -9.578e+03  1.734e+03  -5.522 3.96e-08 ***
## LotArea       6.557e-01  1.042e-01   6.293 4.12e-10 ***
## X2ndFlrSF    -6.146e+00  4.809e+00  -1.278    0.201    
## YearRemodAdd  2.476e+02  6.246e+01   3.965 7.71e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1383802299)
## 
##     Null deviance: 9.2079e+12  on 1459  degrees of freedom
## Residual deviance: 2.0051e+12  on 1449  degrees of freedom
## AIC: 34886
## 
## Number of Fisher Scoring iterations: 2
# Model C: five predictors, all of which also appear in Model B.
ModelC <- glm(
  formula = SalePrice ~ LotArea + OverallQual + YearBuilt + GrLivArea +
    TotRmsAbvGrd,
  data = train_n
)
summary(ModelC)
## 
## Call:
## glm(formula = SalePrice ~ LotArea + OverallQual + YearBuilt + 
##     GrLivArea + TotRmsAbvGrd, data = train_n)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -428338   -21164    -2365    17500   289820  
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.052e+06  8.222e+04 -12.792   <2e-16 ***
## LotArea       9.096e-01  1.086e-01   8.373   <2e-16 ***
## OverallQual   2.559e+04  1.150e+03  22.255   <2e-16 ***
## YearBuilt     4.991e+02  4.313e+01  11.572   <2e-16 ***
## GrLivArea     5.985e+01  4.082e+00  14.663   <2e-16 ***
## TotRmsAbvGrd -1.142e+03  1.150e+03  -0.993    0.321    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1584283249)
## 
##     Null deviance: 9.2079e+12  on 1459  degrees of freedom
## Residual deviance: 2.3035e+12  on 1454  degrees of freedom
## AIC: 35079
## 
## Number of Fisher Scoring iterations: 2
# Sequential analysis-of-deviance table for the three candidate models.
# NOTE(review): ModelA and ModelB are not nested (each has predictors the
# other lacks), so the deviance differences are descriptive only.
anova(ModelA, ModelB, ModelC)
## Analysis of Deviance Table
## 
## Model 1: SalePrice ~ MSSubClass + LotArea + OverallQual + OverallCond + 
##     YearBuilt
## Model 2: SalePrice ~ OverallQual + YearBuilt + GarageCars + GrLivArea + 
##     TotalBsmtSF + TotRmsAbvGrd + BedroomAbvGr + LotArea + X2ndFlrSF + 
##     YearRemodAdd
## Model 3: SalePrice ~ LotArea + OverallQual + YearBuilt + GrLivArea + TotRmsAbvGrd
##   Resid. Df Resid. Dev Df    Deviance
## 1      1454 2.9846e+12               
## 2      1449 2.0051e+12  5  9.7951e+11
## 3      1454 2.3035e+12 -5 -2.9842e+11
# Predict test-set sale prices with ModelC and build the two-column
# submission table (Id, SalePrice).
Predictor <- predict(ModelC, newdata = test, type = "response")
Predicted <- data.frame(Id = test$Id, SalePrice = Predictor)
head(Predicted)
##     Id SalePrice
## 1 1461  113326.9
## 2 1462  164600.7
## 3 1463  176029.8
## 4 1464  195978.6
## 5 1465  222537.8
## 6 1466  196555.7
# Write the submission file without row names.
write.csv(Predicted, file = "ModelC.csv", row.names = FALSE)

Model C:

Kaggle Username: