Data 605 Final Exam

library(knitr)
library(rmdformats)

## Global options
options(max.print="85")
opts_chunk$set(cache=TRUE,
               prompt=FALSE,
               tidy=TRUE,
               comment=NA,
               message=FALSE,
               warning=FALSE)
opts_knit$set(width=35)

library(skimr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

library(purrr)

# prob 2 
library(forcats)
library(ggplot2)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

library(ggcorrplot)
library(matrixcalc)
library(Matrix)

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(Rmisc)

## Loading required package: lattice

## Loading required package: plyr

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following object is masked from 'package:purrr':
## 
##     compact

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

library(parallel)
library(doParallel)

## Loading required package: foreach

## 
## Attaching package: 'foreach'

## The following objects are masked from 'package:purrr':
## 
##     accumulate, when

## Loading required package: iterators

library(xgboost)

## 
## Attaching package: 'xgboost'

## The following object is masked from 'package:dplyr':
## 
##     slice

library(caret)

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(data.table)

## 
## Attaching package: 'data.table'

## The following object is masked from 'package:purrr':
## 
##     transpose

## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

Problem 1

Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean of $\mu = \sigma=\frac{(N+1)}{2}$.

Ans:

N <- 28
# Generating random variables X and Y
X <- runif(10000, 1, N)
mu <- sigma <- (N + 1)/2
Y <- rnorm(10000, mu, sigma)
skim(X)

Data summary
Name	X
Number of rows	10000
Number of columns	1
_______________________
Column type frequency:
numeric	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
data	0	1	14.57	7.74	1	7.94	14.58	21.24	28	▇▇▇▇▇

skim(Y)

Data summary
Name	Y
Number of rows	10000
Number of columns	1
_______________________
Column type frequency:
numeric	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
data	0	1	14.46	14.31	-35.4	4.73	14.52	24.03	64.61	▁▃▇▃▁

Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

a. P(X>x| X>y)

Ans:

P(X>x| X>y) = $\frac{P (X > x and X > y)}{P(X>y)}$

# set x be the median of X
x <- median(X)
# set y to be the first quartile of Y
y <- quantile(Y, 0.25, names = FALSE)
denominator <- length(which(X > y))
numerator <- length(which(X > x & X > y))
prob <- numerator/denominator
prob

[1] 0.5768343

The probability is .58.

b. P(X>x, Y>y)

Ans:

P(X > x) = 0.5

P(Y > y) = 0.75

P(X>x, Y>y) = 0.5 * 0.75 = 0.375

c. P(X<x| X>y)

Ans:

P (X < x | X > y) = $\frac{P(X < x \space \cap \space X > y)}{P(X > y)}$

denominator <- length(which(X > y))
numerator <- length(which(X < x & X > y))
prob <- numerator/denominator
prob

[1] 0.4231657

The prob is 0.42.

Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and

evaluating the marginal and joint probabilities.

Ans:

table1 <- data.frame(X, Y)
# Joint Prob
table2 <- table1 %>% mutate(Col = ifelse(X > x, " X > x", " X < x"), Row = ifelse(Y > 
    y, " Y > y", " Y < y")) %>% group_by(Col, Row) %>% dplyr::summarise(count = dplyr::n(), 
    .groups = "drop") %>% # ungroup %>%
mutate(prob = prop.table(count))

# Marginal Prob
table2 <- table2 %>% group_by(Col) %>% summarize(count = sum(count), prob = sum(prob), 
    .groups = "drop") %>% mutate(Row = "Total") %>% bind_rows(table2)

table2 <- table2 %>% group_by(Row) %>% summarize(count = sum(count), prob = sum(prob), 
    .groups = "drop") %>% mutate(Col = "Total") %>% bind_rows(table2)

# 2 x 2 contingency table with marginal probabilities appended.
table3 <- table2 %>% dplyr::select(-count) %>% pivot_wider(names_from = Col, values_from = prob) %>% 
    dplyr::select(c(Row, " X < x", " X > x", Total))
table3

# A tibble: 4 x 4
  Row      ` X < x` ` X > x` Total
  <chr>       <dbl>    <dbl> <dbl>
1  <NA>      NA       NA         2
2 "Total"    NA       NA        NA
3 " Y < y"    0.125    0.125    NA
4 " Y > y"    0.375    0.375    NA

So the joint probability of P(X > x and Y > y) = .3706

P ( X > x)P( Y > y ) = .5 * .75 = .375

So They are close. I think what happens is the event X and event Y are not necessarily independent.

Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?

The 2 events that we wanted to check are a) X > x, b) Y > y Let’s create a table for these 2 events. Call it tableXY.

tableXY <- table(X > x, Y > y)
tableXY

       
        FALSE TRUE
  FALSE  1249 3751
  TRUE   1251 3749

fisher.test(tableXY)


    Fisher's Exact Test for Count Data

data:  tableXY
p-value = 0.9816
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.9105431 1.0935740
sample estimates:
odds ratio 
 0.9978691

chisq.test(tableXY)


    Pearson's Chi-squared test with Yates' continuity correction

data:  tableXY
X-squared = 0.00053333, df = 1, p-value = 0.9816

Both tests are to test for independence of the row and column variable. Chi-squared statistic follows chi-squared distribution only approximately. The more observations we have, the better approximation is. Fisher’s exact test is an alternative to chi-squared test used mainly when a chi-squared approximation is not satisfactory. One major drawback of Fisher’s exact test is that for large tables (or large samples) it becomes computationally ineffective. Chi-squared test is more appropriate for larger samples. For the purpose of this exercise, Chi-squared tests is more appropriate.

Both the results of Fisher’s Exact Test and Pearson’s Chi-squared test with Yates’ continuity correction showed that the 2 events, namely, X > x, and Y > y, are not independent because p-values are both <0.05, which is within the acceptable range for rejecting the null hypothesis.

Problem 2

Descriptive and Inferential Statistics.

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

# getwd()
training_set <- readr::read_csv("train.csv")

Working on treatments for missing values

str(training_set)

tibble [1,460 × 81] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ Id           : num [1:1460] 1 2 3 4 5 6 7 8 9 10 ...
 $ MSSubClass   : num [1:1460] 60 20 60 70 60 50 20 60 50 190 ...
 $ MSZoning     : chr [1:1460] "RL" "RL" "RL" "RL" ...
 $ LotFrontage  : num [1:1460] 65 80 68 60 84 85 75 NA 51 50 ...
 $ LotArea      : num [1:1460] 8450 9600 11250 9550 14260 ...
 $ Street       : chr [1:1460] "Pave" "Pave" "Pave" "Pave" ...
 $ Alley        : chr [1:1460] NA NA NA NA ...
 $ LotShape     : chr [1:1460] "Reg" "Reg" "IR1" "IR1" ...
 $ LandContour  : chr [1:1460] "Lvl" "Lvl" "Lvl" "Lvl" ...
 $ Utilities    : chr [1:1460] "AllPub" "AllPub" "AllPub" "AllPub" ...
 $ LotConfig    : chr [1:1460] "Inside" "FR2" "Inside" "Corner" ...
 $ LandSlope    : chr [1:1460] "Gtl" "Gtl" "Gtl" "Gtl" ...
 $ Neighborhood : chr [1:1460] "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
 $ Condition1   : chr [1:1460] "Norm" "Feedr" "Norm" "Norm" ...
 $ Condition2   : chr [1:1460] "Norm" "Norm" "Norm" "Norm" ...
 $ BldgType     : chr [1:1460] "1Fam" "1Fam" "1Fam" "1Fam" ...
 $ HouseStyle   : chr [1:1460] "2Story" "1Story" "2Story" "2Story" ...
 $ OverallQual  : num [1:1460] 7 6 7 7 8 5 8 7 7 5 ...
 $ OverallCond  : num [1:1460] 5 8 5 5 5 5 5 6 5 6 ...
 $ YearBuilt    : num [1:1460] 2003 1976 2001 1915 2000 ...
 $ YearRemodAdd : num [1:1460] 2003 1976 2002 1970 2000 ...
 $ RoofStyle    : chr [1:1460] "Gable" "Gable" "Gable" "Gable" ...
 $ RoofMatl     : chr [1:1460] "CompShg" "CompShg" "CompShg" "CompShg" ...
 $ Exterior1st  : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
 $ Exterior2nd  : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
 $ MasVnrType   : chr [1:1460] "BrkFace" "None" "BrkFace" "None" ...
 $ MasVnrArea   : num [1:1460] 196 0 162 0 350 0 186 240 0 0 ...
 $ ExterQual    : chr [1:1460] "Gd" "TA" "Gd" "TA" ...
 $ ExterCond    : chr [1:1460] "TA" "TA" "TA" "TA" ...
 $ Foundation   : chr [1:1460] "PConc" "CBlock" "PConc" "BrkTil" ...
 $ BsmtQual     : chr [1:1460] "Gd" "Gd" "Gd" "TA" ...
 $ BsmtCond     : chr [1:1460] "TA" "TA" "TA" "Gd" ...
 $ BsmtExposure : chr [1:1460] "No" "Gd" "Mn" "No" ...
 $ BsmtFinType1 : chr [1:1460] "GLQ" "ALQ" "GLQ" "ALQ" ...
 $ BsmtFinSF1   : num [1:1460] 706 978 486 216 655 ...
 $ BsmtFinType2 : chr [1:1460] "Unf" "Unf" "Unf" "Unf" ...
 $ BsmtFinSF2   : num [1:1460] 0 0 0 0 0 0 0 32 0 0 ...
 $ BsmtUnfSF    : num [1:1460] 150 284 434 540 490 64 317 216 952 140 ...
 $ TotalBsmtSF  : num [1:1460] 856 1262 920 756 1145 ...
 $ Heating      : chr [1:1460] "GasA" "GasA" "GasA" "GasA" ...
 $ HeatingQC    : chr [1:1460] "Ex" "Ex" "Ex" "Gd" ...
 $ CentralAir   : chr [1:1460] "Y" "Y" "Y" "Y" ...
 $ Electrical   : chr [1:1460] "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
 $ 1stFlrSF     : num [1:1460] 856 1262 920 961 1145 ...
 $ 2ndFlrSF     : num [1:1460] 854 0 866 756 1053 ...
 $ LowQualFinSF : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
 $ GrLivArea    : num [1:1460] 1710 1262 1786 1717 2198 ...
 $ BsmtFullBath : num [1:1460] 1 0 1 1 1 1 1 1 0 1 ...
 $ BsmtHalfBath : num [1:1460] 0 1 0 0 0 0 0 0 0 0 ...
 $ FullBath     : num [1:1460] 2 2 2 1 2 1 2 2 2 1 ...
 $ HalfBath     : num [1:1460] 1 0 1 0 1 1 0 1 0 0 ...
 $ BedroomAbvGr : num [1:1460] 3 3 3 3 4 1 3 3 2 2 ...
 $ KitchenAbvGr : num [1:1460] 1 1 1 1 1 1 1 1 2 2 ...
 $ KitchenQual  : chr [1:1460] "Gd" "TA" "Gd" "Gd" ...
 $ TotRmsAbvGrd : num [1:1460] 8 6 6 7 9 5 7 7 8 5 ...
 $ Functional   : chr [1:1460] "Typ" "Typ" "Typ" "Typ" ...
 $ Fireplaces   : num [1:1460] 0 1 1 1 1 0 1 2 2 2 ...
 $ FireplaceQu  : chr [1:1460] NA "TA" "TA" "Gd" ...
 $ GarageType   : chr [1:1460] "Attchd" "Attchd" "Attchd" "Detchd" ...
 $ GarageYrBlt  : num [1:1460] 2003 1976 2001 1998 2000 ...
 $ GarageFinish : chr [1:1460] "RFn" "RFn" "RFn" "Unf" ...
 $ GarageCars   : num [1:1460] 2 2 2 3 3 2 2 2 2 1 ...
 $ GarageArea   : num [1:1460] 548 460 608 642 836 480 636 484 468 205 ...
 $ GarageQual   : chr [1:1460] "TA" "TA" "TA" "TA" ...
 $ GarageCond   : chr [1:1460] "TA" "TA" "TA" "TA" ...
 $ PavedDrive   : chr [1:1460] "Y" "Y" "Y" "Y" ...
 $ WoodDeckSF   : num [1:1460] 0 298 0 0 192 40 255 235 90 0 ...
 $ OpenPorchSF  : num [1:1460] 61 0 42 35 84 30 57 204 0 4 ...
 $ EnclosedPorch: num [1:1460] 0 0 0 272 0 0 0 228 205 0 ...
 $ 3SsnPorch    : num [1:1460] 0 0 0 0 0 320 0 0 0 0 ...
 $ ScreenPorch  : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
 $ PoolArea     : num [1:1460] 0 0 0 0 0 0 0 0 0 0 ...
 $ PoolQC       : chr [1:1460] NA NA NA NA ...
 $ Fence        : chr [1:1460] NA NA NA NA ...
 $ MiscFeature  : chr [1:1460] NA NA NA NA ...
 $ MiscVal      : num [1:1460] 0 0 0 0 0 700 0 350 0 0 ...
 $ MoSold       : num [1:1460] 2 5 9 2 12 10 8 11 4 1 ...
 $ YrSold       : num [1:1460] 2008 2007 2008 2006 2008 ...
 $ SaleType     : chr [1:1460] "WD" "WD" "WD" "WD" ...
 $ SaleCondition: chr [1:1460] "Normal" "Normal" "Normal" "Abnorml" ...
 $ SalePrice    : num [1:1460] 208500 181500 223500 140000 250000 ...
 - attr(*, "spec")=
  .. cols(
  ..   Id = col_double(),
  ..   MSSubClass = col_double(),
  ..   MSZoning = col_character(),
  ..   LotFrontage = col_double(),
  ..   LotArea = col_double(),
  ..   Street = col_character(),
  ..   Alley = col_character(),
  ..   LotShape = col_character(),
  ..   LandContour = col_character(),
  ..   Utilities = col_character(),
  ..   LotConfig = col_character(),
  ..   LandSlope = col_character(),
  ..   Neighborhood = col_character(),
  ..   Condition1 = col_character(),
  ..   Condition2 = col_character(),
  ..   BldgType = col_character(),
  ..   HouseStyle = col_character(),
  ..   OverallQual = col_double(),
  ..   OverallCond = col_double(),
  ..   YearBuilt = col_double(),
  ..   YearRemodAdd = col_double(),
  ..   RoofStyle = col_character(),
  ..   RoofMatl = col_character(),
  ..   Exterior1st = col_character(),
  ..   Exterior2nd = col_character(),
  ..   MasVnrType = col_character(),
  ..   MasVnrArea = col_double(),
  ..   ExterQual = col_character(),
  ..   ExterCond = col_character(),
  ..   Foundation = col_character(),
  ..   BsmtQual = col_character(),
  ..   BsmtCond = col_character(),
  ..   BsmtExposure = col_character(),
  ..   BsmtFinType1 = col_character(),
  ..   BsmtFinSF1 = col_double(),
  ..   BsmtFinType2 = col_character(),
  ..   BsmtFinSF2 = col_double(),
  ..   BsmtUnfSF = col_double(),
  ..   TotalBsmtSF = col_double(),
  ..   Heating = col_character(),
  ..   HeatingQC = col_character(),
  ..   CentralAir = col_character(),
  ..   Electrical = col_character(),
  ..   `1stFlrSF` = col_double(),
  ..   `2ndFlrSF` = col_double(),
  ..   LowQualFinSF = col_double(),
  ..   GrLivArea = col_double(),
  ..   BsmtFullBath = col_double(),
  ..   BsmtHalfBath = col_double(),
  ..   FullBath = col_double(),
  ..   HalfBath = col_double(),
  ..   BedroomAbvGr = col_double(),
  ..   KitchenAbvGr = col_double(),
  ..   KitchenQual = col_character(),
  ..   TotRmsAbvGrd = col_double(),
  ..   Functional = col_character(),
  ..   Fireplaces = col_double(),
  ..   FireplaceQu = col_character(),
  ..   GarageType = col_character(),
  ..   GarageYrBlt = col_double(),
  ..   GarageFinish = col_character(),
  ..   GarageCars = col_double(),
  ..   GarageArea = col_double(),
  ..   GarageQual = col_character(),
  ..   GarageCond = col_character(),
  ..   PavedDrive = col_character(),
  ..   WoodDeckSF = col_double(),
  ..   OpenPorchSF = col_double(),
  ..   EnclosedPorch = col_double(),
  ..   `3SsnPorch` = col_double(),
  ..   ScreenPorch = col_double(),
  ..   PoolArea = col_double(),
  ..   PoolQC = col_character(),
  ..   Fence = col_character(),
  ..   MiscFeature = col_character(),
  ..   MiscVal = col_double(),
  ..   MoSold = col_double(),
  ..   YrSold = col_double(),
  ..   SaleType = col_character(),
  ..   SaleCondition = col_character(),
  ..   SalePrice = col_double()
  .. )

# use is.character to filter for the fields that are character fields
factors <- training_set %>% select_if(is.character) %>% colnames()
training_set[factors] <- lapply(training_set[factors], factor)
# assign '(Missing)', an explicit factor level to missing values
training_set[factors] <- lapply(training_set[factors], fct_explicit_na)
skim(training_set)

Data summary
Name	training_set
Number of rows	1460
Number of columns	81
_______________________
Column type frequency:
factor	43
numeric	38
________________________
Group variables	None

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
MSZoning	1	FALSE	5	RL: 1151, RM: 218, FV: 65, RH: 16
Street	1	FALSE	2	Pav: 1454, Grv: 6
Alley	1	FALSE	3	(Mi: 1369, Grv: 50, Pav: 41
LotShape	1	FALSE	4	Reg: 925, IR1: 484, IR2: 41, IR3: 10
LandContour	1	FALSE	4	Lvl: 1311, Bnk: 63, HLS: 50, Low: 36
Utilities	1	FALSE	2	All: 1459, NoS: 1
LotConfig	1	FALSE	5	Ins: 1052, Cor: 263, Cul: 94, FR2: 47
LandSlope	1	FALSE	3	Gtl: 1382, Mod: 65, Sev: 13
Neighborhood	1	FALSE	25	NAm: 225, Col: 150, Old: 113, Edw: 100
Condition1	1	FALSE	9	Nor: 1260, Fee: 81, Art: 48, RRA: 26
Condition2	1	FALSE	8	Nor: 1445, Fee: 6, Art: 2, Pos: 2
BldgType	1	FALSE	5	1Fa: 1220, Twn: 114, Dup: 52, Twn: 43
HouseStyle	1	FALSE	8	1St: 726, 2St: 445, 1.5: 154, SLv: 65
RoofStyle	1	FALSE	6	Gab: 1141, Hip: 286, Fla: 13, Gam: 11
RoofMatl	1	FALSE	8	Com: 1434, Tar: 11, WdS: 6, WdS: 5
Exterior1st	1	FALSE	15	Vin: 515, HdB: 222, Met: 220, Wd : 206
Exterior2nd	1	FALSE	16	Vin: 504, Met: 214, HdB: 207, Wd : 197
MasVnrType	1	FALSE	5	Non: 864, Brk: 445, Sto: 128, Brk: 15
ExterQual	1	FALSE	4	TA: 906, Gd: 488, Ex: 52, Fa: 14
ExterCond	1	FALSE	5	TA: 1282, Gd: 146, Fa: 28, Ex: 3
Foundation	1	FALSE	6	PCo: 647, CBl: 634, Brk: 146, Sla: 24
BsmtQual	1	FALSE	5	TA: 649, Gd: 618, Ex: 121, (Mi: 37
BsmtCond	1	FALSE	5	TA: 1311, Gd: 65, Fa: 45, (Mi: 37
BsmtExposure	1	FALSE	5	No: 953, Av: 221, Gd: 134, Mn: 114
BsmtFinType1	1	FALSE	7	Unf: 430, GLQ: 418, ALQ: 220, BLQ: 148
BsmtFinType2	1	FALSE	7	Unf: 1256, Rec: 54, LwQ: 46, (Mi: 38
Heating	1	FALSE	6	Gas: 1428, Gas: 18, Gra: 7, Wal: 4
HeatingQC	1	FALSE	5	Ex: 741, TA: 428, Gd: 241, Fa: 49
CentralAir	1	FALSE	2	Y: 1365, N: 95
Electrical	1	FALSE	6	SBr: 1334, Fus: 94, Fus: 27, Fus: 3
KitchenQual	1	FALSE	4	TA: 735, Gd: 586, Ex: 100, Fa: 39
Functional	1	FALSE	7	Typ: 1360, Min: 34, Min: 31, Mod: 15
FireplaceQu	1	FALSE	6	(Mi: 690, Gd: 380, TA: 313, Fa: 33
GarageType	1	FALSE	7	Att: 870, Det: 387, Bui: 88, (Mi: 81
GarageFinish	1	FALSE	4	Unf: 605, RFn: 422, Fin: 352, (Mi: 81
GarageQual	1	FALSE	6	TA: 1311, (Mi: 81, Fa: 48, Gd: 14
GarageCond	1	FALSE	6	TA: 1326, (Mi: 81, Fa: 35, Gd: 9
PavedDrive	1	FALSE	3	Y: 1340, N: 90, P: 30
PoolQC	1	FALSE	4	(Mi: 1453, Gd: 3, Ex: 2, Fa: 2
Fence	1	FALSE	5	(Mi: 1179, MnP: 157, GdP: 59, GdW: 54
MiscFeature	1	FALSE	5	(Mi: 1406, She: 49, Gar: 2, Oth: 2
SaleType	1	FALSE	9	WD: 1267, New: 122, COD: 43, Con: 9
SaleCondition	1	FALSE	6	Nor: 1198, Par: 125, Abn: 101, Fam: 20

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Id	0	1.00	730.50	421.61	1	365.75	730.5	1095.25	1460	▇▇▇▇▇
MSSubClass	0	1.00	56.90	42.30	20	20.00	50.0	70.00	190	▇▅▂▁▁
LotFrontage	259	0.82	70.05	24.28	21	59.00	69.0	80.00	313	▇▃▁▁▁
LotArea	0	1.00	10516.83	9981.26	1300	7553.50	9478.5	11601.50	215245	▇▁▁▁▁
OverallQual	0	1.00	6.10	1.38	1	5.00	6.0	7.00	10	▁▂▇▅▁
OverallCond	0	1.00	5.58	1.11	1	5.00	5.0	6.00	9	▁▁▇▅▁
YearBuilt	0	1.00	1971.27	30.20	1872	1954.00	1973.0	2000.00	2010	▁▂▃▆▇
YearRemodAdd	0	1.00	1984.87	20.65	1950	1967.00	1994.0	2004.00	2010	▅▂▂▃▇
MasVnrArea	8	0.99	103.69	181.07	0	0.00	0.0	166.00	1600	▇▁▁▁▁
BsmtFinSF1	0	1.00	443.64	456.10	0	0.00	383.5	712.25	5644	▇▁▁▁▁
BsmtFinSF2	0	1.00	46.55	161.32	0	0.00	0.0	0.00	1474	▇▁▁▁▁
BsmtUnfSF	0	1.00	567.24	441.87	0	223.00	477.5	808.00	2336	▇▅▂▁▁
TotalBsmtSF	0	1.00	1057.43	438.71	0	795.75	991.5	1298.25	6110	▇▃▁▁▁
1stFlrSF	0	1.00	1162.63	386.59	334	882.00	1087.0	1391.25	4692	▇▅▁▁▁
2ndFlrSF	0	1.00	346.99	436.53	0	0.00	0.0	728.00	2065	▇▃▂▁▁
LowQualFinSF	0	1.00	5.84	48.62	0	0.00	0.0	0.00	572	▇▁▁▁▁
GrLivArea	0	1.00	1515.46	525.48	334	1129.50	1464.0	1776.75	5642	▇▇▁▁▁
BsmtFullBath	0	1.00	0.43	0.52	0	0.00	0.0	1.00	3	▇▆▁▁▁
BsmtHalfBath	0	1.00	0.06	0.24	0	0.00	0.0	0.00	2	▇▁▁▁▁
FullBath	0	1.00	1.57	0.55	0	1.00	2.0	2.00	3	▁▇▁▇▁
HalfBath	0	1.00	0.38	0.50	0	0.00	0.0	1.00	2	▇▁▅▁▁
BedroomAbvGr	0	1.00	2.87	0.82	0	2.00	3.0	3.00	8	▁▇▂▁▁
KitchenAbvGr	0	1.00	1.05	0.22	0	1.00	1.0	1.00	3	▁▇▁▁▁
TotRmsAbvGrd	0	1.00	6.52	1.63	2	5.00	6.0	7.00	14	▂▇▇▁▁
Fireplaces	0	1.00	0.61	0.64	0	0.00	1.0	1.00	3	▇▇▁▁▁
GarageYrBlt	81	0.94	1978.51	24.69	1900	1961.00	1980.0	2002.00	2010	▁▁▅▅▇
GarageCars	0	1.00	1.77	0.75	0	1.00	2.0	2.00	4	▁▃▇▂▁
GarageArea	0	1.00	472.98	213.80	0	334.50	480.0	576.00	1418	▂▇▃▁▁
WoodDeckSF	0	1.00	94.24	125.34	0	0.00	0.0	168.00	857	▇▂▁▁▁
OpenPorchSF	0	1.00	46.66	66.26	0	0.00	25.0	68.00	547	▇▁▁▁▁
EnclosedPorch	0	1.00	21.95	61.12	0	0.00	0.0	0.00	552	▇▁▁▁▁
3SsnPorch	0	1.00	3.41	29.32	0	0.00	0.0	0.00	508	▇▁▁▁▁
ScreenPorch	0	1.00	15.06	55.76	0	0.00	0.0	0.00	480	▇▁▁▁▁
PoolArea	0	1.00	2.76	40.18	0	0.00	0.0	0.00	738	▇▁▁▁▁
MiscVal	0	1.00	43.49	496.12	0	0.00	0.0	0.00	15500	▇▁▁▁▁
MoSold	0	1.00	6.32	2.70	1	5.00	6.0	8.00	12	▃▆▇▃▃
YrSold	0	1.00	2007.82	1.33	2006	2007.00	2008.0	2009.00	2010	▇▇▇▇▅
SalePrice	0	1.00	180921.20	79442.50	34900	129975.00	163000.0	214000.00	755000	▇▅▁▁▁

# no more missing values seen in training_set[factors]

# repalcing numeric NA with 0 Note that the following won't work without the
# tilde symbol to spread it across all numeric variables
training_set <- training_set %>% mutate_if(is.numeric, ~replace(., is.na(.), 0))

skim(training_set)

Data summary
Name	training_set
Number of rows	1460
Number of columns	81
_______________________
Column type frequency:
factor	43
numeric	38
________________________
Group variables	None

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
MSZoning	1	FALSE	5	RL: 1151, RM: 218, FV: 65, RH: 16
Street	1	FALSE	2	Pav: 1454, Grv: 6
Alley	1	FALSE	3	(Mi: 1369, Grv: 50, Pav: 41
LotShape	1	FALSE	4	Reg: 925, IR1: 484, IR2: 41, IR3: 10
LandContour	1	FALSE	4	Lvl: 1311, Bnk: 63, HLS: 50, Low: 36
Utilities	1	FALSE	2	All: 1459, NoS: 1
LotConfig	1	FALSE	5	Ins: 1052, Cor: 263, Cul: 94, FR2: 47
LandSlope	1	FALSE	3	Gtl: 1382, Mod: 65, Sev: 13
Neighborhood	1	FALSE	25	NAm: 225, Col: 150, Old: 113, Edw: 100
Condition1	1	FALSE	9	Nor: 1260, Fee: 81, Art: 48, RRA: 26
Condition2	1	FALSE	8	Nor: 1445, Fee: 6, Art: 2, Pos: 2
BldgType	1	FALSE	5	1Fa: 1220, Twn: 114, Dup: 52, Twn: 43
HouseStyle	1	FALSE	8	1St: 726, 2St: 445, 1.5: 154, SLv: 65
RoofStyle	1	FALSE	6	Gab: 1141, Hip: 286, Fla: 13, Gam: 11
RoofMatl	1	FALSE	8	Com: 1434, Tar: 11, WdS: 6, WdS: 5
Exterior1st	1	FALSE	15	Vin: 515, HdB: 222, Met: 220, Wd : 206
Exterior2nd	1	FALSE	16	Vin: 504, Met: 214, HdB: 207, Wd : 197
MasVnrType	1	FALSE	5	Non: 864, Brk: 445, Sto: 128, Brk: 15
ExterQual	1	FALSE	4	TA: 906, Gd: 488, Ex: 52, Fa: 14
ExterCond	1	FALSE	5	TA: 1282, Gd: 146, Fa: 28, Ex: 3
Foundation	1	FALSE	6	PCo: 647, CBl: 634, Brk: 146, Sla: 24
BsmtQual	1	FALSE	5	TA: 649, Gd: 618, Ex: 121, (Mi: 37
BsmtCond	1	FALSE	5	TA: 1311, Gd: 65, Fa: 45, (Mi: 37
BsmtExposure	1	FALSE	5	No: 953, Av: 221, Gd: 134, Mn: 114
BsmtFinType1	1	FALSE	7	Unf: 430, GLQ: 418, ALQ: 220, BLQ: 148
BsmtFinType2	1	FALSE	7	Unf: 1256, Rec: 54, LwQ: 46, (Mi: 38
Heating	1	FALSE	6	Gas: 1428, Gas: 18, Gra: 7, Wal: 4
HeatingQC	1	FALSE	5	Ex: 741, TA: 428, Gd: 241, Fa: 49
CentralAir	1	FALSE	2	Y: 1365, N: 95
Electrical	1	FALSE	6	SBr: 1334, Fus: 94, Fus: 27, Fus: 3
KitchenQual	1	FALSE	4	TA: 735, Gd: 586, Ex: 100, Fa: 39
Functional	1	FALSE	7	Typ: 1360, Min: 34, Min: 31, Mod: 15
FireplaceQu	1	FALSE	6	(Mi: 690, Gd: 380, TA: 313, Fa: 33
GarageType	1	FALSE	7	Att: 870, Det: 387, Bui: 88, (Mi: 81
GarageFinish	1	FALSE	4	Unf: 605, RFn: 422, Fin: 352, (Mi: 81
GarageQual	1	FALSE	6	TA: 1311, (Mi: 81, Fa: 48, Gd: 14
GarageCond	1	FALSE	6	TA: 1326, (Mi: 81, Fa: 35, Gd: 9
PavedDrive	1	FALSE	3	Y: 1340, N: 90, P: 30
PoolQC	1	FALSE	4	(Mi: 1453, Gd: 3, Ex: 2, Fa: 2
Fence	1	FALSE	5	(Mi: 1179, MnP: 157, GdP: 59, GdW: 54
MiscFeature	1	FALSE	5	(Mi: 1406, She: 49, Gar: 2, Oth: 2
SaleType	1	FALSE	9	WD: 1267, New: 122, COD: 43, Con: 9
SaleCondition	1	FALSE	6	Nor: 1198, Par: 125, Abn: 101, Fam: 20

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Id	1	730.50	421.61	1	365.75	730.5	1095.25	1460	▇▇▇▇▇
MSSubClass	1	56.90	42.30	20	20.00	50.0	70.00	190	▇▅▂▁▁
LotFrontage	1	57.62	34.66	0	42.00	63.0	79.00	313	▇▇▁▁▁
LotArea	1	10516.83	9981.26	1300	7553.50	9478.5	11601.50	215245	▇▁▁▁▁
OverallQual	1	6.10	1.38	1	5.00	6.0	7.00	10	▁▂▇▅▁
OverallCond	1	5.58	1.11	1	5.00	5.0	6.00	9	▁▁▇▅▁
YearBuilt	1	1971.27	30.20	1872	1954.00	1973.0	2000.00	2010	▁▂▃▆▇
YearRemodAdd	1	1984.87	20.65	1950	1967.00	1994.0	2004.00	2010	▅▂▂▃▇
MasVnrArea	1	103.12	180.73	0	0.00	0.0	164.25	1600	▇▁▁▁▁
BsmtFinSF1	1	443.64	456.10	0	0.00	383.5	712.25	5644	▇▁▁▁▁
BsmtFinSF2	1	46.55	161.32	0	0.00	0.0	0.00	1474	▇▁▁▁▁
BsmtUnfSF	1	567.24	441.87	0	223.00	477.5	808.00	2336	▇▅▂▁▁
TotalBsmtSF	1	1057.43	438.71	0	795.75	991.5	1298.25	6110	▇▃▁▁▁
1stFlrSF	1	1162.63	386.59	334	882.00	1087.0	1391.25	4692	▇▅▁▁▁
2ndFlrSF	1	346.99	436.53	0	0.00	0.0	728.00	2065	▇▃▂▁▁
LowQualFinSF	1	5.84	48.62	0	0.00	0.0	0.00	572	▇▁▁▁▁
GrLivArea	1	1515.46	525.48	334	1129.50	1464.0	1776.75	5642	▇▇▁▁▁
BsmtFullBath	1	0.43	0.52	0	0.00	0.0	1.00	3	▇▆▁▁▁
BsmtHalfBath	1	0.06	0.24	0	0.00	0.0	0.00	2	▇▁▁▁▁
FullBath	1	1.57	0.55	0	1.00	2.0	2.00	3	▁▇▁▇▁
HalfBath	1	0.38	0.50	0	0.00	0.0	1.00	2	▇▁▅▁▁
BedroomAbvGr	1	2.87	0.82	0	2.00	3.0	3.00	8	▁▇▂▁▁
KitchenAbvGr	1	1.05	0.22	0	1.00	1.0	1.00	3	▁▇▁▁▁
TotRmsAbvGrd	1	6.52	1.63	2	5.00	6.0	7.00	14	▂▇▇▁▁
Fireplaces	1	0.61	0.64	0	0.00	1.0	1.00	3	▇▇▁▁▁
GarageYrBlt	1	1868.74	453.70	0	1958.00	1977.0	2001.00	2010	▁▁▁▁▇
GarageCars	1	1.77	0.75	0	1.00	2.0	2.00	4	▁▃▇▂▁
GarageArea	1	472.98	213.80	0	334.50	480.0	576.00	1418	▂▇▃▁▁
WoodDeckSF	1	94.24	125.34	0	0.00	0.0	168.00	857	▇▂▁▁▁
OpenPorchSF	1	46.66	66.26	0	0.00	25.0	68.00	547	▇▁▁▁▁
EnclosedPorch	1	21.95	61.12	0	0.00	0.0	0.00	552	▇▁▁▁▁
3SsnPorch	1	3.41	29.32	0	0.00	0.0	0.00	508	▇▁▁▁▁
ScreenPorch	1	15.06	55.76	0	0.00	0.0	0.00	480	▇▁▁▁▁
PoolArea	1	2.76	40.18	0	0.00	0.0	0.00	738	▇▁▁▁▁
MiscVal	1	43.49	496.12	0	0.00	0.0	0.00	15500	▇▁▁▁▁
MoSold	1	6.32	2.70	1	5.00	6.0	8.00	12	▃▆▇▃▃
YrSold	1	2007.82	1.33	2006	2007.00	2008.0	2009.00	2010	▇▇▇▇▅
SalePrice	1	180921.20	79442.50	34900	129975.00	163000.0	214000.00	755000	▇▅▁▁▁

# no more missing values seen in numeric fields in training_set

Independent Variables: OverallQual, GarageCars, GarageArea, GrLivArea, BsmtFinSF1, BsmtFinSF2 Dependent variable: SalePrice

# Sale Price vs. Overall Quality
ggplot(training_set, aes(x = OverallQual, y = SalePrice)) + geom_point(aes(color = factor(OverallQual))) + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) + scale_y_continuous(labels = comma) + 
    theme_classic() + labs(title = "Home Sale Price by Overall Material and Finish Quality", 
    y = "Sale Price($)", x = "Overall Quality Score (Range: 1 - 10)")

There seems to be a positive correlation between Sale Price and Overall Quality Score.

# Sale Price vs. Size of garage in car capacity
ggplot(training_set, aes(x = GarageCars, y = SalePrice)) + geom_point(aes(color = factor(GarageCars))) + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) + scale_y_continuous(labels = comma) + 
    theme_classic() + labs(title = "Home Sale Price by Size of Garage (car capacity)", 
    y = "Sale Price($)", x = "# of parkings (Range: 0 - 4)")

There seems to be positive correlation between Sale price and garage size in car capacity. But due to the fact there aren’t that many data points for housing units with 4 parking spots, we can’t really deduce if higher # of parkings will continue to correspond to a higher sale price.

# Sale Price vs. Size of garage in square feet

ggplot(training_set, aes(x = GarageArea, y = SalePrice)) + geom_point(alpha = 0.4, 
    aes(color = GarageArea)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
    scale_y_continuous(labels = comma) + scale_x_continuous(labels = comma) + labs(title = "Home Sale Price by Size of Garage (sq. ft.)", 
    y = "Sale Price($)", x = "Sq. Ft.")

This graph shows size of garages matters, esp. in downtown and area with higher population density where parking space is scarce. It’ll be interesting to find out if, in the years ahead, the Telsa’s influence (autonomous driving) will eliminate the needs for parkings for residential housing and thus remove the positive relationship seen above between Size of garage and home sale price.

# Sale Price vs. Above grade (ground) living area square feet
ggplot(training_set, aes(x = GrLivArea, y = SalePrice)) + geom_point(alpha = 0.4, 
    aes(color = GrLivArea)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
    scale_y_continuous(labels = comma) + scale_x_continuous(labels = comma) + labs(title = "Home Sale Price by Above Ground Square Footage", 
    y = "Sale Price($)", x = "Sq. Ft.")

There is an apparent positive relationship between above ground square footage and home sale prices.

# Sale Price vs. Total Basement Square Footage (BsmtFinSF1, BsmtFinSF2)
training_set %>% mutate(BsmtFinSF = BsmtFinSF1 + BsmtFinSF2) %>% filter(BsmtFinSF > 
    0) %>% ggplot(aes(x = BsmtFinSF, y = SalePrice)) + geom_point(alpha = 0.4, aes(color = BsmtFinSF)) + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) + scale_y_continuous(labels = comma) + 
    scale_x_continuous(labels = comma) + labs(title = "Home Sale Price by Total Basement Finished Square Footage", 
    y = "Sale Price($)", x = "Sq. Ft.")

There is an apparent positive correlation between home sale price and total basement finished square footage.

# Subset of variables
housing_vars <- training_set %>% mutate(BsmtFinSF = BsmtFinSF1 + BsmtFinSF2) %>% 
    dplyr::select(SalePrice, GarageArea, GrLivArea, BsmtFinSF)

# Compute correlations
corr <- cor(housing_vars)
ggcorrplot(corr, lab = TRUE, ggtheme = theme_classic()) + labs(title = "Correlation Matrix")

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

Ans:

cor.test(housing_vars$SalePrice, housing_vars$GarageArea, conf.level = 0.8)


    Pearson's product-moment correlation

data:  housing_vars$SalePrice and housing_vars$GarageArea
t = 30.446, df = 1458, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
80 percent confidence interval:
 0.6024756 0.6435283
sample estimates:
      cor 
0.6234314

cor.test(housing_vars$SalePrice, housing_vars$GarageArea, conf.level = 0.99)


    Pearson's product-moment correlation

data:  housing_vars$SalePrice and housing_vars$GarageArea
t = 30.446, df = 1458, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
99 percent confidence interval:
 0.5804338 0.6629623
sample estimates:
      cor 
0.6234314

cor.test(housing_vars$SalePrice, housing_vars$GrLivArea, conf.level = 0.8)


    Pearson's product-moment correlation

data:  housing_vars$SalePrice and housing_vars$GrLivArea
t = 38.348, df = 1458, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
80 percent confidence interval:
 0.6915087 0.7249450
sample estimates:
      cor 
0.7086245

cor.test(housing_vars$SalePrice, housing_vars$GrLivArea, conf.level = 0.99)


    Pearson's product-moment correlation

data:  housing_vars$SalePrice and housing_vars$GrLivArea
t = 38.348, df = 1458, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
99 percent confidence interval:
 0.6733974 0.7406408
sample estimates:
      cor 
0.7086245

cor.test(housing_vars$SalePrice, housing_vars$BsmtFinSF, conf.level = 0.8)


    Pearson's product-moment correlation

data:  housing_vars$SalePrice and housing_vars$BsmtFinSF
t = 15.033, df = 1458, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
80 percent confidence interval:
 0.3369082 0.3950326
sample estimates:
      cor 
0.3663277

cor.test(housing_vars$SalePrice, housing_vars$BsmtFinSF, conf.level = 0.99)


    Pearson's product-moment correlation

data:  housing_vars$SalePrice and housing_vars$BsmtFinSF
t = 15.033, df = 1458, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
99 percent confidence interval:
 0.3065137 0.4232600
sample estimates:
      cor 
0.3663277

As you can see, all 3 hypothesis tests shown above are having the conclusion to reject the null hypothesis, which is to say the true correlation for each pairwise set of variables is not equal to 0. By definition, Familywise errors do exist, esp. when Type I error is not 0%. Meaning as long as there is a probability that Type I errors exists, familywise error rate (FWER) is > 0%. In reality, when we’re using 80% confidence level, we’re accepting the fact there is a 20% (alpha) chance of committing a Type I error. Normally I would be worried with familywise error rate. I went on to try testing with a more stringent alpha (or confidence level). I still got a very small p-value.

The false discovery rate (FDR) is the expected proportion of false rejections out of all rejections.

To curb FDR, we can do a Benjamini-Hochberg procedure. See below.

However, as I do see my smallest p-value being < 2.2e-16, I do think it’s sufficient to say I am not worried about familywise error rate or false discovery rate. I am confident that I’m truly rejecting the null hypothesis properly for all 3 tests and conclude that the correlations between each pairwise set of variables are not equal to 0. Meaning they are positively correlated with convincing evidence.

Linear Algebra and Correlation

Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

Ans:

precision <- solve(corr)
precision

            SalePrice GarageArea  GrLivArea  BsmtFinSF
SalePrice   2.7618759 -0.9171326 -1.4337755 -0.4752524
GarageArea -0.9171326  1.6487501 -0.1032203 -0.1026555
GrLivArea  -1.4337755 -0.1032203  2.0340198  0.1549708
BsmtFinSF  -0.4752524 -0.1026555  0.1549708  1.1722701

Correlation Matrix by Precision Matrix

(mult = round(corr %*% precision))

           SalePrice GarageArea GrLivArea BsmtFinSF
SalePrice          1          0         0         0
GarageArea         0          1         0         0
GrLivArea          0          0         1         0
BsmtFinSF          0          0         0         1

Precision Matrix by Correlation Matrix

(mult2 = round(precision %*% corr))

           SalePrice GarageArea GrLivArea BsmtFinSF
SalePrice          1          0         0         0
GarageArea         0          1         0         0
GrLivArea          0          0         1         0
BsmtFinSF          0          0         0         1

LU Decomposition

matrixcalc::lu.decomposition(corr)

$L
          [,1]       [,2]       [,3] [,4]
[1,] 1.0000000 0.00000000  0.0000000    0
[2,] 0.6234314 1.00000000  0.0000000    0
[3,] 0.7086245 0.04452351  1.0000000    0
[4,] 0.3663277 0.08168398 -0.1321972    1

$U
     [,1]      [,2]      [,3]        [,4]
[1,]    1 0.6234314 0.7086245  0.36632769
[2,]    0 0.6113332 0.0272187  0.04993613
[3,]    0 0.0000000 0.4966395 -0.06565435
[4,]    0 0.0000000 0.0000000  0.85304572

Get Permutation Matrix P

elu <- Matrix::expand(lu(corr))

Note that elu$L is equivalent to the result generated by function lu.decomposition()

elu$L

4 x 4 Matrix of class "dtrMatrix" (unitriangular)
     [,1]        [,2]        [,3]        [,4]       
[1,]  1.00000000           .           .           .
[2,]  0.62343144  1.00000000           .           .
[3,]  0.70862448  0.04452351  1.00000000           .
[4,]  0.36632769  0.08168398 -0.13219720  1.00000000

Same for elu$U

elu$U

4 x 4 Matrix of class "dtrMatrix"
     [,1]        [,2]        [,3]        [,4]       
[1,]  1.00000000  0.62343144  0.70862448  0.36632769
[2,]           .  0.61133324  0.02721870  0.04993613
[3,]           .           .  0.49663948 -0.06565435
[4,]           .           .           .  0.85304572

The function matequal ( ) checks for equality in the matrices. P * L * U is supposed to give you back the same matrix that it originates from.

(m2 <- with(elu, elu$P %*% elu$L %*% elu$U))

4 x 4 Matrix of class "dgeMatrix"
          [,1]      [,2]      [,3]      [,4]
[1,] 1.0000000 0.6234314 0.7086245 0.3663277
[2,] 0.6234314 1.0000000 0.4689975 0.2783163
[3,] 0.7086245 0.4689975 1.0000000 0.1961578
[4,] 0.3663277 0.2783163 0.1961578 1.0000000

matequal <- function(x, y) {
    is.matrix(x) && is.matrix(y) && dim(x) == dim(y) && all(x == y)
}
matequal(as(corr, "matrix"), as(m2, "matrix"))

[1] FALSE

m2 == corr

# visually, the 2 matrices m2 and corr are the same. However, element (3,3) turns
# out to be the difference
m2 == corr

4 x 4 Matrix of class "lgeMatrix"
     [,1] [,2]  [,3] [,4]
[1,] TRUE TRUE  TRUE TRUE
[2,] TRUE TRUE  TRUE TRUE
[3,] TRUE TRUE FALSE TRUE
[4,] TRUE TRUE  TRUE TRUE

m2

m2

4 x 4 Matrix of class "dgeMatrix"
          [,1]      [,2]      [,3]      [,4]
[1,] 1.0000000 0.6234314 0.7086245 0.3663277
[2,] 0.6234314 1.0000000 0.4689975 0.2783163
[3,] 0.7086245 0.4689975 1.0000000 0.1961578
[4,] 0.3663277 0.2783163 0.1961578 1.0000000

corr

corr

           SalePrice GarageArea GrLivArea BsmtFinSF
SalePrice  1.0000000  0.6234314 0.7086245 0.3663277
GarageArea 0.6234314  1.0000000 0.4689975 0.2783163
GrLivArea  0.7086245  0.4689975 1.0000000 0.1961578
BsmtFinSF  0.3663277  0.2783163 0.1961578 1.0000000

Calculus-Based Probability & Statistics

Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary.

Is shifting necessary?

min_value = min(training_set$GrLivArea)
min_value

[1] 334

Fitting an exponential probability density function

Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html).

# fitdistr (Maximum-likelihood fitting of univariate distributions, allowing
# parameters to be held fixed if desired.)
expon.pdf = fitdistr(x = training_set$GrLivArea, densfun = "exponential")
expon.pdf

       rate    
  6.598640e-04 
 (1.726943e-05)

str(expon.pdf)

List of 5
 $ estimate: Named num 0.00066
  ..- attr(*, "names")= chr "rate"
 $ sd      : Named num 1.73e-05
  ..- attr(*, "names")= chr "rate"
 $ vcov    : num [1, 1] 2.98e-10
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr "rate"
  .. ..$ : chr "rate"
 $ n       : int 1460
 $ loglik  : num -12152
 - attr(*, "class")= chr "fitdistr"

Find the optimal value of λ

Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ).

(optimal.value <- rexp(1000, expon.pdf$estimate))

 [1]    4.549800  648.443596 4778.190307 3293.413645 1839.324499 1081.940727
 [7] 1220.732718 3847.346382  133.501269  366.974527  374.047748 2190.454684
[13] 2618.405479 1374.522281 2136.709717  310.840035 1092.500898  149.348938
[19] 5511.401284 1028.926523   49.226886 2277.416373 3325.535222 2424.518544
[25] 2197.268312  312.849678   98.728312    5.035472 2245.699495  204.632789
[31] 1342.318524 1020.509301   50.555798  486.528715  238.652955 1325.240936
[37]  387.985976 3743.715551 2120.339195  478.410680  129.231863  737.869088
[43] 1260.284126  349.756229  108.589205   12.120637 1751.390576  105.146313
[49]  945.975136 1065.032863  682.400894  381.188468  216.938103 1182.666663
[55]  490.299892  473.135438 3784.966551 2491.565878 3132.872769  684.164175
[61]  451.644584  979.986834  100.233022 1119.798005 2263.940346  191.190219
[67] 1356.548772 4169.464485  476.890357 4221.458436  463.506758  156.268529
[73] 1445.317622  355.040796  660.106025  276.345231  472.650822  887.775319
[79] 2220.225985  948.792361  739.779334 1258.795164 3270.307911 6230.519193
[85]  613.507052
 [ reached getOption("max.print") -- omitted 915 entries ]

Plotting a histogram

Plot a histogram and compare it with a histogram of your original variable.

par(mfrow = c(1, 2))
hist(optimal.value, breaks = 50, xlim = c(0, 6000), main = "Exponential dist. of GrLivArea", 
    col = "green")
hist(training_set$GrLivArea, breaks = 50, main = "The given Dist. of GrLivArea", 
    col = "blue")

Percentiles of CDF

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).

qexp(p = 0.05, rate = expon.pdf$estimate)

[1] 77.73313

qexp(p = 0.95, rate = expon.pdf$estimate)

[1] 4539.924

5th percentile is 77.73, 95th percentile is 4,539.92

Confidence Interval

Also generate a 95% confidence interval from the empirical data, assuming normality.

skim(training_set$GrLivArea)

Data summary
Name	training_set$GrLivArea
Number of rows	1460
Number of columns	1
_______________________
Column type frequency:
numeric	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
data	0	1	1515.46	525.48	334	1129.5	1464	1776.75	5642	▇▇▁▁▁

CI(training_set$GrLivArea, ci = 0.95)

   upper     mean    lower 
1542.440 1515.464 1488.487

95% CI is (1488.49, 1542.44)

Empirical Percentiles

Finally, provide the empirical 5th percentile and 95th percentile of the data.

quantile(training_set$GrLivArea, 0.05)

 5% 
848

quantile(training_set$GrLivArea, 0.95)

   95% 
2466.1

5th percentile is 848 and 95th percentile is 2466.

Modeling

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis.

Training on the training_set

train <- training_set %>% dplyr::select(-Id) 

# registerDoSEQ()

cv <- caret::trainControl(method="cv", number = 10, preProc = c("scale"))
# 
# Fit an ElasticNet Model
houses.glm <- train(SalePrice ~ ., data = train,
                    method = "glmnet", trControl = cv,
                    tuneLength=10
                    # tuneLength parameter defines the total number of parameter combinations that will be evaluated
                   )

Loading test set

test_set <- readr::read_csv("test.csv")

Treat the test set

# use is.character to filter for the fields that are character fields
factors <- test_set %>% select_if(is.character) %>% colnames()
test_set[factors] <- lapply(test_set[factors], factor)
# assign '(Missing)', an explicit factor level to missing values
test_set[factors] <- lapply(test_set[factors], fct_explicit_na)
skim(test_set)

Data summary
Name	test_set
Number of rows	1459
Number of columns	80
_______________________
Column type frequency:
factor	43
numeric	37
________________________
Group variables	None

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
MSZoning	1	FALSE	6	RL: 1114, RM: 242, FV: 74, C (: 15
Street	1	FALSE	2	Pav: 1453, Grv: 6
Alley	1	FALSE	3	(Mi: 1352, Grv: 70, Pav: 37
LotShape	1	FALSE	4	Reg: 934, IR1: 484, IR2: 35, IR3: 6
LandContour	1	FALSE	4	Lvl: 1311, HLS: 70, Bnk: 54, Low: 24
Utilities	1	FALSE	2	All: 1457, (Mi: 2
LotConfig	1	FALSE	5	Ins: 1081, Cor: 248, Cul: 82, FR2: 38
LandSlope	1	FALSE	3	Gtl: 1396, Mod: 60, Sev: 3
Neighborhood	1	FALSE	25	NAm: 218, Old: 126, Col: 117, Som: 96
Condition1	1	FALSE	9	Nor: 1251, Fee: 83, Art: 44, RRA: 24
Condition2	1	FALSE	5	Nor: 1444, Fee: 7, Art: 3, Pos: 3
BldgType	1	FALSE	5	1Fa: 1205, Twn: 113, Dup: 57, Twn: 53
HouseStyle	1	FALSE	7	1St: 745, 2St: 427, 1.5: 160, SLv: 63
RoofStyle	1	FALSE	6	Gab: 1169, Hip: 265, Gam: 11, Fla: 7
RoofMatl	1	FALSE	4	Com: 1442, Tar: 12, WdS: 4, WdS: 1
Exterior1st	1	FALSE	14	Vin: 510, Met: 230, HdB: 220, Wd : 205
Exterior2nd	1	FALSE	16	Vin: 510, Met: 233, HdB: 199, Wd : 194
MasVnrType	1	FALSE	5	Non: 878, Brk: 434, Sto: 121, (Mi: 16
ExterQual	1	FALSE	4	TA: 892, Gd: 491, Ex: 55, Fa: 21
ExterCond	1	FALSE	5	TA: 1256, Gd: 153, Fa: 39, Ex: 9
Foundation	1	FALSE	6	PCo: 661, CBl: 601, Brk: 165, Sla: 25
BsmtQual	1	FALSE	5	TA: 634, Gd: 591, Ex: 137, Fa: 53
BsmtCond	1	FALSE	5	TA: 1295, Fa: 59, Gd: 57, (Mi: 45
BsmtExposure	1	FALSE	5	No: 951, Av: 197, Gd: 142, Mn: 125
BsmtFinType1	1	FALSE	7	GLQ: 431, Unf: 421, ALQ: 209, Rec: 155
BsmtFinType2	1	FALSE	7	Unf: 1237, Rec: 51, (Mi: 42, LwQ: 41
Heating	1	FALSE	4	Gas: 1446, Gas: 9, Gra: 2, Wal: 2
HeatingQC	1	FALSE	5	Ex: 752, TA: 429, Gd: 233, Fa: 43
CentralAir	1	FALSE	2	Y: 1358, N: 101
Electrical	1	FALSE	4	SBr: 1337, Fus: 94, Fus: 23, Fus: 5
KitchenQual	1	FALSE	5	TA: 757, Gd: 565, Ex: 105, Fa: 31
Functional	1	FALSE	8	Typ: 1357, Min: 36, Min: 34, Mod: 20
FireplaceQu	1	FALSE	6	(Mi: 730, Gd: 364, TA: 279, Fa: 41
GarageType	1	FALSE	7	Att: 853, Det: 392, Bui: 98, (Mi: 76
GarageFinish	1	FALSE	4	Unf: 625, RFn: 389, Fin: 367, (Mi: 78
GarageQual	1	FALSE	5	TA: 1293, (Mi: 78, Fa: 76, Gd: 10
GarageCond	1	FALSE	6	TA: 1328, (Mi: 78, Fa: 39, Po: 7
PavedDrive	1	FALSE	3	Y: 1301, N: 126, P: 32
PoolQC	1	FALSE	3	(Mi: 1456, Ex: 2, Gd: 1
Fence	1	FALSE	5	(Mi: 1169, MnP: 172, GdP: 59, GdW: 58
MiscFeature	1	FALSE	4	(Mi: 1408, She: 46, Gar: 3, Oth: 2
SaleType	1	FALSE	10	WD: 1258, New: 117, COD: 44, Con: 17
SaleCondition	1	FALSE	6	Nor: 1204, Par: 120, Abn: 89, Fam: 26

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Id	0	1.00	2190.00	421.32	1461	1825.50	2190.0	2554.50	2919	▇▇▇▇▇
MSSubClass	0	1.00	57.38	42.75	20	20.00	50.0	70.00	190	▇▅▂▁▁
LotFrontage	227	0.84	68.58	22.38	21	58.00	67.0	80.00	200	▃▇▁▁▁
LotArea	0	1.00	9819.16	4955.52	1470	7391.00	9399.0	11517.50	56600	▇▂▁▁▁
OverallQual	0	1.00	6.08	1.44	1	5.00	6.0	7.00	10	▁▁▇▅▁
OverallCond	0	1.00	5.55	1.11	1	5.00	5.0	6.00	9	▁▁▇▅▁
YearBuilt	0	1.00	1971.36	30.39	1879	1953.00	1973.0	2001.00	2010	▁▂▃▆▇
YearRemodAdd	0	1.00	1983.66	21.13	1950	1963.00	1992.0	2004.00	2010	▅▂▂▃▇
MasVnrArea	15	0.99	100.71	177.63	0	0.00	0.0	164.00	1290	▇▁▁▁▁
BsmtFinSF1	1	1.00	439.20	455.27	0	0.00	350.5	753.50	4010	▇▂▁▁▁
BsmtFinSF2	1	1.00	52.62	176.75	0	0.00	0.0	0.00	1526	▇▁▁▁▁
BsmtUnfSF	1	1.00	554.29	437.26	0	219.25	460.0	797.75	2140	▇▆▂▁▁
TotalBsmtSF	1	1.00	1046.12	442.90	0	784.00	988.0	1305.00	5095	▇▇▁▁▁
1stFlrSF	0	1.00	1156.53	398.17	407	873.50	1079.0	1382.50	5095	▇▃▁▁▁
2ndFlrSF	0	1.00	325.97	420.61	0	0.00	0.0	676.00	1862	▇▃▂▁▁
LowQualFinSF	0	1.00	3.54	44.04	0	0.00	0.0	0.00	1064	▇▁▁▁▁
GrLivArea	0	1.00	1486.05	485.57	407	1117.50	1432.0	1721.00	5095	▇▇▁▁▁
BsmtFullBath	2	1.00	0.43	0.53	0	0.00	0.0	1.00	3	▇▆▁▁▁
BsmtHalfBath	2	1.00	0.07	0.25	0	0.00	0.0	0.00	2	▇▁▁▁▁
FullBath	0	1.00	1.57	0.56	0	1.00	2.0	2.00	4	▁▇▇▁▁
HalfBath	0	1.00	0.38	0.50	0	0.00	0.0	1.00	2	▇▁▅▁▁
BedroomAbvGr	0	1.00	2.85	0.83	0	2.00	3.0	3.00	6	▁▃▇▂▁
KitchenAbvGr	0	1.00	1.04	0.21	0	1.00	1.0	1.00	2	▁▁▇▁▁
TotRmsAbvGrd	0	1.00	6.39	1.51	3	5.00	6.0	7.00	15	▅▇▃▁▁
Fireplaces	0	1.00	0.58	0.65	0	0.00	0.0	1.00	4	▇▇▁▁▁
GarageYrBlt	78	0.95	1977.72	26.43	1895	1959.00	1979.0	2002.00	2207	▂▇▁▁▁
GarageCars	1	1.00	1.77	0.78	0	1.00	2.0	2.00	5	▅▇▂▁▁
GarageArea	1	1.00	472.77	217.05	0	318.00	480.0	576.00	1488	▃▇▃▁▁
WoodDeckSF	0	1.00	93.17	127.74	0	0.00	0.0	168.00	1424	▇▁▁▁▁
OpenPorchSF	0	1.00	48.31	68.88	0	0.00	28.0	72.00	742	▇▁▁▁▁
EnclosedPorch	0	1.00	24.24	67.23	0	0.00	0.0	0.00	1012	▇▁▁▁▁
3SsnPorch	0	1.00	1.79	20.21	0	0.00	0.0	0.00	360	▇▁▁▁▁
ScreenPorch	0	1.00	17.06	56.61	0	0.00	0.0	0.00	576	▇▁▁▁▁
PoolArea	0	1.00	1.74	30.49	0	0.00	0.0	0.00	800	▇▁▁▁▁
MiscVal	0	1.00	58.17	630.81	0	0.00	0.0	0.00	17000	▇▁▁▁▁
MoSold	0	1.00	6.10	2.72	1	4.00	6.0	8.00	12	▅▆▇▃▃
YrSold	0	1.00	2007.77	1.30	2006	2007.00	2008.0	2009.00	2010	▇▇▇▇▃

# repalcing numeric NA with 0 Note that the following won't work without the
# tilde symbol to spread it across all numeric variables
test_set <- test_set %>% mutate_if(is.numeric, ~replace(., is.na(.), 0))

# combining levels in test set and training set using fct_c
for (c in factors) {
    levels(test_set[[c]]) <- fct_c(test_set[[c]], training_set[[c]])
}

Make Predictions

houses_pred <- predict(houses.glm, test_set)

Submissions

submission <- tibble(ID = test_set$Id, SalePrice = houses_pred)
readr::write_csv(submission, "submission.csv")

Notice that Root Mean Squared Log Error (RMSLE) was what is used by Kaggle to compute the final score.

Here is a resource of the difference between RMSLE and Root Mean Squared Error (RMSE).

https://medium.com/analytics-vidhya/root-mean-square-log-error-rmse-vs-rmlse-935c6cc1802a

Model Summary and plot of Cross Validations

houses.glm

glmnet 

1460 samples
  79 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 1313, 1315, 1314, 1313, 1314, 1313, ... 
Resampling results across tuning parameters:

  alpha  lambda       RMSE      Rsquared   MAE     
  0.1       29.02263  37901.68  0.7911614  18804.77
  0.1       67.04604  37870.08  0.7913663  18788.79
  0.1      154.88506  37031.02  0.7980308  18549.84
  0.1      357.80458  35168.12  0.8140918  18260.48
  0.1      826.57500  33337.03  0.8299189  17969.17
  0.1     1909.49551  32177.35  0.8385799  17767.79
  0.1     4411.18240  31486.86  0.8426695  17587.48
  0.1    10190.40372  31379.97  0.8431731  17786.09
  0.1    23541.15487  32791.96  0.8351808  18949.56
  0.1    54383.12240  36826.08  0.8161039  22048.58
  0.2       29.02263  38009.94  0.7908257  18876.21
  0.2       67.04604  38160.72  0.7886583  18725.93
  0.2      154.88506  36965.71  0.7984914  18439.51
  0.2      357.80458  34818.51  0.8169184  18081.24
  0.2      826.57500  32949.30  0.8326259  17807.44
  0.2     1909.49551  31973.21  0.8392522  17698.05
  0.2     4411.18240  31533.40  0.8415426  17719.75
 [ reached getOption("max.print") -- omitted 83 rows ]

RMSE was used to select the optimal model using the smallest value.
The final values used for the model were alpha = 0.1 and lambda = 10190.4.

# printing out the record with the lowest RMSE
paste("The lowest RMSE is: ", houses.glm$results[which.min(houses.glm$results[, "RMSE"]), 
    "RMSE"])

[1] "The lowest RMSE is:  31379.9725540485"

ggplot(houses.glm)