# Necessary libraries
library(ggplot2)
library(knitr)
library(tidyverse)
library(DataExplorer)
library(GGally)
library(Matrix)
library(matrixcalc)
library(Rmisc)
library(ggpubr)
Problem 1.
Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean and standard deviation of \(\mu = \sigma = \frac{N+1}{2}\).
# set seed
set.seed(786)
# define variables
N <- 15
mu <- (N+1)/2
sd <- mu
# X has 10,000 random uniform numbers from 1 to N
X <- runif(n = 10000, min = 1, max = N)
# Y has 10,000 random normal numbers
Y <- rnorm(n = 10000, mean = mu, sd = sd)
Probability.
Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.
# x is estimated as the median of the X variable
x <- median(X)
# y is estimated as the 1st quartile of the Y variable
y <- summary(Y)[2][[1]]
P(X>x | X>y)
## [1] 0.561924
The probability that X is greater than the median of X, given that X is greater than the first quartile of Y, is 0.56.
P(X>x, Y>y)
## [1] 0.3735
The probability that X is greater than the median of X and Y is greater than the first quartile of Y is 0.37.
P(X<x | X>y)
## [1] 0.438076
The probability that X is less than the median of X, given that X is greater than the first quartile of Y, is 0.438 (the complement of the first probability).
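The chunk that computed these probabilities is not echoed above; a minimal sketch that reproduces them from the simulated vectors (assuming the X, Y, x, and y defined earlier):

# a. P(X>x | X>y): restrict to the conditioning event X>y
sum(X > x & X > y) / sum(X > y)
# b. P(X>x, Y>y): joint probability over all 10,000 draws
sum(X > x & Y > y) / length(X)
# c. P(X<x | X>y): complement of a. within the conditioning event
sum(X < x & X > y) / sum(X > y)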
Marginal and Joint Probabilities
Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.
# build the 3x3 table of counts for X relative to x and Y relative to y
table <- c(sum(X < x & Y < y),
           sum(X < x & Y == y),
           sum(X < x & Y > y))
table <- rbind(table,
               c(sum(X == x & Y < y),
                 sum(X == x & Y == y),
                 sum(X == x & Y > y)))
table <- rbind(table,
               c(sum(X > x & Y < y),
                 sum(X > x & Y == y),
                 sum(X > x & Y > y)))
# add row totals as a new column
table <- cbind(table, table[,1] + table[,2] + table[,3])
# add column totals as a new row
table <- rbind(table, table[1,] + table[2,] + table[3,])
# rename columns
colnames(table) <- c("Y<y", "Y=y", "Y>y", "Total")
# rename rows
rownames(table) <- c("X<x", "X=x", "X>x", "Total")
kable(table)
|       | Y<y  | Y=y | Y>y  | Total |
|-------|------|-----|------|-------|
| X<x   | 1235 | 0   | 3765 | 5000  |
| X=x   | 0    | 0   | 0    | 0     |
| X>x   | 1265 | 0   | 3735 | 5000  |
| Total | 2500 | 0   | 7500 | 10000 |
Based on the table above, the two required probabilities below are approximately equal:
## [1] 0.3735
## [1] 0.375
P(X>x and Y>y) = 0.3735
P(X>x)P(Y>y) = 0.5 * 0.75 = 0.375
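A sketch of how this comparison can be computed directly from the simulated vectors:

# joint probability P(X>x and Y>y)
sum(X > x & Y > y) / length(X)
# product of the marginal probabilities P(X>x) * P(Y>y)
mean(X > x) * mean(Y > y)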
Independence
Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?
Fisher’s Exact Test
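The data line in the output below names the call; a sketch of it:

# Fisher's Exact Test on the 2x2 table of indicator events
fisher.test(table(X > x, Y > y))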
##
## Fisher's Exact Test for Count Data
##
## data: table(X > x, Y > y)
## p-value = 0.503
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.8837534 1.0614009
## sample estimates:
## odds ratio
## 0.9684979
Fisher's Exact Test shows a large p-value (0.50), so we fail to reject the null hypothesis of independence.
Chi Square Test
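The corresponding call, again sketched from the data line in the output:

# Pearson's Chi-squared test (with Yates' continuity correction by default)
chisq.test(table(X > x, Y > y))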
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(X > x, Y > y)
## X-squared = 0.44853, df = 1, p-value = 0.503
The Chi-Square Test also shows a large p-value (0.50), so we again fail to reject the null hypothesis.
Therefore, we fail to reject the null hypothesis and conclude that the two events are independent. Fisher's Exact Test is an exact test typically used when sample sizes or cell counts are small. The Chi-Square Test is an approximation that works well when the counts in the contingency table are large, and it serves both contingency-table and goodness-of-fit testing. Since the cell counts here are large, the Chi-Square Test is the more appropriate choice, although both tests agree.
Problem 2
You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques. I want you to do the following.
# read data from github
train_URL <- "https://raw.githubusercontent.com/amit-kapoor/data605/master/FinalProject/train.csv"
test_URL <- "https://raw.githubusercontent.com/amit-kapoor/data605/master/FinalProject/test.csv"
# create data frame for training and testing dataset
train <- read.csv(train_URL)
test <- read.csv(test_URL)
Descriptive and Inferential Statistics.
Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
Let's first explore the training dataset. It has 1,460 rows and 81 columns.
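The output below comes from calls along these lines (a sketch):

# dimensions and structure of the training set
dim(train)
glimpse(train)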
## [1] 1460 81
## Observations: 1,460
## Variables: 81
## $ Id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ MSSubClass <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20…
## $ MSZoning <fct> RL, RL, RL, RL, RL, RL, RL, RL, RM, RL, RL, RL, RL, RL,…
## $ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91,…
## $ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 61…
## $ Street <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, P…
## $ Alley <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ LotShape <fct> Reg, Reg, IR1, IR1, IR1, IR1, Reg, IR1, Reg, Reg, Reg, …
## $ LandContour <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, …
## $ Utilities <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllPub,…
## $ LotConfig <fct> Inside, FR2, Inside, Corner, FR2, Inside, Inside, Corne…
## $ LandSlope <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, …
## $ Neighborhood <fct> CollgCr, Veenker, CollgCr, Crawfor, NoRidge, Mitchel, S…
## $ Condition1 <fct> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, PosN, Artery…
## $ Condition2 <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, A…
## $ BldgType <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 2…
## $ HouseStyle <fct> 2Story, 1Story, 2Story, 2Story, 2Story, 1.5Fin, 1Story,…
## $ OverallQual <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5…
## $ OverallCond <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5…
## $ YearBuilt <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 1…
## $ YearRemodAdd <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 1…
## $ RoofStyle <fct> Gable, Gable, Gable, Gable, Gable, Gable, Gable, Gable,…
## $ RoofMatl <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg, C…
## $ Exterior1st <fct> VinylSd, MetalSd, VinylSd, Wd Sdng, VinylSd, VinylSd, V…
## $ Exterior2nd <fct> VinylSd, MetalSd, VinylSd, Wd Shng, VinylSd, VinylSd, V…
## $ MasVnrType <fct> BrkFace, None, BrkFace, None, BrkFace, None, Stone, Sto…
## $ MasVnrArea <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306,…
## $ ExterQual <fct> Gd, TA, Gd, TA, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd,…
## $ ExterCond <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA,…
## $ Foundation <fct> PConc, CBlock, PConc, BrkTil, PConc, Wood, PConc, CBloc…
## $ BsmtQual <fct> Gd, Gd, Gd, TA, Gd, Gd, Ex, Gd, TA, TA, TA, Ex, TA, Gd,…
## $ BsmtCond <fct> TA, TA, TA, Gd, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA,…
## $ BsmtExposure <fct> No, Gd, Mn, No, Av, No, Av, Mn, No, No, No, No, No, Av,…
## $ BsmtFinType1 <fct> GLQ, ALQ, GLQ, ALQ, GLQ, GLQ, GLQ, ALQ, Unf, GLQ, Rec, …
## $ BsmtFinSF1 <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 9…
## $ BsmtFinType2 <fct> Unf, Unf, Unf, Unf, Unf, Unf, Unf, BLQ, Unf, Unf, Unf, …
## $ BsmtFinSF2 <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ BsmtUnfSF <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 1…
## $ TotalBsmtSF <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 1…
## $ Heating <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, G…
## $ HeatingQC <fct> Ex, Ex, Ex, Gd, Ex, Ex, Ex, Ex, Gd, Ex, Ex, Ex, TA, Ex,…
## $ CentralAir <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y…
## $ Electrical <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr,…
## $ X1stFlrSF <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077,…
## $ X2ndFlrSF <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0…
## $ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ GrLivArea <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 1…
## $ BsmtFullBath <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1…
## $ BsmtHalfBath <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ FullBath <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1…
## $ HalfBath <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1…
## $ BedroomAbvGr <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3…
## $ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1…
## $ KitchenQual <fct> Gd, TA, Gd, Gd, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd,…
## $ TotRmsAbvGrd <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, …
## $ Functional <fct> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Min1, Typ, Typ,…
## $ Fireplaces <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0…
## $ FireplaceQu <fct> NA, TA, TA, Gd, TA, NA, Gd, TA, TA, TA, NA, Gd, NA, Gd,…
## $ GarageType <fct> Attchd, Attchd, Attchd, Detchd, Attchd, Attchd, Attchd,…
## $ GarageYrBlt <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 1…
## $ GarageFinish <fct> RFn, RFn, RFn, Unf, RFn, Unf, RFn, RFn, Unf, RFn, Unf, …
## $ GarageCars <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2…
## $ GarageArea <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, …
## $ GarageQual <fct> TA, TA, TA, TA, TA, TA, TA, TA, Fa, Gd, TA, TA, TA, TA,…
## $ GarageCond <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA,…
## $ PavedDrive <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y…
## $ WoodDeckSF <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 16…
## $ OpenPorchSF <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0,…
## $ X3SsnPorch <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ ScreenPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0,…
## $ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ PoolQC <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Fence <fct> NA, NA, NA, NA, NA, MnPrv, NA, NA, NA, NA, NA, NA, NA, …
## $ MiscFeature <fct> NA, NA, NA, NA, NA, Shed, NA, Shed, NA, NA, NA, NA, NA,…
## $ MiscVal <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700…
## $ MoSold <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 1…
## $ YrSold <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 2…
## $ SaleType <fct> WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, New, WD, Ne…
## $ SaleCondition <fct> Normal, Normal, Normal, Abnorml, Normal, Normal, Normal…
## $ SalePrice <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000,…
In the next step, the Id column is removed from the dataset and a summary of the remaining variables is produced.
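A minimal sketch of that step:

# drop the identifier column and summarize the remaining variables
train <- dplyr::select(train, -Id)
summary(train)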
## MSSubClass MSZoning LotFrontage LotArea Street
## Min. : 20.0 C (all): 10 Min. : 21.00 Min. : 1300 Grvl: 6
## 1st Qu.: 20.0 FV : 65 1st Qu.: 59.00 1st Qu.: 7554 Pave:1454
## Median : 50.0 RH : 16 Median : 69.00 Median : 9478
## Mean : 56.9 RL :1151 Mean : 70.05 Mean : 10517
## 3rd Qu.: 70.0 RM : 218 3rd Qu.: 80.00 3rd Qu.: 11602
## Max. :190.0 Max. :313.00 Max. :215245
## NA's :259
## Alley LotShape LandContour Utilities LotConfig LandSlope
## Grvl: 50 IR1:484 Bnk: 63 AllPub:1459 Corner : 263 Gtl:1382
## Pave: 41 IR2: 41 HLS: 50 NoSeWa: 1 CulDSac: 94 Mod: 65
## NA's:1369 IR3: 10 Low: 36 FR2 : 47 Sev: 13
## Reg:925 Lvl:1311 FR3 : 4
## Inside :1052
##
##
## Neighborhood Condition1 Condition2 BldgType HouseStyle
## NAmes :225 Norm :1260 Norm :1445 1Fam :1220 1Story :726
## CollgCr:150 Feedr : 81 Feedr : 6 2fmCon: 31 2Story :445
## OldTown:113 Artery : 48 Artery : 2 Duplex: 52 1.5Fin :154
## Edwards:100 RRAn : 26 PosN : 2 Twnhs : 43 SLvl : 65
## Somerst: 86 PosN : 19 RRNn : 2 TwnhsE: 114 SFoyer : 37
## Gilbert: 79 RRAe : 11 PosA : 1 1.5Unf : 14
## (Other):707 (Other): 15 (Other): 2 (Other): 19
## OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle
## Min. : 1.000 Min. :1.000 Min. :1872 Min. :1950 Flat : 13
## 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954 1st Qu.:1967 Gable :1141
## Median : 6.000 Median :5.000 Median :1973 Median :1994 Gambrel: 11
## Mean : 6.099 Mean :5.575 Mean :1971 Mean :1985 Hip : 286
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000 3rd Qu.:2004 Mansard: 7
## Max. :10.000 Max. :9.000 Max. :2010 Max. :2010 Shed : 2
##
## RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea
## CompShg:1434 VinylSd:515 VinylSd:504 BrkCmn : 15 Min. : 0.0
## Tar&Grv: 11 HdBoard:222 MetalSd:214 BrkFace:445 1st Qu.: 0.0
## WdShngl: 6 MetalSd:220 HdBoard:207 None :864 Median : 0.0
## WdShake: 5 Wd Sdng:206 Wd Sdng:197 Stone :128 Mean : 103.7
## ClyTile: 1 Plywood:108 Plywood:142 NA's : 8 3rd Qu.: 166.0
## Membran: 1 CemntBd: 61 CmentBd: 60 Max. :1600.0
## (Other): 2 (Other):128 (Other):136 NA's :8
## ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## Ex: 52 Ex: 3 BrkTil:146 Ex :121 Fa : 45 Av :221
## Fa: 14 Fa: 28 CBlock:634 Fa : 35 Gd : 65 Gd :134
## Gd:488 Gd: 146 PConc :647 Gd :618 Po : 2 Mn :114
## TA:906 Po: 1 Slab : 24 TA :649 TA :1311 No :953
## TA:1282 Stone : 6 NA's: 37 NA's: 37 NA's: 38
## Wood : 3
##
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
## ALQ :220 Min. : 0.0 ALQ : 19 Min. : 0.00 Min. : 0.0
## BLQ :148 1st Qu.: 0.0 BLQ : 33 1st Qu.: 0.00 1st Qu.: 223.0
## GLQ :418 Median : 383.5 GLQ : 14 Median : 0.00 Median : 477.5
## LwQ : 74 Mean : 443.6 LwQ : 46 Mean : 46.55 Mean : 567.2
## Rec :133 3rd Qu.: 712.2 Rec : 54 3rd Qu.: 0.00 3rd Qu.: 808.0
## Unf :430 Max. :5644.0 Unf :1256 Max. :1474.00 Max. :2336.0
## NA's: 37 NA's: 38
## TotalBsmtSF Heating HeatingQC CentralAir Electrical X1stFlrSF
## Min. : 0.0 Floor: 1 Ex:741 N: 95 FuseA: 94 Min. : 334
## 1st Qu.: 795.8 GasA :1428 Fa: 49 Y:1365 FuseF: 27 1st Qu.: 882
## Median : 991.5 GasW : 18 Gd:241 FuseP: 3 Median :1087
## Mean :1057.4 Grav : 7 Po: 1 Mix : 1 Mean :1163
## 3rd Qu.:1298.2 OthW : 2 TA:428 SBrkr:1334 3rd Qu.:1391
## Max. :6110.0 Wall : 4 NA's : 1 Max. :4692
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces
## Min. :0.000 Ex:100 Min. : 2.000 Maj1: 14 Min. :0.000
## 1st Qu.:1.000 Fa: 39 1st Qu.: 5.000 Maj2: 5 1st Qu.:0.000
## Median :1.000 Gd:586 Median : 6.000 Min1: 31 Median :1.000
## Mean :1.047 TA:735 Mean : 6.518 Min2: 34 Mean :0.613
## 3rd Qu.:1.000 3rd Qu.: 7.000 Mod : 15 3rd Qu.:1.000
## Max. :3.000 Max. :14.000 Sev : 1 Max. :3.000
## Typ :1360
## FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## Ex : 24 2Types : 6 Min. :1900 Fin :352 Min. :0.000
## Fa : 33 Attchd :870 1st Qu.:1961 RFn :422 1st Qu.:1.000
## Gd :380 Basment: 19 Median :1980 Unf :605 Median :2.000
## Po : 20 BuiltIn: 88 Mean :1979 NA's: 81 Mean :1.767
## TA :313 CarPort: 9 3rd Qu.:2002 3rd Qu.:2.000
## NA's:690 Detchd :387 Max. :2010 Max. :4.000
## NA's : 81 NA's :81
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF
## Min. : 0.0 Ex : 3 Ex : 2 N: 90 Min. : 0.00
## 1st Qu.: 334.5 Fa : 48 Fa : 35 P: 30 1st Qu.: 0.00
## Median : 480.0 Gd : 14 Gd : 9 Y:1340 Median : 0.00
## Mean : 473.0 Po : 3 Po : 7 Mean : 94.24
## 3rd Qu.: 576.0 TA :1311 TA :1326 3rd Qu.:168.00
## Max. :1418.0 NA's: 81 NA's: 81 Max. :857.00
##
## OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 25.00 Median : 0.00 Median : 0.00 Median : 0.00
## Mean : 46.66 Mean : 21.95 Mean : 3.41 Mean : 15.06
## 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :547.00 Max. :552.00 Max. :508.00 Max. :480.00
##
## PoolArea PoolQC Fence MiscFeature MiscVal
## Min. : 0.000 Ex : 2 GdPrv: 59 Gar2: 2 Min. : 0.00
## 1st Qu.: 0.000 Fa : 2 GdWo : 54 Othr: 2 1st Qu.: 0.00
## Median : 0.000 Gd : 3 MnPrv: 157 Shed: 49 Median : 0.00
## Mean : 2.759 NA's:1453 MnWw : 11 TenC: 1 Mean : 43.49
## 3rd Qu.: 0.000 NA's :1179 NA's:1406 3rd Qu.: 0.00
## Max. :738.000 Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition SalePrice
## Min. : 1.000 Min. :2006 WD :1267 Abnorml: 101 Min. : 34900
## 1st Qu.: 5.000 1st Qu.:2007 New : 122 AdjLand: 4 1st Qu.:129975
## Median : 6.000 Median :2008 COD : 43 Alloca : 12 Median :163000
## Mean : 6.322 Mean :2008 ConLD : 9 Family : 20 Mean :180921
## 3rd Qu.: 8.000 3rd Qu.:2009 ConLI : 5 Normal :1198 3rd Qu.:214000
## Max. :12.000 Max. :2010 ConLw : 5 Partial: 125 Max. :755000
## (Other): 9
Next, we explore the distributions, mainly of the quantitative variables, by plotting histograms of the training variables.
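A sketch of the call, using plot_histogram() from the DataExplorer package loaded above:

# histograms for all continuous variables in the training set
plot_histogram(train)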
Now that we have a better understanding of the distributions, let's plot SalePrice (the response variable) against the other variables.
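The exact plotting call is not echoed; one way to produce such plots is DataExplorer's plot_boxplot(), which bins the by variable (a sketch):

# boxplots of each continuous variable against binned SalePrice
plot_boxplot(train, by = "SalePrice")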
## Warning: Removed 267 rows containing non-finite values (stat_boxplot).
## Warning: Removed 81 rows containing non-finite values (stat_boxplot).
Provide a scatterplot matrix for at least two of the independent variables and the dependent variable
In this step, we will plot scatter plots for all variables against the response variable.
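A sketch of the call, assuming DataExplorer's plot_scatterplot():

# scatter plots of each variable against the response
plot_scatterplot(train, by = "SalePrice")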
Derive a correlation matrix for any three quantitative variables in the dataset.
I have chosen GrLivArea, TotalBsmtSF, and SalePrice from the dataset for the correlation matrix.
# correlation matrix for GrLivArea, TotalBsmtSF, SalePrice
corr_matrix <- train %>%
  dplyr::select(GrLivArea, TotalBsmtSF, SalePrice) %>%
  cor() %>%
  as.matrix()
corr_matrix
## GrLivArea TotalBsmtSF SalePrice
## GrLivArea 1.0000000 0.4548682 0.7086245
## TotalBsmtSF 0.4548682 1.0000000 0.6135806
## SalePrice 0.7086245 0.6135806 1.0000000
In the next step, we use the ggpairs() function from the GGally package to build a scatterplot matrix. It draws a scatterplot for each pair of numeric variables in the lower-left part of the figure; the upper-right part shows the Pearson correlations, and the diagonal contains each variable's distribution.
train %>%
  dplyr::select(GrLivArea, TotalBsmtSF, SalePrice) %>%
  ggpairs(title = "Pairwise scatter plots")
Let's also see the correlation matrix plot using ggcorr(). The ggcorr() function visualizes the correlation of each pair of variables as a colored square. As shown below, all three pairs are strongly positively correlated.
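A sketch of the ggcorr() call for the three chosen variables:

# correlation heat map; label = TRUE prints the coefficients
train %>%
  dplyr::select(GrLivArea, TotalBsmtSF, SalePrice) %>%
  ggcorr(label = TRUE)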
Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis.
# Correlation Test for GrLivArea and TotalBsmtSF
cor.test(train$GrLivArea, train$TotalBsmtSF, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: train$GrLivArea and train$TotalBsmtSF
## t = 19.503, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4278380 0.4810855
## sample estimates:
## cor
## 0.4548682
# Correlation Test for GrLivArea and SalePrice
cor.test(train$GrLivArea, train$SalePrice, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: train$GrLivArea and train$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6915087 0.7249450
## sample estimates:
## cor
## 0.7086245
# Correlation Test for TotalBsmtSF and SalePrice
cor.test(train$TotalBsmtSF, train$SalePrice, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: train$TotalBsmtSF and train$SalePrice
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5922142 0.6340846
## sample estimates:
## cor
## 0.6135806
In all three tests above, we used an 80 percent confidence interval. Given the very small p-values in all three tests, we reject the null hypothesis and conclude that the true correlation is not 0 for each pair of selected variables.
Would you be worried about familywise error? Why or why not?
The familywise error rate (FWE or FWER) is the probability of coming to at least one false conclusion in a series of hypothesis tests; in other words, it is the probability of making at least one Type I error. The term "familywise" comes from the notion of a family of tests, the technical term for a series of tests performed on the same data.
In this case, I am not very worried about familywise error since the number of comparisons is small.
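The rate reported below follows from \(1 - (1 - \alpha)^m\) with \(\alpha = 0.05\) and \(m = 3\) tests; a sketch:

# familywise error rate for m independent tests at level alpha
alpha <- 0.05
m <- 3
paste("Familywise error rate is", 1 - (1 - alpha)^m)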
## [1] "Familywise error rate is 0.142625"
Linear Algebra and Correlation.
Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
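The chunk behind the output below is not echoed; a sketch that reproduces it, assuming corr_matrix from above and the Matrix package loaded at the top:

# precision matrix: inverse of the correlation matrix
precision_matrix <- solve(corr_matrix)
corr_matrix
precision_matrix
# both products should be the identity (rounded to clear floating-point noise)
round(corr_matrix %*% precision_matrix, 10)
round(precision_matrix %*% corr_matrix, 10)
# LU decomposition via Matrix::lu(), then expand into L and U factors
lu_decomp <- lu(corr_matrix)
str(lu_decomp)
elu <- expand(lu_decomp)
elu$L            # unit lower-triangular factor
elu$U            # upper-triangular factor
elu$L %*% elu$U  # should recover the original correlation matrix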
## GrLivArea TotalBsmtSF SalePrice
## GrLivArea 1.0000000 0.4548682 0.7086245
## TotalBsmtSF 0.4548682 1.0000000 0.6135806
## SalePrice 0.7086245 0.6135806 1.0000000
## GrLivArea TotalBsmtSF SalePrice
## GrLivArea 2.01124151 -0.06473842 -1.3854927
## TotalBsmtSF -0.06473842 1.60588442 -0.9394642
## SalePrice -1.38549273 -0.93946422 2.5582310
## GrLivArea TotalBsmtSF SalePrice
## GrLivArea 1 0 0
## TotalBsmtSF 0 1 0
## SalePrice 0 0 1
## GrLivArea TotalBsmtSF SalePrice
## GrLivArea 1 0 0
## TotalBsmtSF 0 1 0
## SalePrice 0 0 1
## 'MatrixFactorization' of Formal class 'denseLU' [package "Matrix"] with 4 slots
## ..@ x : num [1:9] 1 0.455 0.709 0.455 0.793 ...
## ..@ perm : int [1:3] 1 2 3
## ..@ Dimnames:List of 2
## .. ..$ : chr [1:3] "GrLivArea" "TotalBsmtSF" "SalePrice"
## .. ..$ : chr [1:3] "GrLivArea" "TotalBsmtSF" "SalePrice"
## ..@ Dim : int [1:2] 3 3
## 3 x 3 Matrix of class "dtrMatrix" (unitriangular)
## [,1] [,2] [,3]
## [1,] 1.0000000 . .
## [2,] 0.4548682 1.0000000 .
## [3,] 0.7086245 0.3672320 1.0000000
## 3 x 3 Matrix of class "dtrMatrix"
## [,1] [,2] [,3]
## [1,] 1.0000000 0.4548682 0.7086245
## [2,] . 0.7930949 0.2912498
## [3,] . . 0.3908951
## 3 x 3 Matrix of class "dgeMatrix"
## [,1] [,2] [,3]
## [1,] 1.0000000 0.4548682 0.7086245
## [2,] 0.4548682 1.0000000 0.6135806
## [3,] 0.7086245 0.6135806 1.0000000
Multiplying L and U recovers the original correlation matrix, confirming the LU decomposition.
Calculus-Based Probability & Statistics.
Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
In the given training dataset, GrLivArea is a right-skewed variable.
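Its summary statistics confirm the skew (the mean exceeds the median):

# five-number summary plus mean for GrLivArea
summary(train$GrLivArea)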
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1130 1464 1515 1777 5642
We do not need to shift GrLivArea since its minimum value (334) is already above zero.
Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).
# load MASS package
library(MASS)
# run fitdistr to fit an exponential probability density function
distr <- fitdistr(train$GrLivArea, densfun = 'exponential')
distr
## rate
## 6.598640e-04
## (1.726943e-05)
Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)). Plot a histogram and compare it with a histogram of your original variable
# optimal value of lambda (the fitted rate)
lambda <- distr$estimate
# take 1000 samples from the fitted exponential distribution
exp_dist <- rexp(1000, lambda)
# summary
summary(exp_dist)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.237 423.353 1043.306 1483.581 2032.512 14359.975
# plot histogram of exp_dist
plot_histogram(exp_dist, title ="Exponential Distribution of GrLivArea")
# plot histogram of original GrLivArea
plot_histogram(train$GrLivArea, title ="Original Distribution of GrLivArea")
Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
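A sketch of calls that yield the output below, using qexp() for the exponential percentiles, Rmisc's CI() for the normal-theory interval, and quantile() for the empirical percentiles (assuming lambda from the fit above):

# 5th and 95th percentiles of the fitted exponential distribution
qexp(0.05, rate = lambda)
qexp(0.95, rate = lambda)
# 95% confidence interval for the mean of GrLivArea, assuming normality
CI(train$GrLivArea, ci = 0.95)
# empirical 5th and 95th percentiles of the data
quantile(train$GrLivArea, probs = c(0.05, 0.95))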
## [1] 77.73313
## [1] 4539.924
## upper mean lower
## 1542.440 1515.464 1488.487
## 5% 95%
## 848.0 2466.1
We see that the exponential model is more skewed than the observed data, as the corresponding histograms also showed. The empirical 5th and 95th percentiles (848 and 2,466) are far narrower than the exponential ones (about 78 and 4,540), so the exponential distribution is a poor fit for GrLivArea.
Modeling.
Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
First, let's see how close our response variable SalePrice is to a normal distribution.
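A quick sketch of that check, reusing DataExplorer's histogram helper:

# distribution of the response variable
plot_histogram(train$SalePrice, title = "Distribution of SalePrice")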
It seems that no major transformation is needed for the response variable.
We examined a heat map based on the correlation matrix earlier. We will now identify which predictors have strong correlations with the response variable. Let's see the correlation of each numeric predictor with SalePrice.
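A sketch of how these correlations can be computed (numeric columns only; NAs propagate for predictors with missing values):

# correlation of each numeric predictor with SalePrice
train %>%
  dplyr::select_if(is.numeric) %>%
  dplyr::select(-SalePrice) %>%
  as.matrix() %>%
  cor(train$SalePrice)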
## [,1]
## MSSubClass -0.08428414
## LotFrontage NA
## LotArea 0.26384335
## OverallQual 0.79098160
## OverallCond -0.07785589
## YearBuilt 0.52289733
## YearRemodAdd 0.50710097
## MasVnrArea NA
## BsmtFinSF1 0.38641981
## BsmtFinSF2 -0.01137812
## BsmtUnfSF 0.21447911
## TotalBsmtSF 0.61358055
## X1stFlrSF 0.60585218
## X2ndFlrSF 0.31933380
## LowQualFinSF -0.02560613
## GrLivArea 0.70862448
## BsmtFullBath 0.22712223
## BsmtHalfBath -0.01684415
## FullBath 0.56066376
## HalfBath 0.28410768
## BedroomAbvGr 0.16821315
## KitchenAbvGr -0.13590737
## TotRmsAbvGrd 0.53372316
## Fireplaces 0.46692884
## GarageYrBlt NA
## GarageCars 0.64040920
## GarageArea 0.62343144
## WoodDeckSF 0.32441344
## OpenPorchSF 0.31585623
## EnclosedPorch -0.12857796
## X3SsnPorch 0.04458367
## ScreenPorch 0.11144657
## PoolArea 0.09240355
## MiscVal -0.02118958
## MoSold 0.04643225
## YrSold -0.02892259
Now we take the variables with strong positive correlations (greater than 0.6) with SalePrice and use backward elimination to arrive at the features to include in our model.
train_sub <- train %>%
  dplyr::select(OverallQual, TotalBsmtSF, X1stFlrSF, GrLivArea,
                GarageCars, GarageArea, SalePrice)
mod <- lm(SalePrice ~ ., data = train_sub)
summary(mod)
##
## Call:
## lm(formula = SalePrice ~ ., data = train_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -473373 -19732 -1080 16922 288035
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.027e+05 4.904e+03 -20.932 < 2e-16 ***
## OverallQual 2.400e+04 1.083e+03 22.150 < 2e-16 ***
## TotalBsmtSF 2.439e+01 4.318e+00 5.649 1.94e-08 ***
## X1stFlrSF 1.119e+01 5.032e+00 2.223 0.0264 *
## GrLivArea 4.312e+01 2.679e+00 16.095 < 2e-16 ***
## GarageCars 1.452e+04 3.019e+03 4.809 1.68e-06 ***
## GarageArea 1.566e+01 1.047e+01 1.495 0.1350
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38840 on 1453 degrees of freedom
## Multiple R-squared: 0.7619, Adjusted R-squared: 0.7609
## F-statistic: 775 on 6 and 1453 DF, p-value: < 2.2e-16
The model summary shows a highly significant F-statistic and an adjusted R-squared of 0.76, meaning the model explains about 76% of the variability in SalePrice, which is quite good. GarageArea is not significant at the 0.05 level (p = 0.135) and would be the first candidate for removal under backward elimination. Next we will plot the model residuals and a Q-Q plot.
The residuals appear to follow an approximately normal distribution.
mod %>%
  ggplot(aes(sample = resid(mod))) +
  stat_qq() +
  stat_qq_line() +
  labs(title = "Q-Q Plot") +
  theme_minimal()
Finally, let's apply the model to the test data and make predictions.
# predict on the test dataset
test_results <- predict(mod, test)
prediction <- data.frame(Id = test[, "Id"], SalePrice = test_results)
# clip negative predictions and fill missing predictions with 0
prediction[prediction < 0] <- 0
prediction <- replace(prediction, is.na(prediction), 0)
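The predictions can then be written out for submission (hypothetical file name):

# write the Kaggle submission file
write.csv(prediction, "submission.csv", row.names = FALSE)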
Kaggle Submission and Score
Display Name: Amit Kapoor
User Name: amitkpr
Kaggle score: 0.77092
Kaggle submission