knitr::opts_chunk$set(echo = TRUE)
wd = getwd()
setwd(wd)

Importing Packages

libs = c("ggplot2", "corrplot", "dplyr", "MASS", "caret","corrgram", 
         "data.table","GGally", "matrixcalc")

if(any(libs%in%installed.packages() == F)){
  libs.1 = libs[which(libs%in%installed.packages() == F)]
  lapply(libs.1, FUN = install.packages, dependencies = T)
}
lapply(libs, library, character.only = T, quiet = T)
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'corrplot' was built under R version 4.0.3
## corrplot 0.84 loaded
## Warning: package 'dplyr' was built under R version 4.0.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Warning: package 'caret' was built under R version 4.0.3
## Warning: package 'corrgram' was built under R version 4.0.5
## 
## Attaching package: 'corrgram'
## The following object is masked from 'package:lattice':
## 
##     panel.fill
## Warning: package 'data.table' was built under R version 4.0.3
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## Warning: package 'GGally' was built under R version 4.0.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## Warning: package 'matrixcalc' was built under R version 4.0.3
## [[1]]
## [1] "ggplot2"   "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [7] "methods"   "base"     
## 
## [[2]]
## [1] "corrplot"  "ggplot2"   "stats"     "graphics"  "grDevices" "utils"    
## [7] "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "dplyr"     "corrplot"  "ggplot2"   "stats"     "graphics"  "grDevices"
##  [7] "utils"     "datasets"  "methods"   "base"     
## 
## [[4]]
##  [1] "MASS"      "dplyr"     "corrplot"  "ggplot2"   "stats"     "graphics" 
##  [7] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[5]]
##  [1] "caret"     "lattice"   "MASS"      "dplyr"     "corrplot"  "ggplot2"  
##  [7] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [13] "base"     
## 
## [[6]]
##  [1] "corrgram"  "caret"     "lattice"   "MASS"      "dplyr"     "corrplot" 
##  [7] "ggplot2"   "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [13] "methods"   "base"     
## 
## [[7]]
##  [1] "data.table" "corrgram"   "caret"      "lattice"    "MASS"      
##  [6] "dplyr"      "corrplot"   "ggplot2"    "stats"      "graphics"  
## [11] "grDevices"  "utils"      "datasets"   "methods"    "base"      
## 
## [[8]]
##  [1] "GGally"     "data.table" "corrgram"   "caret"      "lattice"   
##  [6] "MASS"       "dplyr"      "corrplot"   "ggplot2"    "stats"     
## [11] "graphics"   "grDevices"  "utils"      "datasets"   "methods"   
## [16] "base"      
## 
## [[9]]
##  [1] "matrixcalc" "GGally"     "data.table" "corrgram"   "caret"     
##  [6] "lattice"    "MASS"       "dplyr"      "corrplot"   "ggplot2"   
## [11] "stats"      "graphics"   "grDevices"  "utils"      "datasets"  
## [16] "methods"    "base"
options(digits=4)
rm(list=ls(all=TRUE))

Problem 1

Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of wer choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean of \[\mu=\sigma= \frac{N+1}{2}\ \]

Here we define N as 10,000.

set.seed(12345)
n <- 10000
X <- runif(10000, min=1, max=n)
mn <- (n+1)/2
Y <- rnorm(10000, mean = mn, sd = mn)
x <- median(X)
y <- as.numeric(quantile(Y, 0.25))
df <- cbind.data.frame(X,Y)

Probability

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable.
Interpret the meaning of all probabilities.

a <- round(length(X[X > x]) / length(X[X>y]),4)
a
## [1] 0.5989
b <- round(length(X[X > x & Y > y]) / length(X),4)
b
## [1] 0.3808
c <- round(length(X[X < x]) / length(X[X>y]),4)
c
## [1] 0.5989
  1. P(X>x | X>y) = 0.5989 The probability of variable X lager than its median or variable x lager than the 1st quartile of the Y variable is 59.38%.
  2. P(X>x, Y>y) = 0.3808 The probability of variable X lager than its median and at the same time, variable Y lager than its 1st quartile is 38.08%.
  3. P(X<x | X>y) = 0.5976 The probability of variable X smaller than its median or variable x lager than the 1st quartile of the Y variable is 59.76%.

Marginal and joint Probabilities

Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.

rownames = c('P(X>x)','P(X<=x)','Total')
colnames = c('P(Y>y)','P(Y<=y)','Total')
r1c1 = length(X[X > x & Y > y])
r2c1 = length(X[X <= x & Y > y])
r3c1 = r1c1 + r2c1
r1c2 = length(X[X > x & Y <= y])
r2c2 = length(X[X <= x & Y <= y])
r3c2 = r1c2 + r2c2
r1c3 = r1c1 + r1c2
r2c3 = r2c1 + r2c2
r3c3 = r1c3 + r2c3
matricsrandom <- matrix(c(r1c1,r2c1,r3c1,r1c2,r2c2,r3c2,r1c3,r2c3,r3c3),
nrow = 3,byrow=TRUE, dimnames=list(rownames,colnames))
A <- (r1c3/10000)*(r3c1/10000)
B <- r1c1/10000
knitr::kable(matricsrandom)
P(Y>y) P(Y<=y) Total
P(X>x) 3808 3692 7500
P(X<=x) 1192 1308 2500
Total 5000 5000 10000

From table P(X > x, Y > y) = 0.3808 and P(X > x) * P(Y > y) = (3808/5000)*(3808/7500)=0.3867 0.3807 < 0.3867. There fore we have P(X>x and Y>y) not equal to P(X>x)P(Y>y).

Independence test, Fisher’s Exact Test and the Chi Square Test

Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?

When we try to compare proportions of a categorical outcome according to different independent groups, we can consider several statistical tests such as chi-squared test, Fisher’s exact test, or z-test. The chi-squared test and Fisher’s exact test can assess for independence between two variables when the comparing groups are independent and not correlated. The chi-squared test applies an approximation assuming the sample is large, while the Fisher’s exact test runs an exact procedure especially for small-sized samples. Since we have 10000 samples, chi sqaured test is more appropiate.

Hypotheses Construction

H0: Independent (no association)

H1: Not independent (association)

Chi-squared test

The chi-squared test is used to compare the distribution of a categorical variable in a sample or a group with the distribution in another one. If the distribution of the categorical variable is not much different over different groups, we can conclude the distribution of the categorical variable is not related to the variable of groups. Or we can say the categorical variable and groups are independent.

Mat4test <- as.data.frame(matricsrandom)
Mat4test <- Mat4test[-c(3), -c(3)]
chisq.test(Mat4test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Mat4test
## X-squared = 7.1, df = 1, p-value = 0.008

The generated p-value from the Chi-squared test was less than 0.05 (significance level at 95%) therefore we fail to reject our null-hypothesis.

Fisher’s exact test

Fisher’s exact test is practically applied only in analysis of small samples but actually it is valid for all sample sizes. While the chi-squared test relies on an approximation, Fisher’s exact test is one of exact tests. Especially when more than 20% of cells have expected frequencies < 5, we need to use Fisher’s exact test because applying approximation method is inadequate. Fisher’s exact test assesses the null hypothesis of independence applying hypergeometric distribution of the numbers in the cells of the table.

fisher.test(Mat4test)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  Mat4test
## p-value = 0.008
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.033 1.240
## sample estimates:
## odds ratio 
##      1.132

Our p-value stayed the same using the Fisher’s exact test. And therefore we do not reject the H0.

Problem 2

Descriptive and Inferenctial Statistics

train <- read.csv("train.csv",TRUE, ",")
test <- read.csv("test.csv", TRUE, ",")

Data overview

Get the encoded variable data types and the observations, just by using glimpse().

glimpse(train)
## Rows: 1,460
## Columns: 81
## $ Id            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ MSSubClass    <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, ...
## $ MSZoning      <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM",...
## $ LotFrontage   <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 9...
## $ LotArea       <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, ...
## $ Street        <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave...
## $ Alley         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ LotShape      <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1...
## $ LandContour   <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl...
## $ Utilities     <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "Al...
## $ LotConfig     <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Inside",...
## $ LandSlope     <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl...
## $ Neighborhood  <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoRidge"...
## $ Condition1    <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Nor...
## $ Condition2    <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm...
## $ BldgType      <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam...
## $ HouseStyle    <chr> "2Story", "1Story", "2Story", "2Story", "2Story", "1....
## $ OverallQual   <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4,...
## $ OverallCond   <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5,...
## $ YearBuilt     <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931,...
## $ YearRemodAdd  <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950,...
## $ RoofStyle     <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Gable",...
## $ RoofMatl      <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg"...
## $ Exterior1st   <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "VinylSd"...
## $ Exterior2nd   <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "VinylSd"...
## $ MasVnrType    <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace", "Non...
## $ MasVnrArea    <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 30...
## $ ExterQual     <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", "TA",...
## $ ExterCond     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA",...
## $ Foundation    <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "Wood"...
## $ BsmtQual      <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", "TA",...
## $ BsmtCond      <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", "TA",...
## $ BsmtExposure  <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", "No",...
## $ BsmtFinType1  <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ...
## $ BsmtFinSF1    <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906,...
## $ BsmtFinType2  <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "BLQ...
## $ BsmtFinSF2    <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ BsmtUnfSF     <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134,...
## $ TotalBsmtSF   <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991,...
## $ Heating       <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA...
## $ HeatingQC     <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", "Gd",...
## $ CentralAir    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"...
## $ Electrical    <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr",...
## $ X1stFlrSF     <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 107...
## $ X2ndFlrSF     <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142,...
## $ LowQualFinSF  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ GrLivArea     <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774,...
## $ BsmtFullBath  <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,...
## $ BsmtHalfBath  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FullBath      <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2,...
## $ HalfBath      <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ BedroomAbvGr  <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2,...
## $ KitchenAbvGr  <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2,...
## $ KitchenQual   <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", "TA",...
## $ TotRmsAbvGrd  <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6...
## $ Functional    <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ...
## $ Fireplaces    <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0,...
## $ FireplaceQu   <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA", "TA...
## $ GarageType    <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd", "At...
## $ GarageYrBlt   <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931,...
## $ GarageFinish  <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn", "RFn...
## $ GarageCars    <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2,...
## $ GarageArea    <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384...
## $ GarageQual    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "Fa",...
## $ GarageCond    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA",...
## $ PavedDrive    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"...
## $ WoodDeckSF    <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, ...
## $ OpenPorchSF   <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 2...
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, ...
## $ X3SsnPorch    <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ ScreenPorch   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, ...
## $ PoolArea      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ PoolQC        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ Fence         <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, ...
## $ MiscFeature   <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, NA, N...
## $ MiscVal       <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 7...
## $ MoSold        <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3,...
## $ YrSold        <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008,...
## $ SaleType      <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD",...
## $ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal", "N...
## $ SalePrice     <int> 208500, 181500, 223500, 140000, 250000, 143000, 30700...
summary(train)
##        Id         MSSubClass      MSZoning          LotFrontage 
##  Min.   :   1   Min.   : 20.0   Length:1460        Min.   : 21  
##  1st Qu.: 366   1st Qu.: 20.0   Class :character   1st Qu.: 59  
##  Median : 730   Median : 50.0   Mode  :character   Median : 69  
##  Mean   : 730   Mean   : 56.9                      Mean   : 70  
##  3rd Qu.:1095   3rd Qu.: 70.0                      3rd Qu.: 80  
##  Max.   :1460   Max.   :190.0                      Max.   :313  
##                                                    NA's   :259  
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual    OverallCond     YearBuilt     YearRemodAdd 
##  Length:1460        Min.   : 1.0   Min.   :1.00   Min.   :1872   Min.   :1950  
##  Class :character   1st Qu.: 5.0   1st Qu.:5.00   1st Qu.:1954   1st Qu.:1967  
##  Mode  :character   Median : 6.0   Median :5.00   Median :1973   Median :1994  
##                     Mean   : 6.1   Mean   :5.58   Mean   :1971   Mean   :1985  
##                     3rd Qu.: 7.0   3rd Qu.:6.00   3rd Qu.:2000   3rd Qu.:2004  
##                     Max.   :10.0   Max.   :9.00   Max.   :2010   Max.   :2010  
##                                                                                
##   RoofStyle           RoofMatl         Exterior1st        Exterior2nd       
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   MasVnrType          MasVnrArea    ExterQual          ExterCond        
##  Length:1460        Min.   :   0   Length:1460        Length:1460       
##  Class :character   1st Qu.:   0   Class :character   Class :character  
##  Mode  :character   Median :   0   Mode  :character   Mode  :character  
##                     Mean   : 104                                        
##                     3rd Qu.: 166                                        
##                     Max.   :1600                                        
##                     NA's   :8                                           
##   Foundation          BsmtQual           BsmtCond         BsmtExposure      
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtFinType1         BsmtFinSF1   BsmtFinType2         BsmtFinSF2    
##  Length:1460        Min.   :   0   Length:1460        Min.   :   0.0  
##  Class :character   1st Qu.:   0   Class :character   1st Qu.:   0.0  
##  Mode  :character   Median : 384   Mode  :character   Median :   0.0  
##                     Mean   : 444                      Mean   :  46.5  
##                     3rd Qu.: 712                      3rd Qu.:   0.0  
##                     Max.   :5644                      Max.   :1474.0  
##                                                                       
##    BsmtUnfSF     TotalBsmtSF     Heating           HeatingQC        
##  Min.   :   0   Min.   :   0   Length:1460        Length:1460       
##  1st Qu.: 223   1st Qu.: 796   Class :character   Class :character  
##  Median : 478   Median : 992   Mode  :character   Mode  :character  
##  Mean   : 567   Mean   :1057                                        
##  3rd Qu.: 808   3rd Qu.:1298                                        
##  Max.   :2336   Max.   :6110                                        
##                                                                     
##   CentralAir         Electrical          X1stFlrSF      X2ndFlrSF   
##  Length:1460        Length:1460        Min.   : 334   Min.   :   0  
##  Class :character   Class :character   1st Qu.: 882   1st Qu.:   0  
##  Mode  :character   Mode  :character   Median :1087   Median :   0  
##                                        Mean   :1163   Mean   : 347  
##                                        3rd Qu.:1391   3rd Qu.: 728  
##                                        Max.   :4692   Max.   :2065  
##                                                                     
##   LowQualFinSF     GrLivArea     BsmtFullBath    BsmtHalfBath       FullBath   
##  Min.   :  0.0   Min.   : 334   Min.   :0.000   Min.   :0.0000   Min.   :0.00  
##  1st Qu.:  0.0   1st Qu.:1130   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:1.00  
##  Median :  0.0   Median :1464   Median :0.000   Median :0.0000   Median :2.00  
##  Mean   :  5.8   Mean   :1515   Mean   :0.425   Mean   :0.0575   Mean   :1.57  
##  3rd Qu.:  0.0   3rd Qu.:1777   3rd Qu.:1.000   3rd Qu.:0.0000   3rd Qu.:2.00  
##  Max.   :572.0   Max.   :5642   Max.   :3.000   Max.   :2.0000   Max.   :3.00  
##                                                                                
##     HalfBath      BedroomAbvGr   KitchenAbvGr  KitchenQual       
##  Min.   :0.000   Min.   :0.00   Min.   :0.00   Length:1460       
##  1st Qu.:0.000   1st Qu.:2.00   1st Qu.:1.00   Class :character  
##  Median :0.000   Median :3.00   Median :1.00   Mode  :character  
##  Mean   :0.383   Mean   :2.87   Mean   :1.05                     
##  3rd Qu.:1.000   3rd Qu.:3.00   3rd Qu.:1.00                     
##  Max.   :2.000   Max.   :8.00   Max.   :3.00                     
##                                                                  
##   TotRmsAbvGrd    Functional          Fireplaces    FireplaceQu       
##  Min.   : 2.00   Length:1460        Min.   :0.000   Length:1460       
##  1st Qu.: 5.00   Class :character   1st Qu.:0.000   Class :character  
##  Median : 6.00   Mode  :character   Median :1.000   Mode  :character  
##  Mean   : 6.52                      Mean   :0.613                     
##  3rd Qu.: 7.00                      3rd Qu.:1.000                     
##  Max.   :14.00                      Max.   :3.000                     
##                                                                       
##   GarageType         GarageYrBlt   GarageFinish         GarageCars  
##  Length:1460        Min.   :1900   Length:1460        Min.   :0.00  
##  Class :character   1st Qu.:1961   Class :character   1st Qu.:1.00  
##  Mode  :character   Median :1980   Mode  :character   Median :2.00  
##                     Mean   :1978                      Mean   :1.77  
##                     3rd Qu.:2002                      3rd Qu.:2.00  
##                     Max.   :2010                      Max.   :4.00  
##                     NA's   :81                                      
##    GarageArea    GarageQual         GarageCond         PavedDrive       
##  Min.   :   0   Length:1460        Length:1460        Length:1460       
##  1st Qu.: 334   Class :character   Class :character   Class :character  
##  Median : 480   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 473                                                           
##  3rd Qu.: 576                                                           
##  Max.   :1418                                                           
##                                                                         
##    WoodDeckSF     OpenPorchSF    EnclosedPorch   X3SsnPorch     ScreenPorch   
##  Min.   :  0.0   Min.   :  0.0   Min.   :  0   Min.   :  0.0   Min.   :  0.0  
##  1st Qu.:  0.0   1st Qu.:  0.0   1st Qu.:  0   1st Qu.:  0.0   1st Qu.:  0.0  
##  Median :  0.0   Median : 25.0   Median :  0   Median :  0.0   Median :  0.0  
##  Mean   : 94.2   Mean   : 46.7   Mean   : 22   Mean   :  3.4   Mean   : 15.1  
##  3rd Qu.:168.0   3rd Qu.: 68.0   3rd Qu.:  0   3rd Qu.:  0.0   3rd Qu.:  0.0  
##  Max.   :857.0   Max.   :547.0   Max.   :552   Max.   :508.0   Max.   :480.0  
##                                                                               
##     PoolArea        PoolQC             Fence           MiscFeature       
##  Min.   :  0.0   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  0.0   Class :character   Class :character   Class :character  
##  Median :  0.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :  2.8                                                           
##  3rd Qu.:  0.0                                                           
##  Max.   :738.0                                                           
##                                                                          
##     MiscVal          MoSold          YrSold       SaleType        
##  Min.   :    0   Min.   : 1.00   Min.   :2006   Length:1460       
##  1st Qu.:    0   1st Qu.: 5.00   1st Qu.:2007   Class :character  
##  Median :    0   Median : 6.00   Median :2008   Mode  :character  
##  Mean   :   43   Mean   : 6.32   Mean   :2008                     
##  3rd Qu.:    0   3rd Qu.: 8.00   3rd Qu.:2009                     
##  Max.   :15500   Max.   :12.00   Max.   :2010                     
##                                                                   
##  SaleCondition        SalePrice     
##  Length:1460        Min.   : 34900  
##  Class :character   1st Qu.:129975  
##  Mode  :character   Median :163000  
##                     Mean   :180921  
##                     3rd Qu.:214000  
##                     Max.   :755000  
## 

The data in the trans dataset contains 1,460 rows and 81 columns. Among them exists quite some missing entries.

According to the data_description.txt(https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data?select=data_description.txt), and some commonsense in the real estate market, the most relevant independent variables could be:

LotArea, TotalBsmtSF, 1stFlrSF…

Of course there are more factors influencing the sale price of a property, but for checking the correlation we will just take these upper mentioned variables and check their correlation.

Correlation

pairs(dplyr::select(train, c("LotArea", "TotalBsmtSF",  "X1stFlrSF", "SalePrice"))
      , pch = 19)

A positive correlation between our independent variable and dependent variable SalePrice can be identified. We use these same variables and visualize via corrgram.

corrgram(dplyr::select(train, c("LotArea", "TotalBsmtSF",  "X1stFlrSF", "SalePrice"))
         ,lower.panel=panel.shade, upper.panel=panel.cor)

The correlations are high between some attributes. But we won’t do anything about it, since the aim of this article is to predict and not the exploratory data analysis.

This will most likely result in model overfitting, but more on that later.

Hypothesis Testing

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval.

We will use chi square correlation test to determine. We will use a confidence level of 80%(alpha = 0.20).

Hypothesis

H0: Independent (no correlation)

H1: Not independent (correlation )

Tests

cor.test(train$SalePrice,train$LotArea,method = "pearson",conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train$SalePrice and train$LotArea
## t = 10, df = 1458, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.2323 0.2948
## sample estimates:
##    cor 
## 0.2638
cor.test(train$SalePrice,train$TotalBsmtSF,method = "pearson",conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train$SalePrice and train$TotalBsmtSF
## t = 30, df = 1458, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5922 0.6341
## sample estimates:
##    cor 
## 0.6136
cor.test(train$SalePrice,train$X1stFlrSF,method = "pearson",conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train$SalePrice and train$X1stFlrSF
## t = 29, df = 1458, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5842 0.6267
## sample estimates:
##    cor 
## 0.6059

Each test delivered a p-value way smaller than the significance level so we reject our null hypothesis. Thus we conclude that there is a strong correlation between these three variables and dependent variable.

Familywise error

1-(1-(.2))^3
## [1] 0.488

Our familywise error using Šidák procedure is 0.488 which is very high considering we only ran three tests. The reason for that is our alpha is 0.2. If we change it to 0.01 (99% significance level), we will have a very low familywise error, but it should not be worried. Since all our p value are very small and will still be significant even if we use a 99% significance level.

Linear Algebra and Correlation

We will calculate the precision matrix by inverting the correlation matrix.

library(Matrix)
train_sub <- cor(train[, c("LotArea", "TotalBsmtSF",  "X1stFlrSF",
                           "SalePrice")])
train_sub_inv <- solve(train_sub)
train_sub_inv
##                LotArea TotalBsmtSF X1stFlrSF SalePrice
## LotArea      1.1116224  -0.0005917   -0.2448   -0.1446
## TotalBsmtSF -0.0005917   3.2603190   -2.3065   -0.6029
## X1stFlrSF   -0.2448005  -2.3064606    3.2657   -0.4987
## SalePrice   -0.1446182  -0.6029380   -0.4987    1.7103

We notice some correlation amongst these three variables and sales price, especially between TotalBsmtSF and SalesPrice.

Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.

zapsmall(train_sub_inv %*% train_sub)
##             LotArea TotalBsmtSF X1stFlrSF SalePrice
## LotArea           1           0         0         0
## TotalBsmtSF       0           1         0         0
## X1stFlrSF         0           0         1         0
## SalePrice         0           0         0         1
zapsmall(train_sub %*% train_sub_inv)
##             LotArea TotalBsmtSF X1stFlrSF SalePrice
## LotArea           1           0         0         0
## TotalBsmtSF       0           1         0         0
## X1stFlrSF         0           0         1         0
## SalePrice         0           0         0         1
zapsmall(train_sub %*% train_sub_inv) == zapsmall(train_sub_inv %*% train_sub)
##             LotArea TotalBsmtSF X1stFlrSF SalePrice
## LotArea        TRUE        TRUE      TRUE      TRUE
## TotalBsmtSF    TRUE        TRUE      TRUE      TRUE
## X1stFlrSF      TRUE        TRUE      TRUE      TRUE
## SalePrice      TRUE        TRUE      TRUE      TRUE

We can see multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix produced the same results

LU Decomposition

Conduct LU decomposition on the matrix. Factorization factors a matrix as the product of a lower triangular matrix and an upper triangular matrix.

train_sub_lu <- lu.decomposition(train_sub)
L <- train_sub_lu$L
U <- train_sub_lu$U
print( L )
##        [,1]   [,2]   [,3] [,4]
## [1,] 1.0000 0.0000 0.0000    0
## [2,] 0.2608 1.0000 0.0000    0
## [3,] 0.2995 0.7955 1.0000    0
## [4,] 0.2638 0.5845 0.2916    1
print( U )
##      [,1]   [,2]   [,3]    [,4]
## [1,]    1 0.2608 0.2995 0.26384
## [2,]    0 0.9320 0.7414 0.54476
## [3,]    0 0.0000 0.3205 0.09346
## [4,]    0 0.0000 0.0000 0.58470
print( L %*% U )
##        [,1]   [,2]   [,3]   [,4]
## [1,] 1.0000 0.2608 0.2995 0.2638
## [2,] 0.2608 1.0000 0.8195 0.6136
## [3,] 0.2995 0.8195 1.0000 0.6059
## [4,] 0.2638 0.6136 0.6059 1.0000

Calculus-Based Probability & Statistics

Exponential Distribution

We choose GrLivArea(Above grade (ground) living area square feet), as our selected variable. Since in most of the cases, it is one of the most important factor buyer would consider when purchasing a property.

fd <- fitdistr(train$GrLivArea, "exponential")
lambda_samples <- rexp(1000, fd$estimate)
par(mfrow=c(1,2))
lambda_samples <- as.data.frame(lambda_samples)
GrLivArea <- as.data.frame(train$GrLivArea)
ggplot(lambda_samples, aes(x=lambda_samples)) +  
  geom_histogram(binwidth = 100)+ theme(legend.position="top")

ggplot(GrLivArea, aes(x=train$GrLivArea)) + 
  geom_histogram(binwidth = 100) + theme(legend.position="top")

5th and 95th Percentiles

qexp(c(0.05, 0.95), rate = fd$estimate)
## [1]   77.73 4539.92

The estimated 5th percentile is 77.43, and 95th percentile is 4539.92.

95% Confidence Interval

emp <- scale(train$GrLivArea)
me <- qnorm(0.975) * (sd(emp)) / sqrt(length(emp))
c(1 - me, 1 + me)
## [1] 0.9487 1.0513

A 95% confidence interval from the empirical data, assuming normality is [0.9487, 1.0513].

Empirical Distribution

quantile(train$GrLivArea, c(0.05, 0.95))
##   5%  95% 
##  848 2466

The 5th percentile of the data is 848, and 95th percentile of the data is 2466.

Modeling

Formatting the Data before fitting

We have to format the data before fitting. Here we are obliged to use multiple regression model. We will use a Log-trans model based on Hendonic Price Model.

train$SalePrice <- log(train$SalePrice)
test$SalePrice <- 0
asNumeric <- function(x) as.numeric(factor(x))
factorsNumeric <- function(d) modifyList(d, lapply(d[, sapply(d, is.factor)],
asNumeric))
characterfactor <- function(a) modifyList(a, 
                    lapply(a[, sapply(a, is.character)],as.factor))
train <- characterfactor(train)
test <- characterfactor(test)
train <- factorsNumeric(train)
test <- factorsNumeric(test)
train[is.na(train)] <- 0
test[is.na(test)] <- 0

Checking for NAs.

anyNA(train)
## [1] FALSE
anyNA(test)
## [1] FALSE

Check the class of each column to be numeric.

cls <- sapply(train, class)
cls
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##     "integer"     "integer"     "numeric"     "numeric"     "integer" 
##        Street         Alley      LotShape   LandContour     Utilities 
##     "numeric"     "numeric"     "numeric"     "numeric"     "numeric" 
##     LotConfig     LandSlope  Neighborhood    Condition1    Condition2 
##     "numeric"     "numeric"     "numeric"     "numeric"     "numeric" 
##      BldgType    HouseStyle   OverallQual   OverallCond     YearBuilt 
##     "numeric"     "numeric"     "integer"     "integer"     "integer" 
##  YearRemodAdd     RoofStyle      RoofMatl   Exterior1st   Exterior2nd 
##     "integer"     "numeric"     "numeric"     "numeric"     "numeric" 
##    MasVnrType    MasVnrArea     ExterQual     ExterCond    Foundation 
##     "numeric"     "numeric"     "numeric"     "numeric"     "numeric" 
##      BsmtQual      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1 
##     "numeric"     "numeric"     "numeric"     "numeric"     "integer" 
##  BsmtFinType2    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating 
##     "numeric"     "integer"     "integer"     "integer"     "numeric" 
##     HeatingQC    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF 
##     "numeric"     "numeric"     "numeric"     "integer"     "integer" 
##  LowQualFinSF     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath 
##     "integer"     "integer"     "integer"     "integer"     "integer" 
##      HalfBath  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd 
##     "integer"     "integer"     "integer"     "numeric"     "integer" 
##    Functional    Fireplaces   FireplaceQu    GarageType   GarageYrBlt 
##     "numeric"     "integer"     "numeric"     "numeric"     "numeric" 
##  GarageFinish    GarageCars    GarageArea    GarageQual    GarageCond 
##     "numeric"     "integer"     "integer"     "numeric"     "numeric" 
##    PavedDrive    WoodDeckSF   OpenPorchSF EnclosedPorch    X3SsnPorch 
##     "numeric"     "integer"     "integer"     "integer"     "integer" 
##   ScreenPorch      PoolArea        PoolQC         Fence   MiscFeature 
##     "integer"     "integer"     "numeric"     "numeric"     "numeric" 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##     "integer"     "integer"     "integer"     "numeric"     "numeric" 
##     SalePrice 
##     "numeric"

Training Model

Linear Model

Here we train the model using a linear model.The R package caret is deployed here for the modeling process. We will start with a simple linear model. However, a random forest approch would be nice to compare if it was allowed.

linear.model <- train(SalePrice ~., data = dplyr::select(train, -Id), method = "lm")
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

Now we investigate the attributes of the linear model

attributes(linear.model)
## $names
##  [1] "method"       "modelInfo"    "modelType"    "results"      "pred"        
##  [6] "bestTune"     "call"         "dots"         "metric"       "control"     
## [11] "finalModel"   "preProcess"   "trainingData" "resample"     "resampledCM" 
## [16] "perfNames"    "maximize"     "yLimits"      "times"        "levels"      
## [21] "terms"        "coefnames"    "xlevels"     
## 
## $class
## [1] "train"         "train.formula"

For linear model object we created above the final model is simply a multiple linear regression using the full input dataset.

linear.model$finalModel
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Coefficients:
##   (Intercept)     MSSubClass       MSZoning    LotFrontage        LotArea  
##      1.81e+01      -1.15e-04      -1.64e-02      -1.97e-04       1.77e-06  
##        Street          Alley       LotShape    LandContour      Utilities  
##      1.77e-01       1.06e-02      -4.28e-03       6.74e-03      -1.78e-01  
##     LotConfig      LandSlope   Neighborhood     Condition1     Condition2  
##     -1.05e-03       2.68e-02       6.56e-04       1.37e-03      -4.72e-02  
##      BldgType     HouseStyle    OverallQual    OverallCond      YearBuilt  
##     -1.38e-02      -2.90e-03       6.71e-02       4.38e-02       1.67e-03  
##  YearRemodAdd      RoofStyle       RoofMatl    Exterior1st    Exterior2nd  
##      5.80e-04       2.02e-03       9.46e-03      -4.61e-03       4.37e-03  
##    MasVnrType     MasVnrArea      ExterQual      ExterCond     Foundation  
##      7.06e-03       2.77e-05      -9.67e-03       1.20e-02       1.13e-02  
##      BsmtQual       BsmtCond   BsmtExposure   BsmtFinType1     BsmtFinSF1  
##     -1.38e-02       8.95e-03      -6.61e-03      -5.27e-03       6.81e-05  
##  BsmtFinType2     BsmtFinSF2      BsmtUnfSF    TotalBsmtSF        Heating  
##      1.43e-02       1.37e-04       5.07e-05             NA      -3.57e-03  
##     HeatingQC     CentralAir     Electrical      X1stFlrSF      X2ndFlrSF  
##     -7.34e-03       7.01e-02      -1.10e-03       2.01e-04       1.54e-04  
##  LowQualFinSF      GrLivArea   BsmtFullBath   BsmtHalfBath       FullBath  
##      8.29e-05             NA       5.23e-02       1.56e-02       3.21e-02  
##      HalfBath   BedroomAbvGr   KitchenAbvGr    KitchenQual   TotRmsAbvGrd  
##      1.86e-02       5.17e-03      -4.11e-02      -2.09e-02       1.70e-02  
##    Functional     Fireplaces    FireplaceQu     GarageType    GarageYrBlt  
##      1.58e-02       3.55e-02       3.04e-04      -3.75e-03       1.86e-05  
##  GarageFinish     GarageCars     GarageArea     GarageQual     GarageCond  
##     -9.06e-03       5.05e-02       3.23e-05      -4.25e-03       9.72e-03  
##    PavedDrive     WoodDeckSF    OpenPorchSF  EnclosedPorch     X3SsnPorch  
##      2.38e-02       1.17e-04      -1.12e-05       1.06e-04       1.88e-04  
##   ScreenPorch       PoolArea         PoolQC          Fence    MiscFeature  
##      3.39e-04       2.06e-03      -6.33e-01      -3.97e-03      -1.33e-02  
##       MiscVal         MoSold         YrSold       SaleType  SaleCondition  
##      3.86e-06       4.14e-04      -6.10e-03      -2.38e-03       2.52e-02

Find out the r squared of the linear model

summary(linear.model$finalModel)$r.squared
## [1] 0.8928

We see that the overall R-squared (based on all the data) is 0.8928. This is the unrealistic R-squared. It’s worth repeating, this is an R-squared from a model using all the data when what we want instead is an R-squared based on applying our model to new data which is what we get with resampling. To see the “realistic” R-squared based on resampling – based on repeatedly building the model and applying to new data – we can look directly at the train object (not the finalModel attribute). Here is a brief summary of the resampling in the train object: Note:Bootstap is the default resampling approach

linear.model
## Linear Regression 
## 
## 1460 samples
##   79 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 1460, 1460, 1460, 1460, 1460, 1460, ... 
## Resampling results:
## 
##   RMSE    Rsquared  MAE    
##   0.1562  0.8511    0.09945
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

We can see that in this case the resampling (realistic) R-squared is lower than the “unrealistic” R-squared we saw from the final model (linear.model$finalModel). 1. This dataset is large so the difference between unrealistic and realistic R-squared is lower than it would likely be with a smaller dataset. 2.Although a lower R-squared can be disappointing, it is a more defensible and realistic measure of our model’s likely performance on new data.

Linear model with 10 fold cross validation

Now we will use 10 fold cross validation as a comparison, since the dataset is fairly large:

tc <- trainControl(method = "cv", number = 10)
lm_cv <- train(SalePrice ~., data = dplyr::select(train, -Id), method = "lm",
             trControl = tc) 
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
lm_cv
## Linear Regression 
## 
## 1460 samples
##   79 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1313, 1316, 1315, 1314, 1312, 1315, ... 
## Resampling results:
## 
##   RMSE    Rsquared  MAE    
##   0.1528  0.8505    0.09585
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

The cross validation produced a model with similar RMSE and R squared. That is not satisfing.

Stepwise regression

Here we will use 10-fold cross validation 5 times.(Repeated CV).

set.seed(100)
tr <- trainControl(method = "repeatedcv", number = 10, repeats = 5)

Here we run automated step-wise regression and look at the resampling results. Note that there is no tuning parameter so the final results table has just one row.

step_model <- train(SalePrice ~ ., data = dplyr::select(train, -Id), 
                    method = "lmStepAIC",
                    direction = "forward", trControl = tr, trace = FALSE)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
step_model$results
step_model
## Linear Regression with Stepwise Selection 
## 
## 1460 samples
##   79 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 1314, 1314, 1313, 1312, 1315, 1315, ... 
## Resampling results:
## 
##   RMSE    Rsquared  MAE    
##   0.1556  0.8512    0.09686

Next, we can make a residuals plot, or residuals histogram to be more precise. Here we expect to see something approximately normally distributed.

modelResiduals <- as.data.frame(residuals(step_model))

ggplot(modelResiduals, aes(residuals(step_model))) +
  geom_histogram(fill="deepskyblue", color="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

normal probability plot of residuals

ggplot(modelResiduals) +
    geom_qq(aes(sample=residuals(step_model)))

Overall the resampled linear model, linear model with cross validation and after all the step-wise model performed similarly. But of course due to the limitation of the computational power, we only implemented a straight forward step-wise model. If that condition was eliminated, the performance of the model could be different.

Prediction

We will use the stepwise regression model for its slightly better performance, judging by RMSE, Rsquared and MAE.

preds <- predict(step_model, dplyr::select(test, -Id), na.action=na.pass)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
preds <- exp(preds)

Since it is a Log-trans model, we will than scale the output.

Result output

modelEval <- cbind(test$Id, preds)
colnames(modelEval) <- c("Id", "SalePrice")
modelEval <- as.data.frame(modelEval)
write.csv(modelEval,"~/submission.csv", row.names = FALSE)

Kaggle Evaluation username: josephmarsshi score: 0.14647