library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(infer)
library(ggpubr)
library(DescTools)
## 
## Attaching package: 'DescTools'
## 
## The following objects are masked from 'package:psych':
## 
##     AUC, ICC, SD
library(corrr)
library(ggcorrplot)
library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Load the training and test sets from CSV files in the working directory.
train_df <- readr::read_csv(file = "train.csv")
test_df <- readr::read_csv(file = "test.csv")

Introduction

Pick one of the quantitative independent variables from the training data set train_df, and define that variable as X. Make sure this variable is skewed to the right! Pick the dependent variable and define it as Y.

# Per-column summary statistics, used to spot variables that may be
# right-skewed (mean well above the median, very large maximum).

train_df %>% summary()
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical           1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##     2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      3SsnPorch       ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 

Reviewing the above summary statistics, it appears that Lot Area is heavily skewed to the right. We can confirm this by checking the skewness of X.

# Sample skewness of LotArea; a large positive value indicates right skew.
psych::skew(train_df[["LotArea"]])
## [1] 12.18262
# Histogram of LotArea using ggplot2's default binning.
ggplot(train_df, aes(x = LotArea)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Independent variable X = LotArea. Dependent variable Y = SalePrice.

# X is the independent variable (lot area); Y is the dependent variable
# (sale price). Both are extracted as plain vectors.
X <- train_df[["LotArea"]]

Y <- train_df[["SalePrice"]]

PART 1: Probability

Calculate as a minimum the below probabilities a through c.

  1. P(X>x | Y>y)

  2. P(X>x, Y>y)

  3. P(X<x | Y>y)

Assume the small letter “x” is estimated as the 3rd quartile of the X variable, and the small letter “y” is estimated as the 2nd quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.

# x: third quartile of X (LotArea).
x <- quantile(X, probs = 0.75)
x
##     75% 
## 11601.5
# y: second quartile (median) of Y (SalePrice).
y <- quantile(Y, probs = 0.5)
y
##    50% 
## 163000

Build a contingency table

Building the contingency table would be helpful in answering the probability questions from above so I pulled all the values from the data frame and constructed the table.

# Contingency table of LotArea (rows) against SalePrice (columns), split at
# x (3rd quartile of LotArea) and y (2nd quartile of SalePrice).
# Cells are counted from logical masks; na.rm = TRUE leaves out rows whose
# comparison is NA, which is also what filter() does.
lot_area <- train_df$LotArea
sale_price <- train_df$SalePrice

# X <= x and Y <= y
Xleq3q_Yleq2q <- sum(lot_area <= x & sale_price <= y, na.rm = TRUE)

# X > x and Y <= y
Xgr3q_Yleq2q <- sum(lot_area > x & sale_price <= y, na.rm = TRUE)

# X <= x and Y > y
Xleq3q_Ygr2q <- sum(lot_area <= x & sale_price > y, na.rm = TRUE)

# X > x and Y > y
Xgr3q_Ygr2q <- sum(lot_area > x & sale_price > y, na.rm = TRUE)

# Row margins: X <= x total and X > x total
Xleq3q_total <- Xleq3q_Yleq2q + Xleq3q_Ygr2q
Xgr3q_total <- Xgr3q_Ygr2q + Xgr3q_Yleq2q

# Column margins: Y <= y total and Y > y total
Yleq2q_total <- Xgr3q_Yleq2q + Xleq3q_Yleq2q
Ygr2q_total <- Xgr3q_Ygr2q + Xleq3q_Ygr2q

# Grand total
Total <- Xleq3q_total + Xgr3q_total

# Fill row by row: the two X groups first, then the column totals.
contingency_matrix <- matrix(
  c(
    Xleq3q_Yleq2q, Xleq3q_Ygr2q, Xleq3q_total,
    Xgr3q_Yleq2q, Xgr3q_Ygr2q, Xgr3q_total,
    Yleq2q_total, Ygr2q_total, Total
  ),
  nrow = 3, ncol = 3, byrow = TRUE
)

contingency_table <- as.table(contingency_matrix)

# Columns describe SalePrice (Y); rows describe LotArea (X).
colnames(contingency_table) <- c("<= 2d quartile", "> 2d quartile", "Total")
rownames(contingency_table) <- c("<= 3d quartile", "> 3d quartile", "Total")

contingency_table
##                <= 2d quartile > 2d quartile Total
## <= 3d quartile            643           452  1095
## > 3d quartile              89           276   365
## Total                     732           728  1460

a. P(X>x | Y>y)

Here I pulled out the values from the training data set

\[P(X>x \mid Y>y) = \frac{P(A \cap B)}{P(B)} = \frac{P(X>x \cap Y>y)}{P(Y>y)}\]

# Event indicators on the training rows: A is X > x, B is Y > y.
n_obs <- nrow(train_df)
lot_above_x <- train_df$LotArea > x
price_above_y <- train_df$SalePrice > y

# P(A) = P(X > x)
prob_A <- sum(lot_above_x, na.rm = TRUE) / n_obs

# P(B) = P(Y > y)
prob_B <- sum(price_above_y, na.rm = TRUE) / n_obs

# P(A and B) = P(X > x, Y > y)
prob_AandB <- sum(lot_above_x & price_above_y, na.rm = TRUE) / n_obs

# P(X > x | Y > y) = P(A and B) / P(B)
round(prob_AandB / prob_B, digits = 4)
## [1] 0.3791

b. P(X>x, Y>y)

This is asking for the joint probability of X > x and Y > y. Looking at our table above there are 276/1460 observations where that condition is met. The P(X>x, Y>y) =

# Joint probability P(X > x, Y > y), rounded to four decimal places.
round(prob_AandB, digits = 4)
## [1] 0.189

c. P(X<x | Y>y)

\[P(X<x \mid Y>y) = \frac{P(X<x \cap Y>y)}{P(Y>y)}\]

Using values from our table above The P(X<x | Y>y) =

# P(X < x | Y > y): among observations with Y above y, the share whose X is
# below x. Computed from the data rather than from hand-copied table counts,
# so it stays correct if the data or the thresholds change.
round(sum(X < x & Y > y) / sum(Y > y), 4)
## [1] 0.6209

Does splitting the training data in this fashion make them independent?

Let A be the new variable counting those observations above the 3d quartile for X, and let B be the new variable counting those observations above the 2d quartile for Y.

Does P(A ∩ B) = P(A)P(B) (equivalently, does P(A|B) = P(A))?

Check mathematically, and then evaluate by running a Chi Square test for association.

A = X > x B = Y > y

P(AB)=P(A)P(B)

# Observed joint probability P(A and B) ...
round(prob_AandB, digits = 4)
## [1] 0.189
# ... compared with the product P(A) * P(B) that independence would require.
round(prob_A * prob_B, digits = 4)
## [1] 0.1247

Chi - Squared Test

\(H_0\) : Sale Price and Lot Area are independent

\(H_a\) : Sale Price and Lot Area are not independent

# Two-level indicators: A flags X > x, B flags Y > y.
A <- factor(ifelse(X > x, yes = "yes", no = "no"))
B <- factor(ifelse(Y > y, yes = "yes", no = "no"))

# 2 x 2 cross-tabulation of the indicators.
contingency_tableAB <- table(A, B)
contingency_tableAB
##      B
## A      no yes
##   no  643 452
##   yes  89 276
# Pearson chi-squared test of association on the 2 x 2 table.
chisq.test(x = contingency_tableAB)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  contingency_tableAB
## X-squared = 127.74, df = 1, p-value < 2.2e-16

Mathematically, splitting our variables like this does not make them independent: P(A ∩ B) = 0.189 differs from P(A)P(B) = 0.1247. The Chi-squared test also gives a significant p-value (< 2.2e-16), so we reject the null hypothesis that the variables are independent and conclude that there is an association between them.

PART 2: Descriptive and Inferential Statistics.

Provide univariate descriptive statistics and appropriate plots for the training data set.

Descriptive stats: Lot Area

# Univariate descriptive statistics for X, with the 25th and 75th percentiles.
psych::describe(X, quant = c(0.25, 0.75))
##    vars    n     mean      sd median trimmed     mad  min    max  range  skew
## X1    1 1460 10516.83 9981.26 9478.5 9563.28 2962.23 1300 215245 213945 12.18
##    kurtosis     se  Q0.25   Q0.75
## X1   202.26 261.22 7553.5 11601.5
# Box plot of LotArea.
LotArea_box <- train_df %>%
  ggplot(aes(y = LotArea)) +
  geom_boxplot(color = "lightblue") +
  labs(title = "LotArea") +
  theme_minimal()

# Histogram of LotArea. The bin width is chosen for the variable's scale
# (values run from 1,300 to 215,245): a width of 30 would create several
# thousand near-empty bins.
LotArea_hist <- train_df %>%
  ggplot(aes(x = LotArea)) +
  geom_histogram(binwidth = 1000, fill = "lightblue", color = "grey") +
  theme_minimal()

# Show both plots side by side.
ggarrange(LotArea_box, LotArea_hist)

In the above plots of the distribution of Lot Area we can see that there are significant outliers causing the distribution to be positively skewed to the right. The average lot area is 10516.83 sqft but the median is only 9478.5 sqft. There is a wide range in sizes (1300 to 215245 sq feet.).

Descriptive Stats: Sales price

# Univariate descriptive statistics for Y, with the 25th and 75th percentiles.
psych::describe(Y, quant = c(0.25, 0.75))
##    vars    n     mean      sd median  trimmed     mad   min    max  range skew
## X1    1 1460 180921.2 79442.5 163000 170783.3 56338.8 34900 755000 720100 1.88
##    kurtosis      se  Q0.25  Q0.75
## X1      6.5 2079.11 129975 214000
# Box plot of SalePrice (titled with the variable it actually shows).
SalePrice_box <- train_df %>%
  ggplot(aes(y = SalePrice)) +
  geom_boxplot(color = "lightblue") +
  labs(title = "SalePrice") +
  theme_minimal()

# Histogram of SalePrice. The bin width is chosen for the variable's scale
# (values run from 34,900 to 755,000): a width of 30 would create tens of
# thousands of near-empty bins.
SalePrice_hist <- train_df %>%
  ggplot(aes(x = SalePrice)) +
  geom_histogram(binwidth = 10000, fill = "lightblue", color = "grey") +
  theme_minimal()

# Show both plots side by side.
ggarrange(SalePrice_box, SalePrice_hist)

The distribution of the Sale Price also looks skewed to the right, but not to the extent that the Lot Area variable is (skew of 1.88 versus 12.18).

Scatter Plot

Provide a scatterplot of X and Y.

# Scatter plot of SalePrice against LotArea.
ggplot(train_df, aes(x = LotArea, y = SalePrice)) +
  geom_point(color = "lightblue") +
  theme_minimal()

The scatter plot suggests a positive, roughly linear relationship, with what appears to be a steep increase in Sale Price as Lot Area gets bigger. The large outliers in Lot Area compress the scatter plot, so it is difficult to fully see the relationship.

Confidence Interval

Provide a 95% CI for the difference in the mean of the variables.

# Point estimate of the difference in means (Y minus X).
X_mean <- mean(X)
Y_mean <- mean(Y)

diff_mean <- Y_mean - X_mean
diff_mean
## [1] 170404.4
# 95% confidence interval for the mean difference. X and Y are two
# measurements on the same rows of train_df, so a paired t interval is used;
# its standard error comes from the per-row differences Y - X.
# The infer get_ci(type = "se") call used previously expects a bootstrap
# distribution of a statistic, not the raw observations, so the spread it
# used was not a standard error of the difference.
# NOTE(review): the interval printed below was rendered from that earlier call;
# re-knit to refresh it.
diff_ci <- t.test(Y, X, paired = TRUE, conf.level = 0.95)$conf.int
diff_ci
## # A tibble: 1 × 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1  150841.  189967.

Correlation Matrix

Derive a correlation matrix for two of the quantitative variables you selected.

# Pearson correlation matrix of the two selected quantitative variables.
cor_matrix <- train_df %>%
  dplyr::select(LotArea, SalePrice) %>%
  cor()
cor_matrix
##             LotArea SalePrice
## LotArea   1.0000000 0.2638434
## SalePrice 0.2638434 1.0000000

Hypothesis Test

Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval.

# Test H0: correlation between X and Y is 0, with a 99% confidence interval.
cor.test(x = X, y = Y, method = "pearson", conf.level = 0.99)
## 
##  Pearson's product-moment correlation
## 
## data:  X and Y
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
##  0.2000196 0.3254375
## sample estimates:
##       cor 
## 0.2638434

Discussion

Discuss the meaning of your analysis:

Since our p-value is less than 0.01, we can reject the null hypothesis that the correlation is 0. We are 99% confident that the true correlation coefficient lies between 0.2000 and 0.3254. The correlation is small but positive, indicating that as Lot Area increases, Sale Price tends to increase as well.

PART 3: Linear Algebra and Correlation

Invert your correlation matrix

This is known as the precision matrix and contains variance inflation factors on the diagonal.

# Precision matrix: the inverse of the correlation matrix, obtained by calling
# solve() with no right-hand side.
prec_matirx <- solve(a = cor_matrix)

Multiply the correlation matrix by the precision matrix.

# Correlation matrix times its inverse. The product should be the identity
# matrix; off-diagonal entries differ from 0 only by floating-point rounding.
cor_matrix %*% prec_matirx
##                LotArea    SalePrice
## LotArea   1.000000e+00 3.862768e-18
## SalePrice 5.551115e-17 1.000000e+00

Then multiply the precision matrix by the correlation matrix.

# Inverse times the correlation matrix (the reverse order). This should also
# be the identity matrix up to floating-point rounding.
prec_matirx %*% cor_matrix
##                LotArea    SalePrice
## LotArea   1.000000e+00 5.551115e-17
## SalePrice 3.862768e-18 1.000000e+00

Conduct principal components analysis (research this!) and interpret.

Step 1 - Standardize the data

The two variables were centered and scaled as the Sale Price values were in different units and quite larger than our Lot Area.

# Centre X and Y to mean 0 and scale them to unit standard deviation.
train_df_scaled <- scale(data.frame(X = X, Y = Y), center = TRUE, scale = TRUE)

Step 2 - Covariance matrix

# Correlation matrix of the standardized variables, then a heat-map of it.
scaled_matrix <- cor(x = train_df_scaled)
ggcorrplot(corr = scaled_matrix)

Step 3 - Applying the PCA

# Run the PCA on the standardized observations (one row per house).
# Passing the 2 x 2 correlation matrix as the data argument makes princomp
# treat it as a data set of two observations (n.obs = 2), which forces a
# degenerate second component with zero variance.
# NOTE(review): the PCA results printed further down were rendered from the
# earlier call on scaled_matrix; re-knit to refresh them.
train_df_pca <- princomp(train_df_scaled)
summary(train_df_pca)
## Importance of components:
##                           Comp.1 Comp.2
## Standard deviation     0.5205414      0
## Proportion of Variance 1.0000000      0
## Cumulative Proportion  1.0000000      1
# Inspect the components of the fitted princomp object.
str(object = train_df_pca)
## List of 7
##  $ sdev    : Named num [1:2] 0.521 0
##   ..- attr(*, "names")= chr [1:2] "Comp.1" "Comp.2"
##  $ loadings: 'loadings' num [1:2, 1:2] 0.707 -0.707 0.707 0.707
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "X" "Y"
##   .. ..$ : chr [1:2] "Comp.1" "Comp.2"
##  $ center  : Named num [1:2] 0.632 0.632
##   ..- attr(*, "names")= chr [1:2] "X" "Y"
##  $ scale   : Named num [1:2] 1 1
##   ..- attr(*, "names")= chr [1:2] "X" "Y"
##  $ n.obs   : int 2
##  $ scores  : num [1:2, 1:2] 5.21e-01 -5.21e-01 3.68e-17 5.80e-17
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "X" "Y"
##   .. ..$ : chr [1:2] "Comp.1" "Comp.2"
##  $ call    : language princomp(x = scaled_matrix)
##  - attr(*, "class")= chr "princomp"
# eigenvectors (component loadings); [, ] prints them as a plain matrix
print(train_df_pca[["loadings"]][, ])
##       Comp.1    Comp.2
## X  0.7071068 0.7071068
## Y -0.7071068 0.7071068
# scores: show only the leading rows so the output stays short no matter how
# many observations were scored
print(head(train_df_pca$scores))
##       Comp.1       Comp.2
## X  0.5205414 3.675387e-17
## Y -0.5205414 5.800959e-17
# Scree plot of the variance explained by each component, with labels.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
fviz_eig(train_df_pca, addlabels = TRUE)

# Variable plot of the PCA, with variables drawn in black.
fviz_pca_var(X = train_df_pca, col.var = "black")

# cos2 of each variable across the first two components.
fviz_cos2(X = train_df_pca, choice = "var", axes = c(1L, 2L))

Discussion:

In our principal component analysis of the two variables (Lot Area and Sale Price) we see that the first component explains 100% of the variability in our data. Both components load heavily on both variables, but in different directions. In the first principal component each variable contributes equally (loadings of 0.7071 in absolute value) but with opposite signs. The negative loading on the Y variable indicates that as Sale Price increases the first principal component score decreases. The second principal component has a positive loading (0.7071) on both variables. Note that princomp was given the 2 × 2 correlation matrix as its input and therefore treated it as two observations (n.obs = 2 in the output above), so the 100% figure describes that input rather than the 1,460 houses.

PART 4 - Calculus-Based Probability & Statistics.

Many times, it makes sense to fit a closed form distribution to data. For your variable that is skewed to the right, shift it so that the minimum value is above zero.

Fit Exponential PDF

Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html).

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Fit an exponential distribution to X with MASS::fitdistr and print the
# estimated rate (standard error in parentheses).
exp_pdf <- MASS::fitdistr(x = X, densfun = "exponential")

exp_pdf
##        rate    
##   9.508570e-05 
##  (2.488507e-06)

Find \(\lambda\)

Find the optimal value of λ for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, λ)).

# Estimated rate (lambda) of the fitted exponential distribution.
rate <- exp_pdf$estimate

# Fix the random seed so the simulated sample, and every figure derived from
# it, is the same each time the report is rendered.
set.seed(123)
exp_dist <- rexp(1000, rate)

Compare Histograms

Plot a histogram and compare it with a histogram of your original variable.

# Histogram of the draws simulated from the fitted exponential distribution.
hist(x = exp_dist, main = "Histogram of Exponential Density Function.", xlab = "Lot Area")

# Histogram of the observed variable, for comparison.
hist(x = X, main = "Histogram of Original Variable", xlab = "Lot Area")

Find the percentiles of the Exponential PDF

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).

# 5th and 95th percentiles of the fitted exponential distribution itself,
# taken from its quantile function (inverse CDF) at the estimated rate.
# Sample quantiles of a finite random draw only approximate these values.
# NOTE(review): the figures printed below were rendered from the sampled
# version; re-knit to refresh them.
setNames(qexp(c(0.05, 0.95), rate = rate), c("5%", "95%"))
##         5%        95% 
##   471.9132 33954.3844

Find 95% CI

Also generate a 95% confidence interval from the empirical data, assuming normality.

# Normal-theory 95% confidence interval for the mean of X:
# mean +/- z * (sd / sqrt(n)), with z the 97.5th percentile of N(0, 1).
X_mean <- mean(X)
X_sd <- sd(X)
z <- qnorm(0.975)
X_se <- X_sd / sqrt(nrow(train_df))

# Both bounds in one vectorized step, rounded to two decimals.
ci_bounds <- round(X_mean + c(-1, 1) * z * X_se, 2)
lower_ci <- ci_bounds[1]
upper_ci <- ci_bounds[2]

paste0("The 95% confidence interval is ", lower_ci, " , ", upper_ci)
## [1] "The 95% confidence interval is 10004.84 , 11028.81"

Find the percentiles of the original data

Finally, provide the empirical 5th percentile and 95th percentile of the data.

# Empirical 5th and 95th percentiles of the observed variable.
quantile(X, probs = c(0.05, 0.95), names = TRUE, type = 7)
##       5%      95% 
##  3311.70 17401.15

Discussion

The fitted exponential distribution has a much lower 5th percentile and a much higher 95th percentile than the empirical data, so it spreads probability over a far wider range than the observed Lot Area values and is not a close fit. The confidence interval for the mean can be interpreted as follows: we are 95% confident that the mean Lot Area of houses in this area is between 10004.84 and 11028.81 square feet.

PART 5 - Modeling.

You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. kaggle. It is your job to predict the sales price for each house. For each Id in the test set, you must predict the value of the SalePrice variable.

Build some type of regression model and submit your model to the competition board. Provide your complete model summary and results with analysis.

Handle Missing values

Before fitting a model we need to address columns with missing values

# Count missing values per column and show only the columns that have any.
na_counts <- colSums(is.na(train_df))
na_counts[na_counts > 0]
##  LotFrontage        Alley   MasVnrType   MasVnrArea     BsmtQual     BsmtCond 
##          259         1369            8            8           37           37 
## BsmtExposure BsmtFinType1 BsmtFinType2   Electrical  FireplaceQu   GarageType 
##           38           37           38            1          690           81 
##  GarageYrBlt GarageFinish   GarageQual   GarageCond       PoolQC        Fence 
##           81           81           81           81         1453         1179 
##  MiscFeature 
##         1406
# Impute missing values column by column:
#   - double columns get the column mean,
#   - character columns get the column mode.
# Mode() can return more than one value when several values tie for the
# highest frequency; only the first is used so the replacement is always a
# single value.
for (col in names(train_df)) {
  values <- train_df[[col]]
  missing <- is.na(values)

  # Nothing to fill in this column.
  if (!any(missing)) {
    next
  }

  if (is.double(values)) {
    fill <- mean(values, na.rm = TRUE)
  } else if (is.character(values)) {
    fill <- Mode(values, na.rm = TRUE)[1]
  } else {
    # Other column types are left untouched.
    next
  }

  train_df[[col]][missing] <- fill
}
# Confirm that no missing values remain.
any(is.na(train_df))
## [1] FALSE
# Convert any factor columns to character so that categorical columns share
# one type.
factor_cols <- names(train_df)[vapply(train_df, is.factor, logical(1))]

for (col in factor_cols) {
  train_df[[col]] <- as.character(train_df[[col]])
}

Filter Data

I will only use the numeric variables to build a multiple linear regression model

#str(train_df)
# Names of every character (categorical) column in the training data.
categorical_data <- names(train_df)[vapply(train_df, is.character, logical(1))]

# Keep only the numeric columns (predictors plus SalePrice) and drop the row
# identifier. all_of() tells tidyselect that `categorical_data` is an external
# character vector of column names; passing the bare vector is deprecated.
train_df_cont <- train_df %>%
  dplyr::select(-all_of(categorical_data), -Id)

Build Multiple Regression Model

# List the numeric columns that are available to the regression.
names(train_df_cont)
##  [1] "MSSubClass"    "LotFrontage"   "LotArea"       "OverallQual"  
##  [5] "OverallCond"   "YearBuilt"     "YearRemodAdd"  "MasVnrArea"   
##  [9] "BsmtFinSF1"    "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"  
## [13] "1stFlrSF"      "2ndFlrSF"      "LowQualFinSF"  "GrLivArea"    
## [17] "BsmtFullBath"  "BsmtHalfBath"  "FullBath"      "HalfBath"     
## [21] "BedroomAbvGr"  "KitchenAbvGr"  "TotRmsAbvGrd"  "Fireplaces"   
## [25] "GarageYrBlt"   "GarageCars"    "GarageArea"    "WoodDeckSF"   
## [29] "OpenPorchSF"   "EnclosedPorch" "3SsnPorch"     "ScreenPorch"  
## [33] "PoolArea"      "MiscVal"       "MoSold"        "YrSold"       
## [37] "SalePrice"

Creating the multiple linear regression model with all numeric data.

# Fit the full model: SalePrice regressed on all 36 numeric predictors.
# Column names that start with a digit must be wrapped in backticks.
train_df_lm <- lm(
  formula = SalePrice ~ MSSubClass + LotFrontage + LotArea + OverallQual +
    OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 +
    BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + `1stFlrSF` + `2ndFlrSF` +
    LowQualFinSF + GrLivArea + BsmtFullBath + BsmtHalfBath + FullBath +
    HalfBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + Fireplaces +
    GarageYrBlt + GarageCars + GarageArea + WoodDeckSF + OpenPorchSF +
    EnclosedPorch + `3SsnPorch` + ScreenPorch + PoolArea + MiscVal +
    MoSold + YrSold,
  data = train_df
)

# Coefficient table, residual summary and overall fit statistics.
summary(train_df_lm)
## 
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea + 
##     OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + 
##     BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + `1stFlrSF` + 
##     `2ndFlrSF` + LowQualFinSF + GrLivArea + BsmtFullBath + BsmtHalfBath + 
##     FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + 
##     Fireplaces + GarageYrBlt + GarageCars + GarageArea + WoodDeckSF + 
##     OpenPorchSF + EnclosedPorch + `3SsnPorch` + ScreenPorch + 
##     PoolArea + MiscVal + MoSold + YrSold, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -470977  -16468   -2111   13857  302683 
## 
## Coefficients: (2 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.660e+05  1.414e+06   0.330 0.741723    
## MSSubClass    -1.816e+02  2.767e+01  -6.562 7.42e-11 ***
## LotFrontage   -5.623e+01  5.178e+01  -1.086 0.277676    
## LotArea        4.300e-01  1.021e-01   4.211 2.70e-05 ***
## OverallQual    1.732e+04  1.188e+03  14.586  < 2e-16 ***
## OverallCond    4.665e+03  1.032e+03   4.518 6.75e-06 ***
## YearBuilt      2.717e+02  6.755e+01   4.021 6.09e-05 ***
## YearRemodAdd   1.361e+02  6.859e+01   1.984 0.047444 *  
## MasVnrArea     3.146e+01  5.950e+00   5.288 1.43e-07 ***
## BsmtFinSF1     1.920e+01  4.668e+00   4.114 4.12e-05 ***
## BsmtFinSF2     8.284e+00  7.058e+00   1.174 0.240672    
## BsmtUnfSF      9.312e+00  4.194e+00   2.220 0.026556 *  
## TotalBsmtSF           NA         NA      NA       NA    
## `1stFlrSF`     4.895e+01  5.811e+00   8.423  < 2e-16 ***
## `2ndFlrSF`     4.899e+01  4.984e+00   9.830  < 2e-16 ***
## LowQualFinSF   2.562e+01  1.997e+01   1.283 0.199644    
## GrLivArea             NA         NA      NA       NA    
## BsmtFullBath   9.359e+03  2.612e+03   3.583 0.000351 ***
## BsmtHalfBath   2.038e+03  4.092e+03   0.498 0.618537    
## FullBath       3.448e+03  2.837e+03   1.216 0.224341    
## HalfBath      -1.900e+03  2.663e+03  -0.713 0.475806    
## BedroomAbvGr  -1.010e+04  1.702e+03  -5.932 3.74e-09 ***
## KitchenAbvGr  -1.221e+04  5.212e+03  -2.343 0.019285 *  
## TotRmsAbvGrd   5.063e+03  1.237e+03   4.093 4.50e-05 ***
## Fireplaces     3.966e+03  1.777e+03   2.232 0.025753 *  
## GarageYrBlt    1.212e+02  6.958e+01   1.741 0.081866 .  
## GarageCars     1.123e+04  2.875e+03   3.908 9.75e-05 ***
## GarageArea    -4.237e+00  9.950e+00  -0.426 0.670345    
## WoodDeckSF     2.402e+01  8.013e+00   2.998 0.002765 ** 
## OpenPorchSF   -2.871e+00  1.518e+01  -0.189 0.850024    
## EnclosedPorch  1.183e+01  1.687e+01   0.701 0.483258    
## `3SsnPorch`    2.050e+01  3.139e+01   0.653 0.513945    
## ScreenPorch    5.600e+01  1.719e+01   3.258 0.001150 ** 
## PoolArea      -2.908e+01  2.381e+01  -1.221 0.222117    
## MiscVal       -7.302e-01  1.855e+00  -0.394 0.693910    
## MoSold        -5.009e+01  3.448e+02  -0.145 0.884523    
## YrSold        -7.805e+02  7.026e+02  -1.111 0.266812    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34750 on 1425 degrees of freedom
## Multiple R-squared:  0.8131, Adjusted R-squared:  0.8087 
## F-statistic: 182.4 on 34 and 1425 DF,  p-value: < 2.2e-16

Backwards Elimination

Through backwards elimination we will find our final model, eliminating the variable with the highest p-value.

# Backward elimination: drop 15 predictors from the full model in a single
# refit. The removal list includes TotalBsmtSF and GrLivArea, whose
# coefficients the full fit reported as not defined because of singularities.
# FullBath and BsmtUnfSF are not in the removal list, so they stay in the
# model and no `+` terms are needed to keep them.
train_df_lm <- update(
  train_df_lm,
  . ~ . - MoSold - OpenPorchSF - MiscVal - TotalBsmtSF - GrLivArea -
    GarageArea - BsmtHalfBath - `3SsnPorch` - EnclosedPorch - HalfBath -
    LotFrontage - YrSold - BsmtFinSF2 - PoolArea - LowQualFinSF
)

# Summary of the reduced (final) model.
summary(train_df_lm)
## 
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotArea + OverallQual + 
##     OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + 
##     BsmtUnfSF + `1stFlrSF` + `2ndFlrSF` + BsmtFullBath + FullBath + 
##     BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + Fireplaces + 
##     GarageYrBlt + GarageCars + WoodDeckSF + ScreenPorch, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -488522  -16542   -2051   13662  287892 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.034e+06  1.310e+05  -7.895 5.72e-15 ***
## MSSubClass   -1.679e+02  2.588e+01  -6.489 1.19e-10 ***
## LotArea       4.220e-01  9.990e-02   4.225 2.54e-05 ***
## OverallQual   1.756e+04  1.171e+03  15.000  < 2e-16 ***
## OverallCond   4.525e+03  1.014e+03   4.461 8.80e-06 ***
## YearBuilt     2.327e+02  5.972e+01   3.897 0.000102 ***
## YearRemodAdd  1.361e+02  6.782e+01   2.008 0.044880 *  
## MasVnrArea    3.131e+01  5.893e+00   5.312 1.25e-07 ***
## BsmtFinSF1    1.602e+01  3.907e+00   4.100 4.36e-05 ***
## BsmtUnfSF     6.804e+00  3.629e+00   1.875 0.060989 .  
## `1stFlrSF`    4.928e+01  5.244e+00   9.398  < 2e-16 ***
## `2ndFlrSF`    4.538e+01  4.219e+00  10.756  < 2e-16 ***
## BsmtFullBath  9.661e+03  2.421e+03   3.990 6.95e-05 ***
## FullBath      4.459e+03  2.597e+03   1.717 0.086209 .  
## BedroomAbvGr -1.006e+04  1.679e+03  -5.989 2.66e-09 ***
## KitchenAbvGr -1.432e+04  5.108e+03  -2.803 0.005131 ** 
## TotRmsAbvGrd  5.239e+03  1.211e+03   4.327 1.62e-05 ***
## Fireplaces    3.816e+03  1.757e+03   2.172 0.030006 *  
## GarageYrBlt   1.248e+02  6.677e+01   1.870 0.061724 .  
## GarageCars    1.016e+04  1.693e+03   6.004 2.43e-09 ***
## WoodDeckSF    2.415e+01  7.890e+00   3.061 0.002248 ** 
## ScreenPorch   5.472e+01  1.685e+01   3.247 0.001193 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34700 on 1438 degrees of freedom
## Multiple R-squared:  0.812,  Adjusted R-squared:  0.8092 
## F-statistic: 295.7 on 21 and 1438 DF,  p-value: < 2.2e-16

Diagnostic Plots

# Draw the standard lm diagnostic plots; `which` spells out the default
# selection used by plot() for an lm fit.
plot(train_df_lm, which = c(1, 2, 3, 5))

Explore test data

We can now use the model to predict the prices in our test data set.

# Preview the first six rows of the test data.
head(test_df, n = 6L)
## # A tibble: 6 × 80
##      Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
##   <dbl>      <dbl> <chr>          <dbl>   <dbl> <chr>  <chr> <chr>   
## 1  1461         20 RH                80   11622 Pave   <NA>  Reg     
## 2  1462         20 RL                81   14267 Pave   <NA>  IR1     
## 3  1463         60 RL                74   13830 Pave   <NA>  IR1     
## 4  1464         60 RL                78    9978 Pave   <NA>  IR1     
## 5  1465        120 RL                43    5005 Pave   <NA>  IR1     
## 6  1466         60 RL                75   10000 Pave   <NA>  IR1     
## # ℹ 72 more variables: LandContour <chr>, Utilities <chr>, LotConfig <chr>,
## #   LandSlope <chr>, Neighborhood <chr>, Condition1 <chr>, Condition2 <chr>,
## #   BldgType <chr>, HouseStyle <chr>, OverallQual <dbl>, OverallCond <dbl>,
## #   YearBuilt <dbl>, YearRemodAdd <dbl>, RoofStyle <chr>, RoofMatl <chr>,
## #   Exterior1st <chr>, Exterior2nd <chr>, MasVnrType <chr>, MasVnrArea <dbl>,
## #   ExterQual <chr>, ExterCond <chr>, Foundation <chr>, BsmtQual <chr>,
## #   BsmtCond <chr>, BsmtExposure <chr>, BsmtFinType1 <chr>, BsmtFinSF1 <dbl>, …
# Per-column summary of the test data; the NA's rows show which numeric
# columns contain missing values.
summary(test_df)
##        Id         MSSubClass       MSZoning          LotFrontage    
##  Min.   :1461   Min.   : 20.00   Length:1459        Min.   : 21.00  
##  1st Qu.:1826   1st Qu.: 20.00   Class :character   1st Qu.: 58.00  
##  Median :2190   Median : 50.00   Mode  :character   Median : 67.00  
##  Mean   :2190   Mean   : 57.38                      Mean   : 68.58  
##  3rd Qu.:2554   3rd Qu.: 70.00                      3rd Qu.: 80.00  
##  Max.   :2919   Max.   :190.00                      Max.   :200.00  
##                                                     NA's   :227     
##     LotArea         Street             Alley             LotShape        
##  Min.   : 1470   Length:1459        Length:1459        Length:1459       
##  1st Qu.: 7391   Class :character   Class :character   Class :character  
##  Median : 9399   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 9819                                                           
##  3rd Qu.:11518                                                           
##  Max.   :56600                                                           
##                                                                          
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1459        Length:1459        Length:1459        Length:1459       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1459        Length:1459        Length:1459        Length:1459       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1459        Min.   : 1.000   Min.   :1.000   Min.   :1879  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1953  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.079   Mean   :5.554   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2001  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1459        Length:1459        Length:1459       
##  1st Qu.:1963   Class :character   Class :character   Class :character  
##  Median :1992   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1984                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1459        Length:1459        Min.   :   0.0   Length:1459       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 100.7                     
##                                        3rd Qu.: 164.0                     
##                                        Max.   :1290.0                     
##                                        NA's   :15                         
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1459        Length:1459        Length:1459        Length:1459       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1459        Length:1459        Min.   :   0.0   Length:1459       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 350.5   Mode  :character  
##                                        Mean   : 439.2                     
##                                        3rd Qu.: 753.5                     
##                                        Max.   :4010.0                     
##                                        NA's   :1                          
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF     Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0   Length:1459       
##  1st Qu.:   0.00   1st Qu.: 219.2   1st Qu.: 784   Class :character  
##  Median :   0.00   Median : 460.0   Median : 988   Mode  :character  
##  Mean   :  52.62   Mean   : 554.3   Mean   :1046                     
##  3rd Qu.:   0.00   3rd Qu.: 797.8   3rd Qu.:1305                     
##  Max.   :1526.00   Max.   :2140.0   Max.   :5095                     
##  NA's   :1         NA's   :1        NA's   :1                        
##   HeatingQC          CentralAir         Electrical           1stFlrSF     
##  Length:1459        Length:1459        Length:1459        Min.   : 407.0  
##  Class :character   Class :character   Class :character   1st Qu.: 873.5  
##  Mode  :character   Mode  :character   Mode  :character   Median :1079.0  
##                                                           Mean   :1156.5  
##                                                           3rd Qu.:1382.5  
##                                                           Max.   :5095.0  
##                                                                           
##     2ndFlrSF     LowQualFinSF        GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :   0.000   Min.   : 407   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:   0.000   1st Qu.:1118   1st Qu.:0.0000  
##  Median :   0   Median :   0.000   Median :1432   Median :0.0000  
##  Mean   : 326   Mean   :   3.543   Mean   :1486   Mean   :0.4345  
##  3rd Qu.: 676   3rd Qu.:   0.000   3rd Qu.:1721   3rd Qu.:1.0000  
##  Max.   :1862   Max.   :1064.000   Max.   :5095   Max.   :3.0000  
##                                                   NA's   :2       
##   BsmtHalfBath       FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.0000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.0652   Mean   :1.571   Mean   :0.3777   Mean   :2.854  
##  3rd Qu.:0.0000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.0000   Max.   :4.000   Max.   :2.0000   Max.   :6.000  
##  NA's   :2                                                        
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1459        Min.   : 3.000   Length:1459       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.042                      Mean   : 6.385                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :2.000                      Max.   :15.000                     
##                                                                        
##    Fireplaces     FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.0000   Length:1459        Length:1459        Min.   :1895  
##  1st Qu.:0.0000   Class :character   Class :character   1st Qu.:1959  
##  Median :0.0000   Mode  :character   Mode  :character   Median :1979  
##  Mean   :0.5812                                         Mean   :1978  
##  3rd Qu.:1.0000                                         3rd Qu.:2002  
##  Max.   :4.0000                                         Max.   :2207  
##                                                         NA's   :78    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1459        Min.   :0.000   Min.   :   0.0   Length:1459       
##  Class :character   1st Qu.:1.000   1st Qu.: 318.0   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.766   Mean   : 472.8                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :5.000   Max.   :1488.0                     
##                     NA's   :1       NA's   :1                          
##   GarageCond         PavedDrive          WoodDeckSF       OpenPorchSF    
##  Length:1459        Length:1459        Min.   :   0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:   0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :   0.00   Median : 28.00  
##                                        Mean   :  93.17   Mean   : 48.31  
##                                        3rd Qu.: 168.00   3rd Qu.: 72.00  
##                                        Max.   :1424.00   Max.   :742.00  
##                                                                          
##  EnclosedPorch       3SsnPorch        ScreenPorch        PoolArea      
##  Min.   :   0.00   Min.   :  0.000   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:   0.00   1st Qu.:  0.000   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :   0.00   Median :  0.000   Median :  0.00   Median :  0.000  
##  Mean   :  24.24   Mean   :  1.794   Mean   : 17.06   Mean   :  1.744  
##  3rd Qu.:   0.00   3rd Qu.:  0.000   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :1012.00   Max.   :360.000   Max.   :576.00   Max.   :800.000  
##                                                                        
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1459        Length:1459        Length:1459        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   58.17  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :17000.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1459        Length:1459       
##  1st Qu.: 4.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.104   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
## 

Handle Missing Values

Look for missing data and fill in using same methods as our training data.

# Number of missing values in every column of the test data.
test_df %>%
  is.na() %>%
  colSums()
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##             0             0             4           227             0 
##        Street         Alley      LotShape   LandContour     Utilities 
##             0          1352             0             0             2 
##     LotConfig     LandSlope  Neighborhood    Condition1    Condition2 
##             0             0             0             0             0 
##      BldgType    HouseStyle   OverallQual   OverallCond     YearBuilt 
##             0             0             0             0             0 
##  YearRemodAdd     RoofStyle      RoofMatl   Exterior1st   Exterior2nd 
##             0             0             0             1             1 
##    MasVnrType    MasVnrArea     ExterQual     ExterCond    Foundation 
##            16            15             0             0             0 
##      BsmtQual      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1 
##            44            45            44            42             1 
##  BsmtFinType2    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating 
##            42             1             1             1             0 
##     HeatingQC    CentralAir    Electrical      1stFlrSF      2ndFlrSF 
##             0             0             0             0             0 
##  LowQualFinSF     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath 
##             0             0             2             2             0 
##      HalfBath  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd 
##             0             0             0             1             0 
##    Functional    Fireplaces   FireplaceQu    GarageType   GarageYrBlt 
##             2             0           730            76            78 
##  GarageFinish    GarageCars    GarageArea    GarageQual    GarageCond 
##            78             1             1            78            78 
##    PavedDrive    WoodDeckSF   OpenPorchSF EnclosedPorch     3SsnPorch 
##             0             0             0             0             0 
##   ScreenPorch      PoolArea        PoolQC         Fence   MiscFeature 
##             0             0          1456          1169          1408 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##             0             0             0             1             0
# Impute missing values in the test data so that every row gets a prediction.
# NOTE(review): the fill values are computed from test_df itself, not from
# train_df -- confirm this is intended.

# Numeric columns: replace each NA with the mean of the column's observed
# values. is.numeric() covers integer as well as double columns.
for (col in colnames(test_df)) {
  if (is.numeric(test_df[[col]]) && anyNA(test_df[[col]])) {
    is_missing <- is.na(test_df[[col]])
    test_df[[col]][is_missing] <- mean(test_df[[col]], na.rm = TRUE)
  }
}

# Character columns: replace each NA with the most frequent observed value.
# DescTools::Mode() returns every tied value when several values share the top
# frequency, so keep only the first one; assigning more than one replacement
# value would recycle them across the missing rows.
for (col in colnames(test_df)) {
  if (is.character(test_df[[col]]) && anyNA(test_df[[col]])) {
    is_missing <- is.na(test_df[[col]])
    test_df[[col]][is_missing] <- Mode(test_df[[col]], na.rm = TRUE)[1]
  }
}

# Confirm that no missing values remain.
anyNA(test_df)
## [1] FALSE

Make Predictions

Now we can predict the Sale Price for our test data set with the model we just created.

# Predict SalePrice for every test row with the reduced model. predict() is the
# S3 generic and dispatches to the lm method, so the method does not need to be
# called by name.
test_predictions <- predict(train_df_lm, newdata = test_df)

# Attach the predictions to the test data.
test_df$SalePrice <- test_predictions

# Keep only the two columns needed for the submission: Id and SalePrice.
test_saleprice <- test_df %>% dplyr::select(Id, SalePrice)
head(test_saleprice)
## # A tibble: 6 × 2
##      Id SalePrice
##   <dbl>     <dbl>
## 1  1461   120124.
## 2  1462   164980.
## 3  1463   174175.
## 4  1464   201534.
## 5  1465   197278.
## 6  1466   184179.

Save .csv and Report Kaggle Score

Report your Kaggle.com user name and score.

Kaggle user name: dirkhartog profile Score: 0.29641

# Save the Id/SalePrice predictions as a CSV file in the working directory.
readr::write_csv(x = test_saleprice, file = "Sale_price_pred.csv")

Discussion/Analysis

Building the multiple regression model eliminated 15 variables from the data set, returning an adjusted R-squared value of 0.8092, so our model explains roughly 81% of the variability in Sale Price. With an overall F-test p-value of less than 0.05, we can say the model is statistically significant. The residuals appear to be nearly normal by looking at the Q-Q plot; however, there do appear to be some outliers. This is made more apparent when looking at the residual plot. While there is constant variability above and below 0, the residuals appear to collect toward the left side. There is 1 particular point in the leverage plot that falls below the dashed line marking Cook’s distance. This is considered an influential point. There is one other point that is close but does not cross the line. Dealing with these outliers might improve our model. Another factor that may have impacted the model is the skewed distribution of some variables. Performing a log or square root transformation may have improved the model. Finally, another limitation is that I did not include any of the categorical variables in the model. Figuring out how the categorical variables affect the predictability of Sale Price would also be important.