# Note I had to load the MASS package prior to the tidyverse packages
# to prevent conflicts amongst functions, e.g. dplyr::select, MASS::select
library(MASS)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::select() masks MASS::select()
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(cowplot)

Kaggle Data

https://www.kaggle.com/c/house-prices-advanced-regression-techniques.

train <- read_csv("/Users/anjalibm.com/Documents/DataScience/DATA_605/Final/data/train.csv", na = c("", "NA"))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_character(),
##   Id = col_double(),
##   MSSubClass = col_double(),
##   LotFrontage = col_double(),
##   LotArea = col_double(),
##   OverallQual = col_double(),
##   OverallCond = col_double(),
##   YearBuilt = col_double(),
##   YearRemodAdd = col_double(),
##   MasVnrArea = col_double(),
##   BsmtFinSF1 = col_double(),
##   BsmtFinSF2 = col_double(),
##   BsmtUnfSF = col_double(),
##   TotalBsmtSF = col_double(),
##   `1stFlrSF` = col_double(),
##   `2ndFlrSF` = col_double(),
##   LowQualFinSF = col_double(),
##   GrLivArea = col_double(),
##   BsmtFullBath = col_double(),
##   BsmtHalfBath = col_double(),
##   FullBath = col_double()
##   # ... with 18 more columns
## )
## ℹ Use `spec()` for the full column specifications.
test <- read_csv("/Users/anjalibm.com/Documents/DataScience/DATA_605/Final/data/test.csv", na = c("", "NA"))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_character(),
##   Id = col_double(),
##   MSSubClass = col_double(),
##   LotFrontage = col_double(),
##   LotArea = col_double(),
##   OverallQual = col_double(),
##   OverallCond = col_double(),
##   YearBuilt = col_double(),
##   YearRemodAdd = col_double(),
##   MasVnrArea = col_double(),
##   BsmtFinSF1 = col_double(),
##   BsmtFinSF2 = col_double(),
##   BsmtUnfSF = col_double(),
##   TotalBsmtSF = col_double(),
##   `1stFlrSF` = col_double(),
##   `2ndFlrSF` = col_double(),
##   LowQualFinSF = col_double(),
##   GrLivArea = col_double(),
##   BsmtFullBath = col_double(),
##   BsmtHalfBath = col_double(),
##   FullBath = col_double()
##   # ... with 17 more columns
## )
## ℹ Use `spec()` for the full column specifications.

Problem 1.

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 1st quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.

   

a.

\[ P(X > x | Y > y) = \frac{P(X > x \cap Y > y)}{P(Y > y)} \]

In words, with X = OpenPorchSF and Y = SalePrice, we’re looking for the conditional probability that OpenPorchSF is greater than its first quartile, given that SalePrice is greater than its first quartile.

# Get the first quartile of each variable
x_fq <- quantile(train$OpenPorchSF, 0.25)
y_fq <- quantile(train$SalePrice, 0.25)
denominator <- train[train$SalePrice > y_fq, ]
numerator <- denominator[denominator$OpenPorchSF > x_fq, ]
paste0("Probability = ", nrow(numerator) / nrow(denominator))
## [1] "Probability = 0.647488584474886"
rm(denominator, numerator)

b.

\[ P(X > x, Y > y) = P(X > x \cap Y > y) \]

jointp <- train[train$OpenPorchSF > x_fq & train$SalePrice > y_fq, ]
paste0("Probability = ", nrow(jointp) / nrow(train))
## [1] "Probability = 0.485616438356164"
rm(jointp)

This is the joint probability that a house exceeds the first quartile of both OpenPorchSF and SalePrice; roughly 48.6% of the training houses satisfy both conditions.

c.

\[ P(X < x | Y > y) = \frac{P(X < x \cap Y > y)}{P(Y > y)} \]

denominator <- train[train$SalePrice > y_fq, ]
numerator <- denominator[denominator$OpenPorchSF < x_fq, ]
paste0("Probability = ", nrow(numerator) / nrow(denominator))
## [1] "Probability = 0"
oo <- nrow(train[train$OpenPorchSF <= x_fq & train$SalePrice <= y_fq, ])
ot <- nrow(train[train$OpenPorchSF <= x_fq & train$SalePrice > y_fq, ])  
or <- oo + ot  
to <- nrow(train[train$OpenPorchSF > x_fq & train$SalePrice <= y_fq, ])
tt <- nrow(train[train$OpenPorchSF > x_fq & train$SalePrice > y_fq, ])
tr <- to + tt
ro <- oo + to
rt <- ot + tt
rr <- ro + rt  
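Assembling these counts into the requested table, a minimal sketch (the row and column labels are my own):

counts <- matrix(c(oo, ot, or,
                   to, tt, tr,
                   ro, rt, rr),
                 nrow = 3, byrow = TRUE,
                 dimnames = list(c("X <= x", "X > x", "Total"),
                                 c("Y <= y", "Y > y", "Total")))
counts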

2. Descriptive and Inferential Statistics

Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of X and Y. Derive a correlation matrix for any THREE quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide a 92% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

For brevity’s sake, I’ll only summarize the last 10 (numeric) columns.

train %>%
  select_if(is.numeric) %>%
  psych::describe() %>%
  tail(10)
##               vars    n      mean       sd median   trimmed      mad   min
## WoodDeckSF      29 1460     94.24   125.34      0     71.76     0.00     0
## OpenPorchSF     30 1460     46.66    66.26     25     33.23    37.06     0
## EnclosedPorch   31 1460     21.95    61.12      0      3.87     0.00     0
## 3SsnPorch       32 1460      3.41    29.32      0      0.00     0.00     0
## ScreenPorch     33 1460     15.06    55.76      0      0.00     0.00     0
## PoolArea        34 1460      2.76    40.18      0      0.00     0.00     0
## MiscVal         35 1460     43.49   496.12      0      0.00     0.00     0
## MoSold          36 1460      6.32     2.70      6      6.25     2.97     1
## YrSold          37 1460   2007.82     1.33   2008   2007.77     1.48  2006
## SalePrice       38 1460 180921.20 79442.50 163000 170783.29 56338.80 34900
##                  max  range  skew kurtosis      se
## WoodDeckSF       857    857  1.54     2.97    3.28
## OpenPorchSF      547    547  2.36     8.44    1.73
## EnclosedPorch    552    552  3.08    10.37    1.60
## 3SsnPorch        508    508 10.28   123.06    0.77
## ScreenPorch      480    480  4.11    18.34    1.46
## PoolArea         738    738 14.80   222.19    1.05
## MiscVal        15500  15500 24.43   697.64   12.98
## MoSold            12     11  0.21    -0.41    0.07
## YrSold          2010      4  0.10    -1.19    0.03
## SalePrice     755000 720100  1.88     6.50 2079.11

I’ll use boxplots for the numeric variables and bar charts for the categorical ones.

train.bp <- train %>%
  select_if(is.numeric) %>%
  select(-matches("Year|Yr"), -c(Id, SalePrice, LotArea)) %>%  # matches() is case-insensitive by default
  gather()
ggplot(train.bp, aes(x = key, y = value)) +
  labs(x = "variable", title = "Training Data Boxplot") +
  geom_boxplot(outlier.colour = "red", outlier.shape = 1) +
  theme(axis.text.x = element_text(angle = 90))
## Warning: Removed 267 rows containing non-finite values (stat_boxplot).

rm(oo, or, ot, ro, rr, rt, to, tr, tt, x_fq, y_fq, train.bp, denominator, numerator)
train.bc <- train %>%
  select_if(is.character) %>%
  gather()
ggplot(data = train.bc, mapping = aes(x = value)) + 
  geom_bar() + 
  facet_wrap(~key, scales = 'free', ncol = 4) +
  theme(axis.text.x = element_text(angle = 90))

rm(train.bc)
ggplot(data = train, aes(x = OpenPorchSF, y = SalePrice)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# Rename columns whose names start with a digit so they are valid R identifiers
train <- rename(train, firstFlrSF = `1stFlrSF`, secFlrSF = `2ndFlrSF`, threeSsnPorch = `3SsnPorch`)
test <- rename(test, firstFlrSF = `1stFlrSF`, secFlrSF = `2ndFlrSF`, threeSsnPorch = `3SsnPorch`)
cors <- cor(train[, c("MSSubClass", "firstFlrSF", "GrLivArea")])
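For reference, the derived matrix itself can be printed, rounded to three decimals (my addition):

round(cors, 3)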

\(H_0:\) The correlation coefficient is 0; that is, the variables are not linearly related. \(H_1:\) The correlation coefficient is not 0.

cor.test(train$MSSubClass, train$firstFlrSF, conf.level = 0.92)
## 
##  Pearson's product-moment correlation
## 
## data:  train$MSSubClass and train$firstFlrSF
## t = -9.933, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  -0.2941963 -0.2083297
## sample estimates:
##        cor 
## -0.2517584
cor.test(train$MSSubClass, train$GrLivArea, conf.level = 0.92)
## 
##  Pearson's product-moment correlation
## 
## data:  train$MSSubClass and train$GrLivArea
## t = 2.8662, df = 1458, p-value = 0.004214
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.02912052 0.12027312
## sample estimates:
##        cor 
## 0.07485318
cor.test(train$firstFlrSF, train$GrLivArea, conf.level = 0.92)
## 
##  Pearson's product-moment correlation
## 
## data:  train$firstFlrSF and train$GrLivArea
## t = 26.217, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 92 percent confidence interval:
##  0.5340458 0.5963849
## sample estimates:
##      cor 
## 0.566024

Since the p-values for all three tests are much smaller than \(\alpha = 0.08\), we reject the null hypothesis in each case and conclude that each pair of variables has a nonzero correlation.

Familywise error concerns Type I errors, which occur when one rejects a true null hypothesis (a false positive). According to Wikipedia’s article on the familywise error rate (FWER), it is the probability of making at least one such error across a family of tests, and it can be controlled by ensuring \[ FWER = P(V \geq 1) = 1 - P(V = 0) \leq \alpha \] where \(V\) is the number of Type I errors committed.

\(FWER = 1-(1-0.08)^3 =\) 0.221312

So we should proceed with caution, since there is a ~22% chance of making at least one Type I error across the three tests.
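The same arithmetic in R, a minimal sketch; the Bonferroni adjustment shown last is my addition, one standard way to keep the FWER at or below \(\alpha\):

alpha <- 0.08
m <- 3                # number of pairwise correlation tests
1 - (1 - alpha)^m     # FWER: probability of at least one Type I error
alpha / m             # Bonferroni-adjusted per-test alpha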

3. Linear Algebra and Correlation

Invert your \(3 \times 3\) correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

Invert our correlation matrix

precm <- solve(cors)

Multiplying the correlation matrix by the precision (inverted) matrix

round(cors %*% precm)
##            MSSubClass firstFlrSF GrLivArea
## MSSubClass          1          0         0
## firstFlrSF          0          1         0
## GrLivArea           0          0         1
round(precm %*% cors)
##            MSSubClass firstFlrSF GrLivArea
## MSSubClass          1          0         0
## firstFlrSF          0          1         0
## GrLivArea           0          0         1

The result of the matrix multiplication, in either order, is the identity matrix, as expected for a matrix multiplied by its inverse.

To perform the LU decomposition, I’ll recycle the function I created for assignment 2.

factorize <- function(mat){
  L <- diag(nrow(mat))    # Create lower diagonal matrix of the same size
  U <- mat
  # If the leading entry [1,1] is 0, swap row 1 with the first row that has a
  # nonzero pivot in column 1 (when a swap occurs, the result factors the
  # row-permuted matrix)
  if (U[1,1] == 0){
    for (i in 2:nrow(U)){
      if (U[i,1] != 0){
        tmp <- U[1,]
        U[1,] <- U[i,]
        U[i,] <- tmp
        break
      }
    }
  }
  # Loop through the first n-1 columns; for each row below the pivot, compute
  # the multiplier, eliminate to build the upper triangular matrix U, and
  # store the multiplier in the lower triangular matrix L
  for (k in 1:(nrow(U)-1)){
    for (j in (k+1):(nrow(U))){
      if (U[j,k] != 0){
        mult <- U[j,k] / U[k,k]
        U[j,] <- U[j,] - (mult * U[k,])
        L[j,k] <- mult
      }
    }
  }
  
  factored.matrix <- list(L, U)
  
  return(factored.matrix)
}
factorize(cors)
## [[1]]
##             [,1]      [,2] [,3]
## [1,]  1.00000000 0.0000000    0
## [2,] -0.25175835 1.0000000    0
## [3,]  0.07485318 0.6244478    1
## 
## [[2]]
##            MSSubClass firstFlrSF  GrLivArea
## MSSubClass          1 -0.2517584 0.07485318
## firstFlrSF          0  0.9366177 0.58486888
## GrLivArea           0  0.0000000 0.62917691
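As a quick sanity check (my addition), multiplying the factors back together should reproduce the correlation matrix:

# L %*% U should recover the original matrix (dimnames aside)
lu <- factorize(cors)
all.equal(lu[[1]] %*% lu[[2]], cors, check.attributes = FALSE)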

4. Calculus-Based Probability & Statistics

Many times, it makes sense to fit a closed form distribution to data. For the first variable that you selected which is skewed to the right, shift it so that the minimum value is above zero as necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

Since the minimum OpenPorchSF value is 0, I’ll shift the variable up by 0.0001 so its minimum is above zero.

shifted <- train$OpenPorchSF - min(train$OpenPorchSF) + 0.0001
expdf <- fitdistr(shifted, densfun = "exponential")
lambda <- expdf$estimate
samples <- rexp(1000, lambda)  # 1000 draws from the fitted exponential

The optimal lambda value is \(\lambda =\) 0.0214315, which for an exponential fit is simply \(1 / \bar{x}\), the reciprocal of the sample mean.

plot_grid(ggplot() +
  geom_histogram(aes(samples), fill = "blue", col = "red", alpha = 0.2) +
  labs(x = "simulated OpenPorchSF"),
ggplot(data = train, aes(x = OpenPorchSF)) +
  geom_histogram(fill = "blue", colour = "red", alpha = 0.2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

From Wikipedia, the cumulative distribution function (CDF) of the exponential distribution is:

\[ F(x;\lambda) = 1 - e^{-\lambda x} \]

Setting \(F(x;\lambda) = p\) and solving for \(x\) gives the quantile function

\[ F^{-1}(p;\lambda) = \frac{-\ln(1 - p)}{\lambda}, \ \ \ \ 0 \leq p < 1 \]

paste0("5th Percentile of simulated data = ", log(1 - 0.05)/-lambda)
## [1] "5th Percentile of simulated data = 2.39336429840992"
paste0("95th Percentile of simulated data = ", log(1 - 0.95)/-lambda)
## [1] "95th Percentile of simulated data = 139.781988205825"
paste0("5th Percentile of empirical data = ", quantile(shifted, 0.05))
## [1] "5th Percentile of empirical data = 1e-04"
paste0("95th Percentile of empirical data = ", quantile(shifted, 0.95))
## [1] "95th Percentile of empirical data = 175.0501"

The formula for a confidence interval for the mean of normally distributed data is \[ \bar{x} \pm t_{\alpha/2,\, n-1} \cdot \frac{s}{\sqrt{n}} \]

With \(n - 1 = 1459\) degrees of freedom, the t critical value is effectively the normal z-value for a 95% confidence interval, 1.96.

paste0("95% Confidence interval for empirical data = (", mean(shifted) - 1.96 * sd(shifted) / sqrt(length(shifted)), ", ", mean(shifted) + 1.96 * sd(shifted) / sqrt(length(shifted)), ")")
## [1] "95% Confidence interval for empirical data = (43.2617349622305, 50.059012982975)"

5% of houses in the sample have an OpenPorchSF value below 0.0001 (i.e., essentially no open porch), while 95% are below 175.05. The 95% confidence interval tells us that, under repeated sampling, about 95% of intervals constructed this way would contain the true mean OpenPorchSF; here the interval runs from ~43.26 to ~50.06. Given the large differences between the fitted and empirical percentiles, the exponential distribution may not be the best fit for this variable.

5. Modeling

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

unloadNamespace("MASS")  # MASS is no longer needed; avoid any select() masking downstream
colSums(is.na(train))
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##             0             0             0           259             0 
##        Street         Alley      LotShape   LandContour     Utilities 
##             0          1369             0             0             0 
##     LotConfig     LandSlope  Neighborhood    Condition1    Condition2 
##             0             0             0             0             0 
##      BldgType    HouseStyle   OverallQual   OverallCond     YearBuilt 
##             0             0             0             0             0 
##  YearRemodAdd     RoofStyle      RoofMatl   Exterior1st   Exterior2nd 
##             0             0             0             0             0 
##    MasVnrType    MasVnrArea     ExterQual     ExterCond    Foundation 
##             8             8             0             0             0 
##      BsmtQual      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1 
##            37            37            38            37             0 
##  BsmtFinType2    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating 
##            38             0             0             0             0 
##     HeatingQC    CentralAir    Electrical    firstFlrSF      secFlrSF 
##             0             0             1             0             0 
##  LowQualFinSF     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath 
##             0             0             0             0             0 
##      HalfBath  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd 
##             0             0             0             0             0 
##    Functional    Fireplaces   FireplaceQu    GarageType   GarageYrBlt 
##             0             0           690            81            81 
##  GarageFinish    GarageCars    GarageArea    GarageQual    GarageCond 
##            81             0             0            81            81 
##    PavedDrive    WoodDeckSF   OpenPorchSF EnclosedPorch threeSsnPorch 
##             0             0             0             0             0 
##   ScreenPorch      PoolArea        PoolQC         Fence   MiscFeature 
##             0             0          1453          1179          1406 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##             0             0             0             0             0 
##     SalePrice 
##             0
# Convert training dataset
trf <- train %>%
  select(-c(Id, Alley, BldgType, LotFrontage, LowQualFinSF, PoolArea, PoolQC, Utilities, MiscFeature, MiscVal, Street))
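# Remove rows with YearBuilt == 1893, a factor level that otherwise causes
# train/test level mismatches when predicting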
trf <- trf[!(trf$YearBuilt %in% c(1893)), ]
trf %<>%
  mutate_if(is.character, as_factor)
trf[, c("GarageYrBlt", "MoSold", "YrSold", "YearBuilt", "YearRemodAdd")] %<>%
  mutate_all(as.factor)
trf$MSSubClass <- factor(trf$MSSubClass, levels = c(20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180))
trf[, c("OverallQual", "OverallCond", "ExterQual", "ExterCond")] %<>% 
  mutate_all(as.ordered)
trf <- mutate_at(trf, vars(matches("ExterQ.*")), fct_relevel, ... = "Fa", "TA", "Gd", "Ex")
trf <- mutate_at(trf, vars(matches("ExterC.*")), fct_relevel, ... = "Po", "Fa", "TA", "Gd", "Ex")
# trf$LotFrontage[is.na(trf$LotFrontage)] <- 0
trf <- mutate_at(trf, colnames(trf)[colSums(is.na(trf)) > 0], addNA)  # make NA an explicit factor level
trf$MasVnrArea %<>% as.character() %>% as.numeric()  # addNA coerced this numeric column; recover its values
# Convert testing dataset
tf <- test %>%
  select(-c(Id, Alley, BldgType, LotFrontage, LowQualFinSF, PoolArea, PoolQC, Utilities, MiscFeature, MiscVal, Street))
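# Drop test rows that have NAs in model predictors or factor levels unseen in
# training; predict() on the lm below would otherwise fail on these rows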
tf <- tf[!(tf$MSSubClass %in% c(150)), ]
tf <- tf[!(tf$MSZoning %in% c(NA)), ]
tf <- tf[!(tf$YearBuilt %in% c(1879, 1895, 1896, 1901, 1902, 1907)), ]
tf <- tf[!(tf$Exterior1st %in% c(NA)), ]
tf <- tf[!(tf$KitchenQual %in% c(NA)), ]
tf <- tf[!(tf$Functional %in% c(NA)), ]
tf <- tf[!(tf$GarageYrBlt %in% c(1917, 1919, 1943, 2207)), ]
tf <- tf[!(tf$SaleType %in% c(NA)), ]
tf <- tf[!(tf$BsmtFinSF1 %in% c(NA)), ]
tf <- tf[!(tf$BsmtFullBath %in% c(NA)), ]
tf <- tf[!(tf$GarageCars %in% c(NA)), ]
tf <- tf[!(tf$Condition2 %in% c("PosA")), ]
tf %<>%
  mutate_if(is.character, as_factor)
tf[, c("GarageYrBlt", "MoSold", "YrSold", "YearBuilt", "YearRemodAdd")] %<>%
  mutate_all(as.factor)
tf$MSSubClass <- factor(tf$MSSubClass, levels = c(20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180))
tf[, c("OverallQual", "OverallCond", "ExterQual", "ExterCond")] %<>% 
  mutate_all(as.ordered)
tf <- mutate_at(tf, vars(matches("ExterQ.*")), fct_relevel, ... = "Fa", "TA", "Gd", "Ex")
tf <- mutate_at(tf, vars(matches("ExterC.*")), fct_relevel, ... = "Po", "Fa", "TA", "Gd", "Ex")
# tf$LotFrontage[is.na(tf$LotFrontage)] <- 0
tf <- mutate_at(tf, colnames(tf)[colSums(is.na(tf)) > 0], addNA)  # make NA an explicit factor level
tf$MasVnrArea %<>% as.character() %>% as.numeric()  # recover numeric values after addNA coercion
# m1 <- lm(SalePrice ~ ., data = trf)
# 
# summary(m1)
# 
# predict(m1, newdata = tf, type = "response")

There are many variables worth recoding, as done above. Several columns can be converted from numeric to categorical (factor). Columns with a high number of missing values can be converted to two-level factors (houses with the feature and houses without it, i.e. 1 and 0), which lets us keep the variable rather than discard it. Lastly, we can convert the four columns containing quality information to ordered factors.

An NA in the PoolQC column, for example, probably means that the house doesn’t have a pool.
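A minimal sketch of that two-level recoding, using PoolQC as the example (the HasPool name is mine, and since PoolQC was dropped from trf above, this is illustrative only):

# Recode PoolQC to presence/absence; NA is assumed to mean "no pool"
train$HasPool <- factor(ifelse(is.na(train$PoolQC), 0, 1), levels = c(0, 1))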

m1 <- lm(SalePrice ~ ., data = trf)
# summary(m1)  # omitted for brevity; the summary spans hundreds of coefficients
# length(m1$coefficients) > m1$rank  # TRUE: the fit is rank-deficient (aliased terms)
rp1 <- ggplot(m1, aes(.fitted, .resid)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  geom_smooth(se = FALSE) +
  labs(title = "Residuals vs Fitted")
rp2 <- ggplot(m1, aes(.fitted, .stdresid)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  geom_smooth(se = FALSE) +
  labs(title = "Std. Residuals vs Fitted")
rp3 <- ggplot(m1) +
  stat_qq(aes(sample = .stdresid)) +
  geom_abline() +
  labs(title = "Normal Q-Q")
rp4 <- ggplot(m1, aes(.fitted, sqrt(abs(.stdresid)))) +
  geom_point() +
  geom_smooth(se = FALSE) +
  labs(title = "Scale-Location")
rp5 <- ggplot(m1, aes(seq_along(.cooksd), .cooksd)) +
  geom_col() +
  ylim(0, 0.0075) +
  labs(title = "Cook's Distance")
rp6 <- ggplot(m1, aes(.hat, .stdresid)) +
  geom_point(aes(size = .cooksd)) +
  geom_smooth(se = FALSE, size = 0.5) +
  labs(title = "Residuals vs Leverage")
rp7 <- ggplot(m1, aes(.hat, .cooksd)) +
  geom_vline(xintercept = 0, colour = NA) +
  geom_abline(slope = seq(0, 3, by = 0.5), colour = "white") +
  geom_smooth(se = FALSE) +
  geom_point() +
  labs(title = "Cook's distance vs Leverage")
rp8 <- ggplot(m1, aes(.resid)) +
  geom_histogram(bins = 7, color = "darkblue", fill = "steelblue") +
  labs(title = "Histogram of Residuals")
plot_grid(rp1, rp2, rp3, rp4, rp5, rp6, rp7, rp8, ncol = 2)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 40 rows containing non-finite values (stat_smooth).
## Warning: Removed 40 rows containing missing values (geom_point).
## Warning: Removed 40 rows containing non-finite values (stat_qq).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 40 rows containing non-finite values (stat_smooth).
## Warning: Removed 40 rows containing missing values (geom_point).
## Warning: Removed 94 rows containing missing values (position_stack).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 40 rows containing non-finite values (stat_smooth).
## Warning: Removed 40 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 40 rows containing non-finite values (stat_smooth).

## Warning: Removed 40 rows containing missing values (geom_point).

# levels(tf) <- union(levels(tf), levels(trf))
preds <- predict(m1, newdata = tf, type = "response")
## Warning in predict.lm(m1, newdata = tf, type = "response"): prediction from a
## rank-deficient fit may be misleading
rdf <- data.frame(ID = 1:1431, Predicted = preds)
head(rdf, 10)
##    ID Predicted
## 1   1  112738.1
## 2   2  159855.4
## 3   3  199412.6
## 4   4  200603.6
## 5   5  194311.2
## 6   6  162449.4
## 7   7  173211.2
## 8   8  155506.3
## 9   9  185514.1
## 10 10  119749.0
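To produce the actual Kaggle submission, the file needs the original test-set Id values alongside a SalePrice column. A hedged sketch (kept_ids is hypothetical; the Ids of the test rows that survive the filtering above would need to be retained before Id is dropped):

# kept_ids: hypothetical vector of the Ids of the test rows kept in `tf`
# submission <- tibble(Id = kept_ids, SalePrice = preds)
# write_csv(submission, "submission.csv")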