Import relevant libraries:

library(readr)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(equatiomatic)
library("scatterplot3d")
library(car)
## Loading required package: carData
library(effsize)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ purrr   0.3.5      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::recode()    masks car::recode()
## ✖ purrr::some()      masks car::some()
## ✖ dplyr::src()       masks Hmisc::src()
## ✖ dplyr::summarize() masks Hmisc::summarize()
library(ggiraph)
library(ggiraphExtra)
library(dplyr)

Import final dataset:

library(readr)
final <- read_csv("final.csv")
## Rows: 33 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Borough
## dbl (4): Affordable housing, Claims, Warrants, Repossessions
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Summary statistics of data:

summary(final)
##    Borough          Affordable housing     Claims       Warrants    
##  Length:33          Min.   : 0.040     Min.   :  8   Min.   :  1.0  
##  Class :character   1st Qu.: 0.890     1st Qu.:246   1st Qu.:100.0  
##  Mode  :character   Median : 2.450     Median :337   Median :117.0  
##                     Mean   : 3.031     Mean   :359   Mean   :138.4  
##                     3rd Qu.: 4.940     3rd Qu.:469   3rd Qu.:189.0  
##                     Max.   :11.700     Max.   :644   Max.   :271.0  
##  Repossessions   
##  Min.   :  1.00  
##  1st Qu.: 32.00  
##  Median : 51.00  
##  Mean   : 58.15  
##  3rd Qu.: 75.00  
##  Max.   :138.00
describe(final)
## final 
## 
##  5  Variables      33  Observations
## --------------------------------------------------------------------------------
## Borough 
##        n  missing distinct 
##       33        0       33 
## 
## lowest : Barking and Dagenham Barnet               Bexley               Brent                Bromley             
## highest: Sutton               Tower Hamlets        Waltham Forest       Wandsworth           Westminster         
## --------------------------------------------------------------------------------
## Affordable housing 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       33        0       33        1    3.031    2.973    0.148    0.366 
##      .25      .50      .75      .90      .95 
##    0.890    2.450    4.940    6.342    7.464 
## 
## lowest :  0.04  0.10  0.18  0.36  0.39, highest:  6.23  6.37  7.44  7.50 11.70
## --------------------------------------------------------------------------------
## Claims 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       33        0       33        1      359    191.2    131.6    159.2 
##      .25      .50      .75      .90      .95 
##    246.0    337.0    469.0    598.6    635.0 
## 
## lowest :   8 116 142 151 192, highest: 589 601 631 641 644
## --------------------------------------------------------------------------------
## Warrants 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       33        0       29    0.999    138.4     81.8     46.2     57.0 
##      .25      .50      .75      .90      .95 
##    100.0    117.0    189.0    242.0    253.8 
## 
## lowest :   1  33  55  56  61, highest: 238 243 245 267 271
## --------------------------------------------------------------------------------
## Repossessions 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       33        0       29    0.999    58.15    38.98     17.2     24.0 
##      .25      .50      .75      .90      .95 
##     32.0     51.0     75.0    111.8    113.8 
## 
## lowest :   1  13  20  24  30, highest: 111 112 113 115 138
## --------------------------------------------------------------------------------

Tests on datasets

Shaprio test to see the nromality of the data:

shapiro.test(final$`Affordable housing`)
## 
##  Shapiro-Wilk normality test
## 
## data:  final$`Affordable housing`
## W = 0.88889, p-value = 0.002778
shapiro.test(final$Claims)
## 
##  Shapiro-Wilk normality test
## 
## data:  final$Claims
## W = 0.96536, p-value = 0.3636
shapiro.test(final$Warrants)
## 
##  Shapiro-Wilk normality test
## 
## data:  final$Warrants
## W = 0.96231, p-value = 0.3001
shapiro.test(final$Repossessions)
## 
##  Shapiro-Wilk normality test
## 
## data:  final$Repossessions
## W = 0.92847, p-value = 0.03165

Shapiro test shows that the affordable housing data is not nomrally distributed.

Correlation matrix:

df1<-final[, !names(final) %in% c("Borough")]
df1
## # A tibble: 33 × 4
##    `Affordable housing` Claims Warrants Repossessions
##                   <dbl>  <dbl>    <dbl>         <dbl>
##  1                 0.04      8        1             1
##  2                 5.51    334      149            63
##  3                 6.23    644      189            57
##  4                 1.41    246      117            50
##  5                 7.44    631      267           115
##  6                 0.1     277      114            51
##  7                 1.2     205       61            31
##  8                 3.19    601      217           112
##  9                 6.37    589      245           113
## 10                 2.55    583      243           108
## # … with 23 more rows
cor(df1)
##                    Affordable housing    Claims  Warrants Repossessions
## Affordable housing          1.0000000 0.5641869 0.5912998     0.5088603
## Claims                      0.5641869 1.0000000 0.9469081     0.8586961
## Warrants                    0.5912998 0.9469081 1.0000000     0.9059611
## Repossessions               0.5088603 0.8586961 0.9059611     1.0000000
cor(df1[,1:3])
##                    Affordable housing    Claims  Warrants
## Affordable housing          1.0000000 0.5641869 0.5912998
## Claims                      0.5641869 1.0000000 0.9469081
## Warrants                    0.5912998 0.9469081 1.0000000

Correlation matrix shows the highest correlation between warrants and claims.

Spearman’s correlation:

cor.test(final$`Affordable housing`,final$Repossessions, method = "spearman", exact=FALSE)
## 
##  Spearman's rank correlation rho
## 
## data:  final$`Affordable housing` and final$Repossessions
## S = 2926.5, p-value = 0.002377
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.5109514

Spearman’s correlation used as affordable housing data was not normally distributed. Pearson’s correlation ius used on normally distributed data, whereas Spearman’s correlation is used on abnormaly distributed data.

Spearman’s correlation shows a Rho of 0.51, therefore there seems to be a linear relationship between affordbale housing and repossessionsn variables.

Covariance:

cov(final$`Affordable housing`,final$Repossessions)
## [1] 48.02892

Welch’s T-test: Welch’s T-test is also the unequal variances t-test. It is used to tests the hypothesis of whether the variables have equal means.

x<-final$Repossessions
y<-final$`Affordable housing`
ttest2<- t.test(x,y, var.equal = FALSE)
ttest2
## 
##  Welch Two Sample t-test
## 
## data:  x and y
## t = 9.1109, df = 32.396, p-value = 1.876e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  42.80314 67.43807
## sample estimates:
## mean of x mean of y 
## 58.151515  3.030909

Cohen’s d test: This tests investigated the effect size to indicate the difference between two means.

cohen.d(x, y)
## 
## Cohen's d
## 
## d estimate: 2.24295 (large)
## 95 percent confidence interval:
##    lower    upper 
## 1.615274 2.870626

Visualisations of the distribution of the data

Boxplot graph of affordable housing:

boxplot(final$`Affordable housing`, main='Boxplot of affordable housing in London 2020')

Histogram of affordable housing:

par(mfrow=c(1,1))
hist(final$`Affordable housing`, main='Histogram of affordable housing in London 2020', xlab='Affordable housing', ylab='Frequency')

Boxplot of reposessions:

boxplot(final$Repossessions, main='Reposessions London 2020')

Histogram of repossessions:

par(mfrow=c(1,1))
hist(final$Repossessions, main='Histogram of reposessions in London 2020', xlab='Repossessions', ylab='Frequency')

Boxplot of repossession claims:

boxplot(final$Claims, main='Reposession claims London 2020')

Histogram of repossession claims:

par(mfrow=c(1,1))
hist(final$Claims, main='Histogram of repossession claims in London 2020', xlab='Repossession claims', ylab='Frequency')

Boxplot of repossession warrants:

boxplot(final$Warrants, main='Reposession warrants London 2020')

Histogram of repossession warrants:

par(mfrow=c(1,1))
hist(final$Warrants, main='Histogram of repossession warrants in London 2020', xlab='Repossession warrants', ylab='Frequency')

Visualisations of all the data

Matrix with the kernel density curve, scatter plots, and the person correlation coefficients:

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggpairs(final[ c("Repossessions", "Claims", "Warrants", "Affordable housing")])

Scatter plot matrix:

scatterplotMatrix(~ final$Repossessions + final$Claims + final$`Affordable housing` + final$Warrants, smooth=FALSE, regLine=FALSE, ellipse=TRUE, by.groups=TRUE, diagonal=FALSE, legend=list(coords="bottomleft"))

Removing the confounder

Variance inflation factor:

mod1<-lm(final$Repossessions~final$Claims + final$Warrants + final$`Affordable housing`)
summary(mod1)
## 
## Call:
## lm(formula = final$Repossessions ~ final$Claims + final$Warrants + 
##     final$`Affordable housing`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.469  -2.153   2.368   6.933  28.305 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -3.10682    6.50748  -0.477 0.636639    
## final$Claims                0.00204    0.05085   0.040 0.968270    
## final$Warrants              0.44895    0.12165   3.690 0.000921 ***
## final$`Affordable housing` -0.52559    1.23615  -0.425 0.673841    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.36 on 29 degrees of freedom
## Multiple R-squared:  0.8219, Adjusted R-squared:  0.8035 
## F-statistic:  44.6 on 3 and 29 DF,  p-value: 5.493e-11
vif(mod1)
##               final$Claims             final$Warrants 
##                   9.677091                  10.143245 
## final$`Affordable housing` 
##                   1.538018

The VIF score for the independent variable “Warrants” is 10.1 which is higher than 10. therefore, the model will be updated without the warrants variable.

VIF without confounder:

mod2<- lm(final$Repossessions~final$Claims + final$`Affordable housing`)
summary(mod2)
## 
## Call:
## lm(formula = final$Repossessions ~ final$Claims + final$`Affordable housing`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.451  -1.442   3.047   9.057  41.102 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -6.0250     7.6989  -0.783    0.440    
## final$Claims                 0.1749     0.0236   7.411 2.94e-08 ***
## final$`Affordable housing`   0.4551     1.4389   0.316    0.754    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.31 on 30 degrees of freedom
## Multiple R-squared:  0.7382, Adjusted R-squared:  0.7208 
## F-statistic:  42.3 on 2 and 30 DF,  p-value: 1.857e-09
vif(mod2)
##               final$Claims final$`Affordable housing` 
##                   1.466936                   1.466936

The new VIF score is appropriate so confounder has been removed successfully

Performing model nonlinear logarithmic transformations to find the best model

mod2<- lm(final$Repossessions~final$Claims + final$`Affordable housing`)
summary(mod2)
## 
## Call:
## lm(formula = final$Repossessions ~ final$Claims + final$`Affordable housing`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.451  -1.442   3.047   9.057  41.102 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -6.0250     7.6989  -0.783    0.440    
## final$Claims                 0.1749     0.0236   7.411 2.94e-08 ***
## final$`Affordable housing`   0.4551     1.4389   0.316    0.754    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.31 on 30 degrees of freedom
## Multiple R-squared:  0.7382, Adjusted R-squared:  0.7208 
## F-statistic:  42.3 on 2 and 30 DF,  p-value: 1.857e-09
mod3<- lm(log(final$Repossessions) ~ final$Claims + final$`Affordable housing`)
summary(mod3)
## 
## Call:
## lm(formula = log(final$Repossessions) ~ final$Claims + final$`Affordable housing`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3119 -0.1627  0.1617  0.3192  0.5785 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 2.2774946  0.2369906   9.610 1.15e-10 ***
## final$Claims                0.0043105  0.0007265   5.934 1.68e-06 ***
## final$`Affordable housing` -0.0030728  0.0442941  -0.069    0.945    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5636 on 30 degrees of freedom
## Multiple R-squared:  0.6295, Adjusted R-squared:  0.6048 
## F-statistic: 25.49 on 2 and 30 DF,  p-value: 3.402e-07
mod4<- lm(log(final$Repossessions) ~ log(final$Claims) + final$`Affordable housing`)
summary(mod4)
## 
## Call:
## lm(formula = log(final$Repossessions) ~ log(final$Claims) + final$`Affordable housing`)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.75654 -0.09981  0.08331  0.18837  0.46614 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -2.303245   0.406334  -5.668 3.53e-06 ***
## log(final$Claims)           1.073802   0.075391  14.243 6.89e-15 ***
## final$`Affordable housing` -0.002286   0.021949  -0.104    0.918    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2982 on 30 degrees of freedom
## Multiple R-squared:  0.8963, Adjusted R-squared:  0.8893 
## F-statistic: 129.6 on 2 and 30 DF,  p-value: 1.736e-15
mod5<- lm(log(final$Repossessions) ~ log(final$Claims) + log(final$`Affordable housing`))
summary(mod5)
## 
## Call:
## lm(formula = log(final$Repossessions) ~ log(final$Claims) + log(final$`Affordable housing`))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.71399 -0.12047  0.08374  0.17606  0.50663 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -2.58816    0.50929  -5.082 1.85e-05 ***
## log(final$Claims)                1.12707    0.09239  12.199 3.70e-13 ***
## log(final$`Affordable housing`) -0.04848    0.05531  -0.876    0.388    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2945 on 30 degrees of freedom
## Multiple R-squared:  0.8988, Adjusted R-squared:  0.8921 
## F-statistic: 133.2 on 2 and 30 DF,  p-value: 1.195e-15
mod6<- lm(final$Repossessions ~ log(final$Claims) + log(final$`Affordable housing`))
summary(mod6)
## 
## Call:
## lm(formula = final$Repossessions ~ log(final$Claims) + log(final$`Affordable housing`))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.589 -16.591  -7.419   9.933  61.273 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -113.29935   44.93982  -2.521 0.017245 *  
## log(final$Claims)                 30.05520    8.15251   3.687 0.000896 ***
## log(final$`Affordable housing`)   -0.02191    4.88059  -0.004 0.996448    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 25.99 on 30 degrees of freedom
## Multiple R-squared:  0.4725, Adjusted R-squared:  0.4373 
## F-statistic: 13.44 on 2 and 30 DF,  p-value: 6.812e-05
mod7<- lm(final$Repossessions ~ log(final$Claims) + final$`Affordable housing`)
summary(mod7)
## 
## Call:
## lm(formula = final$Repossessions ~ log(final$Claims) + final$`Affordable housing`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.444 -13.265  -8.918  17.089  43.011 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -94.402     33.908  -2.784 0.009205 ** 
## log(final$Claims)            25.137      6.291   3.995 0.000387 ***
## final$`Affordable housing`    3.019      1.832   1.648 0.109702    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.89 on 30 degrees of freedom
## Multiple R-squared:  0.5163, Adjusted R-squared:  0.4841 
## F-statistic: 16.01 on 2 and 30 DF,  p-value: 1.855e-05
mod8<- lm(final$Repossessions ~ final$Claims + log(final$`Affordable housing`))
summary(mod8)
## 
## Call:
## lm(formula = final$Repossessions ~ final$Claims + log(final$`Affordable housing`))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.474  -3.334   1.882   9.202  46.316 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -8.98044    8.56649  -1.048    0.303    
## final$Claims                     0.19034    0.02476   7.687 1.42e-08 ***
## log(final$`Affordable housing`) -2.25422    3.10456  -0.726    0.473    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.18 on 30 degrees of freedom
## Multiple R-squared:  0.7419, Adjusted R-squared:  0.7247 
## F-statistic: 43.12 on 2 and 30 DF,  p-value: 1.503e-09
mod9<- lm(log(final$Repossessions) ~ final$Claims + log(final$`Affordable housing`))
summary(mod9)
## 
## Call:
## lm(formula = log(final$Repossessions) ~ final$Claims + log(final$`Affordable housing`))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0128 -0.1577  0.1206  0.2281  0.8303 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     2.4655454  0.2546886   9.681 9.69e-11 ***
## final$Claims                    0.0035382  0.0007362   4.806 4.03e-05 ***
## log(final$`Affordable housing`) 0.1494523  0.0923013   1.619    0.116    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5405 on 30 degrees of freedom
## Multiple R-squared:  0.6592, Adjusted R-squared:  0.6365 
## F-statistic: 29.02 on 2 and 30 DF,  p-value: 9.705e-08

Extract model equation:

extract_eq(mod6, use_coefs= TRUE)

\[ \operatorname{\widehat{final\$Repossessions}} = -113.3 + 30.06(\operatorname{\log(final\$Claims)}) - 0.02(\operatorname{\log(final\$`Affordable\ housing`)}) \] Model 6 has the highest p-value so it will be used. However, the p-value is very small so the model is not perfect. Nonetheless, this is the best linear model for the variables.

Visualisations for log transformations:

Claims log transformation:

par(mfrow=c(1, 1))
hist(log(final$Claims), main="Claims log transformation")

Affordable housing log transformation:

par(mfrow=c(1, 1))
hist(log(final$`Affordable housing`), main="Affordable housing log transformation")

Compare models with 2 variables

mod20<- lm(final$Repossessions ~ log(final$Claims))
mod21<- lm(final$Repossessions ~log(final$`Affordable housing`))
summary(mod20)
## 
## Call:
## lm(formula = final$Repossessions ~ log(final$Claims))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.610 -16.584  -7.428   9.922  61.247 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -113.164     32.814  -3.449  0.00164 ** 
## log(final$Claims)   30.029      5.699   5.270 9.91e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 25.57 on 31 degrees of freedom
## Multiple R-squared:  0.4725, Adjusted R-squared:  0.4555 
## F-statistic: 27.77 on 1 and 31 DF,  p-value: 9.914e-06
summary(mod21)
## 
## Call:
## lm(formula = final$Repossessions ~ log(final$`Affordable housing`))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.108 -23.895  -9.962  25.858  66.275 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       51.393      5.798   8.864 5.26e-10 ***
## log(final$`Affordable housing`)   12.639      4.112   3.073  0.00439 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30.82 on 31 degrees of freedom
## Multiple R-squared:  0.2335, Adjusted R-squared:  0.2088 
## F-statistic: 9.445 on 1 and 31 DF,  p-value: 0.004388

Removing variables to see what model is best. Model 6 (the model with three varables and the log transformations) has the highest p-value

Compare model 6 with the original model:

anova(mod2, mod6)
## Analysis of Variance Table
## 
## Model 1: final$Repossessions ~ final$Claims + final$`Affordable housing`
## Model 2: final$Repossessions ~ log(final$Claims) + log(final$`Affordable housing`)
##   Res.Df   RSS Df Sum of Sq F Pr(>F)
## 1     30 10056                      
## 2     30 20263  0    -10208

Akaike Information Criterion:

AIC(mod2)
## [1] 290.3895
AIC(mod6)
## [1] 313.5119

The AIC estimates how much information was lost by a given model using the maximum likelihood. The smaller the AIC the better. Although, mod6 has a higher AIC, it will still be used as it gave the highest P-value and includes the moderator.

Model diagnostics

par(mfrow=c(1,2))
plot(mod6)

Plot 1: Residuals vs Fitted:

The linearity assumption is when the dependent variable is linearly related to the independent variables. Th assumption is correct if there is no systematic relationship between the residuals and fitted values. The data points are distributed without any pattern here. Hence, the first assumption of linearity is valid.

The mean value of error terms is zero conditional on the independent variable. This is shown by how the residuals are have no pattern around y = 0.

The points with a large positive or negative residual could be outliers as they are not predicted well by the fitted regression model. Observations 1, 19 and 25 have the largest residuals in plot 1.

If the homoscedasticity (constant variance) assumption is met, there would be constant variance vertically direction and the scatter would vertically symmetric. This is not clear in plot 1. However, there are more tests for homoscedasticity. Furthermore, the analysis must check if the fitted values are uncorrelated with the error term.

We can now use a specific test for homoscedasticity

Test for homoscedasticity :

ncvTest(mod6)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.5098689, Df = 1, p = 0.4752

P-value is 0.4752 and it is not significant. This means the homoscedasticity assumption is valid

Spread level plot:

spreadLevelPlot(mod6)
## Warning in spreadLevelPlot.lm(mod6): 
## 1 negative fitted value removed

## 
## Suggested power transformation:  0.05098301

The Spread-Level Plot uses the absolute standardized residuals and fitted values to make a line of best fit. The line is not perfectly straight but, homoscedasticity was still assumed.

Durbin Watson test:

durbinWatsonTest(mod6)
##  lag Autocorrelation D-W Statistic p-value
##    1        0.235314      1.361614   0.054
##  Alternative hypothesis: rho != 0

A Durbin-Watson test investigates autocorrelation.

The p-value larger than 0.05 and so it is not significant. It suggests there is no autocorrelation between the observations of the error terms.

To conclude = all assumptions on plot 1 are checked, however, the model is not perfect.

Overall, the model did not do well given the assumptions, especially as the red line on the Residuals vs Fitted plot was not horizontal.

Plot 2: Normal Q-Q

The Normal Q-Q plot is a probability plot of the standardized residuals against the values that would be expected from a normal distribution. Deviations from a straight line indicate that the distribution of residuals do not conform to a theoretical normal curve.

The normality assumption is mostly met as most points line along the 45 degree line. It is not perfect as there three outliers: 1, 25, and 19.

We can confirm the assumption with a Shapiro-Wilk test.

Shapiro-Wilk test:

shapiro.test(mod6$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  mod6$residuals
## W = 0.91775, p-value = 0.01588

The p-value is smaller than 0.05 so the data is not normally distributed and the assumption has failed.

Another Shapiro-Wilk text will be run after outliers are removed.

Plot 3 for Scale-location:

This plot checks homoscedasticity. A scale-location plot displays the fitted values of a regression model along the x-axis and the square root of the standardized residuals along the y-axis.

The red line is not horizontal on the graph. However, the residuals are spread equally above and below the red line. There is no clear pattern in the residuals.

Points 1, 19 and 25 are marked in the plot for further checks.

Overall, it is not clear if homoscedasticity asssumption is valid.

Plot 4 for Residuals vs Leverage

This plot highlights the individual observations, such as outliers, high-leverage points and influential observations.

The leverage shows how much the coefficient of the OLS model would change if some of the observations that have a great impact on the regression were deleted. The plot highlights three points: 1, 23, and 25

However, overall the red lines are not horizontal and the tests for autocorrelation and homoscedasticity failed meaning that the model assumptions have been violated and the statistical significance tests and confidence intervals may not accurate. Therefore, the results of the linear regression will not be completely reliable.

A Cook’s distance chart was used to investigate the impact of the outliers on the model and to see if the removal of the outliers would improve the model.

Cook’s distance is calculated by removing one data point from the model and recalculating the regression. It summarizes how much all the values in the regression model change when one observation is removed. If any point falls outside of Cook’s distance, then it is considered to be an influential observation.

Cook’s distance chart:

plot(mod6, 4)

final[c(1,23,25), ]
## # A tibble: 3 × 5
##   Borough        `Affordable housing` Claims Warrants Repossessions
##   <chr>                         <dbl>  <dbl>    <dbl>         <dbl>
## 1 City of London                 0.04      8        1             1
## 2 Lewisham                       0.59    641      238           111
## 3 Newham                        11.7     558      271           138

The City of London and Newham data points will be removed as they are consistently identified as outliers. The City of London is a reasonable outlier as it is primarily a commercial area. However, Newham appears to be a normal outlier in the dataset.

Create a new dataset without the City of London and Newham data points:

second <- read_csv("second.csv")
## Rows: 31 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Borough
## dbl (4): Affordable housing, Claims, Warrants, Repossessions
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Create new final model:

mod10<- lm(second$Repossessions ~ log(second$Claims) + log(second$`Affordable housing`))
summary(mod10)
## 
## Call:
## lm(formula = second$Repossessions ~ log(second$Claims) + log(second$`Affordable housing`))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.745 -10.592   3.284  10.533  26.044 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -303.326     47.842   -6.34 7.36e-07 ***
## log(second$Claims)                 62.716      8.418    7.45 4.10e-08 ***
## log(second$`Affordable housing`)   -5.264      3.418   -1.54    0.135    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.54 on 28 degrees of freedom
## Multiple R-squared:  0.7005, Adjusted R-squared:  0.6791 
## F-statistic: 32.75 on 2 and 28 DF,  p-value: 4.668e-08

Model diagnostics for new model with outlier removed:

plot(mod10)

New Shapiro-Wilk test:

shapiro.test(mod10$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  mod10$residuals
## W = 0.95187, p-value = 0.1757

P-value is bigger than 0.05 so the data is normally distributed and the transformation is good

New test for homoscedasticity:

ncvTest(mod10)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 4.201956, Df = 1, p = 0.040377

The low p-value is significant and suggests heteeroscedasticity.

New Durbin Watson test:

durbinWatsonTest(mod10)
##  lag Autocorrelation D-W Statistic p-value
##    1      0.08229308      1.747927    0.52
##  Alternative hypothesis: rho != 0

The new Durbin watson tests provides a non-significant p-value, suggesting autocorrelation is absent.

New Cook’s distance chart:

plot(mod10, 4)

Numerous outliers are pointed out, hwoever, they do not have a massive impact on the dataset so they will remain.

Overall, the OLS model diagnostics have improved. The 4 model plots have straighter red lines.The points are spread equally on the Resiudals vs Fitted plot. Most data points line along the 45 degree line in the Normal Q-Q plot. The Scale-lcoation plot has a more horizontal red line and the points are spread equally, as does the Residuals vs Leverage plot. Furthermore, the new tests suggests autocorrelation is now absents. However, the new tests suggest there is still heteroscedasticity. Nonetheless, this is the best linear model without removing too much data so the analysis will continue.

Confidence vs predicted intervals for model 10

conf.int<- predict(mod10, interval="confidence")
head(conf.int)
##        fit      lwr       upr
## 1 52.14130 42.09198  62.19061
## 2 92.67140 80.95303 104.38978
## 3 40.13722 32.33712  47.93732
## 4 90.45807 78.61045 102.30569
## 5 61.51086 41.69008  81.33165
## 6 29.55172 19.98400  39.11943
pred.int<-predict(mod10, interval="predict")
## Warning in predict.lm(mod10, interval = "predict"): predictions on current data refer to _future_ responses
head(pred.int)
##        fit       lwr       upr
## 1 52.14130 14.838338  89.44425
## 2 92.67140 54.884602 130.45821
## 3 40.13722  3.376327  76.89812
## 4 90.45807 52.630990 128.28515
## 5 61.51086 20.481783 102.53995
## 6 29.55172 -7.624394  66.72783

Plot confidence vs predicted:

conf.int<- predict(mod10, interval="confidence")
head(conf.int)
##        fit      lwr       upr
## 1 52.14130 42.09198  62.19061
## 2 92.67140 80.95303 104.38978
## 3 40.13722 32.33712  47.93732
## 4 90.45807 78.61045 102.30569
## 5 61.51086 41.69008  81.33165
## 6 29.55172 19.98400  39.11943
new<- mutate(second, predicted = mod10$fitted.values, residuals = mod10$residuals)
new<- cbind(new, conf.int)
head(new)
##                Borough Affordable housing Claims Warrants Repossessions
## 1 Barking and Dagenham               5.51    334      149            63
## 2               Barnet               6.23    644      189            57
## 3               Bexley               1.41    246      117            50
## 4                Brent               7.44    631      267           115
## 5              Bromley               0.10    277      114            51
## 6               Camden               1.20    205       61            31
##   predicted  residuals      fit      lwr       upr
## 1  52.14130  10.858704 52.14130 42.09198  62.19061
## 2  92.67140 -35.671404 92.67140 80.95303 104.38978
## 3  40.13722   9.862777 40.13722 32.33712  47.93732
## 4  90.45807  24.541930 90.45807 78.61045 102.30569
## 5  61.51086 -10.510864 61.51086 41.69008  81.33165
## 6  29.55172   1.448283 29.55172 19.98400  39.11943
pred.int<- predict(mod10, interval="predict")
## Warning in predict.lm(mod10, interval = "predict"): predictions on current data refer to _future_ responses
new<- new %>% rename("CIfit"="fit", "CIlwr"="lwr", "CIupr"="upr")
new<-cbind(new, pred.int)
new<- new%>%rename(PIfit=fit, PIlwr=lwr, PIupr=upr)
head(new)
##                Borough Affordable housing Claims Warrants Repossessions
## 1 Barking and Dagenham               5.51    334      149            63
## 2               Barnet               6.23    644      189            57
## 3               Bexley               1.41    246      117            50
## 4                Brent               7.44    631      267           115
## 5              Bromley               0.10    277      114            51
## 6               Camden               1.20    205       61            31
##   predicted  residuals    CIfit    CIlwr     CIupr    PIfit     PIlwr     PIupr
## 1  52.14130  10.858704 52.14130 42.09198  62.19061 52.14130 14.838338  89.44425
## 2  92.67140 -35.671404 92.67140 80.95303 104.38978 92.67140 54.884602 130.45821
## 3  40.13722   9.862777 40.13722 32.33712  47.93732 40.13722  3.376327  76.89812
## 4  90.45807  24.541930 90.45807 78.61045 102.30569 90.45807 52.630990 128.28515
## 5  61.51086 -10.510864 61.51086 41.69008  81.33165 61.51086 20.481783 102.53995
## 6  29.55172   1.448283 29.55172 19.98400  39.11943 29.55172 -7.624394  66.72783
Repossessions<-second$Repossessions
Housing<-second$`Affordable housing`

Confidence vs predicted intervals plot

ggplot(new, aes(Repossessions, Housing)) + 
  geom_point()+
  stat_smooth(method=lm) +
  geom_line(aes(y=PIlwr), color="red", linetype="dashed") + 
  geom_line(aes(y=PIupr), color="red", linetype="dashed")
## `geom_smooth()` using formula = 'y ~ x'

The final model does not give much information, suggesting that there is a limited relationship between the variables.

Summary of final model: -Note that model 10 and model 11 are the same. The only difference between the models is how the formula was written. Data = second vs second$Repossessions

mod11<- lm(Repossessions ~ log(Claims) + log(`Affordable housing`), data=second)
summary(mod11)
## 
## Call:
## lm(formula = Repossessions ~ log(Claims) + log(`Affordable housing`), 
##     data = second)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.745 -10.592   3.284  10.533  26.044 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -303.326     47.842   -6.34 7.36e-07 ***
## log(Claims)                 62.716      8.418    7.45 4.10e-08 ***
## log(`Affordable housing`)   -5.264      3.418   -1.54    0.135    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.54 on 28 degrees of freedom
## Multiple R-squared:  0.7005, Adjusted R-squared:  0.6791 
## F-statistic: 32.75 on 2 and 28 DF,  p-value: 4.668e-08

Report final model:

library(report)
report(mod11)
## Formula contains log- or sqrt-terms.
##   See help("standardize") for how such terms are standardized.
## Formula contains log- or sqrt-terms.
##   See help("standardize") for how such terms are standardized.
## We fitted a linear model (estimated using OLS) to predict Repossessions with
## Claims (formula: Repossessions ~ log(Claims) + log(`Affordable housing`)). The
## model explains a statistically significant and substantial proportion of
## variance (R2 = 0.70, F(2, 28) = 32.75, p < .001, adj. R2 = 0.68). The model's
## intercept, corresponding to Claims = 0, is at -303.33 (95% CI [-401.33,
## -205.33], t(28) = -6.34, p < .001). Within this model:
## 
##   - The effect of Claims [log] is statistically significant and positive (beta =
## 62.72, 95% CI [45.47, 79.96], t(28) = 7.45, p < .001; Std. beta = 2.36, 95% CI
## [1.74, 2.98])
##   - The effect of Affordable housing [log] is statistically non-significant and
## negative (beta = -5.26, 95% CI [-12.27, 1.74], t(28) = -1.54, p = 0.135; Std.
## beta = -0.48, 95% CI [-1.04, 0.08])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation. and We fitted a linear
## model (estimated using OLS) to predict Repossessions with Affordable housing
## (formula: Repossessions ~ log(Claims) + log(`Affordable housing`)). The model
## explains a statistically significant and substantial proportion of variance (R2
## = 0.70, F(2, 28) = 32.75, p < .001, adj. R2 = 0.68). The model's intercept,
## corresponding to Affordable housing = 0, is at -303.33 (95% CI [-401.33,
## -205.33], t(28) = -6.34, p < .001). Within this model:
## 
##   - The effect of Claims [log] is statistically significant and positive (beta =
## 62.72, 95% CI [45.47, 79.96], t(28) = 7.45, p < .001; Std. beta = 2.36, 95% CI
## [1.74, 2.98])
##   - The effect of Affordable housing [log] is statistically non-significant and
## negative (beta = -5.26, 95% CI [-12.27, 1.74], t(28) = -1.54, p = 0.135; Std.
## beta = -0.48, 95% CI [-1.04, 0.08])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation.

Final model equation:

extract_eq(mod11, use_coefs= TRUE)

\[ \operatorname{\widehat{Repossessions}} = -303.33 + 62.72(\operatorname{\log(Claims)}) - 5.26(\operatorname{\log(`Affordable\ housing`)}) \] ## Added variable plots

avPlots(mod11)

The added variable plots show the relationship between the independent variable and the dependent variables.

The labelled observations have the largest residuals.

The positive slope indicates a positive coefficient in the regression equation for repossessions and claims.

The negative slope indicates a negative coefficient in the regression equation for repossessions and affordable housing.

Interaction

Interaction between two continuous independent variables:

Repo<-second$Repossessions
Claims<-second$Claims
Aff<-second$`Affordable housing`
fit1<-lm(Repo~ log(Aff*Claims))

Model equation:

extract_eq(fit1, use_coefs= TRUE)

\[ \operatorname{\widehat{Repo}} = -14.14 + 11.19(\operatorname{\log(Aff\ *\ Claims)}) \]

Model summary:

summary(fit1)
## 
## Call:
## lm(formula = Repo ~ log(Aff * Claims))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -45.32 -19.34 -10.57  18.80  58.73 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         -14.14      22.40  -0.631  0.53277   
## log(Aff * Claims)    11.19       3.42   3.272  0.00276 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.91 on 29 degrees of freedom
## Multiple R-squared:  0.2696, Adjusted R-squared:  0.2444 
## F-statistic:  10.7 on 1 and 29 DF,  p-value: 0.00276

The model summary shows that p-value is quite insignificant. Therefore, it suggests that there is an insignificant relationship between the 2 independent variables

Report interaction model fit1:

report(fit1)
## Formula contains log- or sqrt-terms.
##   See help("standardize") for how such terms are standardized.
## Formula contains log- or sqrt-terms.
##   See help("standardize") for how such terms are standardized.
## We fitted a linear model (estimated using OLS) to predict Repo with Aff
## (formula: Repo ~ log(Aff * Claims)). The model explains a statistically
## significant and substantial proportion of variance (R2 = 0.27, F(1, 29) =
## 10.70, p = 0.003, adj. R2 = 0.24). The model's intercept, corresponding to , is
## at -14.14 (95% CI [-59.95, 31.67], t(29) = -0.63, p = 0.533). Within this
## model:
## 
##   - The interaction effect of Claims [log] on Aff is statistically significant
## and positive (beta = 11.19, 95% CI [4.19, 18.18], t(29) = 3.27, p = 0.003; Std.
## beta = 0.36, 95% CI [0.14, 0.59])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation. and We fitted a linear
## model (estimated using OLS) to predict Repo with Claims (formula: Repo ~
## log(Aff * Claims)). The model explains a statistically significant and
## substantial proportion of variance (R2 = 0.27, F(1, 29) = 10.70, p = 0.003,
## adj. R2 = 0.24). The model's intercept, corresponding to , is at -14.14 (95% CI
## [-59.95, 31.67], t(29) = -0.63, p = 0.533). Within this model:
## 
##   - The interaction effect of Claims [log] on Aff is statistically significant
## and positive (beta = 11.19, 95% CI [4.19, 18.18], t(29) = 3.27, p = 0.003; Std.
## beta = 0.36, 95% CI [0.14, 0.59])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation.

Interaction plot:

library(interactions)
interact_plot(fit1, "Claims", "Aff", plot.points=TRUE, interval=TRUE)
## Using data NULL from global environment. This could cause incorrect results
## if NULL has been altered since the model was fit. You can manually provide
## the data to the "data =" argument.
## Warning: Claims and Aff are not included in an interaction with one another in the
## model.

The lack lines that cross indicate no interaction.

report(fit1)
## Formula contains log- or sqrt-terms.
##   See help("standardize") for how such terms are standardized.
## Formula contains log- or sqrt-terms.
##   See help("standardize") for how such terms are standardized.
## We fitted a linear model (estimated using OLS) to predict Repo with Aff
## (formula: Repo ~ log(Aff * Claims)). The model explains a statistically
## significant and substantial proportion of variance (R2 = 0.27, F(1, 29) =
## 10.70, p = 0.003, adj. R2 = 0.24). The model's intercept, corresponding to , is
## at -14.14 (95% CI [-59.95, 31.67], t(29) = -0.63, p = 0.533). Within this
## model:
## 
##   - The interaction effect of Claims [log] on Aff is statistically significant
## and positive (beta = 11.19, 95% CI [4.19, 18.18], t(29) = 3.27, p = 0.003; Std.
## beta = 0.36, 95% CI [0.14, 0.59])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation. and We fitted a linear
## model (estimated using OLS) to predict Repo with Claims (formula: Repo ~
## log(Aff * Claims)). The model explains a statistically significant and
## substantial proportion of variance (R2 = 0.27, F(1, 29) = 10.70, p = 0.003,
## adj. R2 = 0.24). The model's intercept, corresponding to , is at -14.14 (95% CI
## [-59.95, 31.67], t(29) = -0.63, p = 0.533). Within this model:
## 
##   - The interaction effect of Claims [log] on Aff is statistically significant
## and positive (beta = 11.19, 95% CI [4.19, 18.18], t(29) = 3.27, p = 0.003; Std.
## beta = 0.36, 95% CI [0.14, 0.59])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation.