This analysis repeats the work from the previous models, but transforms the data with a square root instead of a logarithm, because the logarithm is undefined for the 0 values that appear in several variables.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(dplyr)
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
library(ggplot2)
library(readxl)
# Load the capstone data set from the Excel workbook.
# NOTE(review): this absolute, user-specific path only resolves on the original
# author's machine -- consider a project-relative path.
Capstone_Data_Set <- read_excel("C:/Users/carmo/Downloads/Capstone Data Set.xlsx")
# Open the spreadsheet viewer only in an interactive session; View() is a GUI
# side effect that contributes nothing to (and may fail in) a rendered report.
if (interactive()) {
  View(Capstone_Data_Set)
}
# Print min / quartiles / median / mean / max for every column.
summary(Capstone_Data_Set)
## Total Amount Awarded ROH ROSH ROUH
## Min. : 18846 Min. : 55.0 Min. : 39 Min. : 0.00
## 1st Qu.: 1201637 1st Qu.: 343.5 1st Qu.: 219 1st Qu.: 49.25
## Median : 2745798 Median : 686.0 Median : 431 Median : 170.00
## Mean : 7453131 Mean : 1749.6 Mean : 1065 Mean : 684.32
## 3rd Qu.: 7052004 3rd Qu.: 1585.5 3rd Qu.: 959 3rd Qu.: 517.75
## Max. :157847769 Max. :88025.0 Max. :83940 Max. :52307.00
## TSSFA TSFA TPHFA TOBC
## Min. : 0 Min. : 0 Min. : 0 Min. : 16.0
## 1st Qu.: 147972 1st Qu.: 0 1st Qu.: 925654 1st Qu.: 405.5
## Median : 334446 Median : 121230 Median : 2156852 Median : 855.5
## Mean : 707211 Mean : 537492 Mean : 6208429 Mean : 1928.4
## 3rd Qu.: 828246 3rd Qu.: 462263 3rd Qu.: 5719634 3rd Qu.: 1888.5
## Max. :6678318 Max. :9095741 Max. :142073710 Max. :40098.0
## PRH PSETRE PSPRE TESBC
## Min. :0.0000 Min. :0.03704 Min. :0.08333 Min. : 12.0
## 1st Qu.:0.1027 1st Qu.:0.27919 1st Qu.:0.93448 1st Qu.: 158.0
## Median :0.1471 Median :0.36378 Median :0.95948 Median : 327.0
## Mean :0.1519 Mean :0.38325 Mean :0.94547 Mean : 883.9
## 3rd Qu.:0.1948 3rd Qu.:0.48066 3rd Qu.:0.97634 3rd Qu.: 766.5
## Max. :0.4500 Max. :0.87810 Max. :1.00000 Max. :62971.0
# Descriptive statistics for every column of the raw data set.
CDS_Stat_Table <- pastecs::stat.desc(Capstone_Data_Set)
# Pairwise Pearson correlation matrix across all columns.
CDS_Cor <- stats::cor(Capstone_Data_Set, method = "pearson")
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Write the correlation matrix to an HTML file; stargazer also echoes the
# generated HTML to the console.
stargazer(
  CDS_Cor,
  title = "Homeless Funding Variable Correlations",
  digits = 2,
  type = "html",
  out = "Correlation_Table.html"
)
##
## <table style="text-align:center"><caption><strong>Homeless Funding Variable Correlations</strong></caption>
## <tr><td colspan="13" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"></td><td>Total Amount Awarded</td><td>ROH</td><td>ROSH</td><td>ROUH</td><td>TSSFA</td><td>TSFA</td><td>TPHFA</td><td>TOBC</td><td>PRH</td><td>PSETRE</td><td>PSPRE</td><td>TESBC</td></tr>
## <tr><td colspan="13" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Total Amount Awarded</td><td>1</td><td>0.84</td><td>0.70</td><td>0.66</td><td>0.87</td><td>0.79</td><td>1.00</td><td>0.91</td><td>0.07</td><td>0.001</td><td>0.13</td><td>0.74</td></tr>
## <tr><td style="text-align:left">ROH</td><td>0.84</td><td>1</td><td>0.89</td><td>0.70</td><td>0.61</td><td>0.63</td><td>0.85</td><td>0.84</td><td>0.04</td><td>-0.03</td><td>0.05</td><td>0.92</td></tr>
## <tr><td style="text-align:left">ROSH</td><td>0.70</td><td>0.89</td><td>1</td><td>0.30</td><td>0.50</td><td>0.52</td><td>0.71</td><td>0.72</td><td>0.03</td><td>-0.02</td><td>0.04</td><td>0.99</td></tr>
## <tr><td style="text-align:left">ROUH</td><td>0.66</td><td>0.70</td><td>0.30</td><td>1</td><td>0.50</td><td>0.49</td><td>0.67</td><td>0.64</td><td>0.05</td><td>-0.04</td><td>0.04</td><td>0.38</td></tr>
## <tr><td style="text-align:left">TSSFA</td><td>0.87</td><td>0.61</td><td>0.50</td><td>0.50</td><td>1</td><td>0.75</td><td>0.84</td><td>0.75</td><td>0.04</td><td>0.01</td><td>0.13</td><td>0.54</td></tr>
## <tr><td style="text-align:left">TSFA</td><td>0.79</td><td>0.63</td><td>0.52</td><td>0.49</td><td>0.75</td><td>1</td><td>0.75</td><td>0.73</td><td>0.06</td><td>0.05</td><td>0.11</td><td>0.56</td></tr>
## <tr><td style="text-align:left">TPHFA</td><td>1.00</td><td>0.85</td><td>0.71</td><td>0.67</td><td>0.84</td><td>0.75</td><td>1</td><td>0.90</td><td>0.07</td><td>-0.004</td><td>0.12</td><td>0.75</td></tr>
## <tr><td style="text-align:left">TOBC</td><td>0.91</td><td>0.84</td><td>0.72</td><td>0.64</td><td>0.75</td><td>0.73</td><td>0.90</td><td>1</td><td>0.12</td><td>-0.01</td><td>0.10</td><td>0.76</td></tr>
## <tr><td style="text-align:left">PRH</td><td>0.07</td><td>0.04</td><td>0.03</td><td>0.05</td><td>0.04</td><td>0.06</td><td>0.07</td><td>0.12</td><td>1</td><td>-0.11</td><td>-0.04</td><td>0.03</td></tr>
## <tr><td style="text-align:left">PSETRE</td><td>0.001</td><td>-0.03</td><td>-0.02</td><td>-0.04</td><td>0.01</td><td>0.05</td><td>-0.004</td><td>-0.01</td><td>-0.11</td><td>1</td><td>-0.01</td><td>-0.02</td></tr>
## <tr><td style="text-align:left">PSPRE</td><td>0.13</td><td>0.05</td><td>0.04</td><td>0.04</td><td>0.13</td><td>0.11</td><td>0.12</td><td>0.10</td><td>-0.04</td><td>-0.01</td><td>1</td><td>0.04</td></tr>
## <tr><td style="text-align:left">TESBC</td><td>0.74</td><td>0.92</td><td>0.99</td><td>0.38</td><td>0.54</td><td>0.56</td><td>0.75</td><td>0.76</td><td>0.03</td><td>-0.02</td><td>0.04</td><td>1</td></tr>
## <tr><td colspan="13" style="border-bottom: 1px solid black"></td></tr></table>
# Square-root analysis set: add a *_SQRT version of each listed column, then
# keep those together with the untransformed PRH and PSETRE.
CDS_SQRT <- Capstone_Data_Set %>%
  mutate(
    across(
      c(ROH, TPHFA, TSFA, TSSFA, TESBC, TOBC, PSPRE),
      sqrt,
      .names = "{.col}_SQRT"
    )
  ) %>%
  select(
    ROH_SQRT, TPHFA_SQRT, TSFA_SQRT, TSSFA_SQRT, TESBC_SQRT, TOBC_SQRT,
    PRH, PSETRE, PSPRE_SQRT
  )

# Untransformed analysis set: the same variables on their original scale.
CDS_Clean <- Capstone_Data_Set %>%
  select(ROH, TSSFA, TSFA, TPHFA, TOBC, PRH, PSETRE, PSPRE, TESBC)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Ordinary least squares on the square-root-transformed variables.
Homeless_Linear <- lm(
  formula = ROH_SQRT ~ TPHFA_SQRT + TOBC_SQRT + TESBC_SQRT + PRH + PSETRE +
    PSPRE_SQRT,
  data = CDS_SQRT
)

# Gamma GLM with an inverse link on the untransformed variables.
Homeless_GLM <- glm(
  formula = ROH ~ TPHFA + TOBC + TESBC + PRH + PSETRE + PSPRE,
  family = Gamma(link = inverse),
  data = CDS_Clean
)

# Coefficient table and fit statistics for the linear model.
summary(Homeless_Linear)
##
## Call:
## lm(formula = ROH_SQRT ~ TPHFA_SQRT + TOBC_SQRT + TESBC_SQRT +
## PRH + PSETRE + PSPRE_SQRT, data = CDS_SQRT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.360 -4.917 -1.298 4.061 74.599
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.5042249 11.3313321 0.398 0.691
## TPHFA_SQRT 0.0000234 0.0007962 0.029 0.977
## TOBC_SQRT 0.2895335 0.0559512 5.175 3.78e-07 ***
## TESBC_SQRT 0.9453922 0.0511345 18.488 < 2e-16 ***
## PRH -7.8984899 7.6144834 -1.037 0.300
## PSETRE -1.8724905 3.3056257 -0.566 0.571
## PSPRE_SQRT -2.1490650 11.4718312 -0.187 0.852
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.581 on 363 degrees of freedom
## Multiple R-squared: 0.8682, Adjusted R-squared: 0.866
## F-statistic: 398.4 on 6 and 363 DF, p-value: < 2.2e-16
# Coefficient table, dispersion and deviance summary for the Gamma GLM.
summary(Homeless_GLM)
##
## Call:
## glm(formula = ROH ~ TPHFA + TOBC + TESBC + PRH + PSETRE + PSPRE,
## family = Gamma(link = inverse), data = CDS_Clean)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.177e-03 1.136e-03 1.916 0.0561 .
## TPHFA 1.810e-12 2.743e-12 0.660 0.5096
## TOBC -2.409e-08 1.028e-08 -2.344 0.0196 *
## TESBC -2.359e-09 1.076e-09 -2.193 0.0289 *
## PRH -1.178e-03 6.659e-04 -1.769 0.0777 .
## PSETRE 7.894e-04 3.451e-04 2.287 0.0228 *
## PSPRE -1.501e-03 1.180e-03 -1.272 0.2041
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 1.440583)
##
## Null deviance: 629.92 on 369 degrees of freedom
## Residual deviance: 374.27 on 363 degrees of freedom
## AIC: 6026.7
##
## Number of Fisher Scoring iterations: 7
# AIC for both candidate models (k = 2 is the standard AIC penalty).
# NOTE(review): Homeless_Linear is fit to ROH_SQRT while Homeless_GLM is fit to
# ROH, so the two likelihoods are on different response scales and these AIC
# values are not directly comparable without adjusting for the transformation.
AIC(Homeless_Linear, Homeless_GLM, k = 2)
## df AIC
## Homeless_Linear 8 2731.174
## Homeless_GLM 8 6026.667
# Export the linear-model results as an HTML regression table.
# NOTE(review): the labels name the untransformed quantities, but every term
# ending in _SQRT (including the dependent variable) enters the model as a
# square root -- consider saying so in the labels.
stargazer(
  Homeless_Linear,
  type = "html",
  out = "Homeless_Regression.html",
  title = "Homelessness Regression Results",
  dep.var.labels = "Overall Count of Homelessness",
  covariate.labels = c(
    "Total Permanent Housing Funding Awarded",
    "Total Other Bed Count",
    "Total Emergency Shelter Bed Count",
    "Percent Returns to Homelessness",
    "Percent Successful Exits from ES, TH, SH, and RRH",
    "Percent of Successful Permanent Housing Retention and Exits"
  ),
  digits = 2
)
##
## <table style="text-align:center"><caption><strong>Homelessness Regression Results</strong></caption>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"></td><td><em>Dependent variable:</em></td></tr>
## <tr><td></td><td colspan="1" style="border-bottom: 1px solid black"></td></tr>
## <tr><td style="text-align:left"></td><td>Overall Count of Homelessness</td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Total Permanent Housing Funding Awarded</td><td>0.0000</td></tr>
## <tr><td style="text-align:left"></td><td>(0.001)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">Total Other Bed Count</td><td>0.29<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.06)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">Total Emergency Shelter Bed Count</td><td>0.95<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.05)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">Percent Returns to Homelessness</td><td>-7.90</td></tr>
## <tr><td style="text-align:left"></td><td>(7.61)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">Percent Successful Exits from ES, TH, SH, and RRH</td><td>-1.87</td></tr>
## <tr><td style="text-align:left"></td><td>(3.31)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">Percent of Successful Permanent Housing Retention and Exits</td><td>-2.15</td></tr>
## <tr><td style="text-align:left"></td><td>(11.47)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">Constant</td><td>4.50</td></tr>
## <tr><td style="text-align:left"></td><td>(11.33)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Observations</td><td>370</td></tr>
## <tr><td style="text-align:left">R<sup>2</sup></td><td>0.87</td></tr>
## <tr><td style="text-align:left">Adjusted R<sup>2</sup></td><td>0.87</td></tr>
## <tr><td style="text-align:left">Residual Std. Error</td><td>9.58 (df = 363)</td></tr>
## <tr><td style="text-align:left">F Statistic</td><td>398.40<sup>***</sup> (df = 6; 363)</td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"><em>Note:</em></td><td style="text-align:right"><sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01</td></tr>
## </table>
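Comparing the two models by AIC, the Homeless_Linear model has the lower score (2731.17 versus 6026.67), so I will be running the assumption tests on the Homeless_Linear model. Note, however, that Homeless_Linear is fit to the square root of ROH while Homeless_GLM is fit to ROH itself; AIC values are only directly comparable between models fit to the same response, so this comparison should be read as a rough guide rather than a formal result.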
Linearity
library(ggplot2)
library(lmtest)
# Residuals-vs-fitted diagnostic plot, followed by the rainbow test (lmtest).
plot(Homeless_Linear, which = 1)
raintest((Homeless_Linear))
##
## Rainbow test
##
## data: (Homeless_Linear)
## Rain = 3.7139, df1 = 185, df2 = 178, p-value < 2.2e-16
The low p-value rejects the rainbow test's null hypothesis that a linear fit is adequate, so this model does not meet the assumption of linearity.
#Independence of Errors
library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# Durbin-Watson test for lag-1 autocorrelation in the model residuals.
durbinWatsonTest(Homeless_Linear)
## lag Autocorrelation D-W Statistic p-value
## 1 0.369913 1.255426 0
## Alternative hypothesis: rho != 0
Since the p-value is lower than 0.05, the model does not meet the assumption of independence of errors.
#Homoscedasticity
library(lmtest)
# Scale-location diagnostic plot, followed by the studentized Breusch-Pagan test.
plot(Homeless_Linear, which = 3)
bptest(Homeless_Linear)
##
## studentized Breusch-Pagan test
##
## data: Homeless_Linear
## BP = 84.447, df = 6, p-value = 4.298e-16
The low p-value means that this model is heteroscedastic, so it does not meet the assumption of homoscedasticity.
#Normality of Residuals
# Normal Q-Q plot of the residuals, followed by the Shapiro-Wilk normality test.
plot(Homeless_Linear, which = 2)
shapiro.test(Homeless_Linear$residuals)
##
## Shapiro-Wilk normality test
##
## data: Homeless_Linear$residuals
## W = 0.85492, p-value < 2.2e-16
The low p-value means that this model does not meet the assumption of normality of residuals.
#No Multicollinearity
# Variance inflation factor for each predictor in the linear model.
vif(Homeless_Linear)
## TPHFA_SQRT TOBC_SQRT TESBC_SQRT PRH PSETRE PSPRE_SQRT
## 6.200586 8.050267 3.756763 1.045520 1.038182 1.039825
None of the variables has a VIF score higher than 10, meaning that the model meets the assumption of no multicollinearity (TPHFA_SQRT and TOBC_SQRT, at about 6.2 and 8.1, are elevated but remain below that threshold).