library(tidyverse)
## Warning: package 'readr' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” dplyr     1.2.0     âś” readr     2.2.0
## âś” forcats   1.0.1     âś” stringr   1.6.0
## âś” ggplot2   4.0.2     âś” tibble    3.3.1
## âś” lubridate 1.9.4     âś” tidyr     1.3.2
## âś” purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.3
library(dplyr)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(pastecs)
## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(stargazer)
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

DATA CLEANING

# PIT VETERAN COUNTS DATASET

# create a file path for PIT Veteran Counts dataset
file_path_1 <-"C:/Users/Administrator/Desktop/Graduate School/Applied Quant Methods/My Class Stuff/Data Project/Veteran Homelessness/Updated Vet Datasets/PIT_Veteran_Counts.xlsx"

# Read sheet 2 of PIT Veteran Counts, file path 1
sheet_2<- read_excel(file_path_1, sheet="2024") 

# Remove Totals and MP territory from PIT Veteran Counts sheet 2
vet_data_PIT <- sheet_2 |> filter (!State %in% c("Total", "MP"))


# HIC COUNTS DATASET (NEW)

# create a file path for HIC Counts dataset
file_path_2 <-"C:/Users/Administrator/Desktop/Graduate School/Applied Quant Methods/My Class Stuff/Data Project/Veteran Homelessness/Updated Vet Datasets/HIC_Counts.xlsx"

# Read sheet 1 of HIC Counts, file path 2
sheet_1<- read_excel(file_path_2, sheet="2024") 
## New names:
## • `Total Year-Round Beds (ES)` -> `Total Year-Round Beds (ES)...6`
## • `Total Year-Round Beds (TH)` -> `Total Year-Round Beds (TH)...7`
## • `Total Year-Round Beds (SH)` -> `Total Year-Round Beds (SH)...8`
## • `Total Year-Round Beds (ES)` -> `Total Year-Round Beds (ES)...15`
## • `Total Year-Round Beds (TH)` -> `Total Year-Round Beds (TH)...27`
## • `Total Year-Round Beds (SH)` -> `Total Year-Round Beds (SH)...37`
# Remove Totals and MP territory row from HIC Counts Sheet 1
vet_data_HIC <- sheet_1 |> filter (!State %in% c("Total", "MP"))

#Change variable names for HIC dataset variables
vet_data_HIC <-vet_data_HIC |> rename (
                   Vet_Beds_Shelt ="Dedicated Veteran Beds (ES, TH, SH)", 
                   RRH_Beds="Dedicated Veteran Beds (RRH)", 
                   PSH_Beds="Dedicated Veteran Beds (PSH)", 
                   ES_Beds="Dedicated Veteran Beds (ES)", 
                   TH_Beds="Dedicated Veteran Beds (TH)", 
                   SH_Beds="Dedicated Veteran Beds (SH)")

#Change variable names for PIT dataset variables
vet_data_PIT <-vet_data_PIT |> rename (
                   CoC_Count = "Number of CoCs", 
                   Sheltered = "Sheltered Total Homeless Veterans, 2024", 
                   Unsheltered = "Unsheltered Homeless Veterans, 2024", 
                   Homeless_Vets = "Homeless Veterans, 2024")

# Combination of vat_data_HIC dataset and vet_data_PIT dataset, removal of ES/SH/TS raw counts columns
vet_data <-vet_data_PIT |> left_join(vet_data_HIC |> dplyr::select(State, Vet_Beds_Shelt, ES_Beds, TH_Beds, SH_Beds, RRH_Beds, PSH_Beds), by= "State") |>
  
  #Remove the columns
  dplyr::select(-`Sheltered ES Homeless Veterans, 2024`, -`Sheltered TH Homeless Veterans, 2024`, -`Sheltered SH Homeless Veterans, 2024`)

  # Check which columns need to be converted to numeric
str(vet_data)
## tibble [54 Ă— 11] (S3: tbl_df/tbl/data.frame)
##  $ State         : chr [1:54] "AK" "AL" "AR" "AZ" ...
##  $ CoC_Count     : num [1:54] 2 9 5 3 44 4 2 1 1 27 ...
##  $ Homeless_Vets : chr [1:54] "106" "291" "226" "994" ...
##  $ Sheltered     : chr [1:54] "87" "191" "144" "677" ...
##  $ Unsheltered   : chr [1:54] "19" "100" "82" "317" ...
##  $ Vet_Beds_Shelt: chr [1:54] "12" "153" "74" "513" ...
##  $ ES_Beds       : chr [1:54] "0" "80" "55" "54" ...
##  $ TH_Beds       : chr [1:54] "12" "73" "19" "388" ...
##  $ SH_Beds       : chr [1:54] "0" "0" "0" "71" ...
##  $ RRH_Beds      : chr [1:54] "157" "192" "145" "1015" ...
##  $ PSH_Beds      : chr [1:54] "368" "1055" "533" "2791" ...
# Convert columns to numeric
vet_data <- vet_data |> mutate(Unsheltered = as.numeric (Unsheltered), Homeless_Vets = as.numeric(Homeless_Vets), Vet_Beds_Shelt = as.numeric (Vet_Beds_Shelt), ES_Beds = as.numeric (ES_Beds),TH_Beds = as.numeric (TH_Beds), SH_Beds = as.numeric (SH_Beds), Sheltered = as.numeric (Sheltered), RRH_Beds = as.numeric(RRH_Beds), PSH_Beds = as.numeric(PSH_Beds))

#Drop all NAs for columns in final dataframe
vet_data <-vet_data |> drop_na(Unsheltered, Homeless_Vets, Vet_Beds_Shelt, ES_Beds, TH_Beds, SH_Beds, SH_Beds, Sheltered, RRH_Beds, PSH_Beds)
#Data Auditing
print(colSums(is.na(vet_data)))
##          State      CoC_Count  Homeless_Vets      Sheltered    Unsheltered 
##              0              0              0              0              0 
## Vet_Beds_Shelt        ES_Beds        TH_Beds        SH_Beds       RRH_Beds 
##              0              0              0              0              0 
##       PSH_Beds 
##              0
summary(vet_data)
##     State             CoC_Count      Homeless_Vets      Sheltered     
##  Length:53          Min.   : 1.000   Min.   :   9.0   Min.   :   0.0  
##  Class :character   1st Qu.: 2.000   1st Qu.: 129.0   1st Qu.: 106.0  
##  Mode  :character   Median : 4.000   Median : 299.0   Median : 199.0  
##                     Mean   : 7.189   Mean   : 616.4   Mean   : 356.2  
##                     3rd Qu.:10.000   3rd Qu.: 570.0   3rd Qu.: 435.0  
##                     Max.   :44.000   Max.   :9310.0   Max.   :2900.0  
##   Unsheltered     Vet_Beds_Shelt      ES_Beds          TH_Beds      
##  Min.   :   3.0   Min.   :   0.0   Min.   :  0.00   Min.   :   0.0  
##  1st Qu.:  22.0   1st Qu.:  74.0   1st Qu.:  8.00   1st Qu.:  40.0  
##  Median :  58.0   Median : 189.0   Median : 45.00   Median : 130.0  
##  Mean   : 260.2   Mean   : 304.6   Mean   : 72.08   Mean   : 207.4  
##  3rd Qu.: 139.0   3rd Qu.: 408.0   3rd Qu.: 93.00   3rd Qu.: 300.0  
##  Max.   :6410.0   Max.   :3004.0   Max.   :668.00   Max.   :2003.0  
##     SH_Beds          RRH_Beds         PSH_Beds    
##  Min.   :  0.00   Min.   :   0.0   Min.   :    0  
##  1st Qu.:  0.00   1st Qu.: 138.0   1st Qu.:  400  
##  Median :  5.00   Median : 234.0   Median : 1340  
##  Mean   : 25.13   Mean   : 428.8   Mean   : 2062  
##  3rd Qu.: 30.00   3rd Qu.: 440.0   3rd Qu.: 2163  
##  Max.   :333.00   Max.   :3380.0   Max.   :23594
view(vet_data)

DESCRIPTIVE VARIABLES

Unsheltered Veteran Count- DEPENDENT VARIABLE

pastecs::stat.desc(vet_data$Unsheltered)
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 5.300000e+01 0.000000e+00 0.000000e+00 3.000000e+00 6.410000e+03 6.407000e+03 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 1.379200e+04 5.800000e+01 2.602264e+02 1.226385e+02 2.460922e+02 7.971312e+05 
##      std.dev     coef.var 
## 8.928220e+02 3.430943e+00

The minimum amount of Unsheltered homeless veterans living in one state or territory is 3. The maximum amount of Unsheltered homeless veterans living in one state or territory is 6410. The range is 6407. This means there is a very large variation across states where some states have 3 Unsheltered veterans in transitional housing while others have as many as 6410. The total sum of Unsheltered homeless veterans living throughout the US is 13,792. The median number of Unsheltered homeless veterans per state is 58. The average number of Unsheltered homeless veterans per state is ~260. The standard error mean is 122.64 which is closer to the true average across all states. The standard deviation is 892.82 which indicates a very high variability across states. It looks like some states are clear outliers that are driving the mean high and further away from the median.

##Removing NA’s

sum(is.na(vet_data$`Unsheltered`))
## [1] 0

There are no NA’s. There is 1 zero value, but it is a data point.

Dedicated Veteran Bed Count- INDEPENDENT VARIABLE

pastecs::stat.desc(vet_data$Vet_Beds_Shelt)
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 5.300000e+01 1.000000e+00 0.000000e+00 0.000000e+00 3.004000e+03 3.004000e+03 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 1.614500e+04 1.890000e+02 3.046226e+02 6.067074e+01 1.217448e+02 1.950898e+05 
##      std.dev     coef.var 
## 4.416897e+02 1.449957e+00

The minimum amount of Vet_Beds_Shelt in a state or territory is 0. The maximum amount of Vet_Beds_Shelt in a state or territory is 3004. The range is 3304. This means there is a very large variation across states where one state has zero Vet_Beds_Shelt while others have as many as 3004. The total sum of Vet_Beds_Shelt throughout the US is 16,145. The median number of Vet_Beds_Shelt per state is 189. The average number of Vet_Beds_Shelt per state is ~305. The standard error mean is 60.67 which is closer to the true average across all states. The standard deviation is 441.79 which indicates a very high variability across states. It looks like some states are clear outliers that are driving the mean high and further away from the median.

sum(is.na(vet_data$Vet_Beds_Shelt))
## [1] 0

There are no NA’s. There is 1 zero value, but it is a data point.

Continuums of Care Count- INDEPENDENT VARIABLE

pastecs::stat.desc(vet_data$CoC_Count)
##      nbr.val     nbr.null       nbr.na          min          max        range 
##    53.000000     0.000000     0.000000     1.000000    44.000000    43.000000 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
##   381.000000     4.000000     7.188679     1.117241     2.241908    66.156023 
##      std.dev     coef.var 
##     8.133635     1.131451

The minimum amount of CoC_Count in a state or territory is 1. The maximum amount of CoC_Count in a state or territory is 44. The range is 43. This means there is a very large variation across states where one state has 1 C0C_Count while others have as many as 44. The total sum of CoC_Count throughout the US is 381. The median number of CoC_Count per state is 4. The average number of CoC_Count per state is ~7. The standard error mean is 1.12. The standard deviation is 8.13 which indicates a somewhat high variability across states. It looks like some states are clear outliers that are driving the mean high and further away from the median.

sum(is.na(vet_data$CoC_Count))
## [1] 0

There are no NA’s. There is 1 zero value, but it is a data point.

Dedicated Emergency Shelter Veteran Bed Count- INDEPENDENT VARIABLE

pastecs::stat.desc(vet_data$ES_Beds)
##      nbr.val     nbr.null       nbr.na          min          max        range 
##    53.000000     7.000000     0.000000     0.000000   668.000000   668.000000 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
##  3820.000000    45.000000    72.075472    14.575036    29.246950 11258.878810 
##      std.dev     coef.var 
##   106.107864     1.472177

The minimum amount of ES_Beds in a state or territory is 0. The maximum amount of ES_Beds in a state or territory is 668. The range is 668. This means there is a very large variation across states where one state has zero ES_Beds while others have as many as 668. The total sum of ES_Beds throughout the US is 3820. The median number of ES_Beds per state is 45. The average number of ES_Beds per state is ~72. The standard error mean is 14.57 which is closer to the true average across all states. The standard deviation is 106.11 which indicates a very high variability across states. It looks like some states are clear outliers that are driving the mean high and further away from the median.

sum(is.na(vet_data$ES_Beds))
## [1] 0

There are no NA’s. There is 1 zero value, but it is a data point.

Dedicated Safe Haven Shelter Veteran Bed Count- INDEPENDENT VARIABLE

pastecs::stat.desc(vet_data$SH_Beds)
##      nbr.val     nbr.null       nbr.na          min          max        range 
##    53.000000    26.000000     0.000000     0.000000   333.000000   333.000000 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
##  1332.000000     5.000000    25.132075     6.962529    13.971336  2569.270682 
##      std.dev     coef.var 
##    50.687974     2.016864

The minimum amount of SH_Beds in a state or territory is 0. The maximum amount of SH_Beds in a state or territory is 33. The range is 333. This means there is a very large variation across states where one states has 0 SH_Beds while others have up to 333. The total sum of SH_Beds living throughout the US is 1,332. The median number of SH_Beds per state is 5. The average number of SH_Beds per state is ~25. The standard error mean is 6.96 which is closer to the true average across all states. The standard deviation is 50.69 which indicates a very high variability across states. It looks like some states are clear outliers that are driving the mean very high and further away from the median.

sum(is.na(vet_data$SH_Beds))
## [1] 0

There are no NA’s. There is 1 zero value, but it is a data point.

Dedicated Transitional Housing Veteran Bed Count- INDEPENDENT VARIABLE

pastecs::stat.desc(vet_data$TH_Beds)
##      nbr.val     nbr.null       nbr.na          min          max        range 
##    53.000000     2.000000     0.000000     0.000000  2003.000000  2003.000000 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 10993.000000   130.000000   207.415094    40.838485    81.948416 88392.439768 
##      std.dev     coef.var 
##   297.308661     1.433399

The minimum amount of TH_Beds in a state or territory is 0. The maximum amount of TH_Beds in a state or territory is 2003. The range is 2003. This means there is a very large variation across states where one state has zero TH_Beds while others have up to 2003. The total sum of TH_Beds throughout the US is 10,993. The median number of TH_Beds per state is 130. The average number of TH_Beds per state is ~207. The standard error mean is 40.84 which is closer to the true average across all states. The standard deviation is 297.31 which indicates a very high variability across states. It looks like some states are clear outliers that are driving the mean high and further away from the median.

sum(is.na(vet_data$TH_Beds))
## [1] 0

There are no NA’s. There is 1 zero value, but it is a data point.

Dedicated Permanent Supportive Housing Veteran Bed Count- INDEPENDENT VARIABLE

pastecs::stat.desc(vet_data$PSH_Beds)
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 5.300000e+01 1.000000e+00 0.000000e+00 0.000000e+00 2.359400e+04 2.359400e+04 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 1.093110e+05 1.340000e+03 2.062472e+03 4.792424e+02 9.616703e+02 1.217268e+07 
##      std.dev     coef.var 
## 3.488937e+03 1.691629e+00

The minimum amount of PSH_Beds in a state or territory is 0. The maximum amount of PSH_Beds in a state or territory is 23,594. The range is 23,594. This means there is a very large variation across states where one state has 0 PSH_Beds while others have up to 23,594. The total sum of PSH_Beds throughout the US is 109,311. The median number of PSH_Beds per state is 1,340. The average number of PSH_Beds per state is ~2,062. The standard error mean is 479.24 which is closer to the true average across all states. The standard deviation is 3,388.94 which indicates a very high variability across states. It looks like some states are clear outliers that are driving the mean high and further away from the median.

sum(is.na(vet_data$PSH_Beds))
## [1] 0

There are no NA’s. There is 1 zero value, but it is a data point.

Dedicated Rapid Re-Housing Veteran Bed Count- INDEPENDENT VARIABLE

pastecs::stat.desc(vet_data$RRH_Beds)
##      nbr.val     nbr.null       nbr.na          min          max        range 
##     53.00000      1.00000      0.00000      0.00000   3380.00000   3380.00000 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
##  22728.00000    234.00000    428.83019     84.54896    169.65990 378871.91292 
##      std.dev     coef.var 
##    615.52572      1.43536

The minimum amount of RRH_Beds in a state or territory is 0. The maximum amount of RRH_Beds in a state or territory is 3,380. The range is 3,380. This means there is a very large variation across states where one state has zero RRH_Beds while others have up to 3,380. The total sum of RRH_Beds throughout the US is 22,728. The median number of RRH_Beds per state is 234. The average number of RRH_Bedss per state is ~429. The standard error mean is 84.55 which is closer to the true average across all states. The standard deviation is 615.53 which indicates a very high variability across states. It looks like some states are clear outliers that are driving the mean high and further away from the median.

sum(is.na(vet_data$RRH_Beds))
## [1] 0

There are no NA’s. There is 1 zero value, but it is a data point.


HISTOGRAMS

Histogram of the Variables

#hist(vet_data$Unsheltered, main="Histogram of Unsheltered Veterans", xlab= "Unsheltered Veteran Counts", breaks =10, probability =T)
#lines(density(vet_data$Unsheltered), col='red', lwd=2)

vet_data_long <- vet_data |>
  dplyr::select(Unsheltered, CoC_Count, Vet_Beds_Shelt, ES_Beds, SH_Beds, TH_Beds, PSH_Beds, RRH_Beds) |>
  pivot_longer(cols = everything(),
               names_to = "Variable",
               values_to = "Value")

ggplot(vet_data_long, aes(x = Value)) +
  geom_histogram(aes(y = after_stat(density)),
                 bins = 10,
                 fill = "skyblue",
                 color = "black") +
  geom_density(color = "red", linewidth = 1) +
  facet_wrap(~Variable, scales = "free") +
  labs(title = "Histograms of Veteran Data Variables",
       x = "Value",
       y = "Density") +
  theme_minimal()

Square root transformation of the variables

vet_data_sqrt <- vet_data |> dplyr::mutate(Unsheltered_SQRT = sqrt(Unsheltered), Vet_Beds_Shelt_SQRT = sqrt(Vet_Beds_Shelt), CoC_SQRT = sqrt(CoC_Count), ES_SQRT = sqrt(ES_Beds), SH_SQRT = sqrt(SH_Beds), TH_SQRT = sqrt(TH_Beds), PSH_SQRT = sqrt(PSH_Beds), RRH_SQRT = sqrt(RRH_Beds))

head(vet_data_sqrt)
## # A tibble: 6 Ă— 19
##   State CoC_Count Homeless_Vets Sheltered Unsheltered Vet_Beds_Shelt ES_Beds
##   <chr>     <dbl>         <dbl>     <dbl>       <dbl>          <dbl>   <dbl>
## 1 AK            2           106        87          19             12       0
## 2 AL            9           291       191         100            153      80
## 3 AR            5           226       144          82             74      55
## 4 AZ            3           994       677         317            513      54
## 5 CA           44          9310      2900        6410           3004     668
## 6 CO            4           978       538         440            299      11
## # ℹ 12 more variables: TH_Beds <dbl>, SH_Beds <dbl>, RRH_Beds <dbl>,
## #   PSH_Beds <dbl>, Unsheltered_SQRT <dbl>, Vet_Beds_Shelt_SQRT <dbl>,
## #   CoC_SQRT <dbl>, ES_SQRT <dbl>, SH_SQRT <dbl>, TH_SQRT <dbl>,
## #   PSH_SQRT <dbl>, RRH_SQRT <dbl>

I used the square root transformations because it can be applied to values of zero and my variables are count variables. Square root transformations help to reduce skewness and stabilize variances.

Histogram of the transformed variable

# Select transformed variables
vet_data_sqrt_long <- vet_data_sqrt |>
  dplyr::select(ends_with("_SQRT")) |>
  pivot_longer(cols = everything(),
               names_to = "Variable",
               values_to = "Value")

# Create facet wrapped histograms
ggplot(vet_data_sqrt_long, aes(x = Value)) +
  geom_histogram(aes(y = after_stat(density)),
                 bins = 10,
                 fill = "skyblue",
                 color = "black") +
  geom_density(color = "red", linewidth = 1) +
  facet_wrap(~Variable, scales = "free") +
  labs(
    title = "Histograms of Square Root Transformed Variables",
    x = "Square Root Transformed Values",
    y = "Density"
  ) +
  theme_minimal()

Basic linear model

vets_lm<-lm(Unsheltered~Vet_Beds_Shelt, data=vet_data)

summary(vets_lm)
## 
## Call:
## lm(formula = Unsheltered ~ Vet_Beds_Shelt, data = vet_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1013.7  -306.0   117.6   226.8  1186.7 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -299.8476    62.7053  -4.782 1.52e-05 ***
## Vet_Beds_Shelt    1.8386     0.1176  15.631  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 374.6 on 51 degrees of freedom
## Multiple R-squared:  0.8273, Adjusted R-squared:  0.8239 
## F-statistic: 244.3 on 1 and 51 DF,  p-value: < 2.2e-16

Dedicated veteran beds explain about 82% of unsheltered veteran numbers. The p-value is very low meaning it is significant, thereby disproving the null hypothesis that there is no effect on the unsheltered veteran count by dedicated veteran beds.


LINEAR MODEL

vet_model_1<-lm (Unsheltered~ CoC_Count+ Vet_Beds_Shelt, data=vet_data)
summary(vet_model_1)
## 
## Call:
## lm(formula = Unsheltered ~ CoC_Count + Vet_Beds_Shelt, data = vet_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -766.82 -241.94   87.42  217.11 1083.16 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -238.602     66.634  -3.581 0.000774 ***
## CoC_Count       -24.035     10.967  -2.192 0.033096 *  
## Vet_Beds_Shelt    2.205      0.202  10.917 7.74e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 361.4 on 50 degrees of freedom
## Multiple R-squared:  0.8424, Adjusted R-squared:  0.8361 
## F-statistic: 133.7 on 2 and 50 DF,  p-value: < 2.2e-16

###REGRESSION ASSUMPTION TESTS

## 1. Linearity (plot and raintest)

plot

plot(vet_model_1, which=1)

The fitted line is very curvy at the bottom. Just by visual inspection it does not appear to be linear. So it fails the assumption of linearity. It will require mitigation.

Rainbow test

raintest(vet_model_1)
## 
##  Rainbow test
## 
## data:  vet_model_1
## Rain = 11.574, df1 = 27, df2 = 23, p-value = 4.842e-08

According to the Rainbow Test, the low p-value means that the data is not linear.

FAIL

## 2. Independence of Errors (durbin-watson)

Durbin- Watson test

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
durbinWatsonTest(vet_model_1)
##  lag Autocorrelation D-W Statistic p-value
##    1      0.05503266      1.871068   0.646
##  Alternative hypothesis: rho != 0

The p-value is high, it is 0.628, much higher than 0.05 which means that the errors ARE independent. The distance between the errors are not highly correlated. This passes the assumption of an independence of errors.

PASS

## 3. Homoscedasticity (plot, bptest)

plot

plot(vet_model_1,which=3)

The red line is very curvy and steadily increasing which means this likely violates the assumption of Homoscedasticity.The residual spread is closely clusterd at the start of the line, but gets larger as the line dips low and the fitted values increases with an extreme outlier at the upper tail end. The variance of the residuals are not evenly spread across the fitted values. We can say that this model appears to be heteroscedastic by visual inspection.

Breusch-Pagan Test

bptest(vet_model_1)
## 
##  studentized Breusch-Pagan test
## 
## data:  vet_model_1
## BP = 37.495, df = 2, p-value = 7.213e-09

The p-value is significant, it is lower than 0.05. So we can reject homoscedasticity and assume the model is heteroscedastic. This model fails the assumption of homoscedasticity and will require mitigation.

FAIL

## 4. Normality of residuals (QQ plot, shapiro test)

QQ plot

plot(vet_model_1,which=2)

The residuals are more evenly distributed in the center of the dotted like, aka the bell curve, and fit very well along the dotted line. On the tails of the line we see some residuals have minimal deviations from the dotted line with a few that touch the line. But again that’s somewhat expected that the tails would have outliers. Overall, by visual inspection, this is appears to be normally distributed.

Shapiro Test

shapiro.test(vet_model_1$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  vet_model_1$residuals
## W = 0.93764, p-value = 0.008177

*The p-value is well below 0.05 which suggests that the residuals are significantly different from a normal distribution. This indicates that the Shapiro-Wilk test disagrees with our assessment of the normality of the chart. The mathematical assessment tells us that he chart is not normal, so that means we need to do some mitigation. The model does not pass the assumption of normality of residuals.

FAIL

## 5. No multicollinearity (VIF, cor)

VIF

vet_kitchen_sink<-lm(Unsheltered ~ CoC_Count + SH_Beds + TH_Beds +ES_Beds + RRH_Beds + PSH_Beds, data = vet_data)
  
vif(vet_kitchen_sink)
## CoC_Count   SH_Beds   TH_Beds   ES_Beds  RRH_Beds  PSH_Beds 
##  3.567765 10.674577 11.268550  7.022868  5.143490 25.982841

The following variables are strongly correlated as these variance inflation factors (VIF) are above 10: Safe Haven dedicated veteran beds (SH_Beds) has a VIF of a 10.67, Transitional Housing dedicated veteran beds (TH_Beds) has a VIF of 11.27, and Permanent Supportive Housing dedicated veteran beds (PSH_Beds) has a VIF of 25.98. These 3 predictors, or variables indicate serious multicollinearity. In particular, PSH_Beds is highly redundant with other variables.

cor

vet_kitchen_sink_vars<- vet_data |> dplyr::select(CoC_Count, SH_Beds, TH_Beds, ES_Beds, RRH_Beds, PSH_Beds)

cor(vet_kitchen_sink_vars)
##           CoC_Count   SH_Beds   TH_Beds   ES_Beds  RRH_Beds  PSH_Beds
## CoC_Count 1.0000000 0.7895951 0.8012225 0.8213393 0.7625480 0.8379578
## SH_Beds   0.7895951 1.0000000 0.9120698 0.8772047 0.8718648 0.9502930
## TH_Beds   0.8012225 0.9120698 1.0000000 0.8866759 0.8641670 0.9538744
## ES_Beds   0.8213393 0.8772047 0.8866759 1.0000000 0.8386710 0.9206387
## RRH_Beds  0.7625480 0.8718648 0.8641670 0.8386710 1.0000000 0.8922328
## PSH_Beds  0.8379578 0.9502930 0.9538744 0.9206387 0.8922328 1.0000000

PSH_Beds is highly correlated with SH_Beds, TH_Beds, ES_Beds, RRH_Beds and CoC_Count as the correlations are close to 1. SH_Beds and TH_Beds also run high correlations with other variables. We should take out PSH_Beds from the model since it’s highly correlated with all variables.

This will require mitigation and fails the No Multicollinearity assumption.


Assumption check

  1. Linearity- Fail
  2. Independence of Errors- Pass
  3. Homoscedasticity- Fail
  4. Normality of Residuals- Fail
  5. No multicollinearity- Fail

Mitigation

Mitigation for failure of linearity assumption

vet_model_1_log<-lm(log(Unsheltered+1)~log(Vet_Beds_Shelt+1) + log(CoC_Count+1),data=vet_data)

raintest(vet_model_1_log)
## 
##  Rainbow test
## 
## data:  vet_model_1_log
## Rain = 2.0627, df1 = 27, df2 = 23, p-value = 0.04089

Because of a zero value, we need to add 1 to each variable wthin the logged transformation. The vet_model_1 has been log transformed, so it has a raintest p-value of 0.048 meaning it is still too low to achieve linearity.

plot(vet_model_1, which=1)

plot (vet_model_1_log, which=1)

The plot is much more horizontal, by visual inspection it appears somewhat linear.

Mitigation for failure of homoscedasticity assumption

bptest(vet_model_1_log)
## 
##  studentized Breusch-Pagan test
## 
## data:  vet_model_1_log
## BP = 5.1963, df = 2, p-value = 0.07441

The log model has a high p-value which means it is now homoscedastic.

plot(vet_model_1,which=3)

plot(vet_model_1_log,which=3)

shapiro.test(vet_model_1_log$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  vet_model_1_log$residuals
## W = 0.98156, p-value = 0.5816

##Mitigation for failure of normality of residuals

shapiro.test(residuals(vet_model_1_log))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(vet_model_1_log)
## W = 0.98156, p-value = 0.5816

The p-value is now greater than 0.05 suggesting that the residuals are not significantly different from a normal distribution and appear normal

plot(vet_model_1,which=2)

plot(vet_model_1_log,which=2)

The data is much much closer to or directly on the line. Now passes the Normality of residuals assumption test.

Mitigation for failure of no multicollinearity

vet_kitchen_sink_altered<-lm(Unsheltered ~ CoC_Count + SH_Beds + TH_Beds +ES_Beds + RRH_Beds, data = vet_data)
  
vif(vet_kitchen_sink_altered)
## CoC_Count   SH_Beds   TH_Beds   ES_Beds  RRH_Beds 
##  3.398958  7.724223  7.974306  6.259584  4.953428
vet_kitchen_sink_altered_vars<- vet_data |> dplyr::select(CoC_Count, SH_Beds, TH_Beds, ES_Beds, RRH_Beds)

cor(vet_kitchen_sink_altered_vars)
##           CoC_Count   SH_Beds   TH_Beds   ES_Beds  RRH_Beds
## CoC_Count 1.0000000 0.7895951 0.8012225 0.8213393 0.7625480
## SH_Beds   0.7895951 1.0000000 0.9120698 0.8772047 0.8718648
## TH_Beds   0.8012225 0.9120698 1.0000000 0.8866759 0.8641670
## ES_Beds   0.8213393 0.8772047 0.8866759 1.0000000 0.8386710
## RRH_Beds  0.7625480 0.8718648 0.8641670 0.8386710 1.0000000

Now it meets the assumption of multicollinearity beacuse those big correlations aren’t there.

#Summary of the final model

summary(vet_model_1_log)
## 
## Call:
## lm(formula = log(Unsheltered + 1) ~ log(Vet_Beds_Shelt + 1) + 
##     log(CoC_Count + 1), data = vet_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0558 -0.7339 -0.0883  0.7482  2.1578 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               0.9535     0.5037   1.893 0.064150 .  
## log(Vet_Beds_Shelt + 1)   0.4916     0.1343   3.660 0.000607 ***
## log(CoC_Count + 1)        0.4562     0.2354   1.938 0.058233 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.016 on 50 degrees of freedom
## Multiple R-squared:  0.5107, Adjusted R-squared:  0.4912 
## F-statistic:  26.1 on 2 and 50 DF,  p-value: 1.733e-08

Vet_Beds_Shelt is significant while CoC_Count is marginally significant. Together they explain 49% of the unsheltered homeless veteran count.

STARGAZER

#stargazer(vet_model_1_log,
         # type = "html",
          #title = "Regression Results for Unsheltered Veteran Homelessness",
          #dep.var.labels = "Log Unsheltered Veteran Homelessness",
          #covariate.labels = c("Log Veteran Shelter Beds",
                               #"Log CoC Count"),
          #digits = 3,
          #align = TRUE,
          #no.space = TRUE,
          #omit.stat = c("ser"))

Stargazer Table

stargazer(vet_model_1_log,
          type = "text",
          title = "Regression Results for Unsheltered Veteran Homelessness",
          dep.var.labels = "Log Unsheltered Veteran Homelessness",
          covariate.labels = c("Log Veteran Shelter Beds",
                               "Log CoC Count"),
          digits = 3,
          align = TRUE,
          no.space = TRUE,
          omit.stat = c("ser"))
## 
## Regression Results for Unsheltered Veteran Homelessness
## =============================================================
##                                  Dependent variable:         
##                          ------------------------------------
##                          Log Unsheltered Veteran Homelessness
## -------------------------------------------------------------
## Log Veteran Shelter Beds               0.492***              
##                                        (0.134)               
## Log CoC Count                           0.456*               
##                                        (0.235)               
## Constant                                0.954*               
##                                        (0.504)               
## -------------------------------------------------------------
## Observations                              53                 
## R2                                      0.511                
## Adjusted R2                             0.491                
## F Statistic                     26.097*** (df = 2; 50)       
## =============================================================
## Note:                             *p<0.1; **p<0.05; ***p<0.01