library(tidyverse)
## Warning: package 'readr' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.3
library(dplyr)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
library(stargazer)
##
## Please cite as:
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
DATA CLEANING
# PIT VETERAN COUNTS DATASET
# create a file path for PIT Veteran Counts dataset
file_path_1 <-"C:/Users/Administrator/Desktop/Graduate School/Applied Quant Methods/My Class Stuff/Data Project/Veteran Homelessness/Updated Vet Datasets/PIT_Veteran_Counts.xlsx"
# Read sheet 2 of PIT Veteran Counts, file path 1
sheet_2<- read_excel(file_path_1, sheet="2024")
# Remove Totals and MP territory from PIT Veteran Counts sheet 2
vet_data_PIT <- sheet_2 |> filter (!State %in% c("Total", "MP"))
# HIC COUNTS DATASET (NEW)
# create a file path for HIC Counts dataset
file_path_2 <-"C:/Users/Administrator/Desktop/Graduate School/Applied Quant Methods/My Class Stuff/Data Project/Veteran Homelessness/Updated Vet Datasets/HIC_Counts.xlsx"
# Read sheet 1 of HIC Counts, file path 2
sheet_1<- read_excel(file_path_2, sheet="2024")
## New names:
## • `Total Year-Round Beds (ES)` -> `Total Year-Round Beds (ES)...6`
## • `Total Year-Round Beds (TH)` -> `Total Year-Round Beds (TH)...7`
## • `Total Year-Round Beds (SH)` -> `Total Year-Round Beds (SH)...8`
## • `Total Year-Round Beds (ES)` -> `Total Year-Round Beds (ES)...15`
## • `Total Year-Round Beds (TH)` -> `Total Year-Round Beds (TH)...27`
## • `Total Year-Round Beds (SH)` -> `Total Year-Round Beds (SH)...37`
# Remove Totals and MP territory row from HIC Counts Sheet 1
vet_data_HIC <- sheet_1 |> filter(!State %in% c("Total", "MP"))
# Change variable names for HIC dataset variables
vet_data_HIC <- vet_data_HIC |> rename(
  Vet_Beds_Shelt = "Dedicated Veteran Beds (ES, TH, SH)",
  RRH_Beds = "Dedicated Veteran Beds (RRH)",
  PSH_Beds = "Dedicated Veteran Beds (PSH)",
  ES_Beds = "Dedicated Veteran Beds (ES)",
  TH_Beds = "Dedicated Veteran Beds (TH)",
  SH_Beds = "Dedicated Veteran Beds (SH)")
# Change variable names for PIT dataset variables
vet_data_PIT <- vet_data_PIT |> rename(
  CoC_Count = "Number of CoCs",
  Sheltered = "Sheltered Total Homeless Veterans, 2024",
  Unsheltered = "Unsheltered Homeless Veterans, 2024",
  Homeless_Vets = "Homeless Veterans, 2024")
# Combine vet_data_PIT and vet_data_HIC, then remove the ES/TH/SH raw count columns
vet_data <- vet_data_PIT |>
  left_join(vet_data_HIC |> dplyr::select(State, Vet_Beds_Shelt, ES_Beds, TH_Beds, SH_Beds, RRH_Beds, PSH_Beds), by = "State") |>
  # Remove the sheltered subtotal columns
  dplyr::select(-`Sheltered ES Homeless Veterans, 2024`, -`Sheltered TH Homeless Veterans, 2024`, -`Sheltered SH Homeless Veterans, 2024`)
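As a sanity check on the join, an anti_join (a quick sketch using the objects built above) lists any PIT states that failed to match a HIC row; it should return zero rows:
# States present in the PIT data but missing from the HIC data (expect zero rows)
vet_data_PIT |>
  anti_join(vet_data_HIC, by = "State")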
# Check which columns need to be converted to numeric
str(vet_data)
## tibble [54 Ă— 11] (S3: tbl_df/tbl/data.frame)
## $ State : chr [1:54] "AK" "AL" "AR" "AZ" ...
## $ CoC_Count : num [1:54] 2 9 5 3 44 4 2 1 1 27 ...
## $ Homeless_Vets : chr [1:54] "106" "291" "226" "994" ...
## $ Sheltered : chr [1:54] "87" "191" "144" "677" ...
## $ Unsheltered : chr [1:54] "19" "100" "82" "317" ...
## $ Vet_Beds_Shelt: chr [1:54] "12" "153" "74" "513" ...
## $ ES_Beds : chr [1:54] "0" "80" "55" "54" ...
## $ TH_Beds : chr [1:54] "12" "73" "19" "388" ...
## $ SH_Beds : chr [1:54] "0" "0" "0" "71" ...
## $ RRH_Beds : chr [1:54] "157" "192" "145" "1015" ...
## $ PSH_Beds : chr [1:54] "368" "1055" "533" "2791" ...
# Convert columns to numeric
vet_data <- vet_data |>
  mutate(across(c(Unsheltered, Homeless_Vets, Vet_Beds_Shelt, ES_Beds, TH_Beds, SH_Beds, Sheltered, RRH_Beds, PSH_Beds), as.numeric))
# Drop rows with NAs in any count column of the final dataframe
vet_data <- vet_data |>
  drop_na(Unsheltered, Homeless_Vets, Vet_Beds_Shelt, ES_Beds, TH_Beds, SH_Beds, Sheltered, RRH_Beds, PSH_Beds)
# Data auditing
print(colSums(is.na(vet_data)))
## State CoC_Count Homeless_Vets Sheltered Unsheltered
## 0 0 0 0 0
## Vet_Beds_Shelt ES_Beds TH_Beds SH_Beds RRH_Beds
## 0 0 0 0 0
## PSH_Beds
## 0
summary(vet_data)
## State CoC_Count Homeless_Vets Sheltered
## Length:53 Min. : 1.000 Min. : 9.0 Min. : 0.0
## Class :character 1st Qu.: 2.000 1st Qu.: 129.0 1st Qu.: 106.0
## Mode :character Median : 4.000 Median : 299.0 Median : 199.0
## Mean : 7.189 Mean : 616.4 Mean : 356.2
## 3rd Qu.:10.000 3rd Qu.: 570.0 3rd Qu.: 435.0
## Max. :44.000 Max. :9310.0 Max. :2900.0
## Unsheltered Vet_Beds_Shelt ES_Beds TH_Beds
## Min. : 3.0 Min. : 0.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 22.0 1st Qu.: 74.0 1st Qu.: 8.00 1st Qu.: 40.0
## Median : 58.0 Median : 189.0 Median : 45.00 Median : 130.0
## Mean : 260.2 Mean : 304.6 Mean : 72.08 Mean : 207.4
## 3rd Qu.: 139.0 3rd Qu.: 408.0 3rd Qu.: 93.00 3rd Qu.: 300.0
## Max. :6410.0 Max. :3004.0 Max. :668.00 Max. :2003.0
## SH_Beds RRH_Beds PSH_Beds
## Min. : 0.00 Min. : 0.0 Min. : 0
## 1st Qu.: 0.00 1st Qu.: 138.0 1st Qu.: 400
## Median : 5.00 Median : 234.0 Median : 1340
## Mean : 25.13 Mean : 428.8 Mean : 2062
## 3rd Qu.: 30.00 3rd Qu.: 440.0 3rd Qu.: 2163
## Max. :333.00 Max. :3380.0 Max. :23594
view(vet_data)
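Note that str() reported 54 rows but summary() reports 53, so drop_na() removed exactly one state or territory. A quick sketch to identify which one:
# Which state was removed by drop_na()?
setdiff(vet_data_PIT$State, vet_data$State)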
DESCRIPTIVE VARIABLES
pastecs::stat.desc(vet_data$Unsheltered)
## nbr.val nbr.null nbr.na min max range
## 5.300000e+01 0.000000e+00 0.000000e+00 3.000000e+00 6.410000e+03 6.407000e+03
## sum median mean SE.mean CI.mean.0.95 var
## 1.379200e+04 5.800000e+01 2.602264e+02 1.226385e+02 2.460922e+02 7.971312e+05
## std.dev coef.var
## 8.928220e+02 3.430943e+00
The minimum number of unsheltered homeless veterans in a single state or territory is 3; the maximum is 6,410, for a range of 6,407. This reflects very large variation across states: some have as few as 3 unsheltered veterans while one has 6,410. The total number of unsheltered homeless veterans across the US is 13,792. The median per state is 58 and the mean is about 260. The standard error of the mean is 122.64, which measures the precision of the estimated mean. The standard deviation is 892.82, indicating very high variability across states. With the mean far above the median, a few outlier states are clearly pulling the mean upward.
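To see which states are the outliers pulling the mean up, a quick sketch using slice_max:
# Five states with the highest unsheltered veteran counts
vet_data |>
  slice_max(Unsheltered, n = 5) |>
  dplyr::select(State, Unsheltered)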
## Removing NAs
sum(is.na(vet_data$`Unsheltered`))
## [1] 0
There are no NAs and, per nbr.null above, no zero values either.
pastecs::stat.desc(vet_data$Vet_Beds_Shelt)
## nbr.val nbr.null nbr.na min max range
## 5.300000e+01 1.000000e+00 0.000000e+00 0.000000e+00 3.004000e+03 3.004000e+03
## sum median mean SE.mean CI.mean.0.95 var
## 1.614500e+04 1.890000e+02 3.046226e+02 6.067074e+01 1.217448e+02 1.950898e+05
## std.dev coef.var
## 4.416897e+02 1.449957e+00
The minimum number of Vet_Beds_Shelt in a state or territory is 0 and the maximum is 3,004, for a range of 3,004. This is a very large variation: one state has no dedicated sheltered veteran beds while another has 3,004. The total across the US is 16,145. The median per state is 189 and the mean is about 305. The standard error of the mean is 60.67, measuring the precision of the estimated mean. The standard deviation is 441.69, indicating very high variability across states. A few outlier states are driving the mean well above the median.
sum(is.na(vet_data$Vet_Beds_Shelt))
## [1] 0
There are no NAs. There is 1 zero value (nbr.null = 1), but it is a legitimate data point.
pastecs::stat.desc(vet_data$CoC_Count)
## nbr.val nbr.null nbr.na min max range
## 53.000000 0.000000 0.000000 1.000000 44.000000 43.000000
## sum median mean SE.mean CI.mean.0.95 var
## 381.000000 4.000000 7.188679 1.117241 2.241908 66.156023
## std.dev coef.var
## 8.133635 1.131451
The minimum CoC_Count in a state or territory is 1 and the maximum is 44, for a range of 43. This is a large variation: one state has a single CoC while another has 44. The total across the US is 381 CoCs. The median per state is 4 and the mean is about 7. The standard error of the mean is 1.12. The standard deviation is 8.13, indicating fairly high variability across states. A few outlier states are driving the mean above the median.
sum(is.na(vet_data$CoC_Count))
## [1] 0
There are no NAs and no zero values; every state has at least one CoC.
pastecs::stat.desc(vet_data$ES_Beds)
## nbr.val nbr.null nbr.na min max range
## 53.000000 7.000000 0.000000 0.000000 668.000000 668.000000
## sum median mean SE.mean CI.mean.0.95 var
## 3820.000000 45.000000 72.075472 14.575036 29.246950 11258.878810
## std.dev coef.var
## 106.107864 1.472177
The minimum number of ES_Beds in a state or territory is 0 and the maximum is 668, for a range of 668. This is a very large variation: some states have no dedicated Emergency Shelter beds while one has 668. The total across the US is 3,820. The median per state is 45 and the mean is about 72. The standard error of the mean is 14.57, measuring the precision of the estimated mean. The standard deviation is 106.11, indicating very high variability across states. A few outlier states are driving the mean above the median.
sum(is.na(vet_data$ES_Beds))
## [1] 0
There are no NAs. There are 7 zero values (nbr.null = 7), but they are legitimate data points.
pastecs::stat.desc(vet_data$SH_Beds)
## nbr.val nbr.null nbr.na min max range
## 53.000000 26.000000 0.000000 0.000000 333.000000 333.000000
## sum median mean SE.mean CI.mean.0.95 var
## 1332.000000 5.000000 25.132075 6.962529 13.971336 2569.270682
## std.dev coef.var
## 50.687974 2.016864
The minimum number of SH_Beds in a state or territory is 0 and the maximum is 333, for a range of 333. This is a very large variation: many states have no Safe Haven beds while one has 333. The total across the US is 1,332. The median per state is 5 and the mean is about 25. The standard error of the mean is 6.96. The standard deviation is 50.69, indicating very high variability across states. A few outlier states are driving the mean well above the median.
sum(is.na(vet_data$SH_Beds))
## [1] 0
There are no NAs. There are 26 zero values (nbr.null = 26); roughly half the states have no Safe Haven beds, but these zeros are legitimate data points.
pastecs::stat.desc(vet_data$TH_Beds)
## nbr.val nbr.null nbr.na min max range
## 53.000000 2.000000 0.000000 0.000000 2003.000000 2003.000000
## sum median mean SE.mean CI.mean.0.95 var
## 10993.000000 130.000000 207.415094 40.838485 81.948416 88392.439768
## std.dev coef.var
## 297.308661 1.433399
The minimum number of TH_Beds in a state or territory is 0 and the maximum is 2,003, for a range of 2,003. This is a very large variation: some states have no Transitional Housing beds while one has 2,003. The total across the US is 10,993. The median per state is 130 and the mean is about 207. The standard error of the mean is 40.84. The standard deviation is 297.31, indicating very high variability across states. A few outlier states are driving the mean above the median.
sum(is.na(vet_data$TH_Beds))
## [1] 0
There are no NAs. There are 2 zero values (nbr.null = 2), but they are legitimate data points.
pastecs::stat.desc(vet_data$PSH_Beds)
## nbr.val nbr.null nbr.na min max range
## 5.300000e+01 1.000000e+00 0.000000e+00 0.000000e+00 2.359400e+04 2.359400e+04
## sum median mean SE.mean CI.mean.0.95 var
## 1.093110e+05 1.340000e+03 2.062472e+03 4.792424e+02 9.616703e+02 1.217268e+07
## std.dev coef.var
## 3.488937e+03 1.691629e+00
The minimum number of PSH_Beds in a state or territory is 0 and the maximum is 23,594, for a range of 23,594. This is a very large variation: one state has no Permanent Supportive Housing beds while another has 23,594. The total across the US is 109,311. The median per state is 1,340 and the mean is about 2,062. The standard error of the mean is 479.24. The standard deviation is 3,488.94, indicating very high variability across states. A few outlier states are driving the mean well above the median.
sum(is.na(vet_data$PSH_Beds))
## [1] 0
There are no NAs. There is 1 zero value (nbr.null = 1), but it is a legitimate data point.
pastecs::stat.desc(vet_data$RRH_Beds)
## nbr.val nbr.null nbr.na min max range
## 53.00000 1.00000 0.00000 0.00000 3380.00000 3380.00000
## sum median mean SE.mean CI.mean.0.95 var
## 22728.00000 234.00000 428.83019 84.54896 169.65990 378871.91292
## std.dev coef.var
## 615.52572 1.43536
The minimum number of RRH_Beds in a state or territory is 0 and the maximum is 3,380, for a range of 3,380. This is a very large variation: one state has no Rapid Re-Housing beds while another has 3,380. The total across the US is 22,728. The median per state is 234 and the mean is about 429. The standard error of the mean is 84.55. The standard deviation is 615.53, indicating very high variability across states. A few outlier states are driving the mean above the median.
sum(is.na(vet_data$RRH_Beds))
## [1] 0
There are no NAs. There is 1 zero value (nbr.null = 1), but it is a legitimate data point.
HISTOGRAMS
#hist(vet_data$Unsheltered, main="Histogram of Unsheltered Veterans", xlab= "Unsheltered Veteran Counts", breaks =10, probability =T)
#lines(density(vet_data$Unsheltered), col='red', lwd=2)
vet_data_long <- vet_data |>
dplyr::select(Unsheltered, CoC_Count, Vet_Beds_Shelt, ES_Beds, SH_Beds, TH_Beds, PSH_Beds, RRH_Beds) |>
pivot_longer(cols = everything(),
names_to = "Variable",
values_to = "Value")
ggplot(vet_data_long, aes(x = Value)) +
geom_histogram(aes(y = after_stat(density)),
bins = 10,
fill = "skyblue",
color = "black") +
geom_density(color = "red", linewidth = 1) +
facet_wrap(~Variable, scales = "free") +
labs(title = "Histograms of Veteran Data Variables",
x = "Value",
y = "Density") +
theme_minimal()
vet_data_sqrt <- vet_data |> dplyr::mutate(Unsheltered_SQRT = sqrt(Unsheltered), Vet_Beds_Shelt_SQRT = sqrt(Vet_Beds_Shelt), CoC_SQRT = sqrt(CoC_Count), ES_SQRT = sqrt(ES_Beds), SH_SQRT = sqrt(SH_Beds), TH_SQRT = sqrt(TH_Beds), PSH_SQRT = sqrt(PSH_Beds), RRH_SQRT = sqrt(RRH_Beds))
head(vet_data_sqrt)
## # A tibble: 6 Ă— 19
## State CoC_Count Homeless_Vets Sheltered Unsheltered Vet_Beds_Shelt ES_Beds
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AK 2 106 87 19 12 0
## 2 AL 9 291 191 100 153 80
## 3 AR 5 226 144 82 74 55
## 4 AZ 3 994 677 317 513 54
## 5 CA 44 9310 2900 6410 3004 668
## 6 CO 4 978 538 440 299 11
## # ℹ 12 more variables: TH_Beds <dbl>, SH_Beds <dbl>, RRH_Beds <dbl>,
## # PSH_Beds <dbl>, Unsheltered_SQRT <dbl>, Vet_Beds_Shelt_SQRT <dbl>,
## # CoC_SQRT <dbl>, ES_SQRT <dbl>, SH_SQRT <dbl>, TH_SQRT <dbl>,
## # PSH_SQRT <dbl>, RRH_SQRT <dbl>
I used square root transformations because they can be applied to zero values and all of my variables are counts. Square root transformations help reduce skewness and stabilize variance.
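To confirm the transformation actually reduces skewness, a minimal sketch that computes sample skewness by hand (the skewness helper is defined here so no additional package is needed):
# Sample skewness: the third standardized moment
skewness <- function(x) mean((x - mean(x))^3) / sd(x)^3
skewness(vet_data$Unsheltered)        # raw counts: strongly right-skewed
skewness(sqrt(vet_data$Unsheltered))  # after sqrt: should be much closer to 0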
# Select transformed variables
vet_data_sqrt_long <- vet_data_sqrt |>
dplyr::select(ends_with("_SQRT")) |>
pivot_longer(cols = everything(),
names_to = "Variable",
values_to = "Value")
# Create facet wrapped histograms
ggplot(vet_data_sqrt_long, aes(x = Value)) +
geom_histogram(aes(y = after_stat(density)),
bins = 10,
fill = "skyblue",
color = "black") +
geom_density(color = "red", linewidth = 1) +
facet_wrap(~Variable, scales = "free") +
labs(
title = "Histograms of Square Root Transformed Variables",
x = "Square Root Transformed Values",
y = "Density"
) +
theme_minimal()
BASIC LINEAR MODEL
vets_lm<-lm(Unsheltered~Vet_Beds_Shelt, data=vet_data)
summary(vets_lm)
##
## Call:
## lm(formula = Unsheltered ~ Vet_Beds_Shelt, data = vet_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1013.7 -306.0 117.6 226.8 1186.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -299.8476 62.7053 -4.782 1.52e-05 ***
## Vet_Beds_Shelt 1.8386 0.1176 15.631 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 374.6 on 51 degrees of freedom
## Multiple R-squared: 0.8273, Adjusted R-squared: 0.8239
## F-statistic: 244.3 on 1 and 51 DF, p-value: < 2.2e-16
Dedicated sheltered veteran beds explain about 82% of the variance in unsheltered veteran counts (R² = 0.827). The p-value is very small, so the effect is statistically significant and we reject the null hypothesis that dedicated veteran beds have no relationship with the unsheltered veteran count.
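A scatter plot with the fitted regression line makes this relationship, and the influence of the largest states, easier to see; a sketch using ggplot2:
# Scatter plot of the bivariate model with its fitted line
ggplot(vet_data, aes(x = Vet_Beds_Shelt, y = Unsheltered)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(title = "Unsheltered Veterans vs. Dedicated Veteran Beds",
       x = "Dedicated Veteran Beds (ES, TH, SH)",
       y = "Unsheltered Homeless Veterans") +
  theme_minimal()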
LINEAR MODEL
vet_model_1<-lm (Unsheltered~ CoC_Count+ Vet_Beds_Shelt, data=vet_data)
summary(vet_model_1)
##
## Call:
## lm(formula = Unsheltered ~ CoC_Count + Vet_Beds_Shelt, data = vet_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -766.82 -241.94 87.42 217.11 1083.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -238.602 66.634 -3.581 0.000774 ***
## CoC_Count -24.035 10.967 -2.192 0.033096 *
## Vet_Beds_Shelt 2.205 0.202 10.917 7.74e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 361.4 on 50 degrees of freedom
## Multiple R-squared: 0.8424, Adjusted R-squared: 0.8361
## F-statistic: 133.7 on 2 and 50 DF, p-value: < 2.2e-16
### REGRESSION ASSUMPTION TESTS
## 1. Linearity (plot and raintest)
plot(vet_model_1, which=1)
The red fitted line curves noticeably, especially at the lower fitted values. By visual inspection the relationship does not appear linear, so the model fails the linearity assumption and will require mitigation.
raintest(vet_model_1)
##
## Rainbow test
##
## data: vet_model_1
## Rain = 11.574, df1 = 27, df2 = 23, p-value = 4.842e-08
According to the Rainbow test, the very small p-value (4.8e-08) means we reject the null hypothesis of linearity; the relationship is not linear.
FAIL
## 2. Independence of Errors (durbin-watson)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
durbinWatsonTest(vet_model_1)
## lag Autocorrelation D-W Statistic p-value
## 1 0.05503266 1.871068 0.646
## Alternative hypothesis: rho != 0
The p-value of 0.646 is much higher than 0.05, which means the errors ARE independent: the residuals show no meaningful autocorrelation. The model passes the independence-of-errors assumption.
PASS
## 3. Homoscedasticity (plot, bptest)
plot(vet_model_1,which=3)
The red line is curved and steadily increasing, which suggests a violation of homoscedasticity. The residual spread is tightly clustered at low fitted values but grows as the fitted values increase, with an extreme outlier at the upper tail. Because the variance of the residuals is not even across the fitted values, the model appears heteroscedastic by visual inspection.
bptest(vet_model_1)
##
## studentized Breusch-Pagan test
##
## data: vet_model_1
## BP = 37.495, df = 2, p-value = 7.213e-09
The p-value is well below 0.05, so we reject the null hypothesis of homoscedasticity and conclude the model is heteroscedastic. This model fails the homoscedasticity assumption and will require mitigation.
FAIL
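Besides transforming the outcome (done below), heteroscedasticity-robust standard errors are a standard mitigation; a sketch, assuming the sandwich package is installed:
library(sandwich)
# Re-test the coefficients with heteroscedasticity-consistent (HC3) standard errors
coeftest(vet_model_1, vcov = vcovHC(vet_model_1, type = "HC3"))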
## 4. Normality of residuals (QQ plot, shapiro test)
plot(vet_model_1,which=2)
The residuals in the center of the distribution, the bulk of the bell curve, sit very close to the dotted reference line. At the tails a few residuals deviate modestly from the line, which is expected for tail observations. Overall, by visual inspection, the residuals appear normally distributed.
shapiro.test(vet_model_1$residuals)
##
## Shapiro-Wilk normality test
##
## data: vet_model_1$residuals
## W = 0.93764, p-value = 0.008177
The p-value is well below 0.05, which suggests the residuals differ significantly from a normal distribution. The Shapiro-Wilk test therefore disagrees with our visual assessment of the QQ plot: mathematically, the residuals are not normal, so mitigation is needed. The model fails the normality-of-residuals assumption.
FAIL
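Since MASS is already loaded, a Box-Cox profile can suggest a variance-stabilizing power for the response (valid here because Unsheltered is strictly positive, with a minimum of 3); a lambda near 0 would support the log transformation used below. A sketch:
# Profile log-likelihood for the Box-Cox power parameter
boxcox(vet_model_1, lambda = seq(-1, 1, by = 0.1))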
## 5. No multicollinearity (VIF, cor)
vet_kitchen_sink<-lm(Unsheltered ~ CoC_Count + SH_Beds + TH_Beds +ES_Beds + RRH_Beds + PSH_Beds, data = vet_data)
vif(vet_kitchen_sink)
## CoC_Count SH_Beds TH_Beds ES_Beds RRH_Beds PSH_Beds
## 3.567765 10.674577 11.268550 7.022868 5.143490 25.982841
The following predictors have variance inflation factors (VIF) above 10 and are therefore strongly collinear: Safe Haven dedicated veteran beds (SH_Beds, VIF = 10.67), Transitional Housing dedicated veteran beds (TH_Beds, VIF = 11.27), and Permanent Supportive Housing dedicated veteran beds (PSH_Beds, VIF = 25.98). These three predictors indicate serious multicollinearity; PSH_Beds in particular is highly redundant with the other variables.
vet_kitchen_sink_vars<- vet_data |> dplyr::select(CoC_Count, SH_Beds, TH_Beds, ES_Beds, RRH_Beds, PSH_Beds)
cor(vet_kitchen_sink_vars)
## CoC_Count SH_Beds TH_Beds ES_Beds RRH_Beds PSH_Beds
## CoC_Count 1.0000000 0.7895951 0.8012225 0.8213393 0.7625480 0.8379578
## SH_Beds 0.7895951 1.0000000 0.9120698 0.8772047 0.8718648 0.9502930
## TH_Beds 0.8012225 0.9120698 1.0000000 0.8866759 0.8641670 0.9538744
## ES_Beds 0.8213393 0.8772047 0.8866759 1.0000000 0.8386710 0.9206387
## RRH_Beds 0.7625480 0.8718648 0.8641670 0.8386710 1.0000000 0.8922328
## PSH_Beds 0.8379578 0.9502930 0.9538744 0.9206387 0.8922328 1.0000000
PSH_Beds is highly correlated with SH_Beds, TH_Beds, ES_Beds, RRH_Beds, and CoC_Count, with correlations close to 1. SH_Beds and TH_Beds are also highly correlated with the other predictors. We should remove PSH_Beds from the model since it is strongly correlated with every other variable.
This fails the no-multicollinearity assumption and will require mitigation.
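An alternative mitigation to dropping PSH_Beds is to collapse the bed types into a single total, which removes the collinearity among them by construction; a sketch (Total_Beds is a new variable introduced here for illustration):
# Combine the dedicated bed types into one predictor and re-check VIF
vet_data_total <- vet_data |>
  mutate(Total_Beds = SH_Beds + TH_Beds + ES_Beds + RRH_Beds + PSH_Beds)
vif(lm(Unsheltered ~ CoC_Count + Total_Beds, data = vet_data_total))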
vet_model_1_log<-lm(log(Unsheltered+1)~log(Vet_Beds_Shelt+1) + log(CoC_Count+1),data=vet_data)
raintest(vet_model_1_log)
##
## Rainbow test
##
## data: vet_model_1_log
## Rain = 2.0627, df1 = 27, df2 = 23, p-value = 0.04089
Because of zero values, we add 1 to each variable within the log transformation. After log-transforming vet_model_1, the Rainbow test p-value is 0.041, still just below 0.05, so the linearity assumption is still formally violated, though far less severely than before.
plot(vet_model_1, which=1)
plot (vet_model_1_log, which=1)
The Residuals vs. Fitted plot is much more horizontal; by visual inspection the log model now appears roughly linear.
bptest(vet_model_1_log)
##
## studentized Breusch-Pagan test
##
## data: vet_model_1_log
## BP = 5.1963, df = 2, p-value = 0.07441
The log model's Breusch-Pagan p-value (0.074) is above 0.05, so we fail to reject homoscedasticity: the log model now passes the homoscedasticity assumption.
plot(vet_model_1,which=3)
plot(vet_model_1_log,which=3)
## Mitigation for failure of normality of residuals
shapiro.test(residuals(vet_model_1_log))
##
## Shapiro-Wilk normality test
##
## data: residuals(vet_model_1_log)
## W = 0.98156, p-value = 0.5816
The p-value is now greater than 0.05, suggesting the residuals are not significantly different from a normal distribution; they appear normal.
plot(vet_model_1,which=2)
plot(vet_model_1_log,which=2)
The points now fall much closer to, or directly on, the reference line. The log model passes the normality-of-residuals assumption.
vet_kitchen_sink_altered<-lm(Unsheltered ~ CoC_Count + SH_Beds + TH_Beds +ES_Beds + RRH_Beds, data = vet_data)
vif(vet_kitchen_sink_altered)
## CoC_Count SH_Beds TH_Beds ES_Beds RRH_Beds
## 3.398958 7.724223 7.974306 6.259584 4.953428
vet_kitchen_sink_altered_vars<- vet_data |> dplyr::select(CoC_Count, SH_Beds, TH_Beds, ES_Beds, RRH_Beds)
cor(vet_kitchen_sink_altered_vars)
## CoC_Count SH_Beds TH_Beds ES_Beds RRH_Beds
## CoC_Count 1.0000000 0.7895951 0.8012225 0.8213393 0.7625480
## SH_Beds 0.7895951 1.0000000 0.9120698 0.8772047 0.8718648
## TH_Beds 0.8012225 0.9120698 1.0000000 0.8866759 0.8641670
## ES_Beds 0.8213393 0.8772047 0.8866759 1.0000000 0.8386710
## RRH_Beds 0.7625480 0.8718648 0.8641670 0.8386710 1.0000000
Now the model meets the no-multicollinearity assumption because all VIFs are below 10 and the extreme correlations involving PSH_Beds are gone.
#Summary of the final model
summary(vet_model_1_log)
##
## Call:
## lm(formula = log(Unsheltered + 1) ~ log(Vet_Beds_Shelt + 1) +
## log(CoC_Count + 1), data = vet_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0558 -0.7339 -0.0883 0.7482 2.1578
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.9535 0.5037 1.893 0.064150 .
## log(Vet_Beds_Shelt + 1) 0.4916 0.1343 3.660 0.000607 ***
## log(CoC_Count + 1) 0.4562 0.2354 1.938 0.058233 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.016 on 50 degrees of freedom
## Multiple R-squared: 0.5107, Adjusted R-squared: 0.4912
## F-statistic: 26.1 on 2 and 50 DF, p-value: 1.733e-08
Vet_Beds_Shelt is significant while CoC_Count is marginally significant (p = 0.058). Together they explain about 49% of the variance in the log unsheltered homeless veteran count (adjusted R² = 0.491). Because both sides are logged, the coefficients read as elasticities: a 1% increase in dedicated sheltered veteran beds is associated with roughly a 0.49% higher unsheltered count.
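Because the model is in logs, predictions must be back-transformed to the count scale; a sketch with a hypothetical state profile (the newdata values are illustrative only, and this naive back-transform ignores retransformation bias):
# Predicted unsheltered count for a hypothetical state with 200 beds and 4 CoCs
new_state <- data.frame(Vet_Beds_Shelt = 200, CoC_Count = 4)
exp(predict(vet_model_1_log, newdata = new_state)) - 1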
STARGAZER
#stargazer(vet_model_1_log,
#          type = "html",
#          title = "Regression Results for Unsheltered Veteran Homelessness",
#          dep.var.labels = "Log Unsheltered Veteran Homelessness",
#          covariate.labels = c("Log Veteran Shelter Beds",
#                               "Log CoC Count"),
#          digits = 3,
#          align = TRUE,
#          no.space = TRUE,
#          omit.stat = c("ser"))
Stargazer Table
stargazer(vet_model_1_log,
type = "text",
title = "Regression Results for Unsheltered Veteran Homelessness",
dep.var.labels = "Log Unsheltered Veteran Homelessness",
covariate.labels = c("Log Veteran Shelter Beds",
"Log CoC Count"),
digits = 3,
align = TRUE,
no.space = TRUE,
omit.stat = c("ser"))
##
## Regression Results for Unsheltered Veteran Homelessness
## =============================================================
## Dependent variable:
## ------------------------------------
## Log Unsheltered Veteran Homelessness
## -------------------------------------------------------------
## Log Veteran Shelter Beds 0.492***
## (0.134)
## Log CoC Count 0.456*
## (0.235)
## Constant 0.954*
## (0.504)
## -------------------------------------------------------------
## Observations 53
## R2 0.511
## Adjusted R2 0.491
## F Statistic 26.097*** (df = 2; 50)
## =============================================================
## Note: *p<0.1; **p<0.05; ***p<0.01