library(tidyverse)
## Warning: package 'readr' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.3
library(dplyr)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
DATA CLEANING
# PIT VETERAN COUNTS DATASET
# create a file path for PIT Veteran Counts dataset
file_path_1 <-"C:/Users/Administrator/Desktop/Graduate School/Applied Quant Methods/My Class Stuff/Data Project/Veteran Homelessness/Updated Vet Datasets/PIT_Veteran_Counts.xlsx"
# Read sheet 2 of PIT Veteran Counts, file path 1
sheet_2<- read_excel(file_path_1, sheet="2024")
# Remove Totals and MP territory from PIT Veteran Counts sheet 2
vet_data_PIT <- sheet_2 |> filter (!State %in% c("Total", "MP"))
# HIC COUNTS DATASET (NEW)
# create a file path for HIC Counts dataset
file_path_2 <-"C:/Users/Administrator/Desktop/Graduate School/Applied Quant Methods/My Class Stuff/Data Project/Veteran Homelessness/Updated Vet Datasets/HIC_Counts.xlsx"
# Read sheet 1 of HIC Counts, file path 2
sheet_1<- read_excel(file_path_2, sheet="2024")
## New names:
## • `Total Year-Round Beds (ES)` -> `Total Year-Round Beds (ES)...6`
## • `Total Year-Round Beds (TH)` -> `Total Year-Round Beds (TH)...7`
## • `Total Year-Round Beds (SH)` -> `Total Year-Round Beds (SH)...8`
## • `Total Year-Round Beds (ES)` -> `Total Year-Round Beds (ES)...15`
## • `Total Year-Round Beds (TH)` -> `Total Year-Round Beds (TH)...27`
## • `Total Year-Round Beds (SH)` -> `Total Year-Round Beds (SH)...37`
# Remove Totals and MP territory row from HIC Counts Sheet 1
vet_data_HIC <- sheet_1 |> filter (!State %in% c("Total", "MP"))
#Change variable names for HIC dataset variables
vet_data_HIC <-vet_data_HIC |> rename (
Vet_Beds_Shelt ="Dedicated Veteran Beds (ES, TH, SH)",
RRH_Beds="Dedicated Veteran Beds (RRH)",
PSH_Beds="Dedicated Veteran Beds (PSH)",
ES_Beds="Dedicated Veteran Beds (ES)",
TH_Beds="Dedicated Veteran Beds (TH)",
SH_Beds="Dedicated Veteran Beds (SH)")
#Change variable names for PIT dataset variables
vet_data_PIT <-vet_data_PIT |> rename (
CoC_Count = "Number of CoCs",
Sheltered = "Sheltered Total Homeless Veterans, 2024",
Unsheltered = "Unsheltered Homeless Veterans, 2024",
Homeless_Vets = "Homeless Veterans, 2024")
# Combination of vat_data_HIC dataset and vet_data_PIT dataset, removal of ES/SH/TS raw counts columns
vet_data <-vet_data_PIT |> left_join(vet_data_HIC |> dplyr::select(State, Vet_Beds_Shelt, ES_Beds, TH_Beds, SH_Beds, RRH_Beds, PSH_Beds), by= "State") |>
#Remove the columns
dplyr::select(-`Sheltered ES Homeless Veterans, 2024`, -`Sheltered TH Homeless Veterans, 2024`, -`Sheltered SH Homeless Veterans, 2024`)
# Check which columns need to be converted to numeric
str(vet_data)
## tibble [54 × 11] (S3: tbl_df/tbl/data.frame)
## $ State : chr [1:54] "AK" "AL" "AR" "AZ" ...
## $ CoC_Count : num [1:54] 2 9 5 3 44 4 2 1 1 27 ...
## $ Homeless_Vets : chr [1:54] "106" "291" "226" "994" ...
## $ Sheltered : chr [1:54] "87" "191" "144" "677" ...
## $ Unsheltered : chr [1:54] "19" "100" "82" "317" ...
## $ Vet_Beds_Shelt: chr [1:54] "12" "153" "74" "513" ...
## $ ES_Beds : chr [1:54] "0" "80" "55" "54" ...
## $ TH_Beds : chr [1:54] "12" "73" "19" "388" ...
## $ SH_Beds : chr [1:54] "0" "0" "0" "71" ...
## $ RRH_Beds : chr [1:54] "157" "192" "145" "1015" ...
## $ PSH_Beds : chr [1:54] "368" "1055" "533" "2791" ...
# Convert columns to numeric
vet_data <- vet_data |> mutate(Unsheltered = as.numeric (Unsheltered), Homeless_Vets = as.numeric(Homeless_Vets), Vet_Beds_Shelt = as.numeric (Vet_Beds_Shelt), ES_Beds = as.numeric (ES_Beds),TH_Beds = as.numeric (TH_Beds), SH_Beds = as.numeric (SH_Beds), Sheltered = as.numeric (Sheltered), RRH_Beds = as.numeric(RRH_Beds), PSH_Beds = as.numeric(PSH_Beds))
#Drop all NAs for columns in final dataframe
vet_data <-vet_data |> drop_na(Unsheltered, Homeless_Vets, Vet_Beds_Shelt, ES_Beds, TH_Beds, SH_Beds, SH_Beds, Sheltered, RRH_Beds, PSH_Beds)
#Data Auditing
print(colSums(is.na(vet_data)))
## State CoC_Count Homeless_Vets Sheltered Unsheltered
## 0 0 0 0 0
## Vet_Beds_Shelt ES_Beds TH_Beds SH_Beds RRH_Beds
## 0 0 0 0 0
## PSH_Beds
## 0
summary(vet_data)
## State CoC_Count Homeless_Vets Sheltered
## Length:53 Min. : 1.000 Min. : 9.0 Min. : 0.0
## Class :character 1st Qu.: 2.000 1st Qu.: 129.0 1st Qu.: 106.0
## Mode :character Median : 4.000 Median : 299.0 Median : 199.0
## Mean : 7.189 Mean : 616.4 Mean : 356.2
## 3rd Qu.:10.000 3rd Qu.: 570.0 3rd Qu.: 435.0
## Max. :44.000 Max. :9310.0 Max. :2900.0
## Unsheltered Vet_Beds_Shelt ES_Beds TH_Beds
## Min. : 3.0 Min. : 0.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 22.0 1st Qu.: 74.0 1st Qu.: 8.00 1st Qu.: 40.0
## Median : 58.0 Median : 189.0 Median : 45.00 Median : 130.0
## Mean : 260.2 Mean : 304.6 Mean : 72.08 Mean : 207.4
## 3rd Qu.: 139.0 3rd Qu.: 408.0 3rd Qu.: 93.00 3rd Qu.: 300.0
## Max. :6410.0 Max. :3004.0 Max. :668.00 Max. :2003.0
## SH_Beds RRH_Beds PSH_Beds
## Min. : 0.00 Min. : 0.0 Min. : 0
## 1st Qu.: 0.00 1st Qu.: 138.0 1st Qu.: 400
## Median : 5.00 Median : 234.0 Median : 1340
## Mean : 25.13 Mean : 428.8 Mean : 2062
## 3rd Qu.: 30.00 3rd Qu.: 440.0 3rd Qu.: 2163
## Max. :333.00 Max. :3380.0 Max. :23594
view(vet_data)
##Description of the variables
pastecs::stat.desc(vet_data$Unsheltered)
## nbr.val nbr.null nbr.na min max range
## 5.300000e+01 0.000000e+00 0.000000e+00 3.000000e+00 6.410000e+03 6.407000e+03
## sum median mean SE.mean CI.mean.0.95 var
## 1.379200e+04 5.800000e+01 2.602264e+02 1.226385e+02 2.460922e+02 7.971312e+05
## std.dev coef.var
## 8.928220e+02 3.430943e+00
The minimum amount of Unsheltered homeless veterans living in one state or territory is 3. The maximum amount of Unsheltered homeless veterans living in one state or territory is 6410. The range is 6407. This means there is a very large variation across states where some states have 3 Unsheltered veterans in transitional housing while others have as many as 6410. The total sum of Unsheltered homeless veterans living throughout the US is 13,792. The median number of Unsheltered homeless veterans per state is 58. The average number of Unsheltered homeless veterans per state is ~260. The standard error mean is 122.64 which is closer to the true average across all states. The standard deviation is 892.82 which indicates a very high variability across states. It looks like some states are clear outliers that are driving the mean high and further away from the median.
##Removing NA’s
sum(is.na(vet_data$`Unsheltered`))
## [1] 0
There are no NA’s. There is 1 zero value, but it is a data point.
pastecs::stat.desc(vet_data$Vet_Beds_Shelt)
## nbr.val nbr.null nbr.na min max range
## 5.300000e+01 1.000000e+00 0.000000e+00 0.000000e+00 3.004000e+03 3.004000e+03
## sum median mean SE.mean CI.mean.0.95 var
## 1.614500e+04 1.890000e+02 3.046226e+02 6.067074e+01 1.217448e+02 1.950898e+05
## std.dev coef.var
## 4.416897e+02 1.449957e+00
The minimum amount of Vet_Beds_Shelt in a state or territory is 0. The maximum amount of Vet_Beds_Shelt in a state or territory is 3004. The range is 3304. This means there is a very large variation across states where one state has zero Vet_Beds_Shelt while others have as many as 3004. The total sum of Vet_Beds_Shelt throughout the US is 16,145. The median number of Vet_Beds_Shelt per state is 189. The average number of Vet_Beds_Shelt per state is ~305. The standard error mean is 60.67 which is closer to the true average across all states. The standard deviation is 441.79 which indicates a very high variability across states. It looks like some states are clear outliers that are driving the mean high and further away from the median.
sum(is.na(vet_data$Vet_Beds_Shelt))
## [1] 0
There are no NA’s. There is 1 zero value, but it is a data point.
pastecs::stat.desc(vet_data$CoC_Count)
## nbr.val nbr.null nbr.na min max range
## 53.000000 0.000000 0.000000 1.000000 44.000000 43.000000
## sum median mean SE.mean CI.mean.0.95 var
## 381.000000 4.000000 7.188679 1.117241 2.241908 66.156023
## std.dev coef.var
## 8.133635 1.131451
The minimum amount of CoC_Count in a state or territory is 1. The maximum amount of CoC_Count in a state or territory is 44. The range is 43. This means there is a very large variation across states where one state has 1 C0C_Count while others have as many as 44. The total sum of CoC_Count throughout the US is 381. The median number of CoC_Count per state is 4. The average number of CoC_Count per state is ~7. The standard error mean is 1.12. The standard deviation is 8.13 which indicates a somewhat high variability across states. It looks like some states are clear outliers that are driving the mean high and further away from the median.
sum(is.na(vet_data$CoC_Count))
## [1] 0
There are no NA’s. There is 1 zero value, but it is a data point.
pastecs::stat.desc(vet_data$ES_Beds)
## nbr.val nbr.null nbr.na min max range
## 53.000000 7.000000 0.000000 0.000000 668.000000 668.000000
## sum median mean SE.mean CI.mean.0.95 var
## 3820.000000 45.000000 72.075472 14.575036 29.246950 11258.878810
## std.dev coef.var
## 106.107864 1.472177
The minimum amount of ES_Beds in a state or territory is 0. The maximum amount of ES_Beds in a state or territory is 668. The range is 668. This means there is a very large variation across states where one state has zero ES_Beds while others have as many as 668. The total sum of ES_Beds throughout the US is 3820. The median number of ES_Beds per state is 45. The average number of ES_Beds per state is ~72. The standard error mean is 14.57 which is closer to the true average across all states. The standard deviation is 106.11 which indicates a very high variability across states. It looks like some states are clear outliers that are driving the mean high and further away from the median.
sum(is.na(vet_data$ES_Beds))
## [1] 0
There are no NA’s. There is 1 zero value, but it is a data point.
pastecs::stat.desc(vet_data$SH_Beds)
## nbr.val nbr.null nbr.na min max range
## 53.000000 26.000000 0.000000 0.000000 333.000000 333.000000
## sum median mean SE.mean CI.mean.0.95 var
## 1332.000000 5.000000 25.132075 6.962529 13.971336 2569.270682
## std.dev coef.var
## 50.687974 2.016864
The minimum amount of SH_Beds in a state or territory is 0. The maximum amount of SH_Beds in a state or territory is 33. The range is 333. This means there is a very large variation across states where one states has 0 SH_Beds while others have up to 333. The total sum of SH_Beds living throughout the US is 1,332. The median number of SH_Beds per state is 5. The average number of SH_Beds per state is ~25. The standard error mean is 6.96 which is closer to the true average across all states. The standard deviation is 50.69 which indicates a very high variability across states. It looks like some states are clear outliers that are driving the mean very high and further away from the median.
sum(is.na(vet_data$SH_Beds))
## [1] 0
There are no NA’s. There is 1 zero value, but it is a data point.
pastecs::stat.desc(vet_data$TH_Beds)
## nbr.val nbr.null nbr.na min max range
## 53.000000 2.000000 0.000000 0.000000 2003.000000 2003.000000
## sum median mean SE.mean CI.mean.0.95 var
## 10993.000000 130.000000 207.415094 40.838485 81.948416 88392.439768
## std.dev coef.var
## 297.308661 1.433399
The minimum amount of TH_Beds in a state or territory is 0. The maximum amount of TH_Beds in a state or territory is 2003. The range is 2003. This means there is a very large variation across states where one state has zero TH_Beds while others have up to 2003. The total sum of TH_Beds throughout the US is 10,993. The median number of TH_Beds per state is 130. The average number of TH_Beds per state is ~207. The standard error mean is 40.84 which is closer to the true average across all states. The standard deviation is 297.31 which indicates a very high variability across states. It looks like some states are clear outliers that are driving the mean high and further away from the median.
sum(is.na(vet_data$TH_Beds))
## [1] 0
There are no NA’s. There is 1 zero value, but it is a data point.
pastecs::stat.desc(vet_data$PSH_Beds)
## nbr.val nbr.null nbr.na min max range
## 5.300000e+01 1.000000e+00 0.000000e+00 0.000000e+00 2.359400e+04 2.359400e+04
## sum median mean SE.mean CI.mean.0.95 var
## 1.093110e+05 1.340000e+03 2.062472e+03 4.792424e+02 9.616703e+02 1.217268e+07
## std.dev coef.var
## 3.488937e+03 1.691629e+00
The minimum amount of PSH_Beds in a state or territory is 0. The maximum amount of PSH_Beds in a state or territory is 23,594. The range is 23,594. This means there is a very large variation across states where one state has 0 PSH_Beds while others have up to 23,594. The total sum of PSH_Beds throughout the US is 109,311. The median number of PSH_Beds per state is 1,340. The average number of PSH_Beds per state is ~2,062. The standard error mean is 479.24 which is closer to the true average across all states. The standard deviation is 3,388.94 which indicates a very high variability across states. It looks like some states are clear outliers that are driving the mean high and further away from the median.
sum(is.na(vet_data$PSH_Beds))
## [1] 0
There are no NA’s. There is 1 zero value, but it is a data point.
pastecs::stat.desc(vet_data$RRH_Beds)
## nbr.val nbr.null nbr.na min max range
## 53.00000 1.00000 0.00000 0.00000 3380.00000 3380.00000
## sum median mean SE.mean CI.mean.0.95 var
## 22728.00000 234.00000 428.83019 84.54896 169.65990 378871.91292
## std.dev coef.var
## 615.52572 1.43536
The minimum amount of RRH_Beds in a state or territory is 0. The maximum amount of RRH_Beds in a state or territory is 3,380. The range is 3,380. This means there is a very large variation across states where one state has zero RRH_Beds while others have up to 3,380. The total sum of RRH_Beds throughout the US is 22,728. The median number of RRH_Beds per state is 234. The average number of RRH_Bedss per state is ~429. The standard error mean is 84.55 which is closer to the true average across all states. The standard deviation is 615.53 which indicates a very high variability across states. It looks like some states are clear outliers that are driving the mean high and further away from the median.
sum(is.na(vet_data$RRH_Beds))
## [1] 0
There are no NA’s. There is 1 zero value, but it is a data point.
#hist(vet_data$Unsheltered, main="Histogram of Unsheltered Veterans", xlab= "Unsheltered Veteran Counts", breaks =10, probability =T)
#lines(density(vet_data$Unsheltered), col='red', lwd=2)
vet_data_long <- vet_data |>
dplyr::select(Unsheltered, CoC_Count, Vet_Beds_Shelt, ES_Beds, SH_Beds, TH_Beds, PSH_Beds, RRH_Beds) |>
pivot_longer(cols = everything(),
names_to = "Variable",
values_to = "Value")
ggplot(vet_data_long, aes(x = Value)) +
geom_histogram(aes(y = after_stat(density)),
bins = 10,
fill = "skyblue",
color = "black") +
geom_density(color = "red", linewidth = 1) +
facet_wrap(~Variable, scales = "free") +
labs(title = "Histograms of Veteran Data Variables",
x = "Value",
y = "Density") +
theme_minimal()
vet_data_sqrt <- vet_data |> dplyr::mutate(Unsheltered_SQRT = sqrt(Unsheltered), Vet_Beds_Shelt_SQRT = sqrt(Vet_Beds_Shelt), CoC_SQRT = sqrt(CoC_Count), ES_SQRT = sqrt(ES_Beds), SH_SQRT = sqrt(SH_Beds), TH_SQRT = sqrt(TH_Beds), PSH_SQRT = sqrt(PSH_Beds), RRH_SQRT = sqrt(RRH_Beds))
head(vet_data_sqrt)
## # A tibble: 6 × 19
## State CoC_Count Homeless_Vets Sheltered Unsheltered Vet_Beds_Shelt ES_Beds
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AK 2 106 87 19 12 0
## 2 AL 9 291 191 100 153 80
## 3 AR 5 226 144 82 74 55
## 4 AZ 3 994 677 317 513 54
## 5 CA 44 9310 2900 6410 3004 668
## 6 CO 4 978 538 440 299 11
## # ℹ 12 more variables: TH_Beds <dbl>, SH_Beds <dbl>, RRH_Beds <dbl>,
## # PSH_Beds <dbl>, Unsheltered_SQRT <dbl>, Vet_Beds_Shelt_SQRT <dbl>,
## # CoC_SQRT <dbl>, ES_SQRT <dbl>, SH_SQRT <dbl>, TH_SQRT <dbl>,
## # PSH_SQRT <dbl>, RRH_SQRT <dbl>
I used the square root transformations because it can be applied to values of zero and my variables are count variables. Square root transformations help to reduce skewness and stabilize variances.
# Select transformed variables
vet_data_sqrt_long <- vet_data_sqrt |>
dplyr::select(ends_with("_SQRT")) |>
pivot_longer(cols = everything(),
names_to = "Variable",
values_to = "Value")
# Create facet wrapped histograms
ggplot(vet_data_sqrt_long, aes(x = Value)) +
geom_histogram(aes(y = after_stat(density)),
bins = 10,
fill = "skyblue",
color = "black") +
geom_density(color = "red", linewidth = 1) +
facet_wrap(~Variable, scales = "free") +
labs(
title = "Histograms of Square Root Transformed Variables",
x = "Square Root Transformed Values",
y = "Density"
) +
theme_minimal()
TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS TESTABLE HYPOTHESIS
T Test to test the hypothesis… Doesn’t work with continuous variables i guess?
# Testing the hypothesis
#t.test(Unsheltered~Vet_Beds_Shelt, data=vet_data)
# We're asking are there differences in Unsheltered veteran counts by dedicated veteran beds? W
Anova Model This is basically asking if the Unsheltered vet count is impacted by Dedicated vet beds
anova_model<-aov(Unsheltered~Vet_Beds_Shelt,data=vet_data)
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## Vet_Beds_Shelt 1 34292908 34292908 244.3 <2e-16 ***
## Residuals 51 7157913 140351
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The p-value is extrememly low, so the null hypothesis (there is no difference) can be rejected.
Basic linear model
vets_lm<-lm(Unsheltered~Vet_Beds_Shelt, data=vet_data)
#lm stands for linear model
#We added a few variables, 'am', 'disp', 'wt'
summary(vets_lm)
##
## Call:
## lm(formula = Unsheltered ~ Vet_Beds_Shelt, data = vet_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1013.7 -306.0 117.6 226.8 1186.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -299.8476 62.7053 -4.782 1.52e-05 ***
## Vet_Beds_Shelt 1.8386 0.1176 15.631 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 374.6 on 51 degrees of freedom
## Multiple R-squared: 0.8273, Adjusted R-squared: 0.8239
## F-statistic: 244.3 on 1 and 51 DF, p-value: < 2.2e-16
Dedicated veteran beds explain about 82% of unsheltered veteran numbers. The p-value is very low meaning it is significant, thereby disproving the null hypothesis that there is no effect on the unsheltered veteran count by dedicated veteran beds.