library(tidyverse)
## Warning: package 'readr' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pastecs)
## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(dplyr)
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.3

Data Cleaning

# Location of the Excel workbook holding the 2023 veteran homelessness data
file_path <- "C:/Users/Administrator/Desktop/Graduate School/Applied Quant Methods/My Class Stuff/Data Project/Veteran Homelessness/2023 Homeless Veterans.xlsx"

# Load the "2023" sheet exactly as stored in the workbook
sheet_2 <- read_excel(file_path, sheet = "2023")

# Drop the row whose State is "Total" from the "2023" sheet
vet_data_sheet_2 <- filter(sheet_2, State != "Total")

# Load the "Change" sheet
change_sheet <- read_excel(file_path, sheet = "Change")

# Keep only State and the 2022-2023 change column
change_column_unclean <- select(
  change_sheet,
  State,
  "Change in Veteran Homelessness, 2022-2023"
)

# Drop the row whose State is "Total" from the "Change" data as well
change_column <- filter(change_column_unclean, State != "Total")

# Attach the change column to the 2023 data, matching rows on State
vet_data_long_variable_names <- left_join(
  vet_data_sheet_2,
  change_column,
  by = "State"
)

# Shorten the long spreadsheet headers to analysis-friendly names.
# NOTE: the change column keeps its original header; this rename was left
# disabled:
#   Rate_Change = "Change in Veteran Homelessness, 2022-2023"
vet_data <- vet_data_long_variable_names |>
  rename(
    CoCs = "Number of CoCs",
    ES_count = "Sheltered ES Homeless Veterans",
    TH_count = "Sheltered TH Homeless Veterans",
    SH_count = "Sheltered SH Homeless Veterans",
    Sheltered = "Sheltered Total Homeless Veterans",
    Unsheltered = "Unsheltered Homeless Veterans"
  )

Step 1

From the data you have chosen, select a variable that you are interested in. I am interested in the TH_count variable, which is the count of sheltered homeless veterans living in transitional housing programs.

Step 2

Use pastecs::stat.desc to describe the variable. Include a few sentences about what the variable is and what it’s measuring. Remember to load pastecs “library(pastecs)”

# Descriptive statistics for the transitional-housing count column
vet_data |>
  pull(TH_count) |>
  pastecs::stat.desc()
##      nbr.val     nbr.null       nbr.na          min          max        range 
##    54.000000     3.000000     0.000000     0.000000  1397.000000  1397.000000 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
##  8959.000000    85.000000   165.907407    30.442133    61.059186 50043.066737 
##      std.dev     coef.var 
##   223.703077     1.348361

The variable being described here is called TH_count. TH_count measures the total number of homeless veterans per state who are sheltered at a transitional housing program. Transitional housing programs provide people experiencing homelessness a place to stay combined with supportive services for up to 24 months.

The minimum number of homeless veterans living in a transitional housing program in a US state is 0. The maximum number of homeless veterans living in a transitional housing program in a US state is 1397. The range is therefore 1397 (from 0 to 1397). This means there is very large variation across states: some states have zero veterans in transitional housing while others have over 1000. The total number of homeless veterans living in transitional housing programs throughout the US is 8959. The median number of homeless veterans living in transitional housing programs per state is 85. The average number of homeless veterans living in transitional housing programs per state is ~166. The standard error of the mean is 30.44; it measures how precisely the mean is estimated (the 95% confidence interval is roughly 166 ± 61), not how far individual states are from the average. The standard deviation is 224, which is larger than the mean itself and indicates very high variability across states. Because the mean (~166) is almost twice the median (85), it looks like a few states are clear outliers that are pulling the mean upward and away from the median.

Step 3

Remove NAs if needed using dplyr::filter (or anything similar)

# Count the missing values in TH_count
vet_data |>
  pull(TH_count) |>
  is.na() |>
  sum()
## [1] 0

There are no NAs, so nothing needs to be removed. There are 3 zero values, but those are valid data points rather than missing values.

Step 4

Provide a histogram of the variable (as shown in this lesson)

# Histogram of TH_count on the density scale, so the kernel density curve
# drawn by lines() shares the same y-axis.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
# breaks = 10 is only a suggested number of bins; hist() may pick another.
hist(vet_data$TH_count, breaks = 10, probability = TRUE)
lines(density(vet_data$TH_count), col = "red", lwd = 2)

Step 5

Transform the variable using the log transformation or square root transformation (whatever is more appropriate) using dplyr::mutate or something similar

# Build a two-column table: the raw count and its square root
vet_data_sqrt <- vet_data |>
  select(TH_count) |>
  mutate(TH_SQRT = sqrt(TH_count))

# Preview the first rows of the transformed data
head(vet_data_sqrt)
## # A tibble: 6 × 2
##   TH_count TH_SQRT
##      <dbl>   <dbl>
## 1       17    4.12
## 2       56    7.48
## 3       38    6.16
## 4      337   18.4 
## 5     1397   37.4 
## 6      182   13.5

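I used the square root transformation rather than the log transformation because TH_count contains zeros: log(0) is undefined (R returns -Inf), whereas sqrt(0) = 0, so the square root can be applied to every value.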

Step 6

Provide a histogram of the transformed variable

# Histogram of the square-root-transformed count on the density scale, with a
# kernel density curve overlaid on the same y-axis.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
# breaks = 10 is only a suggested number of bins; hist() may pick another.
hist(vet_data_sqrt$TH_SQRT, breaks = 10, probability = TRUE)
lines(density(vet_data_sqrt$TH_SQRT), col = "red", lwd = 2)