Homelessness to 2023

Import Data

remove(list = ls())
 
cat("\f") 
#getwd() 
library(stargazer)

Please cite as: 
 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
library(ggplot2) 
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(visdat)
library(psych)

Attaching package: 'psych'
The following objects are masked from 'package:ggplot2':

    %+%, alpha
library(headliner)
library(conflicted)
library(readr)
library(tidyr)
library(readxl)
library(openxlsx)
library(reshape2)

data_main <- read_excel("~/Downloads/DATA-PIT-PowerQuery (3).xlsx")

data_ca <- data_main %>% dplyr::filter(substr(`CoC Number`, 1, 2) == "CA")

data_ny <- data_main %>% dplyr::filter(substr(`CoC Number`, 1, 2) == "NY")

data_ms <- data_main %>% dplyr::filter(substr(`CoC Number`, 1, 2) == "MS")

data_wy <- data_main %>% dplyr::filter(substr(`CoC Number`, 1, 2) == "WY")

California

Filtering Data: Replacing 0’s with NA’s then Removing Columns

data_ca[data_ca == 0] <- NA
threshold <- 0.4
na_proportion_cali <- colMeans(is.na(data_ca))
data_ca_clean <- data_ca[, na_proportion_cali <= threshold]

vis_dat(data_ca)

vis_dat(data_ca_clean)

total_na1 <- sum(is.na(data_ca_clean))
total_na_original_ca <- sum(is.na(data_ca))

print(paste("Total Number of NA Values:", total_na_original_ca))
[1] "Total Number of NA Values: 301402"
print(paste("Total Number of NA Values:", total_na1))
[1] "Total Number of NA Values: 3918"

Finding + Inputting Medians

#Step 1: Caluclate Median Values by Year excluding Year
median_by_year_cali <- data_ca_clean %>% 
  group_by(Year) %>%
  summarise(across(where(is.numeric) & !matches("Year"),
                   ~median(.x, na.rm = TRUE))) %>%
  ungroup()

#Step 2: Rename Columns for Median Values 
median_by_year_cali <- median_by_year_cali %>% 
  rename_with(~paste0(.,"_median"), -Year)

#Step 3: Join Median Values Back to the Original Dataset

Cali_Data_with_medians <- data_ca_clean %>% 
  left_join(median_by_year_cali, by = "Year")

#Step 4: Replace NA Values with Corresponding Median Values, excluding Year and Number_Of_Cocs

Cali_Filled <- Cali_Data_with_medians %>% 
 mutate(across(where(is.numeric) & !matches("Year|Number_Of_Cocs|_median"), 
                  ~if_else(is.na(.), 
                           if (paste0(cur_column(), "_median") %in% names(Cali_Data_with_medians)) 
                               get(paste0(cur_column(), "_median")) 
                           else ., 
                           .)))

# Step 5: Remove Median Columns (Optional)
Cali_Filled <- Cali_Filled %>% select(-ends_with("_median"))

# Optional: Visualize the dataset with NAs replaced
na_proportion_cali2 <- colMeans(is.na(Cali_Filled))
Cali_Filled_Clean <- Cali_Filled[, na_proportion_cali2 <= threshold]

vis_dat(Cali_Filled_Clean)

total_na2 <- sum(is.na(Cali_Filled_Clean))
print(paste("Total Number of NA Values:", total_na2))
[1] "Total Number of NA Values: 3398"

Separating Into 2 Data Sets (Clean vs NA’s)

cali_no_na_columns <- colSums(is.na(Cali_Filled_Clean)) == 0 

cali_no_na <- Cali_Filled_Clean[, cali_no_na_columns]

cali_with_na <- Cali_Filled_Clean[, c("Year", names(Cali_Filled_Clean)[!cali_no_na_columns])]

#Took Out the Year 2021 because it had too many NA's 
cali_2021_filtered <- subset(cali_with_na, Year != 2021)

# Count the number of NA values in each row (excluding 'year' column)
cali_2021_filtered$na_count <- apply(cali_2021_filtered[ , -which(names(cali_2021_filtered) == "Year")], 1, function(row) sum(is.na(row)))

# Filter rows with NA values
rows_with_na <- cali_2021_filtered$na_count > 0
cali_rows_na <- cali_2021_filtered[rows_with_na, ]

# Aggregate the NA counts by year
na_summary <- aggregate(na_count ~ Year, data = cali_rows_na, sum)

# Print the NA summary by year
print("Total NA values by year:")
[1] "Total NA values by year:"
print(na_summary) 
  Year na_count
1 2007      600
2 2008      600
3 2009      560
4 2010      574
5 2011      246
6 2012      246
threshold <- 0.3 
na_proportion_cali3 <- colMeans(is.na(cali_2021_filtered))
cali_final <- cali_2021_filtered[, na_proportion_cali3 <= threshold]

vis_dat(cali_no_na)

vis_dat(cali_with_na)

vis_dat(cali_2021_filtered)

vis_dat(cali_final)

total_na3 <- sum(is.na(cali_no_na))
print(paste("Total Number of NA Values:", total_na3))
[1] "Total Number of NA Values: 0"
total_na4 <- sum(is.na(cali_with_na))
print(paste("Total Number of NA Values:", total_na4))
[1] "Total Number of NA Values: 3398"
total_na5 <- sum(is.na(cali_2021_filtered))
print(paste("Total Number of NA Values:", total_na5))
[1] "Total Number of NA Values: 2826"
total_na6 <- sum(is.na(cali_final))
print(paste("Total Number of NA Values:", total_na6))
[1] "Total Number of NA Values: 1368"

Splitting Data Set Again

cali_final_no_na_columns <- colSums(is.na(cali_final)) == 0 

cali_final_clean2 <- cali_final[, cali_final_no_na_columns]

cali_final_with_na <- cali_final[, c("Year", names(cali_final)[!cali_final_no_na_columns])]

cali_final_clean3 <- na.omit(cali_final_with_na)

cali_final_clean1 <- cali_no_na

vis_dat(cali_final_clean1)

vis_dat(cali_final_clean2)

vis_dat(cali_final_clean3)

Removing Unnecessary Files

rm(na_summary, cali_2021_filtered, Cali_Filled, Cali_Filled_Clean, cali_final_with_na, cali_rows_na, cali_with_na, cali_final, cali_no_na, data_ca, data_ca_clean, Cali_Data_with_medians)

New York

Filtering Data: Replacing 0’s with NA’s then Removing Columns

data_ny[data_ny == 0] <- NA
threshold <- 0.4
na_proportion_ny <- colMeans(is.na(data_ny))
data_ny_clean <- data_ny[, na_proportion_ny <= threshold]

vis_dat(data_ny)

vis_dat(data_ny_clean)

total_na10 <- sum(is.na(data_ny_clean))
total_na_original_ny <- sum(is.na(data_ny))

print(paste("Total Number of NA Values:", total_na_original_ny))
[1] "Total Number of NA Values: 204278"
print(paste("Total Number of NA Values:", total_na10))
[1] "Total Number of NA Values: 2194"

Find + Inputting Medians

#Step 1: Caluclate Median Values by Year excluding Year
median_by_year_ny <- data_ny_clean %>% 
  group_by(Year) %>%
  summarise(across(where(is.numeric) & !matches("Year"),
                   ~median(.x, na.rm = TRUE))) %>%
  ungroup()

#Step 2: Rename Columns for Median Values 
median_by_year_ny <- median_by_year_ny %>% 
  rename_with(~paste0(.,"_median"), -Year)

#Step 3: Join Median Values Back to the Original Dataset

NY_Data_with_medians <- data_ny_clean %>% 
  left_join(median_by_year_ny, by = "Year")

#Step 4: Replace NA Values with Corresponding Median Values, excluding Year and Number_Of_Cocs

NY_Filled <- NY_Data_with_medians %>% 
 mutate(across(where(is.numeric) & !matches("Year|Number_Of_Cocs|_median"), 
                  ~if_else(is.na(.), 
                           if (paste0(cur_column(), "_median") %in% names(NY_Data_with_medians)) 
                               get(paste0(cur_column(), "_median")) 
                           else ., 
                           .)))

# Step 5: Remove Median Columns (Optional)
NY_Filled <- NY_Filled %>% select(-ends_with("_median"))

# Optional: Visualize the dataset with NAs replaced
na_proportion_NY2 <- colMeans(is.na(NY_Filled))
NY_Filled_Clean <- NY_Filled[, na_proportion_NY2 <= threshold]

vis_dat(NY_Filled_Clean)

total_na11 <- sum(is.na(NY_Filled_Clean))
print(paste("Total Number of NA Values:", total_na11))
[1] "Total Number of NA Values: 1404"

Separating Into 2 Data Sets (Clean vs NA’s)

NY_no_na_columns <- colSums(is.na(NY_Filled_Clean)) == 0 

NY_no_na <- NY_Filled_Clean[, NY_no_na_columns]

NY_with_na <- NY_Filled_Clean[, c("Year", names(NY_Filled_Clean)[!NY_no_na_columns])]

NY_final1 <- NY_no_na

vis_dat(NY_final1)

vis_dat(NY_with_na)

NY_2021_filtered <- NY_with_na
# Count the number of NA values in each row (excluding 'year' column)
NY_2021_filtered$na_count <- apply(NY_2021_filtered[ , -which(names(NY_2021_filtered) == "Year")], 1, function(row) sum(is.na(row)))

# Filter rows with NA values
rows_with_na2 <- NY_2021_filtered$na_count > 0
NY_rows_na <- NY_2021_filtered[rows_with_na2, ]

# Aggregate the NA counts by year
na_summary2 <- aggregate(na_count ~ Year, data = NY_rows_na, sum)

# Print the NA summary by year
print("Total NA values by year:")
[1] "Total NA values by year:"
print(na_summary2) 
  Year na_count
1 2007      260
2 2008      250
3 2009      243
4 2010      243
5 2011      108
6 2012      108
7 2021      192
threshold <- 0.3 
na_proportion_NY12 <- colMeans(is.na(NY_2021_filtered))
NY_split <- NY_2021_filtered[, na_proportion_NY12 <= threshold]

total_na13 <- sum(is.na(NY_split))
print(paste("Total Number of NA Values:", total_na13))
[1] "Total Number of NA Values: 768"
vis_dat(NY_split)

Splitting NY_Split into Two Data Sets

split_column <- "PIT-Unsheltered Chronically Homeless Individuals"
shared_column <- "Year"

split_index <- which(names(NY_split) == split_column)

data_left <- NY_split[, 1:(split_index - 1)]
data_right <- NY_split[, (split_index + 1):ncol(NY_split)]

data_right$na_count <- data_left$Year
data_right <- data_right %>%
  rename(Year = na_count)

data_left <- subset(data_left, `Year` != 2021)
data_right <- subset(data_right, !(`Year` %in% c(2021, 2007, 2008, 2009, 2010)))

NY_final2 <- data_left
NY_final3 <- data_right

vis_dat(NY_final1)

vis_dat(NY_final2)

vis_dat(NY_final3)

Removing Unnecessary Files

rm(data_ny, data_ny_clean, na_summary2, NY_2021_filtered, NY_Data_with_medians, NY_Filled, NY_Filled_Clean, NY_no_na, NY_rows_na, NY_split, NY_with_na, data_right, data_left)

Wyoming

Filtering Data: Replacing 0’s with NA’s then Removing Columns

data_wy[data_wy == 0] <- NA
threshold <- 0.4
na_proportion_wy <- colMeans(is.na(data_wy))
data_wy_clean <- data_wy[, na_proportion_wy <= threshold]

vis_dat(data_wy)

vis_dat(data_wy_clean)

total_na20 <- sum(is.na(data_wy_clean))
total_na_original_wy <- sum(is.na(data_wy))

print(paste("Total Number of NA Values:", total_na_original_wy))
[1] "Total Number of NA Values: 7366"
print(paste("Total Number of NA Values:", total_na20))
[1] "Total Number of NA Values: 84"
print("There is no effect on the data when medians are swapped with NA values. No NA values can be replaced, hence there is no use in running that code.")
[1] "There is no effect on the data when medians are swapped with NA values. No NA values can be replaced, hence there is no use in running that code."

Cleaning and Splitting the Data

wy_2021_filtered <- subset(data_wy_clean, Year != 2021)

vis_dat(wy_2021_filtered)

wy_col_with_na <- colnames(wy_2021_filtered)[apply(wy_2021_filtered, 2, function(col) any(is.na(col)))]

wy_col_without_na <- colnames(wy_2021_filtered)[!apply(wy_2021_filtered, 2, function(col) any(is.na(col)))]

if ("Year" %in% colnames(wy_2021_filtered)) {
  wy_col_with_na <- unique(c(wy_col_with_na, "Year"))
  wy_col_without_na <- unique(c(wy_col_without_na, "Year"))
}

wy_without_na <- wy_2021_filtered[, wy_col_without_na, drop = FALSE]
wy_with_na <- wy_2021_filtered[, wy_col_with_na, drop = FALSE]

#wy_with_na <- subset(wy_with_na, Year != (2007))

wy_with_na <- subset(wy_with_na, !(`Year` %in% c(2007, 2008, 2009, 2010)))

column_remove <- colnames(wy_with_na)[colSums(is.na(wy_with_na)) >1]

wy_with_na <- wy_with_na[, !(colnames(wy_with_na) %in% column_remove)]

vet_median <- median(
  wy_with_na$`PIT-Unsheltered Homeless Veterans`, na.rm = TRUE)

wy_with_na$`PIT-Unsheltered Homeless Veterans`[is.na(wy_with_na$`PIT-Unsheltered Homeless Veterans`)] <- vet_median


wy_final1 <- wy_without_na
wy_final2 <- wy_with_na

vis_dat(wy_final1)

vis_dat(wy_final2)

Removing Unnecessary Files

rm(data_wy_clean, wy_2021_filtered, wy_with_na, wy_without_na, data_wy)

Mississippi

Filtering Data: Replacing 0’s with NA’s then Removing Columns

data_ms[data_ms == 0] <- NA
threshold <- 0.4
na_proportion_ms <- colMeans(is.na(data_ms))
data_ms_clean <- data_ms[, na_proportion_ms <= threshold]

vis_dat(data_ms)

vis_dat(data_ms_clean)

total_na30 <- sum(is.na(data_ms_clean))
total_na_original_ms <- sum(is.na(data_ms))

print(paste("Total Number of NA Values:", total_na_original_ms))
[1] "Total Number of NA Values: 24044"
print(paste("Total Number of NA Values:", total_na30))
[1] "Total Number of NA Values: 246"

Finding and Inputting Medians

#Step 1: Caluclate Median Values by Year excluding Year
median_by_year_ms <- data_ms_clean %>% 
  group_by(Year) %>%
  summarise(across(where(is.numeric) & !matches("Year"),
                   ~median(.x, na.rm = TRUE))) %>%
  ungroup()

#Step 2: Rename Columns for Median Values 
median_by_year_ms <- median_by_year_ms %>% 
  rename_with(~paste0(.,"_median"), -Year)

#Step 3: Join Median Values Back to the Original Dataset

ms_Data_with_medians <- data_ms_clean %>% 
  left_join(median_by_year_ms, by = "Year")

#Step 4: Replace NA Values with Corresponding Median Values, excluding Year and Number_Of_Cocs

ms_Filled <- ms_Data_with_medians %>% 
 mutate(across(where(is.numeric) & !matches("Year|Number_Of_Cocs|_median"), 
                  ~if_else(is.na(.), 
                           if (paste0(cur_column(), "_median") %in% names(ms_Data_with_medians)) 
                               get(paste0(cur_column(), "_median")) 
                           else ., 
                           .)))

# Step 5: Remove Median Columns (Optional)
ms_Filled <- ms_Filled %>% select(-ends_with("_median"))

# Optional: Visualize the dataset with NAs replaced
na_proportion_ms2 <- colMeans(is.na(ms_Filled))
ms_Filled_Clean <- ms_Filled[, na_proportion_ms2 <= threshold]

vis_dat(ms_Filled_Clean)

total_na41 <- sum(is.na(ms_Filled_Clean))
print(paste("Total Number of NA Values:", total_na41))
[1] "Total Number of NA Values: 192"

Cleaning and Splitting the Data

ms_2021_filtered <- subset(ms_Filled_Clean, Year != 2021)

vis_dat(ms_2021_filtered)

ms_col_with_na <- colnames(ms_2021_filtered)[apply(ms_2021_filtered, 2, function(col) any(is.na(col)))]

ms_col_without_na <- colnames(ms_2021_filtered)[!apply(ms_2021_filtered, 2, function(col) any(is.na(col)))]

if ("Year" %in% colnames(ms_2021_filtered)) {
  ms_col_with_na <- unique(c(ms_col_with_na, "Year"))
  ms_col_without_na <- unique(c(ms_col_without_na, "Year"))
}

ms_without_na <- ms_2021_filtered[, ms_col_without_na, drop = FALSE]
ms_with_na <- ms_2021_filtered[, ms_col_with_na, drop = FALSE]

ms_with_na <- subset(ms_with_na, !(`Year` %in% c(2007, 2008, 2009, 2010)))

vis_dat(ms_without_na)

vis_dat(ms_with_na)

ms_col_with_na2 <- colnames(ms_with_na)[apply(ms_with_na, 2, function(col) any(is.na(col)))]

ms_col_without_na2 <- colnames(ms_with_na)[!apply(ms_with_na, 2, function(col) any(is.na(col)))]

if ("Year" %in% colnames(ms_with_na)) {
  ms_col_with_na2 <- unique(c(ms_col_with_na2, "Year"))
  ms_col_without_na2 <- unique(c(ms_col_without_na2, "Year"))
}

ms_without_na2 <- ms_with_na[, ms_col_without_na2, drop = FALSE]
ms_with_na2 <- ms_with_na[, ms_col_with_na2, drop = FALSE]

ms_with_na2 <- subset(ms_with_na2, !(`Year` %in% c(2011, 2012)))

vis_dat(ms_without_na2)

vis_dat(ms_with_na2)

ms_final1 <- ms_without_na
ms_final2 <- ms_without_na2
ms_final3 <- ms_with_na2

vis_dat(ms_final1)

vis_dat(ms_final2)

vis_dat(ms_final3)

Removing Unnecessary Files

rm(ms_without_na, ms_without_na2, ms_with_na, ms_with_na2, ms_Filled, ms_Filled_Clean, ms_2021_filtered, ms_Data_with_medians, data_ms, data_ms_clean)

Last Round of Removing Files

rm(data_main, median_by_year_cali, median_by_year_ms, median_by_year_ny)

Creating Visuals

Package Installation

library(ggcorrplot)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     
library(corrplot)
corrplot 0.92 loaded
library(viridisLite)
library(viridis)

Removing Unwanted Columns (California)

#cali1

columns_keep <- grep("ES|TH", names(cali_final_clean1), invert = TRUE)

cali_final_clean1 <- cali_final_clean1[, columns_keep]

cali_final_clean1$Year <- as.numeric(as.character(cali_final_clean1$Year))

numeric_columns <- sapply(cali_final_clean1, is.numeric)

cali_final_clean1 <- cali_final_clean1[, numeric_columns]

#cali2

columns_keep2 <- grep("ES|TH", names(cali_final_clean2), invert = TRUE)

cali_final_clean2 <- cali_final_clean2[, columns_keep2]

cali_final_clean2 <- cali_final_clean2[, !names(cali_final_clean2) %in% "na_count"]

cali_final_clean2$Year <- as.numeric(as.character(cali_final_clean2$Year))

#cali3

columns_keep3 <- grep("ES|TH", names(cali_final_clean3), invert = TRUE)

cali_final_clean3 <- cali_final_clean3[, columns_keep3]

cali_final_clean3$Year <- as.numeric(as.character(cali_final_clean3$Year))

Calculating Yearly Totals for California

cali_total1 <- cali_final_clean1 %>%
  group_by(Year) %>%
  summarise(across(everything(), \(x) sum(x, na.rm = TRUE)))

cali_total2 <- cali_final_clean2 %>%
  group_by(Year) %>%
  summarise(across(everything(), \(x) sum(x, na.rm = TRUE)))

cali_total3 <- cali_final_clean3 %>%
  group_by(Year) %>%
  summarise(across(everything(), \(x) sum(x, na.rm = TRUE)))

Correlation Matrices (California)

cali_matrix1 <- cor(cali_final_clean1)

ggcorrplot(corr = cali_matrix1, 
           method = "square",
           tl.cex = 7,
             lab = TRUE,
           lab_size = 2,
           colors = c("purple", "white","darkgreen"), 
           type = "lower")

cali_total_matrix1 <- cor(cali_total1)

ggcorrplot(corr = cali_total_matrix1, 
          method = "square",
           tl.cex = 7,
            lab = TRUE,
           lab_size = 2,
           colors = c("purple", "white","darkgreen"), 
           type = "lower")

###

cali_matrix2 <- cor(cali_final_clean2)

ggcorrplot(corr = cali_matrix2, 
           method = "square",
           colors = c("blue", "white", "red"),
           type = "lower",
             lab = TRUE,
           lab_size = 2,
           tl.cex = 7)

cali_total_matrix2 <- cor(cali_total2)

ggcorrplot(corr = cali_total_matrix2, 
          method = "square",
           tl.cex = 7,
            lab = TRUE,
           lab_size = 2,
           colors = c("blue", "white", "red"),
           type = "lower")

###

cali_matrix3 <- cor(cali_final_clean3)


ggcorrplot(corr = cali_matrix3,
           method = "square",
             lab = TRUE,
           lab_size = 2,
          colors = c("orange", "white", "blue"),
        type = "lower",
           tl.cex = 7)

cali_total_matrix3 <- cor(cali_total3)

ggcorrplot(corr = cali_total_matrix3, 
          method = "square",
           tl.cex = 7,
            lab = TRUE,
           lab_size = 2,
           colors = c("orange", "white", "blue"), 
           type = "lower")

Removing Unwanted Columns (Mississippi)

#MS 1

columns_keep4 <- grep("ES|TH", names(ms_final1), invert = TRUE)

ms_final1 <- ms_final1[, columns_keep4]

ms_final1$Year <- as.numeric(as.character(ms_final1$Year))

numeric_columns2 <- sapply(ms_final1, is.numeric)

ms_final1 <- ms_final1[, numeric_columns2]

#MS 2

columns_keep5 <- grep("ES|TH", names(ms_final2), invert = TRUE)

ms_final2 <- ms_final2[, columns_keep5]

ms_final2 <- ms_final2[, !names(ms_final2) %in% "na_count"]

ms_final2$Year <- as.numeric(as.character(ms_final2$Year))

#MS 3

columns_keep6 <- grep("ES|TH", names(ms_final3), invert = TRUE)

ms_final3 <- ms_final3[, columns_keep6]

ms_final3 <- ms_final3[, !names(ms_final3) %in% "na_count"]

ms_final3$Year <- as.numeric(as.character(ms_final3$Year))

Calculating Yearly Totals for Mississippi

ms_total_final1 <- ms_final1 %>%
  group_by(Year) %>%
  summarise(across(everything(), \(x) sum(x, na.rm = TRUE)))

ms_total_final2 <- ms_final2 %>%
  group_by(Year) %>%
  summarise(across(everything(), \(x) sum(x, na.rm = TRUE)))

ms_total_final3 <- ms_final3 %>%
  group_by(Year) %>%
  summarise(across(everything(), \(x) sum(x, na.rm = TRUE)))

Correlation Matrices (Mississippi)

ms_matrix1 <- cor(ms_final1)

ggcorrplot(corr = ms_matrix1, 
           method = "square",
           tl.cex = 7,
             lab = TRUE,
           lab_size = 2,
           type = "lower",
           colors = c("purple", "white","darkgreen"))

ms_total_matrix1 <- cor(ms_total_final1)

ggcorrplot(corr = ms_total_matrix1, 
           method = "square",
           tl.cex = 7,
             lab = TRUE,
           lab_size = 2,
           type = "lower",
           colors = c("purple", "white","darkgreen"))

###

ms_final2 <- ms_final2 %>% select(Year, everything())

ms_matrix2 <- cor(ms_final2)

ggcorrplot(corr = ms_matrix2, 
           method = "square",
              type = "lower",
             lab = TRUE,
           lab_size = 2,
             colors = c("blue", "white", "red"),
           tl.cex = 7)

ms_total_matrix2 <- cor(ms_total_final2)

ggcorrplot(corr = ms_total_matrix2, 
           method = "square",
           tl.cex = 7,
             lab = TRUE,
           lab_size = 2,
           type = "lower",
             colors = c("blue", "white", "red"))

###

ms_final3 <- ms_final3 %>% select(Year, everything())

ms_matrix3 <- cor(ms_final3)

ggcorrplot(corr = ms_matrix3, 
           method = "square",
           type = "lower",
             lab = TRUE,
           lab_size = 5,
          colors = c("orange", "white", "blue"),
           tl.cex = 7)

ms_total_matrix3 <- cor(ms_total_final3)

ggcorrplot(corr = ms_total_matrix3, 
           method = "square",
           tl.cex = 7,
             lab = TRUE,
           lab_size = 5,
           type = "lower",
            colors = c("orange", "white", "blue"))

Removing Unwanted Columns (New York)

#NY 1

columns_keep7 <- grep("ES|TH", names(NY_final1), invert = TRUE)

NY_final1 <- NY_final1[, columns_keep7]

NY_final1$Year <- as.numeric(as.character(NY_final1$Year))

numeric_columns3 <- sapply(NY_final1, is.numeric)

NY_final1 <- NY_final1[, numeric_columns3]

#NY 2

columns_keep8 <- grep("ES|TH", names(NY_final2), invert = TRUE)

NY_final2 <- NY_final2[, columns_keep8]

NY_final2 <- NY_final2[, !names(NY_final2) %in% "na_count"]

NY_final2$Year <- as.numeric(as.character(NY_final2$Year))

#NY 3

columns_keep9 <- grep("ES|TH", names(NY_final3), invert = TRUE)

NY_final3 <- NY_final3[, columns_keep9]

NY_final3 <- NY_final3[, !names(NY_final3) %in% "na_count"]

NY_final3$Year <- as.numeric(as.character(NY_final3$Year))

Calculating Yearly Totals for Mississippi

ny_total_final1 <- NY_final1 %>%
  group_by(Year) %>%
  summarise(across(everything(), \(x) sum(x, na.rm = TRUE)))

ny_total_final2 <- NY_final2 %>%
  group_by(Year) %>%
  summarise(across(everything(), \(x) sum(x, na.rm = TRUE)))

ny_total_final3 <- NY_final3 %>%
  group_by(Year) %>%
  summarise(across(everything(), \(x) sum(x, na.rm = TRUE)))

Correlation Matrices (New York)

ny_matrix1 <- cor(NY_final1)

ggcorrplot(corr = ny_matrix1, 
           method = "square",
           tl.cex = 7,
             lab = TRUE,
           lab_size = 2,
               type = "lower",
           colors = c("purple", "white","darkgreen"))

ny_total_matrix1 <- cor(ny_total_final1)

ggcorrplot(corr = ny_total_matrix1, 
           method = "square",
           tl.cex = 7,
             lab = TRUE,
           lab_size = 2,
           type = "lower",
           colors = c("purple", "white","darkgreen"))

###

ny_matrix2 <- cor(NY_final2)

ggcorrplot(corr = ny_matrix2, 
           method = "square",
               type = "lower",
             lab = TRUE,
           lab_size = 2,
             colors = c("blue", "white", "red"),
           tl.cex = 7)

ny_total_matrix2 <- cor(ny_total_final2)

ggcorrplot(corr = ny_total_matrix2, 
           method = "square",
           tl.cex = 7,
             lab = TRUE,
           lab_size = 2,
           type = "lower",
             colors = c("blue", "white", "red"))

###

NY_final3 <- NY_final3 %>% select(Year, everything())

ny_matrix3 <- cor(NY_final3)

ggcorrplot(corr = ny_matrix3, 
           method = "square",
               type = "lower",
             lab = TRUE,
           lab_size = 4,
          colors = c("orange", "white", "blue"),
           tl.cex = 7)

ny_total_matrix3 <- cor(ny_total_final3)

ggcorrplot(corr = ny_total_matrix3, 
           method = "square",
           tl.cex = 7,
             lab = TRUE,
           lab_size = 4,
           type = "lower",
            colors = c("orange", "white", "blue"))

Removing Unwanted Columns (Wyoming)

#WY 1

columns_keep10 <- grep("ES|TH", names(wy_final1), invert = TRUE)

wy_final1 <- wy_final1[, columns_keep10]

wy_final1$Year <- as.numeric(as.character(wy_final1$Year))

numeric_columns3 <- sapply(wy_final1, is.numeric)

wy_final1 <- wy_final1[, numeric_columns3]

#WY 2

columns_keep11 <- grep("ES|TH", names(wy_final2), invert = TRUE)

wy_final2 <- wy_final2[, columns_keep11]

wy_final2 <- wy_final2[, !names(wy_final2) %in% "na_count"]

wy_final2$Year <- as.numeric(as.character(wy_final2$Year))

Calculating Yearly Totals for Wyoming

wy_total_final1 <- wy_final1 %>%
  group_by(Year) %>%
  summarise(across(everything(), \(x) sum(x, na.rm = TRUE)))

wy_total_final2 <- wy_final2 %>%
  group_by(Year) %>%
  summarise(across(everything(), \(x) sum(x, na.rm = TRUE)))

Correlation Matrices (Wyoming)

wy_matrix1 <- cor(wy_final1)

ggcorrplot(corr = wy_matrix1, 
           method = "square",
           tl.cex = 7,
             lab = TRUE,
           lab_size = 2,
               type = "lower",
           colors = c("purple", "white","darkgreen"))

wy_total_matrix1 <- cor(wy_total_final1)

ggcorrplot(corr = wy_total_matrix1, 
           method = "square",
           tl.cex = 7,
             lab = TRUE,
           lab_size = 2,
           type = "lower",
           colors = c("purple", "white","darkgreen"))

###

wy_final2 <- wy_final2 %>% select(Year, everything())

wy_matrix2 <- cor(wy_final2)

ggcorrplot(corr = wy_matrix2, 
           method = "square",
               type = "lower",
             lab = TRUE,
           lab_size = 2,
            colors = c("blue", "white", "red"),
           tl.cex = 7)

wy_total_matrix2 <- cor(wy_total_final2)

ggcorrplot(corr = wy_total_matrix2, 
           method = "square",
           tl.cex = 7,
           type = "lower",
           lab = TRUE,
           lab_size = 2,
             colors = c("blue", "white", "red"))

Importing Other Data

#ALL-Transactions House Price Index - based on Index 1980: Q1 = 100 (Not seasonally adjussted )
CA_HPI <- read.csv("~/Downloads/CASTHPI (2).csv")
NY_HPI <- read.csv("~/Downloads/NYSTHPI.csv")
MS_HPI <- read.csv("~/Downloads/MSSTHPI.csv")
WY_HPI <- read.csv("~/Downloads/WYSTHPI.csv")

#Zillow Home Value Index - It reflects the typical value for homes in the 35th to 65th percentile range. - Smoothed, Seasonally Adjusted 

zillow_hvi <- read.csv("~/Downloads/State_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")

#Median Household Income  

CA_medianhousehold <- read.csv("~/Downloads/MEHOINUSCAA646N.csv")
NY_medianhousehold <- read.csv("~/Downloads/MEHOINUSNYA646N.csv")
MS_medianhousehold <- read.csv("~/Downloads/MEHOINUSMSA646N.csv")
WY_medianhousehold <- read.csv("~/Downloads/MEHOINUSWYA646N.csv")

#Yearly Inflation Rate

US_Yearly_Inflation_Rate <- read_excel("~/Downloads/US Yearly Inflation Rate.xlsx")
US_Yearly_Inflation_Rate <- na.omit(US_Yearly_Inflation_Rate)

Cleaning Data

#Zillow HVI
zillow_hvi <- zillow_hvi[, !names(zillow_hvi) %in% c("RegionID", "SizeRank", "StateName", "RegionType")]

zillow_hvi <- zillow_hvi[grep("California|Mississippi|Wyoming|New York", zillow_hvi$RegionName), ]

zillow_hvi <- zillow_hvi[, colSums(is.na(zillow_hvi)) == 0]

Averaging Data in Zillow_hvi + Splitting the Data

#Averaging the Data 

states_of_interest <- c("California", "Wyoming", "Mississippi", "New York")

filtered_zillow <- zillow_hvi %>% dplyr::filter(RegionName %in% states_of_interest)

long_zillow <- filtered_zillow %>% pivot_longer(cols = starts_with("X20"), 
            names_to = "Date",
            values_to = "Price")

long_zillow <- long_zillow %>% 
  mutate(Date = sub("^X", "", Date)) %>% 
  mutate(Year = as.numeric(substr(Date, 1, 4)))

zillow_yearly_averages <- long_zillow %>%
  group_by(RegionName, Year) %>%
  summarize(Yearly_Average = mean(Price, na.rm = TRUE))
`summarise()` has grouped output by 'RegionName'. You can override using the
`.groups` argument.
zillow_yearly_averages$RegionName <- as.character(zillow_yearly_averages$RegionName)

#Splitting the Data by State 

WY_Zillow <- zillow_yearly_averages %>% dplyr::filter(RegionName == "Wyoming")
NY_Zillow <- zillow_yearly_averages %>% dplyr::filter(RegionName == "New York")
MS_Zillow <- zillow_yearly_averages %>% dplyr::filter(RegionName == "Mississippi")
CA_Zillow <- zillow_yearly_averages %>% dplyr::filter(RegionName == "California")

Removing Unnecessary Datasets

rm(zillow_hvi, long_zillow, filtered_zillow, zillow_yearly_averages)

VisDats

vis_dat(CA_HPI)

vis_dat(NY_HPI)

vis_dat(MS_HPI)

vis_dat(WY_HPI)

vis_dat(CA_medianhousehold)

vis_dat(NY_medianhousehold)

vis_dat(MS_medianhousehold)

vis_dat(WY_medianhousehold)

vis_dat(US_Yearly_Inflation_Rate)

vis_dat(WY_Zillow)

vis_dat(NY_Zillow)

vis_dat(CA_Zillow)

vis_dat(MS_Zillow)

Merging Data

#Last bit of cleaning

CA_Zillow <- CA_Zillow[, !names(CA_Zillow) %in% c("RegionName")]

CA_HPI <- CA_HPI %>% rename(Year = DATE)

#HPI 

CA_HPI$Year <- as.character(CA_HPI$Year)

CA_HPI$Year <- substr(CA_HPI$Year, 1, 4)

CA_HPI$Year <- as.numeric(CA_HPI$Year)

#Median

CA_medianhousehold <- CA_medianhousehold %>% rename(Year = DATE)
CA_medianhousehold <- CA_medianhousehold %>% rename(Median_Household_Income = MEHOINUSCAA646N)

CA_medianhousehold$Year <- as.character(CA_medianhousehold$Year)

CA_medianhousehold$Year <- substr(CA_medianhousehold$Year, 1, 4)

CA_medianhousehold$Year <- as.numeric(CA_medianhousehold$Year)


CA_Zillow <- CA_Zillow %>% rename(Yearly_Average_HVI = Yearly_Average)

#Merging Cali_Totals and CA_Zillow 

ca_zillow_and_cali_total1 <- merge(cali_total1, CA_Zillow, by = "Year")
ca_zillow_and_cali_total2 <- merge(cali_total2, CA_Zillow, by = "Year")
ca_zillow_and_cali_total3 <- merge(cali_total3, CA_Zillow, by = "Year")

#Merging Cali_Totals and CA_HPI

ca_hpi_and_cali_total1 <- merge(cali_total1, CA_HPI, by = "Year")
ca_hpi_and_cali_total2 <- merge(cali_total2, CA_HPI, by = "Year")
ca_hpi_and_cali_total3 <- merge(cali_total3, CA_HPI, by = "Year")

#Merging Cali_Totals and CA_Median

ca_medianhouse_and_cali_total1 <- merge(cali_total1, CA_medianhousehold, by = "Year")
ca_medianhouse_and_cali_total2 <- merge(cali_total2, CA_medianhousehold, by = "Year")
ca_medianhouse_and_cali_total3 <- merge(cali_total3, CA_medianhousehold, by = "Year")

#Merging Cali_Totals and US Inflation 

inflation_and_cali_total1 <- merge(cali_total1, US_Yearly_Inflation_Rate, by = "Year")
inflation_and_cali_total2 <- merge(cali_total2, US_Yearly_Inflation_Rate, by = "Year")
inflation_and_cali_total3 <- merge(cali_total3, US_Yearly_Inflation_Rate, by = "Year")

Correlations Between Datasets for Zillow HVI (California)

cali_correlations1 <- sapply(ca_zillow_and_cali_total1, function(x) cor(x, ca_zillow_and_cali_total1$Yearly_Average_HVI, use = "complete.obs"))

calicor_df1 <- data.frame(Column1 = names(cali_correlations1), Correlation1 = cali_correlations1)

calicor_df1 <- calicor_df1[calicor_df1$Column1 != "Yearly_Average_HVI", ]

ggplot(calicor_df1, aes(x = reorder(Column1, -Correlation1), y = Correlation1, fill = Correlation1)) + geom_bar(stat = "identity") +
  coord_flip() + 
  scale_fill_gradient(low = "lightgray", high = "red") +
  geom_text(aes(label = round(Correlation1, 2)), 
            vjust = 0.5, 
            hjust = -0.1) +
  labs(title = "Correlation with HVI Yearly Average", 
       x = "Columns", 
       y = "Correlation") + theme_minimal() + 
  ylim(-0.1,1)

##########################

cali_correlations2 <- sapply(ca_zillow_and_cali_total2, function(x) cor(x, ca_zillow_and_cali_total2$Yearly_Average_HVI, use = "complete.obs"))

calicor_df2 <- data.frame(Column2 = names(cali_correlations2), Correlation2 = cali_correlations2)

calicor_df2 <- calicor_df2[calicor_df2$Column2 != "Yearly_Average_HVI", ]

ggplot(calicor_df2, aes(x = reorder(Column2, -Correlation2), y = Correlation2, fill = Correlation2)) + geom_bar(stat = "identity") +
  coord_flip() + 
  scale_fill_gradient(low = "lightgray", high = "purple") +
  geom_text(aes(label = round(Correlation2, 2)), 
            vjust = 0.2, 
            hjust = 0, 
            size = 3) +
  labs(title = "Correlation with HVI Yearly Average", 
       x = "Columns", 
       y = "Correlation") + theme_minimal() +
 ylim(-0.5,1)

##########################

cali_correlations3 <- sapply(ca_zillow_and_cali_total3, function(x) cor(x, ca_zillow_and_cali_total3$Yearly_Average_HVI, use = "complete.obs"))

calicor_df3 <- data.frame(Column3 = names(cali_correlations3), Correlation3 = cali_correlations3)

calicor_df3 <- calicor_df3[calicor_df3$Column3 != "Yearly_Average_HVI", ]

ggplot(calicor_df3, aes(x = reorder(Column3, -Correlation3), y = Correlation3, fill = Correlation3)) + geom_bar(stat = "identity") +
  coord_flip() + 
  scale_fill_gradient(low = "lightgray", high = "green") +
  geom_text(aes(label = round(Correlation3, 2)), 
            vjust = 0.2, 
            hjust = 0, 
            size = 3) +
  labs(title = "Correlation with HVI Yearly Average", 
       x = "Columns", 
       y = "Correlation") + theme_minimal()

Correlations Between Datasets for Median Household Income (California)

cali_correlations4 <- sapply(ca_medianhouse_and_cali_total1, function(x) cor(x, ca_medianhouse_and_cali_total1$Median_Household_Income, use = "complete.obs"))

calicor_df4 <- data.frame(Column4 = names(cali_correlations4), Correlation4 = cali_correlations4)

calicor_df4 <- calicor_df4[calicor_df4$Column4 != "Median_Household_Income", ]

ggplot(calicor_df4, aes(x = reorder(Column4, -Correlation4), y = Correlation4, fill = Correlation4)) + geom_bar(stat = "identity") +
  coord_flip() + 
  scale_fill_gradient(low = "lightgray", high = "red") +
  geom_text(aes(label = round(Correlation4, 2)), 
            vjust = 0.5, 
            hjust = -0.1) +
  labs(title = "Correlation with Median Household Income", 
       x = "Columns", 
       y = "Correlation") + theme_minimal() 

##########################

cali_correlations5 <- sapply(ca_medianhouse_and_cali_total2, function(x) cor(x, ca_medianhouse_and_cali_total2$Median_Household_Income, use = "complete.obs"))

calicor_df5 <- data.frame(Column5 = names(cali_correlations5), Correlation5 = cali_correlations5)

calicor_df5 <- calicor_df5[calicor_df5$Column5 != "Median_Household_Income", ]

ggplot(calicor_df5, aes(x = reorder(Column5, -Correlation5), y = Correlation5, fill = Correlation5)) + geom_bar(stat = "identity") +
  coord_flip() + 
  scale_fill_gradient(low = "lightgray", high = "purple") +
  geom_text(aes(label = round(Correlation5, 2)), 
            vjust = 0.5, 
            hjust = -0.1) +
  labs(title = "Correlation with Median Household Income", 
       x = "Columns", 
       y = "Correlation") + theme_minimal()

##########################

cali_correlations6 <- sapply(ca_medianhouse_and_cali_total3, function(x) cor(x, ca_medianhouse_and_cali_total3$Median_Household_Income, use = "complete.obs"))

calicor_df6 <- data.frame(Column6 = names(cali_correlations6), Correlation6 = cali_correlations6)

calicor_df6 <- calicor_df6[calicor_df6$Column6 != "Median_Household_Income", ]

ggplot(calicor_df6, aes(x = reorder(Column6, -Correlation6), y = Correlation6, fill = Correlation6)) + geom_bar(stat = "identity") +
  coord_flip() + 
  scale_fill_gradient(low = "lightgray", high = "green") +
  geom_text(aes(label = round(Correlation6, 2)), 
            vjust = 0.5, 
            hjust = -0.1) +
  labs(title = "Correlation with Median Household Income", 
       x = "Columns", 
       y = "Correlation") + theme_minimal() 

Correlations Between Datasets (New York)

Correlations Between Datasets (Mississippi)

Correlations Between Datasets (Wyoming)

Garbage

#################

cali_correlations2_test <- sapply(ca_zillow_and_cali_total2[ , names(ca_zillow_and_cali_total2) != "Yearly_Average_HVI"], function(x) cor(ca_zillow_and_cali_total2$Yearly_Average_HVI, x, use = "complete.obs"))
                                  
calicor_df2test <- data.frame(Column2test = names(cali_correlations2_test), Correlation2test = cali_correlations2_test)

#calicor_df2test <- calicor_df2test[calicor_df2test$Column2test != "Yearly_Average_HVI", ]

ggplot(calicor_df2test, aes(x = reorder(Column2test, -Correlation2test), y = Correlation2test, fill = Correlation2test)) + geom_bar(stat = "identity") +
  coord_flip() + 
  scale_fill_gradient(low = "lightgray", high = "purple") +
  geom_text(aes(label = round(Correlation2test, 2)), 
            vjust = 0.2, 
            hjust = 0, 
            size = 3) +
  labs(title = "Correlation with HVI Yearly Average", 
       x = "Columns", 
       y = "Correlation") + theme_minimal()