Sourcing & Cleaning House Price & Income Data # 1

# Home data is sourced from Zillow database for "For-Sale" houses

# Get working directory
wd <- getwd()

# Create data path
data_dir <- file.path(wd, "Project") 

# Read "For-Sale" home prices by month from Zillow
zillow_data <- read_csv(file.path(data_dir,"Median_List_Price_Monthly_Smooth_All_Homes.csv"),show_col_types=F)

# Step 1: Clean data (remove NA states)
zillow_cleaned <- zillow_data %>%
  filter(!is.na(StateName))

# Step 2: Pivot from wide to long (dates were stored as separate col)
zillow_long <- zillow_cleaned %>%
  pivot_longer(
    cols = matches("^\\d{4}-\\d{2}-\\d{2}$"),  # selects all columns that look like dates
    names_to = "Date",
    values_to = "MedianPrice"
  ) %>%
  mutate(Date = as.Date(Date))

# Step 3: Restrict date range to [2022, 2024]
zillow_filtered <- zillow_long %>%
  filter(format(Date, "%Y") >= 2022 & format(Date, "%Y") < 2025)

# Step 4: Take median price by year
zillow_by_year_state <- zillow_filtered %>%
  mutate(Year = year(Date)) %>%
  group_by(StateName, Year) %>%
  summarise(Median_Yearly_Price = median(MedianPrice, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'StateName'. You can override using the
## `.groups` argument.
# Income data is sourced by Bureau of Labor Statistics, utilizing code 00-0000 which refers to All Occupations

# Read the Excel file
Income_2022 <- read_excel(file.path(getwd(), "Project", "all_data_M_2022.xlsx"))
Income_2023 <- read_excel(file.path(getwd(), "Project", "all_data_M_2023.xlsx"))
Income_2024 <- read_excel(file.path(getwd(), "Project", "all_data_M_2024.xlsx"))

# Filter the data
Income_2022_data <- Income_2022 %>%
  filter(OCC_CODE == "00-0000") %>%
  mutate(AREA = as.numeric(AREA)) %>%
  filter(AREA >= 1, AREA <= 56) #2022
Income_2022_data <- Income_2022_data %>%
  filter(AREA_TITLE != "District of Columbia")

Income_2023_data <- Income_2023 %>%
  filter(OCC_CODE == "00-0000") %>%
  mutate(AREA = as.numeric(AREA)) %>%
  filter(AREA >= 1, AREA <= 56) #2023
Income_2023_data <- Income_2023_data %>%
  filter(AREA_TITLE != "District of Columbia")

Income_2024_data <- Income_2024 %>%
  filter(OCC_CODE == "00-0000") %>% #00-0000 All Occupations
  mutate(AREA = as.numeric(AREA)) %>%
  filter(AREA >= 1, AREA <= 56) #2024
Income_2024_data <- Income_2024_data %>%
  filter(AREA_TITLE != "District of Columbia")

Merging House Price & Income Data # 2

# Select relevant cols from income data
income_2022_clean <- Income_2022_data %>%
  select(StateName = PRIM_STATE, Median_Income = A_MEDIAN) %>%
  mutate(Year = 2022)

income_2023_clean <- Income_2023_data %>%
  select(StateName = PRIM_STATE, Median_Income = A_MEDIAN) %>%
  mutate(Year = 2023)

income_2024_clean <- Income_2024_data %>%
  select(StateName = PRIM_STATE, Median_Income = A_MEDIAN) %>%
  mutate(Year = 2024)

# Combine income dataset
all_income <- bind_rows(income_2022_clean, income_2023_clean, income_2024_clean)

# Combine median state income and median home income at state level
affordability_data <- left_join(zillow_by_year_state, all_income, by = c("StateName", "Year")) #2024 CO will have NA for Median Income

Calculating Affordability Ratio # 3

# Convert to numerical
affordability_data <- affordability_data %>%
  mutate(
    Median_Yearly_Price = as.numeric(Median_Yearly_Price),
    Median_Income = as.numeric(Median_Income)
  )

# Use CO 2023 income as a baseline for CO 2024 income
co_2023_income <- affordability_data$Median_Income[affordability_data$StateName == "CO" & affordability_data$Year == 2023]

# Assign it to 2024
affordability_data$Median_Income[affordability_data$StateName == "CO" & affordability_data$Year == 2024] <- co_2023_income

# Affordability Ratio 
affordability_data <- affordability_data %>%
  mutate(Affordability_Ratio = Median_Yearly_Price / Median_Income)

# Get map data as a data.frame
states_map <- map_data("state")

# Match state names
affordability_data_clean <- affordability_data %>%
  mutate(region = tolower(state.name[match(StateName, state.abb)]))

State Ranking by Affordability Ratio # 4

# Group and rank state by affordability ratio
ranked_data <- affordability_data_clean %>%
  select(StateName, region, Year, Median_Income, Median_Yearly_Price, Affordability_Ratio) %>%
  arrange(Year, Affordability_Ratio) %>%
  group_by(Year) %>%
  mutate(Rank = row_number()) %>%
  ungroup()

# @parm:  Dataset used for filtering
# @parm:  Year to filter by
# Generates a table graphic of rankings of state AR based on year. Ref -> https://r-graph-gallery.com/package/gt.html
create_affordability_gt_table <- function(data, year) {
  data %>%
    filter(Year == year) %>%
    select(Rank, region, StateName, Median_Income, Median_Yearly_Price, Affordability_Ratio) %>%
    gt() %>%
    tab_header(
      title = "Housing Affordability Rankings by State",
      subtitle = paste("Year:", year, "โ€” Lower Ratio Means More Affordable")
    ) %>%
    fmt_currency(
      columns = c(Median_Income, Median_Yearly_Price),
      currency = "USD"
    ) %>%
    fmt_number(
      columns = Affordability_Ratio,
      decimals = 2
    ) %>%
    cols_label(
      region = "State (Full)",
      StateName = "State (Abbrev)",
      Median_Income = "Median Income",
      Median_Yearly_Price = "Median House Price",
      Affordability_Ratio = "Affordability Ratio"
    )
}

create_affordability_gt_table(ranked_data, 2022)
Housing Affordability Rankings by State
Year: 2022 โ€” Lower Ratio Means More Affordable
Rank State (Full) State (Abbrev) Median Income Median House Price Affordability Ratio
1 illinois IL $47,480.00 $130,891.50 2.76
2 new york NY $52,470.00 $190,958.00 3.64
3 kansas KS $41,870.00 $152,783.00 3.65
4 ohio OH $44,750.00 $171,300.00 3.83
5 iowa IA $44,350.00 $171,567.00 3.87
6 pennsylvania PA $45,790.00 $189,483.00 4.14
7 michigan MI $45,500.00 $191,758.50 4.21
8 west virginia WV $37,770.00 $161,125.00 4.27
9 indiana IN $42,100.00 $187,700.00 4.46
10 missouri MO $42,310.00 $190,108.00 4.49
11 nebraska NE $44,100.00 $212,358.50 4.82
12 oklahoma OK $39,100.00 $190,165.00 4.86
13 arkansas AR $37,270.00 $184,566.50 4.95
14 minnesota MN $48,760.00 $244,117.00 5.01
15 wisconsin WI $45,650.00 $231,608.50 5.07
16 north dakota ND $47,410.00 $247,333.00 5.22
17 louisiana LA $38,970.00 $207,566.50 5.33
18 kentucky KY $40,180.00 $217,067.00 5.40
19 georgia GA $42,890.00 $234,550.00 5.47
20 mississippi MS $36,100.00 $197,783.00 5.48
21 texas TX $43,460.00 $247,933.50 5.70
22 maryland MD $51,420.00 $294,608.00 5.73
23 alabama AL $38,470.00 $221,130.00 5.75
24 new jersey NJ $51,080.00 $300,300.00 5.88
25 virginia VA $48,290.00 $287,566.50 5.95
26 north carolina NC $41,810.00 $255,133.00 6.10
27 maine ME $45,420.00 $284,225.00 6.26
28 connecticut CT $51,780.00 $324,933.50 6.28
29 new mexico NM $39,900.00 $271,750.00 6.81
30 south dakota SD $39,870.00 $280,100.00 7.03
31 south carolina SC $38,870.00 $274,316.50 7.06
32 tennessee TN $39,930.00 $293,816.50 7.36
33 delaware DE $47,150.00 $348,691.50 7.40
34 alaska AK $52,000.00 $386,483.00 7.43
35 vermont VT $47,320.00 $357,083.00 7.55
36 wyoming WY $45,450.00 $349,408.50 7.69
37 new hampshire NH $47,920.00 $371,300.00 7.75
38 arizona AZ $45,290.00 $371,667.00 8.21
39 washington WA $56,320.00 $463,290.00 8.23
40 florida FL $40,820.00 $349,983.50 8.57
41 rhode island RI $49,360.00 $431,450.00 8.74
42 massachusetts MA $58,450.00 $519,429.50 8.89
43 oregon OR $47,770.00 $463,667.00 9.71
44 california CA $49,740.00 $499,000.00 10.03
45 nevada NV $40,810.00 $433,300.00 10.62
46 idaho ID $40,060.00 $431,666.50 10.78
47 utah UT $44,470.00 $485,800.00 10.92
48 colorado CO $50,250.00 $557,698.00 11.10
49 montana MT $42,210.00 $570,558.00 13.52
50 hawaii HI $48,560.00 $931,083.50 19.17
create_affordability_gt_table(ranked_data, 2023)
Housing Affordability Rankings by State
Year: 2023 โ€” Lower Ratio Means More Affordable
Rank State (Full) State (Abbrev) Median Income Median House Price Affordability Ratio
1 illinois IL $48,730.00 $144,608.00 2.97
2 new york NY $56,840.00 $211,000.00 3.71
3 iowa IA $46,460.00 $189,366.50 4.08
4 kansas KS $45,250.00 $186,291.50 4.12
5 ohio OH $46,690.00 $192,950.00 4.13
6 pennsylvania PA $47,430.00 $203,650.00 4.29
7 west virginia WV $39,770.00 $173,533.50 4.36
8 michigan MI $46,940.00 $217,556.00 4.63
9 indiana IN $45,470.00 $214,967.00 4.73
10 missouri MO $45,080.00 $213,133.50 4.73
11 arkansas AR $39,060.00 $196,367.00 5.03
12 oklahoma OK $41,480.00 $209,533.50 5.05
13 north dakota ND $48,830.00 $257,191.50 5.27
14 nebraska NE $46,440.00 $245,591.50 5.29
15 louisiana LA $41,320.00 $221,292.00 5.36
16 mississippi MS $37,500.00 $202,383.00 5.40
17 minnesota MN $50,880.00 $276,358.50 5.43
18 kentucky KY $43,730.00 $238,666.50 5.46
19 wisconsin WI $47,590.00 $263,117.00 5.53
20 maryland MD $55,810.00 $309,494.50 5.55
21 georgia GA $45,480.00 $261,950.00 5.76
22 texas TX $45,970.00 $267,158.50 5.81
23 alabama AL $41,350.00 $247,291.50 5.98
24 connecticut CT $56,130.00 $353,267.00 6.29
25 virginia VA $49,920.00 $316,200.00 6.33
26 north carolina NC $45,440.00 $291,316.50 6.41
27 new mexico NM $43,620.00 $285,333.00 6.54
28 new jersey NJ $54,860.00 $368,973.00 6.73
29 maine ME $47,590.00 $325,925.00 6.85
30 south carolina SC $42,220.00 $293,529.00 6.95
31 south dakota SD $43,680.00 $304,833.50 6.98
32 alaska AK $56,140.00 $399,150.00 7.11
33 tennessee TN $43,820.00 $316,300.00 7.22
34 delaware DE $49,280.00 $388,150.00 7.88
35 wyoming WY $47,250.00 $374,800.00 7.93
36 washington WA $59,920.00 $478,458.00 7.98
37 florida FL $45,070.00 $365,000.00 8.10
38 arizona AZ $47,680.00 $386,499.50 8.11
39 vermont VT $49,630.00 $418,900.00 8.44
40 new hampshire NH $49,980.00 $426,800.00 8.54
41 rhode island RI $50,970.00 $471,450.00 9.25
42 nevada NV $44,810.00 $420,333.50 9.38
43 oregon OR $50,010.00 $472,000.00 9.44
44 california CA $54,030.00 $514,325.00 9.52
45 massachusetts MA $60,690.00 $581,116.50 9.58
46 idaho ID $44,240.00 $433,366.50 9.80
47 colorado CO $54,050.00 $591,000.00 10.93
48 utah UT $47,020.00 $516,465.00 10.98
49 montana MT $45,690.00 $544,983.00 11.93
50 hawaii HI $50,510.00 $1,027,833.00 20.35
create_affordability_gt_table(ranked_data, 2024)
Housing Affordability Rankings by State
Year: 2024 โ€” Lower Ratio Means More Affordable
Rank State (Full) State (Abbrev) Median Income Median House Price Affordability Ratio
1 illinois IL $50,000.00 $159,575.00 3.19
2 new york NY $58,560.00 $227,316.50 3.88
3 kansas KS $46,850.00 $197,067.00 4.21
4 ohio OH $48,060.00 $212,617.00 4.42
5 west virginia WV $43,320.00 $193,058.50 4.46
6 pennsylvania PA $48,550.00 $217,950.00 4.49
7 iowa IA $47,670.00 $215,083.50 4.51
8 missouri MO $46,390.00 $222,683.00 4.80
9 indiana IN $46,930.00 $228,116.50 4.86
10 oklahoma OK $43,950.00 $214,583.50 4.88
11 michigan MI $48,300.00 $241,783.50 5.01
12 louisiana LA $43,770.00 $226,975.00 5.19
13 mississippi MS $39,070.00 $205,558.50 5.26
14 arkansas AR $41,020.00 $220,658.50 5.38
15 nebraska NE $47,990.00 $262,154.00 5.46
16 maryland MD $58,050.00 $326,633.00 5.63
17 minnesota MN $53,810.00 $303,933.00 5.65
18 kentucky KY $45,740.00 $262,791.50 5.75
19 texas TX $47,500.00 $273,033.50 5.75
20 georgia GA $47,020.00 $270,650.00 5.76
21 alabama AL $43,830.00 $255,283.50 5.82
22 wisconsin WI $48,930.00 $287,908.50 5.88
23 north dakota ND $50,320.00 $308,133.50 6.12
24 new mexico NM $45,870.00 $282,500.00 6.16
25 virginia VA $53,020.00 $336,224.00 6.34
26 connecticut CT $58,400.00 $380,800.00 6.52
27 north carolina NC $46,950.00 $307,291.50 6.55
28 south carolina SC $44,760.00 $296,099.00 6.62
29 new jersey NJ $57,230.00 $381,450.00 6.67
30 maine ME $49,440.00 $342,300.00 6.92
31 south dakota SD $45,620.00 $317,466.50 6.96
32 tennessee TN $46,120.00 $327,900.00 7.11
33 alaska AK $59,400.00 $426,483.50 7.18
34 wyoming WY $49,160.00 $386,658.50 7.87
35 florida FL $46,860.00 $370,333.00 7.90
36 delaware DE $51,030.00 $407,450.00 7.98
37 washington WA $61,590.00 $493,745.50 8.02
38 arizona AZ $48,810.00 $395,000.00 8.09
39 vermont VT $52,410.00 $456,567.00 8.71
40 new hampshire NH $52,610.00 $461,783.50 8.78
41 oregon OR $53,390.00 $486,225.00 9.11
42 rhode island RI $54,040.00 $493,250.00 9.13
43 california CA $56,940.00 $520,916.50 9.15
44 nevada NV $46,440.00 $429,192.00 9.24
45 idaho ID $46,470.00 $442,275.00 9.52
46 massachusetts MA $62,270.00 $608,966.50 9.78
47 utah UT $48,600.00 $505,233.50 10.40
48 colorado CO $54,050.00 $594,683.00 11.00
49 montana MT $47,360.00 $530,675.00 11.21
50 hawaii HI $53,260.00 $939,483.50 17.64
# @parm:  Dataset used for filtering
# @parm:  Year to filter by
# Bar chart visual for state ranking
plot_affordability_bar <- function(data, year) {
  data %>%
    filter(Year == year) %>%
    arrange(Affordability_Ratio) %>%
    mutate(StateName = factor(StateName, levels = StateName)) %>%
    ggplot(aes(x = StateName, y = Affordability_Ratio, fill = Affordability_Ratio)) +
    geom_col() +
    coord_flip() +
    scale_fill_gradient(low = "#00441b", high = "#ccece6") +
    labs(
      title = paste("Housing Affordability by State in", year),
      subtitle = "Lower = More Affordable",
      x = "State",
      y = "Affordability Ratio"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 14, face = "bold"),
      axis.text.y = element_text(size = 8)
    )
}

plot_affordability_bar(ranked_data, 2022)

plot_affordability_bar(ranked_data, 2023)

plot_affordability_bar(ranked_data, 2024)

Visualizing Affordability Ratio & Change In Affordability Ratio Across State based on Year # 5

us_albers_crs <- sf::st_crs(5070)

# @parm:  Dataset used for filtering
# @parm:  Year to filter by
# Create function to save time & lines. Reference -> https://www.w3schools.com/r/r_functions.asp
plot_affordability_map <- function(data, year) {
  data_year <- data %>% filter(Year == year)
  map_data_year <- left_join(states_map, data_year, by = "region")

  ggplot(map_data_year, aes(long, lat, group = group, fill = Affordability_Ratio)) +
    geom_polygon(color = "white") +
    coord_sf(crs = us_albers_crs, default_crs = sf::st_crs(4326)) +
    scale_fill_gradient(low = "#ccece6", high = "#00441b", na.value = "grey90") +
    labs(
      title = paste("Affordability Ratio -", year),
      fill = "Affordability",
      caption = "Lower Ratio = More Affordable"
    ) +
    theme_minimal() +
    theme(plot.title = element_text(size = 12))
}

# Create the three yearly maps
map_2022 <- plot_affordability_map(affordability_data_clean, 2022)
map_2023 <- plot_affordability_map(affordability_data_clean, 2023)
map_2024 <- plot_affordability_map(affordability_data_clean, 2024)

# Display plots
plot(map_2022)

plot(map_2023)

plot(map_2024)

# Determine change between 2022-2024
affordability_wide <- affordability_data_clean %>%
  select(StateName, region, Year, Affordability_Ratio) %>%
  pivot_wider(
    names_from = Year,
    values_from = Affordability_Ratio,
    names_prefix = "Y"
  ) %>%
  filter(!is.na(Y2022) & !is.na(Y2023) & !is.na(Y2024))

# Calculate year-by-year changes and total change
affordability_change <- affordability_wide %>%
  mutate(
    Change_2022_2023 = Y2023 - Y2022,
    Change_2023_2024 = Y2024 - Y2023,
    Total_Change = Change_2022_2023 + Change_2023_2024
  )

# Join with spatial data
affordability_map_data <- left_join(states_map, affordability_change, by = "region")

# Plot the Total_Change map
change_map <- ggplot(affordability_map_data, aes(long, lat, group = group, fill = Total_Change)) +
  geom_polygon(color = "white") +
  coord_sf(crs = us_albers_crs, default_crs = sf::st_crs(4326)) +
  scale_fill_gradient2(
    low = "#1a9850",    # Green = improved affordability
    mid = "white",      # No change
    high = "#d73027",   # Red = worsened affordability
    midpoint = 0,
    name = "Change"
  ) +
  labs(
    title = "Affordability Ratio Change",
    caption = "Positive = Less Affordable; Negative = More Affordable"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(size = 12))

# Combine all 4 maps into a single figure
combined_maps <- map_2022 + map_2023 + map_2024 + change_map +
  plot_layout(ncol = 2) +  # Arrange the maps into two columns
  plot_annotation(
    title = "Housing Affordability by State (2022โ€“2024)",
    subtitle = "Including Change from 2022 to 2024"
  )

# Display the combined figure
combined_maps

# Bar chart visual of median change across the years
# Median Affordability Ratio across each year
median_ratio_by_year <- affordability_data_clean %>%
  group_by(Year) %>%
  summarize(Median_Affordability = median(Affordability_Ratio, na.rm = TRUE))

# Bar chart visualizing the national median Affordability Ratio trend
ggplot(median_ratio_by_year, aes(x = factor(Year), y = Median_Affordability)) +
  geom_col(fill = "#3182bd") +
  geom_text(
    aes(label = round(Median_Affordability, 2)),
    vjust = -0.5,
    size = 4
  ) +
  labs(
    title = "Median Housing Affordability Ratio by Year",
    subtitle = "Higher Ratio = Less Affordable Housing",
    x = "Year",
    y = "Median Affordability Ratio",
    caption = "Affordability = Median House Price รท Median Income"
  ) +
  theme_minimal()