California Median Housing Bivariate Analysis

This R notebook will explore the correlation between Median House Values and Median Income in the state of California.

The dataset was downloaded from Kaggle: https://www.kaggle.com/datasets/camnugent/california-housing-prices

This notebook produces several maps. The most interesting is the median house value to income bivariate map.


library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(sf)
## Linking to GEOS 3.13.1, GDAL 3.11.4, PROJ 9.7.0; sf_use_s2() is TRUE
library(tmap)

Read the data and normalize the median income to full dollars.

housing.data <- read.csv(file = "california_housing.csv")
housing.data$median_income <- housing.data$median_income * 10000
head(housing.data, 10)
##    longitude latitude housing_median_age total_rooms total_bedrooms population
## 1    -122.23    37.88                 41         880            129        322
## 2    -122.22    37.86                 21        7099           1106       2401
## 3    -122.24    37.85                 52        1467            190        496
## 4    -122.25    37.85                 52        1274            235        558
## 5    -122.25    37.85                 52        1627            280        565
## 6    -122.25    37.85                 52         919            213        413
## 7    -122.25    37.84                 52        2535            489       1094
## 8    -122.25    37.84                 52        3104            687       1157
## 9    -122.26    37.84                 42        2555            665       1206
## 10   -122.25    37.84                 52        3549            707       1551
##    households median_income median_house_value ocean_proximity
## 1         126         83252             452600        NEAR BAY
## 2        1138         83014             358500        NEAR BAY
## 3         177         72574             352100        NEAR BAY
## 4         219         56431             341300        NEAR BAY
## 5         259         38462             342200        NEAR BAY
## 6         193         40368             269700        NEAR BAY
## 7         514         36591             299200        NEAR BAY
## 8         647         31200             241400        NEAR BAY
## 9         595         20804             226700        NEAR BAY
## 10        714         36912             261100        NEAR BAY
count(housing.data)
##       n
## 1 20640

Plot the median income against the median house value and show the linear relationship between the two variables.

ggplot(housing.data, aes(x = median_income, y = median_house_value)) +
    geom_point(col = "royalblue") +
    geom_smooth(formula = "y ~ x", method = "glm", col = "coral") +
    scale_y_continuous(labels = label_number(big.mark = ",")) +
    scale_x_continuous(labels = label_number(big.mark = ",")) +
    labs(title = "Median Income vs Median House Value",
         x = "Median Income (USD)",
         y = "Median House Value (USD)") +
    theme_bw()

Plot the distribution of the median house value variable.

ggplot(housing.data, aes(x = median_house_value)) +
    geom_histogram(stat = "bin", bins = 30, fill = "royalblue", col = "navy") +
    scale_y_continuous(labels = label_number(big.mark = ",")) +
    scale_x_continuous(labels = label_number(big.mark = ",")) +
    labs(title = "Histogram of Median House Value",
         x = "Median House Value (USD)",
         y = "Frequency") +
    theme_bw()

Plot the distribution of the median income variable.

ggplot(housing.data, aes(x = median_income)) +
    geom_histogram(stat = "bin", bins = 30, fill = "royalblue", col = "navy") +
    scale_y_continuous(labels = label_number(big.mark = ",")) +
    scale_x_continuous(labels = label_number(big.mark = ",")) +
    labs(title = "Histogram of Median Income",
         x = "Median Income (USD)",
         y = "Frequency") +
    theme_bw()

Discover the count of housing samples by proximity to the ocean.

chart.data <- housing.data %>%
    group_by(ocean_proximity) %>%
    summarise(house_count = n()) %>%
    arrange(house_count)

chart.data
## # A tibble: 5 × 2
##   ocean_proximity house_count
##   <chr>                 <int>
## 1 ISLAND                    5
## 2 NEAR BAY               2290
## 3 NEAR OCEAN             2658
## 4 INLAND                 6551
## 5 <1H OCEAN              9136
ggplot(chart.data, aes(x = as.factor(ocean_proximity), y = house_count)) +
    geom_bar(stat = "identity", fill = "royalblue", col = "navy") +
    scale_y_continuous(labels = label_number(big.mark = ",")) +
    labs(title = "Count of Houses by Ocean Proximity",
         x = "Ocean Proximity", y = "Count") +
    theme_bw()

Plot the distribution of the median house value by proximity to the ocean.

ggplot(housing.data, aes(x = as.factor(ocean_proximity), y = median_house_value)) +
    geom_boxplot(fill = "royalblue", col = "navy", staplewidth = 0.5) +
    scale_y_continuous(labels = label_number(big.mark = ",")) +
    labs(title = "Distribution of Median House Value by Ocean Proximity",
         x = "Ocean Proximity", y = "Median House Value (USD)") +
    theme_bw()

Create a Spatial DataFrame from the location variables in the housing dataset.

housing.geodata <- st_as_sf(housing.data, 
                            coords = c("longitude", "latitude"), 
                            crs = 4326
                    )

head(housing.geodata, 5)
## Simple feature collection with 5 features and 8 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -122.25 ymin: 37.85 xmax: -122.22 ymax: 37.88
## Geodetic CRS:  WGS 84
##   housing_median_age total_rooms total_bedrooms population households
## 1                 41         880            129        322        126
## 2                 21        7099           1106       2401       1138
## 3                 52        1467            190        496        177
## 4                 52        1274            235        558        219
## 5                 52        1627            280        565        259
##   median_income median_house_value ocean_proximity              geometry
## 1         83252             452600        NEAR BAY POINT (-122.23 37.88)
## 2         83014             358500        NEAR BAY POINT (-122.22 37.86)
## 3         72574             352100        NEAR BAY POINT (-122.24 37.85)
## 4         56431             341300        NEAR BAY POINT (-122.25 37.85)
## 5         38462             342200        NEAR BAY POINT (-122.25 37.85)

What is the spatial distribution of Median House values across the state of California?

map.data <- housing.geodata %>% arrange(median_house_value)

tm_shape(map.data) +
    tm_dots(fill = "median_house_value", 
            fill.scale = tm_scale_intervals(
                n = 5, 
                style = "fisher", 
                values = "plasma"
            ),
            fill.legend = tm_legend(
                "Value (USD)",
                position = tm_pos_in("right", "top")
            ),
            size = 0.25) +
    tm_title("Median House Value in California")

What is the ratio of median house value and median income?

housing.geodata$median_ratio <- round(
    housing.geodata$median_house_value / housing.geodata$median_income, 2
)

subset(
    housing.geodata, 
    select = c("median_house_value", "median_income", "median_ratio")
)
## Simple feature collection with 20640 features and 3 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -124.35 ymin: 32.54 xmax: -114.31 ymax: 41.95
## Geodetic CRS:  WGS 84
## First 10 features:
##    median_house_value median_income median_ratio              geometry
## 1              452600         83252         5.44 POINT (-122.23 37.88)
## 2              358500         83014         4.32 POINT (-122.22 37.86)
## 3              352100         72574         4.85 POINT (-122.24 37.85)
## 4              341300         56431         6.05 POINT (-122.25 37.85)
## 5              342200         38462         8.90 POINT (-122.25 37.85)
## 6              269700         40368         6.68 POINT (-122.25 37.85)
## 7              299200         36591         8.18 POINT (-122.25 37.84)
## 8              241400         31200         7.74 POINT (-122.25 37.84)
## 9              226700         20804        10.90 POINT (-122.26 37.84)
## 10             261100         36912         7.07 POINT (-122.25 37.84)

What is the distribution of the median house value to income ratio?

ggplot(housing.geodata, aes(x = median_ratio)) +
    geom_histogram(stat = "bin", bins = 50, fill = "royalblue", col = "navy") +
    scale_y_continuous(labels = label_number(big.mark = ",")) +
    scale_x_continuous(labels = label_number(big.mark = ",")) +
    labs(title = "Histogram of Median House Value to Income Ratio",
         x = "Percentage of Income",
         y = "Frequency") +
    theme_bw()

Describe the distribution of the median ratio.

quantile(housing.geodata$median_ratio)
##     0%    25%    50%    75%   100% 
##   0.36   3.86   4.95   6.59 100.02

Map the spatial distribution of median ratio.

map.data <- housing.geodata %>% arrange(median_ratio)

tm_shape(map.data) +
    tm_dots(fill = "median_ratio", 
            fill.scale = tm_scale_intervals(
                n = 5, style = "quantile", values = "plasma"
            ),
            fill.legend = tm_legend(
                "",
                position = tm_pos_in("right", "top")
            ),
            size = 0.25) +
    tm_title("Median House Value to Income Ratio", size = 1)


This section will perform a hexagonal grid density operation and map the results.

# Convert housing dataset to EPSG 3857

hse.data <- st_transform(housing.geodata, 3857)

# Make a hexagonal grid
grid.data <- st_make_grid(
                hse.data, 
                cellsize = 15000,
                square = FALSE
            ) %>%
    st_sf(.)

# Find cells which intersect with housing locations
i <- st_intersects(grid.data, hse.data)

# Filter grid data for only pertinent cells
grid.data <- grid.data[lengths(i) > 0, ]
grid.data
## Simple feature collection with 1402 features and 0 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -13850080 ymin: 3825744 xmax: -12717580 ymax: 5168084
## Projected CRS: WGS 84 / Pseudo-Mercator
## First 10 features:
##                          geometry
## 1  POLYGON ((-13842579 4942917...
## 2  POLYGON ((-13842579 5124782...
## 3  POLYGON ((-13835079 4903946...
## 4  POLYGON ((-13835079 4955908...
## 5  POLYGON ((-13835079 5111792...
## 6  POLYGON ((-13827579 4942917...
## 7  POLYGON ((-13827579 4968898...
## 8  POLYGON ((-13827579 4994879...
## 9  POLYGON ((-13827579 5124782...
## 10 POLYGON ((-13820079 4929927...
# Intersect housing dataset with grid cells ...

inter.data <- st_intersects(hse.data, grid.data) %>%
    as.data.frame()

colnames(inter.data) <- c("id", "geo.id")

inter.data <- inter.data %>%
    group_by(id) %>%
    summarise(geo.id = min(geo.id))

# This is a list of each house sample record with the corresponding grid cell
head(inter.data, 50)
## # A tibble: 50 × 2
##       id geo.id
##    <int>  <int>
##  1     1    252
##  2     2    252
##  3     3    252
##  4     4    252
##  5     5    252
##  6     6    252
##  7     7    252
##  8     8    252
##  9     9    233
## 10    10    252
## # ℹ 40 more rows
# Assign geo_id value for each housing record (using index vector)

hse.data$geo_id <- inter.data$geo.id

head(hse.data[, c("median_house_value", "median_income", "geo_id")], 10)
## Simple feature collection with 10 features and 3 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -13609920 ymin: 4556848 xmax: -13605470 ymax: 4562488
## Projected CRS: WGS 84 / Pseudo-Mercator
##    median_house_value median_income geo_id                  geometry
## 1              452600         83252    252 POINT (-13606581 4562488)
## 2              358500         83014    252 POINT (-13605468 4559667)
## 3              352100         72574    252 POINT (-13607695 4558257)
## 4              341300         56431    252 POINT (-13608808 4558257)
## 5              342200         38462    252 POINT (-13608808 4558257)
## 6              269700         40368    252 POINT (-13608808 4558257)
## 7              299200         36591    252 POINT (-13608808 4556848)
## 8              241400         31200    252 POINT (-13608808 4556848)
## 9              226700         20804    233 POINT (-13609921 4556848)
## 10             261100         36912    252 POINT (-13608808 4556848)
# Compute the average median house value and median income for each grid cell
# Then compute the `bin` values using ntile()

cell.data <- as.data.frame(hse.data) %>%
    group_by(geo_id) %>%
    summarise(median_house_value = mean(median_house_value),
              median_income = mean(median_income)) %>%
    arrange(geo_id) %>%
    mutate(
        median_house_value = ntile(median_house_value, n = 3),
        median_income = ntile(median_income, n = 3)
    )

cell.data
## # A tibble: 1,402 × 3
##    geo_id median_house_value median_income
##     <int>              <int>         <int>
##  1      1                  2             2
##  2      2                  2             1
##  3      3                  1             1
##  4      4                  1             1
##  5      5                  1             1
##  6      6                  2             2
##  7      7                  1             1
##  8      8                  2             2
##  9      9                  1             1
## 10     10                  1             1
## # ℹ 1,392 more rows
# Assign the `bin` values back to the grid cell polygons

grid.data$income <- cell.data$median_income
grid.data$house_value <- cell.data$median_house_value

head(grid.data, 10)
## Simple feature collection with 10 features and 2 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -13850080 ymin: 4903946 xmax: -13812580 ymax: 5142103
## Projected CRS: WGS 84 / Pseudo-Mercator
##                          geometry income house_value
## 1  POLYGON ((-13842579 4942917...      2           2
## 2  POLYGON ((-13842579 5124782...      1           2
## 3  POLYGON ((-13835079 4903946...      1           1
## 4  POLYGON ((-13835079 4955908...      1           1
## 5  POLYGON ((-13835079 5111792...      1           1
## 6  POLYGON ((-13827579 4942917...      2           2
## 7  POLYGON ((-13827579 4968898...      1           1
## 8  POLYGON ((-13827579 4994879...      2           2
## 9  POLYGON ((-13827579 5124782...      1           1
## 10 POLYGON ((-13820079 4929927...      1           1

Create multivariate map which compares median house value and median income binned values.

colors <- c(
    "1-1" = "#FFF0F5", 
    "1-2" = "#f78fb6", 
    "1-3" = "#f73593",
    "2-1" = "#a5e8cd", 
    "2-2" = "#a58fb6", 
    "2-3" = "#a53593",
    "3-1" = "#40dba7", 
    "3-2" = "#408fa7", 
    "3-3" = "#403593"
)

tm_shape(grid.data) +
    tm_polygons(
        fill = tm_vars(
            c("income", "house_value"), 
            multivariate = TRUE
        ),
        fill.scale = tm_scale_bivariate(
            scale1 = tm_scale_categorical(),
            scale2 = tm_scale_categorical(),
            values = colors
        ),
        fill.legend = tm_legend_bivariate(
            xlab = "Value",
            ylab = "Income",
            show = TRUE,
            position = tm_pos_in("right", "top")
        )
    ) +
    tm_title(
        "Median House Value to Income\nBivariate Choropleth Map", 
        size = 1
    )
## Labels abbreviated by the first letters, e.g.: "1" => "1"