This R notebook will explore the correlation between Median House Values and Median Income in the state of California.
The dataset was downloaded from Kaggle: https://www.kaggle.com/datasets/camnugent/california-housing-prices
This notebook produces several maps. The most interesting is the median house value to income bivariate map.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(sf)
## Linking to GEOS 3.13.1, GDAL 3.11.4, PROJ 9.7.0; sf_use_s2() is TRUE
library(tmap)
Read the data and normalize the median income to full dollars.
housing.data <- read.csv(file = "california_housing.csv")
housing.data$median_income <- housing.data$median_income * 10000
head(housing.data, 10)
## longitude latitude housing_median_age total_rooms total_bedrooms population
## 1 -122.23 37.88 41 880 129 322
## 2 -122.22 37.86 21 7099 1106 2401
## 3 -122.24 37.85 52 1467 190 496
## 4 -122.25 37.85 52 1274 235 558
## 5 -122.25 37.85 52 1627 280 565
## 6 -122.25 37.85 52 919 213 413
## 7 -122.25 37.84 52 2535 489 1094
## 8 -122.25 37.84 52 3104 687 1157
## 9 -122.26 37.84 42 2555 665 1206
## 10 -122.25 37.84 52 3549 707 1551
## households median_income median_house_value ocean_proximity
## 1 126 83252 452600 NEAR BAY
## 2 1138 83014 358500 NEAR BAY
## 3 177 72574 352100 NEAR BAY
## 4 219 56431 341300 NEAR BAY
## 5 259 38462 342200 NEAR BAY
## 6 193 40368 269700 NEAR BAY
## 7 514 36591 299200 NEAR BAY
## 8 647 31200 241400 NEAR BAY
## 9 595 20804 226700 NEAR BAY
## 10 714 36912 261100 NEAR BAY
count(housing.data)
## n
## 1 20640
Plot the median income against the median house value and show the linear relationship between the two variables.
ggplot(housing.data, aes(x = median_income, y = median_house_value)) +
geom_point(col = "royalblue") +
geom_smooth(formula = "y ~ x", method = "glm", col = "coral") +
scale_y_continuous(labels = label_number(big.mark = ",")) +
scale_x_continuous(labels = label_number(big.mark = ",")) +
labs(title = "Median Income vs Median House Value",
x = "Median Income (USD)",
y = "Median House Value (USD)") +
theme_bw()
Plot the distribution of the median house value variable.
ggplot(housing.data, aes(x = median_house_value)) +
geom_histogram(stat = "bin", bins = 30, fill = "royalblue", col = "navy") +
scale_y_continuous(labels = label_number(big.mark = ",")) +
scale_x_continuous(labels = label_number(big.mark = ",")) +
labs(title = "Histogram of Median House Value",
x = "Median House Value (USD)",
y = "Frequency") +
theme_bw()
Plot the distribution of the median income variable.
ggplot(housing.data, aes(x = median_income)) +
geom_histogram(stat = "bin", bins = 30, fill = "royalblue", col = "navy") +
scale_y_continuous(labels = label_number(big.mark = ",")) +
scale_x_continuous(labels = label_number(big.mark = ",")) +
labs(title = "Histogram of Median Income",
x = "Median Income (USD)",
y = "Frequency") +
theme_bw()
Discover the count of housing samples by proximity to the ocean.
chart.data <- housing.data %>%
group_by(ocean_proximity) %>%
summarise(house_count = n()) %>%
arrange(house_count)
chart.data
## # A tibble: 5 × 2
## ocean_proximity house_count
## <chr> <int>
## 1 ISLAND 5
## 2 NEAR BAY 2290
## 3 NEAR OCEAN 2658
## 4 INLAND 6551
## 5 <1H OCEAN 9136
ggplot(chart.data, aes(x = as.factor(ocean_proximity), y = house_count)) +
geom_bar(stat = "identity", fill = "royalblue", col = "navy") +
scale_y_continuous(labels = label_number(big.mark = ",")) +
labs(title = "Count of Houses by Ocean Proximity",
x = "Ocean Proximity", y = "Count") +
theme_bw()
Plot the distribution of the median house value by proximity to the ocean.
ggplot(housing.data, aes(x = as.factor(ocean_proximity), y = median_house_value)) +
geom_boxplot(fill = "royalblue", col = "navy", staplewidth = 0.5) +
scale_y_continuous(labels = label_number(big.mark = ",")) +
labs(title = "Distribution of Median House Value by Ocean Proximity",
x = "Ocean Proximity", y = "Median House Value (USD)") +
theme_bw()
Create a Spatial DataFrame from the location variables in the housing dataset.
housing.geodata <- st_as_sf(housing.data,
coords = c("longitude", "latitude"),
crs = 4326
)
head(housing.geodata, 5)
## Simple feature collection with 5 features and 8 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -122.25 ymin: 37.85 xmax: -122.22 ymax: 37.88
## Geodetic CRS: WGS 84
## housing_median_age total_rooms total_bedrooms population households
## 1 41 880 129 322 126
## 2 21 7099 1106 2401 1138
## 3 52 1467 190 496 177
## 4 52 1274 235 558 219
## 5 52 1627 280 565 259
## median_income median_house_value ocean_proximity geometry
## 1 83252 452600 NEAR BAY POINT (-122.23 37.88)
## 2 83014 358500 NEAR BAY POINT (-122.22 37.86)
## 3 72574 352100 NEAR BAY POINT (-122.24 37.85)
## 4 56431 341300 NEAR BAY POINT (-122.25 37.85)
## 5 38462 342200 NEAR BAY POINT (-122.25 37.85)
What is the spatial distribution of Median House values across the state of California?
map.data <- housing.geodata %>% arrange(median_house_value)
tm_shape(map.data) +
tm_dots(fill = "median_house_value",
fill.scale = tm_scale_intervals(
n = 5,
style = "fisher",
values = "plasma"
),
fill.legend = tm_legend(
"Value (USD)",
position = tm_pos_in("right", "top")
),
size = 0.25) +
tm_title("Median House Value in California")
What is the ratio of median house value and median income?
housing.geodata$median_ratio <- round(
housing.geodata$median_house_value / housing.geodata$median_income, 2
)
subset(
housing.geodata,
select = c("median_house_value", "median_income", "median_ratio")
)
## Simple feature collection with 20640 features and 3 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -124.35 ymin: 32.54 xmax: -114.31 ymax: 41.95
## Geodetic CRS: WGS 84
## First 10 features:
## median_house_value median_income median_ratio geometry
## 1 452600 83252 5.44 POINT (-122.23 37.88)
## 2 358500 83014 4.32 POINT (-122.22 37.86)
## 3 352100 72574 4.85 POINT (-122.24 37.85)
## 4 341300 56431 6.05 POINT (-122.25 37.85)
## 5 342200 38462 8.90 POINT (-122.25 37.85)
## 6 269700 40368 6.68 POINT (-122.25 37.85)
## 7 299200 36591 8.18 POINT (-122.25 37.84)
## 8 241400 31200 7.74 POINT (-122.25 37.84)
## 9 226700 20804 10.90 POINT (-122.26 37.84)
## 10 261100 36912 7.07 POINT (-122.25 37.84)
What is the distribution of the median house value to income ratio?
ggplot(housing.geodata, aes(x = median_ratio)) +
geom_histogram(stat = "bin", bins = 50, fill = "royalblue", col = "navy") +
scale_y_continuous(labels = label_number(big.mark = ",")) +
scale_x_continuous(labels = label_number(big.mark = ",")) +
labs(title = "Histogram of Median House Value to Income Ratio",
x = "Percentage of Income",
y = "Frequency") +
theme_bw()
Describe the distribution of the median ratio.
quantile(housing.geodata$median_ratio)
## 0% 25% 50% 75% 100%
## 0.36 3.86 4.95 6.59 100.02
Map the spatial distribution of median ratio.
map.data <- housing.geodata %>% arrange(median_ratio)
tm_shape(map.data) +
tm_dots(fill = "median_ratio",
fill.scale = tm_scale_intervals(
n = 5, style = "quantile", values = "plasma"
),
fill.legend = tm_legend(
"",
position = tm_pos_in("right", "top")
),
size = 0.25) +
tm_title("Median House Value to Income Ratio", size = 1)
This section will perform a hexagonal grid density operation and map the results.
# Convert housing dataset to EPSG 3857
hse.data <- st_transform(housing.geodata, 3857)
# Make a hexagonal grid
grid.data <- st_make_grid(
hse.data,
cellsize = 15000,
square = FALSE
) %>%
st_sf(.)
# Find cells which intersect with housing locations
i <- st_intersects(grid.data, hse.data)
# Filter grid data for only pertinent cells
grid.data <- grid.data[lengths(i) > 0, ]
grid.data
## Simple feature collection with 1402 features and 0 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -13850080 ymin: 3825744 xmax: -12717580 ymax: 5168084
## Projected CRS: WGS 84 / Pseudo-Mercator
## First 10 features:
## geometry
## 1 POLYGON ((-13842579 4942917...
## 2 POLYGON ((-13842579 5124782...
## 3 POLYGON ((-13835079 4903946...
## 4 POLYGON ((-13835079 4955908...
## 5 POLYGON ((-13835079 5111792...
## 6 POLYGON ((-13827579 4942917...
## 7 POLYGON ((-13827579 4968898...
## 8 POLYGON ((-13827579 4994879...
## 9 POLYGON ((-13827579 5124782...
## 10 POLYGON ((-13820079 4929927...
# Intersect housing dataset with grid cells ...
inter.data <- st_intersects(hse.data, grid.data) %>%
as.data.frame()
colnames(inter.data) <- c("id", "geo.id")
inter.data <- inter.data %>%
group_by(id) %>%
summarise(geo.id = min(geo.id))
# This is a list of each house sample record with the corresponding grid cell
head(inter.data, 50)
## # A tibble: 50 × 2
## id geo.id
## <int> <int>
## 1 1 252
## 2 2 252
## 3 3 252
## 4 4 252
## 5 5 252
## 6 6 252
## 7 7 252
## 8 8 252
## 9 9 233
## 10 10 252
## # ℹ 40 more rows
# Assign geo_id value for each housing record (using index vector)
hse.data$geo_id <- inter.data$geo.id
head(hse.data[, c("median_house_value", "median_income", "geo_id")], 10)
## Simple feature collection with 10 features and 3 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -13609920 ymin: 4556848 xmax: -13605470 ymax: 4562488
## Projected CRS: WGS 84 / Pseudo-Mercator
## median_house_value median_income geo_id geometry
## 1 452600 83252 252 POINT (-13606581 4562488)
## 2 358500 83014 252 POINT (-13605468 4559667)
## 3 352100 72574 252 POINT (-13607695 4558257)
## 4 341300 56431 252 POINT (-13608808 4558257)
## 5 342200 38462 252 POINT (-13608808 4558257)
## 6 269700 40368 252 POINT (-13608808 4558257)
## 7 299200 36591 252 POINT (-13608808 4556848)
## 8 241400 31200 252 POINT (-13608808 4556848)
## 9 226700 20804 233 POINT (-13609921 4556848)
## 10 261100 36912 252 POINT (-13608808 4556848)
# Compute the average median house value and median income for each grid cell
# Then compute the `bin` values using ntile()
cell.data <- as.data.frame(hse.data) %>%
group_by(geo_id) %>%
summarise(median_house_value = mean(median_house_value),
median_income = mean(median_income)) %>%
arrange(geo_id) %>%
mutate(
median_house_value = ntile(median_house_value, n = 3),
median_income = ntile(median_income, n = 3)
)
cell.data
## # A tibble: 1,402 × 3
## geo_id median_house_value median_income
## <int> <int> <int>
## 1 1 2 2
## 2 2 2 1
## 3 3 1 1
## 4 4 1 1
## 5 5 1 1
## 6 6 2 2
## 7 7 1 1
## 8 8 2 2
## 9 9 1 1
## 10 10 1 1
## # ℹ 1,392 more rows
# Assign the `bin` values back to the grid cell polygons
grid.data$income <- cell.data$median_income
grid.data$house_value <- cell.data$median_house_value
head(grid.data, 10)
## Simple feature collection with 10 features and 2 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -13850080 ymin: 4903946 xmax: -13812580 ymax: 5142103
## Projected CRS: WGS 84 / Pseudo-Mercator
## geometry income house_value
## 1 POLYGON ((-13842579 4942917... 2 2
## 2 POLYGON ((-13842579 5124782... 1 2
## 3 POLYGON ((-13835079 4903946... 1 1
## 4 POLYGON ((-13835079 4955908... 1 1
## 5 POLYGON ((-13835079 5111792... 1 1
## 6 POLYGON ((-13827579 4942917... 2 2
## 7 POLYGON ((-13827579 4968898... 1 1
## 8 POLYGON ((-13827579 4994879... 2 2
## 9 POLYGON ((-13827579 5124782... 1 1
## 10 POLYGON ((-13820079 4929927... 1 1
Create multivariate map which compares median house value and median income binned values.
colors <- c(
"1-1" = "#FFF0F5",
"1-2" = "#f78fb6",
"1-3" = "#f73593",
"2-1" = "#a5e8cd",
"2-2" = "#a58fb6",
"2-3" = "#a53593",
"3-1" = "#40dba7",
"3-2" = "#408fa7",
"3-3" = "#403593"
)
tm_shape(grid.data) +
tm_polygons(
fill = tm_vars(
c("income", "house_value"),
multivariate = TRUE
),
fill.scale = tm_scale_bivariate(
scale1 = tm_scale_categorical(),
scale2 = tm_scale_categorical(),
values = colors
),
fill.legend = tm_legend_bivariate(
xlab = "Value",
ylab = "Income",
show = TRUE,
position = tm_pos_in("right", "top")
)
) +
tm_title(
"Median House Value to Income\nBivariate Choropleth Map",
size = 1
)
## Labels abbreviated by the first letters, e.g.: "1" => "1"