Data Engineering and Analysis - Finding correlation for rental prices

Author

Chan Jit Lin, Firdauz B K, Celeste Tan, Lin Xiang Hong, Jonathan Chua

Published

July 1, 2024

Importing of the necessary libraries

As we are interested in doing geospatial analysis, we need to import libraries for data manipulation and visualization.

library(sf)
library(dplyr)
library(ggplot2)

Data Engineering

Adding Longitude and Latitide

We use Google Maps API to geocode the addresses in the dataset. The geocoded data is then saved to a CSV file for further analysis.

DO NOTE: Eval=FALSE, not required the run. It will take a few hours to run. (The file has already been created on the previous run.)

knitr::opts_chunk$set(echo = TRUE)
# Install and load necessary packages
if (!requireNamespace("tidygeocoder", quietly = TRUE)) install.packages("tidygeocoder")
if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr")
if (!requireNamespace("ggmap", quietly = TRUE)) install.packages("ggmap")
if (!requireNamespace("readr", quietly = TRUE)) install.packages("readr")
if (!requireNamespace("progress", quietly = TRUE)) install.packages("progress")
library(tidygeocoder)
library(dplyr)
library(ggmap)
library(readr)
library(progress)


# Read CSV file
file_path <- "dataset/rent_2024.csv"  # Filepath to data
df <- read.csv(file_path)
head(df)
# Create a full address column
df <- df %>%
  mutate(full_address = paste(block, street_name, town, "Singapore", sep = ", "))
head(df)
# Register Google API key , deleted, cos exceeded google budget
register_google(key = "xxx")
# Initialize cache
cache_file <- "address_cache.csv"

# Load cache if it exists, or create an empty one
if (file.exists(cache_file)) {
  address_cache <- read_csv(cache_file)
} else {
  address_cache <- tibble(full_address = character(), lat = numeric(), lon = numeric())
  write_csv(address_cache, cache_file)
}

# Function to geocode with caching
geocode_with_cache <- function(address) {
  cached_result <- address_cache %>%
    filter(full_address == address) %>%
    select(lat, lon)
  
  if (nrow(cached_result) == 0) {
    geocoded <- geocode(address, output = "latlona", source = "google")
    if (!is.na(geocoded$lat) & !is.na(geocoded$lon)) {
      new_entry <- tibble(full_address = address, lat = geocoded$lat, lon = geocoded$lon)
      address_cache <<- bind_rows(address_cache, new_entry)
      write_csv(address_cache, cache_file)
    }
    return(geocoded)
  } else {
    return(cached_result)
  }
}

# Initialize progress bar
pb <- progress_bar$new(
  format = "  Geocoding [:bar] :percent in :elapsed",
  total = nrow(df), clear = FALSE, width = 60
)

# Apply geocoding with cache to all addresses and update progress bar
geocoded_df <- df %>%
  rowwise() %>%
  mutate(geocoded = list({
    pb$tick()
    geocode_with_cache(full_address)
  })) %>%
  unnest(cols = c(geocoded))



# Save geocoded data to CSV
write.csv(geocoded_df, file = "geocoded_df.csv", row.names = FALSE)

Load Data

Let’s start with loading data from the dataset folder containing the csv and geojson file, The st_read function from the sf package already loads the GeoJSON file into an sf object, so we have this step covered in the code above.

# Convert geocoded_df to an sf object
geocoded_sf <- st_as_sf(geocoded_df, coords = c("lon", "lat"), crs = st_crs(districts))

Plot boundaries and points

ggplot() +
  geom_sf(data = districts, fill = NA, color = "black") +
  geom_sf(data = geocoded_sf, color = "red", size = 2) +
  theme_minimal() +
  ggtitle("District Boundaries and Geocoded Points")

Spatial join to determine boundaries

The boundary of each district is from Kaggle (2023). We can use the st_join function to perform a spatial join between the geocoded points and the district boundaries.

# Ensure the CRS of geocoded_sf matches districts
if (st_crs(geocoded_sf) != st_crs(districts)) {
  geocoded_sf <- st_transform(geocoded_sf, st_crs(districts))
}

joined_df <- st_join(geocoded_sf, districts)

# Extract necessary columns and convert back to a regular data frame
result_df <- joined_df %>%
  st_drop_geometry() %>%
  mutate(lat = st_coordinates(joined_df$geometry)[, 2], 
         lon = st_coordinates(joined_df$geometry)[, 1]) %>%
  as.data.frame()

# Save the updated CSV file
write.csv(result_df, "dataset/from dataset/updated_geocoded_df.csv", row.names = FALSE)

Create chloroplath map

# Create a chloropleth map
ggplot() +
  geom_sf(data = districts, aes(fill = district), color = "black") +
  scale_fill_viridis_d() +
  theme_minimal() +
  ggtitle("Chloropleth Map of Districts")

Check the columns with the most correlation with Price of Rental

library(dplyr)
library(ggplot2)
library(reshape2)

# Read the CSV file
updated_geocoded_df <- read.csv("dataset/from dataset/updated_geocoded_df.csv")
updated_geocoded_df <- select(updated_geocoded_df, -c(lat, lon, planning_area))

# Check column names# Convert categorical columns to factors
updated_geocoded_df$town <- as.factor(updated_geocoded_df$town)
updated_geocoded_df$block <- as.factor(updated_geocoded_df$block)
updated_geocoded_df$street_name <- as.factor(updated_geocoded_df$street_name)
updated_geocoded_df$flat_type <- as.factor(updated_geocoded_df$flat_type)
updated_geocoded_df$district <- as.factor(updated_geocoded_df$district)

# Label encode the factors
updated_geocoded_df$town <- as.numeric(updated_geocoded_df$town)
updated_geocoded_df$block <- as.numeric(updated_geocoded_df$block)
updated_geocoded_df$street_name <- as.numeric(updated_geocoded_df$street_name)
updated_geocoded_df$flat_type <- as.numeric(updated_geocoded_df$flat_type)
updated_geocoded_df$district <- as.numeric(updated_geocoded_df$district)

# Select numeric columns including the encoded ones
numeric_columns <- updated_geocoded_df %>% select_if(is.numeric)

# Calculate correlation matrix
cor_matrix <- cor(numeric_columns, use = "complete.obs")
# Get correlation values for `price`
cor_with_price <- cor_matrix[,"monthly_rent"]

# Sort correlation values
sorted_cor <- sort(cor_with_price, decreasing = TRUE)

# Display sorted correlation values
print(sorted_cor)
monthly_rent    flat_type        block  street_name         town     district 
 1.000000000  0.332262429  0.010619790  0.006233406 -0.002513968 -0.072310786 
# Melt the correlation matrix for visualization
melted_cor_matrix <- melt(cor_matrix)

# Plot heatmap
ggplot(data = melted_cor_matrix, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name="Pearson\nCorrelation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   size = 12, hjust = 1)) +
  coord_fixed() +
  labs(x = "Variable", y = "Variable", fill = "Correlation")  # Custom labels for axes

Strongest correlation

From the pearson correlation matrix, we can see the column that corresponds with monthly rental price the most is the flat type.

Finding prices by flat types

df <- read.csv("dataset/from dataset/updated_geocoded_df.csv")
# Ensure flat_type is treated as a factor if not already
df$flat_type <- as.factor(df$flat_type)

# Generate histogram of price by flat types
ggplot(df, aes(x = monthly_rent)) +
  geom_histogram(bins = 30, fill = "blue", color = "black") +  # Adjust the number of bins as needed
  facet_wrap(~flat_type, scales = "free_x") +  # Use 'scales = "free"' to adjust each facet to its own x-axis scale
  labs(title = "Histogram of Monthly Rent by Flat Type",
       x = "Monthly Rent",
       y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Ensure flat_type is treated as a factor if not already
df$flat_type <- as.factor(df$flat_type)

# Calculate median rent per flat type
median_price_per_type <- df %>%
  group_by(flat_type) %>%
  summarise(median_rent = median(monthly_rent, na.rm = TRUE))

# Generate a bar plot of median rent by flat types
ggplot(median_price_per_type, aes(x = flat_type, y = median_rent, fill = flat_type)) +
  geom_col() +  # geom_col is used for bar plots; it's similar to geom_bar(stat="identity")
  labs(title = "Median Monthly Rent by Flat Type",
       x = "Flat Type",
       y = "Median Monthly Rent") +
  scale_fill_viridis_d() +  # This adds a color scale that automatically assigns colors based on the factor levels
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Interactive Map

Interactive Shiny app not working (properly)? Click here!

References

Kaggle. 2023. “Singapore District - Planning Area Geojson.” kaggle.com. https://www.kaggle.com/datasets/shymammoth/singapore-district-planning-area-geojson?resource=download.