library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Bringing in the cleaned CSV we built in Python
data <- read_csv("final_data.csv")
## Rows: 137534 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): pin, nearest_station
## dbl (6): sale_price, class, township_code, latitude, longitude, dist_to_tra...
## date (1): sale_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick look at the first few rows and the structure to make sure types are right
head(data)
## # A tibble: 6 × 9
## pin sale_price sale_date class township_code latitude longitude
## <chr> <dbl> <date> <dbl> <dbl> <dbl> <dbl>
## 1 25151170160000 160000 2023-01-18 203 70 41.7 -87.6
## 2 20023020390000 40000 2023-01-19 205 70 41.8 -87.6
## 3 20114000191005 244000 2023-01-10 299 70 41.8 -87.6
## 4 20352090130000 55000 2023-01-25 203 70 41.7 -87.6
## 5 25032210120000 299900 2023-01-26 211 70 41.7 -87.6
## 6 20254100300000 100000 2023-01-27 203 70 41.8 -87.6
## # ℹ 2 more variables: dist_to_transit_m <dbl>, nearest_station <chr>
print(nrow(data))
## [1] 137534
str(data)
## spc_tbl_ [137,534 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ pin : chr [1:137534] "25151170160000" "20023020390000" "20114000191005" "20352090130000" ...
## $ sale_price : num [1:137534] 160000 40000 244000 55000 299900 ...
## $ sale_date : Date[1:137534], format: "2023-01-18" "2023-01-19" ...
## $ class : num [1:137534] 203 205 299 203 211 203 211 202 211 202 ...
## $ township_code : num [1:137534] 70 70 70 70 70 70 70 70 70 70 ...
## $ latitude : num [1:137534] 41.7 41.8 41.8 41.7 41.7 ...
## $ longitude : num [1:137534] -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ dist_to_transit_m: num [1:137534] 2196 1502 1903 2604 1223 ...
## $ nearest_station : chr [1:137534] "95th/Dan Ryan" "43rd" "51st" "79th" ...
## - attr(*, "spec")=
## .. cols(
## .. pin = col_character(),
## .. sale_price = col_double(),
## .. sale_date = col_date(format = ""),
## .. class = col_double(),
## .. township_code = col_double(),
## .. latitude = col_double(),
## .. longitude = col_double(),
## .. dist_to_transit_m = col_double(),
## .. nearest_station = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Mapping out the data points to see the "skeleton" of Chicago
# Since we have thousands of sales, I'm using a low alpha so the transit lines "pop"
ggplot(data, aes(x = longitude, y = latitude, color = dist_to_transit_m)) +
geom_point(alpha = 0.05, size = 0.1) +
scale_color_gradient(low = "red", high = "yellow", name = "Dist to 'L' (m)") +
labs(
title = "Geographic Distribution of Transit Proximity",
subtitle = "The 'L' lines emerge as the areas closest to stations (red)",
x = "Longitude",
y = "Latitude"
) +
theme_minimal() +
coord_quickmap() # Keeps the city from looking "stretched"

# Checking how many properties are actually near the train
# This helps see if our data is skewed toward people living right on top of the 'L'
ggplot(data, aes(x = dist_to_transit_m)) +
geom_histogram(fill = "steelblue", color = "white", bins = 50) +
labs(
title = "Distribution of Property Distances to CTA Stations",
subtitle = "How far are Chicago residential sales from the 'L'?",
x = "Distance to Nearest Station (meters)",
y = "Number of Sales"
) +
theme_minimal()

# Comparing the average price by township to see geographic trends
# This helps confirm if North side vs South side differences are showing up
township_summary <- data %>%
group_by(township_code) %>%
summarise(avg_price = mean(sale_price, na.rm = TRUE)) %>%
mutate(township_code = as.factor(township_code))
ggplot(township_summary, aes(x = reorder(township_code, -avg_price), y = avg_price)) +
geom_bar(stat = "identity", fill = "darkorange") +
scale_y_continuous(labels = scales::dollar) +
labs(
title = "Average Sale Price by Chicago Township",
subtitle = "Comparing market value across different sections of the city",
x = "Township Code (70-77)",
y = "Average Sale Price"
) +
theme_minimal()

# Pre-processing the price data
# R sometimes reads numbers with symbols as characters, so we force it to numeric
data <- data %>%
mutate(sale_price = as.numeric(sale_price)) %>%
filter(!is.na(sale_price))
# Running the basic model: Price predicted by Distance
# Using log() for price because real estate values are exponential, not linear
model_simple <- lm(log(sale_price) ~ dist_to_transit_m, data = data)
# Checking the P-value and R-squared to see if distance actually matters
summary(model_simple)
##
## Call:
## lm(formula = log(sale_price) ~ dist_to_transit_m, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5997 -0.4381 0.0223 0.4608 5.8104
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.294e+01 3.137e-03 4125.69 <2e-16 ***
## dist_to_transit_m -1.373e-04 1.415e-06 -97.06 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8584 on 137532 degrees of freedom
## Multiple R-squared: 0.06411, Adjusted R-squared: 0.0641
## F-statistic: 9421 on 1 and 137532 DF, p-value: < 2.2e-16
# Visualizing the regression line
# The red line shows the general "decay" of value as you move away from the train
ggplot(data, aes(x = dist_to_transit_m, y = sale_price)) +
geom_point(alpha = 0.02, color = "midnightblue") +
geom_smooth(method = "lm", color = "red", size = 1.2) +
scale_y_log10(labels = scales::dollar) +
labs(
title = "The 'L' Train Effect on Chicago Property Values",
subtitle = "Simple Linear Regression: log(Price) ~ Distance to Transit",
x = "Distance to Nearest CTA Station (meters)",
y = "Sale Price (Log Scale)"
) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

# Controlling for property type (class)
# This model is better because it compares apples to apples (bungalows to bungalows)
# # This runs a multiple regression that isolates the transit effect by
# holding the house type constant, essentially comparing "like with like."
model_fancy <- lm(log(sale_price) ~ dist_to_transit_m + as.factor(class), data = data)
# We expect the R-squared to go up here since 'class' is a huge deal for price
summary(model_fancy)
##
## Call:
## lm(formula = log(sale_price) ~ dist_to_transit_m + as.factor(class),
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1829 -0.4061 0.0478 0.4428 5.8478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.352e+01 5.768e-01 23.444 < 2e-16 ***
## dist_to_transit_m -1.083e-04 1.596e-06 -67.836 < 2e-16 ***
## as.factor(class)201 -8.672e-01 5.778e-01 -1.501 0.13342
## as.factor(class)202 -1.094e+00 5.769e-01 -1.897 0.05784 .
## as.factor(class)203 -7.932e-01 5.768e-01 -1.375 0.16909
## as.factor(class)204 -3.945e-01 5.772e-01 -0.684 0.49429
## as.factor(class)205 -6.057e-01 5.769e-01 -1.050 0.29378
## as.factor(class)206 2.062e-01 5.771e-01 0.357 0.72086
## as.factor(class)207 -3.391e-01 5.775e-01 -0.587 0.55704
## as.factor(class)208 1.315e+00 5.791e-01 2.270 0.02320 *
## as.factor(class)209 1.588e+00 5.804e-01 2.735 0.00623 **
## as.factor(class)210 -1.010e+00 5.772e-01 -1.750 0.08007 .
## as.factor(class)211 -5.303e-01 5.768e-01 -0.919 0.35796
## as.factor(class)212 -3.239e-01 5.772e-01 -0.561 0.57467
## as.factor(class)213 -8.245e-01 5.784e-01 -1.425 0.15402
## as.factor(class)218 5.830e-01 8.157e-01 0.715 0.47483
## as.factor(class)225 1.018e+00 7.064e-01 1.441 0.14971
## as.factor(class)234 -9.254e-01 5.773e-01 -1.603 0.10892
## as.factor(class)241 -6.442e-01 5.774e-01 -1.116 0.26454
## as.factor(class)278 5.547e-01 5.770e-01 0.961 0.33638
## as.factor(class)288 2.232e+00 9.991e-01 2.234 0.02547 *
## as.factor(class)290 -1.131e-02 5.800e-01 -0.020 0.98444
## as.factor(class)295 -3.050e-01 5.770e-01 -0.529 0.59714
## as.factor(class)297 4.118e-01 5.800e-01 0.710 0.47773
## as.factor(class)299 -6.445e-01 5.768e-01 -1.117 0.26385
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8157 on 137509 degrees of freedom
## Multiple R-squared: 0.155, Adjusted R-squared: 0.1549
## F-statistic: 1051 on 24 and 137509 DF, p-value: < 2.2e-16
# Seeing where our "fancy" model missed the mark
# If the dots are random, our model is solid. If there's a pattern, we missed a variable.
data$residuals <- residuals(model_fancy)
ggplot(data, aes(x = dist_to_transit_m, y = residuals)) +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
geom_point(alpha = 0.02, color = "darkgreen") +
labs(
title = "Residual Plot: Where does the Model Miss?",
subtitle = "Points above the line are 'undervalued' by the model;below are 'overvalued'",
x = "Distance to Transit (m)",
y = "Model Error (Residuals)"
) +
theme_minimal()
