Stat520 Project

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)
# Load the cleaned sales table that the Python step exported to CSV
data <- readr::read_csv(file = "final_data.csv")

## Rows: 137534 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): pin, nearest_station
## dbl  (6): sale_price, class, township_code, latitude, longitude, dist_to_tra...
## date (1): sale_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Peek at the first six rows to confirm the columns parsed with sensible types
data %>% head(n = 6L)

## # A tibble: 6 × 9
##   pin            sale_price sale_date  class township_code latitude longitude
##   <chr>               <dbl> <date>     <dbl>         <dbl>    <dbl>     <dbl>
## 1 25151170160000     160000 2023-01-18   203            70     41.7     -87.6
## 2 20023020390000      40000 2023-01-19   205            70     41.8     -87.6
## 3 20114000191005     244000 2023-01-10   299            70     41.8     -87.6
## 4 20352090130000      55000 2023-01-25   203            70     41.7     -87.6
## 5 25032210120000     299900 2023-01-26   211            70     41.7     -87.6
## 6 20254100300000     100000 2023-01-27   203            70     41.8     -87.6
## # ℹ 2 more variables: dist_to_transit_m <dbl>, nearest_station <chr>

# Report how many sales were loaded (first dimension of the table = row count)
print(dim(data)[1L])

## [1] 137534

# Dump the column-by-column structure (type and leading values) of the table
utils::str(object = data)

## spc_tbl_ [137,534 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ pin              : chr [1:137534] "25151170160000" "20023020390000" "20114000191005" "20352090130000" ...
##  $ sale_price       : num [1:137534] 160000 40000 244000 55000 299900 ...
##  $ sale_date        : Date[1:137534], format: "2023-01-18" "2023-01-19" ...
##  $ class            : num [1:137534] 203 205 299 203 211 203 211 202 211 202 ...
##  $ township_code    : num [1:137534] 70 70 70 70 70 70 70 70 70 70 ...
##  $ latitude         : num [1:137534] 41.7 41.8 41.8 41.7 41.7 ...
##  $ longitude        : num [1:137534] -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ dist_to_transit_m: num [1:137534] 2196 1502 1903 2604 1223 ...
##  $ nearest_station  : chr [1:137534] "95th/Dan Ryan" "43rd" "51st" "79th" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   pin = col_character(),
##   ..   sale_price = col_double(),
##   ..   sale_date = col_date(format = ""),
##   ..   class = col_double(),
##   ..   township_code = col_double(),
##   ..   latitude = col_double(),
##   ..   longitude = col_double(),
##   ..   dist_to_transit_m = col_double(),
##   ..   nearest_station = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

# Plot every sale at its longitude/latitude, coloured by distance to the
# nearest station. Points are tiny and almost transparent because there are
# well over a hundred thousand of them; overlapping sales build up the shape of
# the city, and the low end of the colour scale (red) marks the station areas.
ggplot(data = data) +
  geom_point(
    mapping = aes(x = longitude, y = latitude, colour = dist_to_transit_m),
    size = 0.1,
    alpha = 0.05
  ) +
  scale_colour_gradient(name = "Dist to 'L' (m)", low = "red", high = "yellow") +
  coord_quickmap() + # approximate map projection so the city is not stretched
  theme_minimal() +
  ggtitle(
    "Geographic Distribution of Transit Proximity",
    subtitle = "The 'L' lines emerge as the areas closest to stations (red)"
  ) +
  xlab("Longitude") +
  ylab("Latitude")

# Histogram of distance-to-station across all sales (50 bins).
# Shows whether the sample is concentrated right next to the 'L' or spread out.
ggplot(data = data) +
  geom_histogram(
    mapping = aes(x = dist_to_transit_m),
    bins = 50,
    colour = "white",
    fill = "steelblue"
  ) +
  theme_minimal() +
  ggtitle(
    "Distribution of Property Distances to CTA Stations",
    subtitle = "How far are Chicago residential sales from the 'L'?"
  ) +
  xlab("Distance to Nearest Station (meters)") +
  ylab("Number of Sales")

# Mean sale price per township (missing prices ignored), to check whether
# broad geographic differences such as North side vs South side show up.
# The township code is turned into a factor so it plots as discrete bars.
township_summary <- mutate(
  summarise(
    group_by(data, township_code),
    avg_price = mean(sale_price, na.rm = TRUE)
  ),
  township_code = as.factor(township_code)
)

# One bar per township, ordered from the highest average price to the lowest
ggplot(data = township_summary) +
  geom_col(
    mapping = aes(x = reorder(township_code, -avg_price), y = avg_price),
    fill = "darkorange"
  ) +
  scale_y_continuous(labels = scales::dollar) +
  theme_minimal() +
  ggtitle(
    "Average Sale Price by Chicago Township",
    subtitle = "Comparing market value across different sections of the city"
  ) +
  xlab("Township Code (70-77)") +
  ylab("Average Sale Price")

# Pre-processing the price data
# read_csv() parsed sale_price as a double for this file, so the conversion
# below is a safeguard for an export that carries symbols ("$", ",") and
# therefore arrives as character. as.numeric() would turn such strings into NA
# and the filter would then silently drop those sales, so character input goes
# through parse_number() instead, which strips the symbols and keeps the value.
data <- data %>%
  mutate(
    sale_price = if (is.character(sale_price)) {
      parse_number(sale_price)
    } else {
      as.numeric(sale_price)
    }
  ) %>%
  # The models below take log(sale_price), which is only defined for positive
  # prices, so drop missing and non-positive values before fitting.
  filter(!is.na(sale_price), sale_price > 0)

# Baseline specification: log sale price regressed on distance to the nearest
# station. Price enters on the log scale, so the distance slope reads as an
# approximate proportional change in price per metre.
model_simple <- lm(
  formula = log(sale_price) ~ dist_to_transit_m,
  data = data
)

# Inspect the slope's p-value and the R-squared to judge how much distance
# explains on its own
summary(object = model_simple)

## 
## Call:
## lm(formula = log(sale_price) ~ dist_to_transit_m, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5997 -0.4381  0.0223  0.4608  5.8104 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.294e+01  3.137e-03 4125.69   <2e-16 ***
## dist_to_transit_m -1.373e-04  1.415e-06  -97.06   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8584 on 137532 degrees of freedom
## Multiple R-squared:  0.06411,    Adjusted R-squared:  0.0641 
## F-statistic:  9421 on 1 and 137532 DF,  p-value: < 2.2e-16

# Visualizing the regression line
# Raw sales are drawn as faint points with price on a log10 axis. Because the
# y scale is transformed before the smoother is fitted, the red lm line is a
# straight-line fit of log price on distance, matching the simple model above.
# The red line shows the general "decay" of value as you move away from the train.
ggplot(data, aes(x = dist_to_transit_m, y = sale_price)) +
  geom_point(alpha = 0.02, color = "midnightblue") +
  # `linewidth` replaces `size` for line geoms (size is deprecated for lines
  # since ggplot2 3.4.0)
  geom_smooth(method = "lm", color = "red", linewidth = 1.2) +
  scale_y_log10(labels = scales::dollar) +
  labs(
    title = "The 'L' Train Effect on Chicago Property Values",
    subtitle = "Simple Linear Regression: log(Price) ~ Distance to Transit",
    x = "Distance to Nearest CTA Station (meters)",
    y = "Sale Price (Log Scale)"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Controlling for property type (class)
# Multiple regression: the property class enters as a categorical control, so
# the distance slope is estimated while holding the house type constant,
# i.e. comparing "like with like" (bungalows to bungalows).
# NOTE(review): in the fitted summary every class term has a standard error of
# about 0.58, which may mean the baseline class contains very few sales;
# consider releveling to a common class before interpreting those terms.
model_fancy <- lm(
  formula = log(sale_price) ~ dist_to_transit_m + as.factor(class),
  data = data
)

# R-squared is expected to rise relative to the simple model, since class is a
# major driver of price
summary(object = model_fancy)

## 
## Call:
## lm(formula = log(sale_price) ~ dist_to_transit_m + as.factor(class), 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1829 -0.4061  0.0478  0.4428  5.8478 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.352e+01  5.768e-01  23.444  < 2e-16 ***
## dist_to_transit_m   -1.083e-04  1.596e-06 -67.836  < 2e-16 ***
## as.factor(class)201 -8.672e-01  5.778e-01  -1.501  0.13342    
## as.factor(class)202 -1.094e+00  5.769e-01  -1.897  0.05784 .  
## as.factor(class)203 -7.932e-01  5.768e-01  -1.375  0.16909    
## as.factor(class)204 -3.945e-01  5.772e-01  -0.684  0.49429    
## as.factor(class)205 -6.057e-01  5.769e-01  -1.050  0.29378    
## as.factor(class)206  2.062e-01  5.771e-01   0.357  0.72086    
## as.factor(class)207 -3.391e-01  5.775e-01  -0.587  0.55704    
## as.factor(class)208  1.315e+00  5.791e-01   2.270  0.02320 *  
## as.factor(class)209  1.588e+00  5.804e-01   2.735  0.00623 ** 
## as.factor(class)210 -1.010e+00  5.772e-01  -1.750  0.08007 .  
## as.factor(class)211 -5.303e-01  5.768e-01  -0.919  0.35796    
## as.factor(class)212 -3.239e-01  5.772e-01  -0.561  0.57467    
## as.factor(class)213 -8.245e-01  5.784e-01  -1.425  0.15402    
## as.factor(class)218  5.830e-01  8.157e-01   0.715  0.47483    
## as.factor(class)225  1.018e+00  7.064e-01   1.441  0.14971    
## as.factor(class)234 -9.254e-01  5.773e-01  -1.603  0.10892    
## as.factor(class)241 -6.442e-01  5.774e-01  -1.116  0.26454    
## as.factor(class)278  5.547e-01  5.770e-01   0.961  0.33638    
## as.factor(class)288  2.232e+00  9.991e-01   2.234  0.02547 *  
## as.factor(class)290 -1.131e-02  5.800e-01  -0.020  0.98444    
## as.factor(class)295 -3.050e-01  5.770e-01  -0.529  0.59714    
## as.factor(class)297  4.118e-01  5.800e-01   0.710  0.47773    
## as.factor(class)299 -6.445e-01  5.768e-01  -1.117  0.26385    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8157 on 137509 degrees of freedom
## Multiple R-squared:  0.155,  Adjusted R-squared:  0.1549 
## F-statistic:  1051 on 24 and 137509 DF,  p-value: < 2.2e-16

# Seeing where our "fancy" model missed the mark
# If the dots are random, our model is solid. If there's a pattern, we missed a variable.
# Residuals come from model_fancy, whose response is log(sale_price), so they
# are on the log-price scale. The assignment needs one residual per row of
# `data`, which holds as long as lm() did not drop any rows.
data$residuals <- residuals(model_fancy)

ggplot(data, aes(x = dist_to_transit_m, y = residuals)) +
  geom_point(alpha = 0.02, color = "darkgreen") +
  # Draw the zero reference line after the points: layers are painted in
  # order, and the dense band of overlapping points would otherwise bury it.
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = "Residual Plot: Where does the Model Miss?",
    subtitle = "Points above the line are 'undervalued' by the model; below are 'overvalued'",
    x = "Distance to Transit (m)",
    y = "Model Error (Residuals)"
  ) +
  theme_minimal()

Stat520 Project

Prabeen Giri

2026-03-03