###### PROLOG ########

# PROLOG   ###
# PROJECT: The Lab @ DC Performance Task                                  
# PURPOSE: Performance Task                                   
# DIR:     C:\Users\kesha\Downloads\thelab_dc\thelab_task                
# DATA:    C:\Users\kesha\Downloads\thelab_dc\thelab_task\data           
# AUTHOR:  Masked                                          
# CREATED: Apr 15, 2026                                           
# LATEST:  Apr 15, 2026                                      
# NOTES:   Data sources is provided with the task

# PROLOG   ### 

# install packages and open libraries

# dplyr for data wrangling
# tidyverse for core packages
# ggplot2 for plots
# writing results to excel
# ggthemes for tuft theme
#latexpdf for converting table to pdf or png
# kableextra for table output
# stringr for string manipulation
# lubridate for date/time parsing
# scales for axis label formatting
# tibble for tibble()

pacman::p_load(dplyr, tidyverse, ggplot2, ggthemes, writexl, kableExtra, readxl, stringr, lubridate, scales, tibble, latexpdf)

# plot theme
theme_set(theme_tufte())  # but might not carry over in chunks

# Okabe-Ito colorblind-friendly color palette:
# https://jfly.uni-koeln.de/color/

oi_pal <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", 
     "#0072B2", "#D55E00", "#CC79A7", "#999999")
# # Clear existing variables from the environment
# rm(list = ls ())
# # Set the working directory to the project folder
# setwd("C:\Users\kesha\Downloads\thelab_dc\thelab_task")

Data Source 1. Recent traffic stops by Metropolitan Police Department (MPD) officers (traffic stops data): stops_2026

  1. Crime incidents reported to MPD (crime incidents data): crime_2026

Section 1. Data Manipulation

Question 1: Load the data and display a few rows of each dataset.

# read crimes data 
crime <- read_csv("data/crime_2026.csv")

# view data from top 10 rows
kable(head(crime, 10), caption="Crime datasets")
Crime datasets
X Y CCN REPORT_DAT SHIFT METHOD OFFENSE BLOCK XBLOCK YBLOCK WARD ANC DISTRICT PSA NEIGHBORHOOD_CLUSTER BLOCK_GROUP CENSUS_TRACT VOTING_PRECINCT LATITUDE LONGITUDE BID START_DATE END_DATE OBJECTID OCTO_RECORD_ID
-77.05866 38.90685 7093973 2017-06-03 12:54:27 DAY OTHERS THEFT FROM AUTO 2900 - 2999 BLOCK OF N STREET NW 394912 137666 2 2E 2 206 Cluster 4 000100 4 100 Precinct 5 38.90684 -77.05866 NA 2017-06-03 11:54:03 2017-06-03 11:55:05 341397622 07093973-01
-77.01804 38.96075 9251904 2017-04-07 00:00:00 MIDNIGHT OTHERS HOMICIDE 5800 - 5899 BLOCK OF 4TH STREET NW 398437 143648 4 4B 4 402 Cluster 17 001902 1 1902 Precinct 58 38.96074 -77.01803 NA 2009-02-13 12:03:45 2009-02-13 12:03:43 341453326 09251904-01
-76.98495 38.91515 14060710 2017-03-13 00:00:00 MIDNIGHT OTHERS HOMICIDE 2000 - 2099 BLOCK OF FENWICK STREET NE 401305 138586 5 5D 5 506 Cluster 23 008803 1 8803 Precinct 76 38.91514 -76.98495 NA 2014-05-01 19:32:00 2014-05-01 19:32:00 341600002 14060710-01
-77.00972 38.82439 16217337 2017-04-03 00:00:00 MIDNIGHT OTHERS HOMICIDE 2 - 153 BLOCK OF GALVESTON PLACE SW 399156 128511 8 8D 7 708 Cluster 39 009807 2 9807 Precinct 126 38.82438 -77.00972 NA 2016-12-25 11:59:35 2016-12-25 20:36:17 341698884 16217337-01
-77.01720 38.90453 17037020 2017-03-05 20:46:01 EVENING OTHERS THEFT FROM AUTO 400 - 471 BLOCK OF NEW YORK AVENUE NW 398508 137407 6 6E 3 308 Cluster 8 004701 2 4701 Precinct 1 38.90452 -77.01720 MOUNT VERNON TRIANGLE CID 2017-03-05 20:14:30 NA 341703531 17037020-01
-76.97611 38.87293 17037041 2017-03-05 23:23:03 MIDNIGHT OTHERS MOTOR VEHICLE THEFT 2100 - 2149 BLOCK OF YOUNG STREET SE 402073 133900 8 8A 6 607 Cluster 34 007601 1 7601 Precinct 133 38.87292 -76.97611 NA 2017-03-05 20:46:11 2017-03-05 21:45:13 341703532 17037041-01
-77.02140 38.91557 17037050 2017-03-05 22:27:38 EVENING OTHERS THEFT/OTHER 620 - 699 BLOCK OF T STREET NW 398144 138633 1 1B 3 306 Cluster 3 004801 1 4801 Precinct 137 38.91556 -77.02140 NA 2017-03-05 20:30:08 2017-03-05 20:45:13 341703533 17037050-01
-77.00879 38.94933 17037051 2017-03-05 22:24:01 EVENING GUN ROBBERY 4700 - 4931 BLOCK OF NORTH CAPITOL STREET 399238 142381 4 4D 4 405 Cluster 19 002202 1 2202 Precinct 57 38.94933 -77.00879 NA 2017-03-05 20:50:16 2017-03-05 21:30:52 341703534 17037051-01
-76.95661 38.92216 17037085 2017-03-05 23:59:44 MIDNIGHT OTHERS MOTOR VEHICLE THEFT 3279 3499 BLOCK OF FORT LINCOLN DRIVE NE 403763 139365 5 5C 5 503 Cluster 24 009000 1 9000 Precinct 139 38.92215 -76.95661 NA 2017-03-05 12:00:01 2017-03-05 22:30:40 341703535 17037085-01
-76.96778 38.87011 17037086 2017-03-06 05:09:27 MIDNIGHT OTHERS THEFT/OTHER 1600 - 1699 BLOCK OF 28TH STREET SE 402796 133587 7 7B 6 607 Cluster 34 007604 1 7604 Precinct 111 38.87010 -76.96778 NA 2017-03-05 23:22:21 2017-03-06 00:37:35 341703536 17037086-01
# read traffic stops data
stops <- read_csv("data/stops_2026.csv")

# view data from top 10 rows
kable(head(stops, 10), caption="Traffic Stops Dataset")
Traffic Stops Dataset
stop_id officer_id stop_date driver_age driver_sex ticket
52595 E 3/1/2017 37 f TRUE
46141 D 3/1/2017 16 m TRUE
154271 D 3/1/2017 16 m TRUE
184170 D 3/1/2017 23 m TRUE
91155 D 3/1/2017 18 f TRUE
9776 E 3/1/2017 36 f TRUE
64687 B 3/1/2017 52 m TRUE
35922 D 3/1/2017 17 f FALSE
49723 D 3/1/2017 31 m TRUE
176992 B 3/1/2017 45 f FALSE
just to know data overview
#(glimpse(crime))
#(glimpse(stops))

Question 2: Crime incidents are reported down to the hour/minute, while stops are only reported down to the day.

Please create a new column based on REPORT_DAT column of the crime incidents data that rounds the exact timestamp for the crime down to the date. Call this column report_daily.

# create new column in crime data
crime <- crime %>% 
  mutate(report_daily= as.Date(ymd_hms(REPORT_DAT)))

# check new column created
kable(head(crime %>% select(REPORT_DAT, report_daily), 10))
REPORT_DAT report_daily
2017-06-03 12:54:27 2017-06-03
2017-04-07 00:00:00 2017-04-07
2017-03-13 00:00:00 2017-03-13
2017-04-03 00:00:00 2017-04-03
2017-03-05 20:46:01 2017-03-05
2017-03-05 23:23:03 2017-03-05
2017-03-05 22:27:38 2017-03-05
2017-03-05 22:24:01 2017-03-05
2017-03-05 23:59:44 2017-03-05
2017-03-06 05:09:27 2017-03-06

Question 3: Create a new dataset, crimes_by_day, that gives the total count of crime incidents per day. For the purposes of aggregating, you can consider each unique OBJECTID as reflecting a different crime.

Similarly, create a new dataset, stops_by_day, that gives the total count of traffic stops per day. For the purposes of aggregating, you can consider each unique stop_id as reflecting a different stop.

# Remove duplicate OBJECTIDs from crime data
crime_dedup <- crime %>% 
  distinct(OBJECTID, .keep_all = TRUE)

# view number of row before and after deduplication
kable(data.frame(No_of_rows=nrow(crime)), caption= "Number of rows in Crime dataset")
Number of rows in Crime dataset
No_of_rows
16519
kable(data.frame(No_of_rows=nrow(crime_dedup)), caption= "Number of unique rows in Crime dataset")
Number of unique rows in Crime dataset
No_of_rows
16504
kable(data.frame(No_of_rows=nrow(crime)-nrow(crime_dedup)), caption= "Number of duplicates in crime data")
Number of duplicates in crime data
No_of_rows
15
# Remove duplicate stop_ids from stops data
stops_dedup <- stops %>% 
  distinct(stop_id, .keep_all = TRUE)

# view number of row before and after deduplication
kable(data.frame(No_of_rows=nrow(stops)), caption= "Number of rows in traffic stops dataset")
Number of rows in traffic stops dataset
No_of_rows
81129
kable(data.frame(No_of_rows=nrow(stops_dedup)), caption= "Number of unique rows in traffic stops dataset")
Number of unique rows in traffic stops dataset
No_of_rows
80946
kable(data.frame(No_of_rows=nrow(stops)-nrow(stops_dedup)), caption= "Number of duplicates in Traffic stops dataset")
Number of duplicates in Traffic stops dataset
No_of_rows
183
creating new datasets
# creating unique crime count per day
crimes_by_day <- crime_dedup %>%
  group_by(report_daily) %>%
  summarise(n_crimes = n(), .groups = "drop") %>%
  arrange(report_daily)
# check the new crime by day dataset create
kable(data.frame(crime_by_day=head(crimes_by_day, 10)), caption= "Crime by Day")
Crime by Day
crime_by_day.report_daily crime_by_day.n_crimes
2017-03-01 80
2017-03-02 59
2017-03-03 84
2017-03-04 77
2017-03-05 61
2017-03-06 88
2017-03-07 70
2017-03-08 89
2017-03-09 69
2017-03-10 95
# Parse stop_date similar to crime report daily format 
stops_dedup <- stops_dedup %>%
  mutate(stop_date_parsed = mdy(stop_date))

# creating unique traffic stops count per day
stops_by_day <- stops_dedup %>%
  group_by(stop_date_parsed) %>%
  summarise(n_stops = n(), .groups = "drop") %>%
  arrange(stop_date_parsed)

# check new stop by day dataset created
kable(data.frame(stop_by_day =head(stops_by_day, 10)), caption="Traffic stops by day")
Traffic stops by day
stop_by_day.stop_date_parsed stop_by_day.n_stops
2017-03-01 509
2017-03-02 421
2017-03-03 413
2017-03-04 475
2017-03-05 490
2017-03-06 449
2017-03-07 447
2017-03-08 450
2017-03-09 447
2017-03-10 435

Question 4: For both the traffic stops_by_day and crimes_by_day datasets: check if there are days missing in the period measured. If so, fill in the missing data in the manner you deem best suited for this problem and briefly justify your decision.

# create date sequence for crimes
crime_range <- seq(min(crimes_by_day$report_daily),
                   max(crimes_by_day$report_daily),
                   by="day")

crime_missing <- crime_range[!crime_range%in%crimes_by_day$report_daily]

# view results
# number of missing days
kable(data.frame(no_of_missing=length(crime_missing)), caption="Total number of missing in crime data") 
Total number of missing in crime data
no_of_missing
0
# first few missing dates
kable(data.frame(missing_crime =head(crime_missing, 10)), caption="List of missing in crime data")  
List of missing in crime data
missing_crime
# create date sequence for stops
stop_range <- seq(min(stops_by_day$stop_date_parsed),
                       max(stops_by_day$stop_date_parsed),
                       by = "day")
stop_missing <- stop_range[!stop_range %in%stops_by_day$stop_date_parsed]

# view results
# number of missing days
kable(data.frame(total_missing=length(stop_missing)), caption="Total number of missing in traffic stop data") 
Total number of missing in traffic stop data
total_missing
8
# first few missing dates
kable(data.frame(missing_dates=head(stop_missing, 10)), caption="List of missing in traffic stop data") 
List of missing in traffic stop data
missing_dates
2017-03-30
2017-03-31
2017-04-30
2017-05-30
2017-05-31
2017-06-30
2017-07-30
2017-07-31
fill in missing dates with ZERO because missing dates means no events recorded, not missing data collection. Replacing 0 will preserve continuity for time-series analysis or time-trend plots.
# replace for missing in crime data
crimes_by_day_full <- data.frame(report_daily = crime_range) %>%
  left_join(crimes_by_day, by = "report_daily") %>%
  mutate(n_crimes = ifelse(is.na(n_crimes), 0, n_crimes))

# replace for missing in stop data
stops_by_day_full <- data.frame(stop_date_parsed = stop_range) %>%
  left_join(stops_by_day, by = "stop_date_parsed") %>%
  mutate(n_stops = ifelse(is.na(n_stops), 0, n_stops))

Question 5: Merge the traffic stops by day data with the crimes by day data. Use code to calculate and report

combined_data <- crimes_by_day_full %>%
  rename(date = report_daily) %>%
  left_join(
    stops_by_day_full %>% rename(date = stop_date_parsed),
    by = "date"
  )

a) which specific date has the most traffic stops and how many stops there were,

max_stops <- combined_data %>% 
  filter(n_stops == max(n_stops, na.rm = TRUE)) %>% 
  mutate(day_of_week = weekdays(date))

kable((max_stops), caption="Most traffic stops by date and count")
Most traffic stops by date and count
date n_crimes n_stops day_of_week
2017-05-01 89 535 Monday

b) which specific date has the most crime incidents and how many crimes there were, and

max_crimes <- combined_data %>% 
  filter(n_crimes == max(n_crimes, na.rm = TRUE)) %>% 
  mutate(day_of_week = weekdays(date))

kable((max_crimes), caption="Most crime incident by date and count")
Most crime incident by date and count
date n_crimes n_stops day_of_week
2017-05-08 122 410 Monday
2017-05-29 122 481 Monday

c) what day of the week each of those dates was.

kable(data.frame(Day_of_week=(max_stops$day_of_week)), caption ="Max traffic stops day of week")
Max traffic stops day of week
Day_of_week
Monday
kable(data.frame(Day_of_week=(max_crimes$day_of_week)), caption="Max crime incident day of week")
Max crime incident day of week
Day_of_week
Monday
Monday

Question 6: What was the most common crime offense type committed on July 4th? Report both the type of offense and the count.

coalesce three variants of Theft Other
july4_offense<- crime_dedup %>% 
  mutate(offense_list = case_when(
    OFFENSE%in% c("THEFT OTHER", "THEFT_OTHER", "THEFT/OTHER") ~ "THEFT/OTHER",
    TRUE ~ OFFENSE
  ))

count the most common offense

july4_max_offense <- july4_offense %>% 
  filter(format(report_daily, "%m-%d") == "07-04") %>% 
  group_by(offense_list) %>% 
  summarise(count=n(), .groups ="drop") %>% 
  arrange(desc(count))

# view result
kable((july4_max_offense), caption="Count of Most common offense on July 4th")
Count of Most common offense on July 4th
offense_list count
THEFT/OTHER 33
THEFT FROM AUTO 22
MOTOR VEHICLE THEFT 13
ROBBERY 7
ASSAULT W/DANGEROUS WEAPON 6
BURGLARY 2

Most common crime offense type committed on july 4th is Theft/other = 33 followed by Theft from auto =22.

Section 2. Policy Evaluation

Suppose the DC government enacted a policy that allows people with low incomes (less than $30,000 per year) to have their ticket amount reduced by 20 to 40 percent. The goal of the new policy is to reduce the number of overdue tickets among low-income residents.

In this section, you’ll use the same traffic stop data as in Section 1. You’ll also need to download the following files: 1. Data on the court outcomes of tickets: Courts Data 2. Data on residents’ income: Income Data

Note: The ticket amounts come in $0.50 denominations.

reading courts data first

considering “50” present in some status column as discount flag

# parse with readlines()
raw_lines <- readLines("C:/Users/kesha/Downloads/CourtsData1.csv")

parse_courts_row <- function(line) {
  fields <- strsplit(line, ",")[[1]]
  if (length(fields) == 3) {
    data.frame(
      stop_id        = as.integer(fields[1]),
      ticket_amount  = as.numeric(fields[2]),
      discount_flag  = NA_character_,
      ticket_status  = trimws(fields[3]),
      stringsAsFactors = FALSE
    )
  } else if (length(fields) == 4) {
    data.frame(
      stop_id        = as.integer(fields[1]),
      ticket_amount  = as.numeric(fields[2]),
      discount_flag  = trimws(fields[3]),   
      ticket_status  = trimws(fields[4]),
      stringsAsFactors = FALSE
    )
  } else {
    NULL
  }
}


courts_clean <- do.call(rbind, lapply(raw_lines[-1], parse_courts_row))

kable(data.frame(No_of_row =nrow(courts_clean)), caption="number of rows in courts clean dataset")
number of rows in courts clean dataset
No_of_row
60257
# print ticket frequency table
kable(table(courts_clean$ticket_status, useNA = "always"), caption="Ticket status Frequency")
Ticket status Frequency
Var1 Freq
challenged 5471
overdue 32689
paid 19306
pending 2791
NA 0

laod income data

#read data
income <- read.csv("C:/Users/kesha/Downloads/IncomeData.csv", stringsAsFactors = FALSE)

# view head of data
kable(head(income, 10), caption="Top 10 row of income dataset")
Top 10 row of income dataset
stop_id income
97294 26305
87077 77578
57432 80185
121675 98784
37238 9350
199246 31981
191139 42301
153865 52278
90007 21719
111167 34801

Question 7: Find the difference between the probability of paying a ticket on time for people with incomes less than $30,000 and those with incomes greater than or equal to $30,000.

(Note: in a real project, we would set this up as a regression discontinuity design. However, as we don’t expect non-social scientists to be familiar with that methodology, we are simplifying it here into a simple difference around a threshold. If you feel comfortable working with regression discontinuity designs and feel it would be easier for you to answer that way, please feel free to do so.)

# restricting to tickets that were actually issued 
tickets_issued <- stops_dedup %>%
  filter(ticket == TRUE) %>%
  select(stop_id)

creating dataset for policy analysis

policy_data <- courts_clean %>%
  inner_join(income, by = "stop_id") %>%
  inner_join(tickets_issued, by = "stop_id") %>%
  filter(ticket_status %in% c("paid", "overdue")) %>%
  mutate(
    paid_on_time = ifelse(ticket_status == "paid", 1,0),
    income_group = ifelse(income <30000, "Below_30k", "Above_30k"))

# view data
kable(data.frame( No_of_row =nrow(policy_data)), caption="Number of rows in policy data")
Number of rows in policy data
No_of_row
37414
kable(table(policy_data$income_group), caption="Frequency for income above or below $30K")
Frequency for income above or below $30K
Var1 Freq
Above_30k 24318
Below_30k 13096

probability of paying on time by income group

pay_sum <- policy_data %>%
  group_by(income_group) %>%
  summarise(
    total_people           = n(),
    total_paid      = sum(paid_on_time),
    prob_paid      = mean(paid_on_time), # probability
    .groups     = "drop"
  )
kable((pay_sum), caption="Probability of paying on time by income group")
Probability of paying on time by income group
income_group total_people total_paid prob_paid
Above_30k 24318 8492 0.3492063
Below_30k 13096 5453 0.4163867

Difference: low income minus high income

prob_low_income  <- pay_sum %>% filter(income_group == "Below_30k")  %>% pull(prob_paid)
prob_high_income <- pay_sum %>% filter(income_group == "Above_30k") %>% pull(prob_paid)
prob_difference <- prob_low_income - prob_high_income

Two proportion z-test

count_low <- pay_sum %>% filter(income_group == "Below_30k") %>% pull(total_people)
count_high <- pay_sum  %>% filter(income_group == "Above_30k") %>%  pull(total_people)


paid_low   <- pay_sum %>% filter(income_group == "Below_30k") %>% pull(total_paid)
paid_high  <- pay_sum %>% filter(income_group == "Above_30k") %>% pull(total_paid)


test_result <- prop.test(
  x =c(paid_low, paid_high),
  n =c(count_low, count_high),
  correct =FALSE
)

# print test result
cat("\n--- Two-proportion test result ---\n")

— Two-proportion test result —

print(test_result)
2-sample test for equality of proportions without continuity correction

data: c(paid_low, paid_high) out of c(count_low, count_high) X-squared = 164.31, df = 1, p-value < 2.2e-16 alternative hypothesis: two.sided 95 percent confidence interval: 0.05682747 0.07753320 sample estimates: prop 1 prop 2 0.4163867 0.3492063

Q7_answer= The probability of paying a ticket on time for individuals with income less than $30,000 is 0.4164 (41.64%), while for individuals with income greater than or equal to $30,000 it is 0.3492 (34.92%).
The difference in probability is 0.0672 (6.72 percentage points), indicating that lower-income individuals have a higher probability of paying their tickets on time in this dataset.
A two-proportion test shows that this difference is statistically significant (p-value < 0.001), with a 95% confidence interval of [0.0568, 0.0775].

Question 8: What is the magnitude of your results? How are they relevant for informing policy? Discuss in 3-4 sentences.

The estimated effect is a 6.72 percentage point increase in the probability of paying a ticket on time for individuals with incomes below $30,000 compared to those above the threshold. This is a moderate but meaningful difference, especially in a policy context where even small increases in compliance can translate into substantial reductions in overdue tickets at the population level. The result suggests that lowering financial burden may improve timely payment behavior among lower-income individuals, supporting the rationale behind the discount policy. However, since this analysis is descriptive and not causal, therefore further evaluation would be needed.

Question 9: A colleague is wondering if we see differences in the probability of paying on time at different income thresholds. They want you to plot the distribution of the p-values for a test that looks at differences in the probability of paying on time at incomes below and above $5,000; below and above $10,000; and so on for every $5,000 increment up to $50,000.

You realize that, since that could involve copying and pasting the code many times, it would be more efficient to write a function that:

  • Takes as inputs the data and a vector of income thresholds
  • Runs the test from the previous step
  • Returns the p-value of the test

Please write a function to that effect, and then use the results to create a plot where the x axis is the income threshold (again, $5,000 to $50,000, in $5,000 increments) and the y axis is the p-value for the test on the differences in the probability of paying on time around that income threshold.

(Note: As in Question 7, if this was a real project, we would set this up as a sequence of regression discontinuity designs. However, as we don’t expect non-social scientists to be familiar with that methodology, we are simplifying it here into simple differences around a series of thresholds. If you feel comfortable working with regression discontinuity designs and feel it would be easier for you to answer that way, please feel free to do so.)

# function to test p value
get_p_value <- function(data, threshold) {
  
  temp_data <- data %>%
    mutate(income_group = ifelse(income < threshold, "Below", "Above"))
  
  pay_sum <- temp_data %>%
    group_by(income_group) %>%
    summarise(
      total_people = n(),
      total_paid   = sum(paid_on_time),
      .groups = "drop"
    )
  
  # skipping if one group missing
  if (nrow(pay_sum) < 2) return(NA)
  
  test <- prop.test(
    x = pay_sum$total_paid,
    n = pay_sum$total_people,
    correct = FALSE
  )
  
  return(test$p.value)
}
# running threshold
thresholds <- seq(5000, 50000, by = 5000)

p_values <- sapply(thresholds, function(x) {
  get_p_value(policy_data, x)
})

results <- data.frame(
  income_threshold = thresholds,
  p_value = p_values
)

kable(data.frame(p_values= p_values), caption= "List of P Values")
List of P Values
p_values
0.6049941
0.2497395
0.0000179
0.0000000
0.0000000
0.0000000
0.9102625
0.0000000
0.0000000
0.0000000
# plot p values

plot_pvalues <- ggplot(results, aes(x = income_threshold, y = p_value)) +
  geom_line(color = "#2C6E9B", linewidth = 1) +
  geom_point(color = "#2C6E9B", size = 3) +
  
  # significance line
  geom_hline(yintercept = 0.05, linetype = "dashed",
             color = "firebrick", linewidth = 0.8) +
  
  # label for alpha
  annotate("text", x = 5000, y = 0.07, label = "α = 0.05",
           color = "firebrick", hjust = 0, size = 4) +
  
  # axes
  scale_x_continuous(
    breaks = thresholds,
    labels = label_dollar(scale = 1/1000, suffix = "k")
  ) +
  scale_y_continuous(
    limits = c(0, 1),
    breaks = seq(0, 1, 0.1)
  ) +
  
  labs(
    title = "P-values Across Income Thresholds",
    subtitle = "Difference in probability of paying on time (below vs above threshold)",
    x = "Income Threshold",
    y = "P-value"
  ) +
  
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.minor = element_blank()
  )

print(plot_pvalues)

# save plot
#ggsave("pvalue_by_threshold.png", plot = plot_pvalues,width = 8, height = 5, dpi = 150)

Question 10: In 3-5 sentences, explain to your colleague, Leon Mait, whether or not the new reduced payment policy had a detectable impact on residents’ likelihood of having an overdue ticket. What problems might this analysis have in capturing the causal effect of the policy? How would you suggest overcoming those problems? Assume that Leon is generally familiar with the policy and the data.

The results suggest that there are statistically significant differences in the probability of paying on time at several income thresholds (many p-values are < 0.05), which indicates that income level is associated with payment behavior. However, the pattern is not consistent across all thresholds incomes tested, and there are spikes around $35k, where the difference is not significant, suggesting instability in the effect. This means we cannot confidently conclude that the reduced payment policy itself caused the change in overdue rates.

A key limitation is that this analysis is purely descriptive and does not control for confounding factors such as differences in demographics, enforcement patterns, or underlying ability to pay. People just below and above the threshold may differ systematically in ways unrelated to the policy.

Section 3: Data Visualization

Question 11: We are interested in assessing your data visualization skills. Using any dataset available on opendata.dc.gov, make a visualization that you think is interesting or illuminating in some way. Your visualization should be clear, accessible, and professional. (Note: in subsequent hiring stages, you should be prepared to share why you chose to make this visualization and how you generated it.)

dataset source: Bicycle Lanes Metadata source: Metadata Bicycle Lane

# Reading bicycle lanes data
bicycle <- read.csv("data/Bicycle_Lanes.csv")

# print head of data
kable(head(bicycle, 10), caption = "Printing first 10 row of dataset")
Printing first 10 row of dataset
ROUTEID ROUTENAME ROADTYPE SUBBLOCKKEY BIKELANE_PARKINGLANE_ADJACENT BIKELANE_THROUGHLANE_ADJACENT BIKELANE_POCKETLANE_ADJACENT BIKELANE_CONTRAFLOW BIKELANE_CONVENTIONAL BIKELANE_DUAL_PROTECTED BIKELANE_DUAL_BUFFERED BIKELANE_PROTECTED BIKELANE_BUFFERED SUBBLOCKID BLOCKID BLOCKKEY QUADRANT STREETNAME STREETTYPE TOTALBIKELANES TOTALBIKELANEWIDTH WARD_ID SMD_ID ANC_ID OBJECTID SHAPELEN
11000302 3RD ST NW 1 62ee7cc40df1189eb13995b3a1b44812 IB IB IB 11000302-11000302_11076242-11000302_47015042 11000302-11000302_11076242-11000302_11065452_11079882 d24a9caf2b4db11382b6172d41484fe5 1 3RD ST 1 5 4 4B03 4B 10237121 0
11000302 3RD ST NW 1 2445dcb917e30b8935246eb42189a665 BD BD BD 11000302-11000302_11086842-11000302_58023592 11000302-11000302_11086842-11000302_11087422 4cb5ba2ff947bbbb82770ea071f63ae3 1 3RD ST 2 10 4 4B03 4B 10237134 0
11000302 3RD ST NW 1 9ad16a536e3a8edb388ec0ce951c5e63 IB IB IB 11000302-11000302_47015042-11000302_11065452_11079882 11000302-11000302_11076242-11000302_11065452_11079882 d24a9caf2b4db11382b6172d41484fe5 1 3RD ST 1 5 4 4B03 4B 10237175 0
11000302 3RD ST NW 1 62cd96d7bcd68d42f2d13529e19f542f BD BD BD 11000302-11000302_11065452_11079882-11000302_11086842 11000302-11000302_11065452_11079882-11000302_11086842 62cd96d7bcd68d42f2d13529e19f542f 1 3RD ST 2 10 4 4B03 4B 10237212 0
11000302 3RD ST NW 1 22c1ec2999731a290796fb9664491948 BD BD BD 11000302-11000302_11087422-11000302_11088792 11000302-11000302_11087422-11000302_11088792 22c1ec2999731a290796fb9664491948 1 3RD ST 2 10 4 4B03 4B 10237213 0
11000402 4TH ST NW 1 8e801d825787a035b408230dbe869ff4 OB 11000402-11000402_11033082-11000402_11066102 11000402-11000402_11033082-11000402_11066102 8e801d825787a035b408230dbe869ff4 1 4TH ST 1 5 1 1B01 1B 10237220 0
11000402 4TH ST NW 1 45092fe2113bf60f171431f8e9f0f3f9 OB OB 11000402-11000402_11019632-11000402_11016382 11000402-11000402_11019632-11000402_11016382 45092fe2113bf60f171431f8e9f0f3f9 1 4TH ST 1 5 4 4B04 4B 10237222 0
11000402 4TH ST NW 1 7eff870e38d4c76c1df1f08e827fa4ca OB 11000402-11000402_11090152-11000402_11018722 11000402-11000402_11090152-11000402_11018722 7eff870e38d4c76c1df1f08e827fa4ca 1 4TH ST 1 5 1 1E07 1E 10237223 0
11000402 4TH ST NW 1 1fb1e4c9ad5cbbb62ec65f8a245f9941 OB 11000402-11000402_11087232-11000402_11033082 11000402-11000402_11087232-11000402_11033082 1fb1e4c9ad5cbbb62ec65f8a245f9941 1 4TH ST 1 5 1 1B01 1B 10237241 0
11000402 4TH ST NW 1 de1e8797ef55bfc8215bbf4100126979 OB 11000402-11000402_47015622_47039622-11000402_11090152 11000402-11000402_11088462-11000402_11090152 30298022971c113640b720129156b018 1 4TH ST 1 5 1 1B01 1B 10237254 0
# get data overview
#glimpse(bicycle)

Goal of this visualization is to have a ward-by-ward look of the DC on-streets Bike lanes from the saftey lenses.

# filter data to streets only (ROADTYPE=1, street only)
bike_streets <-bicycle %>% 
  filter(ROADTYPE==1)

# pulling saftey variable
is_present <- function(x) nchar(trimws(x)) > 0

bike_streets <- bike_streets %>%
  mutate(
    # has any separation (protected or buffered)
    has_sep = is_present(BIKELANE_PROTECTED) |
              is_present(BIKELANE_DUAL_PROTECTED) |
              is_present(BIKELANE_BUFFERED) |
              is_present(BIKELANE_DUAL_BUFFERED),

    # adjacent to parking (dooring risk)
    parking_adj = is_present(BIKELANE_PARKINGLANE_ADJACENT), ward_label = paste0("Ward ", WARD_ID))
# summarise by ward

ward_summary <- bike_streets %>%
  group_by(ward_label) %>%
  summarise(
    total_segments = n(),
    percent_sep        = round(mean(has_sep) * 100, 1),
    percent_parking    = round(mean(parking_adj) * 100, 1),
    .groups = "drop"
  ) %>%
  arrange(percent_sep) %>% 
  mutate(ward_label = factor(ward_label, levels = ward_label))

kable((ward_summary), caption=" Summary of ward data")
Summary of ward data
ward_label total_segments percent_sep percent_parking
Ward 6 581 18.2 75.9
Ward 4 249 18.9 80.3
Ward 8 60 23.3 68.3
Ward 3 109 25.7 56.9
Ward 7 184 35.3 58.7
Ward 1 325 35.7 60.0
Ward 2 576 37.0 58.0
Ward 5 238 48.3 51.7
# pivot longer data for plotting

plot_data <- ward_summary %>%
  select(ward_label,total_segments, percent_sep, percent_parking) %>% 
  pivot_longer(
    cols = c(percent_sep, percent_parking),
    names_to = "metric",
    values_to = "percent"
  ) %>%
  mutate(
    metric_label = case_when(
      metric == "percent_sep"~ "Has some separation\n(protected or buffered)",
      metric == "percent_parking" ~ "Adjacent to parked cars\n(dooring risk)"
    ),
    metric_label = factor(metric_label, levels = c(
      "Has some separation\n(protected or buffered)",
      "Adjacent to parked cars\n(dooring risk)"
    )),
    # Color: green for separation (good), red for dooring risk (bad)
    dot_color = ifelse(metric == "percent_sep", "#1a6b3c", "#c0392b")
  )

kable((plot_data), caption ="Plot data for Bicycle lane")
Plot data for Bicycle lane
ward_label total_segments metric percent metric_label dot_color
Ward 6 581 percent_sep 18.2 Has some separation
(protected or buffered) #1a6b3c
Ward 6 581 percent_parking 75.9 Adjacent to parked cars
(dooring risk) #c0392b
Ward 4 249 percent_sep 18.9 Has some separation
(protected or buffered) #1a6b3c
Ward 4 249 percent_parking 80.3 Adjacent to parked cars
(dooring risk) #c0392b
Ward 8 60 percent_sep 23.3 Has some separation
(protected or buffered) #1a6b3c
Ward 8 60 percent_parking 68.3 Adjacent to parked cars
(dooring risk) #c0392b
Ward 3 109 percent_sep 25.7 Has some separation
(protected or buffered) #1a6b3c
Ward 3 109 percent_parking 56.9 Adjacent to parked cars
(dooring risk) #c0392b
Ward 7 184 percent_sep 35.3 Has some separation
(protected or buffered) #1a6b3c
Ward 7 184 percent_parking 58.7 Adjacent to parked cars
(dooring risk) #c0392b
Ward 1 325 percent_sep 35.7 Has some separation
(protected or buffered) #1a6b3c
Ward 1 325 percent_parking 60.0 Adjacent to parked cars
(dooring risk) #c0392b
Ward 2 576 percent_sep 37.0 Has some separation
(protected or buffered) #1a6b3c
Ward 2 576 percent_parking 58.0 Adjacent to parked cars
(dooring risk) #c0392b
Ward 5 238 percent_sep 48.3 Has some separation
(protected or buffered) #1a6b3c
Ward 5 238 percent_parking 51.7 Adjacent to parked cars
(dooring risk) #c0392b
# segment count labels

n_labels <- ward_summary %>% 
  mutate(
    metric_label =factor(
      "Has some separation\n(protected or buffered)",
      levels = c(
        "Has some separation\n(protected or buffered)",
        "Adjacent to parked cars\n(dooring risk)"
      )
    )
  )
# plot graph

p <- ggplot(plot_data, aes(x = percent, y = ward_label)) +
  geom_segment(
    aes(x = 0, xend = percent, yend = ward_label),
    color     = "gray88",
    linewidth = 0.8
  ) +
  geom_point(aes(color = dot_color), size = 5) +
  geom_text(
    aes(label = paste0(percent, "%")),
    hjust      = -0.45,
    size       = 3.2,
    color      = "gray20",
    fontface   = "bold"
  ) +
  geom_text(
    data = n_labels,
    aes(x = -2, y = ward_label, label = paste0("n=", total_segments)),
    hjust        = 1,
    size         = 2.7,
    color        = "gray55",
    inherit.aes  = FALSE
  ) +
  scale_color_identity() +
  scale_x_continuous(
    limits = c(-14, 88),
    breaks = c(0, 25, 50, 75),
    labels = function(x) paste0(x, "%")
  ) +
  facet_wrap(~ metric_label, ncol = 2) +
  labs(
    title    = "How Safe Are DC Bike Lanes? A Ward-by-Ward Look",
    subtitle = paste0(
      "Street segments only (n = 2,322)  |  ",
      "Wards ordered by share with separation (low to high)\n",
      "Separation = lane has physical barrier or painted buffer between bike and traffic\n",
      "Dooring risk = bike lane sits next to a parking lane ",
      "(opened car doors hit cyclists)"
    ),
    x        = "Share of bike lane segments in that ward (%)",
    y        = NULL,
    caption  = paste0(
      "Source: DC Open Data — [Bicycle Lanes](https://opendata.dc.gov/datasets/DCGIS::bicycle-lanes/about)(DDOT, updated weekly)"
    )
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title         = element_text(face = "bold", size = 13),
    plot.subtitle      = element_text(size = 8.5, color = "gray40",
                                      lineheight = 1.4),
    plot.caption       = element_text(size = 7.5, color = "gray55"),
    strip.text         = element_text(face = "bold", size = 11,
                                      lineheight = 1.3),
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_line(color = "gray93"),
    axis.text.y        = element_text(size = 10, face = "bold"),
    axis.text.x        = element_text(size = 9),
    panel.spacing      = unit(2.2, "lines")
  )

print(p)

# save plot
#ggsave("bike_lanes_safety.png", plot = p, width = 10, height = 5)

Question 12: Compress and upload your code/script(s) used to answer all questions (e.g., .R or .Rmd; .py or .ipynb; .qmd; .txt file) in a zip file (it is easier for our system to accept zip files). Please ensure the file and the file name do not include your name or other identifying information.

Submitting .Rmd file and html knit file with all the answers.