The Lab @ DC Performance Task

###### PROLOG ########

# PROLOG   ###
# PROJECT: The Lab @ DC Performance Task                                  
# PURPOSE: Performance Task                                   
# DIR:     C:\Users\kesha\Downloads\thelab_dc\thelab_task                
# DATA:    C:\Users\kesha\Downloads\thelab_dc\thelab_task\data           
# AUTHOR:  Masked                                          
# CREATED: Apr 15, 2026                                           
# LATEST:  Apr 15, 2026                                      
# NOTES:   Data sources is provided with the task

# PROLOG   ### 

# install packages and open libraries

# dplyr for data wrangling
# tidyverse for core packages
# ggplot2 for plots
# writing results to excel
# ggthemes for tuft theme
#latexpdf for converting table to pdf or png
# kableextra for table output
# stringr for string manipulation
# lubridate for date/time parsing
# scales for axis label formatting
# tibble for tibble()

pacman::p_load(dplyr, tidyverse, ggplot2, ggthemes, writexl, kableExtra, readxl, stringr, lubridate, scales, tibble, latexpdf)

# plot theme
theme_set(theme_tufte())  # but might not carry over in chunks

# Okabe-Ito colorblind-friendly color palette:
# https://jfly.uni-koeln.de/color/

oi_pal <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", 
     "#0072B2", "#D55E00", "#CC79A7", "#999999")

# # Clear existing variables from the environment
# rm(list = ls ())
# # Set the working directory to the project folder
# setwd("C:\Users\kesha\Downloads\thelab_dc\thelab_task")

Data Source 1. Recent traffic stops by Metropolitan Police Department (MPD) officers (traffic stops data): stops_2026

Crime incidents reported to MPD (crime incidents data): crime_2026

Section 1. Data Manipulation

Question 1: Load the data and display a few rows of each dataset.

# read crimes data 
crime <- read_csv("data/crime_2026.csv")

# view data from top 10 rows
kable(head(crime, 10), caption="Crime datasets")

Crime datasets
X	Y	CCN	REPORT_DAT	SHIFT	METHOD	OFFENSE	BLOCK	XBLOCK	YBLOCK	WARD	ANC	DISTRICT	PSA	NEIGHBORHOOD_CLUSTER	BLOCK_GROUP	CENSUS_TRACT	VOTING_PRECINCT	LATITUDE	LONGITUDE	BID	START_DATE	END_DATE	OBJECTID	OCTO_RECORD_ID
-77.05866	38.90685	7093973	2017-06-03 12:54:27	DAY	OTHERS	THEFT FROM AUTO	2900 - 2999 BLOCK OF N STREET NW	394912	137666	2	2E	2	206	Cluster 4	000100 4	100	Precinct 5	38.90684	-77.05866	NA	2017-06-03 11:54:03	2017-06-03 11:55:05	341397622	07093973-01
-77.01804	38.96075	9251904	2017-04-07 00:00:00	MIDNIGHT	OTHERS	HOMICIDE	5800 - 5899 BLOCK OF 4TH STREET NW	398437	143648	4	4B	4	402	Cluster 17	001902 1	1902	Precinct 58	38.96074	-77.01803	NA	2009-02-13 12:03:45	2009-02-13 12:03:43	341453326	09251904-01
-76.98495	38.91515	14060710	2017-03-13 00:00:00	MIDNIGHT	OTHERS	HOMICIDE	2000 - 2099 BLOCK OF FENWICK STREET NE	401305	138586	5	5D	5	506	Cluster 23	008803 1	8803	Precinct 76	38.91514	-76.98495	NA	2014-05-01 19:32:00	2014-05-01 19:32:00	341600002	14060710-01
-77.00972	38.82439	16217337	2017-04-03 00:00:00	MIDNIGHT	OTHERS	HOMICIDE	2 - 153 BLOCK OF GALVESTON PLACE SW	399156	128511	8	8D	7	708	Cluster 39	009807 2	9807	Precinct 126	38.82438	-77.00972	NA	2016-12-25 11:59:35	2016-12-25 20:36:17	341698884	16217337-01
-77.01720	38.90453	17037020	2017-03-05 20:46:01	EVENING	OTHERS	THEFT FROM AUTO	400 - 471 BLOCK OF NEW YORK AVENUE NW	398508	137407	6	6E	3	308	Cluster 8	004701 2	4701	Precinct 1	38.90452	-77.01720	MOUNT VERNON TRIANGLE CID	2017-03-05 20:14:30	NA	341703531	17037020-01
-76.97611	38.87293	17037041	2017-03-05 23:23:03	MIDNIGHT	OTHERS	MOTOR VEHICLE THEFT	2100 - 2149 BLOCK OF YOUNG STREET SE	402073	133900	8	8A	6	607	Cluster 34	007601 1	7601	Precinct 133	38.87292	-76.97611	NA	2017-03-05 20:46:11	2017-03-05 21:45:13	341703532	17037041-01
-77.02140	38.91557	17037050	2017-03-05 22:27:38	EVENING	OTHERS	THEFT/OTHER	620 - 699 BLOCK OF T STREET NW	398144	138633	1	1B	3	306	Cluster 3	004801 1	4801	Precinct 137	38.91556	-77.02140	NA	2017-03-05 20:30:08	2017-03-05 20:45:13	341703533	17037050-01
-77.00879	38.94933	17037051	2017-03-05 22:24:01	EVENING	GUN	ROBBERY	4700 - 4931 BLOCK OF NORTH CAPITOL STREET	399238	142381	4	4D	4	405	Cluster 19	002202 1	2202	Precinct 57	38.94933	-77.00879	NA	2017-03-05 20:50:16	2017-03-05 21:30:52	341703534	17037051-01
-76.95661	38.92216	17037085	2017-03-05 23:59:44	MIDNIGHT	OTHERS	MOTOR VEHICLE THEFT	3279 3499 BLOCK OF FORT LINCOLN DRIVE NE	403763	139365	5	5C	5	503	Cluster 24	009000 1	9000	Precinct 139	38.92215	-76.95661	NA	2017-03-05 12:00:01	2017-03-05 22:30:40	341703535	17037085-01
-76.96778	38.87011	17037086	2017-03-06 05:09:27	MIDNIGHT	OTHERS	THEFT/OTHER	1600 - 1699 BLOCK OF 28TH STREET SE	402796	133587	7	7B	6	607	Cluster 34	007604 1	7604	Precinct 111	38.87010	-76.96778	NA	2017-03-05 23:22:21	2017-03-06 00:37:35	341703536	17037086-01

# read traffic stops data
stops <- read_csv("data/stops_2026.csv")

# view data from top 10 rows
kable(head(stops, 10), caption="Traffic Stops Dataset")

Traffic Stops Dataset
stop_id	officer_id	stop_date	driver_age	driver_sex	ticket
52595	E	3/1/2017	37	f	TRUE
46141	D	3/1/2017	16	m	TRUE
154271	D	3/1/2017	16	m	TRUE
184170	D	3/1/2017	23	m	TRUE
91155	D	3/1/2017	18	f	TRUE
9776	E	3/1/2017	36	f	TRUE
64687	B	3/1/2017	52	m	TRUE
35922	D	3/1/2017	17	f	FALSE
49723	D	3/1/2017	31	m	TRUE
176992	B	3/1/2017	45	f	FALSE

just to know data overview

#(glimpse(crime))
#(glimpse(stops))

Question 2: Crime incidents are reported down to the hour/minute, while stops are only reported down to the day.

Please create a new column based on REPORT_DAT column of the crime incidents data that rounds the exact timestamp for the crime down to the date. Call this column report_daily.

# create new column in crime data
crime <- crime %>% 
  mutate(report_daily= as.Date(ymd_hms(REPORT_DAT)))

# check new column created
kable(head(crime %>% select(REPORT_DAT, report_daily), 10))

REPORT_DAT	report_daily
2017-06-03 12:54:27	2017-06-03
2017-04-07 00:00:00	2017-04-07
2017-03-13 00:00:00	2017-03-13
2017-04-03 00:00:00	2017-04-03
2017-03-05 20:46:01	2017-03-05
2017-03-05 23:23:03	2017-03-05
2017-03-05 22:27:38	2017-03-05
2017-03-05 22:24:01	2017-03-05
2017-03-05 23:59:44	2017-03-05
2017-03-06 05:09:27	2017-03-06

Question 3: Create a new dataset, crimes_by_day, that gives the total count of crime incidents per day. For the purposes of aggregating, you can consider each unique OBJECTID as reflecting a different crime.

Similarly, create a new dataset, stops_by_day, that gives the total count of traffic stops per day. For the purposes of aggregating, you can consider each unique stop_id as reflecting a different stop.

# Remove duplicate OBJECTIDs from crime data
crime_dedup <- crime %>% 
  distinct(OBJECTID, .keep_all = TRUE)

# view number of row before and after deduplication
kable(data.frame(No_of_rows=nrow(crime)), caption= "Number of rows in Crime dataset")

Number of rows in Crime dataset
No_of_rows
16519

kable(data.frame(No_of_rows=nrow(crime_dedup)), caption= "Number of unique rows in Crime dataset")

Number of unique rows in Crime dataset
No_of_rows
16504

kable(data.frame(No_of_rows=nrow(crime)-nrow(crime_dedup)), caption= "Number of duplicates in crime data")

Number of duplicates in crime data
No_of_rows
15

# Remove duplicate stop_ids from stops data
stops_dedup <- stops %>% 
  distinct(stop_id, .keep_all = TRUE)

# view number of row before and after deduplication
kable(data.frame(No_of_rows=nrow(stops)), caption= "Number of rows in traffic stops dataset")

Number of rows in traffic stops dataset
No_of_rows
81129

kable(data.frame(No_of_rows=nrow(stops_dedup)), caption= "Number of unique rows in traffic stops dataset")

Number of unique rows in traffic stops dataset
No_of_rows
80946

kable(data.frame(No_of_rows=nrow(stops)-nrow(stops_dedup)), caption= "Number of duplicates in Traffic stops dataset")

Number of duplicates in Traffic stops dataset
No_of_rows
183

creating new datasets

# creating unique crime count per day
crimes_by_day <- crime_dedup %>%
  group_by(report_daily) %>%
  summarise(n_crimes = n(), .groups = "drop") %>%
  arrange(report_daily)
# check the new crime by day dataset create
kable(data.frame(crime_by_day=head(crimes_by_day, 10)), caption= "Crime by Day")

Crime by Day
crime_by_day.report_daily	crime_by_day.n_crimes
2017-03-01	80
2017-03-02	59
2017-03-03	84
2017-03-04	77
2017-03-05	61
2017-03-06	88
2017-03-07	70
2017-03-08	89
2017-03-09	69
2017-03-10	95

# Parse stop_date similar to crime report daily format 
stops_dedup <- stops_dedup %>%
  mutate(stop_date_parsed = mdy(stop_date))

# creating unique traffic stops count per day
stops_by_day <- stops_dedup %>%
  group_by(stop_date_parsed) %>%
  summarise(n_stops = n(), .groups = "drop") %>%
  arrange(stop_date_parsed)

# check new stop by day dataset created
kable(data.frame(stop_by_day =head(stops_by_day, 10)), caption="Traffic stops by day")

Traffic stops by day
stop_by_day.stop_date_parsed	stop_by_day.n_stops
2017-03-01	509
2017-03-02	421
2017-03-03	413
2017-03-04	475
2017-03-05	490
2017-03-06	449
2017-03-07	447
2017-03-08	450
2017-03-09	447
2017-03-10	435

Question 4: For both the traffic stops_by_day and crimes_by_day datasets: check if there are days missing in the period measured. If so, fill in the missing data in the manner you deem best suited for this problem and briefly justify your decision.

# create date sequence for crimes
crime_range <- seq(min(crimes_by_day$report_daily),
                   max(crimes_by_day$report_daily),
                   by="day")

crime_missing <- crime_range[!crime_range%in%crimes_by_day$report_daily]

# view results
# number of missing days
kable(data.frame(no_of_missing=length(crime_missing)), caption="Total number of missing in crime data")

Total number of missing in crime data
no_of_missing
0

# first few missing dates
kable(data.frame(missing_crime =head(crime_missing, 10)), caption="List of missing in crime data")

List of missing in crime data
missing_crime

# create date sequence for stops
stop_range <- seq(min(stops_by_day$stop_date_parsed),
                       max(stops_by_day$stop_date_parsed),
                       by = "day")
stop_missing <- stop_range[!stop_range %in%stops_by_day$stop_date_parsed]

# view results
# number of missing days
kable(data.frame(total_missing=length(stop_missing)), caption="Total number of missing in traffic stop data")

Total number of missing in traffic stop data
total_missing
8

# first few missing dates
kable(data.frame(missing_dates=head(stop_missing, 10)), caption="List of missing in traffic stop data")

List of missing in traffic stop data
missing_dates
2017-03-30
2017-03-31
2017-04-30
2017-05-30
2017-05-31
2017-06-30
2017-07-30
2017-07-31

fill in missing dates with ZERO because missing dates means no events recorded, not missing data collection. Replacing 0 will preserve continuity for time-series analysis or time-trend plots.

# replace for missing in crime data
crimes_by_day_full <- data.frame(report_daily = crime_range) %>%
  left_join(crimes_by_day, by = "report_daily") %>%
  mutate(n_crimes = ifelse(is.na(n_crimes), 0, n_crimes))

# replace for missing in stop data
stops_by_day_full <- data.frame(stop_date_parsed = stop_range) %>%
  left_join(stops_by_day, by = "stop_date_parsed") %>%
  mutate(n_stops = ifelse(is.na(n_stops), 0, n_stops))

Question 5: Merge the traffic stops by day data with the crimes by day data. Use code to calculate and report

combined_data <- crimes_by_day_full %>%
  rename(date = report_daily) %>%
  left_join(
    stops_by_day_full %>% rename(date = stop_date_parsed),
    by = "date"
  )

a) which specific date has the most traffic stops and how many stops there were,

max_stops <- combined_data %>% 
  filter(n_stops == max(n_stops, na.rm = TRUE)) %>% 
  mutate(day_of_week = weekdays(date))

kable((max_stops), caption="Most traffic stops by date and count")

Most traffic stops by date and count
date	n_crimes	n_stops	day_of_week
2017-05-01	89	535	Monday

b) which specific date has the most crime incidents and how many crimes there were, and

max_crimes <- combined_data %>% 
  filter(n_crimes == max(n_crimes, na.rm = TRUE)) %>% 
  mutate(day_of_week = weekdays(date))

kable((max_crimes), caption="Most crime incident by date and count")

Most crime incident by date and count
date	n_crimes	n_stops	day_of_week
2017-05-08	122	410	Monday
2017-05-29	122	481	Monday

c) what day of the week each of those dates was.

kable(data.frame(Day_of_week=(max_stops$day_of_week)), caption ="Max traffic stops day of week")

Max traffic stops day of week
Day_of_week
Monday

kable(data.frame(Day_of_week=(max_crimes$day_of_week)), caption="Max crime incident day of week")

Max crime incident day of week
Day_of_week
Monday
Monday

Question 6: What was the most common crime offense type committed on July 4th? Report both the type of offense and the count.

coalesce three variants of Theft Other

july4_offense<- crime_dedup %>% 
  mutate(offense_list = case_when(
    OFFENSE%in% c("THEFT OTHER", "THEFT_OTHER", "THEFT/OTHER") ~ "THEFT/OTHER",
    TRUE ~ OFFENSE
  ))

count the most common offense

july4_max_offense <- july4_offense %>% 
  filter(format(report_daily, "%m-%d") == "07-04") %>% 
  group_by(offense_list) %>% 
  summarise(count=n(), .groups ="drop") %>% 
  arrange(desc(count))

# view result
kable((july4_max_offense), caption="Count of Most common offense on July 4th")

Count of Most common offense on July 4th
offense_list	count
THEFT/OTHER	33
THEFT FROM AUTO	22
MOTOR VEHICLE THEFT	13
ROBBERY	7
ASSAULT W/DANGEROUS WEAPON	6
BURGLARY	2

Most common crime offense type committed on july 4th is Theft/other = 33 followed by Theft from auto =22.

Section 2. Policy Evaluation

Suppose the DC government enacted a policy that allows people with low incomes (less than $30,000 per year) to have their ticket amount reduced by 20 to 40 percent. The goal of the new policy is to reduce the number of overdue tickets among low-income residents.

In this section, you’ll use the same traffic stop data as in Section 1. You’ll also need to download the following files: 1. Data on the court outcomes of tickets: Courts Data 2. Data on residents’ income: Income Data

Note: The ticket amounts come in $0.50 denominations.

reading courts data first

considering “50” present in some status column as discount flag

# parse with readlines()
raw_lines <- readLines("C:/Users/kesha/Downloads/CourtsData1.csv")

parse_courts_row <- function(line) {
  fields <- strsplit(line, ",")[[1]]
  if (length(fields) == 3) {
    data.frame(
      stop_id        = as.integer(fields[1]),
      ticket_amount  = as.numeric(fields[2]),
      discount_flag  = NA_character_,
      ticket_status  = trimws(fields[3]),
      stringsAsFactors = FALSE
    )
  } else if (length(fields) == 4) {
    data.frame(
      stop_id        = as.integer(fields[1]),
      ticket_amount  = as.numeric(fields[2]),
      discount_flag  = trimws(fields[3]),   
      ticket_status  = trimws(fields[4]),
      stringsAsFactors = FALSE
    )
  } else {
    NULL
  }
}


courts_clean <- do.call(rbind, lapply(raw_lines[-1], parse_courts_row))

kable(data.frame(No_of_row =nrow(courts_clean)), caption="number of rows in courts clean dataset")

number of rows in courts clean dataset
No_of_row
60257

# print ticket frequency table
kable(table(courts_clean$ticket_status, useNA = "always"), caption="Ticket status Frequency")

Ticket status Frequency
Var1	Freq
challenged	5471
overdue	32689
paid	19306
pending	2791
NA	0

laod income data

#read data
income <- read.csv("C:/Users/kesha/Downloads/IncomeData.csv", stringsAsFactors = FALSE)

# view head of data
kable(head(income, 10), caption="Top 10 row of income dataset")

Top 10 row of income dataset
stop_id	income
97294	26305
87077	77578
57432	80185
121675	98784
37238	9350
199246	31981
191139	42301
153865	52278
90007	21719
111167	34801

Question 7: Find the difference between the probability of paying a ticket on time for people with incomes less than $30,000 and those with incomes greater than or equal to $30,000.

(Note: in a real project, we would set this up as a regression discontinuity design. However, as we don’t expect non-social scientists to be familiar with that methodology, we are simplifying it here into a simple difference around a threshold. If you feel comfortable working with regression discontinuity designs and feel it would be easier for you to answer that way, please feel free to do so.)

# restricting to tickets that were actually issued 
tickets_issued <- stops_dedup %>%
  filter(ticket == TRUE) %>%
  select(stop_id)

creating dataset for policy analysis

policy_data <- courts_clean %>%
  inner_join(income, by = "stop_id") %>%
  inner_join(tickets_issued, by = "stop_id") %>%
  filter(ticket_status %in% c("paid", "overdue")) %>%
  mutate(
    paid_on_time = ifelse(ticket_status == "paid", 1,0),
    income_group = ifelse(income <30000, "Below_30k", "Above_30k"))

# view data
kable(data.frame( No_of_row =nrow(policy_data)), caption="Number of rows in policy data")

Number of rows in policy data
No_of_row
37414

kable(table(policy_data$income_group), caption="Frequency for income above or below $30K")

Frequency for income above or below $30K
Var1	Freq
Above_30k	24318
Below_30k	13096

probability of paying on time by income group

pay_sum <- policy_data %>%
  group_by(income_group) %>%
  summarise(
    total_people           = n(),
    total_paid      = sum(paid_on_time),
    prob_paid      = mean(paid_on_time), # probability
    .groups     = "drop"
  )
kable((pay_sum), caption="Probability of paying on time by income group")

Probability of paying on time by income group
income_group	total_people	total_paid	prob_paid
Above_30k	24318	8492	0.3492063
Below_30k	13096	5453	0.4163867

Difference: low income minus high income

prob_low_income  <- pay_sum %>% filter(income_group == "Below_30k")  %>% pull(prob_paid)
prob_high_income <- pay_sum %>% filter(income_group == "Above_30k") %>% pull(prob_paid)
prob_difference <- prob_low_income - prob_high_income

print result of probability

kable(data.frame(Probability= round(prob_low_income, 4)), caption="Probability of paying on time (Income <$30,000)")

Probability of paying on time (Income <$30,000)
Probability
0.4164

kable(data.frame(probability=round(prob_high_income, 4)), caption="Probability of paying on time (Income >=$30,000)")

Probability of paying on time (Income >=$30,000)
probability
0.3492

kable(data.frame(Probability_difference=round(prob_difference, 4)), caption="Difference in Probability (low income - high income)")

Difference in Probability (low income - high income)
Probability_difference
0.0672

Two proportion z-test

count_low <- pay_sum %>% filter(income_group == "Below_30k") %>% pull(total_people)
count_high <- pay_sum  %>% filter(income_group == "Above_30k") %>%  pull(total_people)


paid_low   <- pay_sum %>% filter(income_group == "Below_30k") %>% pull(total_paid)
paid_high  <- pay_sum %>% filter(income_group == "Above_30k") %>% pull(total_paid)


test_result <- prop.test(
  x =c(paid_low, paid_high),
  n =c(count_low, count_high),
  correct =FALSE
)

# print test result
cat("\n--- Two-proportion test result ---\n")

— Two-proportion test result —

print(test_result)

2-sample test for equality of proportions without continuity correction

data: c(paid_low, paid_high) out of c(count_low, count_high) X-squared = 164.31, df = 1, p-value < 2.2e-16 alternative hypothesis: two.sided 95 percent confidence interval: 0.05682747 0.07753320 sample estimates: prop 1 prop 2 0.4163867 0.3492063

Q7_answer= The probability of paying a ticket on time for individuals with income less than $30,000 is 0.4164 (41.64%), while for individuals with income greater than or equal to $30,000 it is 0.3492 (34.92%).

The difference in probability is 0.0672 (6.72 percentage points), indicating that lower-income individuals have a higher probability of paying their tickets on time in this dataset.

A two-proportion test shows that this difference is statistically significant (p-value < 0.001), with a 95% confidence interval of [0.0568, 0.0775].

Question 8: What is the magnitude of your results? How are they relevant for informing policy? Discuss in 3-4 sentences.

The estimated effect is a 6.72 percentage point increase in the probability of paying a ticket on time for individuals with incomes below $30,000 compared to those above the threshold. This is a moderate but meaningful difference, especially in a policy context where even small increases in compliance can translate into substantial reductions in overdue tickets at the population level. The result suggests that lowering financial burden may improve timely payment behavior among lower-income individuals, supporting the rationale behind the discount policy. However, since this analysis is descriptive and not causal, therefore further evaluation would be needed.

Question 9: A colleague is wondering if we see differences in the probability of paying on time at different income thresholds. They want you to plot the distribution of the p-values for a test that looks at differences in the probability of paying on time at incomes below and above $5,000; below and above $10,000; and so on for every $5,000 increment up to $50,000.

You realize that, since that could involve copying and pasting the code many times, it would be more efficient to write a function that:

Takes as inputs the data and a vector of income thresholds
Runs the test from the previous step
Returns the p-value of the test

Please write a function to that effect, and then use the results to create a plot where the x axis is the income threshold (again, $5,000 to $50,000, in $5,000 increments) and the y axis is the p-value for the test on the differences in the probability of paying on time around that income threshold.

(Note: As in Question 7, if this was a real project, we would set this up as a sequence of regression discontinuity designs. However, as we don’t expect non-social scientists to be familiar with that methodology, we are simplifying it here into simple differences around a series of thresholds. If you feel comfortable working with regression discontinuity designs and feel it would be easier for you to answer that way, please feel free to do so.)

# function to test p value
get_p_value <- function(data, threshold) {
  
  temp_data <- data %>%
    mutate(income_group = ifelse(income < threshold, "Below", "Above"))
  
  pay_sum <- temp_data %>%
    group_by(income_group) %>%
    summarise(
      total_people = n(),
      total_paid   = sum(paid_on_time),
      .groups = "drop"
    )
  
  # skipping if one group missing
  if (nrow(pay_sum) < 2) return(NA)
  
  test <- prop.test(
    x = pay_sum$total_paid,
    n = pay_sum$total_people,
    correct = FALSE
  )
  
  return(test$p.value)
}

# running threshold
thresholds <- seq(5000, 50000, by = 5000)

p_values <- sapply(thresholds, function(x) {
  get_p_value(policy_data, x)
})

results <- data.frame(
  income_threshold = thresholds,
  p_value = p_values
)

kable(data.frame(p_values= p_values), caption= "List of P Values")

List of P Values
p_values
0.6049941
0.2497395
0.0000179
0.0000000
0.0000000
0.0000000
0.9102625
0.0000000
0.0000000
0.0000000

# plot p values

plot_pvalues <- ggplot(results, aes(x = income_threshold, y = p_value)) +
  geom_line(color = "#2C6E9B", linewidth = 1) +
  geom_point(color = "#2C6E9B", size = 3) +
  
  # significance line
  geom_hline(yintercept = 0.05, linetype = "dashed",
             color = "firebrick", linewidth = 0.8) +
  
  # label for alpha
  annotate("text", x = 5000, y = 0.07, label = "α = 0.05",
           color = "firebrick", hjust = 0, size = 4) +
  
  # axes
  scale_x_continuous(
    breaks = thresholds,
    labels = label_dollar(scale = 1/1000, suffix = "k")
  ) +
  scale_y_continuous(
    limits = c(0, 1),
    breaks = seq(0, 1, 0.1)
  ) +
  
  labs(
    title = "P-values Across Income Thresholds",
    subtitle = "Difference in probability of paying on time (below vs above threshold)",
    x = "Income Threshold",
    y = "P-value"
  ) +
  
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.minor = element_blank()
  )

print(plot_pvalues)

# save plot
#ggsave("pvalue_by_threshold.png", plot = plot_pvalues,width = 8, height = 5, dpi = 150)

Question 10: In 3-5 sentences, explain to your colleague, Leon Mait, whether or not the new reduced payment policy had a detectable impact on residents’ likelihood of having an overdue ticket. What problems might this analysis have in capturing the causal effect of the policy? How would you suggest overcoming those problems? Assume that Leon is generally familiar with the policy and the data.

The results suggest that there are statistically significant differences in the probability of paying on time at several income thresholds (many p-values are < 0.05), which indicates that income level is associated with payment behavior. However, the pattern is not consistent across all thresholds incomes tested, and there are spikes around $35k, where the difference is not significant, suggesting instability in the effect. This means we cannot confidently conclude that the reduced payment policy itself caused the change in overdue rates.

A key limitation is that this analysis is purely descriptive and does not control for confounding factors such as differences in demographics, enforcement patterns, or underlying ability to pay. People just below and above the threshold may differ systematically in ways unrelated to the policy.

Section 3: Data Visualization

Question 11: We are interested in assessing your data visualization skills. Using any dataset available on opendata.dc.gov, make a visualization that you think is interesting or illuminating in some way. Your visualization should be clear, accessible, and professional. (Note: in subsequent hiring stages, you should be prepared to share why you chose to make this visualization and how you generated it.)

dataset source: Bicycle Lanes Metadata source: Metadata Bicycle Lane

# Reading bicycle lanes data
bicycle <- read.csv("data/Bicycle_Lanes.csv")

# print head of data
kable(head(bicycle, 10), caption = "Printing first 10 row of dataset")

Printing first 10 row of dataset
ROUTEID	ROUTENAME	ROADTYPE	SUBBLOCKKEY	BIKELANE_PARKINGLANE_ADJACENT	BIKELANE_THROUGHLANE_ADJACENT	BIKELANE_CONTRAFLOW	BIKELANE_CONVENTIONAL	BIKELANE_PROTECTED	BIKELANE_BUFFERED	SUBBLOCKID	BLOCKID	BLOCKKEY	QUADRANT	STREETNAME	STREETTYPE	TOTALBIKELANES	TOTALBIKELANEWIDTH	WARD_ID	SMD_ID	ANC_ID	OBJECTID
11000302	3RD ST NW	1	62ee7cc40df1189eb13995b3a1b44812	IB	IB		IB			11000302-11000302_11076242-11000302_47015042	11000302-11000302_11076242-11000302_11065452_11079882	d24a9caf2b4db11382b6172d41484fe5	1	3RD	ST	1	5	4	4B03	4B	10237121
11000302	3RD ST NW	1	2445dcb917e30b8935246eb42189a665	BD	BD		BD			11000302-11000302_11086842-11000302_58023592	11000302-11000302_11086842-11000302_11087422	4cb5ba2ff947bbbb82770ea071f63ae3	1	3RD	ST	2	10	4	4B03	4B	10237134
11000302	3RD ST NW	1	9ad16a536e3a8edb388ec0ce951c5e63	IB	IB		IB			11000302-11000302_47015042-11000302_11065452_11079882	11000302-11000302_11076242-11000302_11065452_11079882	d24a9caf2b4db11382b6172d41484fe5	1	3RD	ST	1	5	4	4B03	4B	10237175
11000302	3RD ST NW	1	62cd96d7bcd68d42f2d13529e19f542f	BD	BD		BD			11000302-11000302_11065452_11079882-11000302_11086842	11000302-11000302_11065452_11079882-11000302_11086842	62cd96d7bcd68d42f2d13529e19f542f	1	3RD	ST	2	10	4	4B03	4B	10237212
11000302	3RD ST NW	1	22c1ec2999731a290796fb9664491948	BD	BD		BD			11000302-11000302_11087422-11000302_11088792	11000302-11000302_11087422-11000302_11088792	22c1ec2999731a290796fb9664491948	1	3RD	ST	2	10	4	4B03	4B	10237213
11000402	4TH ST NW	1	8e801d825787a035b408230dbe869ff4						OB	11000402-11000402_11033082-11000402_11066102	11000402-11000402_11033082-11000402_11066102	8e801d825787a035b408230dbe869ff4	1	4TH	ST	1	5	1	1B01	1B	10237220
11000402	4TH ST NW	1	45092fe2113bf60f171431f8e9f0f3f9			OB		OB		11000402-11000402_11019632-11000402_11016382	11000402-11000402_11019632-11000402_11016382	45092fe2113bf60f171431f8e9f0f3f9	1	4TH	ST	1	5	4	4B04	4B	10237222
11000402	4TH ST NW	1	7eff870e38d4c76c1df1f08e827fa4ca						OB	11000402-11000402_11090152-11000402_11018722	11000402-11000402_11090152-11000402_11018722	7eff870e38d4c76c1df1f08e827fa4ca	1	4TH	ST	1	5	1	1E07	1E	10237223
11000402	4TH ST NW	1	1fb1e4c9ad5cbbb62ec65f8a245f9941						OB	11000402-11000402_11087232-11000402_11033082	11000402-11000402_11087232-11000402_11033082	1fb1e4c9ad5cbbb62ec65f8a245f9941	1	4TH	ST	1	5	1	1B01	1B	10237241
11000402	4TH ST NW	1	de1e8797ef55bfc8215bbf4100126979						OB	11000402-11000402_47015622_47039622-11000402_11090152	11000402-11000402_11088462-11000402_11090152	30298022971c113640b720129156b018	1	4TH	ST	1	5	1	1B01	1B	10237254

# get data overview
#glimpse(bicycle)

Goal of this visualization is to have a ward-by-ward look of the DC on-streets Bike lanes from the saftey lenses.

# filter data to streets only (ROADTYPE=1, street only)
bike_streets <-bicycle %>% 
  filter(ROADTYPE==1)

# pulling saftey variable
is_present <- function(x) nchar(trimws(x)) > 0

bike_streets <- bike_streets %>%
  mutate(
    # has any separation (protected or buffered)
    has_sep = is_present(BIKELANE_PROTECTED) |
              is_present(BIKELANE_DUAL_PROTECTED) |
              is_present(BIKELANE_BUFFERED) |
              is_present(BIKELANE_DUAL_BUFFERED),

    # adjacent to parking (dooring risk)
    parking_adj = is_present(BIKELANE_PARKINGLANE_ADJACENT), ward_label = paste0("Ward ", WARD_ID))

# summarise by ward

ward_summary <- bike_streets %>%
  group_by(ward_label) %>%
  summarise(
    total_segments = n(),
    percent_sep        = round(mean(has_sep) * 100, 1),
    percent_parking    = round(mean(parking_adj) * 100, 1),
    .groups = "drop"
  ) %>%
  arrange(percent_sep) %>% 
  mutate(ward_label = factor(ward_label, levels = ward_label))

kable((ward_summary), caption=" Summary of ward data")

Summary of ward data
ward_label	total_segments	percent_sep	percent_parking
Ward 6	581	18.2	75.9
Ward 4	249	18.9	80.3
Ward 8	60	23.3	68.3
Ward 3	109	25.7	56.9
Ward 7	184	35.3	58.7
Ward 1	325	35.7	60.0
Ward 2	576	37.0	58.0
Ward 5	238	48.3	51.7

# pivot longer data for plotting

plot_data <- ward_summary %>%
  select(ward_label,total_segments, percent_sep, percent_parking) %>% 
  pivot_longer(
    cols = c(percent_sep, percent_parking),
    names_to = "metric",
    values_to = "percent"
  ) %>%
  mutate(
    metric_label = case_when(
      metric == "percent_sep"~ "Has some separation\n(protected or buffered)",
      metric == "percent_parking" ~ "Adjacent to parked cars\n(dooring risk)"
    ),
    metric_label = factor(metric_label, levels = c(
      "Has some separation\n(protected or buffered)",
      "Adjacent to parked cars\n(dooring risk)"
    )),
    # Color: green for separation (good), red for dooring risk (bad)
    dot_color = ifelse(metric == "percent_sep", "#1a6b3c", "#c0392b")
  )

kable((plot_data), caption ="Plot data for Bicycle lane")

Plot data for Bicycle lane
ward_label	total_segments	metric	percent	metric_label
Ward 6	581	percent_sep	18.2	Has some separation
(protected or buffered)	#1a6b3c
Ward 6	581	percent_parking	75.9	Adjacent to parked cars
(dooring risk)	#c0392b
Ward 4	249	percent_sep	18.9	Has some separation
(protected or buffered)	#1a6b3c
Ward 4	249	percent_parking	80.3	Adjacent to parked cars
(dooring risk)	#c0392b
Ward 8	60	percent_sep	23.3	Has some separation
(protected or buffered)	#1a6b3c
Ward 8	60	percent_parking	68.3	Adjacent to parked cars
(dooring risk)	#c0392b
Ward 3	109	percent_sep	25.7	Has some separation
(protected or buffered)	#1a6b3c
Ward 3	109	percent_parking	56.9	Adjacent to parked cars
(dooring risk)	#c0392b
Ward 7	184	percent_sep	35.3	Has some separation
(protected or buffered)	#1a6b3c
Ward 7	184	percent_parking	58.7	Adjacent to parked cars
(dooring risk)	#c0392b
Ward 1	325	percent_sep	35.7	Has some separation
(protected or buffered)	#1a6b3c
Ward 1	325	percent_parking	60.0	Adjacent to parked cars
(dooring risk)	#c0392b
Ward 2	576	percent_sep	37.0	Has some separation
(protected or buffered)	#1a6b3c
Ward 2	576	percent_parking	58.0	Adjacent to parked cars
(dooring risk)	#c0392b
Ward 5	238	percent_sep	48.3	Has some separation
(protected or buffered)	#1a6b3c
Ward 5	238	percent_parking	51.7	Adjacent to parked cars
(dooring risk)	#c0392b

# segment count labels

n_labels <- ward_summary %>% 
  mutate(
    metric_label =factor(
      "Has some separation\n(protected or buffered)",
      levels = c(
        "Has some separation\n(protected or buffered)",
        "Adjacent to parked cars\n(dooring risk)"
      )
    )
  )

# plot graph

p <- ggplot(plot_data, aes(x = percent, y = ward_label)) +
  geom_segment(
    aes(x = 0, xend = percent, yend = ward_label),
    color     = "gray88",
    linewidth = 0.8
  ) +
  geom_point(aes(color = dot_color), size = 5) +
  geom_text(
    aes(label = paste0(percent, "%")),
    hjust      = -0.45,
    size       = 3.2,
    color      = "gray20",
    fontface   = "bold"
  ) +
  geom_text(
    data = n_labels,
    aes(x = -2, y = ward_label, label = paste0("n=", total_segments)),
    hjust        = 1,
    size         = 2.7,
    color        = "gray55",
    inherit.aes  = FALSE
  ) +
  scale_color_identity() +
  scale_x_continuous(
    limits = c(-14, 88),
    breaks = c(0, 25, 50, 75),
    labels = function(x) paste0(x, "%")
  ) +
  facet_wrap(~ metric_label, ncol = 2) +
  labs(
    title    = "How Safe Are DC Bike Lanes? A Ward-by-Ward Look",
    subtitle = paste0(
      "Street segments only (n = 2,322)  |  ",
      "Wards ordered by share with separation (low to high)\n",
      "Separation = lane has physical barrier or painted buffer between bike and traffic\n",
      "Dooring risk = bike lane sits next to a parking lane ",
      "(opened car doors hit cyclists)"
    ),
    x        = "Share of bike lane segments in that ward (%)",
    y        = NULL,
    caption  = paste0(
      "Source: DC Open Data — [Bicycle Lanes](https://opendata.dc.gov/datasets/DCGIS::bicycle-lanes/about)(DDOT, updated weekly)"
    )
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title         = element_text(face = "bold", size = 13),
    plot.subtitle      = element_text(size = 8.5, color = "gray40",
                                      lineheight = 1.4),
    plot.caption       = element_text(size = 7.5, color = "gray55"),
    strip.text         = element_text(face = "bold", size = 11,
                                      lineheight = 1.3),
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_line(color = "gray93"),
    axis.text.y        = element_text(size = 10, face = "bold"),
    axis.text.x        = element_text(size = 9),
    panel.spacing      = unit(2.2, "lines")
  )

print(p)

# save plot
#ggsave("bike_lanes_safety.png", plot = p, width = 10, height = 5)

Question 12: Compress and upload your code/script(s) used to answer all questions (e.g., .R or .Rmd; .py or .ipynb; .qmd; .txt file) in a zip file (it is easier for our system to accept zip files). Please ensure the file and the file name do not include your name or other identifying information.

Submitting .Rmd file and html knit file with all the answers.