###### PROLOG ########
# PROLOG ###
# PROJECT: The Lab @ DC Performance Task
# PURPOSE: Performance Task
# DIR: C:\Users\kesha\Downloads\thelab_dc\thelab_task
# DATA: C:\Users\kesha\Downloads\thelab_dc\thelab_task\data
# AUTHOR: Masked
# CREATED: Apr 15, 2026
# LATEST: Apr 15, 2026
# NOTES: Data sources is provided with the task
# PROLOG ###
# install packages and open libraries
# dplyr for data wrangling
# tidyverse for core packages
# ggplot2 for plots
# writing results to excel
# ggthemes for tuft theme
#latexpdf for converting table to pdf or png
# kableextra for table output
# stringr for string manipulation
# lubridate for date/time parsing
# scales for axis label formatting
# tibble for tibble()
pacman::p_load(dplyr, tidyverse, ggplot2, ggthemes, writexl, kableExtra, readxl, stringr, lubridate, scales, tibble, latexpdf)
# plot theme
theme_set(theme_tufte()) # but might not carry over in chunks
# Okabe-Ito colorblind-friendly color palette:
# https://jfly.uni-koeln.de/color/
oi_pal <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442",
"#0072B2", "#D55E00", "#CC79A7", "#999999")
# # Clear existing variables from the environment
# rm(list = ls ())
# # Set the working directory to the project folder
# setwd("C:\Users\kesha\Downloads\thelab_dc\thelab_task")
Data Source 1. Recent traffic stops by Metropolitan Police Department (MPD) officers (traffic stops data): stops_2026
# read crimes data
crime <- read_csv("data/crime_2026.csv")
# view data from top 10 rows
kable(head(crime, 10), caption="Crime datasets")
| X | Y | CCN | REPORT_DAT | SHIFT | METHOD | OFFENSE | BLOCK | XBLOCK | YBLOCK | WARD | ANC | DISTRICT | PSA | NEIGHBORHOOD_CLUSTER | BLOCK_GROUP | CENSUS_TRACT | VOTING_PRECINCT | LATITUDE | LONGITUDE | BID | START_DATE | END_DATE | OBJECTID | OCTO_RECORD_ID |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| -77.05866 | 38.90685 | 7093973 | 2017-06-03 12:54:27 | DAY | OTHERS | THEFT FROM AUTO | 2900 - 2999 BLOCK OF N STREET NW | 394912 | 137666 | 2 | 2E | 2 | 206 | Cluster 4 | 000100 4 | 100 | Precinct 5 | 38.90684 | -77.05866 | NA | 2017-06-03 11:54:03 | 2017-06-03 11:55:05 | 341397622 | 07093973-01 |
| -77.01804 | 38.96075 | 9251904 | 2017-04-07 00:00:00 | MIDNIGHT | OTHERS | HOMICIDE | 5800 - 5899 BLOCK OF 4TH STREET NW | 398437 | 143648 | 4 | 4B | 4 | 402 | Cluster 17 | 001902 1 | 1902 | Precinct 58 | 38.96074 | -77.01803 | NA | 2009-02-13 12:03:45 | 2009-02-13 12:03:43 | 341453326 | 09251904-01 |
| -76.98495 | 38.91515 | 14060710 | 2017-03-13 00:00:00 | MIDNIGHT | OTHERS | HOMICIDE | 2000 - 2099 BLOCK OF FENWICK STREET NE | 401305 | 138586 | 5 | 5D | 5 | 506 | Cluster 23 | 008803 1 | 8803 | Precinct 76 | 38.91514 | -76.98495 | NA | 2014-05-01 19:32:00 | 2014-05-01 19:32:00 | 341600002 | 14060710-01 |
| -77.00972 | 38.82439 | 16217337 | 2017-04-03 00:00:00 | MIDNIGHT | OTHERS | HOMICIDE | 2 - 153 BLOCK OF GALVESTON PLACE SW | 399156 | 128511 | 8 | 8D | 7 | 708 | Cluster 39 | 009807 2 | 9807 | Precinct 126 | 38.82438 | -77.00972 | NA | 2016-12-25 11:59:35 | 2016-12-25 20:36:17 | 341698884 | 16217337-01 |
| -77.01720 | 38.90453 | 17037020 | 2017-03-05 20:46:01 | EVENING | OTHERS | THEFT FROM AUTO | 400 - 471 BLOCK OF NEW YORK AVENUE NW | 398508 | 137407 | 6 | 6E | 3 | 308 | Cluster 8 | 004701 2 | 4701 | Precinct 1 | 38.90452 | -77.01720 | MOUNT VERNON TRIANGLE CID | 2017-03-05 20:14:30 | NA | 341703531 | 17037020-01 |
| -76.97611 | 38.87293 | 17037041 | 2017-03-05 23:23:03 | MIDNIGHT | OTHERS | MOTOR VEHICLE THEFT | 2100 - 2149 BLOCK OF YOUNG STREET SE | 402073 | 133900 | 8 | 8A | 6 | 607 | Cluster 34 | 007601 1 | 7601 | Precinct 133 | 38.87292 | -76.97611 | NA | 2017-03-05 20:46:11 | 2017-03-05 21:45:13 | 341703532 | 17037041-01 |
| -77.02140 | 38.91557 | 17037050 | 2017-03-05 22:27:38 | EVENING | OTHERS | THEFT/OTHER | 620 - 699 BLOCK OF T STREET NW | 398144 | 138633 | 1 | 1B | 3 | 306 | Cluster 3 | 004801 1 | 4801 | Precinct 137 | 38.91556 | -77.02140 | NA | 2017-03-05 20:30:08 | 2017-03-05 20:45:13 | 341703533 | 17037050-01 |
| -77.00879 | 38.94933 | 17037051 | 2017-03-05 22:24:01 | EVENING | GUN | ROBBERY | 4700 - 4931 BLOCK OF NORTH CAPITOL STREET | 399238 | 142381 | 4 | 4D | 4 | 405 | Cluster 19 | 002202 1 | 2202 | Precinct 57 | 38.94933 | -77.00879 | NA | 2017-03-05 20:50:16 | 2017-03-05 21:30:52 | 341703534 | 17037051-01 |
| -76.95661 | 38.92216 | 17037085 | 2017-03-05 23:59:44 | MIDNIGHT | OTHERS | MOTOR VEHICLE THEFT | 3279 3499 BLOCK OF FORT LINCOLN DRIVE NE | 403763 | 139365 | 5 | 5C | 5 | 503 | Cluster 24 | 009000 1 | 9000 | Precinct 139 | 38.92215 | -76.95661 | NA | 2017-03-05 12:00:01 | 2017-03-05 22:30:40 | 341703535 | 17037085-01 |
| -76.96778 | 38.87011 | 17037086 | 2017-03-06 05:09:27 | MIDNIGHT | OTHERS | THEFT/OTHER | 1600 - 1699 BLOCK OF 28TH STREET SE | 402796 | 133587 | 7 | 7B | 6 | 607 | Cluster 34 | 007604 1 | 7604 | Precinct 111 | 38.87010 | -76.96778 | NA | 2017-03-05 23:22:21 | 2017-03-06 00:37:35 | 341703536 | 17037086-01 |
# read traffic stops data
stops <- read_csv("data/stops_2026.csv")
# view data from top 10 rows
kable(head(stops, 10), caption="Traffic Stops Dataset")
| stop_id | officer_id | stop_date | driver_age | driver_sex | ticket |
|---|---|---|---|---|---|
| 52595 | E | 3/1/2017 | 37 | f | TRUE |
| 46141 | D | 3/1/2017 | 16 | m | TRUE |
| 154271 | D | 3/1/2017 | 16 | m | TRUE |
| 184170 | D | 3/1/2017 | 23 | m | TRUE |
| 91155 | D | 3/1/2017 | 18 | f | TRUE |
| 9776 | E | 3/1/2017 | 36 | f | TRUE |
| 64687 | B | 3/1/2017 | 52 | m | TRUE |
| 35922 | D | 3/1/2017 | 17 | f | FALSE |
| 49723 | D | 3/1/2017 | 31 | m | TRUE |
| 176992 | B | 3/1/2017 | 45 | f | FALSE |
#(glimpse(crime))
#(glimpse(stops))
Please create a new column based on REPORT_DAT column of
the crime incidents data that rounds the exact timestamp for the crime
down to the date. Call this column report_daily.
# create new column in crime data
crime <- crime %>%
mutate(report_daily= as.Date(ymd_hms(REPORT_DAT)))
# check new column created
kable(head(crime %>% select(REPORT_DAT, report_daily), 10))
| REPORT_DAT | report_daily |
|---|---|
| 2017-06-03 12:54:27 | 2017-06-03 |
| 2017-04-07 00:00:00 | 2017-04-07 |
| 2017-03-13 00:00:00 | 2017-03-13 |
| 2017-04-03 00:00:00 | 2017-04-03 |
| 2017-03-05 20:46:01 | 2017-03-05 |
| 2017-03-05 23:23:03 | 2017-03-05 |
| 2017-03-05 22:27:38 | 2017-03-05 |
| 2017-03-05 22:24:01 | 2017-03-05 |
| 2017-03-05 23:59:44 | 2017-03-05 |
| 2017-03-06 05:09:27 | 2017-03-06 |
Similarly, create a new dataset, stops_by_day, that gives the total count of traffic stops per day. For the purposes of aggregating, you can consider each unique stop_id as reflecting a different stop.
# Remove duplicate OBJECTIDs from crime data
crime_dedup <- crime %>%
distinct(OBJECTID, .keep_all = TRUE)
# view number of row before and after deduplication
kable(data.frame(No_of_rows=nrow(crime)), caption= "Number of rows in Crime dataset")
| No_of_rows |
|---|
| 16519 |
kable(data.frame(No_of_rows=nrow(crime_dedup)), caption= "Number of unique rows in Crime dataset")
| No_of_rows |
|---|
| 16504 |
kable(data.frame(No_of_rows=nrow(crime)-nrow(crime_dedup)), caption= "Number of duplicates in crime data")
| No_of_rows |
|---|
| 15 |
# Remove duplicate stop_ids from stops data
stops_dedup <- stops %>%
distinct(stop_id, .keep_all = TRUE)
# view number of row before and after deduplication
kable(data.frame(No_of_rows=nrow(stops)), caption= "Number of rows in traffic stops dataset")
| No_of_rows |
|---|
| 81129 |
kable(data.frame(No_of_rows=nrow(stops_dedup)), caption= "Number of unique rows in traffic stops dataset")
| No_of_rows |
|---|
| 80946 |
kable(data.frame(No_of_rows=nrow(stops)-nrow(stops_dedup)), caption= "Number of duplicates in Traffic stops dataset")
| No_of_rows |
|---|
| 183 |
# creating unique crime count per day
crimes_by_day <- crime_dedup %>%
group_by(report_daily) %>%
summarise(n_crimes = n(), .groups = "drop") %>%
arrange(report_daily)
# check the new crime by day dataset create
kable(data.frame(crime_by_day=head(crimes_by_day, 10)), caption= "Crime by Day")
| crime_by_day.report_daily | crime_by_day.n_crimes |
|---|---|
| 2017-03-01 | 80 |
| 2017-03-02 | 59 |
| 2017-03-03 | 84 |
| 2017-03-04 | 77 |
| 2017-03-05 | 61 |
| 2017-03-06 | 88 |
| 2017-03-07 | 70 |
| 2017-03-08 | 89 |
| 2017-03-09 | 69 |
| 2017-03-10 | 95 |
# Parse stop_date similar to crime report daily format
stops_dedup <- stops_dedup %>%
mutate(stop_date_parsed = mdy(stop_date))
# creating unique traffic stops count per day
stops_by_day <- stops_dedup %>%
group_by(stop_date_parsed) %>%
summarise(n_stops = n(), .groups = "drop") %>%
arrange(stop_date_parsed)
# check new stop by day dataset created
kable(data.frame(stop_by_day =head(stops_by_day, 10)), caption="Traffic stops by day")
| stop_by_day.stop_date_parsed | stop_by_day.n_stops |
|---|---|
| 2017-03-01 | 509 |
| 2017-03-02 | 421 |
| 2017-03-03 | 413 |
| 2017-03-04 | 475 |
| 2017-03-05 | 490 |
| 2017-03-06 | 449 |
| 2017-03-07 | 447 |
| 2017-03-08 | 450 |
| 2017-03-09 | 447 |
| 2017-03-10 | 435 |
# create date sequence for crimes
crime_range <- seq(min(crimes_by_day$report_daily),
max(crimes_by_day$report_daily),
by="day")
crime_missing <- crime_range[!crime_range%in%crimes_by_day$report_daily]
# view results
# number of missing days
kable(data.frame(no_of_missing=length(crime_missing)), caption="Total number of missing in crime data")
| no_of_missing |
|---|
| 0 |
# first few missing dates
kable(data.frame(missing_crime =head(crime_missing, 10)), caption="List of missing in crime data")
| missing_crime |
|---|
# create date sequence for stops
stop_range <- seq(min(stops_by_day$stop_date_parsed),
max(stops_by_day$stop_date_parsed),
by = "day")
stop_missing <- stop_range[!stop_range %in%stops_by_day$stop_date_parsed]
# view results
# number of missing days
kable(data.frame(total_missing=length(stop_missing)), caption="Total number of missing in traffic stop data")
| total_missing |
|---|
| 8 |
# first few missing dates
kable(data.frame(missing_dates=head(stop_missing, 10)), caption="List of missing in traffic stop data")
| missing_dates |
|---|
| 2017-03-30 |
| 2017-03-31 |
| 2017-04-30 |
| 2017-05-30 |
| 2017-05-31 |
| 2017-06-30 |
| 2017-07-30 |
| 2017-07-31 |
# replace for missing in crime data
crimes_by_day_full <- data.frame(report_daily = crime_range) %>%
left_join(crimes_by_day, by = "report_daily") %>%
mutate(n_crimes = ifelse(is.na(n_crimes), 0, n_crimes))
# replace for missing in stop data
stops_by_day_full <- data.frame(stop_date_parsed = stop_range) %>%
left_join(stops_by_day, by = "stop_date_parsed") %>%
mutate(n_stops = ifelse(is.na(n_stops), 0, n_stops))
combined_data <- crimes_by_day_full %>%
rename(date = report_daily) %>%
left_join(
stops_by_day_full %>% rename(date = stop_date_parsed),
by = "date"
)
max_stops <- combined_data %>%
filter(n_stops == max(n_stops, na.rm = TRUE)) %>%
mutate(day_of_week = weekdays(date))
kable((max_stops), caption="Most traffic stops by date and count")
| date | n_crimes | n_stops | day_of_week |
|---|---|---|---|
| 2017-05-01 | 89 | 535 | Monday |
max_crimes <- combined_data %>%
filter(n_crimes == max(n_crimes, na.rm = TRUE)) %>%
mutate(day_of_week = weekdays(date))
kable((max_crimes), caption="Most crime incident by date and count")
| date | n_crimes | n_stops | day_of_week |
|---|---|---|---|
| 2017-05-08 | 122 | 410 | Monday |
| 2017-05-29 | 122 | 481 | Monday |
kable(data.frame(Day_of_week=(max_stops$day_of_week)), caption ="Max traffic stops day of week")
| Day_of_week |
|---|
| Monday |
kable(data.frame(Day_of_week=(max_crimes$day_of_week)), caption="Max crime incident day of week")
| Day_of_week |
|---|
| Monday |
| Monday |
july4_offense<- crime_dedup %>%
mutate(offense_list = case_when(
OFFENSE%in% c("THEFT OTHER", "THEFT_OTHER", "THEFT/OTHER") ~ "THEFT/OTHER",
TRUE ~ OFFENSE
))
july4_max_offense <- july4_offense %>%
filter(format(report_daily, "%m-%d") == "07-04") %>%
group_by(offense_list) %>%
summarise(count=n(), .groups ="drop") %>%
arrange(desc(count))
# view result
kable((july4_max_offense), caption="Count of Most common offense on July 4th")
| offense_list | count |
|---|---|
| THEFT/OTHER | 33 |
| THEFT FROM AUTO | 22 |
| MOTOR VEHICLE THEFT | 13 |
| ROBBERY | 7 |
| ASSAULT W/DANGEROUS WEAPON | 6 |
| BURGLARY | 2 |
Most common crime offense type committed on july 4th is Theft/other = 33 followed by Theft from auto =22.
Suppose the DC government enacted a policy that allows people with low incomes (less than $30,000 per year) to have their ticket amount reduced by 20 to 40 percent. The goal of the new policy is to reduce the number of overdue tickets among low-income residents.
In this section, you’ll use the same traffic stop data as in Section 1. You’ll also need to download the following files: 1. Data on the court outcomes of tickets: Courts Data 2. Data on residents’ income: Income Data
Note: The ticket amounts come in $0.50 denominations.
considering “50” present in some status column as discount flag
# parse with readlines()
raw_lines <- readLines("C:/Users/kesha/Downloads/CourtsData1.csv")
parse_courts_row <- function(line) {
fields <- strsplit(line, ",")[[1]]
if (length(fields) == 3) {
data.frame(
stop_id = as.integer(fields[1]),
ticket_amount = as.numeric(fields[2]),
discount_flag = NA_character_,
ticket_status = trimws(fields[3]),
stringsAsFactors = FALSE
)
} else if (length(fields) == 4) {
data.frame(
stop_id = as.integer(fields[1]),
ticket_amount = as.numeric(fields[2]),
discount_flag = trimws(fields[3]),
ticket_status = trimws(fields[4]),
stringsAsFactors = FALSE
)
} else {
NULL
}
}
courts_clean <- do.call(rbind, lapply(raw_lines[-1], parse_courts_row))
kable(data.frame(No_of_row =nrow(courts_clean)), caption="number of rows in courts clean dataset")
| No_of_row |
|---|
| 60257 |
# print ticket frequency table
kable(table(courts_clean$ticket_status, useNA = "always"), caption="Ticket status Frequency")
| Var1 | Freq |
|---|---|
| challenged | 5471 |
| overdue | 32689 |
| paid | 19306 |
| pending | 2791 |
| NA | 0 |
#read data
income <- read.csv("C:/Users/kesha/Downloads/IncomeData.csv", stringsAsFactors = FALSE)
# view head of data
kable(head(income, 10), caption="Top 10 row of income dataset")
| stop_id | income |
|---|---|
| 97294 | 26305 |
| 87077 | 77578 |
| 57432 | 80185 |
| 121675 | 98784 |
| 37238 | 9350 |
| 199246 | 31981 |
| 191139 | 42301 |
| 153865 | 52278 |
| 90007 | 21719 |
| 111167 | 34801 |
(Note: in a real project, we would set this up as a regression discontinuity design. However, as we don’t expect non-social scientists to be familiar with that methodology, we are simplifying it here into a simple difference around a threshold. If you feel comfortable working with regression discontinuity designs and feel it would be easier for you to answer that way, please feel free to do so.)
# restricting to tickets that were actually issued
tickets_issued <- stops_dedup %>%
filter(ticket == TRUE) %>%
select(stop_id)
policy_data <- courts_clean %>%
inner_join(income, by = "stop_id") %>%
inner_join(tickets_issued, by = "stop_id") %>%
filter(ticket_status %in% c("paid", "overdue")) %>%
mutate(
paid_on_time = ifelse(ticket_status == "paid", 1,0),
income_group = ifelse(income <30000, "Below_30k", "Above_30k"))
# view data
kable(data.frame( No_of_row =nrow(policy_data)), caption="Number of rows in policy data")
| No_of_row |
|---|
| 37414 |
kable(table(policy_data$income_group), caption="Frequency for income above or below $30K")
| Var1 | Freq |
|---|---|
| Above_30k | 24318 |
| Below_30k | 13096 |
pay_sum <- policy_data %>%
group_by(income_group) %>%
summarise(
total_people = n(),
total_paid = sum(paid_on_time),
prob_paid = mean(paid_on_time), # probability
.groups = "drop"
)
kable((pay_sum), caption="Probability of paying on time by income group")
| income_group | total_people | total_paid | prob_paid |
|---|---|---|---|
| Above_30k | 24318 | 8492 | 0.3492063 |
| Below_30k | 13096 | 5453 | 0.4163867 |
prob_low_income <- pay_sum %>% filter(income_group == "Below_30k") %>% pull(prob_paid)
prob_high_income <- pay_sum %>% filter(income_group == "Above_30k") %>% pull(prob_paid)
prob_difference <- prob_low_income - prob_high_income
kable(data.frame(Probability= round(prob_low_income, 4)), caption="Probability of paying on time (Income <$30,000)")
| Probability |
|---|
| 0.4164 |
kable(data.frame(probability=round(prob_high_income, 4)), caption="Probability of paying on time (Income >=$30,000)")
| probability |
|---|
| 0.3492 |
kable(data.frame(Probability_difference=round(prob_difference, 4)), caption="Difference in Probability (low income - high income)")
| Probability_difference |
|---|
| 0.0672 |
count_low <- pay_sum %>% filter(income_group == "Below_30k") %>% pull(total_people)
count_high <- pay_sum %>% filter(income_group == "Above_30k") %>% pull(total_people)
paid_low <- pay_sum %>% filter(income_group == "Below_30k") %>% pull(total_paid)
paid_high <- pay_sum %>% filter(income_group == "Above_30k") %>% pull(total_paid)
test_result <- prop.test(
x =c(paid_low, paid_high),
n =c(count_low, count_high),
correct =FALSE
)
# print test result
cat("\n--- Two-proportion test result ---\n")
— Two-proportion test result —
print(test_result)
2-sample test for equality of proportions without continuity correction
data: c(paid_low, paid_high) out of c(count_low, count_high) X-squared = 164.31, df = 1, p-value < 2.2e-16 alternative hypothesis: two.sided 95 percent confidence interval: 0.05682747 0.07753320 sample estimates: prop 1 prop 2 0.4163867 0.3492063
The estimated effect is a 6.72 percentage point increase in the probability of paying a ticket on time for individuals with incomes below $30,000 compared to those above the threshold. This is a moderate but meaningful difference, especially in a policy context where even small increases in compliance can translate into substantial reductions in overdue tickets at the population level. The result suggests that lowering financial burden may improve timely payment behavior among lower-income individuals, supporting the rationale behind the discount policy. However, since this analysis is descriptive and not causal, therefore further evaluation would be needed.
You realize that, since that could involve copying and pasting the code many times, it would be more efficient to write a function that:
Please write a function to that effect, and then use the results to create a plot where the x axis is the income threshold (again, $5,000 to $50,000, in $5,000 increments) and the y axis is the p-value for the test on the differences in the probability of paying on time around that income threshold.
(Note: As in Question 7, if this was a real project, we would set this up as a sequence of regression discontinuity designs. However, as we don’t expect non-social scientists to be familiar with that methodology, we are simplifying it here into simple differences around a series of thresholds. If you feel comfortable working with regression discontinuity designs and feel it would be easier for you to answer that way, please feel free to do so.)
# function to test p value
get_p_value <- function(data, threshold) {
temp_data <- data %>%
mutate(income_group = ifelse(income < threshold, "Below", "Above"))
pay_sum <- temp_data %>%
group_by(income_group) %>%
summarise(
total_people = n(),
total_paid = sum(paid_on_time),
.groups = "drop"
)
# skipping if one group missing
if (nrow(pay_sum) < 2) return(NA)
test <- prop.test(
x = pay_sum$total_paid,
n = pay_sum$total_people,
correct = FALSE
)
return(test$p.value)
}
# running threshold
thresholds <- seq(5000, 50000, by = 5000)
p_values <- sapply(thresholds, function(x) {
get_p_value(policy_data, x)
})
results <- data.frame(
income_threshold = thresholds,
p_value = p_values
)
kable(data.frame(p_values= p_values), caption= "List of P Values")
| p_values |
|---|
| 0.6049941 |
| 0.2497395 |
| 0.0000179 |
| 0.0000000 |
| 0.0000000 |
| 0.0000000 |
| 0.9102625 |
| 0.0000000 |
| 0.0000000 |
| 0.0000000 |
# plot p values
plot_pvalues <- ggplot(results, aes(x = income_threshold, y = p_value)) +
geom_line(color = "#2C6E9B", linewidth = 1) +
geom_point(color = "#2C6E9B", size = 3) +
# significance line
geom_hline(yintercept = 0.05, linetype = "dashed",
color = "firebrick", linewidth = 0.8) +
# label for alpha
annotate("text", x = 5000, y = 0.07, label = "α = 0.05",
color = "firebrick", hjust = 0, size = 4) +
# axes
scale_x_continuous(
breaks = thresholds,
labels = label_dollar(scale = 1/1000, suffix = "k")
) +
scale_y_continuous(
limits = c(0, 1),
breaks = seq(0, 1, 0.1)
) +
labs(
title = "P-values Across Income Thresholds",
subtitle = "Difference in probability of paying on time (below vs above threshold)",
x = "Income Threshold",
y = "P-value"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.minor = element_blank()
)
print(plot_pvalues)
# save plot
#ggsave("pvalue_by_threshold.png", plot = plot_pvalues,width = 8, height = 5, dpi = 150)
The results suggest that there are statistically significant differences in the probability of paying on time at several income thresholds (many p-values are < 0.05), which indicates that income level is associated with payment behavior. However, the pattern is not consistent across all thresholds incomes tested, and there are spikes around $35k, where the difference is not significant, suggesting instability in the effect. This means we cannot confidently conclude that the reduced payment policy itself caused the change in overdue rates.
A key limitation is that this analysis is purely descriptive and does not control for confounding factors such as differences in demographics, enforcement patterns, or underlying ability to pay. People just below and above the threshold may differ systematically in ways unrelated to the policy.
Submitting .Rmd file and html knit file with all the answers.