#read in data
fars_accid_all <- readRDS("fars_accid_all.RDS")
fars_person_all <- readRDS("fars_person_all.RDS")
fars_race_all <- readRDS("fars_race_all.RDS")
switrs_all <- read_excel("switrs_all_fatal.xlsx")
agediff <- readRDS("fs_age.RDS")
# FARS cleaning & merging step
#filter to keep: CA, fatal, non-passengers
personCA <- fars_person_all%>%
filter(STATE==6 & INJ_SEV==4 & (PER_TYP!=2 & PER_TYP!=3 & PER_TYP!=9))
raceCA <- fars_race_all%>%
filter(STATE==6 & RACE!=0)
accidCA <- fars_accid_all%>%
filter(STATE==6)
#make a KEY variable for meging datasets
raceCA<-raceCA %>%
mutate(KEY=paste0(YEAR, ST_CASE,PER_NO,VEH_NO))
personCA<-personCA %>%
mutate(KEY=paste0(YEAR, ST_CASE,PER_NO,VEH_NO))
accidCA<-accidCA %>%
mutate(KEY2=paste0(YEAR, ST_CASE))
#remove duplicates
n_occur <- data.frame(table(raceCA$KEY))
dupes<-raceCA %>%
filter(KEY %in% unique(.[["KEY"]][duplicated(.[["KEY"]])]))
dupes<-dupes$KEY
raceKEY<-raceCA$KEY
raceCA_ndkey<-setdiff(raceKEY,dupes)
raceCA_nd<-raceCA[which(raceCA$KEY %in% raceCA_ndkey),]
#merge datasets
merge1<-left_join(personCA, raceCA_nd, by = "KEY")
merge1<-merge1 %>%
mutate(KEY2=paste0(YEAR.x, ST_CASE.x))
person_clean<-left_join(merge1, accidCA, by = "KEY2")
#create a date variable that matches SWITRS
person_clean$MONTH_F<-str_pad(person_clean$MONTH.x, 2, pad ="0")
person_clean$DAY_F<-str_pad(person_clean$DAY.x, 2, pad ="0")
person_clean<-person_clean%>%
mutate(COLLISION_TIME=paste0(HOUR.x,MINUTE.x))
person_clean<-person_clean%>%
mutate(COLLISION_DATE=paste0(YEAR.x,MONTH_F,DAY_F))
#make 1 race variable that captures all of the FARS years (there are slight differences in format over time)
person_clean <- person_clean %>%
mutate(across(RACE.y, coalesce, RACE.x))
person_clean<-person_clean %>%
mutate(RACE = RACE.y,
RACENAME = RACENAME.y)
#recode race data to line up with SWITRS
p_clean_cat<-mutate(person_clean,
race2 = case_when(
RACE == 1 & (HISPANIC ==7 | HISPANIC == 99)~4,
RACE == 2 & (HISPANIC ==7 | HISPANIC == 99)~2,
RACE == 3 & (HISPANIC ==7 | HISPANIC == 99)~5,
RACE == 4 & (HISPANIC ==7 | HISPANIC == 99)~1,
RACE == 5 & (HISPANIC ==7 | HISPANIC == 99)~1,
RACE == 6 & (HISPANIC ==7 | HISPANIC == 99)~5,
RACE == 7 & (HISPANIC ==7 | HISPANIC == 99)~1,
RACE == 18 & (HISPANIC ==7 | HISPANIC == 99)~1,
RACE == 19 & (HISPANIC ==7 | HISPANIC == 99)~5,
RACE == 28 & (HISPANIC ==7 | HISPANIC == 99)~1,
RACE == 38 & (HISPANIC ==7 | HISPANIC == 99)~5,
RACE == 48 & (HISPANIC ==7 | HISPANIC == 99)~1,
RACE == 58 & (HISPANIC ==7 | HISPANIC == 99)~1,
RACE == 68 & (HISPANIC ==7 | HISPANIC == 99)~1,
RACE == 78 & (HISPANIC ==7 | HISPANIC == 99)~1,
RACE == 97 & (HISPANIC ==7 | HISPANIC == 99)~5,
RACE == 98 & (HISPANIC ==7 | HISPANIC == 99)~5,
RACE == 99 & (HISPANIC ==7 )~5,
RACE == 99 & HISPANIC == 99 ~6,
RACE == 99 & HISPANIC == 1 ~3,
RACE ==99 ~ 3,
HISPANIC == 99 ~ 8,
HISPANIC == 1 ~ 3,
HISPANIC == 2 ~ 3,
HISPANIC == 3 ~ 3,
HISPANIC == 4 ~ 3,
HISPANIC == 6 ~ 3,
HISPANIC == 5 ~ 3
))
#remove missing race cases
p_clean_cat <- p_clean_cat%>%
filter(race2!=6 & race2!=8)
#match SWITRS format for race coding
p_clean_cat<-mutate(p_clean_cat,
race_name = case_when(race2==1 ~ 'A',
race2==2 ~ 'B',
race2==3 ~ 'H',
race2==4 ~ 'W',
race2==5 ~ 'O'))
#recode sex to match SWITRS
p_clean_cat<-mutate(p_clean_cat,
SEX2 = case_when(SEX==1 ~'M',
SEX==2 ~'F',
SEX==3 ~ 'X',
SEX==8 ~'-',
SEX==9 ~'-'))
#recode victim role to match SWITRS
p_clean_cat<-mutate(p_clean_cat,
VICTIM_ROLE = case_when(PER_TYP==1 ~ 1,
PER_TYP==2 ~ 2,
PER_TYP==4 ~ 5,
PER_TYP==5 ~ 3,
PER_TYP==6 ~ 4,
PER_TYP==7 ~ 4,
PER_TYP==8 ~ 5,
PER_TYP==10 ~ 5,
PER_TYP==11 ~ 5,
PER_TYP==12 ~ 5,
PER_TYP==13 ~ 5))
#fix up the string variables for the merge with SWITRS
p_clean_cat$COLLISION_DATE<-as.double(p_clean_cat$COLLISION_DATE)
p_clean_cat$COLLISION_TIME<-as.double(p_clean_cat$COLLISION_TIME)
p_clean_cat$COUNTYNAME<-gsub("\\s*\\([^\\)]+\\)", "", p_clean_cat$COUNTYNAME)
# READ IN SWITRS
# filter to 2018-2022 only
switrs_all<-switrs_all %>%
filter(COLLISION_YEAR_1 >=2018 & COLLISION_YEAR_1 <= 2022)
# only keep fatals, remove passengers
switrs_clean <- switrs_all%>%
filter(PARTY_RACE!='NA'& VICTIM_DEGREE_OF_INJURY_Vic==1 & VICTIM_ROLE!=2)
#recode SWITRS to make merge easier
switrs_clean<-mutate(switrs_clean, COUNTYNAME=CountyName)
switrs_clean$COUNTYNAME<-str_to_upper(switrs_clean$COUNTYNAME)
switrs_clean<-mutate(switrs_clean,
swit_race = case_when(PARTY_RACE=='A' ~ 1,
PARTY_RACE=='B' ~ 2,
PARTY_RACE=='H' ~ 3,
PARTY_RACE=='W' ~ 4,
PARTY_RACE=='O' ~ 5))
#MERGE STEP
# do initial strict merge on all variables
switrs_clean<-mutate(switrs_clean, FLAG=paste0(COUNTYNAME, COLLISION_DATE, COLLISION_TIME, VICTIM_ROLE, VICTIM_SEX, VICTIM_AGE))
p_clean_cat<-mutate(p_clean_cat, FLAG=paste0(COUNTYNAME, COLLISION_DATE, COLLISION_TIME, VICTIM_ROLE, SEX2, AGE))
fars_switrs<-merge(p_clean_cat, switrs_clean, by = "FLAG")
# do a merge without the age variable with the addition of the count of fatals (age has been found to create the most dropoff of cases)
cswitrs_clean<-mutate(switrs_clean, FLAG=paste0(COUNTYNAME, COLLISION_DATE, COLLISION_TIME, VICTIM_ROLE, VICTIM_SEX, KILLED_VICTIMS))
cp_clean_cat<-mutate(p_clean_cat, FLAG=paste0(COUNTYNAME, COLLISION_DATE, COLLISION_TIME, VICTIM_ROLE, SEX2, FATALS))
count_merge<-merge(cp_clean_cat, cswitrs_clean, by = "FLAG")
# get a difference score between FARS and SWITRS age variable
agecalc<-count_merge%>%
mutate(agediff = AGE - VICTIM_AGE)
# retain cases that are only 1 year apart
agekeep<-agecalc %>%
filter(agediff==1 | agediff==-1)
# also make sure its collisions with only 1 person
subset<-agekeep %>%
filter(FATALS==1)
#stack the new cases into the strict merge file
fs_age<-rbind(subset, fars_switrs, fill=TRUE)
##Descriptive Tables
fars_CAfatal <- fars_person_all%>%
filter(STATE==6 & INJ_SEV==4)
FARS<-table(fars_CAfatal$YEAR)
unclean_switr <- switrs_all%>%
filter(VICTIM_DEGREE_OF_INJURY_Vic==1)
SWITRS<-table(unclean_switr$COLLISION_YEAR_1)
unclean_years<-cbind(FARS, SWITRS)
knitr::kable(unclean_years,
col.names = c('FARS', 'SWITRS'),
align = 'lll', caption = 'Fatal collisions by year')
Fatal collisions by year
| 2018 |
3798 |
3804 |
| 2019 |
3719 |
3737 |
| 2020 |
3980 |
3982 |
| 2021 |
4513 |
4477 |
| 2022 |
4539 |
4386 |
FARS<-table(p_clean_cat$YEAR)
SWITRS<-table(switrs_clean$COLLISION_YEAR_1)
clean_years<-cbind(FARS, SWITRS)
knitr::kable(clean_years,
col.names = c('FARS', 'SWITRS'),
align = 'll', caption = 'Cleaned fatal collisions')
Cleaned fatal collisions
| 2018 |
3071 |
3055 |
| 2019 |
2980 |
3040 |
| 2020 |
3269 |
3308 |
| 2021 |
3711 |
3734 |
| 2022 |
3702 |
3673 |
switrs_clean<-mutate(p_clean_cat,
victim_type = case_when(VICTIM_ROLE=='1' ~ 'Driver',
VICTIM_ROLE=='3' ~ 'Pedestrian',
VICTIM_ROLE=='4' ~ 'Cyclist',
VICTIM_ROLE=='5' ~ 'Other',
))
switrs_clean<-mutate(switrs_clean,
victim_type = case_when(VICTIM_ROLE=='1' ~ 'Driver',
VICTIM_ROLE=='3' ~ 'Pedestrian',
VICTIM_ROLE=='4' ~ 'Cyclist',
VICTIM_ROLE=='5' ~ 'Other',
))
fvrole<-table(p_clean_cat$VICTIM_ROLE)
svrole<-table(switrs_clean$VICTIM_ROLE)
vrole<-cbind(fvrole, svrole)
knitr::kable(vrole,
col.names = c('Victim Role', 'Frequency'),
align = 'll', caption = 'Victim Role Frequencies')
Victim Role Frequencies
| 1 |
10627 |
10627 |
| 3 |
5176 |
5176 |
| 4 |
727 |
727 |
| 5 |
203 |
203 |
FARS<-table(p_clean_cat$race_name)
SWITRS<-table(switrs_clean$PARTY_RACE)
clean_race<-cbind(FARS, SWITRS)
knitr::kable(clean_race,
col.names = c('FARS', 'SWITRS'),
align = 'll', caption = 'Pre-merge cleaned race frequencies')
Pre-merge cleaned race frequencies
| A |
1054 |
| B |
1838 |
| H |
6819 |
| O |
307 |
| W |
6715 |
SWITRS<-table(fs_age$PARTY_RACE)
FARS<-table(fs_age$race_name)
mergerace<-cbind(FARS, SWITRS)
knitr::kable(mergerace,
col.names = c('FARS', 'SWITRS'),
align = 'll', caption = 'Merged race frequencies')
Merged race frequencies
| A |
718 |
475 |
| B |
1334 |
1342 |
| H |
5059 |
4930 |
| O |
219 |
569 |
| W |
4971 |
4985 |
Data Analysis
data<-agediff
# TRUE POSITIVES when SWITRS and FARS both have the category
#Asian
data<-mutate(data,
tp_asian = case_when(swit_race==1 & race2 ==1 ~ 1,
TRUE ~ 0))
#Black
data<-mutate(data,
tp_black = case_when(swit_race==2 & race2 ==2 ~ 1,
TRUE ~ 0))
#hisp
data<-mutate(data,
tp_hisp = case_when(swit_race==3 & race2 ==3 ~ 1,
TRUE ~ 0))
#white
data<-mutate(data,
tp_white = case_when(swit_race==4 & race2 ==4 ~ 1,
TRUE ~ 0))
#other
data<-mutate(data,
tp_other = case_when(swit_race==5 & race2 ==5 ~ 1,
TRUE ~ 0))
# FALSE POSITIVES when SWITRS has the category, but FARS does not
#Asian
data<-mutate(data,
fp_asian= case_when(swit_race==1 & race2 != 1 ~ 1,
TRUE ~ 0))
#Black
data<-mutate(data,
fp_black= case_when(swit_race==2 & race2 != 2 ~ 1,
TRUE ~ 0))
#hisp
data<-mutate(data,
fp_hisp= case_when(swit_race==3 & race2 != 3 ~ 1,
TRUE ~ 0))
#white
data<-mutate(data,
fp_white= case_when(swit_race==4 & race2 != 4 ~ 1,
TRUE ~ 0))
#other
data<-mutate(data,
fp_other= case_when(swit_race==5 & race2 != 5 ~ 1,
TRUE ~ 0))
# FALSE NEGATIVES when SWITRS does not have the category, but FARS does
#Asian
data<-mutate(data,
fn_asian = case_when(swit_race != 1 & race2 ==1 ~ 1,
TRUE ~ 0))
#Black
data<-mutate(data,
fn_black = case_when(swit_race != 2 & race2 ==2 ~ 1,
TRUE ~ 0))
#Hispanic
data<-mutate(data,
fn_hisp = case_when(swit_race != 3 & race2 ==3 ~ 1,
TRUE ~ 0))
#white
data<-mutate(data,
fn_white = case_when(swit_race != 4 & race2 ==4 ~ 1,
TRUE ~ 0))
#other
data<-mutate(data,
fn_other = case_when(swit_race != 5 & race2 ==5 ~ 1,
TRUE ~ 0))
# TRUE NEGATIVES neither SWITRS nor FARS have the category
#Asian
data<-mutate(data,
tn_asian = case_when(swit_race != 1 & race2 !=1 ~ 1,
TRUE ~ 0))
#Black
data<-mutate(data,
tn_black = case_when(swit_race != 2 & race2 !=2 ~ 1,
TRUE ~ 0))
#Hispanic
data<-mutate(data,
tn_hisp = case_when(swit_race != 3 & race2 !=3 ~ 1,
TRUE ~ 0))
#white
data<-mutate(data,
tn_white = case_when(swit_race != 4 & race2 !=4 ~ 1,
TRUE ~ 0))
#other
data<-mutate(data,
tn_other = case_when(swit_race != 5 & race2 !=5 ~ 1,
TRUE ~ 0))
#START MAKING THE MEASURES. Sensitivity TP/(TP+FN)
#asian
asian_tp_sum<-sum(data$tp_asian)
asian_fn_sum<-sum(data$fn_asian)
asian_sensitivity<-asian_tp_sum/(asian_tp_sum+asian_fn_sum)
#black
black_tp_sum<-sum(data$tp_black)
black_fn_sum<-sum(data$fn_black)
black_sensitivity<-black_tp_sum/(black_tp_sum+black_fn_sum)
#hisp
hisp_tp_sum<-sum(data$tp_hisp)
hisp_fn_sum<-sum(data$fn_hisp)
hisp_sensitivity<-hisp_tp_sum/(hisp_tp_sum+hisp_fn_sum)
#White
white_tp_sum<-sum(data$tp_white)
white_fn_sum<-sum(data$fn_white)
white_sensitivity<-white_tp_sum/(white_tp_sum+white_fn_sum)
#other
other_tp_sum<-sum(data$tp_other)
other_fn_sum<-sum(data$fn_other)
other_sensitivity<-other_tp_sum/(other_tp_sum+other_fn_sum)
# SPECIFICITY TN/(TN+FP)
#asian
asian_tn_sum<-sum(data$tn_asian)
asian_fp_sum<-sum(data$fp_asian)
asian_specificity<-asian_tn_sum/(asian_tn_sum+asian_fp_sum)
#black
black_tn_sum<-sum(data$tn_black)
black_fp_sum<-sum(data$fp_black)
black_specificity<-black_tn_sum/(black_tn_sum+black_fp_sum)
#hisp
hisp_tn_sum<-sum(data$tn_hisp)
hisp_fp_sum<-sum(data$fp_hisp)
hisp_specificity<-hisp_tn_sum/(hisp_tn_sum+hisp_fp_sum)
#white
white_tn_sum<-sum(data$tn_white)
white_fp_sum<-sum(data$fp_white)
white_specificity<-white_tn_sum/(white_tn_sum+white_fp_sum)
#other
other_tn_sum<-sum(data$tn_other)
other_fp_sum<-sum(data$fp_other)
other_specificity<-other_tn_sum/(other_tn_sum+other_fp_sum)
# check reliability
ratings<-data.frame(race2=data$race2, swit_race=data$swit_race)
kappa_result <- kappa2(ratings, weight = "unweighted")
print(kappa_result)
## Cohen's Kappa for 2 Raters (Weights: unweighted)
##
## Subjects = 12301
## Raters = 2
## Kappa = 0.798
##
## z = 133
## p-value = 0
# put all of the different stats into a data frame for tabling
sesp_table<-data.frame(Race=c('Asian', 'Black', 'Hispanic', 'White', 'Other'),
Sensitivity=percent(c(asian_sensitivity, black_sensitivity, hisp_sensitivity,white_sensitivity, other_sensitivity), digits=2),
Specificity=percent(c(asian_specificity, black_specificity, hisp_specificity,white_specificity, other_specificity), digits=2))
knitr::kable(sesp_table,
col.names = c('Race', 'Sensitivity', 'Specificity'),
align = 'lrr', caption = 'Sensitivity & specificity by race')
Sensitivity & specificity by race
| Asian |
61.00% |
99.68% |
| Black |
92.88% |
99.06% |
| Hispanic |
88.99% |
94.09% |
| White |
89.04% |
92.37% |
| Other |
30.59% |
95.85% |
# Investigating Mismatch
# filter to only cases that don't match between FARS & SWITRS
nonmatch<- data %>%
filter(race_name!=PARTY_RACE)
# keep a match subset just in case
match<- data %>%
filter(race_name==PARTY_RACE)
# Asian mismatch
nmatch_a<-nonmatch%>%
filter(race_name=='A')
nm_a<-table(nmatch_a$PARTY_RACE)
knitr::kable(nm_a,
col.names = c('Race', 'Frequency'),
align = 'll', caption = 'Misclassified FARS Asian cases')
Misclassified FARS Asian cases
| B |
6 |
| H |
73 |
| O |
169 |
| W |
32 |
# Black mismatch
nmatch_b<-nonmatch%>%
filter(race_name=='B')
nm_b<-table(nmatch_b$PARTY_RACE)
knitr::kable(nm_b,
col.names = c('Race', 'Frequency'),
align = 'll', caption = 'Misclassified FARS Black cases')
Misclassified FARS Black cases
| A |
3 |
| H |
34 |
| O |
29 |
| W |
29 |
# Hispanic mismatch
nmatch_h<-nonmatch%>%
filter(race_name=='H')
nm_h<-table(nmatch_h$PARTY_RACE)
knitr::kable(nm_h,
col.names = c('Race', 'Frequency'),
align = 'll', caption = 'Misclassified FARS Hispanic cases')
Misclassified FARS Hispanic cases
| A |
12 |
| B |
50 |
| O |
88 |
| W |
407 |
# White mismatch
nmatch_w<-nonmatch%>%
filter(race_name=='W')
nm_w<-table(nmatch_w$PARTY_RACE)
knitr::kable(nm_w,
col.names = c('Race', 'Frequency'),
align = 'll', caption = 'Misclassified FARS White cases')
Misclassified FARS White cases
| A |
17 |
| B |
29 |
| H |
283 |
| O |
216 |
# Other mismatch
nmatch_oS<-nonmatch%>%
filter(race_name=='O')
nm_o<-table(nmatch_oS$PARTY_RACE)
knitr::kable(nm_o,
col.names = c('Race', 'Frequency'),
align = 'll', caption = 'Misclassified FARS Other cases')
Misclassified FARS Other cases
| A |
5 |
| B |
18 |
| H |
38 |
| W |
91 |
# Fine grained mismatch investigation
# the Asian sensitivity was pretty low, so I'm assessing how the FARS category seemed to fit SWITRS
# I selected the cases that had higher numbers in the mismatch
# FARS coding: Filipino (7), Asian Indian (18), Other Asian (68), North American Indian (3)
# Filipino
nmatch_fil<-nonmatch%>%
filter(RACE==7)
fil<-table(nmatch_fil$PARTY_RACE)
knitr::kable(fil,
col.names = c('Race', 'Frequency'),
align = 'll', caption = 'Misclassified FARS Filipino cases')
Misclassified FARS Filipino cases
| A |
1 |
| B |
2 |
| H |
43 |
| O |
31 |
| W |
14 |
# Asian Indian
nmatch_aind<-nonmatch%>%
filter(RACE==18)
aind<-table(nmatch_aind$PARTY_RACE)
knitr::kable(aind,
col.names = c('Race', 'Frequency'),
align = 'll', caption = 'Misclassified FARS Asian Indian cases')
Misclassified FARS Asian Indian cases
| H |
1 |
| O |
81 |
| W |
4 |
# Other Asian
nmatch_othera<-nonmatch%>%
filter(RACE==68)
othera<-table(nmatch_othera$PARTY_RACE)
knitr::kable(othera,
col.names = c('Race', 'Frequency'),
align = 'll', caption = 'Misclassified FARS Other Asian cases')
Misclassified FARS Other Asian cases
| A |
2 |
| B |
3 |
| H |
14 |
| O |
36 |
| W |
7 |
nmatch_na<-nonmatch%>%
filter(RACE==3)
na<-table(nmatch_na$PARTY_RACE)
knitr::kable(na,
col.names = c('Race', 'Frequency'),
align = 'll', caption = 'Misclassified FARS NA Indian cases')
Misclassified FARS NA Indian cases
| B |
9 |
| H |
23 |
| W |
74 |