OTS grant FARS-SWITRS race validity analysis

#read in data
fars_accid_all <- readRDS("fars_accid_all.RDS")
fars_person_all <- readRDS("fars_person_all.RDS")
fars_race_all <- readRDS("fars_race_all.RDS")
switrs_all <- read_excel("switrs_all_fatal.xlsx")
agediff <- readRDS("fs_age.RDS")

# FARS cleaning & merging step

#filter to keep: CA, fatal, non-passengers
personCA <- fars_person_all%>%
  filter(STATE==6 & INJ_SEV==4 & (PER_TYP!=2 & PER_TYP!=3 & PER_TYP!=9))
raceCA <- fars_race_all%>%
  filter(STATE==6 & RACE!=0)
accidCA <- fars_accid_all%>%
  filter(STATE==6)

#make a KEY variable for meging datasets 
raceCA<-raceCA %>% 
  mutate(KEY=paste0(YEAR, ST_CASE,PER_NO,VEH_NO))
personCA<-personCA %>% 
  mutate(KEY=paste0(YEAR, ST_CASE,PER_NO,VEH_NO))
accidCA<-accidCA %>% 
  mutate(KEY2=paste0(YEAR, ST_CASE))

#remove duplicates
n_occur <- data.frame(table(raceCA$KEY))
dupes<-raceCA %>%
  filter(KEY %in% unique(.[["KEY"]][duplicated(.[["KEY"]])]))
dupes<-dupes$KEY
raceKEY<-raceCA$KEY
raceCA_ndkey<-setdiff(raceKEY,dupes)
raceCA_nd<-raceCA[which(raceCA$KEY %in% raceCA_ndkey),]

#merge datasets
merge1<-left_join(personCA, raceCA_nd, by = "KEY")
merge1<-merge1 %>% 
  mutate(KEY2=paste0(YEAR.x, ST_CASE.x))
person_clean<-left_join(merge1, accidCA, by = "KEY2")

#create a date variable that matches SWITRS
person_clean$MONTH_F<-str_pad(person_clean$MONTH.x, 2, pad ="0")
person_clean$DAY_F<-str_pad(person_clean$DAY.x, 2, pad ="0")
person_clean<-person_clean%>%
  mutate(COLLISION_TIME=paste0(HOUR.x,MINUTE.x))
person_clean<-person_clean%>%
  mutate(COLLISION_DATE=paste0(YEAR.x,MONTH_F,DAY_F))

#make 1 race variable that captures all of the FARS years (there are slight differences in format over time)
person_clean <- person_clean %>%
  mutate(across(RACE.y, coalesce, RACE.x))
person_clean<-person_clean %>%
  mutate(RACE = RACE.y,
         RACENAME = RACENAME.y)

#recode race data to line up with SWITRS 
p_clean_cat<-mutate(person_clean,
                    race2 = case_when(
                      RACE == 1 & (HISPANIC ==7 | HISPANIC == 99)~4,
                      RACE == 2 & (HISPANIC ==7 | HISPANIC == 99)~2,
                      RACE == 3 & (HISPANIC ==7 | HISPANIC == 99)~5,
                      RACE == 4 & (HISPANIC ==7 | HISPANIC == 99)~1,
                      RACE == 5 & (HISPANIC ==7 | HISPANIC == 99)~1,
                      RACE == 6 & (HISPANIC ==7 | HISPANIC == 99)~5,
                      RACE == 7 & (HISPANIC ==7 | HISPANIC == 99)~1,
                      RACE == 18 & (HISPANIC ==7 | HISPANIC == 99)~1,
                      RACE == 19 & (HISPANIC ==7 | HISPANIC == 99)~5,
                      RACE == 28 & (HISPANIC ==7 | HISPANIC == 99)~1,
                      RACE == 38 & (HISPANIC ==7 | HISPANIC == 99)~5,
                      RACE == 48 & (HISPANIC ==7 | HISPANIC == 99)~1,
                      RACE == 58 & (HISPANIC ==7 | HISPANIC == 99)~1,
                      RACE == 68 & (HISPANIC ==7 | HISPANIC == 99)~1,
                      RACE == 78 & (HISPANIC ==7 | HISPANIC == 99)~1,
                      RACE == 97 & (HISPANIC ==7 | HISPANIC == 99)~5,
                      RACE == 98 & (HISPANIC ==7 | HISPANIC == 99)~5,
                      RACE == 99 & (HISPANIC ==7 )~5,
                      RACE == 99 &  HISPANIC == 99 ~6,
                      RACE == 99 &  HISPANIC == 1 ~3,
                      RACE ==99 ~ 3,
                      HISPANIC == 99 ~ 8,
                      HISPANIC == 1  ~ 3,
                      HISPANIC == 2  ~ 3,
                      HISPANIC == 3  ~ 3,
                      HISPANIC == 4  ~ 3,
                      HISPANIC == 6 ~ 3,
                      HISPANIC == 5 ~ 3
                    ))
#remove missing race cases
p_clean_cat <- p_clean_cat%>%
  filter(race2!=6 & race2!=8)

#match SWITRS format for race coding
p_clean_cat<-mutate(p_clean_cat,
                    race_name = case_when(race2==1 ~ 'A',
                                          race2==2 ~ 'B',
                                          race2==3 ~ 'H',
                                          race2==4 ~ 'W',
                                          race2==5 ~ 'O'))

#recode sex to match SWITRS 
p_clean_cat<-mutate(p_clean_cat,
                    SEX2 = case_when(SEX==1 ~'M',
                                     SEX==2 ~'F',
                                     SEX==3 ~ 'X',
                                     SEX==8 ~'-',
                                     SEX==9 ~'-'))
#recode victim role to match SWITRS 
p_clean_cat<-mutate(p_clean_cat,
                    VICTIM_ROLE = case_when(PER_TYP==1 ~ 1,
                                            PER_TYP==2 ~ 2,
                                            PER_TYP==4 ~ 5,
                                            PER_TYP==5 ~ 3,
                                            PER_TYP==6 ~ 4,
                                            PER_TYP==7 ~ 4,
                                            PER_TYP==8 ~ 5,
                                            PER_TYP==10 ~ 5,
                                            PER_TYP==11 ~ 5,
                                            PER_TYP==12 ~ 5,
                                            PER_TYP==13 ~ 5))

#fix up the string variables for the merge with SWITRS 
p_clean_cat$COLLISION_DATE<-as.double(p_clean_cat$COLLISION_DATE)
p_clean_cat$COLLISION_TIME<-as.double(p_clean_cat$COLLISION_TIME)
p_clean_cat$COUNTYNAME<-gsub("\\s*\\([^\\)]+\\)", "", p_clean_cat$COUNTYNAME)

# READ IN SWITRS

# filter to 2018-2022 only
switrs_all<-switrs_all %>%
  filter(COLLISION_YEAR_1 >=2018 & COLLISION_YEAR_1 <= 2022)
# only keep fatals, remove passengers
switrs_clean <- switrs_all%>%
  filter(PARTY_RACE!='NA'& VICTIM_DEGREE_OF_INJURY_Vic==1 & VICTIM_ROLE!=2)

#recode SWITRS to make merge easier 
switrs_clean<-mutate(switrs_clean, COUNTYNAME=CountyName)
switrs_clean$COUNTYNAME<-str_to_upper(switrs_clean$COUNTYNAME)
switrs_clean<-mutate(switrs_clean,
               swit_race = case_when(PARTY_RACE=='A' ~ 1,
                                     PARTY_RACE=='B' ~ 2,
                                     PARTY_RACE=='H' ~ 3,
                                     PARTY_RACE=='W' ~ 4,
                                     PARTY_RACE=='O' ~ 5))

#MERGE STEP

# do initial strict merge on all variables
switrs_clean<-mutate(switrs_clean, FLAG=paste0(COUNTYNAME, COLLISION_DATE,  COLLISION_TIME, VICTIM_ROLE, VICTIM_SEX, VICTIM_AGE))
p_clean_cat<-mutate(p_clean_cat, FLAG=paste0(COUNTYNAME, COLLISION_DATE, COLLISION_TIME, VICTIM_ROLE, SEX2, AGE))
fars_switrs<-merge(p_clean_cat, switrs_clean, by = "FLAG")

# do a merge without the age variable with the addition of the count of fatals (age has been found to create the most dropoff of cases)
cswitrs_clean<-mutate(switrs_clean, FLAG=paste0(COUNTYNAME, COLLISION_DATE, COLLISION_TIME, VICTIM_ROLE, VICTIM_SEX, KILLED_VICTIMS))
cp_clean_cat<-mutate(p_clean_cat, FLAG=paste0(COUNTYNAME, COLLISION_DATE, COLLISION_TIME, VICTIM_ROLE, SEX2, FATALS))
count_merge<-merge(cp_clean_cat, cswitrs_clean, by = "FLAG")

# get a difference score between FARS and SWITRS age variable
agecalc<-count_merge%>%
  mutate(agediff = AGE - VICTIM_AGE)
# retain cases that are only 1 year apart 
agekeep<-agecalc %>%
  filter(agediff==1 | agediff==-1)
# also make sure its collisions with only 1 person 
subset<-agekeep %>%
  filter(FATALS==1)
#stack the new cases into the strict merge file
fs_age<-rbind(subset, fars_switrs, fill=TRUE)

##Descriptive Tables

fars_CAfatal <- fars_person_all%>%
filter(STATE==6 & INJ_SEV==4)
FARS<-table(fars_CAfatal$YEAR)
unclean_switr <- switrs_all%>%
  filter(VICTIM_DEGREE_OF_INJURY_Vic==1)
SWITRS<-table(unclean_switr$COLLISION_YEAR_1)

unclean_years<-cbind(FARS, SWITRS)

  knitr::kable(unclean_years,
             col.names = c('FARS', 'SWITRS'),
             align = 'lll', caption = 'Fatal collisions by year')

Fatal collisions by year
	FARS	SWITRS
2018	3798	3804
2019	3719	3737
2020	3980	3982
2021	4513	4477
2022	4539	4386

FARS<-table(p_clean_cat$YEAR)
SWITRS<-table(switrs_clean$COLLISION_YEAR_1)
clean_years<-cbind(FARS, SWITRS)

knitr::kable(clean_years,
             col.names = c('FARS', 'SWITRS'),
             align = 'll', caption = 'Cleaned fatal collisions')

Cleaned fatal collisions
	FARS	SWITRS
2018	3071	3055
2019	2980	3040
2020	3269	3308
2021	3711	3734
2022	3702	3673

switrs_clean<-mutate(p_clean_cat,
               victim_type = case_when(VICTIM_ROLE=='1' ~ 'Driver',
                                     VICTIM_ROLE=='3' ~ 'Pedestrian',
                                     VICTIM_ROLE=='4' ~ 'Cyclist',
                                     VICTIM_ROLE=='5' ~ 'Other',
                                     ))

switrs_clean<-mutate(switrs_clean,
               victim_type = case_when(VICTIM_ROLE=='1' ~ 'Driver',
                                     VICTIM_ROLE=='3' ~ 'Pedestrian',
                                     VICTIM_ROLE=='4' ~ 'Cyclist',
                                     VICTIM_ROLE=='5' ~ 'Other',
                                     ))

fvrole<-table(p_clean_cat$VICTIM_ROLE)
svrole<-table(switrs_clean$VICTIM_ROLE)
vrole<-cbind(fvrole, svrole)


knitr::kable(vrole,
             col.names = c('Victim Role', 'Frequency'),
             align = 'll', caption = 'Victim Role Frequencies')

Victim Role Frequencies
	Victim Role	Frequency
1	10627	10627
3	5176	5176
4	727	727
5	203	203

FARS<-table(p_clean_cat$race_name)
SWITRS<-table(switrs_clean$PARTY_RACE)
clean_race<-cbind(FARS, SWITRS)

knitr::kable(clean_race,
             col.names = c('FARS', 'SWITRS'),
             align = 'll', caption = 'Pre-merge cleaned race frequencies')

Pre-merge cleaned race frequencies
FARS	SWITRS
A	1054
B	1838
H	6819
O	307
W	6715

SWITRS<-table(fs_age$PARTY_RACE)
FARS<-table(fs_age$race_name)
mergerace<-cbind(FARS, SWITRS)

knitr::kable(mergerace,
             col.names = c('FARS', 'SWITRS'),
             align = 'll', caption = 'Merged race frequencies')

Merged race frequencies
	FARS	SWITRS
A	718	475
B	1334	1342
H	5059	4930
O	219	569
W	4971	4985

Data Analysis

data<-agediff

# TRUE POSITIVES when SWITRS and FARS both have the category
#Asian
data<-mutate(data, 
             tp_asian = case_when(swit_race==1 & race2 ==1 ~ 1,
                                  TRUE ~ 0))
#Black
data<-mutate(data, 
             tp_black = case_when(swit_race==2 & race2 ==2 ~ 1,
                                  TRUE ~ 0))
#hisp
data<-mutate(data, 
             tp_hisp = case_when(swit_race==3 & race2 ==3 ~ 1,
                                  TRUE ~ 0))
#white
data<-mutate(data, 
             tp_white = case_when(swit_race==4 & race2 ==4 ~ 1,
                                  TRUE ~ 0))

#other
data<-mutate(data, 
             tp_other = case_when(swit_race==5 & race2 ==5 ~ 1,
                                  TRUE ~ 0))

# FALSE POSITIVES   when SWITRS has the category, but FARS does not

#Asian
data<-mutate(data, 
             fp_asian= case_when(swit_race==1 & race2 != 1 ~ 1, 
                                 TRUE ~ 0))
#Black
data<-mutate(data, 
             fp_black= case_when(swit_race==2 & race2 != 2 ~ 1, 
                                 TRUE ~ 0))
#hisp
data<-mutate(data, 
             fp_hisp= case_when(swit_race==3 & race2 != 3 ~ 1, 
                                 TRUE ~ 0))
#white
data<-mutate(data, 
             fp_white= case_when(swit_race==4 & race2 != 4 ~ 1, 
                                 TRUE ~ 0))
#other
data<-mutate(data, 
             fp_other= case_when(swit_race==5 & race2 != 5 ~ 1, 
                                 TRUE ~ 0))

# FALSE NEGATIVES   when SWITRS does not have the category, but FARS does

#Asian
data<-mutate(data, 
             fn_asian = case_when(swit_race != 1 & race2 ==1 ~ 1,
                                  TRUE ~ 0))

#Black
data<-mutate(data, 
             fn_black = case_when(swit_race != 2 & race2 ==2 ~ 1,
                                  TRUE ~ 0))
#Hispanic
data<-mutate(data, 
             fn_hisp = case_when(swit_race != 3 & race2 ==3 ~ 1,
                                  TRUE ~ 0))

#white
data<-mutate(data, 
             fn_white = case_when(swit_race != 4 & race2 ==4 ~ 1,
                                  TRUE ~ 0))

#other
data<-mutate(data, 
             fn_other = case_when(swit_race != 5 & race2 ==5 ~ 1,
                                  TRUE ~ 0))

# TRUE NEGATIVES  neither SWITRS nor FARS have the category

#Asian
data<-mutate(data, 
             tn_asian = case_when(swit_race != 1 & race2 !=1 ~ 1,
                                  TRUE ~ 0))
#Black
data<-mutate(data, 
             tn_black = case_when(swit_race != 2 & race2 !=2 ~ 1,
                                  TRUE ~ 0))
#Hispanic
data<-mutate(data, 
             tn_hisp = case_when(swit_race != 3 & race2 !=3 ~ 1,
                                  TRUE ~ 0))
#white
data<-mutate(data, 
             tn_white = case_when(swit_race != 4 & race2 !=4 ~ 1,
                                  TRUE ~ 0))
#other
data<-mutate(data, 
             tn_other = case_when(swit_race != 5 & race2 !=5 ~ 1,
                                  TRUE ~ 0))

#START MAKING THE MEASURES.  Sensitivity TP/(TP+FN)

#asian
asian_tp_sum<-sum(data$tp_asian)
asian_fn_sum<-sum(data$fn_asian)
asian_sensitivity<-asian_tp_sum/(asian_tp_sum+asian_fn_sum)

#black
black_tp_sum<-sum(data$tp_black)
black_fn_sum<-sum(data$fn_black)
black_sensitivity<-black_tp_sum/(black_tp_sum+black_fn_sum)

#hisp
hisp_tp_sum<-sum(data$tp_hisp)
hisp_fn_sum<-sum(data$fn_hisp)
hisp_sensitivity<-hisp_tp_sum/(hisp_tp_sum+hisp_fn_sum)

#White
white_tp_sum<-sum(data$tp_white)
white_fn_sum<-sum(data$fn_white)
white_sensitivity<-white_tp_sum/(white_tp_sum+white_fn_sum)

#other
other_tp_sum<-sum(data$tp_other)
other_fn_sum<-sum(data$fn_other)
other_sensitivity<-other_tp_sum/(other_tp_sum+other_fn_sum)

# SPECIFICITY TN/(TN+FP)

#asian
asian_tn_sum<-sum(data$tn_asian)
asian_fp_sum<-sum(data$fp_asian)
asian_specificity<-asian_tn_sum/(asian_tn_sum+asian_fp_sum)

#black
black_tn_sum<-sum(data$tn_black)
black_fp_sum<-sum(data$fp_black)
black_specificity<-black_tn_sum/(black_tn_sum+black_fp_sum)

#hisp
hisp_tn_sum<-sum(data$tn_hisp)
hisp_fp_sum<-sum(data$fp_hisp)
hisp_specificity<-hisp_tn_sum/(hisp_tn_sum+hisp_fp_sum)

#white
white_tn_sum<-sum(data$tn_white)
white_fp_sum<-sum(data$fp_white)
white_specificity<-white_tn_sum/(white_tn_sum+white_fp_sum)

#other
other_tn_sum<-sum(data$tn_other)
other_fp_sum<-sum(data$fp_other)
other_specificity<-other_tn_sum/(other_tn_sum+other_fp_sum)

# check reliability
ratings<-data.frame(race2=data$race2, swit_race=data$swit_race)
kappa_result <- kappa2(ratings, weight = "unweighted")
print(kappa_result)

##  Cohen's Kappa for 2 Raters (Weights: unweighted)
## 
##  Subjects = 12301 
##    Raters = 2 
##     Kappa = 0.798 
## 
##         z = 133 
##   p-value = 0

# put all of the different stats into a data frame for tabling
sesp_table<-data.frame(Race=c('Asian', 'Black', 'Hispanic', 'White', 'Other'),
                              Sensitivity=percent(c(asian_sensitivity, black_sensitivity, hisp_sensitivity,white_sensitivity, other_sensitivity), digits=2),
                              Specificity=percent(c(asian_specificity, black_specificity, hisp_specificity,white_specificity, other_specificity), digits=2))

knitr::kable(sesp_table,
             col.names = c('Race', 'Sensitivity', 'Specificity'),
             align = 'lrr', caption = 'Sensitivity & specificity by race')

Sensitivity & specificity by race
Race	Sensitivity	Specificity
Asian	61.00%	99.68%
Black	92.88%	99.06%
Hispanic	88.99%	94.09%
White	89.04%	92.37%
Other	30.59%	95.85%

# Investigating Mismatch
# filter to only cases that don't match between FARS & SWITRS
nonmatch<- data %>%
  filter(race_name!=PARTY_RACE)
# keep a match subset just in case 
match<- data %>%
  filter(race_name==PARTY_RACE)

# Asian mismatch
nmatch_a<-nonmatch%>%
  filter(race_name=='A')
nm_a<-table(nmatch_a$PARTY_RACE)

knitr::kable(nm_a,
             col.names = c('Race', 'Frequency'),
             align = 'll', caption = 'Misclassified FARS Asian cases')

Misclassified FARS Asian cases
Race	Frequency
B	6
H	73
O	169
W	32

# Black mismatch
nmatch_b<-nonmatch%>%
  filter(race_name=='B')
nm_b<-table(nmatch_b$PARTY_RACE)

knitr::kable(nm_b,
             col.names = c('Race', 'Frequency'),
             align = 'll', caption = 'Misclassified FARS Black cases')

Misclassified FARS Black cases
Race	Frequency
A	3
H	34
O	29
W	29

# Hispanic mismatch
nmatch_h<-nonmatch%>%
  filter(race_name=='H')
nm_h<-table(nmatch_h$PARTY_RACE)

knitr::kable(nm_h,
             col.names = c('Race', 'Frequency'),
             align = 'll', caption = 'Misclassified FARS Hispanic cases')

Misclassified FARS Hispanic cases
Race	Frequency
A	12
B	50
O	88
W	407

# White mismatch
nmatch_w<-nonmatch%>%
  filter(race_name=='W')
nm_w<-table(nmatch_w$PARTY_RACE)

knitr::kable(nm_w,
             col.names = c('Race', 'Frequency'),
             align = 'll', caption = 'Misclassified FARS White cases')

Misclassified FARS White cases
Race	Frequency
A	17
B	29
H	283
O	216

# Other mismatch
nmatch_oS<-nonmatch%>%
  filter(race_name=='O')
nm_o<-table(nmatch_oS$PARTY_RACE)

knitr::kable(nm_o,
             col.names = c('Race', 'Frequency'),
             align = 'll', caption = 'Misclassified FARS Other cases')

Misclassified FARS Other cases
Race	Frequency
A	5
B	18
H	38
W	91

# Fine grained mismatch investigation
# the Asian sensitivity was pretty low, so I'm assessing how the FARS category seemed to fit SWITRS 
# I selected the cases that had higher numbers in the mismatch 
# FARS coding: Filipino (7), Asian Indian (18), Other Asian (68), North American Indian (3)
# Filipino 
nmatch_fil<-nonmatch%>%
  filter(RACE==7)
fil<-table(nmatch_fil$PARTY_RACE)
knitr::kable(fil,
             col.names = c('Race', 'Frequency'),
             align = 'll', caption = 'Misclassified FARS Filipino cases')

Misclassified FARS Filipino cases
Race	Frequency
A	1
B	2
H	43
O	31
W	14

# Asian Indian
nmatch_aind<-nonmatch%>%
  filter(RACE==18)
aind<-table(nmatch_aind$PARTY_RACE)
knitr::kable(aind,
             col.names = c('Race', 'Frequency'),
             align = 'll', caption = 'Misclassified FARS Asian Indian cases')

Misclassified FARS Asian Indian cases
Race	Frequency
H	1
O	81
W	4

# Other Asian
nmatch_othera<-nonmatch%>%
  filter(RACE==68)
othera<-table(nmatch_othera$PARTY_RACE)

knitr::kable(othera,
             col.names = c('Race', 'Frequency'),
             align = 'll', caption = 'Misclassified FARS Other Asian cases')

Misclassified FARS Other Asian cases
Race	Frequency
A	2
B	3
H	14
O	36
W	7

nmatch_na<-nonmatch%>%
  filter(RACE==3)
na<-table(nmatch_na$PARTY_RACE)
knitr::kable(na,
             col.names = c('Race', 'Frequency'),
             align = 'll', caption = 'Misclassified FARS NA Indian cases')

Misclassified FARS NA Indian cases
Race	Frequency
B	9
H	23
W	74

OTS grant FARS-SWITRS race validity analysis

2025-27-06

Data Analysis