Predicting Ethnicity from Names in R
rethnicity::predict_ethnicity()
firstname lastname prob_asian prob_black prob_hispanic prob_white race
1 Martin Tran 0.9897194 0.0007328312 0.004585689 0.004962085 asian
playerID nameFirst nameLast
1 aardsda01 David Aardsma
2 abadan01 Andy Abad
3 abadfe01 Fernando Abad
4 abbotan01 Andrew Abbott
5 abbotco01 Cory Abbott
6 abelmi01 Mick Abel
playerID firstname lastname prob_asian prob_black prob_hispanic prob_white
1 aardsda01 David Aardsma 0.07009414 0.347950069 0.08085186 0.501103934
2 abadan01 Andy Abad 0.68531211 0.007956145 0.25888873 0.047843014
3 abadfe01 Fernando Abad 0.08253610 0.006101720 0.90657341 0.004788768
4 abbotan01 Andrew Abbott 0.10490424 0.274247768 0.04656111 0.574286882
5 abbotco01 Cory Abbott 0.08479635 0.298825897 0.02083483 0.595542921
6 abelmi01 Mick Abel 0.22109019 0.104949849 0.04501239 0.628947570
race
1 white
2 asian
3 hispanic
4 white
5 white
6 white
# Bar chart of the predicted ethnicity shares: one bar per predicted_race,
# with bars ordered from the largest percentage to the smallest.
ggplot(
  aggregated_data,
  aes(
    x = reorder(predicted_race, -percentage),
    y = percentage,
    fill = predicted_race
  )
) +
  # The fill colour repeats the x-axis category, so the legend is suppressed.
  geom_col(show.legend = FALSE, color = "black", width = 0.6) +
  # Label each bar with its value rounded to one decimal place; the negative
  # vjust lifts the text above the top of the bar.
  geom_text(
    aes(label = paste0(round(percentage, 1), "%")),
    vjust = -0.5,
    fontface = "bold",
    size = 5
  ) +
  labs(
    title = "Predicted Ethnic Distribution of MLB Players After 2000",
    x = "Predicted Ethnicity",
    y = "Percentage of Roster"
  ) +
  theme(
    plot.title = element_text(face = "bold", size = 18)
  )