We exported names_all.csv from the NLTK library. It is the two-column file containing the name and gender of every sample.
We then added the first letter of each name as a new column, first_let.
Proportion of Female and Male Names
##
## female male
## 0.6295317 0.3704683
Name Frequency (%) by First Letter in Alphabetical Order
| letter | a | b | c | d | e | f | g | h | i | j | k | l | m | n | o | p | q | r | s | t | u | v | w | x | y | z |
| proportion | 8.26 | 5.27 | 7.99 | 5.72 | 4.66 | 2.91 | 4.65 | 3.61 | 1.61 | 5.50 | 4.36 | 5.60 | 8.61 | 2.96 | 1.49 | 2.79 | 0.30 | 5.63 | 6.89 | 4.86 | 0.45 | 1.95 | 2.58 | 0.15 | 0.43 | 0.78 |
Sorted by First Letter Frequency
| letter | m | a | c | s | d | r | l | j | b | t | e | g | k | h | n | f | p | w | v | i | o | z | u | y | q | x |
| proportion | 8.61 | 8.26 | 7.99 | 6.89 | 5.72 | 5.63 | 5.60 | 5.50 | 5.27 | 4.86 | 4.66 | 4.65 | 4.36 | 3.61 | 2.96 | 2.91 | 2.79 | 2.58 | 1.95 | 1.61 | 1.49 | 0.78 | 0.45 | 0.43 | 0.30 | 0.15 |
| Name | names_all |
| Number of rows | 7944 |
| Number of columns | 29 |
| _______________________ | |
| Column type frequency: | |
| character | 19 |
| logical | 3 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| B_A_name | 0 | 1.00 | 2 | 15 | 0 | 7578 | 0 |
| B_A_firstletter | 0 | 1.00 | 1 | 1 | 0 | 26 | 0 |
| B_A_secondletter | 0 | 1.00 | 1 | 1 | 0 | 26 | 0 |
| B_A_lastletter | 1 | 1.00 | 1 | 1 | 0 | 25 | 0 |
| P_A_dmetacode | 0 | 1.00 | 1 | 8 | 0 | 1756 | 0 |
| P_A_phx_00 | 2693 | 0.66 | 1 | 2 | 0 | 35 | 0 |
| P_A_phx_01 | 2694 | 0.66 | 1 | 2 | 0 | 38 | 0 |
| P_A_phx_02 | 2804 | 0.65 | 1 | 2 | 0 | 39 | 0 |
| P_A_phx_03 | 3475 | 0.56 | 1 | 2 | 0 | 39 | 0 |
| P_A_phx_04 | 4919 | 0.38 | 1 | 2 | 0 | 37 | 0 |
| P_A_phx_05 | 6195 | 0.22 | 1 | 2 | 0 | 35 | 0 |
| P_A_phx_06 | 7179 | 0.10 | 1 | 2 | 0 | 31 | 0 |
| P_A_phx_07 | 7693 | 0.03 | 1 | 2 | 0 | 23 | 0 |
| P_A_phx_08 | 7873 | 0.01 | 1 | 2 | 0 | 14 | 0 |
| P_A_phx_09 | 7930 | 0.00 | 1 | 2 | 0 | 8 | 0 |
| P_A_phcode | 2693 | 0.66 | 3 | 31 | 0 | 3959 | 0 |
| P_A_phfirst | 2693 | 0.66 | 1 | 3 | 0 | 54 | 0 |
| P_A_phx_last | 2693 | 0.66 | 1 | 2 | 0 | 32 | 0 |
| R_A_gender | 0 | 1.00 | 4 | 6 | 0 | 2 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| P_A_phx_10 | 7944 | 0 | NaN | : |
| P_A_phx_11 | 7944 | 0 | NaN | : |
| P_B_phfound | 0 | 1 | 0.66 | TRU: 5251, FAL: 2693 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| B_N_length | 0 | 1 | 6.03 | 1.57 | 2 | 5 | 6 | 7 | 15 | ▂▇▂▁▁ |
| B_N_numconsonants | 0 | 1 | 3.55 | 1.16 | 1 | 3 | 3 | 4 | 10 | ▂▇▂▁▁ |
| B_N_numvowels | 0 | 1 | 2.48 | 0.95 | 0 | 2 | 2 | 3 | 6 | ▃▇▇▂▁ |
| P_N_dmetalen | 0 | 1 | 3.12 | 1.03 | 1 | 2 | 3 | 4 | 8 | ▆▇▆▁▁ |
| P_N_phlen | 0 | 1 | 3.27 | 2.62 | 0 | 0 | 4 | 5 | 12 | ▇▆▇▁▁ |
| P_N_phsyllables | 0 | 1 | 1.40 | 1.17 | 0 | 0 | 2 | 2 | 5 | ▇▇▂▁▁ |
| P_N_phx_stress | 0 | 1 | 0.83 | 0.72 | 0 | 0 | 1 | 1 | 4 | ▅▇▂▁▁ |
For ease of reading, this appendix collects all the R code used in this project.
# Global chunk options for the knitted report: do not echo code, and suppress
# messages and warnings in the output.
knitr::opts_chunk$set(echo = FALSE, message =FALSE, warning=FALSE)
# Packages used below:
#   tidyverse  - read_csv(), the %>% pipe and the dplyr verbs
#   skimr      - skim() summary of the data set
#   ggplot2    - bar charts (also attached by tidyverse)
#   knitr      - kable() tables
#   kableExtra - kable_styling()
#   cowplot    - ggdraw(), draw_label() and plot_grid() for the combined figure
library(tidyverse)
library(skimr)
library(ggplot2)
library(knitr)
library(kableExtra)
library(cowplot)
# Load the full name data set. Four columns are parsed explicitly as integers;
# the type of every other column is left to readr's guessing.
names_all <- read_csv(
  "names_all.csv",
  col_types = cols(
    B_N_length = col_integer(),
    P_N_dmetalen = col_integer(),
    P_N_phlen = col_integer(),
    P_N_phsyllables = col_integer()
  )
)

# Load names_test.csv and names_train.csv (all column types guessed by readr).
names_test <- read_csv("names_test.csv")
names_train <- read_csv("names_train.csv")

# Number of names for every (first letter, gender) pair in each data set.
# count() is used instead of group_by() + summarize(n()) because it returns an
# ungrouped tibble: the result carries no leftover grouping on
# B_A_firstletter and emits no "regrouping" message. Rows and columns are
# otherwise the same (B_A_firstletter, R_A_gender, Count).
first_letter_all <- names_all %>%
  count(B_A_firstletter, R_A_gender, name = "Count")
first_letter_test <- names_test %>%
  count(B_A_firstletter, R_A_gender, name = "Count")
first_letter_train <- names_train %>%
  count(B_A_firstletter, R_A_gender, name = "Count")
# Stacked bar chart of name counts per first letter, filled by gender.
# `letter_counts` must provide the columns B_A_firstletter, R_A_gender and
# Count; `panel_title` is shown above the panel.
first_letter_bars <- function(letter_counts, panel_title) {
  ggplot(
    letter_counts,
    aes(x = B_A_firstletter, y = Count, fill = R_A_gender)
  ) +
    # geom_col() draws bars from the y values as given, i.e. the same as
    # geom_bar(stat = "identity").
    geom_col(position = "stack") +
    ggtitle(panel_title)
}

# One panel per data set.
p1 <- first_letter_bars(first_letter_all, "All")
p2 <- first_letter_bars(first_letter_train, "Train")
p3 <- first_letter_bars(first_letter_test, "Test")

# Left-aligned overall title, drawn as its own plot so it can be stacked on
# top of the panel grid.
title <- ggdraw() +
  draw_label(
    "First Letter Frequency by Gender per Dataset",
    x = 0,
    hjust = 0
  ) +
  theme(plot.margin = margin(0, 0, 0, 7))

# Three panels laid out in two columns, with the title in a short row above.
pg <- plot_grid(p1, p2, p3, ncol = 2)
plot_grid(title, pg, ncol = 1, rel_heights = c(0.1, 1))
# Share of each gender among all names.
prop.table(table(names_all$R_A_gender))

# Share of names starting with each letter. Note: the `proportion` column
# holds percentages (share * 100, rounded to two decimals), not fractions.
by_first_let <- prop.table(table(names_all$B_A_firstletter))
df <- data.frame(
  letter = names(by_first_let),
  proportion = round(as.numeric(by_first_let) * 100, 2)
)

# Wide table with one column per letter. t() turns the data frame into a
# character matrix, so kable()'s `digits` argument would have no effect and is
# not passed; the values keep the two decimals set by round() above.
t_df <- t(df)
t_df %>%
  kable() %>%
  kable_styling(bootstrap_options = c("hover", "striped"), full_width = FALSE)

# The same table with the most frequent first letter first.
df_sorted <- df %>%
  arrange(desc(proportion))
t(df_sorted) %>%
  kable(caption = "Sorted by First Letter Frequency") %>%
  kable_styling(bootstrap_options = c("hover", "striped"), full_width = FALSE)

# Per-column summary of the full data set.
skim(names_all)