Data 620 PROJECT 3 EDA

Alexander Ng

04/01/2022

EDA FOR PROJECT 3

We exported names_all.csv from the nltk library. This is the two-column file of names and genders for all samples.

We add the first letter of each name as a column first_let.

Proportion of Female and Male Names

## 
##    female      male 
## 0.6295317 0.3704683

Name Frequency by First Letter in Alphabetical Order

letter a b c d e f g h i j k l m n o p q r s t u v w x y z
proportion 8.26 5.27 7.99 5.72 4.66 2.91 4.65 3.61 1.61 5.50 4.36 5.60 8.61 2.96 1.49 2.79 0.30 5.63 6.89 4.86 0.45 1.95 2.58 0.15 0.43 0.78

Sorted by First Letter Frequency

letter m a c s d r l j b t e g k h n f p w v i o z u y q x
proportion 8.61 8.26 7.99 6.89 5.72 5.63 5.60 5.50 5.27 4.86 4.66 4.65 4.36 3.61 2.96 2.91 2.79 2.58 1.95 1.61 1.49 0.78 0.45 0.43 0.30 0.15
Data summary
Name names_all
Number of rows 7944
Number of columns 29
_______________________
Column type frequency:
character 19
logical 3
numeric 7
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
B_A_name 0 1.00 2 15 0 7578 0
B_A_firstletter 0 1.00 1 1 0 26 0
B_A_secondletter 0 1.00 1 1 0 26 0
B_A_lastletter 1 1.00 1 1 0 25 0
P_A_dmetacode 0 1.00 1 8 0 1756 0
P_A_phx_00 2693 0.66 1 2 0 35 0
P_A_phx_01 2694 0.66 1 2 0 38 0
P_A_phx_02 2804 0.65 1 2 0 39 0
P_A_phx_03 3475 0.56 1 2 0 39 0
P_A_phx_04 4919 0.38 1 2 0 37 0
P_A_phx_05 6195 0.22 1 2 0 35 0
P_A_phx_06 7179 0.10 1 2 0 31 0
P_A_phx_07 7693 0.03 1 2 0 23 0
P_A_phx_08 7873 0.01 1 2 0 14 0
P_A_phx_09 7930 0.00 1 2 0 8 0
P_A_phcode 2693 0.66 3 31 0 3959 0
P_A_phfirst 2693 0.66 1 3 0 54 0
P_A_phx_last 2693 0.66 1 2 0 32 0
R_A_gender 0 1.00 4 6 0 2 0

Variable type: logical

skim_variable n_missing complete_rate mean count
P_A_phx_10 7944 0 NaN :
P_A_phx_11 7944 0 NaN :
P_B_phfound 0 1 0.66 TRU: 5251, FAL: 2693

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
B_N_length 0 1 6.03 1.57 2 5 6 7 15 ▂▇▂▁▁
B_N_numconsonants 0 1 3.55 1.16 1 3 3 4 10 ▂▇▂▁▁
B_N_numvowels 0 1 2.48 0.95 0 2 2 3 6 ▃▇▇▂▁
P_N_dmetalen 0 1 3.12 1.03 1 2 3 4 8 ▆▇▆▁▁
P_N_phlen 0 1 3.27 2.62 0 0 4 5 12 ▇▆▇▁▁
P_N_phsyllables 0 1 1.40 1.17 0 0 2 2 5 ▇▇▂▁▁
P_N_phx_stress 0 1 0.83 0.72 0 0 1 1 4 ▅▇▂▁▁

Code

We summarize all the R code used in this project in this appendix for ease of reading.

# Global chunk defaults: turn off code echo, messages, and warnings.
knitr::opts_chunk$set(
  echo = FALSE,
  message = FALSE,
  warning = FALSE
)

library(tidyverse)
library(skimr)
library(ggplot2)
library(knitr)
library(kableExtra)
library(cowplot)

# Count the rows for each (first letter, gender) combination.
#
# names_df: a data frame with columns B_A_firstletter and R_A_gender.
# Returns an ungrouped tibble with columns B_A_firstletter, R_A_gender, Count.
count_by_letter_gender <- function(names_df) {
  names_df %>%
    group_by(B_A_firstletter, R_A_gender) %>%
    # Drop the grouping so downstream code does not inherit a hidden
    # group_by(B_A_firstletter) from the summarised result.
    summarize(Count = n(), .groups = "drop")
}

# Full dataset; the listed count columns are read as integers.
names_all <- read_csv(
  "names_all.csv",
  col_types = cols(
    B_N_length = col_integer(),
    P_N_dmetalen = col_integer(),
    P_N_phlen = col_integer(),
    P_N_phsyllables = col_integer()
  )
)
first_letter_all <- count_by_letter_gender(names_all)

# Test split (column types are guessed by read_csv).
names_test <- read_csv("names_test.csv")
first_letter_test <- count_by_letter_gender(names_test)

# Training split (column types are guessed by read_csv).
names_train <- read_csv("names_train.csv")
first_letter_train <- count_by_letter_gender(names_train)




# Stacked bar chart of name counts per first letter, filled by gender.
#
# counts: a data frame with columns B_A_firstletter, R_A_gender, Count.
# plot_title: title shown above the panel.
# Returns a ggplot object.
plot_first_letter <- function(counts, plot_title) {
  ggplot(counts, aes(x = B_A_firstletter, y = Count, fill = R_A_gender)) +
    # geom_col() draws bars from the supplied y values and stacks them by
    # default, matching geom_bar(stat = "identity", position = "stack").
    geom_col() +
    ggtitle(plot_title)
}

p1 <- plot_first_letter(first_letter_all, "All")
p2 <- plot_first_letter(first_letter_train, "Train")
p3 <- plot_first_letter(first_letter_test, "Test")

# Left-aligned overall title drawn on its own canvas; margin() arguments are
# top, right, bottom, left, so only the left margin is non-zero.
title <- ggdraw() +
  draw_label(
    "First Letter Frequency by Gender per Dataset",
    x = 0,
    hjust = 0
  ) +
  theme(plot.margin = margin(0, 0, 0, 7))

# Arrange the three panels in a two-column grid.
pg <- plot_grid(p1, p2, p3, ncol = 2)

# Stack the title strip above the panel grid; the title gets a tenth of the
# height given to the panels.
plot_grid(title, pg, ncol = 1, rel_heights = c(0.1, 1))
# Proportion of each gender label across all names (auto-printed).
prop.table(table(names_all$R_A_gender))

# Proportion of names beginning with each first letter.
by_first_let <- prop.table(table(names_all$B_A_firstletter))

# Express the proportions as percentages rounded to two decimals.
df <- data.frame(
  letter = names(by_first_let),
  proportion = round(as.numeric(by_first_let) * 100, 2)
)

# Transpose so the letters run across the table. t() on a data frame with
# mixed column types yields a character matrix, so kable() has no numeric
# columns to round; the values keep the two decimals set above.
t_df <- t(df)

t_df %>%
  kable() %>%
  kable_styling(bootstrap_options = c("hover", "striped"), full_width = FALSE)

# Same table, ordered from most to least common first letter.
df_sorted <- df %>% arrange(desc(proportion))

t(df_sorted) %>%
  kable(caption = "Sorted by First Letter Frequency") %>%
  kable_styling(bootstrap_options = c("hover", "striped"), full_width = FALSE)

# Per-column summary of the full dataset.
skim(names_all)