EDA FOR PROJECT 3

We exported names_all.csv from the NLTK library. It is a two-column file containing the name and gender of every sample.

We add the first letter of each name as a column first_let.

Proportion of Female and Male Names

## 
##    female      male 
## 0.6295317 0.3704683

Name Frequency by First Letter in Alphabetical Order

letter	a	b	c	d	e	f	g	h	i	j	k	l	m	n	o	p	q	r	s	t	u	v	w	x	y	z
proportion	8.26	5.27	7.99	5.72	4.66	2.91	4.65	3.61	1.61	5.50	4.36	5.60	8.61	2.96	1.49	2.79	0.30	5.63	6.89	4.86	0.45	1.95	2.58	0.15	0.43	0.78

Sorted by First Letter Frequency

letter	m	a	c	s	d	r	l	j	b	t	e	g	k	h	n	f	p	w	v	i	o	z	u	y	q	x
proportion	8.61	8.26	7.99	6.89	5.72	5.63	5.60	5.50	5.27	4.86	4.66	4.65	4.36	3.61	2.96	2.91	2.79	2.58	1.95	1.61	1.49	0.78	0.45	0.43	0.30	0.15

Data summary

Name	names_all
Number of rows	7944
Number of columns	29
_______________________
Column type frequency:
character	19
logical	3
numeric	7
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
B_A_name	0	1.00	2	15	7578
B_A_firstletter	0	1.00	1	1	26
B_A_secondletter	0	1.00	1	1	26
B_A_lastletter	1	1.00	1	1	25
P_A_dmetacode	0	1.00	1	8	1756
P_A_phx_00	2693	0.66	1	2	35
P_A_phx_01	2694	0.66	1	2	38
P_A_phx_02	2804	0.65	1	2	39
P_A_phx_03	3475	0.56	1	2	39
P_A_phx_04	4919	0.38	1	2	37
P_A_phx_05	6195	0.22	1	2	35
P_A_phx_06	7179	0.10	1	2	31
P_A_phx_07	7693	0.03	1	2	23
P_A_phx_08	7873	0.01	1	2	14
P_A_phx_09	7930	0.00	1	2	8
P_A_phcode	2693	0.66	3	31	3959
P_A_phfirst	2693	0.66	1	3	54
P_A_phx_last	2693	0.66	1	2	32
R_A_gender	0	1.00	4	6	2

Variable type: logical

skim_variable	n_missing	complete_rate	mean	count
P_A_phx_10	7944	0	NaN	:
P_A_phx_11	7944	0	NaN	:
P_B_phfound	0	1	0.66	TRU: 5251, FAL: 2693

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
B_N_length	1	6.03	1.57	2	5	6	7	15	▂▇▂▁▁
B_N_numconsonants	1	3.55	1.16	1	3	3	4	10	▂▇▂▁▁
B_N_numvowels	1	2.48	0.95	0	2	2	3	6	▃▇▇▂▁
P_N_dmetalen	1	3.12	1.03	1	2	3	4	8	▆▇▆▁▁
P_N_phlen	1	3.27	2.62	0	0	4	5	12	▇▆▇▁▁
P_N_phsyllables	1	1.40	1.17	0	0	2	2	5	▇▇▂▁▁
P_N_phx_stress	1	0.83	0.72	0	0	1	1	4	▅▇▂▁▁

Code

This appendix collects all the R code used in this project for ease of reading.

# Global knitr chunk options for the report: do not echo code, and keep
# messages and warnings out of the rendered output.
knitr::opts_chunk$set(
  echo = FALSE,
  message = FALSE,
  warning = FALSE
)

library(tidyverse)
library(skimr)
library(ggplot2)
library(knitr)
library(kableExtra)
library(cowplot)

# Tally the number of names for every (first letter, gender) combination.
#
# names_df: a data frame with columns B_A_firstletter and R_A_gender.
# Returns an ungrouped tibble with columns B_A_firstletter, R_A_gender and
# Count. count() is used instead of group_by() + summarize() so the result
# does not stay grouped by B_A_firstletter.
count_first_letter_by_gender <- function(names_df) {
  names_df %>%
    count(B_A_firstletter, R_A_gender, name = "Count")
}

# Full dataset: force the length-like columns to be parsed as integers.
names_all <- read_csv(
  "names_all.csv",
  col_types = cols(
    B_N_length = col_integer(),
    P_N_dmetalen = col_integer(),
    P_N_phlen = col_integer(),
    P_N_phsyllables = col_integer()
  )
)
first_letter_all <- count_first_letter_by_gender(names_all)

# Test split (column types are guessed by read_csv).
names_test <- read_csv("names_test.csv")
first_letter_test <- count_first_letter_by_gender(names_test)

# Training split (column types are guessed by read_csv).
names_train <- read_csv("names_train.csv")
first_letter_train <- count_first_letter_by_gender(names_train)




# Build a stacked bar chart of name counts per first letter, split by gender.
#
# counts:      data frame with columns B_A_firstletter, R_A_gender and Count.
# panel_title: title shown above the panel.
# Returns a ggplot object.
plot_first_letter_counts <- function(counts, panel_title) {
  ggplot(
    data = counts,
    aes(x = B_A_firstletter, y = Count, fill = R_A_gender)
  ) +
    # geom_col() plots the precomputed Count values as bar heights.
    geom_col(position = "stack") +
    ggtitle(panel_title)
}

p1 <- plot_first_letter_counts(first_letter_all, "All")
p2 <- plot_first_letter_counts(first_letter_train, "Train")
p3 <- plot_first_letter_counts(first_letter_test, "Test")

# Shared, left-aligned title drawn above the panel grid.
title <- ggdraw() +
  draw_label(
    "First Letter Frequency by Gender per Dataset",
    x = 0,
    hjust = 0
  ) +
  theme(plot.margin = margin(0, 0, 0, 7))

# Three panels laid out on a two-column grid.
pg <- plot_grid(p1, p2, p3, ncol = 2)

# Stack the title over the grid; the title gets a small share of the height.
plot_grid(title, pg, ncol = 1, rel_heights = c(0.1, 1))
# Proportion of each gender label in the full dataset (printed).
prop.table(table(names_all$R_A_gender))

# Proportion of names starting with each first letter.
by_first_let <- prop.table(table(names_all$B_A_firstletter))

# Same proportions as percentages rounded to two decimals, one row per letter.
df <- data.frame(
  letter = names(by_first_let),
  proportion = round(as.numeric(by_first_let) * 100, 2)
)

# Transpose so letters run across the page as columns.
t_df <- t(df)

# NOTE(review): t() on a data frame with a character column yields a
# character matrix, so `digits = 1` likely has no effect here -- confirm
# before relying on it for rounding.
t_df %>%
  kable(digits = 1) %>%
  kable_styling(
    bootstrap_options = c("hover", "striped"),
    full_width = FALSE
  )

# Same table, ordered from the most to the least frequent first letter.
df_sorted <- df %>%
  arrange(desc(proportion))

t(df_sorted) %>%
  kable(digits = 1, caption = "Sorted by First Letter Frequency") %>%
  kable_styling(
    bootstrap_options = c("hover", "striped"),
    full_width = FALSE
  )

# Per-column summary statistics for the full dataset.
skim(names_all)

Data 620 PROJECT 3 EDA

Alexander Ng

04/01/2022

EDA FOR PROJECT 3

Code