#Packages needed
library(forcats)
library(tidyverse)
library(ggplot2)
library(scales)
library(dplyr)
library(RColorBrewer)
#Dataframes to Start
# Total births per name over 2000-2020, one table per gender, most popular first.
female_names <- df %>%
  filter(Gender == "F", Year >= 2000, Year <= 2020) %>% # girls' rows inside the 2000-2020 window
  group_by(Name) %>%
  summarize(Total_Births = sum(Births)) %>% # collapse the yearly rows into one total per name
  arrange(desc(Total_Births)) # biggest totals at the top
male_names <- df %>%
  filter(Gender == "M", Year >= 2000, Year <= 2020) %>% # boys' rows inside the 2000-2020 window
  group_by(Name) %>%
  summarize(Total_Births = sum(Births)) %>% # collapse the yearly rows into one total per name
  arrange(desc(Total_Births)) # biggest totals at the top
This dataset was found on Kaggle. It summarizes the top 1000 baby names, per year, in the United States from the Social Security Administration from 1880 until 2020. The dataset includes different variables such as year, name, gender, number of births for a given name and gender, and the rank at which that name placed within its respective year.
With few variables to work with, it felt as though not many questions could be explored. With the help of some data manipulation and analysis, I decided to home in on five insightful questions to explore trends within this dataset.
A quick look at the data is shown below:
## Year Name Gender Births Rank
## 1: 2020 Liam M 19659 1
## 2: 2020 Noah M 18252 2
## 3: 2020 Oliver M 14147 3
## 4: 2020 Elijah M 13034 4
## 5: 2020 William M 12541 5
## 6: 2020 James M 12250 6
The first step in analyzing this data was to see what trends lay within the number of children being born each year. I wanted to see if phenomena such as the “Baby Boom” from 1946 to 1964 were reflected in this dataset. Furthermore, I also wanted to explore how the births of males and females differed over this time period, and if one gender consistently had more births than the other.
# Births per year: everyone combined, then each gender on its own.
years_df <- df %>%
  group_by(Year) %>%
  summarize(Total_Births = sum(Births)) # all names and both genders added together
female_years <- df %>%
  filter(Gender == "F") %>% # keep the girls' rows only
  group_by(Year) %>%
  summarize(Total_Births = sum(Births)) # yearly total for girls
male_years <- df %>%
  filter(Gender == "M") %>% # keep the boys' rows only
  group_by(Year) %>%
  summarize(Total_Births = sum(Births)) # yearly total for boys
# Line chart of yearly births drawn from the three yearly-total data frames:
# everyone, girls only, and boys only. Births are divided by 1e6 so the y axis
# reads in millions.
ggplot() +
  geom_line(data = years_df, aes(x = Year, y = Total_Births / 1e6, color = 'Total Births')) + # all genders
  geom_line(data = female_years, aes(x = Year, y = Total_Births / 1e6, color = 'Females')) + # girls
  geom_line(data = male_years, aes(x = Year, y = Total_Births / 1e6, color = 'Males')) + # boys
  # Colours are tied to the legend labels by name. An unnamed vector is matched
  # to the labels in alphabetical order, so renaming a label would silently
  # swap the colours.
  scale_color_manual(values = c('Females' = 'Pink', 'Males' = 'Blue', 'Total Births' = 'Black')) +
  scale_y_continuous(labels = unit_format(unit = "M")) + # append "M" (millions) to the y-axis labels
  scale_x_continuous(limits = c(1880, 2020)) + # x axis spans the years covered by the data
  labs(title = "Births by Gender", x = 'Year', y = 'Number of Births', color = 'Genders') + # title, axis and legend labels
  theme(plot.title = element_text(hjust = 0.5)) # centre the plot title
From the data, we can see that the “Baby Boom” phenomenon is shown by a skyrocketing number of births between the years 1940 and 1960. This is then followed by the Vietnam War, which occurred between 1954 and 1975, where we see the number of children born drop by over one million births. This behavior is to be expected as many men were deployed while their wives stayed home.
When we dive into the births by gender we can see that more females were born than males from 1880 until almost 1935. After 1935 we almost consistently see more males born than females all the way through the year 2020.
Following along with the theme of United States history, I decided to look at trends that may lie within boy names that correspond to the sitting presidents. Franklin Roosevelt sat as Commander in Chief from 1933 to 1945. Given that Franklin is a unique name in this day and age, I wanted to see if his presidency affected the frequency of Franklins born during his time in office. I then wanted to look at a president who was considered “popular” to see if his presidency held weight for babies born during his time in office. I settled on looking at John F. Kennedy, who served as Commander in Chief from 1961 to 1963.
#-------------------------------Scatter-------------------------------------#
# John F Kennedy 1961-1963
# Franklin Roosevelt 1933-1945
# Yearly births of boys named John and of boys named Franklin, as plain data
# frames with the columns Year, Births and Name.
# The earlier group_by(Year) step was dropped: no grouped operation followed it
# and the conversion to a plain data frame discarded the grouping anyway.
john_df <- df %>%
  filter(Gender == "M", Name == "John") %>% # boys named John only
  select(Year, Births, Name) %>% # keep just the columns the scatter plot needs
  as.data.frame()
franklin_df <- df %>%
  filter(Gender == "M", Name == "Franklin") %>% # boys named Franklin only
  select(Year, Births, Name) %>% # keep just the columns the scatter plot needs
  as.data.frame()
# Scatter plot of yearly births for the names John and Franklin, zoomed to
# 1930-1965, with dashed vertical lines marking the start and end of each
# presidency (blue: Kennedy 1961-1963, red: Roosevelt 1933-1945).
ggplot() +
  geom_point(data = john_df, aes(x = Year, y = Births, color = 'John')) + # one point per year for John
  geom_point(data = franklin_df, aes(x = Year, y = Births, color = 'Franklin')) + # one point per year for Franklin
  geom_vline(xintercept = c(1961, 1963), linetype = "dashed", color = "blue") + # Kennedy's years in office
  geom_vline(xintercept = c(1933, 1945), linetype = "dashed", color = "red") + # Roosevelt's years in office
  scale_x_continuous(limits = c(1930, 1965)) + # show only the years around the two presidencies
  labs(title = "Effect of President's on Boy Names", color = 'Names of Presidents') + # plot and legend titles
  theme(plot.title = element_text(hjust = 0.5)) # centre the plot title
From the scatter plot we can see Franklin Roosevelt’s time as President encapsulated by the red dashed lines and, similarly, John F. Kennedy’s term encapsulated by the blue dashed lines. We can see that during Roosevelt’s first year in office there was an uptick in male babies born carrying his name. This trend, however, died down shortly after his first year in office and never significantly trended upwards afterwards. For Kennedy we see a similar pattern. In his first year in office there is an uptick of males born carrying the name John, but come his second year in office those numbers drop back down. In 1963 he was assassinated and as a result his presidency terminated. It is interesting to see that after his assassination there looks to be an uptick in the number of males born with his name. Now, we cannot infer that people were naming their children after the president, but this would be an interesting route of analysis to pursue further.
Shifting gears away from United States History, selfishly, I was born in the year 2000 and now that I had a general idea of what the overall births looked like I decided to investigate certain trends in this dataset that corresponded with the year in which I was born through 2020.
#--------------------------------Bar Chart-----------------------------------#
# Bar chart of the five most common girls' names for 2000-2020.
# head() replaces [1:5, ] so that a table with fewer than five names is plotted
# as-is instead of being padded with all-NA rows.
# Bars are ordered tallest first; y is in thousands to match the "K" axis labels.
ggplot(head(female_names, 5), aes(x = reorder(Name, -Total_Births), y = Total_Births / 1e3)) +
  geom_col(colour = "black", fill = "pink") + # bar heights are the values themselves; black outline, pink fill
  # Exact count printed just below the top of each bar. vjust is a constant,
  # so it is set outside aes() rather than mapped as an aesthetic.
  geom_text(aes(label = scales::comma(Total_Births)), vjust = 1.6) +
  labs(title = "Top 5 Female Names Since 2000", x = "Names", y = "Number of Births") + # plot and axis titles
  theme(plot.title = element_text(hjust = 0.5)) + # centre the plot title
  scale_y_continuous(labels = unit_format(unit = "K")) # append "K" (thousands) to the y-axis labels
# Bar chart of the five most common boys' names for 2000-2020.
# head() replaces [1:5, ] so that a table with fewer than five names is plotted
# as-is instead of being padded with all-NA rows.
# Bars are ordered tallest first; y is in thousands to match the "K" axis labels.
ggplot(head(male_names, 5), aes(x = reorder(Name, -Total_Births), y = Total_Births / 1e3)) +
  geom_col(colour = "black", fill = "steelblue") + # bar heights are the values themselves; black outline, blue fill
  # Exact count printed just below the top of each bar. vjust is a constant,
  # so it is set outside aes() rather than mapped as an aesthetic.
  geom_text(aes(label = scales::comma(Total_Births)), vjust = 1.6) +
  labs(title = "Top 5 Male Names Since 2000", x = "Names", y = "Number of Births") + # plot and axis titles
  theme(plot.title = element_text(hjust = 0.5)) + # centre the plot title
  scale_y_continuous(labels = unit_format(unit = "K")) # append "K" (thousands) to the y-axis labels
From these bar charts we can see that between 2000 and 2020 the most popular female names were Emma, Olivia, Isabella, Emily, and Sophia accounting for 1.7 million births. Similarly we saw the most popular male names were Jacob, Michael, William, Ethan, and Matthew accounting for just over 1.9 million births.
Looking at the top female names from 2000 onward got me curious about how my own name would rank amongst these. The approach here was to find the total number of births that occurred for my name, Chloe, and append that data to the preexisting dataset for the top female names. The goal here was to see how much “weight” my name held amongst the top 5 names, or more precisely, what percent of births came from my name compared to the top 5.
# Total births of girls named Chloe over 2000-2020, as a plain data frame with
# the same two columns (Name, Total_Births) as the top-names table.
chloe_df <- df %>%
  filter(Gender == "F", Name == 'Chloe', Year >= 2000, Year <= 2020) %>% # Chloe rows inside the 2000-2020 window
  group_by(Name) %>%
  summarize(Total_Births = sum(Births)) %>% # single total for the name
  data.frame()
# Top five girls' names plus Chloe, with each name's share of the six-name
# total (perc) and that share formatted as a percentage string (labels),
# sorted from the smallest share to the largest.
top_chloe <- rbind(female_names[seq_len(5), ], chloe_df) %>% # stack the top five and the Chloe row
  mutate(
    perc = Total_Births / sum(Total_Births), # share of the combined births
    labels = scales::percent(perc) # share as text for the pie labels
  ) %>%
  arrange(perc) %>% # smallest share first
  data.frame()
#-------------------------------Pie Graph-------------------------------------#
# Pie chart of total births for the top five girls' names plus Chloe.
# A pie is a single stacked bar (x = "") drawn in polar coordinates.
ggplot(data = top_chloe, aes(x = "", y = Total_Births, fill = Name)) +
  geom_col(color = "black") + # one stacked bar, black borders between slices
  geom_label(aes(label = labels), # percentage labels computed in top_chloe
             position = position_stack(vjust = 0.5), # centre each label within its slice
             show.legend = FALSE) + # keep the label glyph out of the legend
  coord_polar(theta = "y") + # map total births to the angle, turning the bar into a pie
  scale_fill_brewer(palette = "PuRd") + # purple-red palette for the slices
  # The legend title is set through labs(fill = ); theme(legend.title = ) needs
  # an element_text object and cannot take a plain string.
  labs(title = "Top 5 Names Compared to My Name from 2000 to 2020", fill = "Names") +
  # theme_void() is a complete theme and replaces any theme settings added
  # before it, so it must come first and the title centering after it.
  theme_void() + # remove axes, grid and the outer circle
  theme(plot.title = element_text(hjust = 0.5)) # centre the plot title
The pie chart shows that when including my name into the top 5 female names from 2000 and on, “Chloe” doesn’t hold much significance among the others. In this pool of 6 names Chloe only makes up about 9.7% of the total births for this 20 year time frame. Even further it very likely holds even less weight when compared to all the female names that occurred in this period of time.
Now that we’ve looked at which names were “trendy” the last 20 years and how my name compared to those female names I decided to look at one last interesting trend. Names for men and women are simply letters strung together and phonetically sounded out. I wanted to see if there were trends in how names were created…
The goal for this was to see, for the year 2000, of all the names recorded in the dataset, if there were combinations of first and second letters of a name that were most popular. Similarly, I sought out the same trend for the first and last letter of all the names. The findings are presented below in two heat maps.
# Add three helper columns to df: the first, second and last letter of each
# name. The second and last letters are upper-cased; the first letter is taken
# as it appears in Name.
# Columns are referenced by bare name (Name, not df$Name) so mutate() always
# works on the rows flowing through the pipe.
df <- df %>%
  mutate(
    first_letter = str_sub(Name, 1, 1), # first character of the name
    second_letter = str_to_upper(str_sub(Name, 2, 2)), # second character, upper-cased
    last_letter = str_to_upper(str_sub(Name, -1, -1)) # final character, upper-cased
  )
# Births in the year 2000 for every (first, second, last) letter combination,
# plus each combination's percentage of that year's total births.
# The result has one row per letter triple.
Year_2000 <- df %>%
  filter(Year == 2000) %>% # names recorded for 2000 only
  group_by(first_letter, second_letter, last_letter, Year) %>%
  summarise(tot_births = sum(Births), .groups = "drop") %>% # births per letter triple; drop groups explicitly to avoid the regrouping message
  group_by(Year) %>% # percentages are taken within the year
  mutate(percent_birth = 100 * tot_births / sum(tot_births)) %>% # share of the year's births, in percent
  ungroup() # leave the result ungrouped
#creating heat maps
# Heat map of first-letter / second-letter combinations for names in 2000.
# Year_2000 holds one row per (first, second, last) letter triple, so several
# rows share the same (first, second) cell. Plotting them directly stacks the
# tiles and only the one drawn last is visible. The percentages are therefore
# summed per (first, second) pair first, giving exactly one tile per cell.
Year_2000 %>%
  group_by(first_letter, second_letter) %>%
  summarise(percent_birth = sum(percent_birth), .groups = "drop") %>% # total share of each letter pair
  ggplot(aes(x = second_letter, y = first_letter)) + # second letter across, first letter up
  geom_tile(aes(fill = percent_birth)) + # tile colour encodes the pair's share of births
  scale_fill_gradient(low = "steelblue", high = "red") + # blue = rare, red = common
  labs(title = 'Distribution of First Letter and Second Letter of Names in 2000', # plot, axis and legend titles
       x = 'Second Letter', y = 'First Letter ', fill = 'Combination %') +
  theme(plot.title = element_text(hjust = 0.5), axis.ticks = element_blank()) # centre the title and hide the axis tick marks
When comparing the first and second letter found in all of the names born in 2000, we can see that the most frequently used combination are the letters “N” and “I” followed by “E” and “M”. This finding is particularly interesting because from our previous bar graph we knew the name Emma ranked in the top 5 most popular female names from 2000 to 2020.
The letter combination “N” and “I” comes as more of a shock as none of the top 5 names for female or male names contain these two consecutive letters in the first two spots of the names. Further analysis would need to be done for the top 5 male and female names for strictly the year 2000 to see if the names would corroborate or make sense of this finding.
This next heatmap shows the percentage of births that occurred for combinations of first and last letters of names recorded in 2000.
# Heat map of first-letter / last-letter combinations for names in 2000.
# Year_2000 holds one row per (first, second, last) letter triple, so several
# rows share the same (first, last) cell. Plotting them directly stacks the
# tiles and only the one drawn last is visible. The percentages are therefore
# summed per (first, last) pair first, giving exactly one tile per cell.
Year_2000 %>%
  group_by(first_letter, last_letter) %>%
  summarise(percent_birth = sum(percent_birth), .groups = "drop") %>% # total share of each letter pair
  ggplot(aes(x = last_letter, y = first_letter)) + # last letter across, first letter up
  geom_tile(aes(fill = percent_birth)) + # tile colour encodes the pair's share of births
  scale_fill_gradient(low = "steelblue", high = "red") + # blue = rare, red = common
  labs(title = 'Distribution of First Letter and Last Letter of Names in 2000', # plot, axis and legend titles
       x = 'Last Letter', y = 'First Letter ', fill = 'Combination %') +
  theme(plot.title = element_text(hjust = 0.5), axis.ticks = element_blank()) # centre the title and hide the axis tick marks
From this heatmap we can see the two most popular combinations of first and last letters used in names from 2000 were “M” and “L” as well as “J” and “B”. This data follows what we saw from our previous bar chart for the top 5 male names from 2000 to 2020. Two of the names that remained the most popular for those years were Michael and Jacob. Coincidentally, or perhaps not, the first and last letter for each of those names follows the data we see from the heat map above.
An interesting aside is that “M” and “L” combinations rank higher than “J” and “B”, which is the opposite of the data we presented in the bar graphs. For the 20-year time frame we saw that about 41,000 more “Jacobs” were born than “Michaels”. A simple explanation of this is that our heatmap is only focusing on the year 2000, and it is possible that more Michaels, along with other names that start and end with “M” and “L”, were born than Jacobs, or names beginning and ending with “J” and “B”.
Now if I were to do this assignment over again, I would have not chosen this dataset. With few variables to work with my options felt very limited on what I could present graphically. Moreover, I didn’t see a plausible use of a scatter plot but essentially forced one for the diversification of the visualizations shown.
I hope I entertained you, and provided an interesting story line for this baby name data!