#Packages needed
library(forcats)
library(tidyverse)
library(ggplot2)
library(scales)
library(dplyr)
library(RColorBrewer)

#Dataframes to Start
# Total births per female name for 2000-2020, most popular name first.
female_names <- df %>%
  # Keep female records from 2000 through 2020 (both endpoints included).
  filter(Gender == "F", Year >= 2000, Year <= 2020) %>%
  group_by(Name) %>%
  # Add up the births recorded for each name over the whole window.
  summarise(Total_Births = sum(Births)) %>%
  arrange(desc(Total_Births))
  
# Total births per male name for 2000-2020, most popular name first.
male_names <- df %>%
  # Restrict to male records inside the 2000-2020 window (inclusive).
  filter(Gender == "M", Year >= 2000, Year <= 2020) %>%
  group_by(Name) %>%
  # One row per name holding the sum of its births across those years.
  summarise(Total_Births = sum(Births)) %>%
  arrange(desc(Total_Births))

Top Baby Names in the United States between 1880 and 2020

This dataset was found on Kaggle. It summarizes the top 1000 baby names, per year, in the United States from the Social Security Administration from 1880 until 2020. The dataset includes different variables such as year, name, gender, number of births for a given name and gender, and the rank at which that name placed within its respective year.

With so few variables to work with, it felt as though not many questions could be explored. With the help of some data manipulation and analysis, I decided to home in on five insightful questions to explore trends within this dataset.

A quick look at the data is shown below:

##    Year    Name Gender Births Rank
## 1: 2020    Liam      M  19659    1
## 2: 2020    Noah      M  18252    2
## 3: 2020  Oliver      M  14147    3
## 4: 2020  Elijah      M  13034    4
## 5: 2020 William      M  12541    5
## 6: 2020   James      M  12250    6

How did the number of births per year change over this time period?

The first step in analyzing this data was to see what trends lay within the number of children being born each year. I wanted to see if phenomena such as the “Baby Boom” from 1946 to 1964 were reflected in this dataset. Furthermore, I also wanted to explore how the births of males and females differed over this time period, and whether one gender consistently had more births than the other.

# Total births per year, both genders combined.
years_df <- df %>%
  group_by(Year) %>%
  summarise(Total_Births = sum(Births))


# Total female births per year.
female_years <- df %>%
  filter(Gender == "F") %>%
  group_by(Year) %>%
  summarise(Total_Births = sum(Births))

# Total male births per year.
male_years <- df %>%
  filter(Gender == "M") %>%
  group_by(Year) %>%
  summarise(Total_Births = sum(Births))

# Line graph of yearly births (in millions) from the three yearly data frames:
# all births, female births, and male births.
ggplot() +
  geom_line(
    data = years_df,
    aes(x = Year, y = Total_Births / 1e6, color = "Total Births")
  ) +
  geom_line(
    data = female_years,
    aes(x = Year, y = Total_Births / 1e6, color = "Females")
  ) +
  geom_line(
    data = male_years,
    aes(x = Year, y = Total_Births / 1e6, color = "Males")
  ) +
  # Name each colour by its legend label so the pairing is explicit instead of
  # depending on the labels happening to sort alphabetically into this order.
  scale_color_manual(
    values = c("Females" = "Pink", "Males" = "Blue", "Total Births" = "Black")
  ) +
  # y values were divided by 1e6 above, so label the axis with an "M" unit.
  scale_y_continuous(labels = unit_format(unit = "M")) +
  scale_x_continuous(limits = c(1880, 2020)) +
  labs(
    title = "Births by Gender",
    x = "Year",
    y = "Number of Births",
    color = "Genders"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) # centre the plot title

From the data, we can see the “Baby Boom” reflected in a skyrocketing number of births between 1940 and 1960. This is followed by the Vietnam War, which occurred between 1954 and 1975, during which the number of births drops by over one million. This behavior is to be expected, as many men were deployed while their wives stayed home.

When we dive into the births by gender we can see that more females were born than males from 1880 until almost 1935. After 1935 we almost consistently see more males born than females all the way through the year 2020.

Following along with the theme of United States history, I decided to look at trends that may lie within boy names that correspond to the sitting presidents. Franklin Roosevelt sat as Commander in Chief from 1933 to 1945. Given that Franklin is an uncommon name in this day and age, I wanted to see if his presidency affected the number of Franklins born during his time in office. I then wanted to look at a president who was considered “popular” to see if his presidency held weight for babies born during his time in office. I settled on John F. Kennedy, who served as Commander in Chief from 1961 to 1963.

How did Presidents affect Male births during their term?

#-------------------------------Scatter-------------------------------------#
# John F Kennedy 1961-1963
# Franklin Roosevelt 1933-1945

# Yearly birth counts for boys named John, as a plain data frame with the
# columns Year, Births and Name. No grouping step is applied: the rows are
# only filtered and never aggregated, and data.frame() returns an ungrouped
# result regardless.
john_df <- df %>%
  filter(Gender == "M", Name == "John") %>%
  select(Year, Births, Name) %>%
  data.frame()
# Yearly birth counts for boys named Franklin, as a plain data frame with the
# columns Year, Births and Name. No grouping step is applied: the rows are
# only filtered and never aggregated, and data.frame() returns an ungrouped
# result regardless.
franklin_df <- df %>%
  filter(Gender == "M", Name == "Franklin") %>%
  select(Year, Births, Name) %>%
  data.frame()
  
# Scatter plot of yearly births for the names John and Franklin, with dashed
# vertical lines marking the first and last year of each presidency.
ggplot() +
  geom_point(data = john_df, aes(x = Year, y = Births, color = "John")) +
  geom_point(
    data = franklin_df,
    aes(x = Year, y = Births, color = "Franklin")
  ) +
  labs(
    title = "Effect of Presidents on Boy Names",
    color = "Names of Presidents"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) + # centre the plot title
  # Show only the span of years that covers the two presidencies.
  xlim(1930, 1965) +
  # John F. Kennedy, 1961-1963.
  geom_vline(xintercept = c(1961, 1963), linetype = "dashed", color = "blue") +
  # Franklin Roosevelt, 1933-1945.
  geom_vline(xintercept = c(1933, 1945), linetype = "dashed", color = "red")

From the scatter plot we can see Franklin Roosevelt’s time as President encapsulated by the red dashed lines and, similarly, John F. Kennedy’s term encapsulated by the blue dashed lines. We can see that during Roosevelt’s first year in office there was an uptick in male babies born carrying his name. This trend, however, died down shortly after his first year in office and never significantly trended upwards afterwards. For Kennedy we see a similar pattern. In his first year in office there is an uptick of males born carrying the name John, but come his second year in office those numbers drop back down. In 1963 he was assassinated and, as a result, his presidency ended. It is interesting to see that after his assassination there looks to be an uptick in the number of males born with his name. We cannot infer that people were naming their children after the president, but this would be an interesting route of analysis to pursue further.

Shifting gears away from United States History, selfishly, I was born in the year 2000 and now that I had a general idea of what the overall births looked like I decided to investigate certain trends in this dataset that corresponded with the year in which I was born through 2020.

What were the top baby names for males and females from 2000 to 2020?

#--------------------------------Bar Chart-----------------------------------#
# Bar chart of the five most popular female names. Bars run from most to
# least births; heights are in thousands to match the "K" unit on the y axis.
ggplot(
  female_names[1:5, ],
  aes(x = reorder(Name, -Total_Births), y = Total_Births / 1e3)
) +
  geom_col(colour = "black", fill = "pink") +
  # Print the exact birth count just inside the top of each bar.
  geom_text(aes(label = scales::comma(Total_Births)), vjust = 1.6) +
  labs(
    title = "Top 5 Female Names Since 2000",
    x = "Names",
    y = "Number of Births"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) + # centre the plot title
  scale_y_continuous(labels = unit_format(unit = "K"))

# Bar chart of the five most popular male names. Bars run from most to
# least births; heights are in thousands to match the "K" unit on the y axis.
ggplot(
  male_names[1:5, ],
  aes(x = reorder(Name, -Total_Births), y = Total_Births / 1e3)
) +
  geom_col(colour = "black", fill = "steelblue") +
  # Print the exact birth count just inside the top of each bar.
  geom_text(aes(label = scales::comma(Total_Births)), vjust = 1.6) +
  labs(
    title = "Top 5 Male Names Since 2000",
    x = "Names",
    y = "Number of Births"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) + # centre the plot title
  scale_y_continuous(labels = unit_format(unit = "K"))

From these bar charts we can see that between 2000 and 2020 the most popular female names were Emma, Olivia, Isabella, Emily, and Sophia accounting for 1.7 million births. Similarly we saw the most popular male names were Jacob, Michael, William, Ethan, and Matthew accounting for just over 1.9 million births.

Looking at the top female names from 2000 onward got me curious about how my own name would rank amongst these. The approach here was to find the total number of births that occurred for my name, Chloe, and append that data to the preexisting dataset for the top female names. The goal was to see how much “weight” my name held amongst the top 5 names, or more precisely, what percent of births came from my name compared to the top 5.

What percentage of births came from my name compared to the top 5 female names from 2000 and on?

# Total births for the female name Chloe over 2000-2020, as a plain data
# frame with the columns Name and Total_Births.
chloe_df <- df %>%
  filter(Gender == "F", Name == "Chloe", Year >= 2000, Year <= 2020) %>%
  group_by(Name) %>%
  # Only one name survives the filter, so this yields at most a single row.
  summarise(Total_Births = sum(Births)) %>%
  data.frame()

# Top five female names plus Chloe, with each name's share of the six names'
# combined births, sorted from smallest to largest share.
top_chloe <- rbind(female_names[1:5, ], chloe_df) %>%
  mutate(
    perc = Total_Births / sum(Total_Births), # share as a fraction
    labels = scales::percent(perc)           # share as percentage text
  ) %>%
  arrange(perc) %>%
  data.frame()
 

#-------------------------------Pie Graph-------------------------------------#

# Pie chart of each name's share of births among the top five female names
# plus Chloe. A single stacked bar of Total_Births is wrapped into a circle
# by mapping y to the polar angle.
ggplot(data = top_chloe, aes(x = "", y = Total_Births, fill = Name)) +
  geom_col(color = "black") +
  # Percentage labels (precomputed in top_chloe) centred in each slice.
  geom_label(
    aes(label = labels),
    position = position_stack(vjust = 0.5),
    show.legend = FALSE
  ) +
  coord_polar(theta = "y") +
  scale_fill_brewer(palette = "PuRd") +
  # The legend title belongs to the fill scale, so it is set through labs().
  labs(
    title = "Top 5 Names Compared to My Name from 2000 to 2020",
    fill = "Names"
  ) +
  # theme_void() is a complete theme and replaces every theme setting added
  # before it, so it has to come first and the title tweak after it.
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5)) # centre the plot title

The pie chart shows that when my name is included with the top 5 female names from 2000 onward, “Chloe” doesn’t hold much significance among the others. In this pool of 6 names, Chloe only makes up about 9.7% of the total births for this 20-year time frame. Furthermore, it very likely holds even less weight when compared to all the female names that occurred in this period of time.

Now that we’ve looked at which names were “trendy” the last 20 years and how my name compared to those female names I decided to look at one last interesting trend. Names for men and women are simply letters strung together and phonetically sounded out. I wanted to see if there were trends in how names were created…