Descriptive Analysis of Baby Names in the US

Reading all the files in one big dataset.

#Download the file
# temp <- tempfile()
# download.file("http://www.ssa.gov/oact/babynames/state/namesbystate.zip",temp)
# fname = unzip(temp, list=TRUE)
# unzip(temp, files=fname$Name, exdir="namesbystate", overwrite=TRUE)

#Reading all files in one dataframe
#file_list <- list.files(path="namesbystate",pattern ="*.TXT")
file_list <- list.files(pattern ="*.TXT")

for (file in file_list){
  
# if the merged dataset doesn't exist, create it
  if (!exists("dataset")){
    dataset <- read.table(file, header= FALSE, sep=",")
    dataset <- dataset[!duplicated(dataset), ]
  }
  
  # if the merged dataset does exist, append to it
  if (exists("dataset")){
    temp_dataset <-read.table(file, header=FALSE, sep=",")
    temp_dataset <- temp_dataset[!duplicated(temp_dataset), ]
    dataset<-rbind(dataset, temp_dataset)
    rm(temp_dataset)
  }
  
}

#Adding column names
colnames(dataset) <- c("State","Gender","Year","Name","occurrence")

Q1. Format of data files? Limitations or distortion in data?

The data for each state are stored in a comma separated list. The columns/variables in the dataset are:

State: the 2-letter code of the state Gender: Gender of the child - M or F Year: Year of birth in 4-digit format Name: Name of the child Count: Number of times the name occurred in the state in a particular year

To safeguard privacy, the list of names have been restricted to at least 5 occurrences. If a name has less than 5 occurrences for a year of birth in any state, the sum of the state counts for that year will be less than the national count. The dataset is hence biased since it does not include all names used in a state in a particular year.

Dataset also has duplicates, which were removed while importing.

cat("There are",length(unique(dataset$Name)),"unique names in the dataset. Of which, there are",length(unique(subset(dataset$Name,dataset$Gender=="F"))),"unique names for girls,",length(unique(subset(dataset$Name,dataset$Gender=="M"))),"unique names for boys and", length(unique(subset(dataset$Name,dataset$Gender=="F"))) + length(unique(subset(dataset$Name,dataset$Gender=="M"))) - length(unique(dataset$Name)),"names have been used for both genders")

## There are 30274 unique names in the dataset. Of which, there are 20031 unique names for girls, 13139 unique names for boys and 2896 names have been used for both genders

Q2. Most popular name of all times:

Approach: Aggregated the occurrences by Name for all years and gender. Reported the name with maximum occurrence through time.

#Irrespective of gender
allnames <- aggregate(dataset$occurrence,by = list(dataset$Name), FUN = sum, na.rm = T)

#Printing the most popular name
cat(as.character(allnames[which.max(allnames$x),][[1]]),"is the most popular names of all times")

## James is the most popular names of all times

#Removing the files
rm(allnames,file,file_list)

Q3. Most gender ambiguous names in 2013 and 1945.

Approach: First, aggregated the occurrence partitioned by Name and Gender. Transformed the dataset to view occurrences of each name across both the genders. Retained only those names which had been used for both genders. Calculated 2 metrics from the dataset: (a) Absolute difference in occurrences between male and female and (b) Minimum of Ratio of M:F or F:M occurrences. I calculated ratios since its possible that most popular names would not have an exact match in occurrences for both male and female, but would relatively be more popular than the ones with exact match for M and F.Further, subsetted the dataset to the ones where ratio was greater than 0.8. To normalize and rank the results, I multiplied the min(M/F or F/M) by the numerator of minimum value.

After sorting by the normalized result, I picked the top 5 Names as the most ambiguous names.

The most gender ambiguous names in 2013:

library(tidyr)

#2013

#Subsetting and getting the data in required format
data2013 <- subset(dataset, dataset$Year == 2013)
data2013 <- aggregate(data2013$occurrence, by = list(data2013$Name,data2013$Gender), FUN = sum)
colnames(data2013) <- c("Name","Gender","occurrence")
data2013 <- tidyr::spread(data2013,Gender,occurrence, fill = NA)
data2013 <- subset(data2013,!is.na(data2013$F) & !is.na(data2013$M))

#Creating variables with calculations to apply the logic described
data2013$diff <- abs(data2013$F - data2013$M)
data2013$FMRatio <- ifelse((data2013$F/data2013$M)<=(data2013$M/data2013$F),data2013$F/data2013$M,data2013$M/data2013$F)
data2013$Norm <- ifelse((data2013$F/data2013$M)<=(data2013$M/data2013$F),(data2013$F^2/data2013$M),data2013$M^2/data2013$F)
data2013.sub <- subset(data2013,data2013$FMRatio > 0.8)
data2013.sub <- data2013.sub[order(-data2013.sub$Norm),]

cat("Most gender ambiguous names for 2013 were",head(as.character(data2013.sub$Name),5))

## Most gender ambiguous names for 2013 were Charlie Dakota Justice Milan Lennon

The most gender ambiguous names in 1945:

#1945

#Subsetting and getting the data in required format
data1945 <- subset(dataset, dataset$Year == 1945)
data1945 <- aggregate(data1945$occurrence, by = list(data1945$Name,data1945$Gender), FUN = sum)
colnames(data1945) <- c("Name","Gender","occurrence")
data1945 <- tidyr::spread(data1945,Gender,occurrence, fill = NA)
data1945 <- subset(data1945,!is.na(data1945$F) & !is.na(data1945$M))

#Creating variables with calculations to apply the logic described
data1945$diff <- abs(data1945$F - data1945$M)
data1945$FMRatio <- ifelse((data1945$F/data1945$M)<=(data1945$M/data1945$F),data1945$F/data1945$M,data1945$M/data1945$F)
data1945$Norm <- ifelse((data1945$F/data1945$M)<=(data1945$M/data1945$F),(data1945$F^2/data1945$M),data1945$M^2/data1945$F)
data1945.sub <- subset(data1945,data1945$FMRatio > 0.8)
data1945.sub <- data1945.sub[order(-data1945.sub$Norm),]

cat("Most gender ambiguous names for 1945 were",head(as.character(data1945.sub$Name),5))

## Most gender ambiguous names for 1945 were Leslie Jackie Jessie Frankie Lavern

#Removing datasets
rm(data1945,data2013,data2013.sub,data1945.sub)

Q4. Name with largest %age increase/decrease in popularity after 1980 (Irrespective of gender)

Approach: Took a simple approach and identified the year till which we have data. Checked percentage difference of names in 1980 and max year and reported the percentage increase or decrease for names which were present in both years

#Latest Year information available
cat("We have information till",max(dataset$Year))

## We have information till 2014

#Subsetting data for 1980 and 2014
After1980 <- subset(dataset,dataset$Year == 1980 | dataset$Year == 2014)

After1980 <- aggregate(After1980$occurrence, by = list(After1980$Name,After1980$Year), FUN = sum)
colnames(After1980) <- c("Name","Year","occurrence")
After1980 <- tidyr::spread(After1980,Year,occurrence, fill = NA)

#Only retain Names which were present in both 1980 and 2014
After1980 <- subset(After1980, !is.na(After1980$`1980`) & !is.na(After1980$`2014`))

After1980$PerChange <- round((After1980$`2014` - After1980$`1980`)/After1980$`1980`,2)*100
After1980 <- After1980[order(-After1980$PerChange,-After1980$`2014`),]

cat("Name with largest percentage increase in popularity between 1980 and 2014 is",head(as.character(After1980$Name),1),"with change of",head(After1980$PerChange,1),"%")

## Name with largest percentage increase in popularity between 1980 and 2014 is Colton with change of 127000 %

cat("Name with largest percentage decrease in popularity between 1980 and 2014 is",tail(as.character(After1980$Name),1),"with change of",tail(After1980$PerChange,1),"%")

## Name with largest percentage decrease in popularity between 1980 and 2014 is Terri with change of -100 %

Q5. Names which could have had even larger increase of decrease in popularity?

These could be names which were not present in 1980 but started becoming popular thereafter. Or names which did not feature in 2014 because the count was < 5.

Approach: Used the same data manipulation logic as above. Just instead, retained only those Names which had either no mention in 1980 or 2014. Replaced all the no mentiones by 1 to indicate the presence of name and to calculate percentage change.

#Subsetting data for 1980 and 2014
After1980 <- subset(dataset,dataset$Year == 1980 | dataset$Year == 2014)

After1980 <- aggregate(After1980$occurrence, by = list(After1980$Name,After1980$Year), FUN = sum)
colnames(After1980) <- c("Name","Year","occurrence")
After1980 <- tidyr::spread(After1980,Year,occurrence, fill = NA)

#Only retain Names which were present in either 1980 and 2014
After1980 <- subset(After1980, is.na(After1980$`1980`) | is.na(After1980$`2014`))

#Replacing all NAs with 1 to indicate that the name was either not present at all or had a count < 5
After1980[is.na(After1980)] <- 1

After1980$PerChange <- round((After1980$`2014` - After1980$`1980`)/After1980$`1980`,2)*100
After1980 <- After1980[order(-After1980$PerChange,-After1980$`2014`),]

cat("Name with largest percentage increase in popularity between 1980 and 2014 is",head(as.character(After1980$Name),1),"with change of",head(After1980$PerChange,1),"%")

## Name with largest percentage increase in popularity between 1980 and 2014 is Jayden with change of 1344000 %

cat("Name with largest percentage decrease in popularity between 1980 and 2014 is",tail(as.character(After1980$Name),1),"with change of",tail(After1980$PerChange,1),"%")

## Name with largest percentage decrease in popularity between 1980 and 2014 is Quiana with change of -100 %

rm(After1980)

Q6. Exploratory Analysis

(All analysis has been done at different levels of aggregation)

I start by looking at a trend between Male and Female birth rates year on year.

#Babies born YoY - split by Boys and Girls
dataYoY <- aggregate(dataset$occurrence, by = list(dataset$Gender,dataset$Year), FUN = sum)
colnames(dataYoY) <- c("Gender","Year","Count")

#Plotting
library(ggplot2)
ggplot(dataYoY, aes(x= dataYoY$Year, y= (dataYoY$Count/1000), group = dataYoY$Gender))+
  geom_line(aes(color = dataYoY$Gender))+
  ggtitle("YoY Male & Female Births ") +
  labs(x="Year",y="Birth Count (in thousands)")+
  theme(plot.title = element_text(face="bold", size=15)) +
  theme(axis.title = element_text(face="bold"))+
  theme(legend.title=element_blank(),legend.position="bottom")

Till 1930s, there were more female births than male. Trend changes thereafter, and we see the difference between male and female births widening

rm(dataYoY)

Looking at a distribution of distinct names by gender:

#Count of distinct names by genderYoY
dataGen <- aggregate(dataset$Name, by = list(dataset$Gender,dataset$Year), FUN = length)
colnames(dataGen) <- c("Gender","Year","Count")

#Plotting
ggplot(dataGen, aes(x= dataGen$Year, y= (dataGen$Count)/100, group = dataGen$Gender))+
  geom_line(aes(color = dataGen$Gender))+
  ggtitle("YoY Unique Male & Female Names ") +
  labs(x="Year",y="Name Count (in hundreds)")+
  theme(plot.title = element_text(face="bold", size=15)) +
  theme(axis.title = element_text(face="bold"))+
  theme(legend.title=element_blank(),legend.position="bottom")

There have always been more unique female names then male names and the difference increased between 1960 and 1980.

rm(dataGen)

I then tried exploring names which are popular across all states.

#Most common names across states
CommonGen <-  aggregate(dataset$occurrence, by = list(dataset$Gender,dataset$Name, dataset$State), FUN = sum)
colnames(CommonGen) <- c("Gender","Name","State","occurrence")
CommonGen <- tidyr::spread(CommonGen,State,occurrence, fill = NA)
CommonGen$Total <- rowSums(CommonGen[,-c(1:2)],na.rm = T )
CommonGen$NACounts <- rowSums(is.na(CommonGen[,-c(1:2)]))
CommonGen$NonNaCounts <- 51 - CommonGen$NACounts
CommonGen <- CommonGen[order(-CommonGen$NonNaCounts,-CommonGen$Total),]

#Shortlisting names which occurred in all states
UniversalNames <- subset(CommonGen,CommonGen$NonNaCounts == 51)

library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")

#The world cloud shows the most popular names common to all states
wordcloud(words = UniversalNames$Name, freq =UniversalNames$Total, min.freq = 1,
          max.words=150, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

cat("The most popular common names across all states are",head(as.character(UniversalNames$Name),10))

## The most popular common names across all states are James John Robert Michael William Mary David Richard Joseph Charles

I also looked at names which were unique to only one states across all years.

#Shortlisting names with one state only and getting the data in required format
SingleStateNames <- subset(CommonGen, CommonGen$NACounts == 50 & CommonGen$Name != "Notnamed")
SingleStateNames <- SingleStateNames[, !apply(is.na(SingleStateNames), 2, all)]

UniqueName <- as.data.frame(colSums(!is.na(SingleStateNames[,-c(1:2,51:53)])))
UniqueName$State <- rownames(UniqueName)
colnames(UniqueName) <- c("UniqueNames","State")

#Subsetting the data to only states with unique name count over 50 - to plot cleaner graph
UniqueName <- subset(UniqueName,UniqueName$UniqueNames >= 50)

#Plotting
ggplot(UniqueName, aes(x= reorder(State,-UniqueNames), y=UniqueNames)) +
  geom_bar(stat = "identity", fill= "dodgerblue2")+
  ggtitle("Name counts unique to only one state") +
  labs(x="State",y="Unique Names (#)")+
  theme(plot.title = element_text(face="bold", size=15)) +
  theme(axis.title = element_text(face="bold"))

California, New York and Texas have over 1000 names that are unique to only that state. A simple deep dive would be required to look into the unique names by state.

rm(list=setdiff(ls(), "dataset"))

I then tried verifying NEWS claims which associate baby name’s popularity with TV shows. I picked up the baby names form the following website: http://www.babycenter.com/0_baby-names-inspired-by-tv-show-characters_10309850.bc

I fed the data into an excel sheet, by referencing the show start year, and read it into R. The shows which are still on-going have been given an end date of 2016. I am not using the end year in my analysis.

The data looks as follows:

#Reading the TV show file
TV <- read.csv("https://raw.githubusercontent.com/Dhwani-Parekh/Assignments/master/ShowInspire.csv")
head(TV)

##   Gender    Name         TV.show YearStart YearEnd
## 1      M  Adrian            Monk      2002    2009
## 2      M   Angus        MacGyver      1985    1992
## 3      M     Axl      The Middle      2009    2016
## 4      M Bertram         Mad Men      2007    2015
## 5      M   Boone            Lost      2004    2010
## 6      M    Bran Game of Thrones      2011    2016

I then subset my main dataset with just the most popular names I’m trying to verify from NEWS articles. Since most of the TV shows started after 1990s, I subset the data to years after 1984.

trim <- function (x) gsub("^\\s+|\\s+$", "", x)

#Getting the data in required format
ShowNames1 <-  aggregate(dataset$occurrence, by = list(dataset$Gender,dataset$Name, dataset$Year), FUN = sum)
colnames(ShowNames1) <- c("Gender","Name","Year","occurrence")
ShowNames <- subset(ShowNames1, trim(ShowNames1$Name) %in% trim(TV$Name) & ShowNames1$Year >= 1984)
ShowNames <- ShowNames[order(ShowNames$Name,ShowNames$Year),]

ShowNames1 <- merge(ShowNames,TV, by="Name")
ShowNames1$line <- ifelse(ShowNames1$Year == ShowNames1$YearStart,paste(as.character(ShowNames1$Name),"-",as.character(ShowNames1$TV.show)),"")

Trend plot for popular girl names:

#Subsetting only Female names
GShowNames <- subset(ShowNames1, ShowNames1$Gender.x == "F" & ShowNames1$Gender.y == "F")

#Plotting
ggplot(GShowNames,aes(x= GShowNames$Year, y=GShowNames$occurrence, group = GShowNames$Name, label = GShowNames$line))+
  geom_line(aes(colour=(GShowNames$YearStart >= GShowNames$Year)))+
  geom_text()+
  ggtitle("Popular Girl Names because of TV shows?") +
  labs(x="Year",y="occurrences(#)")+
  theme(plot.title = element_text(face="bold", size=15)) +
  theme(axis.title = element_text(face="bold"))+
  theme(legend.title=element_blank(),legend.position="bottom")+
  scale_color_manual("Legend Title\n",labels = c("After Show Start","Before Show Start"), values = c("red", "dodgerblue"))

Names such as “Olivia” had been on an increasing trend and “Scandal” may or may not have been the reason for its rise after the show started. “Grace” was on a decline and its possible that “The Good Wife” stopped its decline further, while “Hannah” was on a declining trend and continued so even after “Girls” started.

All three names mentioned above were widely popular and the show could have adopted a popular name instead of pushing a name to popularity. Thus, we cannot come to a concrete conclusion from this plot.

I then tried looking at not-so-popular names, which could have possibly gained popularity.

#Subsetting female names whose occurrence is less than 5000
GShowNamesL <- subset(ShowNames1, ShowNames1$Gender.x == "F" & ShowNames1$Gender.y == "F" & ShowNames1$occurrence < 5000)

#Plotting
ggplot(GShowNamesL,aes(x= GShowNamesL$Year, y=GShowNamesL$occurrence, group = GShowNamesL$Name, label = GShowNamesL$line))+
  geom_line(aes(colour=(GShowNamesL$YearStart >= GShowNamesL$Year)))+
  geom_text(vjust = "center", hjust = "center")+
  ggtitle("Popular Girl Names because of TV shows?") +
  labs(x="Year",y="occurrences(#)")+
  theme(plot.title = element_text(face="bold", size=15)) +
  theme(axis.title = element_text(face="bold"))+
  theme(legend.title=element_blank(),legend.position="bottom")+
  scale_color_manual("Legend Title\n",labels = c("After Show Start","Before Show Start"), values = c("red", "dodgerblue"))

This plot gives us more insights than the previous one. TV shows have definitely boosted names like Peyton, Arya, Nadia, Phoebe, Maggie, but on contrary names like Gabrielle and Carrie started showing a decline on being associated with a show.

This tells us that audience do not prefer picking up names from adult rated shows like “Sex and the City” and “Desperate Housewives”.

Similarly I saw a trend plot for popular boy names:

#Shortlisting Boy names only
BShowNames <- subset(ShowNames1, ShowNames1$Gender.x == "M" & ShowNames1$Gender.y == "M")

#Plotting
ggplot(BShowNames,aes(x= BShowNames$Year, y=BShowNames$occurrence, group = BShowNames$Name, label = BShowNames$line))+
  geom_line(aes(colour=(BShowNames$YearStart >= BShowNames$Year)))+
  geom_text()+
  geom_text(vjust = "center", hjust = "center")+
  ggtitle("Popular Boy Names because of TV shows?") +
  labs(x="Year",y="occurrences(#)")+
  theme(plot.title = element_text(face="bold", size=15)) +
  theme(axis.title = element_text(face="bold"))+
  theme(legend.title=element_blank(),legend.position="bottom")+
  scale_color_manual("Legend Title\n",labels = c("After Show Start","Before Show Start"), values = c("red", "dodgerblue"))

Sawyer, Ezra, Oliver are a few names which have gained popularity because of a TV show association. Calen and Shane on the other hand have shown a decline while we cannot attribute rise in Adrian to “Monk” since the name was on an increasing trend for a while.

#Subsetting male names whose occurrence is less than 3000
BShowNamesL <- subset(ShowNames1, ShowNames1$Gender.x == "M" & ShowNames1$Gender.y == "M" & ShowNames1$occurrence < 3000)

ggplot(BShowNamesL,aes(x= BShowNamesL$Year, y=BShowNamesL$occurrence, group = BShowNamesL$Name, label = BShowNamesL$line))+
  geom_line(aes(colour=(BShowNamesL$YearStart >= BShowNamesL$Year)))+
  geom_text(vjust = "center", hjust = "center")+
  ggtitle("Popular Boy Names because of TV shows?") +
  labs(x="Year",y="occurrences(#)")+
  theme(plot.title = element_text(face="bold", size=15)) +
  theme(axis.title = element_text(face="bold"))+
  theme(legend.title=element_blank(),legend.position="bottom")+
  scale_color_manual("Legend Title\n",labels = c("After Show Start","Before Show Start"), values = c("red", "dodgerblue"))

Saul and Chandler seem to have gained most popularity because of the show association.

Finally, I looked at the most popular baby names in 2014 by each state split by gender.

rm(list=setdiff(ls(), "dataset"))

#Subsetting data for 2014
data2014 <- subset(dataset,dataset$Year == 2014)

data2014 <- aggregate(data2014$occurrence, by = list(data2014$Name,data2014$State,data2014$Gender), FUN = sum)
colnames(data2014) <- c("Name","State","Gender","occurrence")

library(sqldf)

StatePop <- sqldf("select a.State,a.Gender,b.Name,a.maxpop
                  from

                    (select State,Gender,max(occurrence) as maxpop
                    from data2014
                    group by State,Gender)a
                    inner join
                    (select State,Name,Gender,occurrence
                    from data2014)b
                    on a.maxpop = b.occurrence
                      and a.State = b.State
                      and a.Gender = b.Gender")

colnames(StatePop) <- c("State","Gender","Name","occurrence")

StatePopF <- subset(StatePop, StatePop$Gender == "F")
StatePopM <- subset(StatePop, StatePop$Gender == "M")

library(ggplot2)
library(maps)

#load us map data
all_states <- map_data("state")
Mapper <- read.csv("https://raw.githubusercontent.com/Dhwani-Parekh/Assignments/master/StateMapper.csv")
all_states <- merge(all_states,Mapper, by = "region")
StatePopF <- merge(all_states,StatePopF,by = "State")
StatePopM <- merge(all_states,StatePopM,by = "State")

The most popular girl name for each state is:

gnames <- aggregate(cbind(long, lat) ~ region + Name, data=StatePopF, 
                    FUN=function(x)mean(range(x)))

#plot all states with ggplot
ggplot()+
  geom_polygon( data=all_states, aes(x=long, y=lat, group = group),colour="deeppink2", fill="plum2" )+
  geom_text(data = gnames, aes(x=long, y=lat, label = Name), size = 4)+
  coord_map()+
  ggtitle("Most Popular Girl Names by State in 2014")+
  theme(plot.title = element_text(face="bold", size=15))

The most popular boy name for each state is:

bnames <- aggregate(cbind(long, lat) ~ region + Name, data=StatePopM, 
                    FUN=function(x)mean(range(x)))
ggplot()+
  geom_polygon( data=all_states, aes(x=long, y=lat, group = group),colour="dodgerblue", fill="lightblue" )+
  geom_text(data = bnames, aes(x=long, y=lat, label = Name), size = 4)+
  coord_map()+
  ggtitle("Most Popular Boy Names by State in 2014")+
  theme(plot.title = element_text(face="bold", size=15))

rm(list=setdiff(ls(), "dataset"))