You will acquire and analyze a real dataset on baby name popularity provided by the Social Security Administration. To warm up, we will ask you a few simple questions that can be answered by inspecting the data.
The data can be downloaded in zip format from: http://www.ssa.gov/oact/babynames/state/namesbystate.zip (~22MB)
The data are seperated in multiple files by states. I loop through the folder by looking for files that end with .txt and append then into a single dataframe.
setwd("~/Documents/scu/Winter 2017/Machine Learning/Homework/namesbystate")
df<-NULL
tem<-list.files()
for (i in tem){
if(grepl('.TXT',i)){
#print (i)
temp1 <- read.table(i, sep = ",")
df<-rbind(df,temp1)
}}
col<-c("state","gender","year","name","occurences")
colnames(df)<-col
write.csv(df,"allstate.csv")
df<-read.csv("allstate.csv")
James and Mary are the most popular name for male and female respectively.
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df %>%
group_by(gender,name) %>%
summarize(sum_occurences = sum(occurences))%>%
arrange(desc(sum_occurences),gender)
## Source: local data frame [33,613 x 3]
## Groups: gender [2]
##
## gender name sum_occurences
## <fctr> <fctr> <int>
## 1 M James 4954037
## 2 M John 4840467
## 3 M Robert 4716978
## 4 M Michael 4310511
## 5 M William 3845199
## 6 F Mary 3733620
## 7 M David 3566154
## 8 M Richard 2532760
## 9 M Joseph 2491293
## 10 M Charles 2251908
## # ... with 33,603 more rows
I came up with a ambiguity metric. The most gender ambiguous name is defined as a name with the highest occurences (and most popular) between male and female. This ambiguity metric is made up with two factors. The first factor is the name occurence ratio between male and female. The higher the ratio is, the more ambiguous the name is. The second factor is to capture the popularity of the name. The multiplication of two factor will be the ambiguity metric (AM), the higher the AM, the more ambiguous the name is.
Based on this definition- The most gender ambiguous name is Charlie.
library(data.table)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
ambiguous_func<-function(x)
{
DT<-data.table(df)
DT_2013=DT[year == x]
DT_2013=DT_2013[,sum(occurences),by=.(name,gender)]
temp1 <- DT_2013 %>% group_by(name) %>% filter(n()>1) %>% arrange(desc(V1))
temp1<-data.table(temp1)
ratio <- function(x) (x/lag(x))
temp2<-temp1%>% group_by(name)%>%mutate_each(funs(min_ratio=ratio),V1)
temp2=na.omit(temp2)
temp2$V1<-NULL
temp2$gender<-NULL
temp1=temp1[,sum(V1),by=.(name)]
colnames(temp1)<-c("name","total_occurences")
temp<-merge(temp1,temp2,by="name")
temp$ambiguity<-temp$total_occurences*temp$min_ratio
temp<-data.table(temp)
temp<-temp[min_ratio>=0.75]
result<-( temp[which.max(temp$ambiguity),])
print (result)
}
ambiguous_func(2013)
## name total_occurences min_ratio ambiguity
## 1: Charlie 2844 0.8479532 2411.579
ambiguous_func(1945)
## name total_occurences min_ratio ambiguity
## 1: Leslie 3654 0.8491903 3102.941
To find out the popular name between 1980 and 2015. I calculate the occurence delta between two time period. pete Aria has the largest percentage increase by 127440% and Jill has the largest percentage decrease by -99.8%.
DT<-data.table(df)
df_q4=DT[ year == 1980 | year ==2015]
df_q4=df_q4[,sum(occurences),by=.(name,year)]
pct <- function(x) ((x-lag(x))/lag(x))
df_q4a<-df_q4 %>%group_by(name) %>%arrange(name,year)%>%mutate_each(funs(pct_change=pct),V1)
df_q4a[which.min(df_q4a$pct_change),]
## Source: local data frame [1 x 4]
## Groups: name [1]
##
## name year V1 pct_change
## <fctr> <int> <int> <dbl>
## 1 Jill 2015 7 -0.9984626
df_q4a[which.max(df_q4a$pct_change),]
## Source: local data frame [1 x 4]
## Groups: name [1]
##
## name year V1 pct_change
## <fctr> <int> <int> <dbl>
## 1 Aria 2015 6377 1274.4
To identify this, I consider every year as the start year and find the greatest increase/descrease across all years. Print out the top name for growth from each year.The names that have the largest increase and decrease in popularity are Liam by 365520% in 1954 and Debbie -99.97% in 1959.
DT_grouped<-df %>%
group_by(name,year) %>%
summarize(sum_occurences = sum(occurences))
DT_grouped<-data.table(DT_grouped)
df_q5=DT_grouped[year !=2015]
df_q5a=DT_grouped[year ==2015]
df_q5a$year<-NULL
colnames(df_q5a)<-c("name","2015occurences")
dftemp<-right_join(df_q5a,df_q5,by="name")
dftemp$pct_change<-(dftemp$`2015occurences`-dftemp$sum_occurences)/dftemp$sum_occurences
dftemp[which.min(dftemp$pct_change),]
## name 2015occurences year sum_occurences pct_change
## 128332 Debbie 6 1959 19541 -0.999693
dftemp[which.max(dftemp$pct_change),]
## name 2015occurences year sum_occurences pct_change
## 323223 Liam 18281 1954 5 3655.2
This gives interesting results, and may be used in a different way with a rolling window than using all the data.
I get the top 5 names that have the largest percentage increase in popularity since 1980. It is surprising to see that two of the names have existed since 1920, the other three first appear in 1970.
df_q4a<-df_q4a %>%arrange(desc(pct_change))
#df_q4a[1:10,1]
library(ggplot2)
Aria<-DT_grouped[name=="Aria"]
Colton<-DT_grouped[name=="Colton"]
Skylar<-DT_grouped[name=="Skylar"]
Mateo<-DT_grouped[name=="Mateo"]
Mila<-DT_grouped[name=="Mila"]
ggplot() +
geom_line(data=Aria, aes(x=year,y=sum_occurences,color="Aria")) +
geom_line(data=Colton, aes(x=year,y=sum_occurences,color="Colton"))+
geom_line(data=Skylar, aes(x=year,y=sum_occurences, color='Skylar'))+
geom_line(data=Mateo, aes(x=year,y=sum_occurences, color='Mateo')) +
geom_line(data=Mila, aes(x=year,y=sum_occurences, color='Mila'))+
scale_colour_manual(name="Name",values=c(Aria="red", Colton="blue", Skylar="purple", Mateo="orange",Mila="yellow"))