Problem 1:

Our team interpreted this question to mean the following: we aggregated the total frequency of male and female baby names across all years (no repeats), and output both with their respective frequencies.

The most popular male baby name is [James] and the count is [5164280]. The most popular female baby name is [Mary] and the count is [4125675].

# Problem 1 data load: read every yearly file listed in `file_names` and stack
# them into `mydf` with the columns Name, Sex, Count and year.
setwd("~/assgn1/temp/test")

# Read each file exactly once and collect the pieces in a list. Binding them in
# a single call afterwards avoids re-copying the growing table on every pass.
yearly_frames <- lapply(file_names, function(fn) {
  df <- read.csv(fn, header = FALSE)
  colnames(df) <- c("Name", "Sex", "Count")
  # Characters 4-7 of a name such as "yob1950.txt" hold the year; basename()
  # keeps that true when `fn` carries a directory prefix.
  df$year <- rep(strtoi(substr(basename(fn), 4, 7), base = 10L), nrow(df))
  df
})

if (length(yearly_frames) > 0) {
  mydf <- do.call(rbind, yearly_frames)
  # Write the combined table once, after all files are in, instead of
  # rewriting the whole CSV for every input file.
  write.csv(mydf, file = "~/assgn1/temp/mybabydata.csv")
} else {
  # No input files: keep an empty frame whose columns match the real data so
  # later code that refers to Name/Sex/Count still finds them.
  mydf <- data.frame(
    Name = character(0),
    Sex = character(0),
    Count = integer(0),
    year = integer(0)
  )
}

# Split the combined table by sex. which() skips rows where the comparison is
# NA, so the selection is the same one subset() would make.
male <- mydf[which(mydf$Sex == "M"), , drop = FALSE]
female <- mydf[which(mydf$Sex == "F"), , drop = FALSE]

# Print and return the most frequent names in a baby-name table.
#
# Args:
#   dataframe:  data frame with a `Name` column and a numeric `Count` column.
#   iterations: how many of the top names to report. Values larger than the
#               number of distinct names are clamped; 0 reports nothing.
#
# Returns:
#   Invisibly, a character vector of "<name> <total count>" strings ordered
#   from the highest total down. Each string is also printed, one per line,
#   so the function can be called for its output alone or embedded in
#   sprintf()/paste().
lookupTop <- function(dataframe, iterations) {
  # Sum only the Count column per name. as.character() drops unused factor
  # levels, so names absent from this subset do not show up as NA totals.
  totals <- tapply(dataframe$Count, as.character(dataframe$Name), sum)
  ranking <- order(totals, decreasing = TRUE)

  # Clamp the request to what is available; seq_len() copes with zero.
  n_show <- max(0L, min(iterations, length(ranking)))
  keep <- ranking[seq_len(n_show)]

  top_lines <- paste(names(totals)[keep], as.vector(totals)[keep])
  for (line in top_lines) {
    print(line)
  }
  invisible(top_lines)
}

# lookupTop() prints its result itself, so emit the label on its own line
# instead of formatting the function's return value with sprintf().
cat("The most popular baby name and its frequency are:\n")
lookupTop(male, 1)
## The most popular baby name and its frequency are:
## [1] "James 5164280"

The most popular male baby name is [James] and the count is [5164280].

# lookupTop() prints its result itself, so emit the label on its own line
# instead of formatting the function's return value with sprintf().
cat("The most popular baby name and its frequency are:\n")
lookupTop(female, 1)
## The most popular baby name and its frequency are:
## [1] "Mary 4125675"

The most popular female baby name is [Mary] and the count is [4125675].

Problem 2:

The top five baby names for males for the year 1950 are shown in the plot below.

# Problem 2: bar chart of the five highest-count male names in the 1950 file.
setwd("~/assgn1/temp/test")
file <- read.csv("yob1950.txt", header = FALSE)

# Keep the rows whose second column is "M"; which() ignores NA comparisons.
m_1950 <- file[which(file$V2 == "M"), , drop = FALSE]

# Rank by the third column (plotted as Count), highest first, and keep five.
m_1950_ranked <- m_1950[order(m_1950$V3, decreasing = TRUE), ]
m_1950_Top5 <- head(m_1950_ranked, n = 5)

# Start the y-axis at 0 and end it on a "pretty" tick above the tallest bar.
m_1950_ylim <- range(pretty(c(0, m_1950_Top5$V3)))

barplot(
  height = m_1950_Top5$V3,
  names.arg = m_1950_Top5$V1,
  main = "Top 5 male names in 1950",
  ylab = "Count",
  ylim = m_1950_ylim,
  col = "royalblue3"
)

Problem 3:

The top five baby names for females for the year 1980 are shown in the plot below.

# Problem 3: bar chart of the five highest-count female names in the 1980 file.
# NOTE(review): the f_1950* objects below hold 1980 data; the names are left
# unchanged in case other code refers to them -- consider renaming to f_1980*.
setwd("~/assgn1/temp/test")
file <- read.csv("yob1980.txt", header = FALSE)

# Keep the rows whose second column is "F"; which() ignores NA comparisons.
f_1950 <- file[which(file$V2 == "F"), , drop = FALSE]

# Rank by the third column (plotted as Count), highest first, and keep five.
f_1950_ranked <- f_1950[order(f_1950$V3, decreasing = TRUE), ]
f_1950_Top5 <- head(f_1950_ranked, n = 5)

# Start the y-axis at 0 and end it on a "pretty" tick above the tallest bar.
f_1950_ylim <- range(pretty(c(0, f_1950_Top5$V3)))

barplot(
  height = f_1950_Top5$V3,
  names.arg = f_1950_Top5$V1,
  main = "Top 5 female names in 1980",
  ylab = "Count",
  ylim = f_1950_ylim,
  col = "indianred1"
)

Problem 4:

##         name       x
## 914    James 5187679
## 956     John 5146508
## 1575  Robert 4840228
## 1308 Michael 4384463
## 1259    Mary 4140840
## 1843 William 4133327
## 418    David 3634229
## 967   Joseph 2623958
## 1568 Richard 2574832
## 315  Charles 2405197
## "","x"
## "1","only_10.csv"
## [1] "The file with the top 10 baby names ever has been saved as a csv file named mostpop.csv in the current folder."