Our team interpreted this question to mean the following: we aggregated the total frequency of each male and female baby name across all years (no repeats), and output the most popular name of each sex together with its frequency.
The most popular male baby name is [James] and the count is [5164280]. The most popular female baby name is [Mary] and the count is [4125675].
# Combine the per-year baby-name files into one data frame, `mydf`, with the
# columns Name, Sex, Count and year, save it as a CSV, and split it by sex.
# `file_names` is expected to be defined before this point.
# NOTE(review): setwd() changes the working directory for the rest of the
# session; it is kept because the files are read relative to this folder.
setwd("~/assgn1/temp/test")

# Read every file into its own data frame first and combine them once at the
# end; growing `mydf` with rbind() inside a loop copies it on every iteration.
yearly_frames <- lapply(file_names, function(fn) {
  df <- read.csv(fn, header = FALSE)
  colnames(df) <- c("Name", "Sex", "Count")
  # Characters 4-7 of the file name hold the year (e.g. "yob1950.txt").
  # basename() keeps this working if `fn` carries a directory prefix.
  iyear <- strtoi(substr(basename(fn), 4, 7))
  # rep() rather than a scalar so that a file with zero rows is still valid.
  df$year <- rep(iyear, nrow(df))
  df
})

if (length(yearly_frames) > 0) {
  mydf <- do.call(rbind, yearly_frames)
  # Write the combined data once, instead of rewriting the growing file
  # after every input file.
  write.csv(mydf, file = "~/assgn1/temp/mybabydata.csv")
} else {
  # No input files: keep an empty frame with the same columns as real data.
  mydf <- data.frame(
    Name = character(0),
    Sex = character(0),
    Count = integer(0),
    year = integer(0)
  )
}

male <- subset(mydf, mydf$Sex == "M")
female <- subset(mydf, mydf$Sex == "F")
# Report the most frequent names in a baby-name data frame.
#
# Args:
#   dataframe:  data frame with a `Name` column and a numeric `Count` column;
#               counts are summed per name over all rows.
#   iterations: number of top names to report.
#
# Returns (invisibly) a character vector of "<name> <total count>" strings,
# ordered from most to least frequent. Each string is also printed, one per
# line. Returning the strings lets callers embed them in other text, e.g.
# sprintf("...%s", lookupTop(df, 1)).
lookupTop <- function(dataframe, iterations) {
  # Nothing to report for empty data or a non-positive request.
  if (nrow(dataframe) == 0 || iterations < 1) {
    return(invisible(character(0)))
  }

  # Total count per name. as.character() ignores unused factor levels, so
  # names that do not occur in `dataframe` are not reported.
  totals <- tapply(dataframe$Count, as.character(dataframe$Name), sum)

  # Largest totals first; head() caps the request at the number of names
  # available, so no NA entries are produced.
  top <- head(order(totals, decreasing = TRUE), iterations)
  entries <- paste(names(totals)[top], as.vector(totals)[top])

  for (entry in entries) {
    print(entry)
  }
  invisible(entries)
}
# Print the summary sentence for the most popular male name; the value
# returned by lookupTop(male, 1) is substituted for %s.
print(
  sprintf(
    fmt = "The most popular baby name and its frequency are: %s",
    lookupTop(male, 1)
  )
)
## [1] "James 5164280"
## character(0)
The most popular male baby name is [James] and the count is [5164280].
# Print the summary sentence for the most popular female name; the value
# returned by lookupTop(female, 1) is substituted for %s.
print(
  sprintf(
    fmt = "The most popular baby name and its frequency are: %s",
    lookupTop(female, 1)
  )
)
## [1] "Mary 4125675"
## character(0)
The most popular female baby name is [Mary] and the count is [4125675].
The top five baby names for males for the year 1950 are shown in the plot below.
# Bar plot of the five most frequent male names in the 1950 file.
# NOTE(review): setwd() changes the working directory for the rest of the
# session; kept unchanged because other code may rely on it.
setwd("~/assgn1/temp/test")

# The file is read without a header row, so read.csv names the columns
# V1, V2 and V3; below, V1 is used as the name, V2 as the sex and V3 as the
# count.
file <- read.csv("yob1950.txt", header = FALSE)
m_1950 <- subset(file, file$V2 == "M")

# Sort by count, largest first, and keep the first five rows.
m_1950_Top5 <- head(m_1950[order(m_1950$V3, decreasing = TRUE), ], 5)

barplot(
  m_1950_Top5$V3,
  # Spelled out in full: `names =` only worked through partial matching.
  names.arg = m_1950_Top5$V1,
  main = "Top 5 male names in 1950",
  ylab = "Count",
  # Start the y-axis at 0 and end it on a "pretty" tick at or above the
  # tallest bar.
  ylim = range(pretty(c(0, m_1950_Top5$V3))),
  col = "royalblue3"
)
The top five baby names for females for the year 1980 are shown in the plot below.
# Bar plot of the five most frequent female names in the 1980 file.
# NOTE(review): the f_1950 / f_1950_Top5 variables hold 1980 data despite
# their names; the names are kept unchanged in case other code refers to
# them.
# NOTE(review): setwd() changes the working directory for the rest of the
# session; kept unchanged because other code may rely on it.
setwd("~/assgn1/temp/test")

# The file is read without a header row, so read.csv names the columns
# V1, V2 and V3; below, V1 is used as the name, V2 as the sex and V3 as the
# count.
file <- read.csv("yob1980.txt", header = FALSE)
f_1950 <- subset(file, file$V2 == "F")

# Sort by count, largest first, and keep the first five rows.
f_1950_Top5 <- head(f_1950[order(f_1950$V3, decreasing = TRUE), ], 5)

barplot(
  f_1950_Top5$V3,
  # Spelled out in full: `names =` only worked through partial matching.
  names.arg = f_1950_Top5$V1,
  main = "Top 5 female names in 1980",
  ylab = "Count",
  # Start the y-axis at 0 and end it on a "pretty" tick at or above the
  # tallest bar.
  ylim = range(pretty(c(0, f_1950_Top5$V3))),
  col = "indianred1"
)
## name x
## 914 James 5187679
## 956 John 5146508
## 1575 Robert 4840228
## 1308 Michael 4384463
## 1259 Mary 4140840
## 1843 William 4133327
## 418 David 3634229
## 967 Joseph 2623958
## 1568 Richard 2574832
## 315 Charles 2405197
## "","x"
## "1","only_10.csv"
## [1] "The file with the top 10 baby names ever has been saved as a csv file named mostpop.csv in the current folder."