HW5

1

# Load the baby-names data, keep the distinct names, and split the rows by sex.
# read.csv() already returns a data frame, so no extra wrapping is needed.
bnames <- read.csv("/Users/iris/Documents/597A/HW5/bnames.csv")
name <- unique(bnames[["name"]])
boy <- subset(bnames, sex == "boy")
girl <- subset(bnames, sex == "girl")

# Total popularity of every girl name: sum `percent` over all rows per name,
# then rank the names from most to least popular.
girl.totals <- tapply(girl$percent, girl$name, sum)
# tapply() returns NA for grouping levels that have no rows (this happens when
# `name` is a factor that still carries unused levels), so drop those entries.
girl.totals <- girl.totals[!is.na(girl.totals)]
# Build the table directly with explicit column names; this also works when no
# names are left, where `1:nrow()` would produce c(1, 0) and fail.
girl.table <- data.frame(
  Percent = as.vector(girl.totals),
  Name = names(girl.totals),
  stringsAsFactors = FALSE
)
girl.table <- girl.table[order(girl.table$Percent, decreasing = TRUE), ]
nrow(girl.table) #Size of girl.table is 4018 rows.

## [1] 4018

# head() shows at most 10 rows and never pads with NA rows.
head(girl.table, 10)

##       Percent      Name
## 2730 4.511860      Mary
## 1196 1.392100 Elizabeth
## 2642 1.360965  Margaret
## 1582 1.234222     Helen
## 246  1.195867      Anna
## 1098 1.065111   Dorothy
## 398  1.001579   Barbara
## 3128 0.999798  Patricia
## 3338 0.942272      Ruth
## 2392 0.837364     Linda

# Total popularity of every boy name: sum `percent` over all rows per name,
# then rank the names from most to least popular.
boy.totals <- tapply(boy$percent, boy$name, sum)
# tapply() returns NA for grouping levels that have no rows (this happens when
# `name` is a factor that still carries unused levels), so drop those entries.
boy.totals <- boy.totals[!is.na(boy.totals)]
# Build the table directly with explicit column names; this also works when no
# names are left, where `1:nrow()` would produce c(1, 0) and fail.
boy.table <- data.frame(
  Percent = as.vector(boy.totals),
  Name = names(boy.totals),
  stringsAsFactors = FALSE
)
boy.table <- boy.table[order(boy.table$Percent, decreasing = TRUE), ]
nrow(boy.table) #Size of boy.table is 3437 rows.

## [1] 3437

# head() shows at most 10 rows and never pads with NA rows.
head(boy.table, 10)

##       Percent    Name
## 1843 5.299585    John
## 1707 4.574991   James
## 3358 4.409453 William
## 2825 3.821662  Robert
## 590  2.518147 Charles
## 2415 2.366102 Michael
## 1872 2.292487  Joseph
## 821  2.159018   David
## 1365 2.096747  George
## 3133 1.901267  Thomas

2

library(plyr)
# Boy rows for the year 1880 (not referenced again in the visible code).
x <- subset(boy, year == 1880)
# Return the most popular names in a data frame, ranked by `percent`.
#
# Args:
#   a: data frame with a numeric `percent` column and a `name` column.
#   n: how many names to return (default 5, the original behaviour).
#
# Returns:
#   A 1 x n character matrix, most popular name first. When `a` has fewer
#   than n rows the remaining cells are NA, so every call yields the same
#   number of columns and the results can be stacked row by row.
top.5 <- function(a, n = 5) {
  ranked <- a[order(a$percent, decreasing = TRUE), , drop = FALSE]
  # Select the column by name rather than by position, so the function does
  # not depend on `name` being the second column of the input.
  top_names <- as.character(ranked[["name"]])[seq_len(n)]
  matrix(top_names, nrow = 1)
}
# Split bnames by (year, sex), apply top.5 to every piece and stack the
# results into one data frame with one row per year/sex combination.
# NOTE: this variable shares its name with the base function table().
table <- ddply(.data = bnames, .variables = .(year, sex), .fun = top.5)
nrow(table)

## [1] 258

table[1:5, ]

##   year  sex    1       2     3         4        5
## 1 1880  boy John William James   Charles   George
## 2 1880 girl Mary    Anna  Emma Elizabeth   Minnie
## 3 1881  boy John William James    George  Charles
## 4 1881 girl Mary    Anna  Emma Elizabeth Margaret
## 5 1882  boy John William James    George  Charles

# Show the last five rows without hard-coding the row count of the table.
tail(table, 5)

##     year  sex     1        2       3        4       5
## 254 2006 girl Emily     Emma Madison Isabella     Ava
## 255 2007  boy Jacob  Michael   Ethan   Joshua  Daniel
## 256 2007 girl Emily Isabella    Emma      Ava Madison
## 257 2008  boy Jacob  Michael   Ethan   Joshua  Daniel
## 258 2008 girl  Emma Isabella   Emily  Madison     Ava

3

# Read the baby-names CSV file into a data frame called `data`.
data <- read.csv(file = "/Users/iris/Documents/597A/HW5/bnames.csv")
# Fit a straight line of `percent` against `year` for one group of rows.
#
# Args:
#   temp: data frame with `percent` and `year` columns.
#
# Returns:
#   A one-row data frame with columns
#     int   - the fitted intercept (first model coefficient),
#     slope - the fitted coefficient of `year` (second model coefficient),
#     n     - the number of rows in `temp`.
#
# NOTE: this function has the same name as stats::lm.fit().
lm.fit <- function(temp) {
  # `percent ~ year` is the model formula: percent is regressed on year.
  model <- lm(percent ~ year, data = temp)
  estimates <- coef(model)
  data.frame(int = estimates[1], slope = estimates[2], n = nrow(temp))
}
# Apply lm.fit to every (name, sex) group of `data`; the result has one row
# per group with the columns returned by lm.fit (int, slope, n).
inc.dec <- ddply(data, .(name, sex), lm.fit)
# Keep only the groups that have more than 100 rows.
inc.dec <- subset(inc.dec, n > 100)
# Of those, keep only the groups whose slope lies above the 99th percentile
# or below the 1st percentile of the remaining slopes. The `probs` argument is
# spelled out and TRUE is used instead of the reassignable shortcut T.
inc.dec <- subset(
  inc.dec,
  slope > quantile(slope, probs = 0.99, na.rm = TRUE) |
    slope < quantile(slope, probs = 0.01, na.rm = TRUE)
)

4

library(foreach)
# Attach the yearly observations from bnames to the selected rows of inc.dec;
# merge() joins on the columns the two data frames have in common.
new <- merge(inc.dec, bnames)
# Draw one scatter plot of percent against year per selected name, with that
# name's own fitted regression line on top.
graph <- foreach(i = unique(new$name)) %do% {
  name.rows <- new[which(new$name == i), ]
  # Fit the line on this name's rows only. Fitting on all of `new` would draw
  # the same pooled line on every plot.
  # NOTE(review): rows are selected by name alone; if a name was selected for
  # both sexes, both sets of rows end up in the same plot and fit -- confirm.
  fit <- lm(percent ~ year, data = name.rows)
  plot(name.rows$year, name.rows$percent, main = i, xlab = "Year", ylab = "Percent")
  abline(a = coef(fit)[1], b = coef(fit)[2])
  #par(mfrow = c(4, 4))
}

5

library(ggplot2)
#boy <- boy[order(boy$percent, decreasing = TRUE),][1:100,]
#girl <- girl[order(girl$percent, decreasing = TRUE),][1:100,]
# For every distinct year, add up the `percent` values of all boy rows of that
# year; the result is a list with one total per year.
boy.percent <- foreach(i = unique(boy$year)) %do% {
  sum(boy$percent[which(boy$year == i)])
}
# Same computation for the girl rows.
girl.percent <- foreach(i = unique(girl$year)) %do% {
  sum(girl$percent[which(girl$year == i)])
}

library(ggplot2)
library(plyr)
# Count how many rows each name has in bnames (columns Var1 and Freq).
times <- data.frame(table(bnames$name))
# Row order from most to least frequent; keep the first 100 entries.
by.freq <- order(times$Freq, decreasing = TRUE)
top_name <- times[by.freq[1:100], ]
#top_name_data <- merge(top_name, bnames)

Here I tried to merge the two data sets, but I ran into problems and RStudio crashed. The idea is to compare the frequency of the top 100 names with the overall frequency and to plot the comparison with ggplot.

HW5

Yuhan Duan

3/26/2017

1

2

3

4

5