# Load the baby-names data set.
# NOTE(review): the path is absolute and machine-specific.
bnames <- read.csv(file = "/Users/iris/Documents/597A/HW5/bnames.csv")

# Every distinct name that occurs anywhere in the data.
name <- unique(bnames$name)

# Split the records by sex; the original row names are retained.
boy <- subset(bnames, sex == "boy")
girl <- subset(bnames, sex == "girl")
# Total `percent` per girl name, summed over all rows of `girl`.
# na.omit() drops names whose total is NA (e.g. unused factor levels).
girl.table <- na.omit(data.frame(tapply(girl$percent, girl$name, sum)))
# Move the names out of the row names into a regular column.
girl.table$Name <- rownames(girl.table)
# seq_len() is safe for an empty table, where 1:nrow() would yield c(1, 0).
row.names(girl.table) <- seq_len(nrow(girl.table))
names(girl.table) <- c("Percent", "Name")
# Largest total first.
girl.table <- girl.table[order(girl.table$Percent, decreasing = TRUE), ]
nrow(girl.table) #Size of girl.table is 4018 rows.
## [1] 4018
# head() shows at most 10 rows and never pads with NA rows.
head(girl.table, 10)
## Percent Name
## 2730 4.511860 Mary
## 1196 1.392100 Elizabeth
## 2642 1.360965 Margaret
## 1582 1.234222 Helen
## 246 1.195867 Anna
## 1098 1.065111 Dorothy
## 398 1.001579 Barbara
## 3128 0.999798 Patricia
## 3338 0.942272 Ruth
## 2392 0.837364 Linda
# Total `percent` per boy name, summed over all rows of `boy`.
# na.omit() drops names whose total is NA (e.g. unused factor levels).
boy.table <- na.omit(data.frame(tapply(boy$percent, boy$name, sum)))
# Move the names out of the row names into a regular column.
boy.table$Name <- rownames(boy.table)
# seq_len() is safe for an empty table, where 1:nrow() would yield c(1, 0).
row.names(boy.table) <- seq_len(nrow(boy.table))
names(boy.table) <- c("Percent", "Name")
# Largest total first.
boy.table <- boy.table[order(boy.table$Percent, decreasing = TRUE), ]
nrow(boy.table) #Size of boy.table is 3437 rows.
## [1] 3437
# head() shows at most 10 rows and never pads with NA rows.
head(boy.table, 10)
## Percent Name
## 1843 5.299585 John
## 1707 4.574991 James
## 3358 4.409453 William
## 2825 3.821662 Robert
## 590 2.518147 Charles
## 2415 2.366102 Michael
## 1872 2.292487 Joseph
## 821 2.159018 David
## 1365 2.096747 George
## 3133 1.901267 Thomas
library(plyr)
# Boy records for the year 1880 only.
x <- subset(boy, year == 1880)
# Return the five entries of column 2 of `a` that have the largest `percent`.
#
# a: data frame with a `percent` column; column 2 holds the value to report
#    (presumably the name -- confirm against the data layout).
# Returns a 1 x 5 matrix, highest `percent` first. Positions beyond the
# number of rows in `a` are NA.
top.5 <- function(a) {
  ranking <- order(a$percent, decreasing = TRUE)
  a <- a[ranking, ]
  # The expression `a[1:5, 2]` is kept as written: as.data.frame() uses it
  # to label the column, which becomes the row name of the transposed result.
  t(as.data.frame(a[1:5, 2]))
}
# One row per (year, sex) group holding that group's result from top.5().
# NOTE(review): the variable name `table` shadows base::table for
# non-function lookups.
table <- ddply(.data = bnames, .variables = .(year, sex), .fun = top.5)
nrow(table)
## [1] 258
# First five rows of the result.
table[seq(1, 5), ]
## year sex 1 2 3 4 5
## 1 1880 boy John William James Charles George
## 2 1880 girl Mary Anna Emma Elizabeth Minnie
## 3 1881 boy John William James George Charles
## 4 1881 girl Mary Anna Emma Elizabeth Margaret
## 5 1882 boy John William James George Charles
# Rows 254 to 258 of the result.
table[seq(254, 258), ]
## year sex 1 2 3 4 5
## 254 2006 girl Emily Emma Madison Isabella Ava
## 255 2007 boy Jacob Michael Ethan Joshua Daniel
## 256 2007 girl Emily Isabella Emma Ava Madison
## 257 2008 boy Jacob Michael Ethan Joshua Daniel
## 258 2008 girl Emma Isabella Emily Madison Ava
# Read the baby-names CSV file into `data`.
data <- read.csv(file = "/Users/iris/Documents/597A/HW5/bnames.csv")
# Fit a straight-line trend of `percent` against `year` for one set of rows.
#
# temp: data frame with `year` and `percent` columns.
# Returns a one-row data frame with columns:
#   int   - fitted intercept
#   slope - fitted coefficient of `year`
#   n     - number of rows in `temp`
# NOTE(review): this name masks stats::lm.fit in the global environment.
lm.fit <- function(temp) {
  # percent ~ year is the model formula: percent is regressed on year.
  fit <- lm(percent ~ year, data = temp)
  # coef() is the documented accessor; `fit$coef` only worked through
  # partial matching of `fit$coefficients`.
  est <- coef(fit)
  data.frame(int = est[1], slope = est[2], n = nrow(temp))
}
# Apply lm.fit to each (name, sex) group of `data`.
inc.dec <- ddply(data, .(name, sex), lm.fit)
# Keep only the groups that have more than 100 rows.
inc.dec <- subset(inc.dec, n > 100)
# Of those, keep the groups whose slope lies above the 99th percentile or
# below the 1st percentile of the remaining slopes. `probs` and `TRUE` are
# spelled out instead of relying on partial matching and the reassignable
# `T`.
inc.dec <- subset(
  inc.dec,
  slope > quantile(slope, probs = 0.99, na.rm = TRUE) |
    slope < quantile(slope, probs = 0.01, na.rm = TRUE)
)
library(foreach)
# Attach the yearly records to the selected groups. merge() joins on the
# columns the two data frames share (presumably name and sex -- confirm).
new <- merge(inc.dec, bnames)
# Draw one scatter plot per selected name with its own fitted trend line.
# NOTE(review): rows are selected by name only, so a name selected for
# both sexes is plotted as a single series.
graph <- foreach(i = unique(new$name)) %do% {
  # Fit the line on this name's rows only. Fitting on all of `new` drew
  # the same pooled line on every plot.
  fit <- lm(percent ~ year, data = subset(new, new$name == i))
  plot(subset(new, new$name == i)$year, subset(new, new$name == i)$percent,
       main = i, xlab = "Year", ylab = "Percent")
  abline(a = coef(fit)[1], b = coef(fit)[2])
  #par(mfrow = c(4, 4))
}
library(ggplot2)
#boy <- boy[order(boy$percent, decreasing = TRUE),][1:100,]
#girl <- girl[order(girl$percent, decreasing = TRUE),][1:100,]
# For each distinct year in `boy`, the sum of `percent` over that year's
# rows. The result is a list with one element per year, in order of first
# appearance.
boy.percent <- foreach(i = unique(boy$year)) %do% {
  sum(boy$percent[which(boy$year == i)])
}
# The same per-year sums for `girl`.
girl.percent <- foreach(i = unique(girl$year)) %do% {
  sum(girl$percent[which(girl$year == i)])
}
library(ggplot2)
library(plyr)
# Count how many rows of `bnames` each name appears in (columns Var1, Freq).
times <- as.data.frame(table(bnames$name))
# The 100 names with the highest counts, most frequent first.
by.freq <- order(times$Freq, decreasing = TRUE)
top_name <- times[by.freq[1:100], ]
#top_name_data <- merge(top_name, bnames)
# Here I tried to merge the two data sets, but ran into problems and RStudio crashed. The idea is to compare the 100 most frequent names with the overall data and plot the comparison with ggplot.