Dplyr
Sys.setlocale("LC_ALL", "en_us.UTF-8")
## [1] "en_us.UTF-8/en_us.UTF-8/en_us.UTF-8/C/en_us.UTF-8/C"
data(Titanic)
titanic <- as.data.frame(Titanic)
class(titanic)
## [1] "data.frame"
head(titanic[titanic$Age == 'Child' , c('Age', 'Freq') ])
## Age Freq
## 1 Child 0
## 2 Child 0
## 3 Child 35
## 4 Child 0
## 5 Child 0
## 6 Child 0
## SELECT Age, Freq FROM titanic WHERE Age = Child LIMIT 6
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
titanic %>%
filter(Age == 'Child') %>%
select(Age, Freq) %>%
head()
## Warning: package 'bindrcpp' was built under R version 3.2.5
## Age Freq
## 1 Child 0
## 2 Child 0
## 3 Child 35
## 4 Child 0
## 5 Child 0
## 6 Child 0
## SELECT Age, Freq FROM titanic WHERE Age = Child ORDER BY Freq LIMIT 6
titanic %>%
filter(Age == 'Child') %>%
select(Age, Freq) %>%
arrange(Freq) %>%
head()
## Age Freq
## 1 Child 0
## 2 Child 0
## 3 Child 0
## 4 Child 0
## 5 Child 0
## 6 Child 0
## SELECT Age, Freq FROM titanic WHERE Age = Child ORDER BY Freq DESC LIMIT 6
titanic %>%
filter(Age == 'Child') %>%
select(Age, Freq) %>%
arrange(desc(Freq)) %>%
head()
## Age Freq
## 1 Child 35
## 2 Child 17
## 3 Child 14
## 4 Child 13
## 5 Child 13
## 6 Child 11
freqsum <- titanic %>%
select(Freq) %>%
sum()
titanic <- titanic %>%
select(Sex, Age, Freq) %>%
mutate(portion = Freq/freqsum)
# SELECT Sex, SUM(Freq) FROM titanic GROUP BY Sex
titanic %>%
group_by(Sex) %>%
summarise(sexsum = sum(Freq, na.rm = TRUE))
## # A tibble: 2 x 2
## Sex sexsum
## <fctr> <dbl>
## 1 Male 1731
## 2 Female 470
# SELECT Sex, SUM(Freq), SUM(Portion) FROM titanic GROUP BY Sex
titanic %>%
group_by(Sex) %>%
summarise_each(funs(sum), Freq, portion)
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over a selection of variables, use `summarise_at()`
## # A tibble: 2 x 3
## Sex Freq portion
## <fctr> <dbl> <dbl>
## 1 Male 1731 0.786
## 2 Female 470 0.214
# SELECT Class, min(Freq), max(Freq) FROM titanic GROUP BY Class
titanic <- as.data.frame(Titanic)
titanic %>%
group_by(Class) %>%
summarise_each(funs(min(., na.rm=TRUE), max(., na.rm=TRUE)), matches("Freq"))
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over a selection of variables, use `summarise_at()`
## # A tibble: 4 x 3
## Class Freq_min Freq_max
## <fctr> <dbl> <dbl>
## 1 1st 0 140
## 2 2nd 0 154
## 3 3rd 13.0 387
## 4 Crew 0 670
# SELECT COUNT(Sex) FROM titanic
titanic %>%
select(Sex) %>%
summarise_each(funs(n()))
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over all variables, use `summarise_all()`
## Sex
## 1 32
# SELECT COUNT(DISTINCT(Sex)) FROM titanic
titanic %>%
select(Sex) %>%
summarise_each(funs(n_distinct(Sex)))
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over all variables, use `summarise_all()`
## Sex
## 1 2
# SELECT Sex, Age, SUM(Freq) AS frequency_sum GROUP Sex, Age ORDER BY frequency_sum DESC
titanic %>%
select(Sex, Age, Freq) %>%
group_by(Sex, Age) %>%
summarise(frequency_sum = sum(Freq, na.rm=TRUE)) %>%
arrange(desc(frequency_sum))
## # A tibble: 4 x 3
## # Groups: Sex [2]
## Sex Age frequency_sum
## <fctr> <fctr> <dbl>
## 1 Male Adult 1667
## 2 Female Adult 425
## 3 Male Child 64.0
## 4 Female Child 45.0
# SELECT Sex, SUM(Freq) FROM titanic GROUP BY Sex
sex_stat <- titanic %>%
select(Sex, Freq) %>%
group_by(Sex) %>%
summarise(sexsum = sum(Freq))
sex_stat
## # A tibble: 2 x 2
## Sex sexsum
## <fctr> <dbl>
## 1 Male 1731
## 2 Female 470
barplot(sex_stat$sexsum, names.arg = sex_stat$Sex, col=c('blue'))

sex_stat2 <- titanic %>%
select(Sex, Survived, Freq) %>%
group_by(Sex, Survived) %>%
summarise(sexsum = sum(Freq))
#install.packages('tidyr')
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.5
titanic_pivot <- spread(sex_stat2, Survived, sexsum, fill = 0)
mosaicplot(titanic_pivot)

survived_stat <- titanic %>% group_by(Survived,Sex) %>% summarise_each(funs(sum), Freq)
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over a selection of variables, use `summarise_at()`
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
survived_tb <- dcast(survived_stat, Survived ~ Sex, value.var="Freq")
survived_tb
## Survived Male Female
## 1 No 1364 126
## 2 Yes 367 344
m <- as.matrix(survived_tb[2:3])
barplot(m, legend=c("Perished" ,"Survived"))

分析姓名
babyname <- read.csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/WA.TXT', header = FALSE)
colnames(babyname) = c("state", "sex", "year", "name", "freq")
# SELECT name, Freq FROM babyname WHERE year = 2012 AND sex = 'M' ORDER BY Freq DESC LIMIT 10
top10_male <- babyname %>%
filter(year == 2012, sex == 'M') %>%
select(name, freq) %>%
arrange(desc(freq)) %>%
head(10)
top10_female <- babyname %>%
filter(year == 2012, sex == 'F') %>%
select(name, freq) %>%
arrange(desc(freq)) %>%
head(10)
male_stat <- babyname %>%
filter(name %in% top10_male$name, sex == 'M') %>%
select(year, name, freq)
plot(male_stat$year, male_stat$freq, type = 'n')
c <- 1
for (n in levels(factor(male_stat$name))){
stat <- male_stat[male_stat$name == n, ]
lines(freq ~ year, data = stat, col=c)
c <- c + 1
}
legend("topleft", levels(factor(male_stat$name)), col =1:10, lwd= 1)

female_stat <- babyname %>%
filter(name %in% top10_female$name, sex == 'F') %>%
select(year, name, freq)
plot(female_stat$year, female_stat$freq, type = 'n')
c <- 1
for (n in levels(factor(female_stat$name))){
stat <- female_stat[female_stat$name == n, ]
lines(freq ~ year, data = stat, col=c)
c <- c + 1
}
legend("topleft", levels(factor(female_stat$name)), col =1:10, lwd= 1)
