library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the tab-separated pseudo-Facebook user data.
pf <- read.csv("pseudo_facebook.tsv", sep = "\t")

# Box plot of age for each gender (rows with a missing gender are dropped),
# with the mean age of every group overlaid as a point of shape 4.
# NOTE(review): newer ggplot2 releases deprecate `fun.y` in favour of `fun`;
# confirm the installed version before switching.
ggplot(
  data = pf[!is.na(pf$gender), ],
  mapping = aes(x = gender, y = age)
) +
  geom_boxplot() +
  stat_summary(geom = "point", fun.y = mean, shape = 4)
# Median friend count at each age, one line per gender
# (rows with a missing gender are excluded).
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = age, y = friend_count)
) +
  geom_line(
    mapping = aes(color = gender),
    stat = "summary",
    fun.y = median
  )
Notes:
library(dplyr)
# Per-(age, gender) summary of friend_count: median, mean and group size,
# sorted by age. Rows with a missing gender are removed first.
friendcount_by_gender <- pf %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(
    median_friend_count = median(friend_count),
    mean_friend_count = mean(friend_count),
    n = n()
  ) %>%
  # summarise() only peels off the innermost grouping level, so the result
  # would otherwise remain grouped by age; drop the grouping explicitly so
  # later operations act on the whole table.
  ungroup() %>%
  arrange(age)
Notes:
# Median friend count by age, coloured by gender, drawn from the
# pre-aggregated friendcount_by_gender table.
ggplot(
  data = friendcount_by_gender,
  mapping = aes(x = age, y = median_friend_count)
) +
  geom_line(mapping = aes(color = gender))
Notes:
Notes:
library(reshape2)
# Long -> wide: one row per age, one column per gender value, each cell
# holding that group's median_friend_count.
friendcount_by_gender.wide <- dcast(
  data = friendcount_by_gender,
  formula = age ~ gender,
  value.var = "median_friend_count"
)
# Wide -> long: melt the per-gender columns back into (age, gender, value)
# rows. The value column is named median_friend_count so that it matches the
# column of friendcount_by_gender it was cast from (the previous spelling,
# "meadian_friend_count", was a typo).
friendcount_by_gender.long <- melt(
  friendcount_by_gender.wide,
  id.vars = c("age"),
  variable.name = "gender",
  value.name = "median_friend_count"
)
# Ratio of the female column to the male column of the wide table, by age.
ggplot(
  data = friendcount_by_gender.wide,
  mapping = aes(x = age, y = female / male)
) +
  geom_line() +
  # Faint dashed horizontal reference line at a ratio of 1.
  geom_hline(yintercept = 1, linetype = 2, alpha = 0.3)
Notes:
# Derive the year each user joined from tenure. floor() returns the largest
# integer that is not greater than its argument.
# NOTE(review): assumes tenure is in days counted back from 2014 - confirm.
pf[["year_joined"]] <- floor(2014 - pf[["tenure"]] / 365)

# Frequency of each join year.
table(pf[["year_joined"]])
##
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## 9 15 581 1507 4557 5448 9860 33366 43588 70
Notes:
# Group users by join year into four intervals that are open on the left and
# closed on the right: (2004,2009], (2009,2011], (2011,2012], (2012,2014].
pf[["year_joined.bucket"]] <- cut(
  pf[["year_joined"]],
  breaks = c(2004, 2009, 2011, 2012, 2014)
)

# Number of users in each bucket.
table(pf[["year_joined.bucket"]])
##
## (2004,2009] (2009,2011] (2011,2012] (2012,2014]
## 6669 15308 33366 43658
Notes:
# Median friend count at each age, one line per join-year bucket
# (rows without a bucket are excluded).
ggplot(
  data = pf[!is.na(pf$year_joined.bucket), ],
  mapping = aes(x = age, y = friend_count)
) +
  geom_line(
    mapping = aes(color = year_joined.bucket),
    stat = "summary",
    fun.y = median
  )
Notes:
# Mean friend count at each age: one coloured line per join-year bucket,
# plus a dashed line (linetype 2) for the mean across all plotted rows.
ggplot(
  data = pf[!is.na(pf$year_joined.bucket), ],
  mapping = aes(x = age, y = friend_count)
) +
  geom_line(
    mapping = aes(color = year_joined.bucket),
    stat = "summary",
    fun.y = mean
  ) +
  geom_line(stat = "summary", fun.y = mean, linetype = 2)
Notes:
# Conditional rate: friend_count per unit of tenure, summarised over the
# users whose tenure is at least 1 (this also avoids dividing by zero).
summary(
  with(subset(pf, tenure >= 1), friend_count / tenure)
)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0775 0.2205 0.6096 0.5658 417.0000
Notes:
What is the median friend rate?
What is the maximum friend rate?
# Median friendships initiated per unit of tenure, plotted against tenure,
# one line per join-year bucket (only users with tenure >= 1).
ggplot(
  data = subset(pf, tenure >= 1),
  mapping = aes(x = tenure, y = friendships_initiated / tenure)
) +
  geom_line(
    mapping = aes(color = year_joined.bucket),
    stat = "summary",
    fun.y = median
  )
Notes: 利用round函数来降低噪声,使得函数曲线更加平滑
# Four versions of the same plot (mean friendships initiated per unit of
# tenure, by join-year bucket) with progressively coarser x values: rounding
# tenure to multiples of 7, 30 and 90 pools more users into every x position.

# q1: raw tenure on the x axis.
q1 <- ggplot(
  data = subset(pf, tenure >= 1),
  mapping = aes(x = tenure, y = friendships_initiated / tenure)
) +
  geom_line(
    mapping = aes(color = year_joined.bucket),
    stat = "summary",
    fun.y = mean
  )

# q2: tenure rounded to the nearest multiple of 7.
q2 <- ggplot(
  data = subset(pf, tenure > 0),
  mapping = aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure)
) +
  geom_line(
    mapping = aes(color = year_joined.bucket),
    stat = "summary",
    fun.y = mean
  )

# q3: tenure rounded to the nearest multiple of 30.
q3 <- ggplot(
  data = subset(pf, tenure > 0),
  mapping = aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure)
) +
  geom_line(
    mapping = aes(color = year_joined.bucket),
    stat = "summary",
    fun.y = mean
  )

# q4: tenure rounded to the nearest multiple of 90.
q4 <- ggplot(
  data = subset(pf, tenure > 0),
  mapping = aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure)
) +
  geom_line(
    mapping = aes(color = year_joined.bucket),
    stat = "summary",
    fun.y = mean
  )
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Stack the four plots q1..q4 in a single column for comparison.
grid.arrange(q1, q2, q3, q4, ncol = 1)

# Same relationship drawn with a fitted smoother per join-year bucket
# instead of summary lines.
ggplot(
  data = subset(pf, tenure >= 1),
  mapping = aes(x = tenure, y = friendships_initiated / tenure)
) +
  geom_smooth(mapping = aes(color = year_joined.bucket))
## `geom_smooth()` using method = 'gam'
Notes:
Notes:
Notes:
# Load the yogurt data set.
yo <- read.csv(file = "yogurt.csv")

# Treat id as a categorical variable rather than a number.
yo$id <- factor(x = yo$id)

# Histogram of price with a fixed fill colour.
qplot(
  x = price,
  data = yo,
  fill = I("#F79420")
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Notes:
# Add a derived column: the sum of the five flavour columns in each row.
yo$all.purchases <- with(
  yo,
  strawberry + blueberry + pina.colada + plain + mixed.berry
)

# Distribution of the new total.
summary(yo$all.purchases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 1.971 2.000 21.000
Notes:
# Price against time as jittered, semi-transparent filled circles (shape 21).
ggplot(data = yo, mapping = aes(x = time, y = price)) +
  geom_jitter(
    shape = 21,
    fill = I("#F79420"),
    alpha = 1 / 4
  )
Notes:
# Draw a reproducible random sample of 16 id levels.
set.seed(4230)
sample.ids <- sample(levels(yo$id), 16)

# Plot the sampled ids: one facet per id, price over time as a line, with
# open-circle points (pch 1) sized by all.purchases.
ggplot(
  data = subset(yo, id %in% sample.ids),
  mapping = aes(x = time, y = price)
) +
  facet_wrap(~ id) +
  geom_line() +
  geom_point(mapping = aes(size = all.purchases), pch = 1)
# Second reproducible random sample of 16 id levels, using a different seed.
set.seed(5720)
sample.ids2 <- sample(levels(yo$id), 16)

# Same faceted plot as for the first sample: price over time per id, with
# points sized by all.purchases.
ggplot(aes(time, price), data = subset(yo, id %in% sample.ids2)) +
  facet_wrap(~id) +
  geom_line() +
  # `pch = 1` draws open circles, matching the first sample plot. It was
  # previously misspelled `pth`, which ggplot2 ignored with an
  # "Ignoring unknown parameters" warning.
  geom_point(aes(size = all.purchases), pch = 1)
Notes:
Notes:
library('GGally')
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
# Use a minimal theme with base font size 10 for the plots that follow.
theme_set(theme_minimal(base_size = 10))

# Fix the seed so the 1000-row sample below is reproducible.
set.seed(1836)

# Keep columns 2 through 7 of pf (names echoed below).
pf_subset <- pf[, 2:7]
names(pf_subset)
## [1] "age" "dob_day" "dob_year" "dob_month" "gender" "tenure"

# Pairwise plot matrix on a random sample of 1000 rows.
ggpairs(
  pf_subset[sample.int(n = nrow(pf_subset), size = 1000), ]
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Notes: