library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the tab-separated pseudo-Facebook user data.
pf <- read.delim("pseudo_facebook.tsv")

# Box plot of age for each gender (rows with a missing gender are dropped);
# the mean age of every group is overlaid as a cross (point shape 4).
ggplot(data = pf[!is.na(pf$gender), ], mapping = aes(x = gender, y = age)) +
  geom_boxplot() +
  stat_summary(geom = "point", shape = 4, fun.y = mean)

# Median friend count at every age, drawn as one coloured line per gender.
# Rows with a missing gender are excluded.
ggplot(data = pf[!is.na(pf$gender), ],
       mapping = aes(x = age, y = friend_count)) +
  geom_line(mapping = aes(colour = gender), stat = "summary", fun.y = median)

Plotting Conditional Summaries

Notes:

library(dplyr)
# Per (age, gender): median friend count, mean friend count and number of
# users, sorted by age. Rows with a missing gender are excluded.
friendcount_by_gender <- pf %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(median_friend_count = median(friend_count),
            mean_friend_count = mean(friend_count),
            n = n()) %>%
  # summarise() only removes the last grouping level (gender), so the result
  # would otherwise stay grouped by age; drop that leftover grouping so later
  # steps operate on a plain, ungrouped table.
  ungroup() %>%
  arrange(age)

Thinking in Ratios

Notes:

# Median friend count against age from the pre-computed summary table,
# one coloured line per gender.
ggplot(data = friendcount_by_gender,
       mapping = aes(x = age, y = median_friend_count)) +
  geom_line(mapping = aes(colour = gender))

Wide and Long Format

Notes:

Reshaping Data

Notes:

library(reshape2)
# Long -> wide: one row per age and one column per gender, each cell holding
# that group's median friend count.
friendcount_by_gender.wide <- dcast(friendcount_by_gender,
                                    age ~ gender,
                                    value.var = "median_friend_count")
# Wide -> long: back to one row per (age, gender) pair. The value column is
# named median_friend_count so it matches the column the wide table was built
# from (it was previously misspelled 'meadian_friend_count').
friendcount_by_gender.long <- melt(friendcount_by_gender.wide,
                                   id.vars = "age",
                                   variable.name = "gender",
                                   value.name = "median_friend_count")

Ratio Plot

Notes:比率图

# Ratio of the female column to the male column of the wide table, by age.
ggplot(data = friendcount_by_gender.wide,
       mapping = aes(x = age, y = female / male)) +
  geom_line() +
  # Faint (alpha 0.3) dashed horizontal reference line at a ratio of 1.
  geom_hline(yintercept = 1, linetype = 2, alpha = 0.3)

Third Quantitative Variable

Notes:

# Derive the year each user joined: tenure is divided by 365 (presumably it is
# measured in days) and subtracted from 2014. floor() returns the largest
# integer that is not greater than its argument.
pf[["year_joined"]] <- floor(2014 - pf[["tenure"]] / 365)
table(pf[["year_joined"]])

## 
##  2005  2006  2007  2008  2009  2010  2011  2012  2013  2014 
##     9    15   581  1507  4557  5448  9860 33366 43588    70

Cut a Variable

Notes:

# Group users by join year into four intervals:
# (2004,2009], (2009,2011], (2011,2012], (2012,2014].
pf[["year_joined.bucket"]] <- cut(pf[["year_joined"]],
                                  breaks = c(2004, 2009, 2011, 2012, 2014))
table(pf[["year_joined.bucket"]])

## 
## (2004,2009] (2009,2011] (2011,2012] (2012,2014] 
##        6669       15308       33366       43658

Plotting it All Together

Notes:

# Median friend count by age, one coloured line per join-year bucket.
# Users without a bucket are left out.
ggplot(data = pf[!is.na(pf$year_joined.bucket), ],
       mapping = aes(x = age, y = friend_count)) +
  geom_line(mapping = aes(colour = year_joined.bucket),
            stat = "summary",
            fun.y = median)

Plot the Grand Mean

Notes:

# Mean friend count by age for every join-year bucket (coloured lines), with
# the mean over all buckets drawn on top as a dashed line (linetype 2).
ggplot(data = pf[!is.na(pf$year_joined.bucket), ],
       mapping = aes(x = age, y = friend_count)) +
  geom_line(mapping = aes(colour = year_joined.bucket),
            stat = "summary",
            fun.y = mean) +
  geom_line(linetype = 2, stat = "summary", fun.y = mean)

Friending Rate

Notes:

# Conditional rate: friend count divided by tenure, summarised over users
# whose tenure is at least 1.
summary(with(subset(pf, tenure >= 1), friend_count / tenure))

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.0000   0.0775   0.2205   0.6096   0.5658 417.0000

Friendships Initiated

Notes:

What is the median friend rate?

What is the maximum friend rate?

# Median of friendships initiated per unit of tenure, plotted against tenure,
# one coloured line per join-year bucket (users with tenure >= 1 only).
ggplot(data = subset(pf, tenure >= 1),
       mapping = aes(x = tenure, y = friendships_initiated / tenure)) +
  geom_line(mapping = aes(colour = year_joined.bucket),
            fun.y = median,
            stat = "summary")

Bias-Variance Tradeoff Revisited

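Notes: 利用 round 函数来降低噪声，使得函数曲线更加平滑 (use `round()` to bin tenure, which reduces noise and makes the curves smoother, at the cost of detail).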

# Four versions of the same plot (mean friendships initiated per unit of
# tenure, coloured by join-year bucket) with increasingly coarse x binning.

# q1: raw tenure on the x axis, no binning.
q1 <- ggplot(data = subset(pf, tenure >= 1),
             mapping = aes(x = tenure, y = friendships_initiated / tenure)) +
  geom_line(mapping = aes(colour = year_joined.bucket),
            fun.y = mean, stat = "summary")

# q2: tenure rounded to the nearest multiple of 7.
q2 <- ggplot(data = subset(pf, tenure > 0),
             mapping = aes(x = 7 * round(tenure / 7),
                           y = friendships_initiated / tenure)) +
  geom_line(mapping = aes(colour = year_joined.bucket),
            fun.y = mean, stat = "summary")

# q3: tenure rounded to the nearest multiple of 30.
q3 <- ggplot(data = subset(pf, tenure > 0),
             mapping = aes(x = 30 * round(tenure / 30),
                           y = friendships_initiated / tenure)) +
  geom_line(mapping = aes(colour = year_joined.bucket),
            fun.y = mean, stat = "summary")

# q4: tenure rounded to the nearest multiple of 90.
q4 <- ggplot(data = subset(pf, tenure > 0),
             mapping = aes(x = 90 * round(tenure / 90),
                           y = friendships_initiated / tenure)) +
  geom_line(mapping = aes(colour = year_joined.bucket),
            fun.y = mean, stat = "summary")
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

# Stack the four plots vertically in a single column for comparison.
grid.arrange(grobs = list(q1, q2, q3, q4), ncol = 1)

# Smoothed friendships initiated per unit of tenure against tenure, one
# smoother per join-year bucket (users with tenure >= 1 only).
ggplot(data = subset(pf, tenure >= 1),
       mapping = aes(x = tenure, y = friendships_initiated / tenure)) +
  geom_smooth(mapping = aes(colour = year_joined.bucket))

## `geom_smooth()` using method = 'gam'

Sean’s NFL Fan Sentiment Study

Notes ***

Introducing the Yogurt Data Set

Notes:

Histograms Revisited

Notes:

# Load the yogurt data and treat the id column as a categorical variable.
yo <- read.csv(file = "yogurt.csv")
yo[["id"]] <- factor(yo[["id"]])
# Histogram of price with orange bars.
qplot(price, data = yo, fill = I("#F79420"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Number of Purchases

Notes:

# Add a derived column: the sum of the five flavour columns in each row.
yo[["all.purchases"]] <- with(
  yo,
  strawberry + blueberry + pina.colada + plain + mixed.berry
)
summary(yo[["all.purchases"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   1.971   2.000  21.000

Prices over Time

Notes:

# Jittered scatter of price against time; semi-transparent filled circles
# (shape 21) with an orange fill.
ggplot(data = yo, mapping = aes(x = time, y = price)) +
  geom_jitter(shape = 21, fill = I("#F79420"), alpha = 1 / 4)

Sampling Observations

Notes:

Looking at Samples of Households

# Draw a reproducible random sample of 16 ids.
set.seed(4230)
sample.ids <- sample(levels(yo$id), size = 16)
# One panel per sampled id: price over time as a line, with open circles
# (shape 1) sized by all.purchases.
ggplot(data = subset(yo, id %in% sample.ids),
       mapping = aes(x = time, y = price)) +
  facet_wrap(~id) +
  geom_line() +
  geom_point(mapping = aes(size = all.purchases), shape = 1)

# Second reproducible random sample of 16 ids, drawn with a different seed.
set.seed(5720)
sample.ids2 <- sample(levels(yo$id), size = 16)
# One panel per sampled id: price over time as a line, with open circles
# sized by all.purchases. The shape was previously passed as `pth`, an
# unknown parameter that ggplot2 ignored with a warning, so the points were
# drawn with the default filled shape instead of open circles.
ggplot(data = subset(yo, id %in% sample.ids2),
       mapping = aes(x = time, y = price)) +
  facet_wrap(~id) +
  geom_line() +
  geom_point(mapping = aes(size = all.purchases), shape = 1)

The Limits of Cross Sectional Data

Notes:

Many Variables

Notes:

Scatterplot Matrix

Notes: 创建更多的数据图 (create more plots of the data at once with a scatterplot matrix).

library('GGally')

## 
## Attaching package: 'GGally'

## The following object is masked from 'package:dplyr':
## 
##     nasa

# Use a minimal theme with base font size 10 for the plots that follow.
theme_set(theme_minimal(base_size = 10))
# Fix the RNG state so the random row sample taken later is reproducible.
set.seed(1836)
# Keep columns 2 through 7 of pf and list their names.
pf_subset <- pf[, 2:7]
names(pf_subset)

## [1] "age"       "dob_day"   "dob_year"  "dob_month" "gender"    "tenure"

# Scatterplot matrix over a random sample of 1000 rows of pf_subset.
ggpairs(pf_subset[sample.int(nrow(pf_subset), size = 1000), ])

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Even More Variables

Notes: