Lesson 5

Multivariate Data

Notes:

Moira Perceived Audience Size Colored by Age

Notes:

Third Qualitative Variable

Notes:

library(ggplot2)
# Point the session at the folder that holds the lesson's data files.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
setwd("/Users/wangkai/Documents/Rwork/edaTest")

# Read the tab-separated pseudo-Facebook data into `pf`.
pf <- read.csv("pseudo_facebook.tsv", sep = "\t")
# Box plot of age by gender (rows with a missing gender are dropped), with
# each group's mean age overlaid as a point drawn with shape 4.
# The summary function is passed as `fun`; `fun.y` is deprecated as of
# ggplot2 3.3.0.
ggplot(subset(pf, !is.na(gender)), aes(x = gender, y = age)) +
  geom_boxplot() +
  stat_summary(fun = mean, geom = "point", shape = 4)

# Median friend count at each age, one line per gender (rows with a missing
# gender are dropped).
# The summary function must be passed as `fun`: on ggplot2 >= 3.3.0 a `fun.y`
# handed to a geom with stat = "summary" is ignored as an unknown parameter
# and the layer falls back to mean_se(), i.e. it would draw the mean rather
# than the requested median.
ggplot(subset(pf, !is.na(gender)), aes(x = age, y = friend_count)) +
  geom_line(aes(color = gender), stat = "summary", fun = median)

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Summarise friend_count for every (age, gender) pair: mean, median and the
# number of rows in the group. Rows with a missing gender are removed first.
pf.fc_by_age_gender <- pf %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(
    mean_friend_count = mean(friend_count),
    median_friend_count = median(friend_count),
    n = n()
  )

# Own version of the same summary, written without the pipe: group the
# gender-complete rows by age and gender, then summarise each group.
gourpby_age_gender <- group_by(subset(pf, !is.na(gender)), age, gender)
fc_by_age_gender <- summarise(
  gourpby_age_gender,
  mean_friend_count = mean(friend_count),
  median_friend_count = median(friend_count),
  n = n()
)
# Show the first rows of the result.
head(fc_by_age_gender)

## Source: local data frame [6 x 5]
## Groups: age [3]
## 
##     age gender mean_friend_count median_friend_count     n
##   <int> <fctr>             <dbl>               <dbl> <int>
## 1    13 female          259.1606               148.0   193
## 2    13   male          102.1340                55.0   291
## 3    14 female          362.4286               224.0   847
## 4    14   male          164.1456                92.5  1078
## 5    15 female          538.6813               276.0  1139
## 6    15   male          200.6658               106.5  1478

# First six rows of the piped summary, for comparison with the version above.
head(pf.fc_by_age_gender, n = 6L)

## Source: local data frame [6 x 5]
## Groups: age [3]
## 
##     age gender mean_friend_count median_friend_count     n
##   <int> <fctr>             <dbl>               <dbl> <int>
## 1    13 female          259.1606               148.0   193
## 2    13   male          102.1340                55.0   291
## 3    14 female          362.4286               224.0   847
## 4    14   male          164.1456                92.5  1078
## 5    15 female          538.6813               276.0  1139
## 6    15   male          200.6658               106.5  1478

Plotting Conditional Summaries

Notes:

# Line plot of the precomputed median friend count against age, with one
# coloured line per gender.
ggplot(fc_by_age_gender, aes(x = age, y = median_friend_count)) +
  geom_line(aes(color = gender))

Thinking in Ratios

Notes:

Wide and Long Format

Notes:

Reshaping Data

Notes:

#install.packages('reshape2')
library(reshape2)
# Reshape long -> wide: one row per age, one column per gender value, each
# cell holding that group's median friend count.
pf.fc_by_age_gender.wide <- dcast(
  pf.fc_by_age_gender,
  age ~ gender,
  value.var = "median_friend_count"
)

Ratio Plot

Notes:

# Ratio of female to male median friend count by age. The dashed, faint
# horizontal line at 1 marks where the two medians are equal.
ggplot(pf.fc_by_age_gender.wide, aes(x = age, y = female / male)) +
  geom_line() +
  geom_hline(yintercept = 1, alpha = 0.3, linetype = 2) +
  labs(
    x = "Age",
    y = "Median Female Friend Count / Median Male Friend Count"
  )

Third Quantitative Variable

Notes:

# Derive the year each user joined: 2014 minus tenure / 365 rounded up to a
# whole number. A floor()-based variant was tried first and is kept below,
# commented out, for reference.
# pf$year_joined <- 2014-floor(pf$tenure/365)
# table(pf$year_joined)

pf[["year_joined"]] <- 2014 - ceiling(pf[["tenure"]] / 365)
# Count users per join year.
table(pf[["year_joined"]])

## 
##  2005  2006  2007  2008  2009  2010  2011  2012  2013  2014 
##     9    15   581  1507  4557  5448  9860 33366 43588    70

Cut a Variable

Notes:

# Bin year_joined into four right-closed intervals:
# (2004,2009], (2009,2011], (2011,2012], (2012,2014].
pf$year_joined.bucket <- cut(
  pf$year_joined,
  breaks = c(2004, 2009, 2011, 2012, 2014),
  right = TRUE
)
# Count of users per bucket (including NA's).
summary(pf$year_joined.bucket)

## (2004,2009] (2009,2011] (2011,2012] (2012,2014]        NA's 
##        6669       15308       33366       43658           2

Plotting it All Together

Notes:

# Median friend count by age, one line per join-year bucket (rows without a
# bucket are dropped).
# `fun` is used instead of `fun.y`: on ggplot2 >= 3.3.0 a `fun.y` passed
# through a geom is ignored as an unknown parameter and the layer falls back
# to mean_se(), which would plot means instead of medians.
ggplot(
  subset(pf, !is.na(year_joined.bucket)),
  aes(x = age, y = friend_count)
) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = median)

Plot the Grand Mean

Notes:

# Mean friend count by age for each join-year bucket (coloured lines), plus
# the grand mean across all buckets as a dashed line (linetype 2).
# The summary function is passed as `fun`; `fun.y` is deprecated as of
# ggplot2 3.3.0 and is not recognised when passed through a geom.
ggplot(
  subset(pf, !is.na(year_joined.bucket)),
  aes(x = age, y = friend_count)
) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean) +
  geom_line(stat = "summary", fun = mean, linetype = 2)

Friending Rate

Notes:

Friendships Initiated

Notes:

What is the median friend rate?

What is the maximum friend rate?

# Mean friendships initiated per unit of tenure, plotted against tenure and
# coloured by join-year bucket. Rows with tenure <= 0 are excluded so the
# rate never divides by zero.
# The summary function is passed as `fun`; `fun.y` is deprecated as of
# ggplot2 3.3.0 and is not recognised when passed through a geom.
ggplot(
  subset(pf, tenure > 0),
  aes(
    x = tenure,
    y = friendships_initiated / tenure,
    color = year_joined.bucket
  )
) +
  geom_line(stat = "summary", fun = mean)

Bias-Variance Tradeoff Revisited

Notes:

# Friending rate (friendships_initiated / tenure) versus tenure, drawn at
# increasing levels of smoothing. Every plot colours by join-year bucket and
# uses only rows with positive tenure.
# The summary function is passed as `fun` throughout; `fun.y` is deprecated
# as of ggplot2 3.3.0 and is not recognised when passed through a geom.

# Raw: mean rate at every distinct tenure value.
ggplot(
  subset(pf, tenure >= 1),
  aes(x = tenure, y = friendships_initiated / tenure)
) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)

# Tenure rounded to the nearest multiple of 7 before averaging.
ggplot(
  subset(pf, tenure > 0),
  aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure)
) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)

# Tenure rounded to the nearest multiple of 30.
ggplot(
  subset(pf, tenure > 0),
  aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure)
) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)

# Tenure rounded to the nearest multiple of 90.
ggplot(
  subset(pf, tenure > 0),
  aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure)
) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)

# Same 7-unit binning, but with a fitted smoother instead of binned means.
ggplot(
  subset(pf, tenure > 0),
  aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure)
) +
  geom_smooth(aes(color = year_joined.bucket))

Sean’s NFL Fan Sentiment Study

Notes:

Introducing the Yogurt Data Set

Notes:

Histograms Revisited

Notes:

# Load the yogurt purchase data and treat the `id` column as a categorical
# identifier rather than a number.
yo <- read.csv("yogurt.csv")
yo$id <- factor(yo$id)
# Inspect the column types.
str(yo)

## 'data.frame':    2380 obs. of  9 variables:
##  $ obs        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ id         : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ time       : int  9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
##  $ strawberry : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ blueberry  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pina.colada: int  0 0 0 0 1 2 0 0 0 0 ...
##  $ plain      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mixed.berry: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ price      : num  59 59 65 65 49 ...

# Histogram of price using bins of width 10.
ggplot(yo, aes(x = price)) +
  geom_histogram(binwidth = 10)

Number of Purchases

Notes:

# Total count per row: the sum of the five flavour columns.
yo$all.purchases <- yo$strawberry + yo$blueberry + yo$pina.colada +
  yo$plain + yo$mixed.berry
head(yo$all.purchases)

## [1] 1 1 1 1 3 4

# Histogram of the per-row purchase total using bins of width 2.
ggplot(yo, aes(x = all.purchases)) +
  geom_histogram(binwidth = 2)

Prices over Time

Notes:

# Scatter plot of price against time; points are jittered and drawn mostly
# transparent (alpha 0.1) to reduce overplotting.
ggplot(yo, aes(x = time, y = price)) +
  geom_point(position = "jitter", alpha = 0.1)

Sampling Observations

Notes:

Looking at Samples of Households

# Fix the RNG seed so the same 16 household ids are drawn on every run.
set.seed(42)
sample.ids <- sample(levels(yo$id), 16)

# One panel per sampled id: price over time as a line, with open circles
# (shape 1) sized by the number of items bought in that row.
ggplot(subset(yo, id %in% sample.ids), aes(x = time, y = price)) +
  facet_wrap(~id) +
  geom_line() +
  geom_point(aes(size = all.purchases), pch = 1)

The Limits of Cross Sectional Data

Notes:

Many Variables

Notes:

#install.packages('GGally')
library(GGally)

## 
## Attaching package: 'GGally'

## The following object is masked from 'package:dplyr':
## 
##     nasa

Scatterplot Matrix

Notes:

#install.packages('GGally')
library(GGally)
# Use the minimal theme with base font size 20 for subsequent plots.
theme_set(theme_minimal(base_size = 20))

# Fix the seed so the sampled rows are reproducible.
set.seed(1836)
# Keep columns 2 through 15 of pf, then draw a scatterplot matrix from a
# random sample of 1000 rows.
pf_subset <- pf[, 2:15]
ggpairs(pf_subset[sample.int(nrow(pf_subset), 1000), ])

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Even More Variables

Notes:

Heat Maps

Notes:

# Read the NCI table and label its 64 columns 1..64.
nci <- read.table("nci.tsv")
colnames(nci) <- 1:64

# Melt the first 200 rows into long format: one row per (gene, case) cell
# together with its value.
nci.long.samp <- melt(as.matrix(nci[1:200, ]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)

##   gene case  value
## 1    1    1  0.300
## 2    2    1  1.180
## 3    3    1  0.550
## 4    4    1  1.140
## 5    5    1 -0.265
## 6    6    1 -0.070

# Heat map: one tile per (case, gene) pair, filled by value on a 100-step
# blue-to-red gradient.
ggplot(nci.long.samp, aes(x = case, y = gene, fill = value)) +
  geom_tile() +
  scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))

# Rebuild the long-format table from the first 200 rows of nci and name its
# three columns.
nci.long.samp <- melt(as.matrix(nci[seq_len(200), ]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)

##   gene case  value
## 1    1    1  0.300
## 2    2    1  1.180
## 3    3    1  0.550
## 4    4    1  1.140
## 5    5    1 -0.265
## 6    6    1 -0.070

# Tile plot of value by case (x) and gene (y), coloured along a 100-colour
# ramp from blue to red.
ggplot(data = nci.long.samp, mapping = aes(x = case, y = gene, fill = value)) +
  geom_tile() +
  scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))

Analyzing Three or More Variables

Reflection:

Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!