Third Qualitative Variable

setwd("D:/R/Udacity/EDA_Course_Materials/lesson5")

pf <- read.csv('pseudo_facebook.tsv', sep = '\t')
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

ggplot(aes(x = gender, y = age),
       data = subset(pf, !is.na(gender))) + geom_boxplot() +
    stat_summary(fun.y = mean, geom = 'point', shape = 4)

ggplot(aes(x = age, y = friend_count),
       data = subset(pf, !is.na(gender))) + 
    geom_line(aes(color = gender), stat = 'summary', fun.y = median)

Multivariate Data

Notes:

Moira Perceived Audience Size Colored by Age

Notes:

Third Qualitative Variable

Notes:

pf.fc_by_age_gender <- subset(pf, !is.na(gender)) %>%
    group_by(age, gender) %>%
    summarise(mean_friend_count = mean(friend_count), median_friend_count = median(friend_count), n = n()) %>%
    ungroup() %>%
    arrange(age)

Plotting Conditional Summaries

Notes:

ggplot(aes(x = age, y = median_friend_count),
       data = pf.fc_by_age_gender) + 
    geom_line(aes(color = gender))

Thinking in Ratios

Notes:

Wide and Long Format

Notes:

Reshaping Data

Notes:

library(reshape2)
pf.fc_by_age_gender.wide <- dcast(pf.fc_by_age_gender, age ~ gender, value.var = 'median_friend_count')
head(pf.fc_by_age_gender.wide)

##   age female male
## 1  13    148   55
## 2  14    224   92
## 3  15    276  106
## 4  16    258  136
## 5  17    245  125
## 6  18    243  122

Ratio Plot

Notes:

ggplot(aes(x = age, y = female / male),
data = pf.fc_by_age_gender.wide) +
    geom_line() +
    geom_hline(yintercept = 1, alpha = 0.3, linetype = 2)

Third Quantitative Variable

Notes:

pf$year_joined <- floor(2014 - pf$tenure/365)

Cut a Variable

Notes:

#        (2004, 2009]
#        (2009, 2011]
#        (2011, 2012]
#        (2012, 2014]

summary(pf$year_joined)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2005    2012    2012    2012    2013    2014       2

table(pf$year_joined)

## 
##  2005  2006  2007  2008  2009  2010  2011  2012  2013  2014 
##     9    15   581  1507  4557  5448  9860 33366 43588    70

pf$year_joined.bucket <- cut(pf$year_joined, c(2004, 2009, 2011, 2012, 2014))

Plotting it All Together

Notes:

table(pf$year_joined.bucket, useNA = 'ifany')

## 
## (2004,2009] (2009,2011] (2011,2012] (2012,2014]        <NA> 
##        6669       15308       33366       43658           2

ggplot(aes(x = age, y = friend_count),
       data = subset(pf, !is.na(year_joined.bucket))) + 
    geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = median)

Plot the Grand Mean

Notes:

ggplot(aes(x = age, y = friend_count),
       data = subset(pf, !is.na(year_joined.bucket))) + 
    geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
    geom_line(stat = 'summary', fun.y = mean, linetype = 2)

Friending Rate

Notes:

with(subset(pf, tenure >= 1), summary(friend_count / tenure))

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.0000   0.0775   0.2205   0.6096   0.5658 417.0000

Friendships Initiated

Notes:

What is the median friend rate? 0.2205

What is the maximum friend rate? 417.0000

ggplot(aes(x = tenure, y = friendships_initiated / tenure), data = subset(pf, tenure >=1)) +
    geom_line(aes(color = year_joined.bucket),)

Bias-Variance Tradeoff Revisited

Notes:

ggplot(aes(x = tenure, y = friendships_initiated / tenure),
       data = subset(pf, tenure >= 1)) +
  geom_line(aes(color = year_joined.bucket),
            stat = 'summary',
            fun.y = mean)

ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary",
            fun.y = mean)

ggplot(aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary",
            fun.y = mean)

ggplot(aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary",
            fun.y = mean)

ggplot(aes(x = tenure, y = friendships_initiated / tenure),
       data = subset(pf, tenure >= 1)) +
  geom_smooth(aes(color = year_joined.bucket))

## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.

Introducing the Yogurt Data Set

Notes: Yogurt data has many rows per household, one for each purchase occasion. ***

Histograms Revisited

Notes:

There are some observation which almost don’t have purchases.

yo <- read.csv('yogurt.csv')
str(yo)

## 'data.frame':    2380 obs. of  9 variables:
##  $ obs        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ id         : int  2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 ...
##  $ time       : int  9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
##  $ strawberry : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ blueberry  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pina.colada: int  0 0 0 0 1 2 0 0 0 0 ...
##  $ plain      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mixed.berry: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ price      : num  59 59 65 65 49 ...

# Change the id from an int to a factor
yo$id <- factor(yo$id)
str(yo)

## 'data.frame':    2380 obs. of  9 variables:
##  $ obs        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ id         : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ time       : int  9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
##  $ strawberry : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ blueberry  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pina.colada: int  0 0 0 0 1 2 0 0 0 0 ...
##  $ plain      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mixed.berry: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ price      : num  59 59 65 65 49 ...

qplot(data = yo, x = price, fill = I('#F79420'))

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Number of Purchases

Notes:

summary(yo)

##       obs               id            time         strawberry     
##  Min.   :   1.0   2132290:  74   Min.   : 9662   Min.   : 0.0000  
##  1st Qu.: 696.5   2130583:  59   1st Qu.: 9843   1st Qu.: 0.0000  
##  Median :1369.5   2124073:  50   Median :10045   Median : 0.0000  
##  Mean   :1367.8   2149500:  50   Mean   :10050   Mean   : 0.6492  
##  3rd Qu.:2044.2   2101790:  47   3rd Qu.:10255   3rd Qu.: 1.0000  
##  Max.   :2743.0   2129528:  39   Max.   :10459   Max.   :11.0000  
##                   (Other):2061                                    
##    blueberry        pina.colada          plain         mixed.berry    
##  Min.   : 0.0000   Min.   : 0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 0.0000   Median : 0.0000   Median :0.0000   Median :0.0000  
##  Mean   : 0.3571   Mean   : 0.3584   Mean   :0.2176   Mean   :0.3887  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :12.0000   Max.   :10.0000   Max.   :6.0000   Max.   :8.0000  
##                                                                       
##      price      
##  Min.   :20.00  
##  1st Qu.:50.00  
##  Median :65.04  
##  Mean   :59.25  
##  3rd Qu.:68.96  
##  Max.   :68.96  
##

length(unique(yo$price))

## [1] 20

table(yo$price)

## 
##    20 24.96 33.04  33.2 33.28 33.36 33.52 39.04    44 45.04 48.96 49.52 
##     2    11    54     1     1    22     1   234    21    11    81     1 
##  49.6    50 55.04 58.96    62 63.04 65.04 68.96 
##     1   205     6   303    15     2   799   609

yo <- transform(yo, all.purchases = strawberry + blueberry + pina.colada + plain + mixed.berry)

summary(yo$all.purchases)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   1.971   2.000  21.000

Prices over Time

Notes:

Model of most common price encreases over time.

qplot(x = all.purchases, data = yo, binwidth = 1, fill = I('#099DD9'))

ggplot(aes(x = time, y = price), data = yo) +
    geom_jitter(alpha = 1/4, shape = 21, fill = I('#F79420'))

Sampling Observations

Notes:

Looking at Samples of Households

set.seed(666)
sample.ids <- sample(levels(yo$id), 16)

ggplot(aes(x = time, y = price),
       data = subset(yo, id %in% sample.ids)) +
    facet_wrap(~ id) +
    geom_line() +
    geom_point(aes(size = all.purchases), pch = 18, color = I('#7D26CD'))

ggsave('HH_sample.jpeg', width = 10, height = 10)

Scatterplot Matrices

library(GGally)

## 
## Attaching package: 'GGally'
## 
## The following object is masked from 'package:dplyr':
## 
##     nasa

theme_set(theme_minimal(20)) 

set.seed(1836)
pf_subset <- pf[, c(2:15)]
names(pf_subset)

##  [1] "age"                   "dob_day"              
##  [3] "dob_year"              "dob_month"            
##  [5] "gender"                "tenure"               
##  [7] "friend_count"          "friendships_initiated"
##  [9] "likes"                 "likes_received"       
## [11] "mobile_likes"          "mobile_likes_received"
## [13] "www_likes"             "www_likes_received"

ggpairs(pf_subset[sample.int(nrow(pf_subset), 1000), ])

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

## Warning: position_stack requires constant width: output may be incorrect

## Warning: position_stack requires constant width: output may be incorrect

## Warning: position_stack requires constant width: output may be incorrect

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Scatterplot Matrix

Notes: Scatterplots are below the diagonal, and categorical variables, like gender, create faceted histograms. ***

Even More Variables

Notes:

Heat Maps

Notes:

nci <- read.table("nci.tsv")
colnames(nci) <- c(1:64)

nci.long.samp <- melt(as.matrix(nci[1:200,]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)

##   gene case  value
## 1    1    1  0.300
## 2    2    1  1.180
## 3    3    1  0.550
## 4    4    1  1.140
## 5    5    1 -0.265
## 6    6    1 -0.070

ggplot(aes(y = gene, x = case, fill = value),
  data = nci.long.samp) +
  geom_tile() +
  scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))

Analyzing Three of More Variables

Reflection:

Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!

Lesson 5

Third Qualitative Variable

Multivariate Data

Moira Perceived Audience Size Colored by Age

Third Qualitative Variable

Plotting Conditional Summaries

Thinking in Ratios

Wide and Long Format

Reshaping Data

Ratio Plot

Third Quantitative Variable

Cut a Variable

Plotting it All Together

Plot the Grand Mean

Friending Rate

Friendships Initiated

Bias-Variance Tradeoff Revisited

Introducing the Yogurt Data Set

Histograms Revisited

Number of Purchases

Prices over Time

Sampling Observations

Looking at Samples of Households

Scatterplot Matrices

Scatterplot Matrix

Even More Variables

Heat Maps

Analyzing Three of More Variables