Notes: This lesson covers analyzing more than two variables at once.
Notes:
library(ggplot2)
# Load the tab-separated pseudo-Facebook user data.
pf <- read.csv(file = 'pseudo_facebook.tsv', sep = '\t')

# Box plot of age per gender, restricted to rows with a non-missing gender;
# each group's mean is overlaid as a point drawn with shape 4.
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = gender, y = age)) +
  geom_boxplot() +
  stat_summary(fun.y = mean, geom = 'point', shape = 4)
# Median friend count across age, one coloured line per gender (rows with a
# missing gender are excluded).
# Age must be on the x-axis: with gender on x, the summary leaves a single
# point per gender, so geom_line has nothing to connect and ggplot2 reports
# "geom_path: Each group consists of only one observation. Do you need to
# adjust the group aesthetic?".
ggplot(aes(x = age, y = friend_count),
       data = subset(pf, !is.na(gender))) +
  geom_line(aes(color = gender), stat = 'summary', fun.y = median)
***
Notes:
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Summarise friend counts for every (age, gender) pair, skipping rows with a
# missing gender: mean, median and number of users, ordered by age.
pf.fc_by_age_gender <- pf %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(
    mean_friend_count = mean(friend_count),
    median_friend_count = median(friend_count),
    n = n()
  ) %>%
  ungroup() %>%
  arrange(age)

# Preview the first rows of the summary.
head(pf.fc_by_age_gender)
## Source: local data frame [6 x 5]
##
## age gender mean_friend_count median_friend_count n
## (int) (fctr) (dbl) (dbl) (int)
## 1 13 female 259.1606 148.0 193
## 2 13 male 102.1340 55.0 291
## 3 14 female 362.4286 224.0 847
## 4 14 male 164.1456 92.5 1078
## 5 15 female 538.6813 276.0 1139
## 6 15 male 200.6658 106.5 1478
Notes:
# Mean friend count across age, one coloured line per gender.
ggplot(data = pf.fc_by_age_gender,
       mapping = aes(x = age, y = mean_friend_count)) +
  geom_line(mapping = aes(color = gender))
Notes: How many times more friends does the average female user have than the average male user?
Notes:
Notes:
#install.packages('reshape2')
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.2.4
# Look at the long-format summary again before reshaping it.
head(x = pf.fc_by_age_gender)
## Source: local data frame [6 x 5]
##
## age gender mean_friend_count median_friend_count n
## (int) (fctr) (dbl) (dbl) (int)
## 1 13 female 259.1606 148.0 193
## 2 13 male 102.1340 55.0 291
## 3 14 female 362.4286 224.0 847
## 4 14 male 164.1456 92.5 1078
## 5 15 female 538.6813 276.0 1139
## 6 15 male 200.6658 106.5 1478
# Reshape to wide format: one row per age, one column per gender, with the
# median friend count as the cell value.
pf.fc_by_age_gender.wide <- dcast(
  data = pf.fc_by_age_gender,
  formula = age ~ gender,
  value.var = 'median_friend_count'
)

# Preview the reshaped table.
head(pf.fc_by_age_gender.wide)
## age female male
## 1 13 148.0 55.0
## 2 14 224.0 92.5
## 3 15 276.0 106.5
## 4 16 258.5 136.0
## 5 17 245.5 125.0
## 6 18 243.0 122.0
Notes:
# Ratio of the female to the male median friend count at each age; the
# faint dashed horizontal line at 1 is where both medians are equal.
ggplot(data = pf.fc_by_age_gender.wide,
       mapping = aes(x = age, y = female / male)) +
  geom_line() +
  geom_hline(yintercept = 1, alpha = 0.3, linetype = 2)
Notes:
# Derive the year each user joined: 2014 minus tenure / 365, rounded down.
pf$year_joined <- with(pf, floor(2014 - tenure / 365))
Notes:
# Five-number summary (plus mean and NA count) of the derived join year.
summary(pf[["year_joined"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2005 2012 2012 2012 2013 2014 2
# Number of users per join year.
table(pf[["year_joined"]])
##
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## 9 15 581 1507 4557 5448 9860 33366 43588 70
# Bin the join year into four intervals delimited by the breaks below.
pf$year_joined.bucket <- cut(
  pf$year_joined,
  breaks = c(2004, 2009, 2011, 2012, 2014)
)

# Number of users falling into each interval.
table(pf$year_joined.bucket)
##
## (2004,2009] (2009,2011] (2011,2012] (2012,2014]
## 6669 15308 33366 43658
Notes:
# Same counts, additionally listing missing buckets when there are any.
table(pf[["year_joined.bucket"]], useNA = 'ifany')
##
## (2004,2009] (2009,2011] (2011,2012] (2012,2014] <NA>
## 6669 15308 33366 43658 2
# Median friend count across age, one coloured line per join-year bucket;
# rows without a bucket are left out.
ggplot(data = subset(pf, !is.na(year_joined.bucket)),
       mapping = aes(x = age, y = friend_count)) +
  geom_line(mapping = aes(color = year_joined.bucket),
            stat = 'summary',
            fun.y = median)
Notes: The suspicion is confirmed. Users with a longer tenure tend to have higher friend counts, with the exception of our older users (about age 80 and up).
# Mean friend count across age: one coloured line per join-year bucket, plus
# a dashed line for the mean over all buckets together (the grand mean).
ggplot(data = subset(pf, !is.na(year_joined.bucket)),
       mapping = aes(x = age, y = friend_count)) +
  geom_line(mapping = aes(color = year_joined.bucket),
            stat = 'summary',
            fun.y = mean) +
  geom_line(stat = 'summary', fun.y = mean, linetype = 2)
Notes: How many friends does a user have for each day since they started using the service?
# Summary of friend_count per unit of tenure; only rows with tenure of at
# least 1 are used, so the division never hits a zero tenure.
with(
  subset(pf, tenure >= 1),
  summary(friend_count / tenure)
)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0775 0.2205 0.6096 0.5658 417.0000
Notes:
What is the median friend rate? 0.2205
What is the maximum friend rate? 417
Create a line graph of mean of friendships_initiated per day (of tenure) vs. tenure colored by year_joined.bucket.
You need to make use of the variables tenure, friendships_initiated, and year_joined.bucket.
# Mean friendships initiated per day of tenure versus tenure: one coloured
# line per join-year bucket, plus a dashed line for the mean over all users.
# The dashed layer deliberately inherits the plot-level mapping. Mapping it to
# age and raw friendships_initiated would draw a different variable and a
# different unit on the tenure / per-day axes and distort both scales.
ggplot(aes(x = tenure, y = friendships_initiated / tenure),
       data = subset(pf, tenure >= 1)) +
  geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = mean, linetype = 2)
Notes:
# Mean friendships initiated per unit of tenure at every tenure value,
# coloured by join-year bucket (rows with tenure below 1 are excluded).
ggplot(data = subset(pf, tenure >= 1),
       mapping = aes(x = tenure, y = friendships_initiated / tenure)) +
  geom_line(mapping = aes(color = year_joined.bucket), stat = 'summary', fun.y = mean)
# The same per-cohort mean line, with tenure rounded to the nearest multiple
# of a bin width before averaging. Wider bins pool more users per x value.

# Bin width 7.
ggplot(data = subset(pf, tenure > 0),
       mapping = aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure)) +
  geom_line(mapping = aes(color = year_joined.bucket), stat = "summary", fun.y = mean)

# Bin width 30.
ggplot(data = subset(pf, tenure > 0),
       mapping = aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure)) +
  geom_line(mapping = aes(color = year_joined.bucket), stat = "summary", fun.y = mean)

# Bin width 90.
ggplot(data = subset(pf, tenure > 0),
       mapping = aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure)) +
  geom_line(mapping = aes(color = year_joined.bucket), stat = "summary", fun.y = mean)
# Replace the binned means with a smoother fitted separately for each
# join-year bucket.
ggplot(data = subset(pf, tenure > 0),
       mapping = aes(x = tenure, y = friendships_initiated / tenure)) +
  geom_smooth(mapping = aes(color = year_joined.bucket))
Notes:
Notes:
Notes:
# Load the yogurt data set and print its structure.
yo <- read.csv(file = 'yogurt.csv')
str(object = yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : int 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
# Convert the id column from integer to factor, then print the structure
# again to confirm the new type.
yo[["id"]] <- factor(yo[["id"]])
str(object = yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
# Histogram of price using bins of width 2.
ggplot(data = yo, mapping = aes(x = price)) +
  geom_histogram(binwidth = 2)
Notes:
# Per-column summary of the yogurt data.
summary(object = yo)
## obs id time strawberry
## Min. : 1.0 2132290: 74 Min. : 9662 Min. : 0.0000
## 1st Qu.: 696.5 2130583: 59 1st Qu.: 9843 1st Qu.: 0.0000
## Median :1369.5 2124073: 50 Median :10045 Median : 0.0000
## Mean :1367.8 2149500: 50 Mean :10050 Mean : 0.6492
## 3rd Qu.:2044.2 2101790: 47 3rd Qu.:10255 3rd Qu.: 1.0000
## Max. :2743.0 2129528: 39 Max. :10459 Max. :11.0000
## (Other):2061
## blueberry pina.colada plain mixed.berry
## Min. : 0.0000 Min. : 0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 0.0000 Median : 0.0000 Median :0.0000 Median :0.0000
## Mean : 0.3571 Mean : 0.3584 Mean :0.2176 Mean :0.3887
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :12.0000 Max. :10.0000 Max. :6.0000 Max. :8.0000
##
## price
## Min. :20.00
## 1st Qu.:50.00
## Median :65.04
## Mean :59.25
## 3rd Qu.:68.96
## Max. :68.96
##
# How many distinct price values occur.
length(unique(yo[["price"]]))
## [1] 20
# Number of observations at each distinct price.
table(yo[["price"]])
##
## 20 24.96 33.04 33.2 33.28 33.36 33.52 39.04 44 45.04 48.96 49.52
## 2 11 54 1 1 22 1 234 21 11 81 1
## 49.6 50 55.04 58.96 62 63.04 65.04 68.96
## 1 205 6 303 15 2 799 609
# Recheck the column names and types before deriving a new column.
str(object = yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
# Add all.purchases: the row-wise total over the five flavour count columns.
yo$all.purchases <- with(
  yo,
  strawberry + blueberry + pina.colada + plain + mixed.berry
)

# Distribution of the new total.
summary(yo[["all.purchases"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 1.971 2.000 21.000
Notes:
# Price against time as jittered, mostly transparent filled circles
# (shape 21), so overlapping observations remain visible.
ggplot(data = yo, mapping = aes(x = time, y = price)) +
  geom_jitter(alpha = 1 / 4, shape = 21, fill = I('#F79420'))
Notes:
# Fix the RNG seed so the same 16 ids are drawn on every run.
set.seed(4230)
sample.ids <- sample(levels(yo$id), size = 16)

# One panel per sampled id: price over time as a line, with open circles
# sized by the number of items in each observation.
ggplot(data = subset(yo, id %in% sample.ids),
       mapping = aes(x = time, y = price)) +
  facet_wrap(~ id) +
  geom_line() +
  geom_point(mapping = aes(size = all.purchases), pch = 1)
Notes:
Notes:
Notes:
#install.packages('GGally')
library(GGally)
## Warning: package 'GGally' was built under R version 3.2.4
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
# Make the minimal theme with base font size 20 the default for later plots.
theme_set(theme_minimal(base_size = 20))

# Seed the RNG so the random row sample used below is reproducible.
set.seed(1836)

# Keep columns 2 through 7 of pf and list their names.
pf_subset <- pf[, 2:7]
names(pf_subset)
## [1] "age" "dob_day" "dob_year" "dob_month" "gender" "tenure"
# Scatterplot matrix over a random sample of 1000 rows, leaving out the
# first column of pf_subset; variable names are drawn inside the panels.
ggpairs(
  data = pf_subset[sample.int(n = nrow(pf_subset), size = 1000), -1],
  axisLabels = 'internal'
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Correlation test between friendships initiated and friend count.
with(pf, cor.test(x = friendships_initiated, y = friend_count))
##
## Pearson's product-moment correlation
##
## data: friendships_initiated and friend_count
## t = 460.81, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8238590 0.8278205
## sample estimates:
## cor
## 0.82585
# Correlation test between age and mobile likes.
with(pf, cor.test(x = age, y = mobile_likes))
##
## Pearson's product-moment correlation
##
## data: age and mobile_likes
## t = -8.4088, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03293894 -0.02048964
## sample estimates:
## cor
## -0.02671533
Notes:
# Read the nci table and label its columns 1 through 64.
nci <- read.table(file = "nci.tsv")
colnames(nci) <- 1:64
Notes:
library(reshape2)
# Melt the first 200 rows of nci into long format, one row per matrix cell,
# and name the columns gene (row index), case (column index) and value.
nci.long.samp <- setNames(
  melt(as.matrix(nci[1:200, ])),
  c("gene", "case", "value")
)

# Preview the long-format data.
head(nci.long.samp)
## gene case value
## 1 1 1 0.300
## 2 2 1 1.180
## 3 3 1 0.550
## 4 4 1 1.140
## 5 5 1 -0.265
## 6 6 1 -0.070
# Heat map: one tile per (case, gene) cell, filled by value on a 100-step
# gradient running from blue to red.
ggplot(data = nci.long.samp,
       mapping = aes(x = case, y = gene, fill = value)) +
  geom_tile() +
  scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))
Reflection: I learned about adding a third qualitative variable, thinking in ratios, reshaping data, adding a third quantitative variable, cutting a variable into buckets, plotting the grand mean, scatterplot matrices, and heat maps.
Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!