Notes: Adding a color by age doesn’t help because of overplotting.
Notes:
setwd('~/Downloads')
getwd()
## [1] "/Users/jacob/Downloads"
install.packages("ggplot2", repos = 'https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/ggplot2_2.1.0.tgz')
## Warning: unable to access index for repository https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/ggplot2_2.1.0.tgz/src/contrib:
## cannot download all files
## Warning: package 'ggplot2' is not available (for R version 3.3.1)
## Warning: unable to access index for repository https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/ggplot2_2.1.0.tgz/bin/macosx/mavericks/contrib/3.3:
## cannot download all files
library(ggplot2)
pf <- read.csv('pseudo_facebook.tsv', sep = '\t')
load(file = "pf_sub_gender.RData")
ggplot(data = pf.sub_gender, aes(x = gender, y = age)) +
geom_boxplot() +
stat_summary(fun.y = mean, geom = 'point', shape = 4)
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(gender))) +
geom_line(aes(color = gender), stat = 'summary', fun.y = median)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
pf.sub_gender <- subset(pf, !is.na(gender))
age_groups_with_gender <- group_by(pf.sub_gender, age, gender)
pf.fc_by_age_gender <- summarise(age_groups_with_gender,
mean_friend_count = mean(friend_count),
median_friend_count = median(friend_count),
n= n())
head(pf.fc_by_age_gender, 10)
## Source: local data frame [10 x 5]
## Groups: age [5]
##
## age gender mean_friend_count median_friend_count n
## <int> <fctr> <dbl> <dbl> <int>
## 1 13 female 259.1606 148.0 193
## 2 13 male 102.1340 55.0 291
## 3 14 female 362.4286 224.0 847
## 4 14 male 164.1456 92.5 1078
## 5 15 female 538.6813 276.0 1139
## 6 15 male 200.6658 106.5 1478
## 7 16 female 519.5145 258.5 1238
## 8 16 male 239.6748 136.0 1848
## 9 17 female 538.9943 245.5 1236
## 10 17 male 236.4924 125.0 2045
# library(dplyr)
#
# pf.fc_by_age_gender <- filter(!is.na(gender)) %>%
# group_by(age, gender) %>%
# summarise(mean_friend_count = mean(friend_count),
# median_friend_count = median(friend_count),
# n= n()) %>%
# ungroup() %>%
# arrange(age)
#
# head(pf.fc_by_age_gender, 10)
Notes: Create a line graph showing the median friend count over the ages for each gender. (Same graph as earlier, just different syntax)
ggplot(aes(x = age, y = median_friend_count),
data = pf.fc_by_age_gender) +
geom_line(aes(color = gender))
Notes: Gender difference is largest at youngest ages, but numbers don’t help. Need to think in ratios of each other.
Notes: Old data is in long format. Need to move to long format. Use tidyr or reshape2 package.
install.packages('tidyr', repos = 'https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/tidyr_0.6.0.tgz')
## Warning: unable to access index for repository https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/tidyr_0.6.0.tgz/src/contrib:
## cannot download all files
## Warning: package 'tidyr' is not available (for R version 3.3.1)
## Warning: unable to access index for repository https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/tidyr_0.6.0.tgz/bin/macosx/mavericks/contrib/3.3:
## cannot download all files
library(tidyr)
install.packages('reshape2', repos = 'https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/reshape2_1.4.2.tgz')
## Warning: unable to access index for repository https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/reshape2_1.4.2.tgz/src/contrib:
## cannot download all files
## Warning: package 'reshape2' is not available (for R version 3.3.1)
## Warning: unable to access index for repository https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/reshape2_1.4.2.tgz/bin/macosx/mavericks/contrib/3.3:
## cannot download all files
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
Notes:
pf.fc_by_age_gender.wide <- dcast(pf.fc_by_age_gender,
age ~ gender,
value.var = 'median_friend_count')
head(pf.fc_by_age_gender.wide, 10)
## age female male
## 1 13 148.0 55.0
## 2 14 224.0 92.5
## 3 15 276.0 106.5
## 4 16 258.5 136.0
## 5 17 245.5 125.0
## 6 18 243.0 122.0
## 7 19 229.0 130.0
## 8 20 190.0 112.0
## 9 21 158.0 108.0
## 10 22 124.0 97.0
Notes:
ggplot(aes(x = age, y = female/male), data = pf.fc_by_age_gender.wide) +
geom_line() +
geom_hline(yintercept = 1, linetype = 'dashed', alpha = 0.5)
Notes: Use floor instead of round or ceiling because it is only full and complete years that matter when determining the year that the user joined.
pf$year_joined <- with(pf, floor(2014 - (tenure/365)))
Notes:
summary(pf$year_joined)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2005 2012 2012 2012 2013 2014 2
table(pf$year_joined)
##
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## 9 15 581 1507 4557 5448 9860 33366 43588 70
pf$year_joined.bucket <- cut(pf$year_joined, breaks = c(2004, 2009, 2011, 2012, 2014))
table(pf$year_joined.bucket)
##
## (2004,2009] (2009,2011] (2011,2012] (2012,2014]
## 6669 15308 33366 43658
Notes:
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(year_joined.bucket))) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = median)
Notes:
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(year_joined.bucket))) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
geom_line(linetype = 'dashed', stat = 'summary', fun.y = mean)
with(subset(pf, tenure > 1), summary(friend_count/tenure))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0775 0.2204 0.6069 0.5652 417.0000
What is the median friend rate? 0.22
What is the maximum friend rate? 417.00
pf.sub_tenure <- subset(pf, tenure >= 1)
with(pf.sub_tenure, summary(friendships_initiated/tenure))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0400 0.1244 0.3658 0.3409 149.5000
ggplot(aes(x = tenure, y = friendships_initiated/tenure),
data = pf.sub_tenure) +
geom_line(aes(color = year_joined.bucket),
stat = 'summary',
fun.y = mean)
Notes:
install.packages("gridExtra", repos = 'https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/gridExtra_2.2.1.tgz')
## Warning: unable to access index for repository https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/gridExtra_2.2.1.tgz/src/contrib:
## cannot download all files
## Warning: package 'gridExtra' is not available (for R version 3.3.1)
## Warning: unable to access index for repository https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/gridExtra_2.2.1.tgz/bin/macosx/mavericks/contrib/3.3:
## cannot download all files
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
p1 <- ggplot(aes(x = tenure, y = friendships_initiated / tenure),
data = subset(pf, tenure >= 1)) +
geom_line(aes(color = year_joined.bucket),
stat = 'summary',
fun.y = mean)
p2 <- ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
p3 <- ggplot(aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
p4 <- ggplot(aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
grid.arrange(p1, p2, p3, p4, ncol = 1)
ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
data = pf.sub_tenure) +
geom_smooth(aes(color = year_joined.bucket))
Notes: Sean is an Eagles fan.
Notes:
yo <- read.csv('yogurt.csv')
str(yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : int 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
yo$id <- factor(yo$id)
str(yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
Notes: Price doesn’t make senes. Most have the same price, there are a lot of gaps where nothing has that price.
qplot(x = price, data = yo)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Notes:
summary(yo)
## obs id time strawberry
## Min. : 1.0 2132290: 74 Min. : 9662 Min. : 0.0000
## 1st Qu.: 696.5 2130583: 59 1st Qu.: 9843 1st Qu.: 0.0000
## Median :1369.5 2124073: 50 Median :10045 Median : 0.0000
## Mean :1367.8 2149500: 50 Mean :10050 Mean : 0.6492
## 3rd Qu.:2044.2 2101790: 47 3rd Qu.:10255 3rd Qu.: 1.0000
## Max. :2743.0 2129528: 39 Max. :10459 Max. :11.0000
## (Other):2061
## blueberry pina.colada plain mixed.berry
## Min. : 0.0000 Min. : 0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 0.0000 Median : 0.0000 Median :0.0000 Median :0.0000
## Mean : 0.3571 Mean : 0.3584 Mean :0.2176 Mean :0.3887
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :12.0000 Max. :10.0000 Max. :6.0000 Max. :8.0000
##
## price
## Min. :20.00
## 1st Qu.:50.00
## Median :65.04
## Mean :59.25
## 3rd Qu.:68.96
## Max. :68.96
##
length(unique(yo))
## [1] 9
table(yo$price)
##
## 20 24.96 33.04 33.2 33.28 33.36 33.52 39.04 44 45.04 48.96 49.52
## 2 11 54 1 1 22 1 234 21 11 81 1
## 49.6 50 55.04 58.96 62 63.04 65.04 68.96
## 1 205 6 303 15 2 799 609
yo <- transform(yo, all.purchases = strawberry + blueberry + pina.colada + plain + mixed.berry)
summary(yo$all.purchases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 1.971 2.000 21.000
Notes:
ggplot(aes(x = time, y = price), data = yo) +
geom_point(alpha = 1/20)
set.seed(4230)
sample.ids <- sample(levels(yo$id), 16)
ggplot(aes(x = time, y = price),
data = subset(yo, id %in% sample.ids)) +
facet_wrap(~id) +
geom_line() +
geom_point(aes(size = all.purchases), pch = 1)
Quiz (different seed number)
set.seed(230)
sample.ids <- sample(levels(yo$id), 16)
ggplot(aes(x = time, y = price),
data = subset(yo, id %in% sample.ids)) +
facet_wrap(~id) +
geom_line() +
geom_point(aes(size = all.purchases), pch = 1)
Notes: It seems as though the jump in different prices could be caused by coupons. There are not many prices from 40 < x < 60, and there are large jumps rather than curves.
#install.packages("GGally", repos = 'https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/GGally_1.2.0.tgz')
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
theme_set(theme_minimal(20))
set.seed(1836)
pf_subset <- pf[, c(2:15)]
names(pf_subset)
## [1] "age" "dob_day"
## [3] "dob_year" "dob_month"
## [5] "gender" "tenure"
## [7] "friend_count" "friendships_initiated"
## [9] "likes" "likes_received"
## [11] "mobile_likes" "mobile_likes_received"
## [13] "www_likes" "www_likes_received"
ggpairs(pf_subset[sample.int(nrow(pf_subset), 1000), ])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Notes: What is going on here, I dont know.
Notes:
nci <- read.table("nci.tsv")
colnames(nci) <- c(1:64)
nci.long.samp <- melt(as.matrix(nci[1:200,]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)
## gene case value
## 1 1 1 0.300
## 2 2 1 1.180
## 3 3 1 0.550
## 4 4 1 1.140
## 5 5 1 -0.265
## 6 6 1 -0.070
ggplot(aes(y = gene, x = case, fill = value),
data = nci.long.samp) +
geom_tile() +
scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))