Notes:
Notes:
Notes:
library(ggplot2)
# Switch to the project folder (note: machine-specific absolute path) and
# read the tab-separated pseudo-Facebook data set into `pf`.
setwd("/Users/wangkai/Documents/Rwork/edaTest")
pf <- read.csv("pseudo_facebook.tsv", sep = "\t")
# Box plot of age for each gender; rows with a missing gender are dropped.
# The "x" marker (shape 4) overlays the mean age of each group.
# `fun` replaces `fun.y`, which was deprecated in ggplot2 3.3.0
# (so this form needs ggplot2 >= 3.3.0).
ggplot(subset(pf, !is.na(gender)), aes(x = gender, y = age)) +
  geom_boxplot() +
  stat_summary(fun = mean, geom = "point", shape = 4)
# Median friend count at each age, one line per gender (missing gender
# excluded).
# The summary function is passed as `fun`: from ggplot2 3.3.0 on, a layer
# built with stat = "summary" no longer recognises `fun.y`, ignores it, and
# falls back to mean_se() -- which would draw means, not the medians wanted
# here. Requires ggplot2 >= 3.3.0.
ggplot(subset(pf, !is.na(gender)), aes(x = age, y = friend_count)) +
  geom_line(aes(color = gender), stat = "summary", fun = median)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Friend-count summary per (age, gender) pair, skipping rows without a
# gender: mean, median, and the number of rows in each group.
pf.fc_by_age_gender <- pf %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(
    mean_friend_count = mean(friend_count),
    median_friend_count = median(friend_count),
    n = n()
  )
# My own version of the same summary, written without the pipe:
# group the gender-complete rows first, then summarise each group.
gourpby_age_gender <- group_by(subset(pf, !is.na(gender)), age, gender)
fc_by_age_gender <- summarise(
  gourpby_age_gender,
  mean_friend_count = mean(friend_count),
  median_friend_count = median(friend_count),
  n = n()
)
# Preview the first rows of the result.
head(fc_by_age_gender)
## Source: local data frame [6 x 5]
## Groups: age [3]
##
## age gender mean_friend_count median_friend_count n
## <int> <fctr> <dbl> <dbl> <int>
## 1 13 female 259.1606 148.0 193
## 2 13 male 102.1340 55.0 291
## 3 14 female 362.4286 224.0 847
## 4 14 male 164.1456 92.5 1078
## 5 15 female 538.6813 276.0 1139
## 6 15 male 200.6658 106.5 1478
# Preview the piped summary too (first six rows, head()'s default).
head(pf.fc_by_age_gender, n = 6L)
## Source: local data frame [6 x 5]
## Groups: age [3]
##
## age gender mean_friend_count median_friend_count n
## <int> <fctr> <dbl> <dbl> <int>
## 1 13 female 259.1606 148.0 193
## 2 13 male 102.1340 55.0 291
## 3 14 female 362.4286 224.0 847
## 4 14 male 164.1456 92.5 1078
## 5 15 female 538.6813 276.0 1139
## 6 15 male 200.6658 106.5 1478
Notes:
# Plot the precomputed medians directly: one line per gender across age.
ggplot(fc_by_age_gender, aes(x = age, y = median_friend_count)) +
  geom_line(aes(color = gender))
Notes:
Notes:
Notes:
#install.packages('reshape2')
library(reshape2)
# Reshape long -> wide: one row per age, one column per gender, with the
# median friend count as the cell value.
pf.fc_by_age_gender.wide <- dcast(
  pf.fc_by_age_gender,
  age ~ gender,
  value.var = "median_friend_count"
)
Notes:
# Ratio of female to male median friend count by age. The dashed, faded
# horizontal line at 1 marks where both medians are equal.
ggplot(pf.fc_by_age_gender.wide, aes(x = age, y = female / male)) +
  geom_line() +
  geom_hline(yintercept = 1, alpha = 0.3, linetype = 2) +
  labs(
    x = "Age",
    y = "Median Female Friend Count / Median Male Friend Count"
  )
Notes:
# Earlier attempt (rounding tenure down), kept for reference:
# pf$year_joined <- 2014-floor(pf$tenure/365)
# table(pf$year_joined)

# Year joined = 2014 minus tenure in years, rounded up
# (tenure is presumably recorded in days -- hence the division by 365).
pf[["year_joined"]] <- 2014 - ceiling(pf[["tenure"]] / 365)
# Count of users per join year.
table(pf[["year_joined"]])
##
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## 9 15 581 1507 4557 5448 9860 33366 43588 70
Notes:
# Bin join years into four right-closed intervals:
# (2004,2009], (2009,2011], (2011,2012], (2012,2014].
pf$year_joined.bucket <- cut(
  pf$year_joined,
  breaks = c(2004, 2009, 2011, 2012, 2014),
  right = TRUE
)
# Bucket sizes (including the count of NAs).
summary(pf$year_joined.bucket)
## (2004,2009] (2009,2011] (2011,2012] (2012,2014] NA's
## 6669 15308 33366 43658 2
Notes:
# Median friend count by age, one line per join-year bucket; rows whose
# bucket is NA are dropped.
# `fun` (not the deprecated `fun.y`) carries the summary function: ggplot2
# 3.3.0+ ignores `fun.y` on a stat = "summary" layer and would fall back to
# mean_se(), plotting means instead of medians. Requires ggplot2 >= 3.3.0.
ggplot(subset(pf, !is.na(year_joined.bucket)),
       aes(x = age, y = friend_count)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = median)
Notes:
# Mean friend count by age for each join-year bucket (coloured lines), with
# the overall mean across all buckets drawn as a dashed line (linetype 2).
# `fun` replaces the deprecated `fun.y`, which ggplot2 3.3.0+ ignores on a
# stat = "summary" layer. Requires ggplot2 >= 3.3.0.
ggplot(subset(pf, !is.na(year_joined.bucket)),
       aes(x = age, y = friend_count)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean) +
  geom_line(stat = "summary", fun = mean, linetype = 2)
Notes:
Notes:
What is the median friend rate?
What is the maximum friend rate?
# Friending rate (friendships initiated per unit of tenure) against tenure,
# averaged at each tenure value and coloured by join-year bucket.
# Only rows with tenure > 0 are used, so the division is never by zero.
# `fun` replaces the deprecated `fun.y`, which ggplot2 3.3.0+ ignores on a
# stat = "summary" layer. Requires ggplot2 >= 3.3.0.
ggplot(
  subset(pf, tenure > 0),
  aes(x = tenure, y = friendships_initiated / tenure,
      color = year_joined.bucket)
) +
  geom_line(stat = "summary", fun = mean)
Notes:
# Mean friendships initiated per unit of tenure at each tenure value, one
# line per join-year bucket. Restricted to tenure >= 1 so the rate is
# always defined.
# `fun` replaces the deprecated `fun.y`, which ggplot2 3.3.0+ ignores on a
# stat = "summary" layer. Requires ggplot2 >= 3.3.0.
ggplot(subset(pf, tenure >= 1),
       aes(x = tenure, y = friendships_initiated / tenure)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)
# The same rate-versus-tenure plot with progressively coarser x bins:
# tenure is rounded to the nearest multiple of 7, 30, and 90 before the mean
# is taken, which trades detail for a smoother line.
# `fun` replaces the deprecated `fun.y`, which ggplot2 3.3.0+ ignores on a
# stat = "summary" layer. Requires ggplot2 >= 3.3.0.

# Bin width 7.
ggplot(subset(pf, tenure > 0),
       aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)

# Bin width 30.
ggplot(subset(pf, tenure > 0),
       aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)

# Bin width 90.
ggplot(subset(pf, tenure > 0),
       aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)
# Instead of binned means, let geom_smooth() fit a smoother per join-year
# bucket over the 7-rounded tenure values.
ggplot(subset(pf, tenure > 0),
       aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure)) +
  geom_smooth(aes(color = year_joined.bucket))
Notes:
Notes:
Notes:
# Load the yogurt data set and treat the `id` column as a categorical
# identifier rather than a number, then inspect the structure.
yo <- read.csv("yogurt.csv")
yo[["id"]] <- factor(yo[["id"]])
str(yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
# Distribution of price in bins of width 10.
ggplot(yo, aes(x = price)) +
  geom_histogram(binwidth = 10)
Notes:
# Total count per row: sum of the five flavour columns.
yo$all.purchases <- with(
  yo,
  strawberry + blueberry + pina.colada + plain + mixed.berry
)
head(yo$all.purchases)
## [1] 1 1 1 1 3 4
# Distribution of the per-row totals in bins of width 2.
ggplot(yo, aes(x = all.purchases)) +
  geom_histogram(binwidth = 2)
Notes:
# Price against time; points are jittered and mostly transparent to reduce
# overplotting.
ggplot(yo, aes(x = time, y = price)) +
  geom_point(alpha = 0.1, position = "jitter")
Notes:
# Draw a reproducible sample of 16 ids and plot each one's price over time
# in its own facet. Open circles (shape 1) are sized by all.purchases.
set.seed(42)
sample.ids <- sample(levels(yo$id), 16)
ggplot(subset(yo, id %in% sample.ids), aes(x = time, y = price)) +
  facet_wrap(~id) +
  geom_line() +
  geom_point(aes(size = all.purchases), shape = 1)
Notes:
Notes:
#install.packages('GGally')
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
Notes:
#install.packages('GGally')
library(GGally)
# Scatterplot matrix on a random sample of 1000 rows, using columns 2-15
# of `pf`.
theme_set(theme_minimal(20))
# Fix the seed so the sampled rows are the same on every run.
set.seed(1836)
pf_subset <- pf[, 2:15]
ggpairs(pf_subset[sample.int(nrow(pf_subset), 1000), ])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Notes:
Notes:
# Read the nci table and label its 64 columns 1..64.
nci <- read.table("nci.tsv")
colnames(nci) <- 1:64
# Melt the first 200 rows into long format: one row per (gene, case) cell.
nci.long.samp <- melt(as.matrix(nci[1:200, ]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)
## gene case value
## 1 1 1 0.300
## 2 2 1 1.180
## 3 3 1 0.550
## 4 4 1 1.140
## 5 5 1 -0.265
## 6 6 1 -0.070
# Heat map: cases on x, genes on y, fill on a 100-step blue-to-red ramp.
ggplot(nci.long.samp, aes(x = case, y = gene, fill = value)) +
  geom_tile() +
  scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))
# Rebuild the long-format sample from the first 200 rows of `nci` and draw
# the heat map again.
nci.long.samp <- melt(as.matrix(nci[1:200, ]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)
## gene case value
## 1 1 1 0.300
## 2 2 1 1.180
## 3 3 1 0.550
## 4 4 1 1.140
## 5 5 1 -0.265
## 6 6 1 -0.070
# One tile per (case, gene) cell, filled on a 100-step blue-to-red ramp.
ggplot(nci.long.samp, aes(x = case, y = gene, fill = value)) +
  geom_tile() +
  scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))
Reflection:
Click Knit HTML to see all of your hard work and to get an HTML page of this lesson, your answers, and your notes!