library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
pf <- read.csv('pseudo_facebook.tsv', sep = '\t')
glimpse(pf)
## Observations: 99,003
## Variables: 15
## $ userid <int> 2094382, 1192601, 2083884, 1203168, 1733...
## $ age <int> 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, ...
## $ dob_day <int> 19, 2, 16, 25, 4, 1, 14, 4, 1, 2, 22, 1,...
## $ dob_year <int> 1999, 1999, 1999, 1999, 1999, 1999, 2000...
## $ dob_month <int> 11, 11, 11, 12, 12, 12, 1, 1, 1, 2, 2, 2...
## $ gender <fctr> male, female, male, female, male, male,...
## $ tenure <int> 266, 6, 13, 93, 82, 15, 12, 0, 81, 171, ...
## $ friend_count <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ friendships_initiated <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ likes <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ likes_received <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ mobile_likes <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ mobile_likes_received <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ www_likes <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ www_likes_received <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
ggplot(data = pf)+
geom_point(mapping = aes(age, friend_count))
summary(pf$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 20.00 28.00 37.28 50.00 113.00
ggplot(data = pf)+
geom_point(mapping = aes(age, friend_count))+
xlim(13,90)
## Warning: Removed 4906 rows containing missing values (geom_point).
ggplot(data = pf)+
geom_point(mapping = aes(age, friend_count), alpha = 1/20)+
xlim(13,90)
## Warning: Removed 4906 rows containing missing values (geom_point).
ggplot(data = pf)+
geom_jitter(mapping = aes(age, friend_count), alpha =(1/20))+
xlim(13,90)
## Warning: Removed 5187 rows containing missing values (geom_point).
ggplot(data = pf)+
geom_point(mapping = aes(age, friend_count), alpha =(1/20))+
xlim(13,90)+
coord_trans(y='sqrt')
## Warning: Removed 4906 rows containing missing values (geom_point).
ggplot(data = pf)+
geom_jitter(mapping = aes(age, friend_count), alpha =(1/20),position = position_jitter(height = 0))+
xlim(13,90)+
coord_trans(y = 'sqrt')
## Warning: Removed 5176 rows containing missing values (geom_point).
ggplot(data = pf)+
geom_jitter(mapping = aes(age, friendships_initiated), alpha = 1/10, position = position_jitter(height = 0))+
xlim(13,90)+
coord_trans(y = 'sqrt')
## Warning: Removed 5176 rows containing missing values (geom_point).
##Conditional Means
library(dplyr)
pf.fc_by_age <- pf %>% group_by(age) %>% summarise(friend_count_mean = mean(friend_count), friend_count_median = median(friend_count), N = n()) %>% arrange(age)
head(pf.fc_by_age)
## # A tibble: 6 x 4
## age friend_count_mean friend_count_median N
## <int> <dbl> <dbl> <int>
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
library(ggplot2)
ggplot(data = pf.fc_by_age, aes(x = age, y = friend_count_mean))+
geom_line()+
xlim(30,90)
## Warning: Removed 40 rows containing missing values (geom_path).
ggplot(data=pf,aes(age, friend_count)) +
coord_cartesian(xlim = c(13,70), ylim = c(0,1000)) +
geom_point(alpha = 0.05, position=position_jitter(h = 0), color = 'orange') +
geom_line(stat = 'summary', fun.y = mean) +
geom_line(stat = 'summary', fun.y = quantile, fun.args=list(probs = .1), linetype = 2, color = 'blue') +
geom_line(stat = 'summary', fun.y = quantile, fun.args=list(probs = .5), linetype = 2, color = 'blue') +
geom_line(stat = 'summary', fun.y = quantile, fun.args=list(probs = .9), linetype = 2, color = 'blue')
cor.test(pf$age,pf$friend_count)
##
## Pearson's product-moment correlation
##
## data: pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
ggplot(data = pf, aes(www_likes_received,likes_received))+
geom_jitter(alpha = 1/50, position = position_jitter(height = 0))+
coord_cartesian(xlim = c(0, quantile(pf$www_likes_received, .95)), ylim = c(0, quantile(pf$likes_received, 0.95)))+
geom_smooth(method = 'lm', color = 'red')
cor.test(pf$www_likes_received, pf$likes_received)
##
## Pearson's product-moment correlation
##
## data: pf$www_likes_received and pf$likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
pf$age_with_months <-pf$age + (1 - pf$dob_month / 12)
pf.fc_by_age_months <- pf %>% group_by(age_with_months) %>% summarise(friend_count_mean = mean(friend_count), friend_count_median = median(friend_count),n = n()) %>% arrange(age_with_months)
head(pf.fc_by_age_months)
## # A tibble: 6 x 4
## age_with_months friend_count_mean friend_count_median n
## <dbl> <dbl> <dbl> <int>
## 1 13.16667 46.33333 30.5 6
## 2 13.25000 115.07143 23.5 14
## 3 13.33333 136.20000 44.0 25
## 4 13.41667 164.24242 72.0 33
## 5 13.50000 131.17778 66.0 45
## 6 13.58333 156.81481 64.0 54
ggplot(data = subset(pf.fc_by_age_months, pf.fc_by_age_months$age_with_months < 71), aes(y = friend_count_mean, x = age_with_months))+
geom_line()
ggplot(data = subset(pf.fc_by_age, age < 71), aes(x = age, y = friend_count_mean))+
geom_line()
p2 <- ggplot(data = subset(pf.fc_by_age_months, pf.fc_by_age_months$age_with_months < 71), aes(y = friend_count_mean, x = age_with_months))+
geom_line()
p1 <- ggplot(data = subset(pf.fc_by_age, age < 71), aes(x = age, y = friend_count_mean))+
geom_line()
p3 <- ggplot(data = subset(pf.fc_by_age, age < 71), aes(x = round(age/5)*5, y = friend_count_mean))+
geom_line(stat = 'summary', fun.y = mean)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(p2,p1,p3, ncol = 1)
p2 <- ggplot(data = subset(pf.fc_by_age_months, pf.fc_by_age_months$age_with_months < 71), aes(y = friend_count_mean, x = age_with_months))+
geom_line()+
geom_smooth()
p1 <- ggplot(data = subset(pf.fc_by_age, age < 71), aes(x = age, y = friend_count_mean))+
geom_line()+
geom_smooth()
p3 <- ggplot(data = subset(pf.fc_by_age, age < 71), aes(x = round(age/5)*5, y = friend_count_mean))+
geom_line(stat = 'summary', fun.y = mean)
library(gridExtra)
grid.arrange(p2,p1,p3, ncol = 1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Multivarite Analysis of Facebook Dataset
ggplot(data = filter(pf, !is.na(gender)), mapping = aes(x = gender, y = age)) +
geom_boxplot() +
stat_summary(fun.y = mean, geom = 'point', shape = 4)
ggplot(data = filter(pf, !is.na(gender)), mapping = aes(x = age, y = friend_count)) +
geom_line(aes(color = gender), stat = 'summary', fun.y = median)
pf.fc_by_age_gender <- pf %>% filter(!is.na(gender)) %>% group_by(age,gender) %>% summarise(mean_friend_count = as.integer(mean(friend_count)), median_friend_count = as.integer(median(as.numeric(friend_count))), n = n())
head(pf.fc_by_age_gender)
## Source: local data frame [6 x 5]
## Groups: age [3]
##
## # A tibble: 6 x 5
## age gender mean_friend_count median_friend_count n
## <int> <fctr> <int> <int> <int>
## 1 13 female 259 148 193
## 2 13 male 102 55 291
## 3 14 female 362 224 847
## 4 14 male 164 92 1078
## 5 15 female 538 276 1139
## 6 15 male 200 106 1478
tail(pf.fc_by_age_gender)
## Source: local data frame [6 x 5]
## Groups: age [3]
##
## # A tibble: 6 x 5
## age gender mean_friend_count median_friend_count n
## <int> <fctr> <int> <int> <int>
## 1 111 female 244 109 7
## 2 111 male 246 191 10
## 3 112 female 201 203 5
## 4 112 male 594 95 13
## 5 113 female 278 198 105
## 6 113 male 410 223 91
ggplot(data = filter(pf.fc_by_age_gender, !is.na(gender)), mapping = aes(x = age, y = median_friend_count)) +
geom_line(aes(color = gender)) +
stat_summary(fun.y = median)
## Warning: Removed 101 rows containing missing values (geom_pointrange).
library(tidyr)
pf.fc_by_age_gender.wide <- pf.fc_by_age_gender %>% select(age, gender, median_friend_count) %>% filter(!is.na(gender)) %>% spread(gender, median_friend_count) %>% mutate(ratio = female/male)
head(pf.fc_by_age_gender.wide)
## Source: local data frame [6 x 4]
## Groups: age [6]
##
## # A tibble: 6 x 4
## age female male ratio
## <int> <int> <int> <dbl>
## 1 13 148 55 2.690909
## 2 14 224 92 2.434783
## 3 15 276 106 2.603774
## 4 16 258 136 1.897059
## 5 17 245 125 1.960000
## 6 18 243 122 1.991803
ggplot(data = pf.fc_by_age_gender.wide, aes(x = age, y = ratio)) +
geom_line() +
geom_hline(yintercept = 1,alpha = 0.3, linetype = 2)
pf$year_joined <- floor(2014 - (pf$tenure/365))
head(pf)
## userid age dob_day dob_year dob_month gender tenure friend_count
## 1 2094382 14 19 1999 11 male 266 0
## 2 1192601 14 2 1999 11 female 6 0
## 3 2083884 14 16 1999 11 male 13 0
## 4 1203168 14 25 1999 12 female 93 0
## 5 1733186 14 4 1999 12 male 82 0
## 6 1524765 14 1 1999 12 male 15 0
## friendships_initiated likes likes_received mobile_likes
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## mobile_likes_received www_likes www_likes_received age_with_months
## 1 0 0 0 14.08333
## 2 0 0 0 14.08333
## 3 0 0 0 14.08333
## 4 0 0 0 14.00000
## 5 0 0 0 14.00000
## 6 0 0 0 14.00000
## year_joined
## 1 2013
## 2 2013
## 3 2013
## 4 2013
## 5 2013
## 6 2013
summary(pf$year_joined)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2005 2012 2012 2012 2013 2014 2
table(pf$year_joined)
##
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## 9 15 581 1507 4557 5448 9860 33366 43588 70
pf %>% group_by(year_joined) %>% summarise(count = n())
## # A tibble: 11 x 2
## year_joined count
## <dbl> <int>
## 1 2005 9
## 2 2006 15
## 3 2007 581
## 4 2008 1507
## 5 2009 4557
## 6 2010 5448
## 7 2011 9860
## 8 2012 33366
## 9 2013 43588
## 10 2014 70
## 11 NA 2
pf$year_joined.bucket <- cut(pf$year_joined, breaks = c(2004,2009,2011,2012,2014))
table(pf$year_joined.bucket)
##
## (2004,2009] (2009,2011] (2011,2012] (2012,2014]
## 6669 15308 33366 43658
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(year_joined.bucket))) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = median)
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(year_joined.bucket))) +
geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
geom_line(stat = 'summary', fun.y = mean, alpha = 0.5, linetype = 2)