library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
pf <- read.csv('pseudo_facebook.tsv', sep = '\t')
glimpse(pf)
## Observations: 99,003
## Variables: 15
## $ userid                <int> 2094382, 1192601, 2083884, 1203168, 1733...
## $ age                   <int> 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, ...
## $ dob_day               <int> 19, 2, 16, 25, 4, 1, 14, 4, 1, 2, 22, 1,...
## $ dob_year              <int> 1999, 1999, 1999, 1999, 1999, 1999, 2000...
## $ dob_month             <int> 11, 11, 11, 12, 12, 12, 1, 1, 1, 2, 2, 2...
## $ gender                <fctr> male, female, male, female, male, male,...
## $ tenure                <int> 266, 6, 13, 93, 82, 15, 12, 0, 81, 171, ...
## $ friend_count          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ friendships_initiated <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ likes                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ likes_received        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ mobile_likes          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ mobile_likes_received <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ www_likes             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ www_likes_received    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
ggplot(data = pf)+
  geom_point(mapping = aes(age, friend_count))

summary(pf$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00
ggplot(data = pf)+
  geom_point(mapping = aes(age, friend_count))+
  xlim(13,90)
## Warning: Removed 4906 rows containing missing values (geom_point).

ggplot(data = pf)+
  geom_point(mapping = aes(age, friend_count), alpha = 1/20)+
  xlim(13,90)
## Warning: Removed 4906 rows containing missing values (geom_point).

ggplot(data = pf)+
  geom_jitter(mapping = aes(age, friend_count), alpha =(1/20))+
  xlim(13,90)
## Warning: Removed 5187 rows containing missing values (geom_point).

ggplot(data = pf)+
  geom_point(mapping = aes(age, friend_count), alpha =(1/20))+
  xlim(13,90)+
  coord_trans(y='sqrt')
## Warning: Removed 4906 rows containing missing values (geom_point).

ggplot(data = pf)+
  geom_jitter(mapping = aes(age, friend_count), alpha =(1/20),position = position_jitter(height = 0))+
  xlim(13,90)+
  coord_trans(y = 'sqrt')
## Warning: Removed 5176 rows containing missing values (geom_point).

ggplot(data = pf)+
  geom_jitter(mapping = aes(age, friendships_initiated), alpha = 1/10, position = position_jitter(height = 0))+
  xlim(13,90)+
  coord_trans(y = 'sqrt')
## Warning: Removed 5176 rows containing missing values (geom_point).

##Conditional Means

library(dplyr)
pf.fc_by_age <- pf %>% group_by(age) %>% summarise(friend_count_mean = mean(friend_count), friend_count_median = median(friend_count), N = n()) %>% arrange(age)
head(pf.fc_by_age)
## # A tibble: 6 x 4
##     age friend_count_mean friend_count_median     N
##   <int>             <dbl>               <dbl> <int>
## 1    13          164.7500                74.0   484
## 2    14          251.3901               132.0  1925
## 3    15          347.6921               161.0  2618
## 4    16          351.9371               171.5  3086
## 5    17          350.3006               156.0  3283
## 6    18          331.1663               162.0  5196
library(ggplot2)
ggplot(data = pf.fc_by_age, aes(x = age, y = friend_count_mean))+
 geom_line()+
 xlim(30,90)
## Warning: Removed 40 rows containing missing values (geom_path).

ggplot(data=pf,aes(age, friend_count)) +
  coord_cartesian(xlim = c(13,70), ylim = c(0,1000)) +
  geom_point(alpha = 0.05, position=position_jitter(h = 0), color = 'orange') +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = quantile, fun.args=list(probs = .1), linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, fun.args=list(probs = .5), linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, fun.args=list(probs = .9), linetype = 2, color = 'blue')

cor.test(pf$age,pf$friend_count)
## 
##  Pearson's product-moment correlation
## 
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
ggplot(data = pf, aes(www_likes_received,likes_received))+
  geom_jitter(alpha = 1/50, position = position_jitter(height = 0))+
  coord_cartesian(xlim = c(0, quantile(pf$www_likes_received, .95)), ylim = c(0, quantile(pf$likes_received, 0.95)))+
geom_smooth(method = 'lm', color = 'red')

cor.test(pf$www_likes_received, pf$likes_received)
## 
##  Pearson's product-moment correlation
## 
## data:  pf$www_likes_received and pf$likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902
pf$age_with_months <-pf$age + (1 - pf$dob_month / 12)
pf.fc_by_age_months <- pf %>% group_by(age_with_months) %>% summarise(friend_count_mean = mean(friend_count), friend_count_median = median(friend_count),n = n()) %>% arrange(age_with_months)
head(pf.fc_by_age_months)
## # A tibble: 6 x 4
##   age_with_months friend_count_mean friend_count_median     n
##             <dbl>             <dbl>               <dbl> <int>
## 1        13.16667          46.33333                30.5     6
## 2        13.25000         115.07143                23.5    14
## 3        13.33333         136.20000                44.0    25
## 4        13.41667         164.24242                72.0    33
## 5        13.50000         131.17778                66.0    45
## 6        13.58333         156.81481                64.0    54
ggplot(data = subset(pf.fc_by_age_months, pf.fc_by_age_months$age_with_months < 71), aes(y = friend_count_mean, x = age_with_months))+
  geom_line()

ggplot(data = subset(pf.fc_by_age, age < 71), aes(x = age, y = friend_count_mean))+
 geom_line()

p2 <- ggplot(data = subset(pf.fc_by_age_months, pf.fc_by_age_months$age_with_months < 71), aes(y = friend_count_mean, x = age_with_months))+
  geom_line()
p1 <- ggplot(data = subset(pf.fc_by_age, age < 71), aes(x = age, y = friend_count_mean))+
 geom_line()
p3 <- ggplot(data = subset(pf.fc_by_age, age < 71), aes(x = round(age/5)*5, y = friend_count_mean))+
 geom_line(stat = 'summary', fun.y = mean)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(p2,p1,p3, ncol = 1)

p2 <- ggplot(data = subset(pf.fc_by_age_months, pf.fc_by_age_months$age_with_months < 71), aes(y = friend_count_mean, x = age_with_months))+
  geom_line()+
  geom_smooth()
p1 <- ggplot(data = subset(pf.fc_by_age, age < 71), aes(x = age, y = friend_count_mean))+
 geom_line()+
  geom_smooth()
p3 <- ggplot(data = subset(pf.fc_by_age, age < 71), aes(x = round(age/5)*5, y = friend_count_mean))+
 geom_line(stat = 'summary', fun.y = mean)
library(gridExtra)
grid.arrange(p2,p1,p3, ncol = 1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## Multivarite Analysis of Facebook Dataset

ggplot(data = filter(pf, !is.na(gender)), mapping = aes(x = gender, y = age)) +
  geom_boxplot() +
  stat_summary(fun.y = mean, geom = 'point', shape = 4)

ggplot(data = filter(pf, !is.na(gender)), mapping = aes(x = age, y = friend_count)) +
  geom_line(aes(color = gender), stat = 'summary', fun.y = median)

pf.fc_by_age_gender <-  pf %>% filter(!is.na(gender)) %>% group_by(age,gender) %>% summarise(mean_friend_count = as.integer(mean(friend_count)), median_friend_count = as.integer(median(as.numeric(friend_count))), n = n())
head(pf.fc_by_age_gender)
## Source: local data frame [6 x 5]
## Groups: age [3]
## 
## # A tibble: 6 x 5
##     age gender mean_friend_count median_friend_count     n
##   <int> <fctr>             <int>               <int> <int>
## 1    13 female               259                 148   193
## 2    13   male               102                  55   291
## 3    14 female               362                 224   847
## 4    14   male               164                  92  1078
## 5    15 female               538                 276  1139
## 6    15   male               200                 106  1478
tail(pf.fc_by_age_gender)
## Source: local data frame [6 x 5]
## Groups: age [3]
## 
## # A tibble: 6 x 5
##     age gender mean_friend_count median_friend_count     n
##   <int> <fctr>             <int>               <int> <int>
## 1   111 female               244                 109     7
## 2   111   male               246                 191    10
## 3   112 female               201                 203     5
## 4   112   male               594                  95    13
## 5   113 female               278                 198   105
## 6   113   male               410                 223    91
ggplot(data = filter(pf.fc_by_age_gender, !is.na(gender)), mapping = aes(x = age, y = median_friend_count)) +
geom_line(aes(color = gender)) +
stat_summary(fun.y = median)
## Warning: Removed 101 rows containing missing values (geom_pointrange).

library(tidyr)
pf.fc_by_age_gender.wide <- pf.fc_by_age_gender %>% select(age, gender, median_friend_count) %>% filter(!is.na(gender)) %>% spread(gender, median_friend_count) %>% mutate(ratio = female/male)
head(pf.fc_by_age_gender.wide)
## Source: local data frame [6 x 4]
## Groups: age [6]
## 
## # A tibble: 6 x 4
##     age female  male    ratio
##   <int>  <int> <int>    <dbl>
## 1    13    148    55 2.690909
## 2    14    224    92 2.434783
## 3    15    276   106 2.603774
## 4    16    258   136 1.897059
## 5    17    245   125 1.960000
## 6    18    243   122 1.991803
ggplot(data = pf.fc_by_age_gender.wide, aes(x = age, y = ratio)) +
  geom_line() +
  geom_hline(yintercept = 1,alpha = 0.3, linetype = 2)

pf$year_joined <- floor(2014 - (pf$tenure/365))
head(pf)
##    userid age dob_day dob_year dob_month gender tenure friend_count
## 1 2094382  14      19     1999        11   male    266            0
## 2 1192601  14       2     1999        11 female      6            0
## 3 2083884  14      16     1999        11   male     13            0
## 4 1203168  14      25     1999        12 female     93            0
## 5 1733186  14       4     1999        12   male     82            0
## 6 1524765  14       1     1999        12   male     15            0
##   friendships_initiated likes likes_received mobile_likes
## 1                     0     0              0            0
## 2                     0     0              0            0
## 3                     0     0              0            0
## 4                     0     0              0            0
## 5                     0     0              0            0
## 6                     0     0              0            0
##   mobile_likes_received www_likes www_likes_received age_with_months
## 1                     0         0                  0        14.08333
## 2                     0         0                  0        14.08333
## 3                     0         0                  0        14.08333
## 4                     0         0                  0        14.00000
## 5                     0         0                  0        14.00000
## 6                     0         0                  0        14.00000
##   year_joined
## 1        2013
## 2        2013
## 3        2013
## 4        2013
## 5        2013
## 6        2013
summary(pf$year_joined)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2005    2012    2012    2012    2013    2014       2
table(pf$year_joined)
## 
##  2005  2006  2007  2008  2009  2010  2011  2012  2013  2014 
##     9    15   581  1507  4557  5448  9860 33366 43588    70
pf %>% group_by(year_joined) %>% summarise(count = n())
## # A tibble: 11 x 2
##    year_joined count
##          <dbl> <int>
##  1        2005     9
##  2        2006    15
##  3        2007   581
##  4        2008  1507
##  5        2009  4557
##  6        2010  5448
##  7        2011  9860
##  8        2012 33366
##  9        2013 43588
## 10        2014    70
## 11          NA     2
pf$year_joined.bucket <- cut(pf$year_joined, breaks = c(2004,2009,2011,2012,2014))
table(pf$year_joined.bucket)
## 
## (2004,2009] (2009,2011] (2011,2012] (2012,2014] 
##        6669       15308       33366       43658
ggplot(aes(x = age, y = friend_count),
       data = subset(pf, !is.na(year_joined.bucket))) +
  geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = median)

ggplot(aes(x = age, y = friend_count),
       data = subset(pf, !is.na(year_joined.bucket))) +
  geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = mean, alpha = 0.5, linetype = 2)