library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
# Load the tab-separated pseudo-Facebook user table into `pf` and print a
# compact column-by-column overview of it.
# read.delim() is the tab-separated counterpart of read.csv().
pf <- read.delim('pseudo_facebook.tsv')
glimpse(pf)
## Observations: 99,003
## Variables: 15
## $ userid                <int> 2094382, 1192601, 2083884, 1203168, 1733...
## $ age                   <int> 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, ...
## $ dob_day               <int> 19, 2, 16, 25, 4, 1, 14, 4, 1, 2, 22, 1,...
## $ dob_year              <int> 1999, 1999, 1999, 1999, 1999, 1999, 2000...
## $ dob_month             <int> 11, 11, 11, 12, 12, 12, 1, 1, 1, 2, 2, 2...
## $ gender                <fctr> male, female, male, female, male, male,...
## $ tenure                <int> 266, 6, 13, 93, 82, 15, 12, 0, 81, 171, ...
## $ friend_count          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ friendships_initiated <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ likes                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ likes_received        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ mobile_likes          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ mobile_likes_received <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ www_likes             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ www_likes_received    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
# First look: unadorned scatterplot of friend count against age, followed by
# the numeric summary of the age column.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point()

summary(pf[["age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00
# Same scatterplot with the x axis limited to ages 13-90; rows falling
# outside those limits are removed, which triggers the warning.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).

# Draw every point at 1/20 opacity so heavily overplotted regions stand out
# from sparse ones.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).

# `age` is stored as whole years, so the points stack in vertical columns.
# geom_jitter() with its default position spreads them out; transparency is
# kept at 1/20.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 5188 rows containing missing values (geom_point).

# Transparent scatterplot with a square-root transformed y coordinate, which
# stretches the low friend counts and compresses the high ones.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt')
## Warning: Removed 4906 rows containing missing values (geom_point).

# Jittered, transparent scatterplot on a square-root y coordinate.
# Jitter is horizontal only (height = 0): vertical noise could push a zero
# friend count below zero, where the square root is undefined.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt')
## Warning: Removed 5189 rows containing missing values (geom_point).

# Same recipe applied to friendships initiated: horizontal-only jitter,
# 1/10 opacity, ages 13-90, square-root y coordinate.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1 / 10, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt')
## Warning: Removed 5183 rows containing missing values (geom_point).

## Conditional Means

library(dplyr)
# One row per age: mean and median friend count plus the number of users in
# that age group (N), ordered by age. The first rows are printed as a check.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    N = n()
  ) %>%
  arrange(age)
head(pf.fc_by_age)
## # A tibble: 6 x 4
##     age friend_count_mean friend_count_median     N
##   <int>             <dbl>               <dbl> <int>
## 1    13          164.7500                74.0   484
## 2    14          251.3901               132.0  1925
## 3    15          347.6921               161.0  2618
## 4    16          351.9371               171.5  3086
## 5    17          350.3006               156.0  3283
## 6    18          331.1663               162.0  5196
library(ggplot2)
# Line plot of the mean friend count per age, limited to ages 30-90; ages
# outside the limits are removed from the line.
ggplot(pf.fc_by_age, aes(age, friend_count_mean)) +
  geom_line() +
  xlim(30, 90)
## Warning: Removed 40 rows containing missing values (geom_path).

# Raw data with conditional summaries overlaid.
#   - orange points: every user, horizontally jittered, 5% opacity
#   - solid line:    mean friend count at each age
#   - dashed blue:   10th, 50th and 90th percentile of friend count at each age
# The visible window is ages 13-70 and friend counts 0-1000.
# NOTE(review): newer ggplot2 releases deprecate `fun.y` in favour of `fun`.
# It is kept here because the recorded output appears to come from an older
# ggplot2 -- confirm the installed version before renaming.
ggplot(data = pf, aes(x = age, y = friend_count)) +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
  # `height` is written out in full; the abbreviated `h` only worked through
  # partial argument matching.
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0),
    color = 'orange'
  ) +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(
    stat = 'summary', fun.y = quantile, fun.args = list(probs = .1),
    linetype = 2, color = 'blue'
  ) +
  geom_line(
    stat = 'summary', fun.y = quantile, fun.args = list(probs = .5),
    linetype = 2, color = 'blue'
  ) +
  geom_line(
    stat = 'summary', fun.y = quantile, fun.args = list(probs = .9),
    linetype = 2, color = 'blue'
  )

# Pearson correlation test between age and friend count.
cor.test(x = pf$age, y = pf$friend_count, method = 'pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
# Likes received overall against likes received on the web: horizontally
# jittered points at 1/50 opacity with a red linear-model fit drawn on top.
# Both axes are cut off at the 95th percentile of their variable via
# coord_cartesian().
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point(alpha = 1 / 50, position = position_jitter(height = 0)) +
  geom_smooth(method = 'lm', color = 'red') +
  coord_cartesian(
    xlim = c(0, quantile(pf$www_likes_received, probs = 0.95)),
    ylim = c(0, quantile(pf$likes_received, probs = 0.95))
  )

# Pearson correlation test between web likes received and total likes received.
cor.test(x = pf$www_likes_received, y = pf$likes_received, method = 'pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  pf$www_likes_received and pf$likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902
# Fractional age: age in years plus (1 - dob_month / 12), so an earlier birth
# month gives a larger fraction.
pf[['age_with_months']] <- with(pf, age + (1 - dob_month / 12))

# Same summary as the per-year table, grouped by the fractional age instead:
# mean and median friend count plus group size (n), ordered by age.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)
head(pf.fc_by_age_months)
## # A tibble: 6 x 4
##   age_with_months friend_count_mean friend_count_median     n
##             <dbl>             <dbl>               <dbl> <int>
## 1        13.16667          46.33333                30.5     6
## 2        13.25000         115.07143                23.5    14
## 3        13.33333         136.20000                44.0    25
## 4        13.41667         164.24242                72.0    33
## 5        13.50000         131.17778                66.0    45
## 6        13.58333         156.81481                64.0    54
# Mean friend count per month-resolution age, for ages below 71.
ggplot(
  subset(pf.fc_by_age_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

# Mean friend count per whole-year age, for ages below 71.
ggplot(
  subset(pf.fc_by_age, age < 71),
  aes(age, friend_count_mean)
) +
  geom_line()

# Three views of mean friend count for ages below 71, at decreasing resolution:
#   p2 - one point per month of age
#   p1 - one point per year of age
#   p3 - ages rounded to the nearest multiple of 5, averaging the per-age
#        means that land on the same rounded value
p2 <- ggplot(
  subset(pf.fc_by_age_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()
p1 <- ggplot(
  subset(pf.fc_by_age, age < 71),
  aes(age, friend_count_mean)
) +
  geom_line()
p3 <- ggplot(
  subset(pf.fc_by_age, age < 71),
  aes(x = round(age/5)*5, y = friend_count_mean)
) +
  geom_line(stat = 'summary', fun.y = mean)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Stack the three plots in a single column: monthly, yearly, 5-year.
grid.arrange(p2, p1, p3, ncol = 1)

# Rebuild the three-panel comparison, this time adding a default
# geom_smooth() to the monthly (p2) and yearly (p1) panels; the 5-year panel
# (p3) is unchanged.
library(gridExtra)
p2 <- ggplot(
  subset(pf.fc_by_age_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()
p1 <- ggplot(
  subset(pf.fc_by_age, age < 71),
  aes(age, friend_count_mean)
) +
  geom_line() +
  geom_smooth()
p3 <- ggplot(
  subset(pf.fc_by_age, age < 71),
  aes(x = round(age/5)*5, y = friend_count_mean)
) +
  geom_line(stat = 'summary', fun.y = mean)
# Stack the panels in a single column: monthly, yearly, 5-year.
grid.arrange(p2, p1, p3, ncol = 1)
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'