library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
# Read the tab-separated file pseudo_facebook.tsv into `pf` and print a
# column-by-column overview. read.delim() is read.csv() with a tab as the
# default separator.
pf <- read.delim("pseudo_facebook.tsv")
pf %>% glimpse()
## Observations: 99,003
## Variables: 15
## $ userid <int> 2094382, 1192601, 2083884, 1203168, 1733...
## $ age <int> 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, ...
## $ dob_day <int> 19, 2, 16, 25, 4, 1, 14, 4, 1, 2, 22, 1,...
## $ dob_year <int> 1999, 1999, 1999, 1999, 1999, 1999, 2000...
## $ dob_month <int> 11, 11, 11, 12, 12, 12, 1, 1, 1, 2, 2, 2...
## $ gender <fctr> male, female, male, female, male, male,...
## $ tenure <int> 266, 6, 13, 93, 82, 15, 12, 0, 81, 171, ...
## $ friend_count <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ friendships_initiated <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ likes <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ likes_received <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ mobile_likes <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ mobile_likes_received <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ www_likes <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ www_likes_received <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
# Scatter plot of friend_count against age using every row of `pf`.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point()

# Min / quartiles / mean / max of the age column.
summary(pf[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 20.00 28.00 37.28 50.00 113.00
# Same scatter with the x scale limited to ages 13-90. xlim() sets scale
# limits, so rows outside the range are dropped (see the warning below).
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).

# Make points mostly transparent (alpha = 1/20) to reduce overplotting.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 0.05) +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).

# Add default jitter in both directions on top of the transparency.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 0.05) +
  xlim(13, 90)
## Warning: Removed 5188 rows containing missing values (geom_point).
# Transparent scatter of friend_count vs age with a square-root transformed
# y axis (applied by the coordinate system, after the scale limits).
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 0.05) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).

# Same plot with horizontal-only jitter: height = 0 keeps y values unchanged.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 0.05, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5189 rows containing missing values (geom_point).

# friendships_initiated vs age, horizontal jitter only, alpha = 1/10.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 0.1, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5183 rows containing missing values (geom_point).
# Conditional means ----
library(dplyr)
# One row per age: mean and median friend_count plus the number of rows (N),
# ordered by age.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    N = n()
  ) %>%
  arrange(age)

# Show the first six rows of the summary.
head(pf.fc_by_age)
## # A tibble: 6 x 4
## age friend_count_mean friend_count_median N
## <int> <dbl> <dbl> <int>
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
library(ggplot2)
# Line plot of the per-age mean friend count, with the x scale limited to
# ages 30-90 (rows outside the limits are dropped, hence the warning below).
ggplot(pf.fc_by_age, aes(age, friend_count_mean)) +
  geom_line() +
  scale_x_continuous(limits = c(30, 90))
## Warning: Removed 40 rows containing missing values (geom_path).
# Raw friend_count vs age scatter (orange, horizontally jittered) overlaid
# with the conditional mean (solid line) and the 10th / 50th / 90th
# percentiles (dashed blue lines) of friend_count at each age.
ggplot(data = pf, aes(x = age, y = friend_count)) +
  # coord_cartesian() only zooms the view; no rows are removed from the data.
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary().
  geom_line(stat = "summary", fun = mean) +
  # One dashed quantile line per probability; a list of layers can be added
  # to a plot with `+`.
  lapply(c(0.1, 0.5, 0.9), function(prob) {
    geom_line(
      stat = "summary",
      fun = quantile,
      fun.args = list(probs = prob),
      linetype = 2,
      color = "blue"
    )
  })
# Pearson correlation test between age and friend_count.
cor.test(x = pf$age, y = pf$friend_count, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
# likes_received against www_likes_received: faint, horizontally jittered
# points with a red linear-model fit, zoomed to the lower 95% of each
# variable. coord_cartesian() zooms without dropping rows.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point(alpha = 0.02, position = position_jitter(height = 0)) +
  geom_smooth(method = "lm", color = "red") +
  coord_cartesian(
    xlim = c(0, quantile(pf$www_likes_received, 0.95)),
    ylim = c(0, quantile(pf$likes_received, 0.95))
  )

# Pearson correlation test between the two variables.
cor.test(x = pf$www_likes_received, y = pf$likes_received, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: pf$www_likes_received and pf$likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
# Fractional age: age plus the part of a year given by 1 - dob_month / 12.
pf <- pf %>%
  mutate(age_with_months = age + (1 - dob_month / 12))

# One row per fractional age: mean and median friend_count plus row count (n),
# ordered by age_with_months.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)

# Show the first six rows of the summary.
head(pf.fc_by_age_months)
## # A tibble: 6 x 4
## age_with_months friend_count_mean friend_count_median n
## <dbl> <dbl> <dbl> <int>
## 1 13.16667 46.33333 30.5 6
## 2 13.25000 115.07143 23.5 14
## 3 13.33333 136.20000 44.0 25
## 4 13.41667 164.24242 72.0 33
## 5 13.50000 131.17778 66.0 45
## 6 13.58333 156.81481 64.0 54
# Mean friend count by fractional age, for ages below 71.
pf.fc_by_age_months %>%
  filter(age_with_months < 71) %>%
  ggplot(aes(x = age_with_months, y = friend_count_mean)) +
  geom_line()

# Mean friend count by whole-year age, for ages below 71.
pf.fc_by_age %>%
  filter(age < 71) %>%
  ggplot(aes(x = age, y = friend_count_mean)) +
  geom_line()
# Three views of mean friend count for ages below 71, stacked in one column:
#   p2 - by fractional age (age_with_months)
#   p1 - by whole-year age
#   p3 - by age rounded to the nearest multiple of 5
p2 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()
p1 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  aes(x = age, y = friend_count_mean)
) +
  geom_line()
# NOTE(review): p3 averages the per-age means, so each age contributes equally
# regardless of its row count N -- confirm an unweighted mean is intended.
p3 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  aes(x = round(age / 5) * 5, y = friend_count_mean)
) +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary().
  geom_line(stat = "summary", fun = mean)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(p2, p1, p3, ncol = 1)
# Same three-panel comparison as before, with a default geom_smooth() added
# to the fractional-age (p2) and whole-year (p1) panels.
p2 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()
p1 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()
# NOTE(review): p3 averages the per-age means, so each age contributes equally
# regardless of its row count N -- confirm an unweighted mean is intended.
p3 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  aes(x = round(age / 5) * 5, y = friend_count_mean)
) +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary().
  geom_line(stat = "summary", fun = mean)
library(gridExtra)
grid.arrange(p2, p1, p3, ncol = 1)
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'