1 Context


We want to examine, year by year, how LinkedIn connections are distributed over the hours of the day. Overnight connections might indicate overseas links.


2 Libraries


library(data.table)
library(ggplot2)
library(plyr)
library(lubridate)
library(dplyr)

3 Data import


# Load the exported LinkedIn connections as a plain data.frame.
# fread() returns a data.frame directly with data.table = FALSE, and it never
# converts strings to factors unless asked to.
LinkedIn <- fread("my_connections.csv", header = TRUE, data.table = FALSE)

# The columns are renamed by position below, so the file must hold exactly two
# columns (position first, connection timestamp second). Fail early otherwise
# instead of silently producing NA column names.
stopifnot(ncol(LinkedIn) == 2L)
names(LinkedIn) <- c("Position", "Connection")

# Overview of the imported table
summary(LinkedIn)
##    Position          Connection       
##  Length:1745        Length:1745       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

4 Pre-processing

4.1 Keeping only the year and hour of connection


# Peek at the raw timestamp strings to confirm their layout
head(LinkedIn[["Connection"]])
## [1] "4/13/18, 2:52 AM"   "1/17/18, 12:52 PM"  "4/7/18, 2:44 PM"   
## [4] "2/23/18, 1:21 AM"   "11/3/17, 12:25 AM"  "11/21/17, 12:04 PM"
# Parse the "month/day/year, hour:minute" strings into POSIXct.
# The strings carry no time zone, so the parsed times are labelled UTC
# (lubridate's default, written out here); no conversion is applied.
LinkedIn[["Connection"]] <- mdy_hm(LinkedIn[["Connection"]], tz = "UTC")

# Confirm that the column is now a date-time
str(LinkedIn[["Connection"]])
##  POSIXct[1:1745], format: "2018-04-13 02:52:00" "2018-01-17 12:52:00" ...
# Derive the two grouping keys as text: a two-digit, zero-padded hour and the
# year. The full timestamp is dropped once both have been extracted.
LinkedIn[["Hour"]] <- sprintf("%02d", hour(LinkedIn[["Connection"]]))
LinkedIn[["Year"]] <- as.character(year(LinkedIn[["Connection"]]))
LinkedIn[["Connection"]] <- NULL

# Inspect the resulting columns
str(LinkedIn)
## 'data.frame':    1745 obs. of  3 variables:
##  $ Position: chr  "VP of AI and Machine Learning" "Founder and President" "Founder" "Espacio vinculado a Essential Institute" ...
##  $ Hour    : chr  "02" "12" "14" "01" ...
##  $ Year    : chr  "2018" "2018" "2018" "2018" ...

4.2 Grouping data by year and hour


# Count the connections made in every (year, hour) combination
my_group <- LinkedIn %>% group_by(Year, Hour)
LinkedIn_grouped <- my_group %>%
  summarize(Total = n()) %>%
  as.data.frame()

# Number of connections per year, for reference
table(LinkedIn[["Year"]])
## 
## 2013 2014 2015 2016 2017 2018 
##  139   58   36   32  607  873
# Preview the first rows of the grouped counts
head(LinkedIn_grouped)
##   Year Hour Total
## 1 2013   00     5
## 2 2013   01     7
## 3 2013   02     7
## 4 2013   03    16
## 5 2013   04    11
## 6 2013   05    10
# Turn the per-(year, hour) counts into within-year shares, so that years with
# very different numbers of connections can be compared by shape.
# ave() returns, for every row, the total of the year that row belongs to,
# which lets the whole column be normalized in one vectorized step.
year_totals <- ave(LinkedIn_grouped$Total, LinkedIn_grouped$Year, FUN = sum)
LinkedIn_grouped$Total <- LinkedIn_grouped$Total / year_totals

# Check that normalization is correct: the shares of each year must add up to 1
for (yr in unique(LinkedIn_grouped$Year)) {
  year_share <- sum(LinkedIn_grouped$Total[which(LinkedIn_grouped$Year == yr)])
  print(paste(yr, year_share, sep = " : "))
}
## [1] "2013 : 1"
## [1] "2014 : 1"
## [1] "2015 : 1"
## [1] "2016 : 1"
## [1] "2017 : 1"
## [1] "2018 : 1"

5 Results: Plotting the hourly distribution per year


# Start with a bar chart of 2018 alone: one bar per hour of the day
ggplot(subset(LinkedIn_grouped, Year == "2018"), aes(x = Hour, y = Total)) +
  geom_col() +
  labs(
    title = "Hourly distribution of LinkedIn connections in 2018",
    x = "Hour of the day (h)",
    y = "Density of connections"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# Same 2018 data as a filled area; the hour is converted to a number so that
# the x axis is continuous
ggplot(
  subset(LinkedIn_grouped, Year == "2018"),
  aes(x = as.numeric(Hour), y = Total)
) +
  geom_area(fill = "#3333FF") +
  labs(
    title = "Hourly distribution of LinkedIn connections in 2018",
    x = "Hour of the day (h)",
    y = "Density of connections"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# Side-by-side bars for 2017 and 2018
# The two distributions turn out to be pretty similar
ggplot(
  subset(LinkedIn_grouped, Year %in% c("2017", "2018")),
  aes(x = Hour, y = Total, fill = Year)
) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("#009E73", "#D55E00")) +
  labs(
    title = "Hourly distribution of LinkedIn connections in 2018 vs 2017",
    x = "Hour of the day (h)",
    y = "Density of connections"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# The same comparison drawn as one line per year over a numeric hour axis
# NOTE(review): ggplot2 3.4.0 and later prefer `linewidth` over `size` for
# lines; `size` is kept here because the installed version is not known.
ggplot(
  subset(LinkedIn_grouped, Year %in% c("2017", "2018")),
  aes(x = as.numeric(Hour), y = Total, colour = Year)
) +
  geom_line(size = 1) +
  scale_color_manual(values = c("#009E73", "#D55E00")) +
  labs(
    title = "Hourly distribution of LinkedIn connections in 2018 vs 2017",
    x = "Hour of the day (h)",
    y = "Density of connections"
  ) +
  theme(plot.title = element_text(hjust = 0.5))


6 Conclusion


We see that the number of connections made clearly depends on the hour of the day, following a pattern that looks a lot like a daily activity distribution.

We also see that the time zone of the LinkedIn timestamps is not GMT, as the lowest point of activity does not fall in the night hours.

We have seen different ways to plot hourly distributions.