1 Context


We want to check, year by year, how the times of my LinkedIn connections are distributed over the day. Overnight connections might indicate overseas links.


2 Libraries


library(data.table)  # fread() for fast CSV import
library(ggplot2)     # plotting
library(lubridate)   # date-time parsing
library(dplyr)       # grouping and summarizing

3 Data import


# Reading the table of my LinkedIn connections
LinkedIn <- as.data.frame(fread("my_connections.csv", header = TRUE, stringsAsFactors = FALSE))

# Renaming the variables
names(LinkedIn) <- c("Position", "Connection")

# Summary of the table
summary(LinkedIn)
##    Position          Connection       
##  Length:1745        Length:1745       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

4 Pre-processing

4.1 Keeping only the year and hour of connection


# We check the format of the date
head(LinkedIn$Connection)
## [1] "4/13/18, 2:52 AM"   "1/17/18, 12:52 PM"  "4/7/18, 2:44 PM"   
## [4] "2/23/18, 1:21 AM"   "11/3/17, 12:25 AM"  "11/21/17, 12:04 PM"
# Now we parse it as POSIXct with lubridate's mdy_hm()
LinkedIn$Connection <- mdy_hm(LinkedIn$Connection)

# Let's check that everything went fine
str(LinkedIn$Connection)
##  POSIXct[1:1745], format: "2018-04-13 02:52:00" "2018-01-17 12:52:00" ...
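
Before dropping the raw column, it is worth confirming that nothing failed to parse; mdy_hm() returns NA (with a warning) for any string it cannot match, so a single count is a cheap safety net. A minimal sanity check:

# Sanity check: mdy_hm() yields NA for unparseable strings,
# so this should come back as 0
sum(is.na(LinkedIn$Connection))
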
# We keep only the hour and year of connection
LinkedIn$Hour <- formatC(hour(LinkedIn$Connection), width = 2, flag = "0")  # formatC() already returns character
LinkedIn$Year <- as.character(year(LinkedIn$Connection))
LinkedIn$Connection <- NULL

# Now we check the structure of the data
str(LinkedIn)
## 'data.frame':    1745 obs. of  3 variables:
##  $ Position: chr  "VP of AI and Machine Learning" "Founder and President" "Founder" "Espacio vinculado a Essential Institute" ...
##  $ Hour    : chr  "02" "12" "14" "01" ...
##  $ Year    : chr  "2018" "2018" "2018" "2018" ...

4.2 Grouping data by year and hour


# We want to know how many connections we had per year and hour
# We are going to group them
my_group <- group_by(LinkedIn, Year, Hour)
LinkedIn_grouped <- as.data.frame(summarize(my_group, Total = n()))

table(LinkedIn$Year)
## 
## 2013 2014 2015 2016 2017 2018 
##  139   58   36   32  607  873
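
As an aside, prop.table() turns these yearly counts into shares of all connections, which makes the 2017-2018 surge easier to quantify. A one-line sketch:

# Share of all connections made in each year (sketch)
round(prop.table(table(LinkedIn$Year)), 2)
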
# Let's check what the table looks like
head(LinkedIn_grouped)
##   Year Hour Total
## 1 2013   00     5
## 2 2013   01     7
## 3 2013   02     7
## 4 2013   03    16
## 5 2013   04    11
## 6 2013   05    10
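
Note that year-hour combinations with zero connections are simply absent from LinkedIn_grouped, so bar and area plots can silently skip empty bins. If that matters, tidyr::complete() can reinstate them before normalizing (a sketch, assuming tidyr as an extra dependency; it only crosses values that appear somewhere in the data):

# Optional: add the missing Year/Hour combinations with an explicit zero
library(tidyr)
LinkedIn_grouped <- as.data.frame(complete(LinkedIn_grouped, Year, Hour, fill = list(Total = 0)))
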
# We want to normalize the distributions so we can compare the shapes
for(i in unique(LinkedIn_grouped$Year)){
  my_index <- which(LinkedIn_grouped$Year == i)
  LinkedIn_grouped$Total[my_index] <- LinkedIn_grouped$Total[my_index]/sum(LinkedIn_grouped$Total[my_index])

  # Check that normalization is correct
  print(paste(i, sum(LinkedIn_grouped$Total[my_index]), sep = " : "))
}
## [1] "2013 : 1"
## [1] "2014 : 1"
## [1] "2015 : 1"
## [1] "2016 : 1"
## [1] "2017 : 1"
## [1] "2018 : 1"

5 Results: Plotting the hourly distribution per year


# First we plot the year 2018 to check the distribution
ggplot(data = LinkedIn_grouped[which(LinkedIn_grouped$Year == "2018"), ]) +
  geom_col(aes(x = Hour, y = Total)) + 
  ggtitle("Hourly distribution of LinkedIn connections in 2018") +
  xlab("Hour of the day (h)") + ylab("Density of connections") +
  theme(plot.title = element_text(hjust = 0.5))

# An area plot makes the shape of the distribution easier to read
ggplot(data = LinkedIn_grouped[which(LinkedIn_grouped$Year == "2018"), ]) +
  geom_area(aes(x = as.numeric(Hour), y = Total), fill = "#3333FF") + 
  ggtitle("Hourly distribution of LinkedIn connections in 2018") +
  xlab("Hour of the day (h)") + ylab("Density of connections") +
  theme(plot.title = element_text(hjust = 0.5))
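
Since the goal is to compare shapes across years, facet_wrap() puts every year into a single figure on the same normalized scale; a sketch building on the same area plot:

# All years at once: one panel per year, same normalized scale
ggplot(data = LinkedIn_grouped) +
  geom_area(aes(x = as.numeric(Hour), y = Total), fill = "#3333FF") +
  facet_wrap(~ Year) +
  ggtitle("Hourly distribution of LinkedIn connections per year") +
  xlab("Hour of the day (h)") + ylab("Density of connections") +
  theme(plot.title = element_text(hjust = 0.5))

Finally, the overnight signal from the Context section can be quantified directly by summing each year's normalized density over the small hours. The 00:00-05:59 window below is my assumption, not something fixed by the data:

# Share of connections arriving between 00:00 and 05:59, per year
# (sketch; the overnight cutoff is an arbitrary choice)
overnight <- subset(LinkedIn_grouped, as.numeric(Hour) < 6)
aggregate(Total ~ Year, data = overnight, FUN = sum)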