We want to check, year by year, how LinkedIn connections are distributed over the hours of the day. Connections made overnight might indicate overseas links.
library(data.table)
library(ggplot2)
library(plyr)
library(lubridate)
library(dplyr)
# Read the exported table of my LinkedIn connections as a plain data.frame.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change how the header is handled.
LinkedIn <- as.data.frame(
  fread("my_connections.csv", header = TRUE, stringsAsFactors = FALSE)
)
# The renaming below supplies exactly two names, so fail early if the file
# does not have exactly two columns instead of ending up with NA names.
stopifnot(ncol(LinkedIn) == 2L)
# Renaming the variables
names(LinkedIn) <- c("Position", "Connection")
# Summary of the table
summary(LinkedIn)
## Position Connection
## Length:1745 Length:1745
## Class :character Class :character
## Mode :character Mode :character
# Look at the first raw values to see how the dates are written
head(LinkedIn[["Connection"]], n = 6L)
## [1] "4/13/18, 2:52 AM" "1/17/18, 12:52 PM" "4/7/18, 2:44 PM"
## [4] "2/23/18, 1:21 AM" "11/3/17, 12:25 AM" "11/21/17, 12:04 PM"
# Convert the "month/day/year, hour:minute AM/PM" strings to POSIXct.
# The time zone is lubridate's default (UTC), written out explicitly here.
LinkedIn[["Connection"]] <- mdy_hm(LinkedIn[["Connection"]], tz = "UTC")
# Confirm that the column now holds date-times
str(LinkedIn[["Connection"]])
## POSIXct[1:1745], format: "2018-04-13 02:52:00" "2018-01-17 12:52:00" ...
# Reduce each timestamp to a zero-padded hour and a year, both stored as
# text, and drop the full timestamp once they have been extracted
LinkedIn <- dplyr::mutate(
  LinkedIn,
  Hour = sprintf("%02d", hour(Connection)),
  Year = as.character(year(Connection)),
  Connection = NULL
)
# Inspect the resulting columns
str(LinkedIn)
## 'data.frame': 1745 obs. of 3 variables:
## $ Position: chr "VP of AI and Machine Learning" "Founder and President" "Founder" "Espacio vinculado a Essential Institute" ...
## $ Hour : chr "02" "12" "14" "01" ...
## $ Year : chr "2018" "2018" "2018" "2018" ...
# Count the connections for every (year, hour) combination:
# first group the rows, then tally each group into a plain data.frame
my_group <- LinkedIn %>% group_by(Year, Hour)
LinkedIn_grouped <- my_group %>%
  summarize(Total = n()) %>%
  as.data.frame()
# Overall number of connections per year, for reference
table(LinkedIn$Year)
##
## 2013 2014 2015 2016 2017 2018
## 139 58 36 32 607 873
# Preview the first rows of the grouped table
head(LinkedIn_grouped, n = 6L)
## Year Hour Total
## 1 2013 00 5
## 2 2013 01 7
## 3 2013 02 7
## 4 2013 03 16
## 5 2013 04 11
## 6 2013 05 10
# Rescale the counts so that each year's values add up to 1; this lets us
# compare the shapes of the distributions regardless of yearly volume
LinkedIn_grouped$Total <- ave(
  LinkedIn_grouped$Total,
  LinkedIn_grouped$Year,
  FUN = function(counts) counts / sum(counts)
)
# Sanity check: print the per-year sum, which should now be 1 for every year
for (yr in unique(LinkedIn_grouped$Year)) {
  rows_of_year <- which(LinkedIn_grouped$Year == yr)
  print(paste(yr, sum(LinkedIn_grouped$Total[rows_of_year]), sep = " : "))
}
## [1] "2013 : 1"
## [1] "2014 : 1"
## [1] "2015 : 1"
## [1] "2016 : 1"
## [1] "2017 : 1"
## [1] "2018 : 1"
# Start with a bar chart of the normalized 2018 distribution, one bar per hour
ggplot(subset(LinkedIn_grouped, Year == "2018"), aes(x = Hour, y = Total)) +
  geom_col() +
  labs(
    title = "Hourly distribution of LinkedIn connections in 2018",
    x = "Hour of the day (h)",
    y = "Density of connections"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Same 2018 data as a filled area; the hour is converted to a number so that
# it is treated as a continuous axis
ggplot(
  subset(LinkedIn_grouped, Year == "2018"),
  aes(x = as.numeric(Hour), y = Total)
) +
  geom_area(fill = "#3333FF") +
  labs(
    title = "Hourly distribution of LinkedIn connections in 2018",
    x = "Hour of the day (h)",
    y = "Density of connections"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Put 2017 and 2018 side by side, one pair of bars per hour
# We see that the distribution is pretty similar
ggplot(
  subset(LinkedIn_grouped, Year %in% c("2017", "2018")),
  aes(x = Hour, y = Total, fill = Year)
) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("#009E73", "#D55E00")) +
  labs(
    title = "Hourly distribution of LinkedIn connections in 2018 vs 2017",
    x = "Hour of the day (h)",
    y = "Density of connections"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# The same comparison drawn as one line per year over a numeric hour axis
# NOTE(review): newer ggplot2 releases prefer `linewidth` over `size` for
# lines; `size` is kept so the script still runs on older versions
ggplot(
  subset(LinkedIn_grouped, Year %in% c("2017", "2018")),
  aes(x = as.numeric(Hour), y = Total, colour = Year)
) +
  geom_line(size = 1) +
  scale_color_manual(values = c("#009E73", "#D55E00")) +
  labs(
    title = "Hourly distribution of LinkedIn connections in 2018 vs 2017",
    x = "Hour of the day (h)",
    y = "Density of connections"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
We see that the number of connections made clearly depends on the hour of the day, and that the resulting shape looks a lot like a daily activity distribution.
We also see that the time zone of the LinkedIn timestamps is not GMT, as the minimum of activity doesn’t correspond to the night hours.
We have seen different ways to plot hourly distributions.