1 Context


LinkedIn has a limit of 30,000 connections, so I want to use a time series analysis to estimate when I will reach that limit.


2 Libraries


library(data.table)
library(ggplot2)
library(plyr)
library(lubridate)
library(dplyr)
library(forecast)

3 Data import


# Read the exported table of my LinkedIn connections.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
LinkedIn <- fread("my_connections.csv", header = TRUE, stringsAsFactors = FALSE)

# Rename the two columns
names(LinkedIn) <- c("Position", "Connection")

# Summary of the table
summary(LinkedIn)
##    Position          Connection       
##  Length:1745        Length:1745       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

4 Pre-processing

4.1 Changing the format of date


# Look at the first raw values to see how the connection timestamps are written
head(LinkedIn$Connection)
## [1] "4/13/18, 2:52 AM"   "1/17/18, 12:52 PM"  "4/7/18, 2:44 PM"   
## [4] "2/23/18, 1:21 AM"   "11/3/17, 12:25 AM"  "11/21/17, 12:04 PM"
# The strings are month/day/year followed by hour:minute and AM/PM, so we
# parse them with lubridate's mdy_hm() to obtain POSIXct date-times
LinkedIn$Connection <- mdy_hm(LinkedIn$Connection)

# Let's check that everything went fine
str(LinkedIn$Connection)
##  POSIXct[1:1745], format: "2018-04-13 02:52:00" "2018-01-17 12:52:00" ...

4.2 Grouping the data by month to do the forecasting


# Label every connection with its month ("mm/yyyy") so we know how many people
# were added to the network in each month, and store the labels as a factor
LinkedIn$Date <- as.factor(format(LinkedIn$Connection, format = "%m/%Y"))
levels(LinkedIn$Date)
##  [1] "01/2014" "01/2015" "01/2017" "01/2018" "02/2014" "02/2015" "02/2016"
##  [8] "02/2017" "02/2018" "03/2013" "03/2014" "03/2015" "03/2017" "03/2018"
## [15] "04/2013" "04/2014" "04/2015" "04/2018" "05/2013" "05/2014" "05/2015"
## [22] "05/2017" "06/2013" "06/2014" "06/2015" "06/2016" "06/2017" "07/2013"
## [29] "07/2014" "07/2015" "07/2016" "07/2017" "08/2013" "08/2014" "08/2015"
## [36] "08/2016" "08/2017" "09/2013" "09/2014" "09/2015" "09/2016" "09/2017"
## [43] "10/2013" "10/2014" "10/2016" "10/2017" "11/2013" "11/2014" "11/2015"
## [50] "11/2016" "11/2017" "12/2013" "12/2014" "12/2016" "12/2017"
# We need to fill the gaps before building the time series: months without any
# new connection do not appear among the factor levels.
# Build the complete list of monthly levels ("mm/yyyy") from January 2013 to
# April 2018 as a date sequence. This replaces nested loops that grew the
# vector one element at a time and whose `break` only left the inner loop
# (it stopped correctly only because 2018 was the last year iterated).
my_levels <- format(
  seq(as.Date("2013-01-01"), as.Date("2018-04-01"), by = "month"),
  "%m/%Y"
)

my_levels
##  [1] "01/2013" "02/2013" "03/2013" "04/2013" "05/2013" "06/2013" "07/2013"
##  [8] "08/2013" "09/2013" "10/2013" "11/2013" "12/2013" "01/2014" "02/2014"
## [15] "03/2014" "04/2014" "05/2014" "06/2014" "07/2014" "08/2014" "09/2014"
## [22] "10/2014" "11/2014" "12/2014" "01/2015" "02/2015" "03/2015" "04/2015"
## [29] "05/2015" "06/2015" "07/2015" "08/2015" "09/2015" "10/2015" "11/2015"
## [36] "12/2015" "01/2016" "02/2016" "03/2016" "04/2016" "05/2016" "06/2016"
## [43] "07/2016" "08/2016" "09/2016" "10/2016" "11/2016" "12/2016" "01/2017"
## [50] "02/2017" "03/2017" "04/2017" "05/2017" "06/2017" "07/2017" "08/2017"
## [57] "09/2017" "10/2017" "11/2017" "12/2017" "01/2018" "02/2018" "03/2018"
## [64] "04/2018"
# Re-create the factor with the complete list of months so that months without
# any connection exist as (empty) levels
LinkedIn$Date <- factor(LinkedIn$Date, levels = my_levels)

# Now we create the dataset of connections per month.
# table() counts the rows per level (empty levels count as 0) and
# as.data.frame() turns the counts into a two-column data frame.
# Base R is used instead of melt(): data.table's melt() is meant for
# data.tables and handles a `table` object only through a deprecated
# hand-off to the reshape2 package.
my_connections <- as.data.frame(table(LinkedIn$Date))
names(my_connections) <- c("Date", "Connections")
head(my_connections)
##      Date Connections
## 1 01/2013           0
## 2 02/2013           0
## 3 03/2013          74
## 4 04/2013          10
## 5 05/2013          20
## 6 06/2013           8
# Convert the "mm/yyyy" labels back into real dates by pinning each month
# to its first day ("01/mm/yyyy") and parsing the result as day/month/year
my_connections$Date <- dmy(paste0("01/", my_connections$Date))
str(my_connections)
## 'data.frame':    64 obs. of  2 variables:
##  $ Date       : Date, format: "2013-01-01" "2013-02-01" ...
##  $ Connections: int  0 0 74 10 20 8 5 3 10 7 ...
# Sort the table chronologically. order() sorts ascending by default, so the
# former `decreasing = F` (with the reassignable alias `F`) is not needed.
my_connections_sorted <- my_connections[order(my_connections$Date), ]

# Finally, turn the monthly counts into a running total of connections
my_connections_sorted$Connections <- cumsum(my_connections_sorted$Connections)
head(my_connections_sorted)
##         Date Connections
## 1 2013-01-01           0
## 2 2013-02-01           0
## 3 2013-03-01          74
## 4 2013-04-01          84
## 5 2013-05-01         104
## 6 2013-06-01         112

5 Creating the time series and forecasting


# First we create the monthly time series (frequency = 12).
# The start must be the month of the first row of the sorted table. That row
# is January 2013 (the table includes the leading months with zero
# connections), so the start is derived from the data. The previous
# hard-coded start of March 2013 dated every observation two months late.
first_month <- my_connections_sorted$Date[1]
myts <- ts(
  my_connections_sorted$Connections,
  frequency = 12,
  start = c(lubridate::year(first_month), lubridate::month(first_month))
)

# The forecast library must be loaded for autoplot() to handle ts objects
autoplot(myts) +
  labs(
    x = "Year",
    y = "Number of LinkedIn connections",
    title = "LinkedIn connections per month"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# Forecast 180 months ahead from a Holt-Winters fit and draw a red
# horizontal line at the 30000-connection limit
my_fc <- forecast(HoltWinters(myts), h = 180)
autoplot(my_fc) +
  labs(x = "Year", y = "Number of LinkedIn connections") +
  geom_hline(yintercept = 30000, size = 1, colour = "red") +
  theme(plot.title = element_text(hjust = 0.5))


6 Conclusion


In this tutorial we have used time series forecasting to predict when I’m going to reach the limit of 30k connections on LinkedIn. The date should be sometime between 2025 and 2035.

This project will be updated every 3 months to check how the forecast changes over time.