1 Goal


The goal of this tutorial is to learn how to properly build a time series, in order to prepare data for forecasting and all sorts of time-related predictions.


2 Preparing the data


# First we load the libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# This tutorial uses a dataset of celebrity deaths, recorded by month and year:
# https://www.kaggle.com/hugodarwood/celebrity-deaths
# Text columns are kept as plain character vectors rather than factors.
deaths <- read.csv(
  file = "celebrity_deaths.csv",
  stringsAsFactors = FALSE
)
# Preview the first rows to check the columns that were read
head(deaths)
##   age birth_year       cause_of_death death_month death_year
## 1  85       1921       natural causes     January       2006
## 2  49       1957             murdered     January       2006
## 3  64       1942  Alzheimer's disease     January       2006
## 4  86       1920  Alzheimer's disease     January       2006
## 5  82       1924               cancer     January       2006
## 6  52       1954               stroke     January       2006
##                                                      famous_for
## 1            businessman chairman of IBM (1973\x89\xdb\xd21981)
## 2                         musician (House of Freaks Gutterball)
## 3                           baseball player (Oakland Athletics)
## 4  politician Representative from Oregon (1957\x89\xdb\xd21961)
## 5                              nightclub owner (Tropicana Club)
## 6  New Guinean politician Prime Minister (1997\x89\xdb\xd21999)
##                name nationality fame_score
## 1        Frank Cary    American          1
## 2      Bryan Harvey    American          2
## 3     Paul Lindblad    American          1
## 4 Charles O. Porter    American          2
## 5        Ofelia Fox       Cuban         NA
## 6 Sir William Skate       Papua         NA
# We convert the month names to numbers.
# First we turn them into a factor whose levels are in calendar order;
# otherwise the levels would be sorted alphabetically.
# month.name is the built-in vector of English month names (January-December).
deaths$death_month <- factor(deaths$death_month, levels = month.name)
levels(deaths$death_month)
##  [1] "January"   "February"  "March"     "April"     "May"      
##  [6] "June"      "July"      "August"    "September" "October"  
## [11] "November"  "December"
# Replace each month by the numeric code of its factor level (January = 1)
deaths[["death_month"]] <- as.numeric(deaths[["death_month"]])


# Now we create a time tag of the form "YYYY MM".
# The month is zero-padded to two digits so that sorting the tags as text
# also sorts them chronologically.
deaths <- deaths %>%
  mutate(
    month_year = paste(
      death_year,
      formatC(death_month, width = 2, flag = "0")
    )
  )
head(unique(deaths$month_year))
## [1] "2006 01" "2006 02" "2006 03" "2006 04" "2006 05" "2006 06"
# Let's build a data frame with the number of deaths for each month of each
# year: one row per time tag, with the row count stored in `count`.
# Check the group by tutorial if necessary.
my_group <- deaths %>% group_by(month_year)
deaths_by_month <- my_group %>% summarize(count = n())
head(deaths_by_month)
## # A tibble: 6 × 2
##   month_year count
##        <chr> <int>
## 1    2006 01    24
## 2    2006 02    13
## 3    2006 03    26
## 4    2006 04    18
## 5    2006 05    27
## 6    2006 06    16
# We plot the number of deaths per month, in chronological order.
# The x axis is the position of the month in deaths_by_month (1 = first month).
# Columns are referenced by bare name inside aes() so that ggplot2 looks them
# up in `data`; seq_along() also stays correct if the data frame is empty,
# where 1:nrow() would produce c(1, 0).
ggplot(data = deaths_by_month) +
  geom_line(aes(x = seq_along(count), y = count)) +
  labs(x = "Month index", y = "Number of deaths")


3 Creating a time series


# Now let's build the time series.
# We have 12 data points per year, so the frequency is 12: the frequency is
# the number of data points that make up one full period.
# We also have to give the starting point (year 2006, first period) so that
# the time series knows how to build the dates.
myts <- ts(
  data = deaths_by_month$count,
  start = c(2006, 1),
  frequency = 12
)
plot(myts)