1 Goal


The goal of this tutorial is to learn how to properly build time series that are based not only on years but also on other time periods.


2 Preparing the data


#First we load the libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(ggplot2)

# Data set for this tutorial: daily minimum temperatures in Melbourne, Australia (1981-1990)
# https://datamarket.com/data/set/2324/daily-minimum-temperatures-in-melbourne-australia-1981-1990
Temperatures <- read.csv(
  file = "daily-minimum-temperatures-in-me.csv",
  stringsAsFactors = FALSE
)

# Peek at the first rows to see the raw column names
head(Temperatures)
##         Date Daily.minimum.temperatures.in.Melbourne..Australia..1981.1990
## 1 1981-01-01                                                          20.7
## 2 1981-01-02                                                          17.9
## 3 1981-01-03                                                          18.8
## 4 1981-01-04                                                          14.6
## 5 1981-01-05                                                          15.8
## 6 1981-01-06                                                          15.8
# Replace the long auto-generated header with short column names
names(Temperatures) <- c("Date", "Temperature")

# First we have to change the date to POSIXct.
# as.POSIXct() with an explicit format parses the strings in one step, so no
# intermediate POSIXlt (a list-based class) is ever stored in the data frame.
Temperatures$Date <- as.POSIXct(Temperatures$Date, format = "%Y-%m-%d")

# Readings that cannot be parsed as numbers become NA, which is what the
# coercion warning below reports
Temperatures$Temperature <- as.numeric(Temperatures$Temperature)
## Warning: NAs introduced by coercion
# Let's check the structure of the table
str(Temperatures)
## 'data.frame':    3652 obs. of  2 variables:
##  $ Date       : POSIXct, format: "1981-01-01" "1981-01-02" ...
##  $ Temperature: num  20.7 17.9 18.8 14.6 15.8 15.8 15.8 17.4 21.8 20 ...
# Now we create different columns for different time configurations.
# All of them are derived from Date in a single mutate() call:
#   MonthYear - year and zero-padded month, e.g. "1981 01"
#   Yearday   - year, month and day of the month, e.g. "1981 01 02"
#               (one value per calendar day, not the day of the week)
#   Week      - week of the year
#   Year      - calendar year, stored as a factor
Temperatures <- mutate(
  Temperatures,
  MonthYear = sprintf("%d %02d", year(Date), month(Date)),
  Yearday = sprintf("%d %02d %02d", year(Date), month(Date), day(Date)),
  Week = week(Date),
  Year = as.factor(year(Date))
)

# Let's check the structure of the table
str(Temperatures)
## 'data.frame':    3652 obs. of  6 variables:
##  $ Date       : POSIXct, format: "1981-01-01" "1981-01-02" ...
##  $ Temperature: num  20.7 17.9 18.8 14.6 15.8 15.8 15.8 17.4 21.8 20 ...
##  $ MonthYear  : chr  "1981 01" "1981 01" "1981 01" "1981 01" ...
##  $ Yearday    : chr  "1981 01 01" "1981 01 02" "1981 01 03" "1981 01 04" ...
##  $ Week       : num  1 1 1 1 1 1 1 2 2 2 ...
##  $ Year       : Factor w/ 10 levels "1981","1982",..: 1 1 1 1 1 1 1 1 1 1 ...

3 Creating time series of daily temperatures

3.1 For each year


# First we need to use the Year column to aggregate: average the daily
# minimum temperature within each year. Missing readings are skipped
# (na.rm = TRUE) so that a single NA does not blank out a whole year.
Temps_year <- aggregate(
  Temperatures$Temperature,
  by = list(Temperatures$Year),
  FUN = function(x) mean(x, na.rm = TRUE)
)

# Now we create the time series adding the right period:
# one observation per year, starting in 1981
myts <- ts(Temps_year$x, frequency = 1, start = 1981)
plot(myts)


3.2 For each month


# First we need to use the MonthYear column to aggregate: average the daily
# minimum temperature within each month of each year, skipping missing
# readings. The zero-padded "YYYY MM" keys sort chronologically, so the
# aggregated rows come out in time order.
Temps_month <- aggregate(
  Temperatures$Temperature,
  by = list(Temperatures$MonthYear),
  FUN = function(x) mean(x, na.rm = TRUE)
)

# Now we create the time series adding the right period:
# 12 observations per year, the first one being the first month of 1981
myts <- ts(Temps_month$x, frequency = 12, start = c(1981, 1))
plot(myts)

# Now that we have several periods we can decompose
myds_month <- decompose(myts)
plot(myds_month)


3.3 For day of the year


# First we need to use the Yearday column (one key per calendar day) to aggregate.
# mean() drops missing values through na.rm; it has no na.omit argument.
# A day whose readings are all missing yields NaN, which is.na() still
# reports as missing, so the gap filling below catches it.
Temps_yearday <- aggregate(
  Temperatures$Temperature,
  by = list(Temperatures$Yearday),
  FUN = function(x) mean(x, na.rm = TRUE)
)

# Temps_weekday
# Now we create the time series adding the right period: 365 observations per
# period, spanning periods 0 to 10.
# NOTE(review): start = 0 / end = 10 keeps 10 * 365 + 1 values, so any extra
# trailing rows of the aggregated data are dropped - confirm this is intended.
myts <- ts(Temps_yearday$x, frequency = 365, start = 0, end = 10)
plot(myts)

# Let's fix the missing values by carrying the last observed value forward.
# For every missing position, findInterval() returns how many observed
# positions lie before it, i.e. the index of the most recent observation.
# A result of 0 means the gap is at the very start of the series; those
# values borrow the first observation instead.
missing_pos <- which(is.na(myts))
observed_pos <- which(!is.na(myts))
if (length(missing_pos) > 0 && length(observed_pos) > 0) {
  donor <- findInterval(missing_pos, observed_pos)
  donor[donor == 0] <- 1
  myts[missing_pos] <- myts[observed_pos[donor]]
}

# Now that we have several periods we can decompose 
# We clearly see that this granularity was not the appropriate for this problem
# We find the same seasonality of the monthly time series but with the added noise of daily variations
# The information we get in the monthly distribution is much clearer and more useful
myds <- decompose(myts)
plot(myds)


4 Time series of minute temperatures

4.1 Preparing the data


# In this part we use the room occupancy dataset, which holds roughly one
# reading per minute (temperature, humidity, light, CO2 and occupancy)
# http://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+

Occupancy <- read.csv("temperatures-per-minute.txt", stringsAsFactors = FALSE)
head(Occupancy)
##                  date Temperature Humidity    Light      CO2 HumidityRatio
## 1 2015-02-11 14:48:00     21.7600 31.13333 437.3333 1029.667   0.005021011
## 2 2015-02-11 14:49:00     21.7900 31.00000 437.3333 1000.000   0.005008581
## 3 2015-02-11 14:50:00     21.7675 31.12250 434.0000 1003.750   0.005021569
## 4 2015-02-11 14:51:00     21.7675 31.12250 439.0000 1009.500   0.005021569
## 5 2015-02-11 14:51:59     21.7900 31.13333 437.3333 1005.667   0.005030298
## 6 2015-02-11 14:53:00     21.7600 31.26000 437.3333 1014.333   0.005041605
##   Occupancy
## 1         1
## 2         1
## 3         1
## 4         1
## 5         1
## 6         1
# We declare the date properly: the timestamps are parsed straight into
# POSIXct, so no intermediate POSIXlt is stored in the data frame
Occupancy$date <- as.POSIXct(Occupancy$date, format = "%Y-%m-%d %H:%M:%S")
str(Occupancy)
## 'data.frame':    9752 obs. of  7 variables:
##  $ date         : POSIXct, format: "2015-02-11 14:48:00" "2015-02-11 14:49:00" ...
##  $ Temperature  : num  21.8 21.8 21.8 21.8 21.8 ...
##  $ Humidity     : num  31.1 31 31.1 31.1 31.1 ...
##  $ Light        : num  437 437 434 439 437 ...
##  $ CO2          : num  1030 1000 1004 1010 1006 ...
##  $ HumidityRatio: num  0.00502 0.00501 0.00502 0.00502 0.00503 ...
##  $ Occupancy    : int  1 1 1 1 1 1 1 1 1 1 ...

4.2 By minute


# A time series can be built on different time bins; the frequency is the
# number of observations that make up one period of the data.

# Here one period is a day sampled every minute: 24 * 60 = 1440 entries
myts <- ts(
  data = Occupancy[["Temperature"]],
  start = 0,
  frequency = 24 * 60
)
plot(myts)


4.3 By hour


# We can create a time series using different time bins
# The frequency defines how many points per period you have in your dataset

# Now we aggregate the points of the same hour.
# Every reading is labelled with its day of the month and hour ("DD HH") so
# that all points of the same hour share one key.
# NOTE(review): the key carries no month or year, so this assumes the data
# stays within a single month - confirm before reusing on longer data.
Occupancy_hour <- mutate(
  Occupancy,
  hour_day = paste(
    formatC(day(date), width = 2, flag = "0"),
    formatC(hour(date), width = 2, flag = "0")
  )
)

# Average every measurement within each hour. hour_day is only the grouping
# key, so it is excluded from the averaged columns: the mean of a character
# column would just be NA.
Occupancy_hour <- aggregate(
  select(Occupancy_hour, -hour_day),
  by = list(Occupancy_hour$hour_day),
  FUN = mean
)
head(Occupancy_hour)
##   Group.1                date Temperature Humidity    Light       CO2
## 1   11 14 2015-02-11 14:53:29    21.78125 31.27028 437.0417 1020.5139
## 2   11 15 2015-02-11 15:29:29    21.87847 31.06213 435.3903  949.1431
## 3   11 16 2015-02-11 16:29:59    21.90492 30.26449 429.9754  780.9932
## 4   11 17 2015-02-11 17:29:59    21.88514 29.96881 424.2994  735.5523
## 5   11 18 2015-02-11 18:29:29    21.63704 28.98953 172.0139  656.5917
## 6   11 19 2015-02-11 19:29:59    21.10109 29.80351   0.0000  550.8033
##   HumidityRatio Occupancy
## 1   0.005049894 1.0000000
## 2   0.005046090 0.9500000
## 3   0.004923533 1.0000000
## 4   0.004869331 1.0000000
## 5   0.004637511 0.4166667
## 6   0.004613468 0.0000000
# In this case one day is 24 entries
# We can compare this time series with the previous one to check that it is correctly built
myts <- ts(Occupancy_hour$Temperature, frequency = 24, start = 0)
plot(myts)


5 Conclusion


In this tutorial we have learnt how to build time series using different time binnings. Different time binnings require different frequencies.

The first example shows how to find the right granularity to explain a period of one year, while the second example shows how to correctly define the frequency for the same time period with different binnings.