1 Goal


The goal of this tutorial is to learn how to properly define a table in order to draw a different line per year when plotting a variable per month.


2 Data preparation


#First we load the libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(ggplot2)

# This tutorial uses the daily minimum temperatures recorded in Melbourne
# https://datamarket.com/data/set/2324/daily-minimum-temperatures-in-melbourne-australia-1981-1990
# Text columns are kept as plain character vectors (no automatic factors)
Temperatures <- read.csv(
  file = "daily-minimum-temperatures-in-me.csv",
  stringsAsFactors = FALSE
)
head(Temperatures)
##         Date Daily.minimum.temperatures.in.Melbourne..Australia..1981.1990
## 1 1981-01-01                                                          20.7
## 2 1981-01-02                                                          17.9
## 3 1981-01-03                                                          18.8
## 4 1981-01-04                                                          14.6
## 5 1981-01-05                                                          15.8
## 6 1981-01-06                                                          15.8
# Give the two columns short, convenient names
names(Temperatures) <- c("Date", "Temperature")

# Parse the "year-month-day" strings straight into POSIXct
Temperatures$Date <- as.POSIXct(Temperatures$Date, format = "%Y-%m-%d")
# Store the temperature as a number
Temperatures$Temperature <- as.numeric(Temperatures$Temperature)

# Inspect the resulting column types
str(Temperatures)
## 'data.frame':    3652 obs. of  2 variables:
##  $ Date       : POSIXct, format: "1981-01-01" "1981-01-02" ...
##  $ Temperature: num  20.7 17.9 18.8 14.6 15.8 15.8 15.8 17.4 21.8 20 ...
# Derive the calendar fields used later from the Date column:
#   Month - month number of each observation
#   Year  - year of each observation, stored as a plain number
Temperatures <- mutate(
  Temperatures,
  Month = month(Date),
  Year = as.numeric(as.character(year(Date)))
)

# Inspect the table again to confirm the new columns
str(Temperatures)
## 'data.frame':    3652 obs. of  4 variables:
##  $ Date       : POSIXct, format: "1981-01-01" "1981-01-02" ...
##  $ Temperature: num  20.7 17.9 18.8 14.6 15.8 15.8 15.8 17.4 21.8 20 ...
##  $ Month      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Year       : num  1981 1981 1981 1981 1981 ...
# Keep only the four years 1981 to 1984 for this exercise
Temperatures <- Temperatures[Temperatures$Year %in% 1981:1984, ]
# Confirm which years remain
unique(Temperatures$Year)
## [1] 1981 1982 1983 1984
# Now we are going to aggregate the points to get only one point per month.
# MonthYear is a "YYYY MM" label that identifies each month of each year.
Temperatures <- mutate(
  Temperatures,
  MonthYear = paste(year(Date), formatC(month(Date), width = 2, flag = "0"))
)
# Average only the columns that can be averaged, grouped by MonthYear.
# MonthYear itself is text, so it is used as the (named) grouping key instead
# of being passed to mean(), which would only produce a column of NA together
# with warnings. na.rm = TRUE is forwarded to mean() so missing values are
# ignored within each month.
Temps_month <- aggregate(
  Temperatures[c("Date", "Temperature", "Month", "Year")],
  by = list(MonthYear = Temperatures$MonthYear),
  FUN = mean,
  na.rm = TRUE
)

head(Temps_month)
##   MonthYear                Date Temperature Month Year
## 1   1981 01 1981-01-16 00:00:00   17.712903     1 1981
## 2   1981 02 1981-02-14 12:00:00   17.678571     2 1981
## 3   1981 03 1981-03-15 23:56:07   13.500000     3 1981
## 4   1981 04 1981-04-15 12:00:00   12.356667     4 1981
## 5   1981 05 1981-05-16 00:00:00    9.490323     5 1981
## 6   1981 06 1981-06-15 12:00:00    7.306667     6 1981

3 Drawing one line per year


# Start from a deliberately wrong column setup so we can see the broken plot
# and then learn how to fix it: month as a factor, year as a number
Temps_month$Year <- as.numeric(Temps_month$Year)
Temps_month$Month <- factor(Temps_month$Month)

# A scatter plot gives a first look at the data
ggplot(Temps_month, aes(x = Month, y = Temperature, color = Year)) +
  geom_point()

# Now try to draw the lines
ggplot(Temps_month, aes(x = Month, y = Temperature, color = Year)) +
  geom_line()

# This is not what we wanted
# The first problem is that the month is a factor so ggplot doesn't understand that it has to connect consecutive months
# The second problem is that the year is numeric, so ggplot treats it as a continuous value and doesn't know that it has to draw a separate line for each year

# Now we can make the months numerical and see the effect on the plot.
# A factor has to go through as.character() first: as.numeric() applied
# directly to a factor returns its internal level codes, which only coincide
# with the month numbers when the levels happen to be 1..12 in order.
Temps_month$Month <- as.numeric(as.character(Temps_month$Month))

# Let's draw
ggplot() + geom_line(data = Temps_month, aes(x = Month, y = Temperature, color = Year))

# With numerical months ggplot now understands how to connect them
# The only thing left to get the plot right is to turn the variable used for
# the color into a factor

Temps_month$Year <- factor(Temps_month$Year)

# Draw again: same aesthetics as before, declared once for the whole plot
ggplot(Temps_month, aes(x = Month, y = Temperature, color = Year)) +
  geom_line()


4 Quick solution


# In our dataset we usually have three variables
# 1. The x variable should be numerical in order to properly draw the line.
#    Going through as.character() keeps the real month values even when the
#    column is a factor (as.numeric() alone would return the level codes).
Temps_month$Month <- as.numeric(as.character(Temps_month$Month))
# 2. The y variable should also be numerical for the same reason
Temps_month$Temperature <- as.numeric(Temps_month$Temperature)
# 3. The variable used to color and split the data should be a factor so lines are properly drawn
Temps_month$Year <- factor(Temps_month$Year)

# If we follow these simple steps we will always obtain the type of plot that we want
# Let's draw (one axis break per month)
ggplot() +
  geom_line(data = Temps_month, aes(x = Month, y = Temperature, color = Year)) +
  scale_x_continuous(breaks = 1:12)


5 Conclusion


In this tutorial we have learnt how to properly define the variables in order to get the type of line plot that we want. The x and y variables have to be numerical, and the variable used to color and split the data has to be a factor.