Time & Place

The Code

For those interested, a brief look at how I put this together, starting with a simple csv.

Setup and Load

# Issue to Console:
# setwd('../GitHub/Timelines')
# rmarkdown::render('Time-And-Place.Rmd', output_file = 'Time-And-Place.html')

# cleanup environment
rm(list=ls())

# load libraries
install_packages <- function(package){
  
  newpackage <- package[!(package %in% installed.packages()[, "Package"])]
      
    if (length(newpackage)) {
        suppressMessages(install.packages(newpackage, dependencies = TRUE))
    }
    sapply(package, require, character.only = TRUE)
}


packages <- c("RJSONIO", "geosphere", "lubridate", "ggplot2", "tibble")
suppressPackageStartupMessages(install_packages(packages))

##   RJSONIO geosphere lubridate   ggplot2    tibble 
##      TRUE      TRUE      TRUE      TRUE      TRUE

# load simplified timeline
df <- read.csv("./data/timelines_simple.csv"
                , fileEncoding="UTF-8-BOM"
                , na.strings=c("NA","NULL","na","null","N/A","n/a","")
                , stringsAsFactors = FALSE)

# format date type
df$jdate <- as.Date(df$jdate, format="%m/%d/%Y")
df$mdate <- as.Date(df$mdate, format="%m/%d/%Y")

# peek
as_tibble(df)

## # A tibble: 38 x 8
##    jcity        jstate          jcountrycode jdate      mdate      mcity           mstate         mcountrycode
##    <chr>        <chr>           <chr>        <date>     <date>     <chr>           <chr>          <chr>       
##  1 Melbourne    Florida         US           1977-01-01 1977-01-01 São Paulo       São Paulo      BR          
##  2 Boulder      Colorado        US           1981-01-01 1990-06-01 Piracicaba      São Paulo      BR          
##  3 Fort Collins Colorado        US           1994-08-01 1990-07-01 São Paulo       São Paulo      BR          
##  4 Boulder      Colorado        US           1995-05-01 1991-12-01 Londrina        Paraná         BR          
##  5 Fort Collins Colorado        US           1995-08-01 1992-01-01 São Paulo       São Paulo      BR          
##  6 Sydney       New South Wales AU           1995-11-01 1992-12-01 Curitiba        Paraná         BR          
##  7 Fort Collins Colorado        US           1995-12-01 1993-01-01 São Paulo       São Paulo      BR          
##  8 Boulder      Colorado        US           1999-05-01 1994-01-01 Rio de Janeiro  Rio de Janeiro BR          
##  9 Kingston     Surrey County   JM           2003-06-01 1994-02-01 São Paulo       São Paulo      BR          
## 10 Boulder      Colorado        US           2003-07-01 1995-02-01 Puerto Quijarro Germán Busch   BO          
## # ... with 28 more rows

We simplified to a max of 1 location per month (of course there are many years without a single change) and all dates are the first of the month, that way I could make sure the y axis labels and graphics would be legible testing a single y value and checking the month spacing.

Wrangling

Here I create a new dataframe merging all dates in order, filling columns and rows as needed.

# get all dates, remove NAs and sort
all_dates <- c(df$jdate, df$mdate)
all_dates <- all_dates[!is.na(all_dates)]
all_dates <- sort(unique(all_dates))

# create a new dataframe
N <- length(all_dates)
dfm <- data.frame(
    'date' = all_dates
    , 'jcity' = rep(NA, N)
    , 'jstate' = rep(NA, N)
    , 'jcountrycode' = rep(NA, N)
    , 'mcity' = rep(NA, N)
    , 'mstate' = rep(NA, N)
    , 'mcountrycode' = rep(NA, N)
)

# fill cols
fill_col <- function(dfm, df, whosedate, col) {
    whosedate <- whosedate[!is.na(whosedate)]
    for (i in 1:nrow(dfm)) {
        dfm[i, colnames(dfm) == col] <- ifelse(dfm$date[i] %in% whosedate, 
            df[which(whosedate == dfm$date[i]), colnames(df) == col]
            , NA)
    }
    dfm
}

dfm <- fill_col(dfm, df, df$jdate, "jcity")
dfm <- fill_col(dfm, df, df$jdate, "jstate")
dfm <- fill_col(dfm, df, df$jdate, "jcountrycode")
dfm <- fill_col(dfm, df, df$mdate, "mcity")
dfm <- fill_col(dfm, df, df$mdate, "mstate")
dfm <- fill_col(dfm, df, df$mdate, "mcountrycode")

# fill all rows
fill_nas <- function(df, col) {
    for (i in 2:nrow(df)) {
        if (is.na(df[i, colnames(df) %in% col])) {
            df[i, colnames(df) %in% col] <- df[i-1, colnames(df) %in% col]
        }
    }
    df
}

dfm <- fill_nas(dfm, "jcity")
dfm <- fill_nas(dfm, "jstate")
dfm <- fill_nas(dfm, "jcountrycode")
dfm <- fill_nas(dfm, "mcity")
dfm <- fill_nas(dfm, "mstate")
dfm <- fill_nas(dfm, "mcountrycode")

# peek 
as_tibble(dfm)

## # A tibble: 71 x 7
##    date       jcity     jstate   jcountrycode mcity          mstate         mcountrycode
##    <date>     <chr>     <chr>    <chr>        <chr>          <chr>          <chr>       
##  1 1977-01-01 Melbourne Florida  US           São Paulo      São Paulo      BR          
##  2 1981-01-01 Boulder   Colorado US           São Paulo      São Paulo      BR          
##  3 1990-06-01 Boulder   Colorado US           Piracicaba     São Paulo      BR          
##  4 1990-07-01 Boulder   Colorado US           São Paulo      São Paulo      BR          
##  5 1991-12-01 Boulder   Colorado US           Londrina       Paraná         BR          
##  6 1992-01-01 Boulder   Colorado US           São Paulo      São Paulo      BR          
##  7 1992-12-01 Boulder   Colorado US           Curitiba       Paraná         BR          
##  8 1993-01-01 Boulder   Colorado US           São Paulo      São Paulo      BR          
##  9 1994-01-01 Boulder   Colorado US           Rio de Janeiro Rio de Janeiro BR          
## 10 1994-02-01 Boulder   Colorado US           São Paulo      São Paulo      BR          
## # ... with 61 more rows

Enrichment

Next I create a normalized table of locations avoid unnecessary API calls to openstreetmap where I get the latitude and longitudes of the cities in order to calculate the distances between them. I do this only once, given any location changes to the original CSV, so that in later code runs I can just pull up the normalized table from the data folder.

# only hit api if not exists (manually delete if changing input file)
if (!file.exists('./data/locs_df.csv')) {

    unique_locs <- unique(
        c(paste0(dfm$jcity, "|", dfm$jstate, "|", dfm$jcountrycode)
        , paste0(dfm$mcity, "|", dfm$mstate, "|", dfm$mcountrycode))
    )
    
    normalize_locs <- function(unique_locs) {
        N <- length(unique_locs)
        city <- state <- countrycode <- rep(NA, N)
        for (i in 1:N) {
            city[i] <- strsplit(unique_locs[i], "\\|")[[1]][1]
            state[i] <-  strsplit(unique_locs[i], "\\|")[[1]][2]
            countrycode[i] <-  strsplit(unique_locs[i], "\\|")[[1]][3]
        }
        df <- data.frame(
            'city' = city
            , 'state' = state
            , 'countrycode' = countrycode
        )
        df
    }
    
    locs_df <- normalize_locs(unique_locs)
    
    # instantiate lat long cols
    locs_df$lon <- locs_df$lat <- 0
    
    get_latlongs <- function(dfm) {
    
        for (i in 1:nrow(dfm)) {
    
            city_i <- gsub(' ', '%20', dfm$city[i])
            state_i <-  gsub(' ', '%20', dfm$state[i])
            countrycode_i <- dfm$countrycode[i]
    
            url <- paste0(
                "http://nominatim.openstreetmap.org/search?city=", city_i
                , "&state=", state_i
                , "&countrycodes=", countrycode_i
                , "&limit=4&format=json", sep="")
    
            data <- fromJSON(url)
    
            if (length(data) == 0) {
                print(paste0("Error requesting: ", url))
            }
            else if (is.vector(data)) {
                dfm$lon[i] <- as.numeric(data[[1]]$lon)
                dfm$lat[i] <- as.numeric(data[[1]]$lat)
            } 
            else {
                dfm$lon[i] <- as.numeric(data$lon)
                dfm$lon[i] <- as.numeric(data$lat)
            }
        }
        dfm
    }
    
    locs_df <- get_latlongs(locs_df)
    
    # save
    write.csv(locs_df, './data/locs_df.csv', row.names=FALSE)
}

# load locations
locs_df <- read.csv('./data/locs_df.csv')

# peek
as_tibble(locs_df)

## # A tibble: 38 x 5
##    city                    state           countrycode   lat     lon
##    <chr>                   <chr>           <chr>       <dbl>   <dbl>
##  1 Melbourne               Florida         US           28.1  -80.6 
##  2 Boulder                 Colorado        US           40.0 -105.  
##  3 Fort Collins            Colorado        US           40.6 -105.  
##  4 Sydney                  New South Wales AU          -33.9  151.  
##  5 Kingston                Surrey County   JM           18.0  -76.8 
##  6 Skye                    Highland        GB           57.4   -6.30
##  7 Wheat Ridge             Colorado        US           39.8 -105.  
##  8 Jemez                   New Mexico      US           35.6 -107.  
##  9 Kirribilli              New South Wales AU          -33.8  151.  
## 10 Saint Mary-of-the-Woods Indiana         US           39.5  -87.5 
## # ... with 28 more rows

Now I add the coordinates to the original dataframe, compute Haversine distances, and divide them in half for the plot. Distances are in kilometers, because science.

# enrich dfm with location data

# add j's lat, lon
dfm <- merge(
    dfm, locs_df
    , by.x=c('jcity', 'jstate', 'jcountrycode')
    , by.y = c('city', 'state', 'countrycode')
    , all = FALSE
)

dfm$jlat <- dfm$lat 
dfm$jlon <- dfm$lon 
dfm <- dfm[, !colnames(dfm) %in% c('lat','lon')]

# add m's lat, lon
dfm <- merge(dfm, locs_df, by.x=c('mcity', 'mstate', 'mcountrycode')
            , by.y = c('city', 'state', 'countrycode'), all = FALSE)
dfm$mlat <- dfm$lat 
dfm$mlon <- dfm$lon 
dfm <- dfm[, !colnames(dfm) %in% c('lat','lon')]

# sort by date
dfm <- dfm[order(dfm$date), ]

# Convert coords into distances

# calculate Haversine distance (in meters)
dfm$meters <- 0
for (i in 1:nrow(dfm)) {
    dfm$meters[i] <- distm(c(dfm$mlon[i], dfm$mlat[i])
        , c(dfm$jlon[i], dfm$jlat[i]), fun = distHaversine)
}

# calculate distance to midpoint (in km)
dfm$distance <- as.integer(round(dfm$meters / 2 / 1000))

# subset & reindex
dfm <- dfm[, c('date', 'jcity', 'mcity', 'jcountrycode', 'mcountrycode','distance')]
rownames(dfm) <- 1:nrow(dfm)

# peek
as_tibble(dfm)

## # A tibble: 71 x 6
##    date       jcity     mcity          jcountrycode mcountrycode distance
##    <date>     <chr>     <chr>          <chr>        <chr>           <int>
##  1 1977-01-01 Melbourne São Paulo      US           BR               3405
##  2 1981-01-01 Boulder   São Paulo      US           BR               4663
##  3 1990-06-01 Boulder   Piracicaba     US           BR               4594
##  4 1990-07-01 Boulder   São Paulo      US           BR               4663
##  5 1991-12-01 Boulder   Londrina       US           BR               4504
##  6 1992-01-01 Boulder   São Paulo      US           BR               4663
##  7 1992-12-01 Boulder   Curitiba       US           BR               4656
##  8 1993-01-01 Boulder   São Paulo      US           BR               4663
##  9 1994-01-01 Boulder   Rio de Janeiro US           BR               4753
## 10 1994-02-01 Boulder   São Paulo      US           BR               4663
## # ... with 61 more rows

Munging

Here I prepare some graphical parameters and merge into a tall-skinny dataframe at the month level (technically, first of the month).

# monthly and yearly frames
month_date_range <- seq(min(dfm$date), max(dfm$date), by='month')
month_format <- format(month_date_range, '%b')
month_df <- data.frame(month_date_range, month_format)

year_date_range <- c(seq(min(dfm$date) - years(1), max(dfm$date) + years(1), by='year'))
year_date_range <- as.Date(
    intersect(
        floor_date(year_date_range, unit="year")
        , ceiling_date(year_date_range, unit="year")
    ),  origin = "1970-01-01"
)
year_format <- format(year_date_range, '%Y')
year_df <- data.frame(year_date_range, year_format)

# create tall skinny date frame with all months
N <- length(month_date_range)
dfs <- data.frame(
    'date' = month_date_range
)

dfm$month <- as.integer(month(dfm$date))
dfm$year <- as.integer(year(dfm$date))

# Merge dfm into dfs frame for horizontal bars
mm <-  merge(dfs, dfm, by = c('date'), all = TRUE)
# reorder columns
mm <- mm[, c('date', 'month', 'year', 'jcity', 'jcountrycode', 'mcity', 'mcountrycode', 'distance')]

# fill out NAs
mm <- fill_nas(mm, "month")
mm <- fill_nas(mm, "year")
mm <- fill_nas(mm, "jcity")
mm <- fill_nas(mm, "jcountrycode")
mm <- fill_nas(mm, "mcity")
mm <- fill_nas(mm, "mcountrycode")
mm <- fill_nas(mm, "distance")

# peek 
as_tibble(mm)

## # A tibble: 439 x 8
##    date       month  year jcity     jcountrycode mcity     mcountrycode distance
##    <date>     <int> <int> <chr>     <chr>        <chr>     <chr>           <int>
##  1 1977-01-01     1  1977 Melbourne US           São Paulo BR               3405
##  2 1977-02-01     1  1977 Melbourne US           São Paulo BR               3405
##  3 1977-03-01     1  1977 Melbourne US           São Paulo BR               3405
##  4 1977-04-01     1  1977 Melbourne US           São Paulo BR               3405
##  5 1977-05-01     1  1977 Melbourne US           São Paulo BR               3405
##  6 1977-06-01     1  1977 Melbourne US           São Paulo BR               3405
##  7 1977-07-01     1  1977 Melbourne US           São Paulo BR               3405
##  8 1977-08-01     1  1977 Melbourne US           São Paulo BR               3405
##  9 1977-09-01     1  1977 Melbourne US           São Paulo BR               3405
## 10 1977-10-01     1  1977 Melbourne US           São Paulo BR               3405
## # ... with 429 more rows

Then I reshape into a single city and countrycode, doubling their values, distances, and adding directions for labeling and separating “who” those values refer to. I do this twice, once at the who-month level and once at the who-newplace level. The first is for the horizontal bars (distances) so I call it bars_df and the secon is for labeling changes in location so I call it labs_df.

# Reshape into single city, countrycode + distance + direction for labels for whom
# mm: tall with all dates for horiz bars
# dfm: only changing dates for city labels
reshape_df <- function(df) {

    out <- data.frame(
        'date' = rep(df$date, each = 2)
        , 'month' = rep(df$month, each = 2)
        , 'year' = rep(df$year, each = 2)
        , 'city' = c(rbind(df$jcity, df$mcity))
        , 'countrycode' = c(rbind(df$jcountrycode, df$mcountrycode))
        , 'distance' = c(rbind(df$distance, -df$distance))
        , 'direction' = as.integer(rep(c(1, -1), nrow(df)))
    )

    # factor country codes
    country_codes <- sort(unique(out$countrycode))
    out$countrycode <- factor(out$countrycode, levels = country_codes, ordered = TRUE)
    # add  'who' column
    who_levels <- c('Jenny', 'Marcelo')
    out$who <- rep(NA, nrow(out))
    out$who[out$direction == 1] <- who_levels[1]
    out$who[out$direction == -1] <- who_levels[2]
    out$who <- factor(out$who, levels = who_levels, ordered = TRUE)
    out <- out[, c('date', 'month', 'year', 'city', 'countrycode', 'distance', 'direction', 'who')]
    # return
    out
}

labs_df <- reshape_df(dfm)
bars_df <- reshape_df(mm)


# save
write.csv(labs_df, "./data/labs_df.csv", row.names = FALSE)
write.csv(bars_df, "./data/bars_df.csv", row.names = FALSE)

# peek
as_tibble(labs_df)

## # A tibble: 142 x 8
##    date       month  year city       countrycode distance direction who    
##    <date>     <int> <int> <chr>      <ord>          <int>     <int> <ord>  
##  1 1977-01-01     1  1977 Melbourne  US              3405         1 Jenny  
##  2 1977-01-01     1  1977 São Paulo  BR             -3405        -1 Marcelo
##  3 1981-01-01     1  1981 Boulder    US              4663         1 Jenny  
##  4 1981-01-01     1  1981 São Paulo  BR             -4663        -1 Marcelo
##  5 1990-06-01     6  1990 Boulder    US              4594         1 Jenny  
##  6 1990-06-01     6  1990 Piracicaba BR             -4594        -1 Marcelo
##  7 1990-07-01     7  1990 Boulder    US              4663         1 Jenny  
##  8 1990-07-01     7  1990 São Paulo  BR             -4663        -1 Marcelo
##  9 1991-12-01    12  1991 Boulder    US              4504         1 Jenny  
## 10 1991-12-01    12  1991 Londrina   BR             -4504        -1 Marcelo
## # ... with 132 more rows

Plot

First I reload the saved data and set up more graphical parameters, including the position of the text.

# reload data
load_data <- function(path) {
    read.csv(path
        , fileEncoding = 'latin1'
        , colClasses = c("Date", "integer", "integer", "character"
                        , "factor", "integer", "integer", "factor")
    )
}

bars_df <- load_data('./data/bars_df.csv')
labs_df <- load_data('./data/labs_df.csv')


# 10% max dist offset (used variously)
offset <- max(bars_df$distance)/10

# text position
bars_df$text_position <- (offset * 1.2 * bars_df$direction) + bars_df$distance
labs_df$text_position <- (offset * 1.2 * labs_df$direction) + labs_df$distance

# fix for close distances
fix_text_pos <- function(df) {
    df$text_position <- ifelse(df$direction == 1 & df$text_position < 2000, 2000 + offset, 
                        ifelse(df$direction == -1 & df$text_position > -2000, -2000 - offset
                        , df$text_position))
    df$text_position
}

bars_df$text_position <- fix_text_pos(bars_df)
labs_df$text_position <- fix_text_pos(labs_df)


# setup colors and levels
who_colors = c('blue3', 'darkred')
who_levels = c('Jenny', 'Marcelo')

# create arbitrary dataframe for meetings
meet_df <- data.frame(
    'date' = as.Date(c('2003-03-01', '2013-04-01'), format = '%Y-%m-%d')
    , 'distance' = as.integer(c(2500, 2500))
    , 'text' = c('Unintentional Meeting', 'Intentional Meeting')
)


# final peek
as_tibble(labs_df)

## # A tibble: 142 x 9
##    date       month  year city       countrycode distance direction who     text_position
##    <date>     <int> <int> <chr>      <fct>          <int>     <int> <fct>           <dbl>
##  1 1977-01-01     1  1977 Melbourne  US              3405         1 Jenny           4325.
##  2 1977-01-01     1  1977 São Paulo  BR             -3405        -1 Marcelo        -4325.
##  3 1981-01-01     1  1981 Boulder    US              4663         1 Jenny           5583.
##  4 1981-01-01     1  1981 São Paulo  BR             -4663        -1 Marcelo        -5583.
##  5 1990-06-01     6  1990 Boulder    US              4594         1 Jenny           5514.
##  6 1990-06-01     6  1990 Piracicaba BR             -4594        -1 Marcelo        -5514.
##  7 1990-07-01     7  1990 Boulder    US              4663         1 Jenny           5583.
##  8 1990-07-01     7  1990 São Paulo  BR             -4663        -1 Marcelo        -5583.
##  9 1991-12-01    12  1991 Boulder    US              4504         1 Jenny           5424.
## 10 1991-12-01    12  1991 Londrina   BR             -4504        -1 Marcelo        -5424.
## # ... with 132 more rows

Finally, I plot a very customized ggplot and save it as png.

# Plot
g <- ggplot(
        data = bars_df, aes(x = 0, y = date)
    ) +
    labs(color = "Who") +
    scale_color_manual(values = who_colors, labels = who_levels) +
    theme_classic()

# Plot: horizontal segment thick lines for all distances
g <- g + geom_segment(
    data = bars_df
        , aes(x = distance, xend = 0, yend = date)
        , color = 'azure2'
        , size = 8
)

# Plot: horizontal segment dotted lines for new cities
g <- g + geom_segment(
    data = labs_df
        , aes(x = distance, xend = 0, yend = date)
        , color = 'black'
        , size = 0.5
        , lty = 2
)

# Plot: scatter diamonds at timeline ends
g <- g +  geom_point(
        data = labs_df
        , aes(x = distance, color = who)
        , size = 5
        , pch = 18
)

# Plot: arbitrary scatter arrows at timeline ends
g <- g +  geom_point(
        data = meet_df
        , aes(x = 600, y = date)
        , color = 'maroon4'
        , size = 16
        , pch = -8592 # left arrow
)

# Plot: theme - less ink
g <- g + theme(
    legend.position = "top"
        , axis.title.x = element_blank()
        , axis.title.y = element_blank()
        , axis.line.x = element_blank()
        , axis.line.y = element_blank()
        , axis.text.x = element_blank()
        , axis.text.y = element_blank()
        , axis.ticks.x = element_blank()
        , axis.ticks.y = element_blank()
        , panel.background = element_rect(fill = "transparent")
        , plot.background = element_rect(fill = "transparent", color = NA)
        , legend.background = element_rect(fill = "transparent")
        , legend.title = element_text(size = 16)
        , legend.text = element_text(size = 14)
)

# Plot: vertical timeline
g <- g + geom_vline(
    xintercept = 0
        , color = "black"
        , size = 0.5
)

# Plot: year text
g <- g + geom_text(
    data = year_df
        , aes(x = -offset * 6/7, y = year_date_range, label = year_format, fontface = "bold")
        , check_overlap = TRUE
        , size = 4
        , vjust = 0
        , nudge_y = 5
        , color = "black"
)

# Plot: month text
g <- g + geom_text(
    data = month_df[seq(from = 2, to = nrow(month_df), by = 3), ]
        , aes(x = -offset * 1/4, y = month_date_range, label = month_format, fontface = "italic")
        , check_overlap = TRUE
        , size = 3
        , vjust = 0
        , nudge_y = 5
        , color = "black"
)

# Plot: city text
g <- g + geom_text(
   data = labs_df
       , aes(x = text_position, label = paste0(city, ', ', countrycode))
       , check_overlap = TRUE
       , size = 4
       # to override manual color, length must be 1 or df nrows
       , color = rep(who_colors, nrow(labs_df)/2)
)

# Plot: distance text
g <- g + geom_text(
   data = labs_df[labs_df$direction == 1, ]
       , aes(x = offset * 1/2, label = paste0(distance * 2, ' km'), fontface = "italic")
       , check_overlap = TRUE
       , size = 3.5
       , vjust = 0
       , nudge_y = 5
       # to override manual color, length must be 1 or df nrows
       , color = "black"
)

# Plot: meeting text
g <- g + geom_text(
   data = meet_df
       , aes(x = distance, label = text, fontface = "bold")
       , check_overlap = TRUE
       , size = 6
       , color = "maroon4"
)


# Save as png
ggsave(
    filename = "./img/timeline.png"
    , plot = last_plot()
    , device = "png"
    , width = 400
    , height = 2400
    , units = "mm"
    , limitsize = FALSE
    , dpi = 300
    , bg = "transparent"
)

# creates Rcode file with R code, set documentation = 1 to avoid text commentary
# library(knitr)
# options(knitr.purl.inline = TRUE)
# purl("Time-And-Place.Rmd", output = "Rcode.R", documentation = 1)

Back to Contents

Time & Place

Marcelo Sanches

4/10/2022

Contents

The Story