library(jsonlite)
library(dplyr)
library(ggplot2)
library(lubridate)
library(leaflet)
library(leaflet.extras)

Google Location History

Your Google location history can be downloaded from your Google account at https://takeout.google.com/settings/takeout. Google provides the data as a .json file, which can be loaded with the jsonlite package. Loading this file into R might take a few minutes, depending on how many location points Google has saved about you.

datos <- fromJSON("Location History.json")
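
Since parsing the raw JSON is the slow part, it can help to cache the parsed object and reload it on later runs. A minimal sketch (the .rds file name is just an example):

# Parse the export once and cache it; readRDS() is much faster than
# re-parsing the JSON on every run
if (file.exists("location_history.rds")) {
  datos <- readRDS("location_history.rds")
} else {
  datos <- fromJSON("Location History.json")
  saveRDS(datos, "location_history.rds")
}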

Exploratory Data Analysis

Within the parsed JSON, the location records are stored as a data frame under locations (datos$locations).

class(datos)
## [1] "list"
attributes(datos)
## $names
## [1] "locations"
class(datos$locations)
## [1] "data.frame"
# extract location dataframe
df <- datos$locations

rm(datos)
glimpse(df)
## Observations: 1,691,969
## Variables: 9
## $ timestampMs      <chr> "1307998861249", "1307998872287", "1307998876320",...
## $ latitudeE7       <int> 403240840, 403242250, 403245230, 403240310, 403240...
## $ longitudeE7      <int> -37778120, -37776070, -37778520, -37775600, -37775...
## $ accuracy         <int> 232, 93, 46, 34, 22, 22, 22, 25, 2, 2, 2, 2, 2, 2,...
## $ activity         <list> [NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ...
## $ altitude         <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ velocity         <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ heading          <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ verticalAccuracy <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
sapply(df, function(x) sum(is.na(x)))
##      timestampMs       latitudeE7      longitudeE7         accuracy 
##                0                0                0                3 
##         activity         altitude         velocity          heading 
##                0          1439462          1618278          1658690 
## verticalAccuracy 
##          1497313
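
With roughly 1.7 million rows, these counts are easier to judge as shares of the whole data set:

# Same check, expressed as the share of missing values per column
sapply(df, function(x) round(mean(is.na(x)), 2))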

Data Cleaning and Transformation

# Keep only the rows that carry activity information
df <- df %>% filter(activity != "NULL")

# Convert the coordinates and time stamps into a more readable form
df <- df %>% mutate(time      = as_datetime(as.numeric(timestampMs)/1000),
                    date      = date(time),
                    hour.min  = paste(hour(time), minute(time), sep = ":"),
                    week      = isoweek(time),
                    year      = isoyear(time),
                    latitude  = latitudeE7/1e7,
                    longitude = longitudeE7/1e7) %>%
             select(-timestampMs, -latitudeE7, -longitudeE7, -time)
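
A quick sanity check that the E7 conversion produced plausible coordinates (latitudes within -90..90, longitudes within -180..180):

# The converted coordinates should fall inside the valid ranges
summary(df$latitude)
summary(df$longitude)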

# Extract the time stamp of each activity record
act.timestamp <- as.character(sapply(df$activity, function(x) (x[[1]][[1]])))

# Extract the activity type with the highest confidence (listed first)
act.actividad <- sapply(df$activity, function(x) (x[[2]][[1]][1]))
act.actividad <- sapply(act.actividad, function(x) (x[[1]][1]))

act.actividad <- unlist(act.actividad)
df <- df %>% mutate(act.time = as_datetime(as.numeric(act.timestamp)/1000),
                    act.date = date(act.time),
                    act.hour  = hour(act.time),
                    act.hour.min = paste(act.hour,minute(act.time),sep=":"),
                    act.weekday = wday(act.time, label=T,week_start=1, abbr = F),
                    act.activity = act.actividad )
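
For reference, the same extraction can be written with purrr instead of nested sapply() calls. This is only a sketch (not run here); it assumes the usual Takeout structure, i.e. each element of df$activity is a data frame with a timestampMs column and a nested activity data frame whose first column, type, is sorted by confidence. The name act.alt is just illustrative.

# Hedged purrr alternative to the sapply() extraction above
library(purrr)
act.alt <- map_dfr(df$activity, function(a) {
  tibble(act.timestamp = a$timestampMs[1],        # time stamp of the activity record
         act.type      = a$activity[[1]]$type[1]) # most confident activity type
})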

# Column classes after the transformation
sapply(df, class)
## $accuracy
## [1] "integer"
## 
## $activity
## [1] "list"
## 
## $altitude
## [1] "integer"
## 
## $velocity
## [1] "integer"
## 
## $heading
## [1] "integer"
## 
## $verticalAccuracy
## [1] "integer"
## 
## $date
## [1] "Date"
## 
## $hour.min
## [1] "character"
## 
## $week
## [1] "numeric"
## 
## $year
## [1] "numeric"
## 
## $latitude
## [1] "numeric"
## 
## $longitude
## [1] "numeric"
## 
## $act.time
## [1] "POSIXct" "POSIXt" 
## 
## $act.date
## [1] "Date"
## 
## $act.hour
## [1] "integer"
## 
## $act.hour.min
## [1] "character"
## 
## $act.weekday
## [1] "ordered" "factor" 
## 
## $act.activity
## [1] "character"
rm(act.actividad, act.timestamp)  # the helper vectors are no longer needed

How long has Google been collecting data?

summary(df$date)
##         Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
## "2013-07-12" "2015-01-02" "2016-09-26" "2016-07-01" "2017-10-26" "2019-06-12"
n_distinct(df$date)
## [1] 2144
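
The history therefore spans almost six years, but not every day in that window contains a fix; a quick way to quantify the coverage:

# Days between the first and last fix versus days that actually contain data
span.days    <- as.numeric(max(df$date) - min(df$date)) + 1
tracked.days <- n_distinct(df$date)
c(span = span.days, tracked = tracked.days, coverage = round(tracked.days/span.days, 2))
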
head(df %>% group_by(year,week) %>% summarise(n=n()),10) 
## # A tibble: 10 x 3
## # Groups:   year [1]
##     year  week     n
##    <dbl> <dbl> <int>
##  1  2013    28   791
##  2  2013    29  1545
##  3  2013    30  1873
##  4  2013    31  3089
##  5  2013    32  3519
##  6  2013    33  1703
##  7  2013    34  2899
##  8  2013    35  3134
##  9  2013    36  3089
## 10  2013    37  3534
df %>% group_by(year,week) %>% summarise(n=n())%>% summarise(media=mean(n))
## # A tibble: 7 x 2
##    year media
##   <dbl> <dbl>
## 1  2013 3032.
## 2  2014 2655.
## 3  2015 2180.
## 4  2016 2972.
## 5  2017 3671.
## 6  2018 2550.
## 7  2019 2242.
df %>% group_by(year,week) %>% summarise(n=n())%>% summarise(media=round(mean(n)/24))
## # A tibble: 7 x 2
##    year media
##   <dbl> <dbl>
## 1  2013   126
## 2  2014   111
## 3  2015    91
## 4  2016   124
## 5  2017   153
## 6  2018   106
## 7  2019    93
df %>%  group_by(week,year) %>% summarise(n = n()) %>%
   ggplot( aes(x=week, y=n)) +
      geom_bar(stat="identity") +
      facet_grid(facets = year ~ .) +
      scale_x_continuous(breaks = c(1:54)) +
      labs(x = "Week of year", y = "Entries",
      title="Google Location: Tracks per week") +
      theme_bw()

How accurate are these measurements?

summary(df$accuracy)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2      22      24     218      29 4984961
x <- df %>% filter(accuracy < 4000)
ggplot(x, aes(accuracy)) +
  geom_density(size = 1, col = 'grey') +
  coord_cartesian(xlim = c(0, 2000)) +
  theme_bw()
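
Another way to read the accuracy column is to bucket it into rough bands (in metres); the cut-offs below are arbitrary choices, not values defined by the export:

# Number of fixes per (arbitrary) accuracy band
df %>%
  filter(!is.na(accuracy)) %>%
  mutate(accuracy.band = cut(accuracy,
                             breaks = c(0, 30, 100, 1000, Inf),
                             labels = c("<= 30 m", "30-100 m", "100-1000 m", "> 1000 m"))) %>%
  count(accuracy.band)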

Altitude Variation

temp <- df %>% filter(!is.na(altitude)) %>% arrange(date)
temp[1000:15000,]  %>%
   ggplot(aes(x=as.Date(date),y=altitude)) +
   geom_point() +
   theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_x_date(breaks = function(x) seq.Date(from = min(x), to = max(x), by = "1 week"),
                minor_breaks = function(x) seq.Date(from = min(x), to = max(x), by = "1 week")) +
   ggtitle("Altitude variation") + labs(x="Date")

df %>% filter(!is.na(altitude)) %>% arrange(date) %>%
  ggplot(aes(x=as.Date(date),y=altitude)) +
  geom_point() +
  theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  scale_x_date(breaks = function(x) seq.Date(from = min(x), to = max(x), by = "1 month"),
                minor_breaks = function(x) seq.Date(from = min(x), to = max(x), by = "1 month")) +
   ggtitle("Altitude Variation") + labs(x="Date")

Activities

df %>% ggplot(aes(x = act.activity)) +
   geom_bar() +
   labs(x = "Activity", y = "Entries", title = "Main activities") +
   theme_bw()
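
The counts are easier to compare as shares of all activity records:

# Share of each activity type over the whole history
df %>%
  filter(!is.na(act.activity)) %>%
  count(act.activity, sort = TRUE) %>%
  mutate(share = round(n / sum(n), 3))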

Frequency of the main activities over the course of a day, by hour.

unique(df$act.activity)
## [1] "STILL"           "IN_VEHICLE"      "ON_FOOT"         "ON_BICYCLE"     
## [5] "TILTING"         "UNKNOWN"         "EXITING_VEHICLE"
df %>% filter((!is.na(act.activity)) & (!is.na(act.hour))) %>% 
  ggplot(aes(x=act.hour)) +
  geom_bar()+
  coord_cartesian(xlim=c(0,24))+
  facet_wrap(~act.activity,scales='free') +
  theme_bw() + labs(x="Hours (0..24)")

Frequency of different activities by weekday.

df %>% 
  select(act.activity,act.weekday) %>%
  filter((!is.na(act.activity)) & (!is.na(act.weekday))) %>%
  ggplot(aes(x=act.activity)) + 
  geom_bar() +
  facet_wrap(~act.weekday, scales = 'free', ncol=4) +
  theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
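
Location Heatmap

Finally, the cleaned coordinates can be shown on interactive maps: the first map draws a WebGL heatmap over the default OpenStreetMap tiles, the second combines a heatmap layer with clustered markers on CartoDB tiles.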

leaflet(df) %>%
  addTiles() %>%
  addWebGLHeatmap(size = 10, units = 'px')
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
myMap = leaflet(df) %>% 
  addProviderTiles(providers$CartoDB.Positron) %>%
  fitBounds(~min(longitude), ~min(latitude), ~max(longitude), ~max(latitude)) %>%  
  addHeatmap(lng = ~longitude, lat = ~latitude, group = "HeatMap", blur = 20, max = 0.01, radius = 15) %>%
  addMarkers(data = df, ~longitude, ~latitude, clusterOptions = markerClusterOptions(), group = "Points")
  
myMap
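
To keep the interactive map outside the R session, it can be written to a standalone HTML file (the file name is just an example):

# Save the map as a self-contained HTML file
library(htmlwidgets)
saveWidget(myMap, "location_heatmap.html", selfcontained = TRUE)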