library(jsonlite)
library(dplyr)
library(ggplot2)
library(lubridate)
library(leaflet)
library(leaflet.extras)
Your Google location history can be downloaded from your Google account at https://takeout.google.com/settings/takeout. Google provides the data as a .json file, which can be loaded with the jsonlite package. Loading this file into R might take a few minutes, depending on how many location points Google has saved about you.
# Parse the Takeout export from the working directory into an R object.
datos <- jsonlite::fromJSON(txt = "Location History.json")
The parsed JSON is a list; the data itself is a data frame stored under its locations element (datos$locations).
# Inspect the top-level structure of the parsed export.
class(datos)
## [1] "list"
attributes(datos)
## $names
## [1] "locations"
class(datos[["locations"]])
## [1] "data.frame"
# Keep only the locations data frame and remove the enclosing list.
df <- datos[["locations"]]
rm(list = "datos")
# Compact overview of the columns and their types.
glimpse(df)
## Observations: 1,691,969
## Variables: 9
## $ timestampMs <chr> "1307998861249", "1307998872287", "1307998876320",...
## $ latitudeE7 <int> 403240840, 403242250, 403245230, 403240310, 403240...
## $ longitudeE7 <int> -37778120, -37776070, -37778520, -37775600, -37775...
## $ accuracy <int> 232, 93, 46, 34, 22, 22, 22, 25, 2, 2, 2, 2, 2, 2,...
## $ activity <list> [NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ...
## $ altitude <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ velocity <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ heading <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ verticalAccuracy <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# Count the missing values in every column. vapply() pins the result to
# exactly one integer per column, so the return type cannot silently change
# the way sapply()'s can (e.g. for a data frame without columns).
vapply(df, function(column) sum(is.na(column)), integer(1))
## timestampMs latitudeE7 longitudeE7 accuracy
## 0 0 0 3
## activity altitude velocity heading
## 0 1439462 1618278 1658690
## verticalAccuracy
## 1497313
# Drop the rows whose `activity` list element is NULL. The elements are tested
# directly with is.null() instead of comparing the list column with the string
# "NULL", which only works through implicit list-to-character coercion.
df <- df %>%
  filter(!vapply(activity, is.null, logical(1)))
## Convert the position and time stamps into a more readable form.
## Columns are referenced by name (not via df$...) so every expression is
## evaluated against the data frame that flows through the pipe.
df <- df %>%
  mutate(
    # timestampMs is stored as character; divide by 1000 to hand seconds
    # to as_datetime()
    time = as_datetime(as.numeric(timestampMs) / 1000),
    date = date(time),
    # hour and minute are pasted as-is, i.e. without zero padding
    hour.min = paste(hour(time), minute(time), sep = ":"),
    week = isoweek(time),
    year = isoyear(time),
    # the E7 columns are scaled down by 1e7
    latitude = latitudeE7 / 1e7,
    longitude = longitudeE7 / 1e7
  ) %>%
  # the raw columns and the intermediate `time` helper are no longer needed
  select(-timestampMs, -latitudeE7, -longitudeE7, -time)
##Convert the position and time stamps into a more readable form
##df <- df %>% mutate(time = as_datetime(as.numeric(df$timestampMs)/1000),
## date = date(time),
## hour.min = paste(hour(time),minute(time),sep=":")) %>%
## mutate(latitud = latitudeE7/1e7,
## longitud= longitudeE7/1e7) %>%
## select(-timestampMs,-latitudeE7,-longitudeE7)
# Extract timestamp for the activities: for each element of df$activity, take
# the first item of its first component and coerce the result to character.
# NOTE(review): presumably the first component holds the activity timestamps
# in milliseconds (the value is divided by 1000 further down) - confirm.
act.timestamp <- as.character(sapply(df$activity, function(x) (x[[1]][[1]])))
# Extract the first activity with the highest confidence: from the first item
# of each element's second component, keep only its first column/element.
# NOTE(review): assumes the candidates are stored in descending confidence
# order, so that the first one is the most likely - verify against the export.
act.actividad <-(sapply(df$activity, function(x) (x[[2]][[1]][1])))
# Reduce every extracted entry to its first value ...
act.actividad <- sapply(act.actividad,function(x) (x[[1]][1]))
# ... and flatten the result into a plain vector.
act.actividad <- unlist(act.actividad)
# Attach the activity-derived columns to df. TRUE/FALSE are spelled out
# because T and F are ordinary variables that can be reassigned.
df <- df %>%
  mutate(
    act.time = as_datetime(as.numeric(act.timestamp) / 1000),
    act.date = date(act.time),
    act.hour = hour(act.time),
    # hour and minute are pasted without zero padding
    act.hour.min = paste(act.hour, minute(act.time), sep = ":"),
    # unabbreviated weekday labels, with the week starting on day 1
    act.weekday = wday(act.time, label = TRUE, week_start = 1, abbr = FALSE),
    act.activity = act.actividad
  )
# Column classes: this call lists the class vector of every column (it does
# not report missing values). lapply() always returns a named list, whereas
# sapply() only does so when the class vectors differ in length.
lapply(df, class)
## $accuracy
## [1] "integer"
##
## $activity
## [1] "list"
##
## $altitude
## [1] "integer"
##
## $velocity
## [1] "integer"
##
## $heading
## [1] "integer"
##
## $verticalAccuracy
## [1] "integer"
##
## $date
## [1] "Date"
##
## $hour.min
## [1] "character"
##
## $week
## [1] "numeric"
##
## $year
## [1] "numeric"
##
## $latitude
## [1] "numeric"
##
## $longitude
## [1] "numeric"
##
## $act.time
## [1] "POSIXct" "POSIXt"
##
## $act.date
## [1] "Date"
##
## $act.hour
## [1] "integer"
##
## $act.hour.min
## [1] "character"
##
## $act.weekday
## [1] "ordered" "factor"
##
## $act.activity
## [1] "character"
#rm(act.actividad,act.timestamp)
How long has Google been collecting data?
# Range and quartiles of the recorded dates.
summary(df[["date"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## "2013-07-12" "2015-01-02" "2016-09-26" "2016-07-01" "2017-10-26" "2019-06-12"
# Number of distinct dates present in the data.
n_distinct(df[["date"]])
## [1] 2144
# Number of entries per (year, week); show the first ten rows.
df %>%
  group_by(year, week) %>%
  summarise(n = n()) %>%
  head(10)
## # A tibble: 10 x 3
## # Groups: year [1]
## year week n
## <dbl> <dbl> <int>
## 1 2013 28 791
## 2 2013 29 1545
## 3 2013 30 1873
## 4 2013 31 3089
## 5 2013 32 3519
## 6 2013 33 1703
## 7 2013 34 2899
## 8 2013 35 3134
## 9 2013 36 3089
## 10 2013 37 3534
# Mean number of entries per week, by year: count() tallies the rows of each
# (year, week) pair, then the weekly totals are averaged within each year.
df %>%
  count(year, week) %>%
  group_by(year) %>%
  summarise(media = mean(n))
## # A tibble: 7 x 2
## year media
## <dbl> <dbl>
## 1 2013 3032.
## 2 2014 2655.
## 3 2015 2180.
## 4 2016 2972.
## 5 2017 3671.
## 6 2018 2550.
## 7 2019 2242.
# Same weekly mean per year, divided by 24 and rounded to a whole number.
# NOTE(review): a week has 7 days / 168 hours, so the unit obtained by
# dividing a weekly figure by 24 is unclear - confirm the intended divisor.
df %>%
  count(year, week) %>%
  group_by(year) %>%
  summarise(media = round(mean(n) / 24))
## # A tibble: 7 x 2
## year media
## <dbl> <dbl>
## 1 2013 126
## 2 2014 111
## 3 2015 91
## 4 2016 124
## 5 2017 153
## 6 2018 106
## 7 2019 93
# Bar chart of the number of entries per week, one panel row per year.
# geom_col() is the direct form of geom_bar(stat = "identity"), and the facet
# formula is passed positionally because the `facets` argument name is
# deprecated in current ggplot2.
df %>%
  group_by(week, year) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = week, y = n)) +
  geom_col() +
  facet_grid(year ~ .) +
  scale_x_continuous(breaks = 1:54) +
  labs(
    x = "Week of year",
    y = "Entries",
    title = "Google Location: Tracks per week"
  ) +
  theme_bw()
How accurate are these measurements?
# Five-number summary (plus mean) of the reported accuracy values.
summary(df$accuracy)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 22 24 218 29 4984961
# Keep the readings below 4000. which() drops the positions where accuracy is
# NA; a bare logical index would turn each of them into an all-NA row.
x <- df[which(df$accuracy < 4000), ]
# Density of the accuracy values; coord_cartesian() restricts the visible
# range to 0..2000 without removing data from the density estimate.
ggplot(x, aes(accuracy)) +
  geom_density(size = 1, col = "grey") +
  coord_cartesian(xlim = c(0, 2000)) +
  theme_bw()
# Rows that carry an altitude reading, ordered by date.
temp <- df %>%
  filter(!is.na(altitude)) %>%
  arrange(date)
# Scatter plot of altitude over time for rows 1000..15000 of `temp`.
# slice() silently ignores positions past the last row, whereas
# temp[1000:15000, ] would pad the data with all-NA rows if temp were shorter.
temp %>%
  slice(1000:15000) %>%
  ggplot(aes(x = as.Date(date), y = altitude)) +
  geom_point() +
  theme_bw() +
  # rotate the date labels so that the weekly breaks stay readable
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  # major and minor breaks every week across the axis limits
  scale_x_date(
    breaks = function(lims) {
      seq.Date(from = min(lims), to = max(lims), by = "1 week")
    },
    minor_breaks = function(lims) {
      seq.Date(from = min(lims), to = max(lims), by = "1 week")
    }
  ) +
  labs(title = "Altitude variation", x = "Date")
# Scatter plot of every available altitude reading over time, with major and
# minor axis breaks placed once per month across the axis limits.
df %>%
  filter(!is.na(altitude)) %>%
  arrange(date) %>%
  ggplot(aes(x = as.Date(date), y = altitude)) +
  geom_point() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_x_date(
    breaks = function(lims) {
      seq.Date(from = min(lims), to = max(lims), by = "1 month")
    },
    minor_breaks = function(lims) {
      seq.Date(from = min(lims), to = max(lims), by = "1 month")
    }
  ) +
  labs(title = "Altitude Variation", x = "Date")
# Bar chart with the number of entries recorded for each activity value.
ggplot(df, aes(x = act.activity, group = act.activity)) +
  geom_bar() +
  labs(
    x = "Activity",
    y = "Entries",
    title = "Main activities"
  ) +
  theme_bw()
Frequency of the main activity during a day by hour.
# Distinct activity values present in the data.
unique(df[["act.activity"]])
## [1] "STILL" "IN_VEHICLE" "ON_FOOT" "ON_BICYCLE"
## [5] "TILTING" "UNKNOWN" "EXITING_VEHICLE"
# Entries per hour of day, one panel per activity with independent axes.
# Rows lacking either the activity or the hour are left out.
df %>%
  filter(!is.na(act.activity), !is.na(act.hour)) %>%
  ggplot(aes(x = act.hour)) +
  geom_bar() +
  coord_cartesian(xlim = c(0, 24)) +
  facet_wrap(~act.activity, scales = "free") +
  theme_bw() +
  labs(x = "Hours (0..24)")
Frequency of different activities by weekday.
# Entries per activity, one panel per weekday (four panels per row) with
# independent axes. Rows lacking the activity or the weekday are left out.
df %>%
  filter(!is.na(act.activity), !is.na(act.weekday)) %>%
  select(act.activity, act.weekday) %>%
  ggplot(aes(x = act.activity)) +
  geom_bar() +
  facet_wrap(~act.weekday, scales = "free", ncol = 4) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# WebGL heat map of the recorded positions on top of the default tile layer.
# The coordinate columns are named explicitly so the layer does not have to
# guess them from the column names.
leaflet(df) %>%
  addTiles() %>%
  addWebGLHeatmap(
    lng = ~longitude,
    lat = ~latitude,
    size = 10,
    units = "px"
  )
## With guessed columns this call used to print:
## Assuming "longitude" and "latitude" are longitude and latitude, respectively