[Video]
Which of the following is the correct way to specify the 4th of October 2004 according to ISO 8601?
4-10-2004
2004-10-4
10-04-2004
2004-10-04
# The date R 3.0.0 was released
x <- "2013-04-03"
# Examine structure of x
str(x)
## chr "2013-04-03"
# Use as.Date() to interpret x as a date
x_date <- as.Date(x)
# Examine structure of x_date
str(x_date)
## Date[1:1], format: "2013-04-03"
# Store April 10 2014 as a Date
april_10_2014 <- as.Date("2014-04-10")
# Load the readr package
library(readr)
# Use read_csv() to import rversions.csv
releases <- read_csv("rversions.csv")
## Parsed with column specification:
## cols(
## major = col_double(),
## minor = col_double(),
## patch = col_double(),
## date = col_date(format = ""),
## datetime = col_datetime(format = ""),
## time = col_time(format = ""),
## type = col_character()
## )
# Examine the structure of the date column
str(releases$date)
## Date[1:105], format: "1997-12-04" "1997-12-21" "1998-01-10" "1998-03-14" "1998-05-02" ...
# Load the anytime package
library(anytime)
# Various ways of writing Sep 10 2009
sep_10_2009 <- c("September 10 2009", "2009-09-10", "10 Sep 2009", "09-10-2009")
# Use anytime() to parse sep_10_2009
anytime(sep_10_2009)
## [1] "2009-09-10 MDT" "2009-09-10 MDT" "2009-09-10 MDT" "2009-09-10 MDT"
[Video]
library(ggplot2)
# Set the x axis to the date column
ggplot(releases, aes(x = date, y = type)) +
geom_line(aes(group = 1, color = factor(major)))
# Limit the axis to between 2010-01-01 and 2014-01-01
ggplot(releases, aes(x = date, y = type)) +
geom_line(aes(group = 1, color = factor(major))) +
xlim(as.Date("2010-01-01"), as.Date("2014-01-01"))
## Warning: Removed 87 row(s) containing missing values (geom_path).
# Specify breaks every ten years and labels with "%Y"
ggplot(releases, aes(x = date, y = type)) +
geom_line(aes(group = 1, color = factor(major))) +
scale_x_date(date_breaks = "10 years", date_labels = "%Y")
# Find the largest date
last_release_date <- max(releases$date)
# Filter row for last release
# last_release <- filter(releases, date == last_release_date)
# Print last_release
# last_release
# How long since last release?
Sys.Date() - last_release_date
## Time difference of 1067 days
[Video]
# Use as.POSIXct to enter the datetime
as.POSIXct("2010-10-01 12:12:00")
## [1] "2010-10-01 12:12:00 MDT"
# Use as.POSIXct again but set the timezone to `"America/Los_Angeles"`
as.POSIXct("2010-10-01 12:12:00", tz = "America/Los_Angeles")
## [1] "2010-10-01 12:12:00 PDT"
# Use readr to import rversions.csv
releases <- read_csv("rversions.csv")
## Parsed with column specification:
## cols(
## major = col_double(),
## minor = col_double(),
## patch = col_double(),
## date = col_date(format = ""),
## datetime = col_datetime(format = ""),
## time = col_time(format = ""),
## type = col_character()
## )
# Examine structure of datetime column
str(releases$datetime)
## POSIXct[1:105], format: "1997-12-04 08:47:58" "1997-12-21 13:09:22" "1998-01-10 00:31:55" ...
# Import "cran-logs_2015-04-17.csv" with read_csv()
logs <- read_csv("cran-logs_2015-04-17.csv")
## Parsed with column specification:
## cols(
## datetime = col_datetime(format = ""),
## r_version = col_character(),
## country = col_character()
## )
# Print logs
logs
## # A tibble: 100,000 x 3
## datetime r_version country
## <dttm> <chr> <chr>
## 1 2015-04-16 22:40:19 3.1.3 CO
## 2 2015-04-16 09:11:04 3.1.3 GB
## 3 2015-04-16 17:12:37 3.1.3 DE
## 4 2015-04-18 12:34:43 3.2.0 GB
## 5 2015-04-16 04:49:18 3.1.3 PE
## 6 2015-04-16 06:40:44 3.1.3 TW
## 7 2015-04-16 00:21:36 3.1.3 US
## 8 2015-04-16 10:27:23 3.1.3 US
## 9 2015-04-16 01:59:43 3.1.3 SG
## 10 2015-04-18 15:41:32 3.2.0 CA
## # … with 99,990 more rows
# Store the release time as a POSIXct object
release_time <- as.POSIXct("2015-04-16 07:13:33", tz = "UTC")
# When is the first download of 3.2.0?
logs %>%
filter(datetime > release_time,
r_version == "3.2.0")
## # A tibble: 35,826 x 3
## datetime r_version country
## <dttm> <chr> <chr>
## 1 2015-04-18 12:34:43 3.2.0 GB
## 2 2015-04-18 15:41:32 3.2.0 CA
## 3 2015-04-18 14:58:41 3.2.0 IE
## 4 2015-04-18 16:44:45 3.2.0 US
## 5 2015-04-18 04:34:35 3.2.0 US
## 6 2015-04-18 22:29:45 3.2.0 CH
## 7 2015-04-17 16:21:06 3.2.0 US
## 8 2015-04-18 20:34:57 3.2.0 AT
## 9 2015-04-17 18:23:19 3.2.0 US
## 10 2015-04-18 03:00:31 3.2.0 US
## # … with 35,816 more rows
# Examine histograms of downloads by version
ggplot(logs, aes(x = datetime)) +
geom_histogram() +
geom_vline(aes(xintercept = as.numeric(release_time)))+
facet_wrap(~ r_version, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
[Video]
[Video]
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
##
## intersect, setdiff, union
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Parse x
x <- "2010 September 20th" # 2010-09-20
ymd(x)
## [1] "2010-09-20"
# Parse y
y <- "02.01.2010" # 2010-01-02
dmy(y)
## [1] "2010-01-02"
# Parse z
z <- "Sep, 12th 2010 14:00" # 2010-09-12T14:00
mdy_hm(z)
## [1] "2010-09-12 14:00:00 UTC"
parse_date_time()
# Specify an order string to parse x
x <- "Monday June 1st 2010 at 4pm"
parse_date_time(x, orders = "ABdyIp")
## [1] "2010-06-01 16:00:00 UTC"
# Specify order to include both "mdy" and "dmy"
two_orders <- c("October 7, 2001", "October 13, 2002", "April 13, 2003",
"17 April 2005", "23 April 2017")
parse_date_time(two_orders, orders = c("mdy", "dmy"))
## [1] "2001-10-07 UTC" "2002-10-13 UTC" "2003-04-13 UTC" "2005-04-17 UTC"
## [5] "2017-04-23 UTC"
# Specify order to include "dOmY", "OmY" and "Y"
short_dates <- c("11 December 1282", "May 1372", "1253")
parse_date_time(short_dates, orders = c("dOmY", "OmY", "Y"))
## [1] "1282-12-11 UTC" "1372-05-01 UTC" "1253-01-01 UTC"
[Video]
library(lubridate)
library(readr)
library(dplyr)
library(ggplot2)
# Import CSV with read_csv()
akl_daily_raw <- read_csv("akl_weather_daily.csv")
## Parsed with column specification:
## cols(
## date = col_character(),
## max_temp = col_double(),
## min_temp = col_double(),
## mean_temp = col_double(),
## mean_rh = col_double(),
## events = col_character(),
## cloud_cover = col_double()
## )
# Print akl_daily_raw
akl_daily_raw
## # A tibble: 3,661 x 7
## date max_temp min_temp mean_temp mean_rh events cloud_cover
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 2007-9-1 60 51 56 75 <NA> 4
## 2 2007-9-2 60 53 56 82 Rain 4
## 3 2007-9-3 57 51 54 78 <NA> 6
## 4 2007-9-4 64 50 57 80 Rain 6
## 5 2007-9-5 53 48 50 90 Rain 7
## 6 2007-9-6 57 42 50 69 <NA> 1
## 7 2007-9-7 59 41 50 77 <NA> 4
## 8 2007-9-8 59 46 52 80 <NA> 5
## 9 2007-9-9 55 50 52 88 Rain 7
## 10 2007-9-10 59 50 54 82 Rain 4
## # … with 3,651 more rows
# Parse date
akl_daily <- akl_daily_raw %>%
mutate(date = ymd(date))
# Print akl_daily
akl_daily
## # A tibble: 3,661 x 7
## date max_temp min_temp mean_temp mean_rh events cloud_cover
## <date> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 2007-09-01 60 51 56 75 <NA> 4
## 2 2007-09-02 60 53 56 82 Rain 4
## 3 2007-09-03 57 51 54 78 <NA> 6
## 4 2007-09-04 64 50 57 80 Rain 6
## 5 2007-09-05 53 48 50 90 Rain 7
## 6 2007-09-06 57 42 50 69 <NA> 1
## 7 2007-09-07 59 41 50 77 <NA> 4
## 8 2007-09-08 59 46 52 80 <NA> 5
## 9 2007-09-09 55 50 52 88 Rain 7
## 10 2007-09-10 59 50 54 82 Rain 4
## # … with 3,651 more rows
# Plot to check work
ggplot(akl_daily, aes(x = date, y = max_temp)) +
geom_line()
## Warning: Removed 1 row(s) containing missing values (geom_path).
library(lubridate)
library(readr)
library(dplyr)
library(ggplot2)
# Import "akl_weather_hourly_2016.csv"
akl_hourly_raw <- read_csv("akl_weather_hourly_2016.csv")
## Parsed with column specification:
## cols(
## year = col_double(),
## month = col_double(),
## mday = col_double(),
## time = col_time(format = ""),
## temperature = col_double(),
## weather = col_character(),
## conditions = col_character(),
## events = col_character(),
## humidity = col_double(),
## date_utc = col_datetime(format = "")
## )
# Print akl_hourly_raw
akl_hourly_raw
## # A tibble: 17,454 x 10
## year month mday time temperature weather conditions events humidity
## <dbl> <dbl> <dbl> <tim> <dbl> <chr> <chr> <chr> <dbl>
## 1 2016 1 1 00:00 68 Clear Clear <NA> 68
## 2 2016 1 1 00:30 68 Clear Clear <NA> 68
## 3 2016 1 1 01:00 68 Clear Clear <NA> 73
## 4 2016 1 1 01:30 68 Clear Clear <NA> 68
## 5 2016 1 1 02:00 68 Clear Clear <NA> 68
## 6 2016 1 1 02:30 68 Clear Clear <NA> 68
## 7 2016 1 1 03:00 68 Clear Clear <NA> 68
## 8 2016 1 1 03:30 68 Cloudy Partly Cl… <NA> 68
## 9 2016 1 1 04:00 68 Cloudy Scattered… <NA> 68
## 10 2016 1 1 04:30 66.2 Cloudy Partly Cl… <NA> 73
## # … with 17,444 more rows, and 1 more variable: date_utc <dttm>
# Use make_date() to combine year, month and mday
akl_hourly <- akl_hourly_raw %>%
mutate(date = make_date(year = year, month = month, day = mday))
# Parse datetime_string
akl_hourly <- akl_hourly %>%
mutate(
datetime_string = paste(date, time, sep = "T"),
datetime = ymd_hms(datetime_string)
)
# Print date, time and datetime columns of akl_hourly
akl_hourly %>% select(date, time, datetime)
## # A tibble: 17,454 x 3
## date time datetime
## <date> <time> <dttm>
## 1 2016-01-01 00:00 2016-01-01 00:00:00
## 2 2016-01-01 00:30 2016-01-01 00:30:00
## 3 2016-01-01 01:00 2016-01-01 01:00:00
## 4 2016-01-01 01:30 2016-01-01 01:30:00
## 5 2016-01-01 02:00 2016-01-01 02:00:00
## 6 2016-01-01 02:30 2016-01-01 02:30:00
## 7 2016-01-01 03:00 2016-01-01 03:00:00
## 8 2016-01-01 03:30 2016-01-01 03:30:00
## 9 2016-01-01 04:00 2016-01-01 04:00:00
## 10 2016-01-01 04:30 2016-01-01 04:30:00
## # … with 17,444 more rows
# Plot to check work
ggplot(akl_hourly, aes(x = datetime, y = temperature)) +
geom_line()
[Video]
# Examine the head() of release_time
head(release_time)
## [1] "2015-04-16 07:13:33 UTC"
# Examine the head() of the months of release_time
head(month(release_time))
## [1] 4
# Extract the month of releases
month(release_time) %>% table()
## .
## 4
## 1
# Extract the year of releases
year(release_time) %>% table()
## .
## 2015
## 1
# How often is the hour before 12 (noon)?
mean(hour(release_time) < 12)
## [1] 1
# How often is the release in am?
mean(am(release_time))
## [1] 1
library(ggplot2)
# Use wday() to tabulate release by day of the week
wday(releases$datetime) %>% table()
## .
## 1 2 3 4 5 6 7
## 3 29 9 12 18 31 3
# Add label = TRUE to make table more readable
wday(releases$datetime, label = TRUE) %>% table()
## .
## Sun Mon Tue Wed Thu Fri Sat
## 3 29 9 12 18 31 3
# Create column wday to hold week days
releases$wday <- wday(releases$datetime, label = TRUE)
# Plot barchart of weekday by type of release
ggplot(releases, aes(wday)) +
geom_bar() +
facet_wrap(~ type, ncol = 1, scale = "free_y")
library(ggplot2)
library(dplyr)
library(ggridges)
# Add columns for year, yday and month
akl_daily <- akl_daily %>%
mutate(
year = year(date),
yday = yday(date),
month = month(date, label = TRUE))
# Plot max_temp by yday for all years
ggplot(akl_daily, aes(x = yday, y = max_temp)) +
geom_line(aes(group = year), alpha = 0.5)
## Warning: Removed 1 row(s) containing missing values (geom_path).
# Examine distribution of max_temp by month
ggplot(akl_daily, aes(x = max_temp, y = month, height = ..density..)) +
geom_density_ridges(stat = "density")
## Warning: Removed 10 rows containing non-finite values (stat_density).
# Create new columns hour, month and rainy
akl_hourly <- akl_hourly %>%
mutate(
hour = hour(datetime),
month = month(datetime, label = TRUE),
rainy = weather == "Precipitation"
)
# Filter for hours between 8am and 10pm (inclusive)
akl_day <- akl_hourly %>%
filter(hour >= 8, hour <= 22)
# Summarise for each date if there is any rain
rainy_days <- akl_day %>%
group_by(month, date) %>%
summarise(
any_rain = any(rainy)
)
# Summarise for each month, the number of days with rain
rainy_days %>%
summarise(
days_rainy = sum(any_rain)
)
## # A tibble: 12 x 2
## month days_rainy
## <ord> <int>
## 1 Jan 15
## 2 Feb 13
## 3 Mar 12
## 4 Apr 15
## 5 May 21
## 6 Jun 19
## 7 Jul 22
## 8 Aug 16
## 9 Sep 25
## 10 Oct 20
## 11 Nov 19
## 12 Dec 11
[Video]
r_3_4_1 <- ymd_hms("2016-05-03 07:13:28 UTC")
# Round down to day
floor_date(r_3_4_1, unit = "day")
## [1] "2016-05-03 UTC"
# Round to nearest 5 minutes
round_date(r_3_4_1, unit = "5 minutes")
## [1] "2016-05-03 07:15:00 UTC"
# Round up to week
ceiling_date(r_3_4_1, unit = "week")
## [1] "2016-05-08 UTC"
# Subtract r_3_4_1 rounded down to day
r_3_4_1 - floor_date(r_3_4_1, unit = "day")
## Time difference of 7.224444 hours
# Create day_hour, datetime rounded down to hour
akl_hourly <- akl_hourly %>%
mutate(
day_hour = floor_date(datetime, unit = "hour")
)
# Count observations per hour
akl_hourly %>%
count(day_hour)
## # A tibble: 8,770 x 2
## day_hour n
## <dttm> <int>
## 1 2016-01-01 00:00:00 2
## 2 2016-01-01 01:00:00 2
## 3 2016-01-01 02:00:00 2
## 4 2016-01-01 03:00:00 2
## 5 2016-01-01 04:00:00 2
## 6 2016-01-01 05:00:00 2
## 7 2016-01-01 06:00:00 2
## 8 2016-01-01 07:00:00 2
## 9 2016-01-01 08:00:00 2
## 10 2016-01-01 09:00:00 2
## # … with 8,760 more rows
# Find day_hours with n != 2
akl_hourly %>%
count(day_hour) %>%
filter(n != 2) %>%
arrange(desc(n))
## # A tibble: 92 x 2
## day_hour n
## <dttm> <int>
## 1 2016-04-03 02:00:00 4
## 2 2016-09-25 00:00:00 4
## 3 2016-06-26 09:00:00 1
## 4 2016-09-01 23:00:00 1
## 5 2016-09-02 01:00:00 1
## 6 2016-09-04 11:00:00 1
## 7 2016-09-04 16:00:00 1
## 8 2016-09-04 17:00:00 1
## 9 2016-09-05 00:00:00 1
## 10 2016-09-05 15:00:00 1
## # … with 82 more rows
Michael is a hybrid thinker and doer—a byproduct of being a CliftonStrengths “Learner” over time. With 20+ years of engineering, design, and product experience, he helps organizations identify market needs, mobilize internal and external resources, and deliver delightful digital customer experiences that align with business goals. He has been entrusted with problem-solving for brands—ranging from Fortune 500 companies to early-stage startups to not-for-profit organizations.
Michael earned his BS in Computer Science from New York Institute of Technology and his MBA from the University of Maryland, College Park. He is also a candidate to receive his MS in Applied Analytics from Columbia University.
LinkedIn | Twitter | www.michaelmallari.com/data | www.columbia.edu/~mm5470