This lecture will look at dplyr (Transform) and ggvis (Visualize)
Tidy data is:
* Easy to transform, visualize, and model
* Stored in a consistent way – variables are always columns

The tidyr package provides tools to tidy messy data (including `gather()`, `spread()`, and `separate()`). You can find more information about this package on Google.
This package tries to tackle three main bottlenecks in data manipulation:
* Cognitive – think about what data manipulation you should be doing
* Descriptive – describe it in a way a computer can understand
* Computational – do it

These three bottlenecks are shown on the Venn diagram slide.
dplyr constrains what you can do to manipulate the data.
Which of these five functions should you use?
Functions:
It is also important to take note of the `group_by()` function. Together, these functions satisfy most of your data manipulation needs.
Test with nycflights13
This contains four data frames:
flights # Print the flights tibble; only the first 10 rows and the columns that fit are shown
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, dest == "IAH") # Keep only the rows of flights whose destination airport (dest) is "IAH"
## # A tibble: 7,198 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 623 627 -4 933 932
## 4 2013 1 1 728 732 -4 1041 1038
## 5 2013 1 1 739 739 0 1104 1038
## 6 2013 1 1 908 908 0 1228 1219
## 7 2013 1 1 1028 1026 2 1350 1339
## 8 2013 1 1 1044 1045 -1 1352 1351
## 9 2013 1 1 1114 900 134 1447 1222
## 10 2013 1 1 1205 1200 5 1503 1505
## # ℹ 7,188 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Keep the contiguous range of columns from year through day (year, month,
# day), plus the carrier and tailnum columns.
select(flights, year:day, carrier, tailnum)
## # A tibble: 336,776 × 5
## year month day carrier tailnum
## <int> <int> <int> <chr> <chr>
## 1 2013 1 1 UA N14228
## 2 2013 1 1 UA N24211
## 3 2013 1 1 AA N619AA
## 4 2013 1 1 B6 N804JB
## 5 2013 1 1 DL N668DN
## 6 2013 1 1 UA N39463
## 7 2013 1 1 B6 N516JB
## 8 2013 1 1 EV N829AS
## 9 2013 1 1 B6 N593JB
## 10 2013 1 1 AA N3ALAA
## # ℹ 336,766 more rows
select(flights, -(year:day)) # A minus sign excludes columns: keep every column except year through day
## # A tibble: 336,776 × 16
## dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
## <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 517 515 2 830 819 11 UA
## 2 533 529 4 850 830 20 UA
## 3 542 540 2 923 850 33 AA
## 4 544 545 -1 1004 1022 -18 B6
## 5 554 600 -6 812 837 -25 DL
## 6 554 558 -4 740 728 12 UA
## 7 555 600 -5 913 854 19 B6
## 8 557 600 -3 709 723 -14 EV
## 9 557 600 -3 838 846 -8 B6
## 10 558 600 -2 753 745 8 AA
## # ℹ 336,766 more rows
## # ℹ 9 more variables: flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
select(flights, starts_with("arr")) # Selection helpers such as starts_with() pick columns by name (here: names beginning with "arr")
## # A tibble: 336,776 × 2
## arr_time arr_delay
## <int> <dbl>
## 1 830 11
## 2 850 20
## 3 923 33
## 4 1004 -18
## 5 812 -25
## 6 740 12
## 7 913 19
## 8 709 -14
## 9 838 -8
## 10 753 8
## # ℹ 336,766 more rows
arrange(flights, desc(arr_delay)) # Sort the rows by arrival delay, largest delay first
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 7 22 2257 759 898 121 1026
## 9 2013 12 5 756 1700 896 1058 2020
## 10 2013 5 3 1133 2055 878 1250 2215
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
mutate(flights, speed = distance / air_time * 60) # Append a calculated column; the * 60 presumably converts a per-minute rate to per-hour (assumes air_time is in minutes - confirm in ?flights)
## # A tibble: 336,776 × 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, speed <dbl>
# summarize() collapses rows into summary values. Grouping first with
# group_by() gives one summary row per group (here, one row per day).
by_day <- group_by(flights, year, month, day)
summarize(by_day, delay = mean(dep_delay, na.rm = TRUE)) # Mean departure delay per day, ignoring missing values
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ℹ 355 more rows
These allow you to string along functions rather than nest them. This allows the code to be more readable.
`x %>% f(y)` (read `%>%` as "then") is equivalent to `f(x, y)`.
Below is an example of the same code written first as nested calls and then as a pipeline.
# Nested form: read from the innermost call outwards.
hourly_delay <- filter(
  summarize(
    group_by(
      # 1. Drop flights with a missing departure delay.
      filter(flights, !is.na(dep_delay)),
      # 2. Group the remaining flights by time_hour and hour.
      time_hour, hour
    ),
    # 3. Mean departure delay and number of flights in each group.
    delay = mean(dep_delay),
    n = n()
  ),
  # 4. Keep only the groups that contain more than 10 flights.
  n > 10
)
## `summarise()` has grouped output by 'time_hour'. You can override using the
## `.groups` argument.
# Pipeline form: each step passes its result on to the next one.
hourly_delay1 <- flights %>%
  filter(!is.na(dep_delay)) %>%   # drop flights with a missing departure delay
  group_by(time_hour, hour) %>%
  summarize(
    delay = mean(dep_delay),      # mean departure delay in each group
    n = n()                       # number of flights in each group
  ) %>%
  filter(n > 10)                  # keep groups with more than 10 flights
## `summarise()` has grouped output by 'time_hour'. You can override using the
## `.groups` argument.
dplyr can work with remote data sources, not just local data.
To learn more, you can open the vignettes: `browseVignettes(package = "dplyr")`. To translate from plyr to dplyr: http://jimhester.github.io/plyrToDplyr * Common Q&A: http://stackoverflow.com/questions/tagged/dplyr?sort=frequent
* data.table (whose tables are conventionally named `DT`) is an alternative package to dplyr
A package that is similar to ggplot2 but adds additional features.
A synthesis of ideas:
Check out The Grammar of Graphics by Leland Wilkinson:
Creating the `both` table from the nycflights13 datasets.
# Daily summary of the flights whose origin is EWR: one row per calendar day.
daily <- flights %>%
  filter(origin == "EWR") %>%
  group_by(year, month, day) %>%
  summarise(
    # Mean departure delay, ignoring missing delays.
    delay = mean(dep_delay, na.rm = TRUE),
    # Share of flights with a missing departure delay (counted as cancelled).
    cancelled = mean(is.na(dep_delay))
  )
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
# Collapse the hourly weather records for EWR into one row per calendar day.
daily_weather <- weather %>%
  filter(origin == "EWR") %>%
  group_by(year, month, day) %>%
  summarise(
    temp = mean(temp, na.rm = TRUE),         # mean temperature over the day
    wind = mean(wind_speed, na.rm = TRUE),   # mean wind speed over the day
    precip = sum(precip, na.rm = TRUE)       # total precipitation over the day
  )
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
# Combine the daily flight and weather summaries. No `by` is given, so
# inner_join() matches on the columns the two tables have in common.
both <- daily %>%
  inner_join(daily_weather) %>%
  ungroup() %>%                                       # drop the leftover grouping
  mutate(date = as.Date(ISOdate(year, month, day)))   # add a Date column
## Joining with `by = join_by(year, month, day)`
Now we can use ggvis.
Example: A scatter plot with smoothing
# Scatter plot of daily delay against daily temperature, with a smoothed
# trend line drawn over the points.
both %>%
  ggvis(x = ~temp, y = ~delay) %>%
  layer_points() %>%
  layer_smooths()

# The x and y argument names can typically be dropped:
# both %>%
#   ggvis(~temp, ~delay) %>%
#   layer_points() %>%   # each layer inherits the previous properties
#   layer_smooths()
Example: A scatter plot with a fill color applied to differentiate precipitation
# Scatter plot of delay against temperature; the fill colour of each point
# is mapped to that day's precipitation.
both %>%
  ggvis(~temp, ~delay, fill = ~precip) %>%
  layer_points()
Example A histogram of flight delays
# Histogram of the daily delay values; no bin width is given, so ggvis
# picks one itself.
both %>%
  ggvis(~delay) %>%
  layer_histograms()
## Guessing width = 5 # range / 21
Example: ggvis makes an educated guess at an appropriate way to display the delays when no layer is specified.
# Only the variable is specified; ggvis has to guess which layer to draw.
both %>%
  ggvis(~delay)
## Guessing layer_histograms()
## Guessing width = 5 # range / 21
# Three points whose f column holds colour names.
dat <- data.frame(
  x = c(1, 2, 3),
  y = c(10, 20, 30),
  f = c("red", "green", "black")
)

# Properties set with `=` are scaled: f is treated as a category label,
# not as a literal colour.
dat %>%
  ggvis(x = ~x, y = ~y, fill = ~f) %>%
  layer_points()
In this scaled value example, we get points with the labels “red”, “green”, and “black”, but they’re actually blue, orange, and green.
Now let’s see the unscaled values:
# Properties set with `:=` are used unscaled: fill takes the colour names in
# f literally, and y is taken directly as a pixel position.
dat %>%
  ggvis(x = ~x, y := ~y, fill := ~f) %>%
  layer_points()
In this example fill := ~f gave point colors of red, green, and black. y :=~y gave pixel locations at 10, 20, and 30 down from the top.
If we drop the colon from `y := ~y` (that is, write `y = ~y`), we get a linear set of points.
# Build the plot one step at a time, reassigning p after each layer.
p <- ggvis(mtcars, x = ~wt, y = ~mpg)   # ggvis object on mtcars: wt on x, mpg on y
p <- layer_points(p)                    # add a points layer to p
p <- layer_smooths(p)                   # add a smoothing-line layer to p
p                                       # print the finished plot
This can also be done with the pipe operator
# Form 1 - no pipes: reassign p after every step
p <- ggvis(mtcars, x = ~wt, y = ~mpg)
p <- layer_points(p)
p <- layer_smooths(p, span = 0.5)
p

# Form 2 - composition of functions: read from the innermost call outwards
layer_smooths(
  layer_points(
    ggvis(mtcars, ~wt, ~mpg)
  ),
  span = 0.5
)

# Form 3 - pipes: read top to bottom
mtcars %>%
  ggvis(x = ~wt, y = ~mpg) %>%
  layer_points() %>%
  layer_smooths(span = 0.5)
# Histogram of delay whose bin width comes from a slider (1 to 10,
# starting at 5).
both %>%
  ggvis(~delay) %>%
  layer_histograms(width = input_slider(1, 10, value = 5))
## Warning: Can't output dynamic/interactive ggvis plots in a knitr document.
## Generating a static (non-dynamic, non-interactive) version of the plot.
# Scatter plot of precipitation against delay; point opacity is set
# (unscaled) from a slider running from 0 to 1.
both %>%
  ggvis(~delay, ~precip) %>%
  layer_points(opacity := input_slider(0, 1))
## Warning: Can't output dynamic/interactive ggvis plots in a knitr document.
## Generating a static (non-dynamic, non-interactive) version of the plot.
Example Scatter Plot in ggvis vs ggplot2
# Interactive scatter plot of the iris data using ggvis
scatter_plot <- iris %>%
  ggvis(~Sepal.Length, ~Sepal.Width, fill = ~Species) %>%
  layer_points(size := 100) %>%
  # Tooltip text is the Species value of the point.
  add_tooltip(function(df) df$Species) %>%
  hide_legend("fill") %>%
  # Text layer acting as a title: horizontally at the midpoint of the
  # Sepal.Length range, vertically 0.5 above the largest Sepal.Width.
  layer_text(
    x = ~mean(range(Sepal.Length)),
    y = ~max(Sepal.Width) + 0.5,
    text := "Scatter Plot - ggvis",
    fontSize := 15,
    baseline := "bottom"
  ) %>%
  set_options(width = 400, height = 300)

# Render the plot
print(scatter_plot)
# equivalent ggplot2
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:ggvis':
##
## resolution
# The same iris scatter plot drawn with ggplot2; point colour encodes Species
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_point(size = 3) +
  labs(
    title = "Scatter Plot - ggplot2",
    x = "Sepal Length",
    y = "Sepal Width"
  ) +
  theme_minimal()
Example Barchart with additional customization
# Load necessary libraries
library(ggvis)
# Interactive bar chart of Sepal.Length by Species using ggvis
bar_chart <- iris %>%
  ggvis(x = ~Species, y = ~Sepal.Length, fill = ~Species) %>%
  layer_bars() %>%
  # Tooltip text is the Species value of the bar.
  add_tooltip(function(df) df$Species) %>%
  # Replace the default fill palette with three explicit colours.
  scale_nominal("fill", range = c("blue", "red", "green"))

# Render the plot
print(bar_chart)
Example Boxplot in GGvis
# Load necessary libraries
library(ggvis)
# Fill colours used by the nominal fill scale below
colors <- c("darkblue", "darkred", "darkgreen")

# Box plot of Sepal.Length for each Species using ggvis
box_plot_ggvis <- iris %>%
  ggvis(x = ~Species, y = ~Sepal.Length, fill = ~Species) %>%
  # Semi-transparent boxes with a thin black outline (all set unscaled).
  layer_boxplots(
    fillOpacity := 0.7,
    strokeWidth := 0.5,
    stroke := "black"
  ) %>%
  scale_nominal("fill", range = colors) %>%
  # Tooltip text is the Species value.
  add_tooltip(function(df) df$Species)

# Render the plot
print(box_plot_ggvis)
## GGPLOT
# Load necessary libraries
library(ggplot2)
# Box plot of Sepal.Length for each Species using ggplot2
box_plot_ggplot <- ggplot(
  iris,
  aes(x = Species, y = Sepal.Length, fill = Species)
) +
  geom_boxplot() +
  labs(
    title = "Box Plot of Sepal Length by Species",
    x = "Species",
    y = "Sepal Length"
  ) +
  theme_minimal()

# Render the plot
print(box_plot_ggplot)