Tips and Tricks

1 Plot #1

# Load the avocado price data and convert the Date column to a real Date.
avocado <- read.csv("./avocado_prices/avocado.csv")
avocado$Date <- as.Date(avocado$Date) # R didn't automatically detect the date

# Keep five regions, draw a random subset of 100 rows, and flag the Northeast
# rows so they can be styled differently in the plot below.
# slice_sample() replaces the superseded sample_n().
avocado <- avocado |>
  filter(
    region %in% c("Midsouth", "Northeast", "Plains", "Southeast", "West")
  ) |>
  slice_sample(n = 100) |>
  mutate(isNorthEast = region == "Northeast")

head(avocado)
   X       Date AveragePrice Total.Volume     X4046      X4225     X4770
1 48 2016-01-24         1.01   3147902.67 472744.05 1787662.94 146094.55
2 38 2017-04-09         1.50    137962.99   3866.01   54710.71   2285.58
3 44 2015-02-22         1.10   2720346.71 632670.12 1209150.04 295228.18
4 42 2016-03-06         1.44     82249.97   3310.73   48133.42   2104.76
5  7 2016-11-06         2.07     89099.08   5608.51   26861.46    140.68
6 36 2017-04-23         1.26    296559.24  43410.75   50687.37    368.72
  Total.Bags Small.Bags Large.Bags XLarge.Bags         type year    region
1  741401.13  630109.47  107122.67     4168.99 conventional 2016  Midsouth
2   77100.69   34323.06   42777.63        0.00      organic 2017  Midsouth
3  583298.37  535973.53   46783.55      541.29 conventional 2015  Midsouth
4   28701.06   11159.01   17542.05        0.00      organic 2016  Midsouth
5   56488.43   55962.01     526.42        0.00      organic 2016 Northeast
6  202092.40   92126.06  109966.34        0.00      organic 2017      West
  isNorthEast
1       FALSE
2       FALSE
3       FALSE
4       FALSE
5        TRUE
6       FALSE
# Average price over time, one coloured line per region. The Northeast line is
# drawn solid; every other region uses a compact dash.
ggplot(
  avocado,
  aes(x = Date, y = AveragePrice, group = region, color = region)
) +
  geom_line(aes(linetype = isNorthEast), linewidth = 1.25) +
  scale_x_date(breaks = pretty_breaks(n = 10)) +
  scale_linetype_manual(
    # unnamed values are assigned in level order (FALSE, then TRUE)
    values = c(
      "11", # compact dash (dash length 1, gap length 1)
      "solid"
    ),
    guide = "none" # removes the "isNorthEast" legend
  ) +
  theme_fivethirtyeight() +
  # presumably restores the axis titles that the theme above hides -- confirm
  theme(axis.title = element_text()) +
  labs(
    title = "Average Avocado Price in the US Over Time",
    subtitle = "How do avocado prices differ by region?",
    x = "Date",
    y = "Average Price",
    color = "Region"
  )

2 tidyverse in action

# Peek at five random rows of the Default data.
# slice_sample() replaces the superseded sample_n().
Default |>
  slice_sample(n = 5)
  default student   balance   income
1     Yes     Yes 1956.9239 15574.39
2      No     Yes  599.4710 18575.41
3      No      No  322.9823 25267.40
4      No     Yes 1499.1901 17560.37
5      No      No 1217.0728 62764.10

Use View() to view the data in a comprehensive manner.

To count the number of long flights (the column long_flight doesn’t exist in the data frame), one option would be the following:

# Two-step version: first add a logical long_flight column, then tally it.
# 6 * 60 spells out six hours in minutes (assuming air_time is in minutes).
# Rows with a missing air_time end up in the NA bucket.
flights |>
  mutate(long_flight = air_time >= 6 * 60) |>
  count(long_flight)
# A tibble: 3 × 2
  long_flight      n
  <lgl>        <int>
1 FALSE       322630
2 TRUE          4716
3 NA            9430

But a more concise method is the following,

# count() can create the column it tallies, so no separate mutate() is needed.
count(flights, long_flight = air_time >= 6 * 60)
# A tibble: 3 × 2
  long_flight      n
  <lgl>        <int>
1 FALSE       322630
2 TRUE          4716
3 NA            9430

The above option can be used in other settings as well,

# Daily summary: number of flights and median air time per calendar date.
# The grouping column is created inline inside group_by(), the same trick as
# creating a column inside count().
flights |>
  group_by(date = make_date(year, month, day)) |>
  summarise(
    flights_n = n(),
    # spell out TRUE: T is an ordinary variable that can be reassigned
    air_time_median = median(air_time, na.rm = TRUE)
  ) |>
  ungroup() |>
  slice_sample(n = 5) # replaces the superseded sample_n()
# A tibble: 5 × 3
  date       flights_n air_time_median
  <date>         <int>           <dbl>
1 2013-10-16       974             124
2 2013-12-31       776             157
3 2013-03-29       974             126
4 2013-10-13       902             119
5 2013-03-12       966             139

The following code takes one sample from each group,

# Draw one random flight per origin airport. The columns of interest are
# picked first; the sampling itself only depends on the grouping by origin.
flights |>
  select(origin, dest, air_time) |>
  group_by(origin) |>
  slice_sample(n = 1) |>
  ungroup()
# A tibble: 3 × 3
  origin dest  air_time
  <chr>  <chr>    <dbl>
1 EWR    BOS         41
2 JFK    DCA         39
3 LGA    MSY        178

There’s a very handy function called parse_number, which can be used to parse column names that contain numbers in them,

# parse_number() keeps the first number found in each string and drops the
# characters around it.
c("#1", "#2##3", "3.5#") |>
  parse_number()
[1] 1.0 2.0 3.5

There are selection helpers starts_with, ends_with, and contains.

# Move the columns whose names start with "dep_" to the front; everything()
# then appends all remaining columns in their original order.
flights |>
  select(starts_with("dep_"), everything()) |>
  slice_sample(n = 5) # replaces the superseded sample_n()
# A tibble: 5 × 19
  dep_time dep_delay  year month   day sched_dep_time arr_time sched_arr_time
     <int>     <dbl> <int> <int> <int>          <int>    <int>          <int>
1      821        36  2013     6    26            745     1018           1000
2     1445        -5  2013     5    12           1450     1637           1635
3      820        -5  2013     5    12            825     1030           1026
4     1804       -16  2013     4    28           1820     2131           2131
5      610        -5  2013     3    11            615      756            818
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

It is possible to conditionally change the entry names in the following way,

# Replace each origin airport code with the airport's full name, then count
# flights per airport. A code matching none of the conditions would become NA.
mutate(
  flights,
  origin = case_when(
    origin == "EWR" ~ "Newark International Airport",
    origin == "JFK" ~ "John F. Kennedy International Airport",
    origin == "LGA" ~ "LaGuardia Airport"
  )
) |>
  count(origin)
# A tibble: 3 × 2
  origin                                     n
  <chr>                                  <int>
1 John F. Kennedy International Airport 111279
2 LaGuardia Airport                     104662
3 Newark International Airport          120835

transmute keeps only the columns that are given as arguments,

# transmute() returns only the columns listed here: a newly built date plus
# three existing columns passed through unchanged.
flights |>
  transmute(date = make_date(year, month, day), tailnum, origin, dest) |>
  slice_sample(n = 5) # replaces the superseded sample_n()
# A tibble: 5 × 4
  date       tailnum origin dest 
  <date>     <chr>   <chr>  <chr>
1 2013-02-26 N586JB  JFK    SJU  
2 2013-09-09 N454UA  EWR    FLL  
3 2013-02-24 N806MQ  JFK    RDU  
4 2013-04-16 N8665A  JFK    CLE  
5 2013-03-24 N14230  EWR    RSW  

Pipes are really handy,

# Five random carriers, to see what the raw names look like.
# slice_sample() replaces the superseded sample_n().
airlines |>
  slice_sample(n = 5)
# A tibble: 5 × 2
  carrier name                  
  <chr>   <chr>                 
1 AA      American Airlines Inc.
2 F9      Frontier Airlines Inc.
3 DL      Delta Air Lines Inc.  
4 WN      Southwest Airlines Co.
5 MQ      Envoy Air             
# Shorten carrier names with a pipe nested inside mutate(). Upper-casing first
# lets the regular expressions ignore the original capitalisation.
airlines |>
  mutate(
    name = name |>
      str_to_upper() |>
      # drop a trailing " INC" / " CO", with or without the final period
      str_replace_all(" (INC|CO)\\.?$", "") |>
      # drop a trailing " AIR", " AIR LINES", " AIRLINES" or " AIRWAYS",
      # optionally followed by " CORPORATION"
      str_replace_all(" AIR ?(LINES|WAYS)?( CORPORATION)?$", "") |>
      str_to_title() |>
      # title-casing turns the word "US" into "Us"; put it back
      str_replace_all("\\bUs\\b", "US")
  ) |>
  slice_sample(n = 5) # replaces the superseded sample_n()
# A tibble: 5 × 2
  carrier name    
  <chr>   <chr>   
1 UA      United  
2 MQ      Envoy   
3 YV      Mesa    
4 AA      American
5 HA      Hawaiian

group_by carrier, then include only the groups that have at least 27000 rows,

# n() is evaluated per carrier group, so filter() keeps every row of the
# carriers that have at least 27000 flights. The counts that come back are
# still grouped by carrier.
flights |>
  group_by(carrier) |>
  filter(n() >= 27000) |>
  count(carrier)
# A tibble: 5 × 2
# Groups:   carrier [5]
  carrier     n
  <chr>   <int>
1 AA      32729
2 B6      54635
3 DL      48110
4 EV      54173
5 UA      58665

Split a string into columns based on a regular expression,

# Split name into its first word (short_name) and everything after the first
# space (remainder), one regex capture group per new column.
airlines |>
  extract(
    name,
    into = c("short_name", "remainder"),
    regex = "^([^\\s]+) (.*)$",
    remove = FALSE # keep the original name column; spell out FALSE, not F
  ) |>
  slice_sample(n = 5) # replaces the superseded sample_n()
# A tibble: 5 × 4
  carrier name                        short_name remainder          
  <chr>   <chr>                       <chr>      <chr>              
1 DL      Delta Air Lines Inc.        Delta      Air Lines Inc.     
2 VX      Virgin America              Virgin     America            
3 HA      Hawaiian Airlines Inc.      Hawaiian   Airlines Inc.      
4 WN      Southwest Airlines Co.      Southwest  Airlines Co.       
5 FL      AirTran Airways Corporation AirTran    Airways Corporation

Use semi_join() to keep only the rows from the first table that have a match in the second table,

# Carriers whose name starts with the letter "A".
begin_with_a <- filter(airlines, str_detect(name, "^A"))

begin_with_a
# A tibble: 3 × 2
  carrier name                       
  <chr>   <chr>                      
1 AA      American Airlines Inc.     
2 AS      Alaska Airlines Inc.       
3 FL      AirTran Airways Corporation
# Keep only the flights whose carrier appears in begin_with_a, then count the
# flights per carrier. semi_join() filters rows; it adds no columns.
semi_join(flights, begin_with_a, by = "carrier") |>
  count(carrier)
# A tibble: 3 × 2
  carrier     n
  <chr>   <int>
1 AA      32729
2 AS        714
3 FL       3260
# Five random rows of the full flights table.
slice_sample(flights, n = 5)
# A tibble: 5 × 19
   year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
  <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
1  2013     8    31      946            955        -9     1207           1212
2  2013    10     9     1900           1900         0     2105           2127
3  2013     4    28     1355           1400        -5     1614           1555
4  2013     1    15      629            630        -1      925            905
5  2013     7    17     1737           1745        -8     1924           1925
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

3 ISLR plots

# Scatter plot of income against balance for a random subset of 1000 rows.
# Default status is encoded twice, by point shape and by colour.
Default |>
  slice_sample(n = 1000) |> # replaces the superseded sample_n()
  ggplot(aes(
    x = balance,
    y = income,
    shape = default,
    color = default
  )) +
  geom_point(size = 3) +
  # shape 3 is a plus sign, shape 1 an open circle
  scale_shape_manual(values = c("Yes" = 3, "No" = 1)) +
  scale_color_manual(values = c("Yes" = "darkorange", "No" = "darkblue")) +
  theme_minimal()

# Box plot of balance by default status. No group_by() is needed before
# ggplot(): the boxes are split by the discrete x aesthetic.
Default |>
  ggplot(aes(
    x = default,
    y = balance,
    fill = default
  )) +
  geom_boxplot() +
  scale_fill_manual(values = c("Yes" = "brown", "No" = "lightblue")) +
  theme_minimal() +
  theme(legend.position = "none") # the x axis already labels the two boxes

# Box plot of income by default status. No group_by() is needed before
# ggplot(): the boxes are split by the discrete x aesthetic.
Default |>
  ggplot(aes(
    x = default,
    y = income,
    fill = default
  )) +
  geom_boxplot() +
  scale_fill_manual(values = c("Yes" = "brown", "No" = "lightblue")) +
  theme_minimal() +
  theme(legend.position = "none") # the x axis already labels the two boxes