Data Science and statistics with RStudio

Ch 3 Data Wrangling

Esther Chen

Data Wrangling

Needed Package

library(dplyr)
library(ggplot2)
library(nycflights23)

# data(package = "nycflights23")
# data(package = .packages(all.available = TRUE))

3.2 Filter & Pipe

filter() “choose the rows you want to show”

Basic filter()

3.2 Filter & Pipe

Example 1

Base on flight data to show flights that carrier is “AS”

alaska_flights <- flights |>  
  filter(carrier == "AS")

Example 2

Base on flight data to show flights that dest is “PHX” and name “phoenix_flights”

3.2 Filter & Pipe

Operators

  • > corresponds to “greater than”

  • < corresponds to “less than”

  • >= corresponds to “greater than or equal to”

  • <= corresponds to “less than or equal to”

  • != corresponds to “not equal to.”

  • | corresponds to “or”

  • & corresponds to “and”

3.2 Filter & Pipe

Filter multiple condition

btv_sea_flights_fall <- flights |>  
  filter(origin == "JFK" & (dest == "BTV" | dest == "SEA") & month >= 10)
View(btv_sea_flights_fall)

# & can by replaced by comma

3.2 Filter & Pipe

Filter out

not_BTV_SEA <- flights |>  
  filter(!(dest == "BTV" | dest == "SEA"))
View(not_BTV_SEA)

# Try this code
# flights |>  
# filter(!dest == "BTV" | dest == "SEA")

3.2 Filter & Pipe

Filter multiple condition within same column

many_airports <- flights |> 
  filter(dest == "SEA" | dest == "SFO" | dest == "PDX" | 
         dest == "BTV" | dest == "BDL")

Can replaced by %in% with c()

many_airports <- flights |> 
  filter(dest %in% c("SEA", "SFO", "PDX", "BTV", "BDL"))
View(many_airports)

3.3 Summarize

  • Check the other data table also under nycflights23 packages

  • When we have missing data, can we summarize?

summary_windspeed <- weather |> 
  summarize(mean = mean(wind_speed), std_dev = sd(wind_speed))
summary_windspeed
# A tibble: 1 × 2
   mean std_dev
  <dbl>   <dbl>
1    NA      NA
  • Skip the missing data “na.rm=TRUE”
summary_windspeed <- weather |>  
  summarize(mean = mean(wind_speed, na.rm = TRUE), 
            std_dev = sd(wind_speed, na.rm = TRUE))
summary_windspeed
# A tibble: 1 × 2
   mean std_dev
  <dbl>   <dbl>
1  9.44    5.26
# summarize(weather, mean(temp, na.rm=TRUE))

3.3 Summarize

  • mean() the average
  • sd() the standard deviation, which is a measure of spread
  • min() and max(): the minimum and maximum values, respectively
  • IQR() interquartile range
  • sum() the total amount when adding multiple numbers

3.4 Groups by rows

group_by()

summary_monthly_windspeed <- weather |> 
  group_by(month) |> 
  summarize(mean = mean(wind_speed, na.rm = TRUE), 
            std_dev = sd(wind_speed, na.rm = TRUE)) |> 
  ungroup()

summary_monthly_windspeed
# A tibble: 12 × 3
   month  mean std_dev
   <int> <dbl>   <dbl>
 1     1 10.3     6.01
 2     2 10.9     6.57
 3     3 12.3     6.33
 4     4 10.0     5.03
 5     5  8.89    4.46
 6     6  8.53    4.43
 7     7  7.98    4.35
 8     8  8.85    4.34
 9     9  8.92    4.66
10    10  8.23    4.69
11    11  9.50    4.84
12    12  8.77    5.02
# ungroup()

3.4 Groups by rows

Back to flights data set

by_origin <- flights |>  
  group_by(origin) |>  
  summarize(count = n())|> 
  ungroup()
by_origin
# A tibble: 3 × 2
  origin  count
  <chr>   <int>
1 EWR    138578
2 JFK    133048
3 LGA    163726

3.4.1 Groups by more than one variable

by_origin_monthly <- flights |> 
  group_by(origin, month) |> 
  summarize(count = n())|> 
  ungroup()
by_origin_monthly
# A tibble: 36 × 3
   origin month count
   <chr>  <int> <int>
 1 EWR        1 11623
 2 EWR        2 10991
 3 EWR        3 12593
 4 EWR        4 12022
 5 EWR        5 12371
 6 EWR        6 11339
 7 EWR        7 11646
 8 EWR        8 11561
 9 EWR        9 11373
10 EWR       10 11805
# ℹ 26 more rows

3.4.1 Groups by more than one variable

by_origin_monthly_incorrect <- flights |> 
  group_by(origin) |> 
  group_by(month) |> 
  summarize(count = n()) |> 
  ungroup()
by_origin_monthly_incorrect
# A tibble: 12 × 2
   month count
   <int> <int>
 1     1 36020
 2     2 34761
 3     3 39514
 4     4 37476
 5     5 38710
 6     6 35921
 7     7 36211
 8     8 36765
 9     9 35505
10    10 36586
11    11 34521
12    12 33362

Exercise 1

  • Use diamond data set to group by cut and give the average price and standard deviation
  • Use diamond data set to group by cut and color and give the average price
# A tibble: 5 × 3
  cut       avg_price st_dv
  <ord>         <dbl> <dbl>
1 Fair          4359. 3560.
2 Good          3929. 3682.
3 Very Good     3982. 3936.
4 Premium       4584. 4349.
5 Ideal         3458. 3808.

3.5 Mutate

Mutate() Create new column to your data

weather <- weather |>  
  mutate(temp_in_C = (temp - 32) / 1.8)

3.5 Mutate

# A tibble: 12 × 3
   month mean_temp_in_F mean_temp_in_C
   <int>          <dbl>          <dbl>
 1     1           35.7           2.04
 2     2           34.5           1.39
 3     3           45.0           7.24
 4     4           54.6          12.6 
 5     5           53.6          12.0 
 6     6           69.2          20.6 
 7     7           78.4          25.8 
 8     8           72.8          22.7 
 9     9           64.7          18.1 
10    10           64.2          17.9 
11    11           47.5           8.64
12    12           45.9           7.72

Exercise 2

Use flights data set, to add on these

  • gain = dep_delay - arr_delay

  • hours = air_time / 60

  • gain_per_hour = gain / hours

Summary Table

gain_summary <- flights |> 
  summarize(
    min = min(gain, na.rm = TRUE),
    q1 = quantile(gain, 0.25, na.rm = TRUE),
    median = quantile(gain, 0.5, na.rm = TRUE),
    q3 = quantile(gain, 0.75, na.rm = TRUE),
    max = max(gain, na.rm = TRUE),
    mean = mean(gain, na.rm = TRUE),
    sd = sd(gain, na.rm = TRUE),
    missing = sum(is.na(gain))
  )
gain_summary
# A tibble: 1 × 8
    min    q1 median    q3   max  mean    sd missing
  <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>   <int>
1  -321     1     11    20   101  9.35  18.4   12534

3.6 Arrange and sort rows

# A tibble: 118 × 2
   dest  num_flights
   <chr>       <int>
 1 ABQ           228
 2 ACK           916
 3 AGS            20
 4 ALB          1581
 5 ANC            95
 6 ATL         17570
 7 AUS          4848
 8 AVL          1617
 9 AVP           145
10 BDL           701
# ℹ 108 more rows

3.6 Arrange and sort rows

arrange() Ranking from least to most

freq_dest |> 
  arrange(num_flights)
# A tibble: 118 × 2
   dest  num_flights
   <chr>       <int>
 1 LEX             1
 2 AGS            20
 3 OGG            20
 4 SBN            24
 5 HDN            28
 6 PNS            71
 7 MTJ            77
 8 ANC            95
 9 VPS           109
10 AVP           145
# ℹ 108 more rows

3.6 Arrange and sort rows

Ranking from most to least

freq_dest |>  
  arrange(desc(num_flights))
# A tibble: 118 × 2
   dest  num_flights
   <chr>       <int>
 1 BOS         19036
 2 ORD         18200
 3 MCO         17756
 4 ATL         17570
 5 MIA         16076
 6 LAX         15968
 7 FLL         14239
 8 CLT         12866
 9 DFW         11675
10 SFO         11651
# ℹ 108 more rows

3.7 Join data frame

3.7 Join data frame

inner_join()

Goal: To let flights carrier has carrier name

View(airlines)
flights_joined <- flights |> 
  inner_join(airlines, by = "carrier")
View(flights)
View(flights_joined)

3.7 Join data frame

  • left_join() keeps all observations in x.

  • right_join() keeps all observations in y.

  • semi_join() return all rows from x with a match in y

  • anti_join() return all rows from x without a match in y

3.7 Join data frame

Goal: To let flights dest has full destination name

View(airports)
flights_with_airport_names <- flights |> 
  inner_join(airports, by = c("dest" = "faa"))
View(flights_with_airport_names)

Exercise 3

Can you group flights data by dest and show

  • total number flights by dest

  • airport full name

# A tibble: 114 × 9
   dest  num_flights airport_name             lat    lon   alt    tz dst   tzone
   <chr>       <int> <chr>                  <dbl>  <dbl> <dbl> <dbl> <chr> <chr>
 1 BOS         19036 General Edward Lawren…  42.4  -71.0    20    -5 A     Amer…
 2 ORD         18200 Chicago O'Hare Intern…  42.0  -87.9   672    -6 A     Amer…
 3 MCO         17756 Orlando International…  28.4  -81.3    96    -5 A     Amer…
 4 ATL         17570 Hartsfield Jackson At…  33.6  -84.4  1026    -5 A     Amer…
 5 MIA         16076 Miami International A…  25.8  -80.3     8    -5 A     Amer…
 6 LAX         15968 Los Angeles Internati…  33.9 -118.    125    -8 A     Amer…
 7 FLL         14239 Fort Lauderdale Holly…  26.1  -80.2     9    -5 A     Amer…
 8 CLT         12866 Charlotte Douglas Int…  35.2  -80.9   748    -5 A     Amer…
 9 DFW         11675 Dallas Fort Worth Int…  32.9  -97.0   607    -6 A     Amer…
10 SFO         11651 San Francisco Interna…  37.6 -122.     13    -8 A     Amer…
# ℹ 104 more rows

3.7 Join data frame

flights_weather_joined <- flights |>
  inner_join(weather, by = c("year", "month", "day", "hour", "origin"))
View(flights_weather_joined)

3.8 Others

  • select() only a subset of variables/columns.

  • relocate() variables/columns to a new position.

  • rename() variables/columns to have new names.

  • Return only the top_n() values of a variable.

3.8 Select 1/3

select()only a subset of variables/columns.

#glimpse(flights)

#Select the column that you are interested in
flights |> 
  select(carrier, flight)
# A tibble: 435,352 × 2
   carrier flight
   <chr>    <int>
 1 UA         628
 2 DL         393
 3 B6         371
 4 B6        1053
 5 UA         219
 6 AA         499
 7 B6         996
 8 AA         981
 9 UA         206
10 NK         225
# ℹ 435,342 more rows

3.8 Select 2/3

#Select the flight without year data
flights_no_year <- flights |> 
  select(-year)

#Select a range of data
flight_arr_times <- flights |> 
  select(month:day, arr_time:sched_arr_time)

3.8 Select 3/3

# select by the condition

# flights |> select(starts_with("a"))
# flights |> select(ends_with("delay"))
# flights |> select(contains("time"))

# select the rest of table

flights_reorder <- flights |> 
  select(year, month, day, hour, minute, time_hour, everything())
glimpse(flights_reorder)
Rows: 435,352
Columns: 22
$ year           <int> 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2…
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ hour           <dbl> 20, 23, 23, 21, 20, 5, 5, 5, 5, 5, 5, 6, 5, 6, 6, 6, 6,…
$ minute         <dbl> 38, 0, 44, 40, 48, 0, 10, 30, 20, 45, 59, 0, 59, 0, 0, …
$ time_hour      <dttm> 2023-01-01 20:00:00, 2023-01-01 23:00:00, 2023-01-01 2…
$ dep_time       <int> 1, 18, 31, 33, 36, 503, 520, 524, 537, 547, 549, 551, 5…
$ sched_dep_time <int> 2038, 2300, 2344, 2140, 2048, 500, 510, 530, 520, 545, …
$ dep_delay      <dbl> 203, 78, 47, 173, 228, 3, 10, -6, 17, 2, -10, -9, -7, -…
$ arr_time       <int> 328, 228, 500, 238, 223, 808, 948, 645, 926, 845, 905, …
$ sched_arr_time <int> 3, 135, 426, 2352, 2252, 815, 949, 710, 818, 852, 901, …
$ arr_delay      <dbl> 205, 53, 34, 166, 211, -7, -1, -25, 68, -7, 4, -13, -14…
$ carrier        <chr> "UA", "DL", "B6", "B6", "UA", "AA", "B6", "AA", "UA", "…
$ flight         <int> 628, 393, 371, 1053, 219, 499, 996, 981, 206, 225, 800,…
$ tailnum        <chr> "N25201", "N830DN", "N807JB", "N265JB", "N17730", "N925…
$ origin         <chr> "EWR", "JFK", "JFK", "JFK", "EWR", "EWR", "JFK", "EWR",…
$ dest           <chr> "SMF", "ATL", "BQN", "CHS", "DTW", "MIA", "BQN", "ORD",…
$ air_time       <dbl> 367, 108, 190, 108, 80, 154, 192, 119, 258, 157, 164, 1…
$ distance       <dbl> 2500, 760, 1576, 636, 488, 1085, 1576, 719, 1400, 1065,…
$ gain           <dbl> -2, 25, 13, 7, 17, 10, 11, 19, -51, 9, -14, 4, 7, 0, 4,…
$ hours          <dbl> 6.116667, 1.800000, 3.166667, 1.800000, 1.333333, 2.566…
$ gain_per_hour  <dbl> -0.3269755, 13.8888889, 4.1052632, 3.8888889, 12.750000…

3.8 relocate

relocate() variables/columns to a new position.

flights_relocate <- flights |> 
  relocate(hour, minute, time_hour, .after = day)
glimpse(flights_relocate)
Rows: 435,352
Columns: 22
$ year           <int> 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2…
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ hour           <dbl> 20, 23, 23, 21, 20, 5, 5, 5, 5, 5, 5, 6, 5, 6, 6, 6, 6,…
$ minute         <dbl> 38, 0, 44, 40, 48, 0, 10, 30, 20, 45, 59, 0, 59, 0, 0, …
$ time_hour      <dttm> 2023-01-01 20:00:00, 2023-01-01 23:00:00, 2023-01-01 2…
$ dep_time       <int> 1, 18, 31, 33, 36, 503, 520, 524, 537, 547, 549, 551, 5…
$ sched_dep_time <int> 2038, 2300, 2344, 2140, 2048, 500, 510, 530, 520, 545, …
$ dep_delay      <dbl> 203, 78, 47, 173, 228, 3, 10, -6, 17, 2, -10, -9, -7, -…
$ arr_time       <int> 328, 228, 500, 238, 223, 808, 948, 645, 926, 845, 905, …
$ sched_arr_time <int> 3, 135, 426, 2352, 2252, 815, 949, 710, 818, 852, 901, …
$ arr_delay      <dbl> 205, 53, 34, 166, 211, -7, -1, -25, 68, -7, 4, -13, -14…
$ carrier        <chr> "UA", "DL", "B6", "B6", "UA", "AA", "B6", "AA", "UA", "…
$ flight         <int> 628, 393, 371, 1053, 219, 499, 996, 981, 206, 225, 800,…
$ tailnum        <chr> "N25201", "N830DN", "N807JB", "N265JB", "N17730", "N925…
$ origin         <chr> "EWR", "JFK", "JFK", "JFK", "EWR", "EWR", "JFK", "EWR",…
$ dest           <chr> "SMF", "ATL", "BQN", "CHS", "DTW", "MIA", "BQN", "ORD",…
$ air_time       <dbl> 367, 108, 190, 108, 80, 154, 192, 119, 258, 157, 164, 1…
$ distance       <dbl> 2500, 760, 1576, 636, 488, 1085, 1576, 719, 1400, 1065,…
$ gain           <dbl> -2, 25, 13, 7, 17, 10, 11, 19, -51, 9, -14, 4, 7, 0, 4,…
$ hours          <dbl> 6.116667, 1.800000, 3.166667, 1.800000, 1.333333, 2.566…
$ gain_per_hour  <dbl> -0.3269755, 13.8888889, 4.1052632, 3.8888889, 12.750000…

3.8 rename

rename() variables/columns to have new names

flights_time_new <- flights |> 
  select(dep_time, arr_time) |> 
  rename(departure_time = dep_time, arrival_time = arr_time)
glimpse(flights_time_new)
Rows: 435,352
Columns: 2
$ departure_time <int> 1, 18, 31, 33, 36, 503, 520, 524, 537, 547, 549, 551, 5…
$ arrival_time   <int> 328, 228, 500, 238, 223, 808, 948, 645, 926, 845, 905, …

3.8 Top 10

named_dests |> 
  top_n(n = 10, wt = num_flights)
# A tibble: 10 × 9
   dest  num_flights airport_name             lat    lon   alt    tz dst   tzone
   <chr>       <int> <chr>                  <dbl>  <dbl> <dbl> <dbl> <chr> <chr>
 1 BOS         19036 General Edward Lawren…  42.4  -71.0    20    -5 A     Amer…
 2 ORD         18200 Chicago O'Hare Intern…  42.0  -87.9   672    -6 A     Amer…
 3 MCO         17756 Orlando International…  28.4  -81.3    96    -5 A     Amer…
 4 ATL         17570 Hartsfield Jackson At…  33.6  -84.4  1026    -5 A     Amer…
 5 MIA         16076 Miami International A…  25.8  -80.3     8    -5 A     Amer…
 6 LAX         15968 Los Angeles Internati…  33.9 -118.    125    -8 A     Amer…
 7 FLL         14239 Fort Lauderdale Holly…  26.1  -80.2     9    -5 A     Amer…
 8 CLT         12866 Charlotte Douglas Int…  35.2  -80.9   748    -5 A     Amer…
 9 DFW         11675 Dallas Fort Worth Int…  32.9  -97.0   607    -6 A     Amer…
10 SFO         11651 San Francisco Interna…  37.6 -122.     13    -8 A     Amer…
# how about after arrange?
# named_dests |> 
#   top_n(n = 10, wt = num_flights) |> 
#   arrange(desc(num_flights))