Data Science and statistics with RStudio

Ch 3 Data Wrangling

Esther Chen

Data Wrangling

Needed Package

library(dplyr)
library(ggplot2)
library(nycflights23)

# data(package = "nycflights23")
# data(package = .packages(all.available = TRUE))

3.2 Filter & Pipe

filter() “choose the rows you want to show”

Basic filter()

3.2 Filter & Pipe

Example 1

Base on flight data to show flights that carrier is “AS”

alaska_flights <- flights |>  
  filter(carrier == "AS")

Example 2

Base on flight data to show flights that dest is “PHX” and name “phoenix_flights”

3.2 Filter & Pipe

Operators

> corresponds to “greater than”
< corresponds to “less than”
>= corresponds to “greater than or equal to”
<= corresponds to “less than or equal to”
!= corresponds to “not equal to.”
| corresponds to “or”
& corresponds to “and”

3.2 Filter & Pipe

Filter multiple condition

btv_sea_flights_fall <- flights |>  
  filter(origin == "JFK" & (dest == "BTV" | dest == "SEA") & month >= 10)
View(btv_sea_flights_fall)

# & can by replaced by comma

3.2 Filter & Pipe

Filter out

not_BTV_SEA <- flights |>  
  filter(!(dest == "BTV" | dest == "SEA"))
View(not_BTV_SEA)

# Try this code
# flights |>  
# filter(!dest == "BTV" | dest == "SEA")

3.2 Filter & Pipe

Filter multiple condition within same column

many_airports <- flights |> 
  filter(dest == "SEA" | dest == "SFO" | dest == "PDX" | 
         dest == "BTV" | dest == "BDL")

Can replaced by %in% with c()

many_airports <- flights |> 
  filter(dest %in% c("SEA", "SFO", "PDX", "BTV", "BDL"))
View(many_airports)

3.3 Summarize

Check the other data table also under nycflights23 packages
When we have missing data, can we summarize?

summary_windspeed <- weather |> 
  summarize(mean = mean(wind_speed), std_dev = sd(wind_speed))
summary_windspeed

# A tibble: 1 × 2
   mean std_dev
  <dbl>   <dbl>
1    NA      NA

Skip the missing data “na.rm=TRUE”

summary_windspeed <- weather |>  
  summarize(mean = mean(wind_speed, na.rm = TRUE), 
            std_dev = sd(wind_speed, na.rm = TRUE))
summary_windspeed

# A tibble: 1 × 2
   mean std_dev
  <dbl>   <dbl>
1  9.44    5.26

# summarize(weather, mean(temp, na.rm=TRUE))

3.3 Summarize

mean() the average
sd() the standard deviation, which is a measure of spread
min() and max(): the minimum and maximum values, respectively
IQR() interquartile range
sum() the total amount when adding multiple numbers

3.4 Groups by rows

group_by()

summary_monthly_windspeed <- weather |> 
  group_by(month) |> 
  summarize(mean = mean(wind_speed, na.rm = TRUE), 
            std_dev = sd(wind_speed, na.rm = TRUE)) |> 
  ungroup()

summary_monthly_windspeed

# A tibble: 12 × 3
   month  mean std_dev
   <int> <dbl>   <dbl>
 1     1 10.3     6.01
 2     2 10.9     6.57
 3     3 12.3     6.33
 4     4 10.0     5.03
 5     5  8.89    4.46
 6     6  8.53    4.43
 7     7  7.98    4.35
 8     8  8.85    4.34
 9     9  8.92    4.66
10    10  8.23    4.69
11    11  9.50    4.84
12    12  8.77    5.02

# ungroup()

3.4 Groups by rows

Back to flights data set

by_origin <- flights |>  
  group_by(origin) |>  
  summarize(count = n())|> 
  ungroup()
by_origin

# A tibble: 3 × 2
  origin  count
  <chr>   <int>
1 EWR    138578
2 JFK    133048
3 LGA    163726

3.4.1 Groups by more than one variable

by_origin_monthly <- flights |> 
  group_by(origin, month) |> 
  summarize(count = n())|> 
  ungroup()
by_origin_monthly

# A tibble: 36 × 3
   origin month count
   <chr>  <int> <int>
 1 EWR        1 11623
 2 EWR        2 10991
 3 EWR        3 12593
 4 EWR        4 12022
 5 EWR        5 12371
 6 EWR        6 11339
 7 EWR        7 11646
 8 EWR        8 11561
 9 EWR        9 11373
10 EWR       10 11805
# ℹ 26 more rows

3.4.1 Groups by more than one variable

by_origin_monthly_incorrect <- flights |> 
  group_by(origin) |> 
  group_by(month) |> 
  summarize(count = n()) |> 
  ungroup()
by_origin_monthly_incorrect

# A tibble: 12 × 2
   month count
   <int> <int>
 1     1 36020
 2     2 34761
 3     3 39514
 4     4 37476
 5     5 38710
 6     6 35921
 7     7 36211
 8     8 36765
 9     9 35505
10    10 36586
11    11 34521
12    12 33362

Exercise 1

Use diamond data set to group by cut and give the average price and standard deviation
Use diamond data set to group by cut and color and give the average price

# A tibble: 5 × 3
  cut       avg_price st_dv
  <ord>         <dbl> <dbl>
1 Fair          4359. 3560.
2 Good          3929. 3682.
3 Very Good     3982. 3936.
4 Premium       4584. 4349.
5 Ideal         3458. 3808.

3.5 Mutate

Mutate() Create new column to your data

weather <- weather |>  
  mutate(temp_in_C = (temp - 32) / 1.8)

3.5 Mutate

# A tibble: 12 × 3
   month mean_temp_in_F mean_temp_in_C
   <int>          <dbl>          <dbl>
 1     1           35.7           2.04
 2     2           34.5           1.39
 3     3           45.0           7.24
 4     4           54.6          12.6 
 5     5           53.6          12.0 
 6     6           69.2          20.6 
 7     7           78.4          25.8 
 8     8           72.8          22.7 
 9     9           64.7          18.1 
10    10           64.2          17.9 
11    11           47.5           8.64
12    12           45.9           7.72

Exercise 2

Use flights data set, to add on these

gain = dep_delay - arr_delay
hours = air_time / 60
gain_per_hour = gain / hours

Summary Table

gain_summary <- flights |> 
  summarize(
    min = min(gain, na.rm = TRUE),
    q1 = quantile(gain, 0.25, na.rm = TRUE),
    median = quantile(gain, 0.5, na.rm = TRUE),
    q3 = quantile(gain, 0.75, na.rm = TRUE),
    max = max(gain, na.rm = TRUE),
    mean = mean(gain, na.rm = TRUE),
    sd = sd(gain, na.rm = TRUE),
    missing = sum(is.na(gain))
  )
gain_summary

# A tibble: 1 × 8
    min    q1 median    q3   max  mean    sd missing
  <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>   <int>
1  -321     1     11    20   101  9.35  18.4   12534

3.6 Arrange and sort rows

# A tibble: 118 × 2
   dest  num_flights
   <chr>       <int>
 1 ABQ           228
 2 ACK           916
 3 AGS            20
 4 ALB          1581
 5 ANC            95
 6 ATL         17570
 7 AUS          4848
 8 AVL          1617
 9 AVP           145
10 BDL           701
# ℹ 108 more rows

3.6 Arrange and sort rows

arrange() Ranking from least to most

freq_dest |> 
  arrange(num_flights)

# A tibble: 118 × 2
   dest  num_flights
   <chr>       <int>
 1 LEX             1
 2 AGS            20
 3 OGG            20
 4 SBN            24
 5 HDN            28
 6 PNS            71
 7 MTJ            77
 8 ANC            95
 9 VPS           109
10 AVP           145
# ℹ 108 more rows

3.6 Arrange and sort rows

Ranking from most to least

freq_dest |>  
  arrange(desc(num_flights))

# A tibble: 118 × 2
   dest  num_flights
   <chr>       <int>
 1 BOS         19036
 2 ORD         18200
 3 MCO         17756
 4 ATL         17570
 5 MIA         16076
 6 LAX         15968
 7 FLL         14239
 8 CLT         12866
 9 DFW         11675
10 SFO         11651
# ℹ 108 more rows

3.7 Join data frame

inner_join()

Goal: To let flights carrier has carrier name

View(airlines)
flights_joined <- flights |> 
  inner_join(airlines, by = "carrier")
View(flights)
View(flights_joined)

3.7 Join data frame

left_join() keeps all observations in x.
right_join() keeps all observations in y.
semi_join() return all rows from x with a match in y
anti_join() return all rows from x without a match in y

3.7 Join data frame

Goal: To let flights dest has full destination name

View(airports)
flights_with_airport_names <- flights |> 
  inner_join(airports, by = c("dest" = "faa"))
View(flights_with_airport_names)

Exercise 3

Can you group flights data by dest and show

total number flights by dest
airport full name

# A tibble: 114 × 9
   dest  num_flights airport_name             lat    lon   alt    tz dst   tzone
   <chr>       <int> <chr>                  <dbl>  <dbl> <dbl> <dbl> <chr> <chr>
 1 BOS         19036 General Edward Lawren…  42.4  -71.0    20    -5 A     Amer…
 2 ORD         18200 Chicago O'Hare Intern…  42.0  -87.9   672    -6 A     Amer…
 3 MCO         17756 Orlando International…  28.4  -81.3    96    -5 A     Amer…
 4 ATL         17570 Hartsfield Jackson At…  33.6  -84.4  1026    -5 A     Amer…
 5 MIA         16076 Miami International A…  25.8  -80.3     8    -5 A     Amer…
 6 LAX         15968 Los Angeles Internati…  33.9 -118.    125    -8 A     Amer…
 7 FLL         14239 Fort Lauderdale Holly…  26.1  -80.2     9    -5 A     Amer…
 8 CLT         12866 Charlotte Douglas Int…  35.2  -80.9   748    -5 A     Amer…
 9 DFW         11675 Dallas Fort Worth Int…  32.9  -97.0   607    -6 A     Amer…
10 SFO         11651 San Francisco Interna…  37.6 -122.     13    -8 A     Amer…
# ℹ 104 more rows

3.7 Join data frame

flights_weather_joined <- flights |>
  inner_join(weather, by = c("year", "month", "day", "hour", "origin"))
View(flights_weather_joined)

3.8 Others

select() only a subset of variables/columns.
relocate() variables/columns to a new position.
rename() variables/columns to have new names.
Return only the top_n() values of a variable.

3.8 Select 1/3

select()only a subset of variables/columns.

#glimpse(flights)

#Select the column that you are interested in
flights |> 
  select(carrier, flight)

# A tibble: 435,352 × 2
   carrier flight
   <chr>    <int>
 1 UA         628
 2 DL         393
 3 B6         371
 4 B6        1053
 5 UA         219
 6 AA         499
 7 B6         996
 8 AA         981
 9 UA         206
10 NK         225
# ℹ 435,342 more rows

3.8 Select 2/3

#Select the flight without year data
flights_no_year <- flights |> 
  select(-year)

#Select a range of data
flight_arr_times <- flights |> 
  select(month:day, arr_time:sched_arr_time)

3.8 Select 3/3

# select by the condition

# flights |> select(starts_with("a"))
# flights |> select(ends_with("delay"))
# flights |> select(contains("time"))

# select the rest of table

flights_reorder <- flights |> 
  select(year, month, day, hour, minute, time_hour, everything())
glimpse(flights_reorder)

Rows: 435,352
Columns: 22
$ year           <int> 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2…
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ hour           <dbl> 20, 23, 23, 21, 20, 5, 5, 5, 5, 5, 5, 6, 5, 6, 6, 6, 6,…
$ minute         <dbl> 38, 0, 44, 40, 48, 0, 10, 30, 20, 45, 59, 0, 59, 0, 0, …
$ time_hour      <dttm> 2023-01-01 20:00:00, 2023-01-01 23:00:00, 2023-01-01 2…
$ dep_time       <int> 1, 18, 31, 33, 36, 503, 520, 524, 537, 547, 549, 551, 5…
$ sched_dep_time <int> 2038, 2300, 2344, 2140, 2048, 500, 510, 530, 520, 545, …
$ dep_delay      <dbl> 203, 78, 47, 173, 228, 3, 10, -6, 17, 2, -10, -9, -7, -…
$ arr_time       <int> 328, 228, 500, 238, 223, 808, 948, 645, 926, 845, 905, …
$ sched_arr_time <int> 3, 135, 426, 2352, 2252, 815, 949, 710, 818, 852, 901, …
$ arr_delay      <dbl> 205, 53, 34, 166, 211, -7, -1, -25, 68, -7, 4, -13, -14…
$ carrier        <chr> "UA", "DL", "B6", "B6", "UA", "AA", "B6", "AA", "UA", "…
$ flight         <int> 628, 393, 371, 1053, 219, 499, 996, 981, 206, 225, 800,…
$ tailnum        <chr> "N25201", "N830DN", "N807JB", "N265JB", "N17730", "N925…
$ origin         <chr> "EWR", "JFK", "JFK", "JFK", "EWR", "EWR", "JFK", "EWR",…
$ dest           <chr> "SMF", "ATL", "BQN", "CHS", "DTW", "MIA", "BQN", "ORD",…
$ air_time       <dbl> 367, 108, 190, 108, 80, 154, 192, 119, 258, 157, 164, 1…
$ distance       <dbl> 2500, 760, 1576, 636, 488, 1085, 1576, 719, 1400, 1065,…
$ gain           <dbl> -2, 25, 13, 7, 17, 10, 11, 19, -51, 9, -14, 4, 7, 0, 4,…
$ hours          <dbl> 6.116667, 1.800000, 3.166667, 1.800000, 1.333333, 2.566…
$ gain_per_hour  <dbl> -0.3269755, 13.8888889, 4.1052632, 3.8888889, 12.750000…

3.8 relocate

relocate() variables/columns to a new position.

flights_relocate <- flights |> 
  relocate(hour, minute, time_hour, .after = day)
glimpse(flights_relocate)

Rows: 435,352
Columns: 22
$ year           <int> 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2…
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ hour           <dbl> 20, 23, 23, 21, 20, 5, 5, 5, 5, 5, 5, 6, 5, 6, 6, 6, 6,…
$ minute         <dbl> 38, 0, 44, 40, 48, 0, 10, 30, 20, 45, 59, 0, 59, 0, 0, …
$ time_hour      <dttm> 2023-01-01 20:00:00, 2023-01-01 23:00:00, 2023-01-01 2…
$ dep_time       <int> 1, 18, 31, 33, 36, 503, 520, 524, 537, 547, 549, 551, 5…
$ sched_dep_time <int> 2038, 2300, 2344, 2140, 2048, 500, 510, 530, 520, 545, …
$ dep_delay      <dbl> 203, 78, 47, 173, 228, 3, 10, -6, 17, 2, -10, -9, -7, -…
$ arr_time       <int> 328, 228, 500, 238, 223, 808, 948, 645, 926, 845, 905, …
$ sched_arr_time <int> 3, 135, 426, 2352, 2252, 815, 949, 710, 818, 852, 901, …
$ arr_delay      <dbl> 205, 53, 34, 166, 211, -7, -1, -25, 68, -7, 4, -13, -14…
$ carrier        <chr> "UA", "DL", "B6", "B6", "UA", "AA", "B6", "AA", "UA", "…
$ flight         <int> 628, 393, 371, 1053, 219, 499, 996, 981, 206, 225, 800,…
$ tailnum        <chr> "N25201", "N830DN", "N807JB", "N265JB", "N17730", "N925…
$ origin         <chr> "EWR", "JFK", "JFK", "JFK", "EWR", "EWR", "JFK", "EWR",…
$ dest           <chr> "SMF", "ATL", "BQN", "CHS", "DTW", "MIA", "BQN", "ORD",…
$ air_time       <dbl> 367, 108, 190, 108, 80, 154, 192, 119, 258, 157, 164, 1…
$ distance       <dbl> 2500, 760, 1576, 636, 488, 1085, 1576, 719, 1400, 1065,…
$ gain           <dbl> -2, 25, 13, 7, 17, 10, 11, 19, -51, 9, -14, 4, 7, 0, 4,…
$ hours          <dbl> 6.116667, 1.800000, 3.166667, 1.800000, 1.333333, 2.566…
$ gain_per_hour  <dbl> -0.3269755, 13.8888889, 4.1052632, 3.8888889, 12.750000…

3.8 rename

rename() variables/columns to have new names

flights_time_new <- flights |> 
  select(dep_time, arr_time) |> 
  rename(departure_time = dep_time, arrival_time = arr_time)
glimpse(flights_time_new)

Rows: 435,352
Columns: 2
$ departure_time <int> 1, 18, 31, 33, 36, 503, 520, 524, 537, 547, 549, 551, 5…
$ arrival_time   <int> 328, 228, 500, 238, 223, 808, 948, 645, 926, 845, 905, …

3.8 Top 10

named_dests |> 
  top_n(n = 10, wt = num_flights)

# A tibble: 10 × 9
   dest  num_flights airport_name             lat    lon   alt    tz dst   tzone
   <chr>       <int> <chr>                  <dbl>  <dbl> <dbl> <dbl> <chr> <chr>
 1 BOS         19036 General Edward Lawren…  42.4  -71.0    20    -5 A     Amer…
 2 ORD         18200 Chicago O'Hare Intern…  42.0  -87.9   672    -6 A     Amer…
 3 MCO         17756 Orlando International…  28.4  -81.3    96    -5 A     Amer…
 4 ATL         17570 Hartsfield Jackson At…  33.6  -84.4  1026    -5 A     Amer…
 5 MIA         16076 Miami International A…  25.8  -80.3     8    -5 A     Amer…
 6 LAX         15968 Los Angeles Internati…  33.9 -118.    125    -8 A     Amer…
 7 FLL         14239 Fort Lauderdale Holly…  26.1  -80.2     9    -5 A     Amer…
 8 CLT         12866 Charlotte Douglas Int…  35.2  -80.9   748    -5 A     Amer…
 9 DFW         11675 Dallas Fort Worth Int…  32.9  -97.0   607    -6 A     Amer…
10 SFO         11651 San Francisco Interna…  37.6 -122.     13    -8 A     Amer…

# how about after arrange?
# named_dests |> 
#   top_n(n = 10, wt = num_flights) |> 
#   arrange(desc(num_flights))