Data cleaning using DPlYR
library(nycflights13)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
flights <- nycflights13::flights
flights
## # A tibble: 336,776 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 542 540 2 923 850 33 AA
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6
## 5 2013 1 1 554 600 -6 812 837 -25 DL
## 6 2013 1 1 554 558 -4 740 728 12 UA
## 7 2013 1 1 555 600 -5 913 854 19 B6
## 8 2013 1 1 557 600 -3 709 723 -14 EV
## 9 2013 1 1 557 600 -3 838 846 -8 B6
## 10 2013 1 1 558 600 -2 753 745 8 AA
## # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
#Group data
by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay,na.rm = TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # … with 355 more rows
#na.rm -- removes missing values
#delay -- new column name
avg_delay <- summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
avg_delay
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # … with 355 more rows
arrange(avg_delay, delay) #displays smallest delay first
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day delay
## <int> <int> <int> <dbl>
## 1 2013 9 24 -1.33
## 2 2013 10 2 -0.920
## 3 2013 11 9 -0.629
## 4 2013 9 6 -0.398
## 5 2013 9 5 -0.388
## 6 2013 10 29 -0.349
## 7 2013 9 7 -0.250
## 8 2013 11 5 -0.183
## 9 2013 10 19 -0.107
## 10 2013 10 1 -0.0990
## # … with 355 more rows
carrier_delay <- summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
# Which carrier had the largest mean departure delay? Smallest?
by_carrier <- group_by(flights, carrier)
Avg_delay_by_carrier <- summarise(by_carrier, delay = mean(dep_delay, na.rm = TRUE))
arrange(Avg_delay_by_carrier, delay) # smallest delays displayed first
## # A tibble: 16 × 2
## carrier delay
## <chr> <dbl>
## 1 US 3.78
## 2 HA 4.90
## 3 AS 5.80
## 4 AA 8.59
## 5 DL 9.26
## 6 MQ 10.6
## 7 UA 12.1
## 8 OO 12.6
## 9 VX 12.9
## 10 B6 13.0
## 11 9E 16.7
## 12 WN 17.7
## 13 FL 18.7
## 14 YV 19.0
## 15 EV 20.0
## 16 F9 20.2
arrange(Avg_delay_by_carrier, desc(delay)) # largest delays displayed first
## # A tibble: 16 × 2
## carrier delay
## <chr> <dbl>
## 1 F9 20.2
## 2 EV 20.0
## 3 YV 19.0
## 4 FL 18.7
## 5 WN 17.7
## 6 9E 16.7
## 7 B6 13.0
## 8 VX 12.9
## 9 OO 12.6
## 10 UA 12.1
## 11 MQ 10.6
## 12 DL 9.26
## 13 AA 8.59
## 14 AS 5.80
## 15 HA 4.90
## 16 US 3.78
# Which carrier had the largest difference between their max and min departure delay?
Carrier_range <- summarise(by_carrier,
max = max(dep_delay, na.rm = TRUE),
min = min(dep_delay, na.rm = TRUE),
delta = max - min)
Carrier_range
## # A tibble: 16 × 4
## carrier max min delta
## <chr> <dbl> <dbl> <dbl>
## 1 9E 747 -24 771
## 2 AA 1014 -24 1038
## 3 AS 225 -21 246
## 4 B6 502 -43 545
## 5 DL 960 -33 993
## 6 EV 548 -32 580
## 7 F9 853 -27 880
## 8 FL 602 -22 624
## 9 HA 1301 -16 1317
## 10 MQ 1137 -26 1163
## 11 OO 154 -14 168
## 12 UA 483 -20 503
## 13 US 500 -19 519
## 14 VX 653 -20 673
## 15 WN 471 -13 484
## 16 YV 387 -16 403
arrange(Carrier_range, desc(delta))
## # A tibble: 16 × 4
## carrier max min delta
## <chr> <dbl> <dbl> <dbl>
## 1 HA 1301 -16 1317
## 2 MQ 1137 -26 1163
## 3 AA 1014 -24 1038
## 4 DL 960 -33 993
## 5 F9 853 -27 880
## 6 9E 747 -24 771
## 7 VX 653 -20 673
## 8 FL 602 -22 624
## 9 EV 548 -32 580
## 10 B6 502 -43 545
## 11 US 500 -19 519
## 12 UA 483 -20 503
## 13 WN 471 -13 484
## 14 YV 387 -16 403
## 15 AS 225 -21 246
## 16 OO 154 -14 168
# Which month has the largest standard deviation for arrival delays?
by_month <- group_by(flights, month)
Stan_dev <- summarise(by_month, delay = sd(arr_delay, na.rm = TRUE))
arrange(Stan_dev, desc(delay))
## # A tibble: 12 × 2
## month delay
## <int> <dbl>
## 1 7 57.1
## 2 6 56.1
## 3 4 47.5
## 4 12 46.1
## 5 5 44.2
## 6 3 44.1
## 7 8 42.6
## 8 1 40.4
## 9 9 39.7
## 10 2 39.5
## 11 10 32.6
## 12 11 31.4
#Filter Data
jan1 <- filter(flights, month == 1, day == 1) #when comparing '==' when '=' setting to equal something (changing the table to this one value).
jan1
## # A tibble: 842 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 542 540 2 923 850 33 AA
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6
## 5 2013 1 1 554 600 -6 812 837 -25 DL
## 6 2013 1 1 554 558 -4 740 728 12 UA
## 7 2013 1 1 555 600 -5 913 854 19 B6
## 8 2013 1 1 557 600 -3 709 723 -14 EV
## 9 2013 1 1 557 600 -3 838 846 -8 B6
## 10 2013 1 1 558 600 -2 753 745 8 AA
## # … with 832 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
#(()) -- to make R display the table without calling it.
#Logical Test
== equals
<= c() less than equal to
%in% group member
in.na() missing values
!= not equal to
!() not
can use filter(data_name, logical_test)
#Multiple Logical Test
& boolean and
| boolean or
! not
any – any true
all – all true
#Multiple Comparison ‘&’ is the same as ‘,’ %in% is the same as |
filter(flights, !(arr_delay>120|dep_delay>120))
## # A tibble: 316,050 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 542 540 2 923 850 33 AA
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6
## 5 2013 1 1 554 600 -6 812 837 -25 DL
## 6 2013 1 1 554 558 -4 740 728 12 UA
## 7 2013 1 1 555 600 -5 913 854 19 B6
## 8 2013 1 1 557 600 -3 709 723 -14 EV
## 9 2013 1 1 557 600 -3 838 846 -8 B6
## 10 2013 1 1 558 600 -2 753 745 8 AA
## # … with 316,040 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
filter(flights, arr_delay <= 120, dep_delay <= 120)
## # A tibble: 316,050 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 542 540 2 923 850 33 AA
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6
## 5 2013 1 1 554 600 -6 812 837 -25 DL
## 6 2013 1 1 554 558 -4 740 728 12 UA
## 7 2013 1 1 555 600 -5 913 854 19 B6
## 8 2013 1 1 557 600 -3 709 723 -14 EV
## 9 2013 1 1 557 600 -3 838 846 -8 B6
## 10 2013 1 1 558 600 -2 753 745 8 AA
## # … with 316,040 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
Different ways to show the same things.
#Find the number of flights that
#1.Had an arrival delay of two or more hours
filter(flights, arr_delay >= 120)
## # A tibble: 10,200 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 811 630 101 1047 830 137 MQ
## 2 2013 1 1 848 1835 853 1001 1950 851 MQ
## 3 2013 1 1 957 733 144 1056 853 123 UA
## 4 2013 1 1 1114 900 134 1447 1222 145 UA
## 5 2013 1 1 1505 1310 115 1638 1431 127 EV
## 6 2013 1 1 1525 1340 105 1831 1626 125 B6
## 7 2013 1 1 1549 1445 64 1912 1656 136 EV
## 8 2013 1 1 1558 1359 119 1718 1515 123 EV
## 9 2013 1 1 1732 1630 62 2028 1825 123 EV
## 10 2013 1 1 1803 1620 103 2008 1750 138 MQ
## # … with 10,190 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
#2.Flew to Houston (IAH or HOU)
filter(flights, dest == 'IAH' | dest == 'HOU')
## # A tibble: 9,313 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 623 627 -4 933 932 1 UA
## 4 2013 1 1 728 732 -4 1041 1038 3 UA
## 5 2013 1 1 739 739 0 1104 1038 26 UA
## 6 2013 1 1 908 908 0 1228 1219 9 UA
## 7 2013 1 1 1028 1026 2 1350 1339 11 UA
## 8 2013 1 1 1044 1045 -1 1352 1351 1 UA
## 9 2013 1 1 1114 900 134 1447 1222 145 UA
## 10 2013 1 1 1205 1200 5 1503 1505 -2 UA
## # … with 9,303 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
#3.Arrived more than two hours late, but didn’t leave late
filter(flights, arr_delay > 120 & dep_delay <= 0)
## # A tibble: 29 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 27 1419 1420 -1 1754 1550 124 MQ
## 2 2013 10 7 1350 1350 0 1736 1526 130 EV
## 3 2013 10 7 1357 1359 -2 1858 1654 124 AA
## 4 2013 10 16 657 700 -3 1258 1056 122 B6
## 5 2013 11 1 658 700 -2 1329 1015 194 VX
## 6 2013 3 18 1844 1847 -3 39 2219 140 UA
## 7 2013 4 17 1635 1640 -5 2049 1845 124 MQ
## 8 2013 4 18 558 600 -2 1149 850 179 AA
## 9 2013 4 18 655 700 -5 1213 950 143 AA
## 10 2013 5 22 1827 1830 -3 2217 2010 127 MQ
## # … with 19 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
#Relational Data
x <- data.frame(key = c(1, 2, 3), val = c("x1", "x2", "x3"))
y <- data.frame(key = c(1, 2, 4), val = c("y1", "y2", "y3"))
#Inner Join
x %>% inner_join(y, by = "key") -> joined_data
joined_data
## key val.x val.y
## 1 1 x1 y1
## 2 2 x2 y2
There are 3 types of Outer Join
#Left Join
x %>% left_join(y, by = "key") -> joined_data
joined_data
## key val.x val.y
## 1 1 x1 y1
## 2 2 x2 y2
## 3 3 x3 <NA>
#Right Join
x %>% right_join(y, by = "key") -> joined_data
joined_data
## key val.x val.y
## 1 1 x1 y1
## 2 2 x2 y2
## 3 4 <NA> y3
#Full Join
x %>% full_join(y, by = "key") -> joined_data
joined_data
## key val.x val.y
## 1 1 x1 y1
## 2 2 x2 y2
## 3 3 x3 <NA>
## 4 4 <NA> y3
#Denifing Keys
#when keys do not match
#x %>% inner_join(y, by = c("key1" = “key2”)) -> joined_data
#joined_data
airlines <- nycflights13::airlines
head(airlines)
## # A tibble: 6 × 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
Join the flights and airlines datasets so that all rows in the flights dataset are included. Using this joined dataset, create a new dataset for which all of the following has been done: 1.The “name” variable has been filtered for “Virgin America”
flights %>% left_join(airlines, by = "carrier") -> joined_data #join the tables
filter <- filter(joined_data, name == "Virgin America") #filter
filter
## # A tibble: 5,162 × 20
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 658 700 -2 1027 1025 2 VX
## 2 2013 1 1 729 730 -1 1049 1115 -26 VX
## 3 2013 1 1 859 900 -1 1223 1225 -2 VX
## 4 2013 1 1 932 930 2 1219 1225 -6 VX
## 5 2013 1 1 1031 1030 1 1353 1415 -22 VX
## 6 2013 1 1 1133 1130 3 1448 1450 -2 VX
## 7 2013 1 1 1203 1200 3 1519 1545 -26 VX
## 8 2013 1 1 1327 1330 -3 1638 1655 -17 VX
## 9 2013 1 1 1627 1630 -3 1940 2020 -40 VX
## 10 2013 1 1 1655 1655 0 2025 2030 -5 VX
## # … with 5,152 more rows, 10 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, name <chr>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
2.Is grouped by time_hour
by_time <- group_by(joined_data, time_hour)
3.summarise the data by computing the mean dep_delay identify the top 10 date-times in your newly created dataset that have the highest mean dep_delay
#summarize using and display the mean
summarized <- summarise(by_time, delay = mean(dep_delay, na.mr = TRUE))
summarized
## # A tibble: 6,936 × 2
## time_hour delay
## <dttm> <dbl>
## 1 2013-01-01 05:00:00 0.5
## 2 2013-01-01 06:00:00 NA
## 3 2013-01-01 07:00:00 3.51
## 4 2013-01-01 08:00:00 0.448
## 5 2013-01-01 09:00:00 5.34
## 6 2013-01-01 10:00:00 0.333
## 7 2013-01-01 11:00:00 3.19
## 8 2013-01-01 12:00:00 5.75
## 9 2013-01-01 13:00:00 20.4
## 10 2013-01-01 14:00:00 17.2
## # … with 6,926 more rows
arrange(summarized, desc(delay)) #arrange in descending order
## # A tibble: 6,936 × 2
## time_hour delay
## <dttm> <dbl>
## 1 2013-08-08 22:00:00 134.
## 2 2013-07-10 23:00:00 124.
## 3 2013-07-13 23:00:00 120.
## 4 2013-03-08 22:00:00 118.
## 5 2013-07-07 23:00:00 105
## 6 2013-06-13 22:00:00 103.
## 7 2013-07-18 23:00:00 101.
## 8 2013-08-08 23:00:00 100
## 9 2013-02-11 21:00:00 95.6
## 10 2013-02-11 22:00:00 95.3
## # … with 6,926 more rows
arrange(summarized, delay) #acending
## # A tibble: 6,936 × 2
## time_hour delay
## <dttm> <dbl>
## 1 2013-10-02 23:00:00 -18
## 2 2013-09-23 23:00:00 -17
## 3 2013-10-22 23:00:00 -16
## 4 2013-01-20 23:00:00 -13
## 5 2013-09-16 23:00:00 -13
## 6 2013-09-27 23:00:00 -13
## 7 2013-10-06 23:00:00 -13
## 8 2013-10-09 23:00:00 -13
## 9 2013-09-03 22:00:00 -12.1
## 10 2013-09-10 23:00:00 -12
## # … with 6,926 more rows