Data cleaning using DPlYR

library(nycflights13)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
flights <- nycflights13::flights
flights
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     1     1      517        515       2     830     819      11 UA     
##  2  2013     1     1      533        529       4     850     830      20 UA     
##  3  2013     1     1      542        540       2     923     850      33 AA     
##  4  2013     1     1      544        545      -1    1004    1022     -18 B6     
##  5  2013     1     1      554        600      -6     812     837     -25 DL     
##  6  2013     1     1      554        558      -4     740     728      12 UA     
##  7  2013     1     1      555        600      -5     913     854      19 B6     
##  8  2013     1     1      557        600      -3     709     723     -14 EV     
##  9  2013     1     1      557        600      -3     838     846      -8 B6     
## 10  2013     1     1      558        600      -2     753     745       8 AA     
## # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay

#Group data

by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay,na.rm = TRUE)) 
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day delay
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # … with 355 more rows
#na.rm -- removes missing values
#delay -- new column name
avg_delay <- summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
avg_delay
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day delay
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # … with 355 more rows
arrange(avg_delay, delay) #displays smallest delay first
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day   delay
##    <int> <int> <int>   <dbl>
##  1  2013     9    24 -1.33  
##  2  2013    10     2 -0.920 
##  3  2013    11     9 -0.629 
##  4  2013     9     6 -0.398 
##  5  2013     9     5 -0.388 
##  6  2013    10    29 -0.349 
##  7  2013     9     7 -0.250 
##  8  2013    11     5 -0.183 
##  9  2013    10    19 -0.107 
## 10  2013    10     1 -0.0990
## # … with 355 more rows
carrier_delay <- summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
# Which carrier had the largest mean departure delay?  Smallest?
by_carrier <- group_by(flights, carrier)
Avg_delay_by_carrier <- summarise(by_carrier, delay = mean(dep_delay, na.rm = TRUE))
arrange(Avg_delay_by_carrier, delay) # smallest delays displayed first
## # A tibble: 16 × 2
##    carrier delay
##    <chr>   <dbl>
##  1 US       3.78
##  2 HA       4.90
##  3 AS       5.80
##  4 AA       8.59
##  5 DL       9.26
##  6 MQ      10.6 
##  7 UA      12.1 
##  8 OO      12.6 
##  9 VX      12.9 
## 10 B6      13.0 
## 11 9E      16.7 
## 12 WN      17.7 
## 13 FL      18.7 
## 14 YV      19.0 
## 15 EV      20.0 
## 16 F9      20.2
arrange(Avg_delay_by_carrier, desc(delay)) # largest delays displayed first
## # A tibble: 16 × 2
##    carrier delay
##    <chr>   <dbl>
##  1 F9      20.2 
##  2 EV      20.0 
##  3 YV      19.0 
##  4 FL      18.7 
##  5 WN      17.7 
##  6 9E      16.7 
##  7 B6      13.0 
##  8 VX      12.9 
##  9 OO      12.6 
## 10 UA      12.1 
## 11 MQ      10.6 
## 12 DL       9.26
## 13 AA       8.59
## 14 AS       5.80
## 15 HA       4.90
## 16 US       3.78
# Which carrier had the largest difference between their max and min departure delay?
Carrier_range <- summarise(by_carrier, 
          max = max(dep_delay, na.rm = TRUE),
          min = min(dep_delay, na.rm = TRUE),
          delta = max - min)
Carrier_range
## # A tibble: 16 × 4
##    carrier   max   min delta
##    <chr>   <dbl> <dbl> <dbl>
##  1 9E        747   -24   771
##  2 AA       1014   -24  1038
##  3 AS        225   -21   246
##  4 B6        502   -43   545
##  5 DL        960   -33   993
##  6 EV        548   -32   580
##  7 F9        853   -27   880
##  8 FL        602   -22   624
##  9 HA       1301   -16  1317
## 10 MQ       1137   -26  1163
## 11 OO        154   -14   168
## 12 UA        483   -20   503
## 13 US        500   -19   519
## 14 VX        653   -20   673
## 15 WN        471   -13   484
## 16 YV        387   -16   403
arrange(Carrier_range, desc(delta))
## # A tibble: 16 × 4
##    carrier   max   min delta
##    <chr>   <dbl> <dbl> <dbl>
##  1 HA       1301   -16  1317
##  2 MQ       1137   -26  1163
##  3 AA       1014   -24  1038
##  4 DL        960   -33   993
##  5 F9        853   -27   880
##  6 9E        747   -24   771
##  7 VX        653   -20   673
##  8 FL        602   -22   624
##  9 EV        548   -32   580
## 10 B6        502   -43   545
## 11 US        500   -19   519
## 12 UA        483   -20   503
## 13 WN        471   -13   484
## 14 YV        387   -16   403
## 15 AS        225   -21   246
## 16 OO        154   -14   168
# Which month has the largest standard deviation for arrival delays?
by_month <- group_by(flights, month)
Stan_dev <- summarise(by_month, delay = sd(arr_delay, na.rm = TRUE))
arrange(Stan_dev, desc(delay))
## # A tibble: 12 × 2
##    month delay
##    <int> <dbl>
##  1     7  57.1
##  2     6  56.1
##  3     4  47.5
##  4    12  46.1
##  5     5  44.2
##  6     3  44.1
##  7     8  42.6
##  8     1  40.4
##  9     9  39.7
## 10     2  39.5
## 11    10  32.6
## 12    11  31.4

#Filter Data

jan1 <- filter(flights, month == 1, day == 1) #when comparing '==' when '=' setting to equal something (changing the table to this one value). 
jan1
## # A tibble: 842 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     1     1      517        515       2     830     819      11 UA     
##  2  2013     1     1      533        529       4     850     830      20 UA     
##  3  2013     1     1      542        540       2     923     850      33 AA     
##  4  2013     1     1      544        545      -1    1004    1022     -18 B6     
##  5  2013     1     1      554        600      -6     812     837     -25 DL     
##  6  2013     1     1      554        558      -4     740     728      12 UA     
##  7  2013     1     1      555        600      -5     913     854      19 B6     
##  8  2013     1     1      557        600      -3     709     723     -14 EV     
##  9  2013     1     1      557        600      -3     838     846      -8 B6     
## 10  2013     1     1      558        600      -2     753     745       8 AA     
## # … with 832 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay
#(()) -- to make R display the table without calling it. 

#Logical Test
== equals
<= c() less than equal to
%in% group member
in.na() missing values
!= not equal to
!() not
can use filter(data_name, logical_test)

#Multiple Logical Test
& boolean and
| boolean or
! not
any – any true
all – all true

#Multiple Comparison ‘&’ is the same as ‘,’ %in% is the same as |

filter(flights, !(arr_delay>120|dep_delay>120))
## # A tibble: 316,050 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     1     1      517        515       2     830     819      11 UA     
##  2  2013     1     1      533        529       4     850     830      20 UA     
##  3  2013     1     1      542        540       2     923     850      33 AA     
##  4  2013     1     1      544        545      -1    1004    1022     -18 B6     
##  5  2013     1     1      554        600      -6     812     837     -25 DL     
##  6  2013     1     1      554        558      -4     740     728      12 UA     
##  7  2013     1     1      555        600      -5     913     854      19 B6     
##  8  2013     1     1      557        600      -3     709     723     -14 EV     
##  9  2013     1     1      557        600      -3     838     846      -8 B6     
## 10  2013     1     1      558        600      -2     753     745       8 AA     
## # … with 316,040 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay
filter(flights, arr_delay <= 120, dep_delay <= 120)
## # A tibble: 316,050 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     1     1      517        515       2     830     819      11 UA     
##  2  2013     1     1      533        529       4     850     830      20 UA     
##  3  2013     1     1      542        540       2     923     850      33 AA     
##  4  2013     1     1      544        545      -1    1004    1022     -18 B6     
##  5  2013     1     1      554        600      -6     812     837     -25 DL     
##  6  2013     1     1      554        558      -4     740     728      12 UA     
##  7  2013     1     1      555        600      -5     913     854      19 B6     
##  8  2013     1     1      557        600      -3     709     723     -14 EV     
##  9  2013     1     1      557        600      -3     838     846      -8 B6     
## 10  2013     1     1      558        600      -2     753     745       8 AA     
## # … with 316,040 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay

Different ways to show the same things.

#Find the number of flights that
#1.Had an arrival delay of two or more hours
filter(flights, arr_delay >= 120)
## # A tibble: 10,200 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     1     1      811        630     101    1047     830     137 MQ     
##  2  2013     1     1      848       1835     853    1001    1950     851 MQ     
##  3  2013     1     1      957        733     144    1056     853     123 UA     
##  4  2013     1     1     1114        900     134    1447    1222     145 UA     
##  5  2013     1     1     1505       1310     115    1638    1431     127 EV     
##  6  2013     1     1     1525       1340     105    1831    1626     125 B6     
##  7  2013     1     1     1549       1445      64    1912    1656     136 EV     
##  8  2013     1     1     1558       1359     119    1718    1515     123 EV     
##  9  2013     1     1     1732       1630      62    2028    1825     123 EV     
## 10  2013     1     1     1803       1620     103    2008    1750     138 MQ     
## # … with 10,190 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay
#2.Flew to Houston (IAH or HOU)
filter(flights, dest == 'IAH' | dest == 'HOU')
## # A tibble: 9,313 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     1     1      517        515       2     830     819      11 UA     
##  2  2013     1     1      533        529       4     850     830      20 UA     
##  3  2013     1     1      623        627      -4     933     932       1 UA     
##  4  2013     1     1      728        732      -4    1041    1038       3 UA     
##  5  2013     1     1      739        739       0    1104    1038      26 UA     
##  6  2013     1     1      908        908       0    1228    1219       9 UA     
##  7  2013     1     1     1028       1026       2    1350    1339      11 UA     
##  8  2013     1     1     1044       1045      -1    1352    1351       1 UA     
##  9  2013     1     1     1114        900     134    1447    1222     145 UA     
## 10  2013     1     1     1205       1200       5    1503    1505      -2 UA     
## # … with 9,303 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay
#3.Arrived more than two hours late, but didn’t leave late
filter(flights, arr_delay > 120 & dep_delay <= 0)
## # A tibble: 29 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     1    27     1419       1420      -1    1754    1550     124 MQ     
##  2  2013    10     7     1350       1350       0    1736    1526     130 EV     
##  3  2013    10     7     1357       1359      -2    1858    1654     124 AA     
##  4  2013    10    16      657        700      -3    1258    1056     122 B6     
##  5  2013    11     1      658        700      -2    1329    1015     194 VX     
##  6  2013     3    18     1844       1847      -3      39    2219     140 UA     
##  7  2013     4    17     1635       1640      -5    2049    1845     124 MQ     
##  8  2013     4    18      558        600      -2    1149     850     179 AA     
##  9  2013     4    18      655        700      -5    1213     950     143 AA     
## 10  2013     5    22     1827       1830      -3    2217    2010     127 MQ     
## # … with 19 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay

#Relational Data

x <- data.frame(key = c(1, 2, 3), val = c("x1", "x2", "x3"))

y <- data.frame(key = c(1, 2, 4), val = c("y1", "y2", "y3"))
#Inner Join
x %>% inner_join(y, by = "key") -> joined_data
joined_data
##   key val.x val.y
## 1   1    x1    y1
## 2   2    x2    y2

There are 3 types of Outer Join

#Left Join
x %>% left_join(y, by = "key") -> joined_data
joined_data
##   key val.x val.y
## 1   1    x1    y1
## 2   2    x2    y2
## 3   3    x3  <NA>
#Right Join
x %>% right_join(y, by = "key") -> joined_data
joined_data
##   key val.x val.y
## 1   1    x1    y1
## 2   2    x2    y2
## 3   4  <NA>    y3
#Full Join
x %>% full_join(y, by = "key") -> joined_data
joined_data
##   key val.x val.y
## 1   1    x1    y1
## 2   2    x2    y2
## 3   3    x3  <NA>
## 4   4  <NA>    y3

#Denifing Keys

#when keys do not match 
#x %>% inner_join(y, by = c("key1" = “key2”)) -> joined_data
#joined_data
airlines <- nycflights13::airlines
head(airlines)
## # A tibble: 6 × 2
##   carrier name                    
##   <chr>   <chr>                   
## 1 9E      Endeavor Air Inc.       
## 2 AA      American Airlines Inc.  
## 3 AS      Alaska Airlines Inc.    
## 4 B6      JetBlue Airways         
## 5 DL      Delta Air Lines Inc.    
## 6 EV      ExpressJet Airlines Inc.

Join the flights and airlines datasets so that all rows in the flights dataset are included. Using this joined dataset, create a new dataset for which all of the following has been done: 1.The “name” variable has been filtered for “Virgin America”

flights %>% left_join(airlines, by = "carrier") -> joined_data #join the tables
filter <- filter(joined_data, name == "Virgin America") #filter
filter
## # A tibble: 5,162 × 20
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     1     1      658        700      -2    1027    1025       2 VX     
##  2  2013     1     1      729        730      -1    1049    1115     -26 VX     
##  3  2013     1     1      859        900      -1    1223    1225      -2 VX     
##  4  2013     1     1      932        930       2    1219    1225      -6 VX     
##  5  2013     1     1     1031       1030       1    1353    1415     -22 VX     
##  6  2013     1     1     1133       1130       3    1448    1450      -2 VX     
##  7  2013     1     1     1203       1200       3    1519    1545     -26 VX     
##  8  2013     1     1     1327       1330      -3    1638    1655     -17 VX     
##  9  2013     1     1     1627       1630      -3    1940    2020     -40 VX     
## 10  2013     1     1     1655       1655       0    2025    2030      -5 VX     
## # … with 5,152 more rows, 10 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, name <chr>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay

2.Is grouped by time_hour

by_time <- group_by(joined_data, time_hour)

3.summarise the data by computing the mean dep_delay identify the top 10 date-times in your newly created dataset that have the highest mean dep_delay

#summarize using and display the mean 
summarized <- summarise(by_time, delay = mean(dep_delay, na.mr = TRUE))
summarized
## # A tibble: 6,936 × 2
##    time_hour            delay
##    <dttm>               <dbl>
##  1 2013-01-01 05:00:00  0.5  
##  2 2013-01-01 06:00:00 NA    
##  3 2013-01-01 07:00:00  3.51 
##  4 2013-01-01 08:00:00  0.448
##  5 2013-01-01 09:00:00  5.34 
##  6 2013-01-01 10:00:00  0.333
##  7 2013-01-01 11:00:00  3.19 
##  8 2013-01-01 12:00:00  5.75 
##  9 2013-01-01 13:00:00 20.4  
## 10 2013-01-01 14:00:00 17.2  
## # … with 6,926 more rows
arrange(summarized, desc(delay)) #arrange in descending order
## # A tibble: 6,936 × 2
##    time_hour           delay
##    <dttm>              <dbl>
##  1 2013-08-08 22:00:00 134. 
##  2 2013-07-10 23:00:00 124. 
##  3 2013-07-13 23:00:00 120. 
##  4 2013-03-08 22:00:00 118. 
##  5 2013-07-07 23:00:00 105  
##  6 2013-06-13 22:00:00 103. 
##  7 2013-07-18 23:00:00 101. 
##  8 2013-08-08 23:00:00 100  
##  9 2013-02-11 21:00:00  95.6
## 10 2013-02-11 22:00:00  95.3
## # … with 6,926 more rows
arrange(summarized, delay) #acending
## # A tibble: 6,936 × 2
##    time_hour           delay
##    <dttm>              <dbl>
##  1 2013-10-02 23:00:00 -18  
##  2 2013-09-23 23:00:00 -17  
##  3 2013-10-22 23:00:00 -16  
##  4 2013-01-20 23:00:00 -13  
##  5 2013-09-16 23:00:00 -13  
##  6 2013-09-27 23:00:00 -13  
##  7 2013-10-06 23:00:00 -13  
##  8 2013-10-09 23:00:00 -13  
##  9 2013-09-03 22:00:00 -12.1
## 10 2013-09-10 23:00:00 -12  
## # … with 6,926 more rows