library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.6
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
variable.names(airports)
## [1] "faa" "name" "lat" "lon" "alt" "tz" "dst" "tzone"
flights_route <- flights %>%
inner_join(select(airports,
origin = faa,
origin_lat = lat,
origin_lon = lon),
by = "origin") %>%
inner_join(select(airports,
dest = faa,
dest_lat = lat,
dest_lon = lon),
by = "dest") %>%
select("origin", "dest", "origin_lat", "origin_lon", "dest_lat", "dest_lon")
flights_route
## # A tibble: 329,174 x 6
## origin dest origin_lat origin_lon dest_lat dest_lon
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 EWR IAH 40.7 -74.2 30.0 -95.3
## 2 LGA IAH 40.8 -73.9 30.0 -95.3
## 3 JFK MIA 40.6 -73.8 25.8 -80.3
## 4 LGA ATL 40.8 -73.9 33.6 -84.4
## 5 EWR ORD 40.7 -74.2 42.0 -87.9
## 6 EWR FLL 40.7 -74.2 26.1 -80.2
## 7 LGA IAD 40.8 -73.9 38.9 -77.5
## 8 JFK MCO 40.6 -73.8 28.4 -81.3
## 9 LGA ORD 40.8 -73.9 42.0 -87.9
## 10 JFK PBI 40.6 -73.8 26.7 -80.1
## # … with 329,164 more rows
variable.names(airports)
## [1] "faa" "name" "lat" "lon" "alt" "tz" "dst" "tzone"
variable.names(weather)
## [1] "origin" "year" "month" "day" "hour"
## [6] "temp" "dewp" "humid" "wind_dir" "wind_speed"
## [11] "wind_gust" "precip" "pressure" "visib" "time_hour"
flights %>%
arrange(year, month, day, sched_dep_time, carrier, flight) %>%
mutate(flight_sur = row_number()) %>%
glimpse()
## Rows: 336,776
## Columns: 20
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 559, 558, 559, 558, 558, 557, …
## $ sched_dep_time <int> 515, 529, 540, 545, 558, 559, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -4, 0, -2, -1, -2, -2, -3, NA, 1, 0, -5, -…
## $ arr_time <int> 830, 850, 923, 1004, 740, 702, 753, 941, 849, 853, 838,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 728, 706, 745, 910, 851, 856, 846,…
## $ arr_delay <dbl> 11, 20, 33, -18, 12, -4, 8, 31, -2, -3, -8, NA, -6, -7,…
## $ carrier <chr> "UA", "UA", "AA", "B6", "UA", "B6", "AA", "AA", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 1696, 1806, 301, 707, 49, 71, 79…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N39463", "N708…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "EWR", "JFK", "LGA", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ORD", "BOS", "ORD", "DFW",…
## $ air_time <dbl> 227, 227, 160, 183, 150, 44, 138, 257, 149, 158, 140, N…
## $ distance <dbl> 1400, 1416, 1089, 1576, 719, 187, 733, 1389, 1028, 1005…
## $ hour <dbl> 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 58, 59, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
## $ flight_sur <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
Lahman::Batting#Lahman::Batting
Lahman::Batting %>%
count(playerID) %>%
filter(n > 1) %>%
nrow()
## [1] 14748
babynames::babynames#babynames::babynames
babynames::babynames %>%
count(year, sex, name) %>%
filter(n > 1) %>%
nrow()
## [1] 0
nasaweather::atmos#nasaweather::atmos
nasaweather::atmos %>%
count(lat, long, year, month) %>%
filter(n > 1) %>%
nrow()
## [1] 0
fueleconomy::vehicles#fueleconomy::vehicles
fueleconomy::vehicles %>%
count(id) %>%
filter(n > 1) %>%
nrow()
## [1] 0
ggplot2::diamonds#ggplot2::diamonds
ggplot2::diamonds %>%
count(carat, cut, color, clarity) %>%
filter(n > 1) %>%
nrow()
## [1] 8080
airports %>%
semi_join(flights, c("faa" = "dest")) %>%
ggplot(aes(lon, lat)) +
borders("state") +
geom_point() +
coord_quickmap()
#### (Don’t worry if you don’t understand what semi_join() does — you’ll learn about it next.)
avg_delay <- flights %>%
group_by(dest) %>%
summarise(delay = mean(arr_delay, na.rm = TRUE)) %>%
inner_join(airports, by = c(dest = "faa"))
avg_delay %>%
ggplot(aes(lon, lat, color = delay)) +
borders("state") +
geom_point() +
coord_quickmap()
location <- airports %>%
select(lat, lon, faa)
location
## # A tibble: 1,458 x 3
## lat lon faa
## <dbl> <dbl> <chr>
## 1 41.1 -80.6 04G
## 2 32.5 -85.7 06A
## 3 42.0 -88.1 06C
## 4 41.4 -74.4 06N
## 5 31.1 -81.4 09J
## 6 36.4 -82.2 0A9
## 7 41.5 -84.5 0G6
## 8 42.9 -76.8 0G7
## 9 39.8 -76.6 0P2
## 10 48.1 -123. 0S9
## # … with 1,448 more rows
flights %>%
select(origin, dest) %>%
inner_join(location, by = c(origin = "faa")) %>%
inner_join(location, by = c(dest = "faa"))
## # A tibble: 329,174 x 6
## origin dest lat.x lon.x lat.y lon.y
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 EWR IAH 40.7 -74.2 30.0 -95.3
## 2 LGA IAH 40.8 -73.9 30.0 -95.3
## 3 JFK MIA 40.6 -73.8 25.8 -80.3
## 4 LGA ATL 40.8 -73.9 33.6 -84.4
## 5 EWR ORD 40.7 -74.2 42.0 -87.9
## 6 EWR FLL 40.7 -74.2 26.1 -80.2
## 7 LGA IAD 40.8 -73.9 38.9 -77.5
## 8 JFK MCO 40.6 -73.8 28.4 -81.3
## 9 LGA ORD 40.8 -73.9 42.0 -87.9
## 10 JFK PBI 40.6 -73.8 26.7 -80.1
## # … with 329,164 more rows
plane_age <- planes %>%
select(year, tailnum) %>%
mutate(age = 2021 - year)
plane_age
## # A tibble: 3,322 x 3
## year tailnum age
## <int> <chr> <dbl>
## 1 2004 N10156 17
## 2 1998 N102UW 23
## 3 1999 N103US 22
## 4 1999 N104UW 22
## 5 2002 N10575 19
## 6 1999 N105UW 22
## 7 1999 N107US 22
## 8 1999 N108UW 22
## 9 1999 N109UW 22
## 10 1999 N110UW 22
## # … with 3,312 more rows
relation <- flights %>%
select(tailnum, arr_delay) %>%
group_by(tailnum) %>%
summarise(delay = mean(arr_delay, na.rm = TRUE)) %>%
inner_join(plane_age, by = c(tailnum = "tailnum"))
relation
## # A tibble: 3,322 x 4
## tailnum delay year age
## <chr> <dbl> <int> <dbl>
## 1 N10156 12.7 2004 17
## 2 N102UW 2.94 1998 23
## 3 N103US -6.93 1999 22
## 4 N104UW 1.80 1999 22
## 5 N10575 20.7 2002 19
## 6 N105UW -0.267 1999 22
## 7 N107US -5.73 1999 22
## 8 N108UW -1.25 1999 22
## 9 N109UW -2.52 1999 22
## 10 N110UW 2.8 1999 22
## # … with 3,312 more rows
relation %>%
ggplot(aes(x = age, y = delay)) +
geom_point(color = "purple") +
xlim(10, 40) + #range
ylim(50, 400)
## Warning: Removed 3272 rows containing missing values (geom_point).
weather_relation <- flights %>%
inner_join(weather, by = c(origin = "origin",
year = "year",
month = "month",
day = "day",
hour = "hour"))
weather_relation
## # A tibble: 335,220 x 29
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 335,210 more rows, and 21 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour.x <dttm>, temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>,
## # wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, time_hour.y <dttm>
weather_relation %>%
group_by(temp) %>%
summarise(weather_delay = mean(arr_delay, na.rm = TRUE)) %>%
ggplot(aes(x = temp, y = weather_delay)) +
geom_point(color = "red") +
geom_line(color = "pink")
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
June_13_2013 <- flights %>%
filter(year == 2013, month == 6, day == 13) %>%
inner_join(weather, by = c(origin = "origin",
year = "year",
month = "month",
day = "day"))
June_13_2013
## # A tibble: 23,736 x 30
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 6 13 3 2255 68 120 19
## 2 2013 6 13 3 2255 68 120 19
## 3 2013 6 13 3 2255 68 120 19
## 4 2013 6 13 3 2255 68 120 19
## 5 2013 6 13 3 2255 68 120 19
## 6 2013 6 13 3 2255 68 120 19
## 7 2013 6 13 3 2255 68 120 19
## 8 2013 6 13 3 2255 68 120 19
## 9 2013 6 13 3 2255 68 120 19
## 10 2013 6 13 3 2255 68 120 19
## # … with 23,726 more rows, and 22 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour.x <dbl>, minute <dbl>,
## # time_hour.x <dttm>, hour.y <int>, temp <dbl>, dewp <dbl>, humid <dbl>,
## # wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>, precip <dbl>,
## # pressure <dbl>, visib <dbl>, time_hour.y <dttm>
June_13_2013 %>%
group_by(dest) %>%
summarise(June_13_delay = mean(arr_delay, na.rm = TRUE)) %>%
inner_join(airports, by = c(dest = "faa")) %>%
ggplot(aes(y = lat, x = lon)) +
borders("state") +
geom_point(aes(size = June_13_delay, color = June_13_delay)) +
coord_quickmap()
## Warning: Removed 3 rows containing missing values (geom_point).
flights %>%
filter(is.na(tailnum))
## # A tibble: 2,512 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 2 NA 1545 NA NA 1910
## 2 2013 1 2 NA 1601 NA NA 1735
## 3 2013 1 3 NA 857 NA NA 1209
## 4 2013 1 3 NA 645 NA NA 952
## 5 2013 1 4 NA 845 NA NA 1015
## 6 2013 1 4 NA 1830 NA NA 2044
## 7 2013 1 5 NA 840 NA NA 1001
## 8 2013 1 7 NA 820 NA NA 958
## 9 2013 1 8 NA 1645 NA NA 1838
## 10 2013 1 9 NA 755 NA NA 1012
## # … with 2,502 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
filter(!is.na(tailnum)) %>% # !is.na means "except missing tailnum data"
group_by(tailnum) %>%
count() %>%
filter(n >= 100)
## # A tibble: 1,217 x 2
## # Groups: tailnum [1,217]
## tailnum n
## <chr> <int>
## 1 N0EGMQ 371
## 2 N10156 153
## 3 N10575 289
## 4 N11106 129
## 5 N11107 148
## 6 N11109 148
## 7 N11113 138
## 8 N11119 148
## 9 N11121 154
## 10 N11127 124
## # … with 1,207 more rows
variable.names(fueleconomy::vehicles)
## [1] "id" "make" "model" "year" "class" "trans" "drive" "cyl" "displ"
## [10] "fuel" "hwy" "cty"
variable.names(fueleconomy::common)
## [1] "make" "model" "n" "years"
fueleconomy::vehicles %>%
inner_join(fueleconomy::common, by = c(make = "make", model = "model")) %>%
select(make, model) %>%
group_by(model) %>%
count() %>%
arrange(desc(n))
## # A tibble: 342 x 2
## # Groups: model [342]
## model n
## <chr> <int>
## 1 F150 Pickup 2WD 200
## 2 Truck 2WD 187
## 3 F150 Pickup 4WD 178
## 4 Ranger Pickup 2WD 169
## 5 Mustang 153
## 6 Jetta 150
## 7 Sierra 1500 2WD 149
## 8 Sierra 1500 4WD 149
## 9 Civic 142
## 10 Eclipse 133
## # … with 332 more rows
anti_join(flights, airports, by = c("dest" = "faa"))%>%
count(dest)
## # A tibble: 4 x 2
## dest n
## <chr> <int>
## 1 BQN 896
## 2 PSE 365
## 3 SJU 5819
## 4 STT 522
anti_join(airports, flights, by = c("faa" = "dest")) %>%
count(faa)
## # A tibble: 1,357 x 2
## faa n
## <chr> <int>
## 1 04G 1
## 2 06A 1
## 3 06C 1
## 4 06N 1
## 5 09J 1
## 6 0A9 1
## 7 0G6 1
## 8 0G7 1
## 9 0P2 1
## 10 0S9 1
## # … with 1,347 more rows
nycflights13::planes # related in tailnum
## # A tibble: 3,322 x 9
## tailnum year type manufacturer model engines seats speed engine
## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
## 1 N10156 2004 Fixed wing m… EMBRAER EMB-1… 2 55 NA Turbo-…
## 2 N102UW 1998 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 3 N103US 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 4 N104UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 5 N10575 2002 Fixed wing m… EMBRAER EMB-1… 2 55 NA Turbo-…
## 6 N105UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 7 N107US 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 8 N108UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 9 N109UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 10 N110UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## # … with 3,312 more rows
nycflights13::airlines # relatied in carrier
## # A tibble: 16 x 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.
nycflights13::flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
inner_join(planes, by = c(tailnum = "tailnum")) %>%
inner_join(airlines, by = c(carrier = "carrier")) %>%
select(tailnum, carrier, name)
## # A tibble: 284,170 x 3
## tailnum carrier name
## <chr> <chr> <chr>
## 1 N14228 UA United Air Lines Inc.
## 2 N24211 UA United Air Lines Inc.
## 3 N619AA AA American Airlines Inc.
## 4 N804JB B6 JetBlue Airways
## 5 N668DN DL Delta Air Lines Inc.
## 6 N39463 UA United Air Lines Inc.
## 7 N516JB B6 JetBlue Airways
## 8 N829AS EV ExpressJet Airlines Inc.
## 9 N593JB B6 JetBlue Airways
## 10 N793JB B6 JetBlue Airways
## # … with 284,160 more rows