library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
Install the "nycflights13" package. Load
(library()) the package. You’ll also need to load
tidyverse The data frame flights should now be
accessible to you.
# install.packages("nycflights13") # once per computer
library(nycflights13) # every time you use it
library(tidyverse)
??flights to search
for documentation on the data set (for what the columns represent)flights %>% glimpse()
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
dplyr to give the data frame a new column that is
the amount of time gained or lost while flying (that is: how much of the
delay arriving occured during flight, as opposed to before
departing).my_flights <- flights %>% mutate(made_up_time = dep_delay - arr_delay)
my_flights
## # A tibble: 336,776 × 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, made_up_time <dbl>
dplyr to sort your data frame in descending order
by the column you just created. Remember to save this as a variable (or
in the same one!)my_flights$made_up_time <- as.numeric(my_flights$made_up_time)
my_flights %>% arrange(desc(made_up_time))
## # A tibble: 336,776 × 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 6 13 1907 1512 235 2134 1928
## 2 2013 2 26 1000 900 60 1513 1540
## 3 2013 2 23 1226 900 206 1746 1540
## 4 2013 5 13 1917 1900 17 2149 2251
## 5 2013 2 27 924 900 24 1448 1540
## 6 2013 7 14 1917 1829 48 2109 2135
## 7 2013 7 17 2004 1930 34 2224 2304
## 8 2013 12 27 1719 1648 31 1956 2038
## 9 2013 5 2 1947 1949 -2 2209 2324
## 10 2013 11 13 2024 2015 9 2251 2354
## # ℹ 336,766 more rows
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, made_up_time <dbl>
If you haven’t already, do the last 2 steps in a single statement using the pipe operator.
Make a histogram of the amount of time gained using the
hist() function
hist(my_flights$made_up_time)
na.rm = TRUE argument to remove NA values from your
aggregationany(is.na(my_flights$made_up_time))
## [1] TRUE
mean(my_flights$made_up_time, na.rm = T)
## [1] 5.659779
seatle <- my_flights %>% filter(dest == "SEA") %>% select(origin, dest, made_up_time)
seatle
## # A tibble: 3,923 × 3
## origin dest made_up_time
## <chr> <chr> <dbl>
## 1 EWR SEA 9
## 2 JFK SEA 10
## 3 EWR SEA 31
## 4 EWR SEA 5
## 5 JFK SEA 0
## 6 EWR SEA 20
## 7 EWR SEA 12
## 8 JFK SEA -4
## 9 JFK SEA 9
## 10 EWR SEA 16
## # ℹ 3,913 more rows
mean(seatle$made_up_time,na.rm = T)
## [1] 11.6991
my_flights %>% group_by(dest) %>% summarise(mean_time = mean(made_up_time, na.rm = T)) %>% arrange(desc(mean_time))
## # A tibble: 105 × 2
## dest mean_time
## <chr> <dbl>
## 1 MTJ 15.9
## 2 ANC 15.4
## 3 SNA 14.6
## 4 SBN 14.6
## 5 ILM 14.5
## 6 SAT 13.4
## 7 LEX 13
## 8 BHM 12.1
## 9 CHO 11.9
## 10 SEA 11.7
## # ℹ 95 more rows
Go back to our flights exercise data. Consider flights from JFK to SEA. What was the average, min, and max air time of those flights? Use pipes to answer this question in one statement (without showing any other data)!
my_flights %>% filter(origin=="JFK" & dest =="SEA") %>% summarise(min_value = min(made_up_time, na.rm = T), max_value = max(made_up_time, na.rm = T), avarage_value = mean(made_up_time, na.rm = T))
## # A tibble: 1 × 3
## min_value max_value avarage_value
## <dbl> <dbl> <dbl>
## 1 -125 71 10.4
Consider flights coming into Houston (HOU and IAH). What is the mean arrival delay by departure airport?
dest_list <- c("HOU","IAH")
my_flights %>% filter(dest %in% dest_list ) %>% group_by(origin) %>% summarise(avg_delay = mean(arr_delay, na.rm = T))
## # A tibble: 3 × 2
## origin avg_delay
## <chr> <dbl>
## 1 EWR 6.23
## 2 JFK 10.3
## 3 LGA 1.37
In the package dplyr there is a dataset called
starwars.
glimpse(starwars)
## Rows: 87
## Columns: 14
## $ name <chr> "Luke Skywalker", "C-3PO", "R2-D2", "Darth Vader", "Leia Or…
## $ height <int> 172, 167, 96, 202, 150, 178, 165, 97, 183, 182, 188, 180, 2…
## $ mass <dbl> 77.0, 75.0, 32.0, 136.0, 49.0, 120.0, 75.0, 32.0, 84.0, 77.…
## $ hair_color <chr> "blond", NA, NA, "none", "brown", "brown, grey", "brown", N…
## $ skin_color <chr> "fair", "gold", "white, blue", "white", "light", "light", "…
## $ eye_color <chr> "blue", "yellow", "red", "yellow", "brown", "blue", "blue",…
## $ birth_year <dbl> 19.0, 112.0, 33.0, 41.9, 19.0, 52.0, 47.0, NA, 24.0, 57.0, …
## $ sex <chr> "male", "none", "none", "male", "female", "male", "female",…
## $ gender <chr> "masculine", "masculine", "masculine", "masculine", "femini…
## $ homeworld <chr> "Tatooine", "Tatooine", "Naboo", "Tatooine", "Alderaan", "T…
## $ species <chr> "Human", "Droid", "Droid", "Human", "Human", "Human", "Huma…
## $ films <list> <"A New Hope", "The Empire Strikes Back", "Return of the J…
## $ vehicles <list> <"Snowspeeder", "Imperial Speeder Bike">, <>, <>, <>, "Imp…
## $ starships <list> <"X-wing", "Imperial shuttle">, <>, <>, "TIE Advanced x1",…
We’re going to use this to practice some data wrangling with
dplyr.
First, we filter by species to only see the humans, and use that subset to plot a scatter plot of their height and mass.
humans <- starwars %>% filter(species == "Human")
plot(humans$height, humans$mass,main = "Human Scatter plot", xlab = "Height", ylab = "Mass", col = "blue")
starwars %>% filter(species == "Human") %>% plot(mass ~ height, data = .)
Now, change this to see the same plot for the Droids.
droids <- starwars %>% filter(species == "Droid")
plot(droids$height, droids$mass,main = "Droids Scatter plot", xlab = "Height", ylab = "Mass", col = "red")
starwars %>% filter(species == "Droid") %>% plot(mass ~ height, data = .)
How many characters are from each planet?
#any(is.na(starwars$name))
starwars %>% group_by(homeworld) %>% summarise(char_count = length(name))
## # A tibble: 49 × 2
## homeworld char_count
## <chr> <int>
## 1 Alderaan 3
## 2 Aleen Minor 1
## 3 Bespin 1
## 4 Bestine IV 1
## 5 Cato Neimoidia 1
## 6 Cerea 1
## 7 Champala 1
## 8 Chandrila 1
## 9 Concord Dawn 1
## 10 Corellia 2
## # ℹ 39 more rows
If you wanted to arrange this in descending order, what would you add to the pipe?
starwars %>% filter(!is.na(homeworld)) %>%group_by(homeworld) %>% summarise(char_count = length(name)) %>% arrange(desc(char_count))
## # A tibble: 48 × 2
## homeworld char_count
## <chr> <int>
## 1 Naboo 11
## 2 Tatooine 10
## 3 Alderaan 3
## 4 Coruscant 3
## 5 Kamino 3
## 6 Corellia 2
## 7 Kashyyyk 2
## 8 Mirial 2
## 9 Ryloth 2
## 10 Aleen Minor 1
## # ℹ 38 more rows
Find the average height for each eye color. You might find some of the answers to be strange, but keep going!
avg_height_group <- starwars %>% filter(!is.na(eye_color) & !is.na(height)) %>% group_by(eye_color) %>% summarise(avg_height = mean(height))
avg_height_group
## # A tibble: 14 × 2
## eye_color avg_height
## <chr> <dbl>
## 1 black 185
## 2 blue 182.
## 3 blue-gray 182
## 4 brown 167.
## 5 gold 191
## 6 green, yellow 216
## 7 hazel 174
## 8 orange 180.
## 9 pink 180
## 10 red 155.
## 11 red, blue 96
## 12 unknown 136
## 13 white 178
## 14 yellow 178.
So…. What’s with the NAs? Filter down to just brown eyes find out what’s going on.
avg_height_group %>% filter(eye_color == "brown")
## # A tibble: 1 × 2
## eye_color avg_height
## <chr> <dbl>
## 1 brown 167.
Okay, so we have missing data here. Try summarising with
na.omit(height) instead of just height or
adding na.rm = T to the mean function inside summarize.
#starwars %>% filter(eye_color=="dark") %>% glimpse()
#starwars %>% group_by(eye_color) %>% summarise(avg_height = mean(na.omit(height)))
# all value of dark eye color's height is NA, so we have to use filter
starwars %>% group_by(eye_color) %>% filter(!is.na(height)) %>% summarise(avg_height = mean(na.omit(height)))
## # A tibble: 14 × 2
## eye_color avg_height
## <chr> <dbl>
## 1 black 185
## 2 blue 182.
## 3 blue-gray 182
## 4 brown 167.
## 5 gold 191
## 6 green, yellow 216
## 7 hazel 174
## 8 orange 180.
## 9 pink 180
## 10 red 155.
## 11 red, blue 96
## 12 unknown 136
## 13 white 178
## 14 yellow 178.
So, who is the tallest male? How many blue eyed women are there? How many genders are there? Answer these questions and spend some time asking and answering some of your own.
starwars %>% filter(sex =="male", height == max(height, na.rm = T)) %>% summarise(name, height)
## # A tibble: 1 × 2
## name height
## <chr> <int>
## 1 Yarael Poof 264
num_blue_eyed_women <- starwars %>% filter(sex=="female" & eye_color =="blue") %>% nrow()
num_blue_eyed_women
## [1] 6
starwars %>% filter(!is.na(sex)) %>% group_by(sex) %>% summarise(num_gender = length(na.omit(sex)))
## # A tibble: 4 × 2
## sex num_gender
## <chr> <int>
## 1 female 16
## 2 hermaphroditic 1
## 3 male 60
## 4 none 6