library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.4 v readr 2.1.6
## v forcats 1.0.1 v stringr 1.6.0
## v ggplot2 4.0.1 v tibble 3.3.1
## v lubridate 1.9.4 v tidyr 1.3.2
## v purrr 1.2.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openintro)
## 载入需要的程序包:airports
## 载入需要的程序包:cherryblossom
## 载入需要的程序包:usdata
library(nycflights13)
library(dplyr)
library(ggplot2)
# Print the flights tibble to preview its columns and first rows.
flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # i 336,766 more rows
## # i 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Histogram of arrival delays for flights in June and July.
# Rows with a missing arr_delay are dropped before plotting.
flights %>%
  filter(month %in% c(6, 7), !is.na(arr_delay)) %>%
  ggplot(aes(x = arr_delay)) +
  # The argument was previously misspelled `binwith`, so ggplot2 ignored it
  # and silently fell back to the default of 30 bins.
  geom_histogram(binwidth = 10, fill = "skyblue")
The majority of flights arrive on time or even ahead of schedule, while a small number experience significant delays.
# Departure delay against arrival delay for flights that left EWR on the
# first day of any month, with a smoothed trend line on top of the points.
flights %>%
  filter(day == 1, origin == "EWR") %>%
  ggplot(mapping = aes(x = dep_delay, y = arr_delay)) +
  geom_point(alpha = 0.5) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 109 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 109 rows containing missing values or values outside the scale range
## (`geom_point()`).
There exists a positive correlation between the two; when departure is delayed, it frequently leads to delayed arrival times, though this is not invariably the case.
# Shortest route in the data: keep the route columns, drop missing
# distances, sort ascending and take the first row.
flights %>%
  select(origin, dest, distance) %>%
  filter(!is.na(distance)) %>%
  arrange(distance) %>%
  slice(1)
## # A tibble: 1 x 3
## origin dest distance
## <chr> <chr> <dbl>
## 1 EWR LGA 17
# Bar chart of the number of flights per distance category.
# Flights with distance below 500 are labelled "short_distance", all others
# "long_distance" (the short label was previously misspelled "short_distace",
# which showed up on the plot axis).
flights %>%
  mutate(
    type_distance = if_else(distance < 500, "short_distance", "long_distance")
  ) %>%
  ggplot(aes(x = type_distance)) +
  geom_bar()
There are more long-distance flights than short-distance flights.
# Bar chart of the five destinations with the highest mean departure delay.
# Missing dep_delay values are ignored when averaging.
flights %>%
  group_by(dest) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_dep_delay)) %>%
  slice_head(n = 5) %>%
  ggplot(mapping = aes(x = dest, y = avg_dep_delay)) +
  geom_col()
CAE
# Destination with the single highest mean departure delay.
flights %>%
  group_by(dest) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_dep_delay)) %>%
  slice_head(n = 1)
## # A tibble: 1 x 2
## dest avg_dep_delay
## <chr> <dbl>
## 1 CAE 35.6
# Mean speed per carrier, sorted from slowest to fastest.
# speed = distance / air_time * 60
# NOTE(review): presumably miles per hour (distance in miles, air_time in
# minutes) -- confirm against the dataset documentation.
flights %>%
  mutate(speed = distance / air_time * 60) %>%
  group_by(carrier) %>%
  summarise(avg_speed = mean(speed, na.rm = TRUE)) %>%
  arrange(avg_speed)
## # A tibble: 16 x 2
## carrier avg_speed
## <chr> <dbl>
## 1 YV 332.
## 2 US 342.
## 3 9E 345.
## 4 EV 363.
## 5 OO 366.
## 6 MQ 368.
## 7 FL 394.
## 8 B6 400.
## 9 WN 401.
## 10 AA 417.
## 11 DL 418.
## 12 UA 421.
## 13 F9 425.
## 14 AS 444.
## 15 VX 446.
## 16 HA 480.
Highest average speed (max): HA; lowest average speed (min): YV.
# Mean departure delay per day of the week, largest first.
# The weekday labels produced by wday(label = TRUE) follow the session
# locale, which is why the rendered output shows localized day names.
flights %>%
  mutate(date = make_date(year, month, day)) %>%
  mutate(weekday = wday(date, label = TRUE)) %>%
  group_by(weekday) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_dep_delay))
## # A tibble: 7 x 2
## weekday avg_dep_delay
## <ord> <dbl>
## 1 周四 16.1
## 2 周一 14.8
## 3 周五 14.7
## 4 周三 11.8
## 5 周日 11.6
## 6 周二 10.6
## 7 周六 7.65
Thursday
# Open the help page for the seattlepets dataset.
?seattlepets
## 打开httpd帮助服务器… 好了
# Distinct species values, in order of first appearance.
seattlepets %>%
  pull(species) %>%
  unique()
## [1] "Dog" "Cat" "Goat" "Pig"
There are 4 species: “Dog”, “Cat”, “Goat”, and “Pig”.
# Most common primary breed for cats and for dogs.
# Rows with a missing primary_breed are excluded.
seattlepets %>%
  filter(species %in% c("Dog", "Cat"), !is.na(primary_breed)) %>%
  group_by(species, primary_breed) %>%
  # Keep the result grouped by species on purpose: this makes the grouping
  # explicit (no summarise() message) and lets slice(1) below return one row
  # per species rather than one row overall.
  summarise(count = n(), .groups = "drop_last") %>%
  arrange(species, desc(count)) %>%
  slice(1)
## # A tibble: 2 x 3
## # Groups: species [2]
## species primary_breed count
## <chr> <chr> <int>
## 1 Cat Domestic Shorthair 10086
## 2 Dog Retriever, Labrador 4867
Cat Domestic Shorthair
Dog Retriever, Labrador
# Three most frequent pet names across all species (missing names excluded).
seattlepets %>%
  filter(!is.na(animal_name)) %>%
  count(animal_name, sort = TRUE) %>%
  slice_head(n = 3)
## # A tibble: 3 x 2
## animal_name n
## <chr> <int>
## 1 Lucy 439
## 2 Charlie 387
## 3 Luna 355
Lucy, Charlie, Luna
# Ten most frequent cat names (missing names excluded).
seattlepets %>%
  filter(species == "Cat", !is.na(animal_name)) %>%
  count(animal_name, sort = TRUE) %>%
  slice_head(n = 10)
## # A tibble: 10 x 2
## animal_name n
## <chr> <int>
## 1 Luna 111
## 2 Lucy 102
## 3 Lily 86
## 4 Max 83
## 5 Bella 82
## 6 Charlie 81
## 7 Oliver 73
## 8 Jack 65
## 9 Sophie 59
## 10 Leo 54
# Ten most frequent dog names (missing names excluded).
seattlepets %>%
  filter(species == "Dog", !is.na(animal_name)) %>%
  count(animal_name, sort = TRUE) %>%
  slice_head(n = 10)
## # A tibble: 10 x 2
## animal_name n
## <chr> <int>
## 1 Lucy 337
## 2 Charlie 306
## 3 Bella 249
## 4 Luna 244
## 5 Daisy 221
## 6 Cooper 189
## 7 Lola 187
## 8 Max 186
## 9 Molly 186
## 10 Stella 185
# Name/species combinations that appear more than 100 times (missing names
# excluded). The result stays grouped by animal_name and species.
# NOTE(review): the count is per name-and-species pair, not per name across
# all species; confirm this is the intended meaning of "more than 100",
# since the per-pair table has fewer rows than the written answer below.
name_100 <- seattlepets %>%
  filter(!is.na(animal_name)) %>%
  group_by(animal_name, species) %>%
  count(animal_name) %>%
  filter(n > 100)
name_100
## # A tibble: 38 x 3
## # Groups: animal_name, species [38]
## animal_name species n
## <chr> <chr> <int>
## 1 Bailey Dog 139
## 2 Bella Dog 249
## 3 Buddy Dog 175
## 4 Charlie Dog 306
## 5 Chloe Dog 134
## 6 Coco Dog 122
## 7 Cooper Dog 189
## 8 Daisy Dog 221
## 9 Ginger Dog 109
## 10 Gus Dog 106
## # i 28 more rows
56
# For every name listed in name_100, count pets per species, spread the
# species into columns (0 where a name/species pair is absent) and compute
# the ratio of cats to dogs carrying that name.
name_counts <- seattlepets %>%
  filter(animal_name %in% name_100$animal_name) %>%
  count(animal_name, species, name = "count") %>%
  pivot_wider(names_from = species, values_from = count, values_fill = 0) %>%
  mutate(cat_to_dog_ratio = Cat / Dog)
name_counts
## # A tibble: 36 x 5
## animal_name Cat Dog Goat cat_to_dog_ratio
## <chr> <int> <int> <int> <dbl>
## 1 Bailey 18 139 0 0.129
## 2 Bella 82 249 0 0.329
## 3 Buddy 43 175 0 0.246
## 4 Charlie 81 306 0 0.265
## 5 Chloe 39 134 0 0.291
## 6 Coco 25 122 0 0.205
## 7 Cooper 16 189 0 0.0847
## 8 Daisy 40 221 0 0.181
## 9 Ginger 33 109 0 0.303
## 10 Gus 25 106 0 0.236
## # i 26 more rows
# Name with the highest cat-to-dog ratio.
name_counts %>%
  arrange(desc(cat_to_dog_ratio)) %>%
  slice_head(n = 1)
## # A tibble: 1 x 5
## animal_name Cat Dog Goat cat_to_dog_ratio
## <chr> <int> <int> <int> <dbl>
## 1 Lily 86 146 0 0.589
# Name with the lowest cat-to-dog ratio.
name_counts %>%
  arrange(cat_to_dog_ratio) %>%
  slice_head(n = 1)
## # A tibble: 1 x 5
## animal_name Cat Dog Goat cat_to_dog_ratio
## <chr> <int> <int> <int> <dbl>
## 1 Riley 9 117 0 0.0769