library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.1.4     v readr     2.1.6
## v forcats   1.0.1     v stringr   1.6.0
## v ggplot2   4.0.1     v tibble    3.3.1
## v lubridate 1.9.4     v tidyr     1.3.2
## v purrr     1.2.1     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openintro)
## 载入需要的程序包:airports
## 载入需要的程序包:cherryblossom
## 载入需要的程序包:usdata
library(nycflights13)
library(dplyr)
library(ggplot2)
# Preview the flights tibble (first rows and column types).
print(flights)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # i 336,766 more rows
## # i 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Distribution of arrival delays for flights in June and July.
# Rows with a missing arr_delay are dropped before plotting.
flights %>%
  filter(month %in% c(6, 7), !is.na(arr_delay)) %>%
  ggplot(aes(x = arr_delay)) +
  # The argument is `binwidth`; the earlier misspelling `binwith` was ignored
  # by ggplot2, which silently fell back to the default of 30 bins.
  geom_histogram(binwidth = 10, fill = "skyblue")

The majority of flights arrive on time or even ahead of schedule, while a small number experience significant delays.

# Departure delay vs. arrival delay for flights leaving EWR on the first day
# of a month, drawn as semi-transparent points with a smoothed trend line.
flights %>%
  filter(origin == "EWR", day == 1) %>%
  ggplot(mapping = aes(x = dep_delay, y = arr_delay)) +
  geom_point(alpha = 0.5) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 109 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 109 rows containing missing values or values outside the scale range
## (`geom_point()`).

Departure delay and arrival delay are positively correlated: a delayed departure frequently leads to a delayed arrival, though this is not invariably the case.

# Shortest route in the data: origin, destination and distance of the first
# row that has the minimum (non-missing) distance.
flights %>%
  select(origin, dest, distance) %>%
  filter(!is.na(distance)) %>%
  slice_min(distance, n = 1, with_ties = FALSE)
## # A tibble: 1 x 3
##   origin dest  distance
##   <chr>  <chr>    <dbl>
## 1 EWR    LGA         17
# Bar chart of the number of flights per distance category.
# Flights with distance < 500 are labelled short, all others long.
flights %>%
  mutate(
    # The short label was misspelled "short_distace", which showed up on the
    # plot axis; it now matches the spelling of "long_distance".
    type_distance = if_else(distance < 500, "short_distance", "long_distance")
  ) %>%
  ggplot(aes(x = type_distance)) +
  geom_bar()

There are more long-distance flights than short-distance flights.

# Column chart of the five destinations with the highest mean departure
# delay (missing delays are ignored when averaging).
flights %>%
  group_by(dest) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_dep_delay)) %>%
  slice_head(n = 5) %>%
  ggplot(aes(x = dest, y = avg_dep_delay)) +
  geom_col()

CAE is the destination with the highest average departure delay (about 35.6).

# Destination with the single highest mean departure delay
# (missing delays are ignored when averaging).
flights %>%
  group_by(dest) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_dep_delay)) %>%
  slice_head(n = 1)
## # A tibble: 1 x 2
##   dest  avg_dep_delay
##   <chr>         <dbl>
## 1 CAE            35.6
# Mean speed per carrier, sorted from slowest to fastest.
# speed = distance / air_time * 60; rows where speed is NA are ignored.
# NOTE(review): presumably miles per hour (distance in miles, air_time in
# minutes) -- confirm against the dataset documentation.
flights %>%
  mutate(speed = distance / air_time * 60) %>%
  group_by(carrier) %>%
  summarise(avg_speed = mean(speed, na.rm = TRUE)) %>%
  arrange(avg_speed)
## # A tibble: 16 x 2
##    carrier avg_speed
##    <chr>       <dbl>
##  1 YV           332.
##  2 US           342.
##  3 9E           345.
##  4 EV           363.
##  5 OO           366.
##  6 MQ           368.
##  7 FL           394.
##  8 B6           400.
##  9 WN           401.
## 10 AA           417.
## 11 DL           418.
## 12 UA           421.
## 13 F9           425.
## 14 AS           444.
## 15 VX           446.
## 16 HA           480.

HA has the highest average speed and YV has the lowest.

# Mean departure delay per day of the week, largest first.
# The weekday label is derived from the year/month/day columns; the label
# text depends on the session locale.
flights %>%
  mutate(weekday = wday(make_date(year, month, day), label = TRUE)) %>%
  group_by(weekday) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_dep_delay))
## # A tibble: 7 x 2
##   weekday avg_dep_delay
##   <ord>           <dbl>
## 1 周四            16.1 
## 2 周一            14.8 
## 3 周五            14.7 
## 4 周三            11.8 
## 5 周日            11.6 
## 6 周二            10.6 
## 7 周六             7.65

Thursday has the highest average departure delay (about 16.1).

# Open the help page for the seattlepets dataset.
help("seattlepets")
## 打开httpd帮助服务器… 好了
# Distinct values of the species column, in order of first appearance.
seattlepets %>%
  pull(species) %>%
  unique()
## [1] "Dog"  "Cat"  "Goat" "Pig"

There are 4 species in the dataset: Dog, Cat, Goat, and Pig.

  # Most common primary breed for cats and for dogs.
  # Rows with a missing primary_breed are excluded from the counts.
  seattlepets %>%
    filter(species %in% c("Dog", "Cat"), !is.na(primary_breed)) %>%
    group_by(species, primary_breed) %>%
    # Keep the result grouped by species on purpose: slice_head() below must
    # pick one row per species. Stating `.groups` explicitly also silences the
    # "summarise() has grouped output" message.
    summarise(count = n(), .groups = "drop_last") %>%
    arrange(species, desc(count)) %>%
    slice_head(n = 1)
## # A tibble: 2 x 3
## # Groups:   species [2]
##   species primary_breed       count
##   <chr>   <chr>               <int>
## 1 Cat     Domestic Shorthair  10086
## 2 Dog     Retriever, Labrador  4867

The most common cat breed is Domestic Shorthair.
The most common dog breed is Retriever, Labrador.

  # Three most frequent pet names, ignoring pets without a recorded name.
  seattlepets %>%
    filter(!is.na(animal_name)) %>%
    count(animal_name, sort = TRUE) %>%
    slice_head(n = 3)
## # A tibble: 3 x 2
##   animal_name     n
##   <chr>       <int>
## 1 Lucy          439
## 2 Charlie       387
## 3 Luna          355

The three most common pet names are Lucy, Charlie, and Luna.

  # Ten most frequent cat names, ignoring cats without a recorded name.
  seattlepets %>%
    filter(!is.na(animal_name), species == "Cat") %>%
    count(animal_name, sort = TRUE) %>%
    slice_head(n = 10)
## # A tibble: 10 x 2
##    animal_name     n
##    <chr>       <int>
##  1 Luna          111
##  2 Lucy          102
##  3 Lily           86
##  4 Max            83
##  5 Bella          82
##  6 Charlie        81
##  7 Oliver         73
##  8 Jack           65
##  9 Sophie         59
## 10 Leo            54
 # Ten most frequent dog names, ignoring dogs without a recorded name.
 seattlepets %>%
   filter(!is.na(animal_name), species == "Dog") %>%
   count(animal_name, sort = TRUE) %>%
   slice_head(n = 10)
## # A tibble: 10 x 2
##    animal_name     n
##    <chr>       <int>
##  1 Lucy          337
##  2 Charlie       306
##  3 Bella         249
##  4 Luna          244
##  5 Daisy         221
##  6 Cooper        189
##  7 Lola          187
##  8 Max           186
##  9 Molly         186
## 10 Stella        185
# Name/species combinations that occur more than 100 times.
# Note: the count is per (animal_name, species) pair, not per name across all
# species, and the result stays grouped by both columns.
name_100 <- seattlepets %>%
  filter(!is.na(animal_name)) %>%
  group_by(animal_name, species) %>%
  summarise(n = n(), .groups = "keep") %>%
  filter(n > 100)
name_100
## # A tibble: 38 x 3
## # Groups:   animal_name, species [38]
##    animal_name species     n
##    <chr>       <chr>   <int>
##  1 Bailey      Dog       139
##  2 Bella       Dog       249
##  3 Buddy       Dog       175
##  4 Charlie     Dog       306
##  5 Chloe       Dog       134
##  6 Coco        Dog       122
##  7 Cooper      Dog       189
##  8 Daisy       Dog       221
##  9 Ginger      Dog       109
## 10 Gus         Dog       106
## # i 28 more rows

56

# For every name listed in name_100, count how many pets of each species
# carry it, spread the species into columns (0 where a species never uses the
# name), and compute the ratio of cats to dogs.
name_counts <- seattlepets %>%
  filter(animal_name %in% name_100[["animal_name"]]) %>%
  count(animal_name, species, name = "count") %>%
  pivot_wider(names_from = species, values_from = count, values_fill = 0) %>%
  mutate(cat_to_dog_ratio = Cat / Dog)

name_counts
## # A tibble: 36 x 5
##    animal_name   Cat   Dog  Goat cat_to_dog_ratio
##    <chr>       <int> <int> <int>            <dbl>
##  1 Bailey         18   139     0           0.129 
##  2 Bella          82   249     0           0.329 
##  3 Buddy          43   175     0           0.246 
##  4 Charlie        81   306     0           0.265 
##  5 Chloe          39   134     0           0.291 
##  6 Coco           25   122     0           0.205 
##  7 Cooper         16   189     0           0.0847
##  8 Daisy          40   221     0           0.181 
##  9 Ginger         33   109     0           0.303 
## 10 Gus            25   106     0           0.236 
## # i 26 more rows
# Name with the highest cat-to-dog ratio (first row if several tie).
name_counts %>%
  slice_max(cat_to_dog_ratio, n = 1, with_ties = FALSE)
## # A tibble: 1 x 5
##   animal_name   Cat   Dog  Goat cat_to_dog_ratio
##   <chr>       <int> <int> <int>            <dbl>
## 1 Lily           86   146     0            0.589
# Name with the lowest cat-to-dog ratio (first row if several tie).
name_counts %>%
  slice_min(cat_to_dog_ratio, n = 1, with_ties = FALSE)
## # A tibble: 1 x 5
##   animal_name   Cat   Dog  Goat cat_to_dog_ratio
##   <chr>       <int> <int> <int>            <dbl>
## 1 Riley           9   117     0           0.0769