flight_c <- flights %>%
filter(!is.na(arr_delay), month == 6 | month == 7) %>%
dplyr :: select(month, arr_delay)
ggplot(flight_c) +
geom_histogram(aes(x = arr_delay), bins = 20, fill = "lightblue") +
labs(title = "Arrival Delay in June and July",
x = "Arrival Delay (minutes)",
y = "Count")+
theme(plot.title = element_text(hjust = 0.5,size = 20))
Based on the graph, we can tell most of the flight was on time, only some of them is late and most of the late one was not more than 400 minutes delay though we can see there is some rare case where flight can be delayed for 1200 mins. Some of the flight was actually departed earlier than expected.
flights_from_E <- flights %>%
filter(origin == "EWR", day == 1)%>%
dplyr :: select(arr_delay, dep_delay, month, day)
ggplot (flights_from_E,aes(x=arr_delay, y=dep_delay) )+
geom_smooth()+
labs(title = "Arrival Delay vs Departure delays ",
x = "Arrival Delay (minutes)",
y = "Departure Delay (minutes)")+
theme(plot.title = element_text(hjust = 0.5,size = 20))
Just by a glance we would think this graph would be some thing like y=x but if we look carefully we can tell that the arival time is gonna be delayed more than the departure time. For exaple, if departure time is delayed for 100m then the arrival time woulf be delayed for 110m or so. And the more the departure time delay, the greatter the change between the arrivaltime delay and the departure time delay.
fights_actual_depart <- flights%>%
filter ( !is.na(air_time),!is.na(distance) ) %>%
arrange(distance) %>%
dplyr ::select(distance, air_time, origin, dest)
fights_actual_depart
## # A tibble: 327,346 × 4
## distance air_time origin dest
## <dbl> <dbl> <chr> <chr>
## 1 80 30 EWR PHL
## 2 80 30 EWR PHL
## 3 80 28 EWR PHL
## 4 80 32 EWR PHL
## 5 80 29 EWR PHL
## 6 80 22 EWR PHL
## 7 80 25 EWR PHL
## 8 80 30 EWR PHL
## 9 80 27 EWR PHL
## 10 80 30 EWR PHL
## # ℹ 327,336 more rows
The flights that actually departed with the shortest travel distance departed from EWR and arrive at PHL
add_columes <- mutate(flights, distance_description = if_else( distance< 500 , "short_distance" ,"long_distance" ))
add_columes
## # A tibble: 336,776 × 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, distance_description <chr>
ggplot (add_columes,aes(x=distance_description) )+
geom_bar()+
labs(title = "Compare between short and long distance flights ",
x = "Type of Distance ",
y = "Count")+
theme(plot.title = element_text(hjust = 0.5,size = 20))+
scale_y_continuous(labels = scales::comma)
Based on the bar plot, one fourth of the flights is short distance, and most of it were long distance
f_dep_del <- flights %>%
group_by(dest)%>%
summarize(Mean_delay = mean(dep_delay))%>%
filter(!is.na(Mean_delay))
f_dep_del
## # A tibble: 6 × 2
## dest Mean_delay
## <chr> <dbl>
## 1 ABQ 13.7
## 2 ACK 6.46
## 3 ANC 12.9
## 4 EYW 3.65
## 5 LEX -9
## 6 SBN 21.1
ggplot(f_dep_del) +
geom_boxplot(aes(x = dest, y = Mean_delay)) +
labs(title = "Destination airport and its average departure delay",
x = "Destination airport",
y = "Average Departure Delay")+
theme(plot.title = element_text(hjust = 0.5,size = 20))
Based onthe plot, the destination airport that has the longest average departure delay is SBN
f_dep_del <- flights %>%
group_by(dest)%>%
summarize(Mean_delay = mean(dep_delay))%>%
filter(!is.na(Mean_delay))
f_dep_del %>%
arrange (desc(Mean_delay))
## # A tibble: 6 × 2
## dest Mean_delay
## <chr> <dbl>
## 1 SBN 21.1
## 2 ABQ 13.7
## 3 ANC 12.9
## 4 ACK 6.46
## 5 EYW 3.65
## 6 LEX -9
It is SBN
f_speed <- flights %>%
filter(!is.na(distance) & !is.na(air_time)) %>%
group_by(carrier)%>%
summarize(Mean_flight_speed = mean(distance/air_time*60))
f_speed %>%
arrange (desc(Mean_flight_speed))
## # A tibble: 16 × 2
## carrier Mean_flight_speed
## <chr> <dbl>
## 1 HA 480.
## 2 VX 446.
## 3 AS 444.
## 4 F9 425.
## 5 UA 421.
## 6 DL 418.
## 7 AA 417.
## 8 WN 401.
## 9 B6 400.
## 10 FL 394.
## 11 MQ 368.
## 12 OO 366.
## 13 EV 363.
## 14 9E 345.
## 15 US 342.
## 16 YV 332.
f_speed %>%
arrange (Mean_flight_speed)
## # A tibble: 16 × 2
## carrier Mean_flight_speed
## <chr> <dbl>
## 1 YV 332.
## 2 US 342.
## 3 9E 345.
## 4 EV 363.
## 5 OO 366.
## 6 MQ 368.
## 7 FL 394.
## 8 B6 400.
## 9 WN 401.
## 10 AA 417.
## 11 DL 418.
## 12 UA 421.
## 13 F9 425.
## 14 AS 444.
## 15 VX 446.
## 16 HA 480.
highest is HA with 480.3577 m/h lowest is YV with 331.9700 m/h
# See unique species
seattlepets %>%
distinct(species)
## # A tibble: 4 × 1
## species
## <chr>
## 1 Dog
## 2 Cat
## 3 Goat
## 4 Pig
#count
seattlepets %>%
distinct(species) %>%
nrow()
## [1] 4
There are 4 species and there are Dog, Cat, Goat, and Pig
seattlepets%>%
filter(species== "Cat"| species=="Dog") %>%
group_by(species,primary_breed)%>%
summarize(Count = n())%>%
arrange(desc(Count))
## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.
## # A tibble: 333 × 3
## # Groups: species [2]
## species primary_breed Count
## <chr> <chr> <int>
## 1 Cat Domestic Shorthair 10086
## 2 Dog Retriever, Labrador 4867
## 3 Cat Domestic Medium Hair 2146
## 4 Dog Retriever, Golden 1900
## 5 Dog Chihuahua, Short Coat 1850
## 6 Cat Domestic Longhair 1365
## 7 Dog German Shepherd 1005
## 8 Dog Terrier 1002
## 9 Dog Australian Shepherd 923
## 10 Cat American Shorthair 860
## # ℹ 323 more rows
The most popular primary breeds for cats is Domestic Shorthair The most popular primary breeds for dogs is Retriever, Labrador
seattlepets%>%
group_by(animal_name)%>%
summarize(Count = n())%>%
arrange(desc(Count))
## # A tibble: 13,930 × 2
## animal_name Count
## <chr> <int>
## 1 <NA> 483
## 2 Lucy 439
## 3 Charlie 387
## 4 Luna 355
## 5 Bella 331
## 6 Max 270
## 7 Daisy 261
## 8 Molly 240
## 9 Jack 232
## 10 Lily 232
## # ℹ 13,920 more rows
The three most common pet names in Seattle are Lucy, Charlie, Luna
ten_name_for_C <- seattlepets%>%
filter(species== "Cat", !is.na(animal_name)) %>%
group_by(animal_name)%>%
summarize(Count = n())%>%
arrange(desc(Count)) %>%
head(10)
print(ten_name_for_C)
## # A tibble: 10 × 2
## animal_name Count
## <chr> <int>
## 1 Luna 111
## 2 Lucy 102
## 3 Lily 86
## 4 Max 83
## 5 Bella 82
## 6 Charlie 81
## 7 Oliver 73
## 8 Jack 65
## 9 Sophie 59
## 10 Leo 54
ten_name_for_D <- seattlepets%>%
filter(species== "Dog", !is.na(animal_name)) %>%
group_by(animal_name)%>%
summarize(Count = n())%>%
arrange(desc(Count)) %>%
head(10)
print(ten_name_for_D)
## # A tibble: 10 × 2
## animal_name Count
## <chr> <int>
## 1 Lucy 337
## 2 Charlie 306
## 3 Bella 249
## 4 Luna 244
## 5 Daisy 221
## 6 Cooper 189
## 7 Lola 187
## 8 Max 186
## 9 Molly 186
## 10 Stella 185
name <- seattlepets%>%
filter(!is.na(animal_name)) %>%
group_by(animal_name)%>%
summarize(Count = n())%>%
arrange(desc(Count))
name_more_than_100 <-name%>%
filter(Count>100)
print(nrow(name_more_than_100))
## [1] 56
There are 56 names appear more than 100 times
cd<-seattlepets %>%
filter( !is.na(animal_name), species == "Cat" |species == "Dog" ) %>%
count(animal_name, species) %>%
pivot_wider(names_from = species, values_from = n, values_fill = 0) %>%
mutate(
total = Cat + Dog,
cat_to_dog_ratio = Cat / Dog,
) %>%
filter(total > 100)
cd %>%
arrange(desc(cat_to_dog_ratio))
## # A tibble: 56 × 5
## animal_name Dog Cat total cat_to_dog_ratio
## <chr> <int> <int> <int> <dbl>
## 1 Shadow 79 53 132 0.671
## 2 Lily 146 86 232 0.589
## 3 Leo 96 54 150 0.562
## 4 Loki 75 40 115 0.533
## 5 Oliver 137 73 210 0.533
## 6 Gracie 94 48 142 0.511
## 7 Sam 67 34 101 0.507
## 8 Mia 71 34 105 0.479
## 9 Oscar 100 47 147 0.47
## 10 Luna 244 111 355 0.455
## # ℹ 46 more rows
cd %>%
arrange(cat_to_dog_ratio)
## # A tibble: 56 × 5
## animal_name Dog Cat total cat_to_dog_ratio
## <chr> <int> <int> <int> <dbl>
## 1 Riley 117 9 126 0.0769
## 2 Cooper 189 16 205 0.0847
## 3 Bailey 139 18 157 0.129
## 4 Kona 90 12 102 0.133
## 5 Jake 101 14 115 0.139
## 6 Teddy 89 14 103 0.157
## 7 Sadie 153 25 178 0.163
## 8 Rosie 144 25 169 0.174
## 9 Murphy 95 17 112 0.179
## 10 Daisy 221 40 261 0.181
## # ℹ 46 more rows
Shadow is the name that has the highest “cat_to_dog” ratio while Riley has the lowest “cat_to_dog” ratio