#Poner librerias
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(nycflights13)
# Load the flights data frame into memory and display its contents.
data("flights", package = "nycflights13")
view(flights)
# Descriptive statistics for every column of flights; the mean of the
# distances flown (in miles) is read from the `distance` column of the summary.
flights %>% summary()
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906
## Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## NA's :8255
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000
## Median : -2.00 Median :1535 Median :1556 Median : -5.000
## Mean : 12.64 Mean :1502 Mean :1536 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## NA's :8255 NA's :8713 NA's :9430
## carrier flight tailnum origin
## Length:336776 Min. : 1 Length:336776 Length:336776
## Class :character 1st Qu.: 553 Class :character Class :character
## Mode :character Median :1496 Mode :character Mode :character
## Mean :1972
## 3rd Qu.:3465
## Max. :8500
##
## dest air_time distance hour
## Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00
## Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 872 Median :13.00
## Mean :150.7 Mean :1040 Mean :13.18
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## NA's :9430
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00.00
## 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00.00
## Median :29.00 Median :2013-07-03 10:00:00.00
## Mean :26.23 Mean :2013-07-03 05:22:54.64
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00.00
## Max. :59.00 Max. :2013-12-31 23:00:00.00
##
# 1. Criterion for "flew the most distance": keep only the flights whose
#    distance (in miles) is above the overall mean distance, showing the
#    distance, carrier, origin and dest columns sorted by descending distance.
#
# The threshold is computed from the data rather than hard-coded as 1040:
# that literal was copied from the summary(flights) printout, which shows a
# rounded mean, so it is neither exact nor robust to a change in the data.
mas_distancia <- flights %>%
  select(distance, carrier, origin, dest) %>%
  filter(distance > mean(distance, na.rm = TRUE)) %>%
  arrange(desc(distance))
mas_distancia
## # A tibble: 127,665 × 4
## distance carrier origin dest
## <dbl> <chr> <chr> <chr>
## 1 4983 HA JFK HNL
## 2 4983 HA JFK HNL
## 3 4983 HA JFK HNL
## 4 4983 HA JFK HNL
## 5 4983 HA JFK HNL
## 6 4983 HA JFK HNL
## 7 4983 HA JFK HNL
## 8 4983 HA JFK HNL
## 9 4983 HA JFK HNL
## 10 4983 HA JFK HNL
## # … with 127,655 more rows
# Total and mean distance flown per carrier, ignoring NA values.
# suma_distance is the sum of the distances of all of a carrier's flights;
# media_distance is the carrier's mean distance per flight.
# Rows are sorted in descending order by total distance, ties broken by the
# mean distance (also descending).
info_distance <- flights %>%
  group_by(carrier) %>%
  summarise(
    suma_distance = sum(distance, na.rm = TRUE),
    media_distance = mean(distance, na.rm = TRUE)
  ) %>%
  arrange(
    desc(suma_distance),
    desc(media_distance)
  )
info_distance
## # A tibble: 16 × 3
## carrier suma_distance media_distance
## <chr> <dbl> <dbl>
## 1 UA 89705524 1529.
## 2 DL 59507317 1237.
## 3 B6 58384137 1069.
## 4 AA 43864584 1340.
## 5 EV 30498951 563.
## 6 MQ 15033955 570.
## 7 VX 12902327 2499.
## 8 WN 12229203 996.
## 9 US 11365778 553.
## 10 9E 9788152 530.
## 11 FL 2167344 665.
## 12 AS 1715028 2402
## 13 HA 1704186 4983
## 14 F9 1109700 1620
## 15 YV 225395 375.
## 16 OO 16026 501.
# Check whether the leading carriers are the same at the three New York
# origin airports: John F. Kennedy (JFK), LaGuardia (LGA) and
# Newark Liberty (EWR).
# Flights departing from JFK, counted per carrier, highest count first.
df_jfk <- flights %>%
  filter(origin == "JFK") %>%
  group_by(origin, carrier) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
## `summarise()` has grouped output by 'origin'. You can override using the
## `.groups` argument.
df_jfk
## # A tibble: 10 × 3
## # Groups: origin [1]
## origin carrier count
## <chr> <chr> <int>
## 1 JFK B6 42076
## 2 JFK DL 20701
## 3 JFK 9E 14651
## 4 JFK AA 13783
## 5 JFK MQ 7193
## 6 JFK UA 4534
## 7 JFK VX 3596
## 8 JFK US 2995
## 9 JFK EV 1408
## 10 JFK HA 342
#La aerolinea no 1 en JFK es B6
# Flights departing from LGA, counted per carrier, highest count first.
df_lga <- flights %>%
  filter(origin == "LGA") %>%
  group_by(origin, carrier) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
## `summarise()` has grouped output by 'origin'. You can override using the
## `.groups` argument.
df_lga
## # A tibble: 13 × 3
## # Groups: origin [1]
## origin carrier count
## <chr> <chr> <int>
## 1 LGA DL 23067
## 2 LGA MQ 16928
## 3 LGA AA 15459
## 4 LGA US 13136
## 5 LGA EV 8826
## 6 LGA UA 8044
## 7 LGA WN 6087
## 8 LGA B6 6002
## 9 LGA FL 3260
## 10 LGA 9E 2541
## 11 LGA F9 685
## 12 LGA YV 601
## 13 LGA OO 26
#La aerolinea no 1 en LGA es DL
# Flights departing from EWR, counted per carrier, highest count first.
df_ewr <- flights %>%
  filter(origin == "EWR") %>%
  group_by(origin, carrier) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
## `summarise()` has grouped output by 'origin'. You can override using the
## `.groups` argument.
df_ewr
## # A tibble: 12 × 3
## # Groups: origin [1]
## origin carrier count
## <chr> <chr> <int>
## 1 EWR UA 46087
## 2 EWR EV 43939
## 3 EWR B6 6557
## 4 EWR WN 6188
## 5 EWR US 4405
## 6 EWR DL 4342
## 7 EWR AA 3487
## 8 EWR MQ 2276
## 9 EWR VX 1566
## 10 EWR 9E 1268
## 11 EWR AS 714
## 12 EWR OO 6
#La aerolinea no 1 en EWR es UA