library(nycflights13)
# Load the `flights` data set into the workspace, open its help page and
# print a compact overview of its structure.
data("flights")
help("flights")
str(flights)
## Classes 'tbl_df', 'tbl' and 'data.frame': 336776 obs. of 19 variables:
## $ year : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr "UA" "UA" "AA" "B6" ...
## $ flight : int 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num 1400 1416 1089 1576 762 ...
## $ hour : num 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct, format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Aggregate the missing-value pattern of `flights` with aggr(). With
# `sortVars = TRUE` the call reports the variables sorted by number of
# missings; the remaining arguments (colours, labels, axis text size, gap)
# control how the result is displayed.
flights_mis <- aggr(
  flights,
  col = c("navyblue", "yellow"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(flights),
  cex.axis = 0.7,
  gap = 3
)
##
## Variables sorted by number of missings:
## Variable Count
## arr_delay 0.028000808
## air_time 0.028000808
## arr_time 0.025871796
## dep_time 0.024511842
## dep_delay 0.024511842
## tailnum 0.007458964
## year 0.000000000
## month 0.000000000
## day 0.000000000
## sched_dep_time 0.000000000
## sched_arr_time 0.000000000
## carrier 0.000000000
## flight 0.000000000
## origin 0.000000000
## dest 0.000000000
## distance 0.000000000
## hour 0.000000000
## minute 0.000000000
## time_hour 0.000000000
# Ket qua: So du lieu khong thieu la 97.2%; Co 13 bien khong thieu du lieu va 6 bien thieu du lieu
# Keep only the flights dated 1 January 2013, then report the size of the
# result as (rows, columns).
df1 <- subset(
  flights,
  day == 1 & month == 1 & year == 2013
)
dim(df1)
## [1] 842 19
# Nhu vay co 842 chuyen bay vao ngay 1/1/2013
# Flights that departed late: keep the rows with a positive departure delay.
# subset() treats a missing condition as FALSE, so flights without a
# recorded dep_delay are not counted as late.
df2 <- subset(flights, dep_delay > 0)
dim(df2)
## [1] 128432 19
# So 128432 flights departed late.
# Share of late departures among all flights, computed from the data frames
# themselves rather than from hand-copied row counts.
nrow(df2) / nrow(flights)
## [1] 0.3813573
# That is a proportion of 0.38, i.e. about 38.1% of all flights (not 0.38%).
# Danh sach 10 chuyen bay tre gio nhat
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## between(): dplyr, data.table
## filter(): dplyr, stats
## first(): dplyr, data.table
## lag(): dplyr, stats
## last(): dplyr, data.table
## transpose(): purrr, data.table
# Order the late departures from the largest to the smallest departure
# delay and preview the first rows.
df3 <- df2 %>% arrange(desc(dep_delay))
head(df3)
## # A tibble: 6 <U+00D7> 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## 2 2013 6 15 1432 1935 1137 1607
## 3 2013 1 10 1121 1635 1126 1239
## 4 2013 9 20 1139 1845 1014 1457
## 5 2013 7 22 845 1600 1005 1044
## 6 2013 4 10 1100 1900 960 1342
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
# Alternative: list the ten most delayed departures.
# df3 is already sorted by descending dep_delay, so its first ten rows are
# the answer; sorting it a second time is redundant.
# The ten most delayed flights are those with a departure delay of at
# least 896 minutes.
head(df3, 10)
## # A tibble: 10 <U+00D7> 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## 2 2013 6 15 1432 1935 1137 1607
## 3 2013 1 10 1121 1635 1126 1239
## 4 2013 9 20 1139 1845 1014 1457
## 5 2013 7 22 845 1600 1005 1044
## 6 2013 4 10 1100 1900 960 1342
## 7 2013 3 17 2321 810 911 135
## 8 2013 6 27 959 1900 899 1236
## 9 2013 7 22 2257 759 898 121
## 10 2013 12 5 756 1700 896 1058
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
# Split the flights by departure airport.
df5_1 <- filter(flights, origin == "EWR") # 120835 flights departed from EWR
df5_2 <- filter(flights, origin == "LGA") # 104662 flights departed from LGA
df5_3 <- filter(flights, origin == "JFK") # 111279 flights departed from JFK
# Sanity check: the three subsets together must account for every flight.
# The sizes are read from the data frames instead of being typed in by hand.
nrow(df5_1) + nrow(df5_2) + nrow(df5_3)
## [1] 336776
# The same comparison in one call, busiest origin first.
count(flights, origin, sort = TRUE)
# EWR is therefore the airport with the most departing flights.
# Which month of the year has the most flights?
# One data frame per calendar month, kept under their original names.
df6_1 <- filter(flights, month == 1)
df6_2 <- filter(flights, month == 2)
df6_3 <- filter(flights, month == 3)
df6_4 <- filter(flights, month == 4)
df6_5 <- filter(flights, month == 5)
df6_6 <- filter(flights, month == 6)
df6_7 <- filter(flights, month == 7)
df6_8 <- filter(flights, month == 8)
df6_9 <- filter(flights, month == 9)
df6_10 <- filter(flights, month == 10)
df6_11 <- filter(flights, month == 11)
df6_12 <- filter(flights, month == 12)
# Flights per month, busiest month first. The counts are computed from the
# data so they cannot drift from what the data frames actually contain.
count(flights, month, sort = TRUE)
# July is the month with the most flights in the year (29425 flights).
# Ve hinh => Khong biet
library(ggplot2)
# Late departures delayed by 120 minutes or more
# (9888 flights according to the original run).
df7 <- df3 %>% filter(dep_delay >= 120)
# Flights whose destination is IAH; the printed tibble has 7,198 rows.
df8_1 <- flights %>% filter(dest == "IAH")
print(df8_1)
## # A tibble: 7,198 <U+00D7> 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 623 627 -4 933
## 4 2013 1 1 728 732 -4 1041
## 5 2013 1 1 739 739 0 1104
## 6 2013 1 1 908 908 0 1228
## 7 2013 1 1 1028 1026 2 1350
## 8 2013 1 1 1044 1045 -1 1352
## 9 2013 1 1 1114 900 134 1447
## 10 2013 1 1 1205 1200 5 1503
## # ... with 7,188 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Flights whose destination is HOU; the printed tibble has 2,115 rows.
df8_2 <- flights %>% filter(dest == "HOU")
print(df8_2)
## # A tibble: 2,115 <U+00D7> 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 1208 1158 10 1540
## 2 2013 1 1 1306 1300 6 1622
## 3 2013 1 1 1708 1700 8 2037
## 4 2013 1 1 2030 2035 -5 2354
## 5 2013 1 2 734 700 34 1045
## 6 2013 1 2 1156 1158 -2 1517
## 7 2013 1 2 1319 1305 14 1633
## 8 2013 1 2 1810 1655 75 2146
## 9 2013 1 2 2031 2035 -4 2353
## 10 2013 1 3 704 700 4 1036
## # ... with 2,105 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# From the data frames created in question 10: July has 29425 flights,
# August has 29327 flights and September has 27547 flights.
# Question 14
library(stringr)
library(tidyverse)
# Average speed of each flight: distance divided by air time, stored in the
# new column `avespeed`.
# NOTE(review): air_time looks like minutes (it is divided by 60 to obtain
# hours elsewhere in this script), so this is distance per minute rather
# than per hour - confirm the intended unit.
df10 <- flights %>% mutate(avespeed = distance / air_time)
print(df10)
## # A tibble: 336,776 <U+00D7> 20
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 13 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, avespeed <dbl>
# Flight time expressed in hours: air_time divided by 60, stored in the new
# column `airtime_hour`.
df11 <- flights %>% mutate(airtime_hour = air_time / 60)
print(df11)
## # A tibble: 336,776 <U+00D7> 20
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 13 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, airtime_hour <dbl>
# Select the 3 carriers with the most flights departing from NY
# Note that the echo = FALSE parameter was added to the code chunk to
# prevent printing of the R code that generated the plot.