# Load the flight records from a local CSV file into the data frame
# `flight_data`, using read.csv()'s default parsing options.
# NOTE(review): this is an absolute path into one user's Downloads folder, so
# the script only runs on that machine; consider a project-relative path.
flight_data <- utils::read.csv(
  file = "C:/Users/TO UYEN/Downloads/flight_data.csv"
)

1. Thông tin cơ bản liên quan đến bộ dữ liệu

1.1. Kích thước bộ dữ liệu

# Size of the data set as an integer pair: number of rows (observations)
# followed by number of columns (variables).
c(nrow(flight_data), ncol(flight_data))
## [1] 1048575      26

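=> Bộ dữ liệu có 1.048.575 quan sát (dòng) và 26 biến (cột). Lưu ý: 1.048.575 đúng bằng giới hạn số dòng của một trang tính Excel (1.048.576) trừ đi một dòng tiêu đề, nên có khả năng tệp CSV đã bị cắt bớt khi được xuất hoặc lưu lại từ Excel; cần đối chiếu với nguồn dữ liệu gốc trước khi kết luận về kích thước thật của bộ dữ liệu.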

1.2. Tên các biến của bộ dữ liệu

# List the column (variable) names of the data frame, in column order.
colnames(flight_data)
##  [1] "month"               "day_of_month"        "day_of_week"        
##  [4] "fl_date"             "op_unique_carrier"   "op_carrier_fl_num"  
##  [7] "origin"              "origin_city_name"    "origin_state_nm"    
## [10] "dest"                "dest_city_name"      "dest_state_nm"      
## [13] "crs_dep_time"        "dep_time"            "dep_delay"          
## [16] "taxi_out"            "wheels_off"          "wheels_on"          
## [19] "taxi_in"             "crs_arr_time"        "arr_time"           
## [22] "arr_delay"           "crs_elapsed_time"    "actual_elapsed_time"
## [25] "air_time"            "distance"

1.3. Kiểu dữ liệu của các biến trong bộ dữ liệu

# Report the storage type of every column as a named character vector
# (names are the column names).
# vapply() is used instead of sapply() so the result is always a character
# vector with one entry per column; sapply() silently changes its return type
# on empty or irregular input.
# Note: typeof() gives the internal storage type (e.g. "integer",
# "character"), which can differ from the column's class.
vapply(flight_data, typeof, character(1))
##               month        day_of_month         day_of_week             fl_date 
##           "integer"           "integer"           "integer"         "character" 
##   op_unique_carrier   op_carrier_fl_num              origin    origin_city_name 
##         "character"           "integer"         "character"         "character" 
##     origin_state_nm                dest      dest_city_name       dest_state_nm 
##         "character"         "character"         "character"         "character" 
##        crs_dep_time            dep_time           dep_delay            taxi_out 
##           "integer"           "integer"           "integer"           "integer" 
##          wheels_off           wheels_on             taxi_in        crs_arr_time 
##           "integer"           "integer"           "integer"           "integer" 
##            arr_time           arr_delay    crs_elapsed_time actual_elapsed_time 
##           "integer"           "integer"           "integer"           "integer" 
##            air_time            distance 
##           "integer"           "integer"

1.4. Thống kê tóm tắt

# Per-column summary of the data frame: min, quartiles, median, mean and max
# for numeric columns; length, class and mode for character columns; the
# number of missing values (NA's) is shown for columns that contain any.
summary(object = flight_data)
##      month        day_of_month    day_of_week      fl_date         
##  Min.   :1.000   Min.   : 1.00   Min.   :1.000   Length:1048575    
##  1st Qu.:1.000   1st Qu.: 8.00   1st Qu.:2.000   Class :character  
##  Median :1.000   Median :15.00   Median :4.000   Mode  :character  
##  Mean   :1.478   Mean   :15.31   Mean   :3.893                     
##  3rd Qu.:2.000   3rd Qu.:23.00   3rd Qu.:6.000                     
##  Max.   :2.000   Max.   :31.00   Max.   :7.000                     
##                                                                    
##  op_unique_carrier  op_carrier_fl_num    origin          origin_city_name  
##  Length:1048575     Min.   :   1      Length:1048575     Length:1048575    
##  Class :character   1st Qu.:1088      Class :character   Class :character  
##  Mode  :character   Median :2074      Mode  :character   Mode  :character  
##                     Mean   :2355                                           
##                     3rd Qu.:3467                                           
##                     Max.   :8819                                           
##                                                                            
##  origin_state_nm        dest           dest_city_name     dest_state_nm     
##  Length:1048575     Length:1048575     Length:1048575     Length:1048575    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   crs_dep_time     dep_time       dep_delay          taxi_out     
##  Min.   :   1   Min.   :   1    Min.   : -96.00   Min.   :  1.00  
##  1st Qu.: 905   1st Qu.: 911    1st Qu.:  -6.00   1st Qu.: 12.00  
##  Median :1315   Median :1323    Median :  -2.00   Median : 15.00  
##  Mean   :1319   Mean   :1325    Mean   :  11.68   Mean   : 18.25  
##  3rd Qu.:1727   3rd Qu.:1736    3rd Qu.:   8.00   3rd Qu.: 21.00  
##  Max.   :2359   Max.   :2400    Max.   :3125.00   Max.   :213.00  
##                 NA's   :22553   NA's   :22650     NA's   :23125   
##    wheels_off      wheels_on        taxi_in         crs_arr_time 
##  Min.   :   1    Min.   :   1    Min.   :  1.000   Min.   :   1  
##  1st Qu.: 929    1st Qu.:1058    1st Qu.:  4.000   1st Qu.:1115  
##  Median :1337    Median :1510    Median :  6.000   Median :1522  
##  Mean   :1350    Mean   :1476    Mean   :  8.083   Mean   :1503  
##  3rd Qu.:1750    3rd Qu.:1914    3rd Qu.:  9.000   3rd Qu.:1924  
##  Max.   :2400    Max.   :2400    Max.   :444.000   Max.   :2359  
##  NA's   :23125   NA's   :23677   NA's   :23677                   
##     arr_time       arr_delay        crs_elapsed_time actual_elapsed_time
##  Min.   :   1    Min.   :-117.000   Min.   :  6.0    Min.   : 16.0      
##  1st Qu.:1102    1st Qu.: -17.000   1st Qu.: 95.0    1st Qu.: 89.0      
##  Median :1514    Median :  -7.000   Median :133.0    Median :127.0      
##  Mean   :1481    Mean   :   5.584   Mean   :148.8    Mean   :142.5      
##  3rd Qu.:1920    3rd Qu.:   9.000   3rd Qu.:180.0    3rd Qu.:175.0      
##  Max.   :2400    Max.   :3136.000   Max.   :859.0    Max.   :792.0      
##  NA's   :23675   NA's   :25751      NA's   :1        NA's   :25751      
##     air_time        distance     
##  Min.   :  8.0   Min.   :  31.0  
##  1st Qu.: 64.0   1st Qu.: 402.0  
##  Median :100.0   Median : 692.0  
##  Mean   :116.2   Mean   : 834.5  
##  3rd Qu.:147.0   3rd Qu.:1069.0  
##  Max.   :723.0   Max.   :5095.0  
##  NA's   :25751

1.5. Số quan sát trùng lặp

# Count the rows that are exact repeats of an earlier row. duplicated() flags
# a row only when it matches a previous row on every column, so the first
# occurrence of each row is not counted.
length(which(duplicated(flight_data)))
## [1] 0

=> Bộ dữ liệu không có quan sát nào bị trùng lặp, tức là không có hai dòng nào giống nhau hoàn toàn trên cả 26 biến.