STATISTICAL ANALYSIS OF NEW YORK CITY FLIGHTS IN 2013

library(nycflights13)
# Work on a copy so that the original `flights` table is never modified.
flight_data <- flights

# Keep the rows whose month and day are both 1 (1 January), then count them.
# which() drops any NA comparison results, as subset() would.
is_first_of_january <- flight_data$month == 1 & flight_data$day == 1
count_flight <- flight_data[which(is_first_of_january), ]

nrow(count_flight)

## [1] 842

#Question 2

library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# `month` is converted to a factor in place, so every later step that reads
# `flight_data` sees it as a factor.
flight_data$month <- as.factor(flight_data$month)

# Cross-tabulate month x day and flatten the counts into one row per
# (month, day) pair. `ketqua` is Vietnamese for "result".
ketqua <- as.data.frame(
  table(Month = flight_data$month, Day = flight_data$day),
  responseName = "Total_flight"
)

# which.max() picks the first row holding the largest count.
ketqua[which.max(ketqua$Total_flight), ]

##     Month Day Total_flight
## 323    11  27         1014

#Question 3

library(nycflights13)
library(tidyr)
library(dplyr)

# Count the flights of each carrier, order the carriers from most to fewest
# flights and keep the first ten.
companies_top10 <- flight_data %>%
  count(carrier) %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)

companies_top10

## # A tibble: 10 × 2
##    carrier     n
##    <chr>   <int>
##  1 UA      58665
##  2 B6      54635
##  3 EV      54173
##  4 DL      48110
##  5 AA      32729
##  6 MQ      26397
##  7 US      20536
##  8 9E      18460
##  9 WN      12275
## 10 VX       5162

#Question 4

library(nycflights13)
library(tidyr)
library(dplyr)

# Keep only the rows in which `dep_delay` is not missing.
df_full_dep_delay <- flight_data %>%
  drop_na(dep_delay)

df_full_dep_delay

## # A tibble: 328,521 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <fct> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013 1         1      517        515       2     830     819      11 UA     
##  2  2013 1         1      533        529       4     850     830      20 UA     
##  3  2013 1         1      542        540       2     923     850      33 AA     
##  4  2013 1         1      544        545      -1    1004    1022     -18 B6     
##  5  2013 1         1      554        600      -6     812     837     -25 DL     
##  6  2013 1         1      554        558      -4     740     728      12 UA     
##  7  2013 1         1      555        600      -5     913     854      19 B6     
##  8  2013 1         1      557        600      -3     709     723     -14 EV     
##  9  2013 1         1      557        600      -3     838     846      -8 B6     
## 10  2013 1         1      558        600      -2     753     745       8 AA     
## # … with 328,511 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

#Question 5

library(nycflights13)
library(tidyr)
library(dplyr)

# Mean `dep_delay` per carrier, listed from the smallest mean to the largest.
# mean() is called without na.rm, so it relies on `df_full_dep_delay` holding
# no missing `dep_delay` values.
sapxep_tb <- df_full_dep_delay %>%
  group_by(carrier) %>%
  summarise(avg_delay_company = mean(dep_delay), .groups = "drop") %>%
  arrange(avg_delay_company)

sapxep_tb

## # A tibble: 16 × 2
##    carrier avg_delay_company
##    <chr>               <dbl>
##  1 US                   3.78
##  2 HA                   4.90
##  3 AS                   5.80
##  4 AA                   8.59
##  5 DL                   9.26
##  6 MQ                  10.6 
##  7 UA                  12.1 
##  8 OO                  12.6 
##  9 VX                  12.9 
## 10 B6                  13.0 
## 11 9E                  16.7 
## 12 WN                  17.7 
## 13 FL                  18.7 
## 14 YV                  19.0 
## 15 EV                  20.0 
## 16 F9                  20.2

#R-Test 2 #Question 1

library(nycflights13)
library(tidyr)
library(dplyr)
# Build the squares of 1..10 with a for loop.
# The result vector is allocated once up front instead of being regrown with
# c() on every iteration, which copies the whole vector each time.
n_squares <- 10L
space_vector <- numeric(n_squares)

# seq_len() is used rather than 1:n so the loop is skipped when n is 0.
for (i in seq_len(n_squares)) {
  space_vector[i] <- i^2
}

space_vector

##  [1]   1   4   9  16  25  36  49  64  81 100

#Question 2

# count_missing(): number of missing values in a vector.
#   x       a vector (anything is.na() accepts)
#   returns how many elements of x are flagged by is.na()
count_missing <- function(x) {
  # Summing a logical vector counts its TRUE entries.
  sum(is.na(x))
}

# Quick check of the function: this vector holds five NA entries.
myvector <- c(NA, 3, 5, 4, 8, NA, 3, 88, NA, NA, NA)
count_missing(myvector)

## [1] 5

# Count the missing values in every column of flight_data.
k <- ncol(flight_data)

# One slot per column, allocated up front rather than regrown with c().
vector_missing_flights <- integer(k)

# seq_len(k) is empty when k is 0, whereas 1:k would iterate over c(1, 0).
for (i in seq_len(k)) {
  # flight_data[[i]] extracts the i-th column as a plain vector.
  vector_missing_flights[i] <- count_missing(flight_data[[i]])
}

col_name <- names(flight_data)

# One row per column: its name and its number of missing values.
final_result <- data.frame(col_name, n_missing = vector_missing_flights)

final_result

##          col_name n_missing
## 1            year         0
## 2           month         0
## 3             day         0
## 4        dep_time      8255
## 5  sched_dep_time         0
## 6       dep_delay      8255
## 7        arr_time      8713
## 8  sched_arr_time         0
## 9       arr_delay      9430
## 10        carrier         0
## 11         flight         0
## 12        tailnum      2512
## 13         origin         0
## 14           dest         0
## 15       air_time      9430
## 16       distance         0
## 17           hour         0
## 18         minute         0
## 19      time_hour         0

#R-Test 3 #Question 1

# benford(): probability that d is the leading digit under Benford's law.
#   d       a numeric vector of leading digits (1..9 for a single digit)
#   returns log10(1 + 1/d), computed element-wise
benford <- function(d) {
  log10(1 + 1 / d)
}

digits <- 1:9

# One slot per digit, allocated up front rather than regrown with c().
test_vector_benford <- numeric(length(digits))

# Using the for loop
# The loop index is used as-is: incrementing it inside the body evaluated
# benford(2..10) and attached those values to the digits 1..9.
for (i in seq_along(digits)) {
  test_vector_benford[i] <- benford(digits[i])
}

test_vector_benford

## [1] 0.30103000 0.17609126 0.12493874 0.09691001 0.07918125 0.06694679 0.05799195
## [8] 0.05115252 0.04575749

# Pair each digit with its probability.
visualization <- data.frame(digits, Probability = test_vector_benford)

visualization

##   digits Probability
## 1      1  0.30103000
## 2      2  0.17609126
## 3      3  0.12493874
## 4      4  0.09691001
## 5      5  0.07918125
## 6      6  0.06694679
## 7      7  0.05799195
## 8      8  0.05115252
## 9      9  0.04575749

#Question 2

library(ggplot2)
# Pair each digit 1..9 with its probability for plotting.
digits <- 1:9

visualization <- data.frame(digits, Probability = test_vector_benford)

# Horizontal bar chart with one bar per digit. reorder() sorts the bars by
# Probability. A single geom_col() layer is enough: adding
# geom_bar(stat = "identity") on top drew the same bars a second time.
ggBarplot <- ggplot(
  visualization,
  aes(
    x = reorder(factor(digits), Probability),
    y = Probability,
    fill = Probability
  )
) +
  geom_col(width = 0.9) +
  xlab("Digits") +
  ylab("Probability") +
  ggtitle("The Benford's law") +
  coord_flip()
ggBarplot

STATISTICAL ANALYSIS OF NEW YORK CITY FLIGHTS IN 2013

Hoàng Văn Thảo

2023-02-23