library(nycflights13)
# Work on a copy so the packaged `flights` data set itself is never modified.
flight_data <- flights
# Keep only the rows recorded for month 1, day 1, then count them.
# which() drops any NA comparison results, matching what subset() does.
count_flight <- flight_data[with(flight_data, which(month == 1 & day == 1)), ]
nrow(count_flight)
## [1] 842
#Question 2
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Find the (month, day) pair with the largest number of flights.
# NOTE: `month` stays a factor in `flight_data` for the rest of the script.
flight_data$month <- as.factor(flight_data$month)
# Cross-tabulate month by day and flatten the counts into one row per pair;
# naming the table dimensions and the response sets the column names directly.
ketqua <- as.data.frame(
  table(Month = flight_data$month, Day = flight_data$day),
  responseName = "Total_flight"
)
# which.max() returns the first row holding the highest count.
ketqua[which.max(ketqua$Total_flight), ]
## Month Day Total_flight
## 323 11 27 1014
#Question 3
library(nycflights13)
library(tidyr)
library(dplyr)
# Ten carriers with the most flights, largest first.
companies_top10 <- flight_data %>%
  count(carrier) %>%       # one row per carrier, flight count in column `n`
  arrange(desc(n)) %>%     # busiest carriers at the top
  slice_head(n = 10)       # keep the first ten rows
companies_top10
## # A tibble: 10 × 2
## carrier n
## <chr> <int>
## 1 UA 58665
## 2 B6 54635
## 3 EV 54173
## 4 DL 48110
## 5 AA 32729
## 6 MQ 26397
## 7 US 20536
## 8 9E 18460
## 9 WN 12275
## 10 VX 5162
#Question 4
library(nycflights13)
library(tidyr)
library(dplyr)
# Keep only the flights whose departure delay is recorded (drop rows where
# `dep_delay` is missing).
df_full_dep_delay <- drop_na(flight_data, dep_delay)
df_full_dep_delay
## # A tibble: 328,521 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <fct> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 542 540 2 923 850 33 AA
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6
## 5 2013 1 1 554 600 -6 812 837 -25 DL
## 6 2013 1 1 554 558 -4 740 728 12 UA
## 7 2013 1 1 555 600 -5 913 854 19 B6
## 8 2013 1 1 557 600 -3 709 723 -14 EV
## 9 2013 1 1 557 600 -3 838 846 -8 B6
## 10 2013 1 1 558 600 -2 753 745 8 AA
## # … with 328,511 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
#Question 5
library(nycflights13)
library(tidyr)
library(dplyr)
# Mean departure delay per carrier, sorted from the smallest to the largest.
# `df_full_dep_delay` has no missing `dep_delay`, so mean() needs no na.rm.
sapxep_tb <- df_full_dep_delay %>%
  group_by(carrier) %>%
  summarise(avg_delay_company = mean(dep_delay), .groups = "drop") %>%
  arrange(avg_delay_company)
sapxep_tb
## # A tibble: 16 × 2
## carrier avg_delay_company
## <chr> <dbl>
## 1 US 3.78
## 2 HA 4.90
## 3 AS 5.80
## 4 AA 8.59
## 5 DL 9.26
## 6 MQ 10.6
## 7 UA 12.1
## 8 OO 12.6
## 9 VX 12.9
## 10 B6 13.0
## 11 9E 16.7
## 12 WN 17.7
## 13 FL 18.7
## 14 YV 19.0
## 15 EV 20.0
## 16 F9 20.2
#R-Test 2 #Question 1
library(nycflights13)
library(tidyr)
library(dplyr)
# Squares of the integers 1 to 10.
# `^` is vectorised, so the whole sequence is squared in one step instead of
# growing the result with c() inside a loop.
space_vector <- seq_len(10)^2
space_vector
## [1] 1 4 9 16 25 36 49 64 81 100
#Question 2
#' Count the missing values in a vector.
#'
#' @param x A vector to inspect.
#' @return A single integer: the number of elements of `x` for which
#'   `is.na()` is `TRUE`.
count_missing <- function(x) {
  # is.na() gives a logical vector; summing it counts the TRUE entries.
  sum(is.na(x))
}
# Quick check of count_missing(): this vector holds five NA values.
myvector <- c(NA, 3, 5, 4, 8, NA, 3, 88, rep(NA, 3))
count_missing(myvector)
## [1] 5
# Count the missing values in every column of the flights copy.
col_name <- names(flight_data)
# vapply() visits each column once and guarantees one integer per column, so
# no index loop (which misbehaves as 1:0 on a zero-column input) or growing
# vector is needed. USE.NAMES = FALSE keeps the counts unnamed so data.frame()
# does not turn the column names into row names.
vector_missing_flights <- vapply(
  flight_data,
  count_missing,
  integer(1),
  USE.NAMES = FALSE
)
# One row per column: its name and its number of missing values.
final_result <- data.frame(col_name, n_missing = vector_missing_flights)
final_result
## col_name n_missing
## 1 year 0
## 2 month 0
## 3 day 0
## 4 dep_time 8255
## 5 sched_dep_time 0
## 6 dep_delay 8255
## 7 arr_time 8713
## 8 sched_arr_time 0
## 9 arr_delay 9430
## 10 carrier 0
## 11 flight 0
## 12 tailnum 2512
## 13 origin 0
## 14 dest 0
## 15 air_time 9430
## 16 distance 0
## 17 hour 0
## 18 minute 0
## 19 time_hour 0
#R-Test 3 #Question 1
#' Benford's-law probability of a leading digit.
#'
#' @param d A numeric vector of digits; the formula is applied element-wise.
#' @return A numeric vector the same length as `d` holding
#'   `log10(1 + 1 / d)` for each element.
benford <- function(d) {
  log10(1 + 1 / d)
}
# Tabulate Benford's law for the leading digits 1 to 9.
digits <- 1:9
# benford() works element-wise, so it is given the digits 1..9 themselves;
# evaluating it at 2..10 would attach every probability to the wrong digit
# (digit 1 must come out as log10(2), about 0.301).
test_vector_benford <- benford(digits)
test_vector_benford
## [1] 0.30103000 0.17609126 0.12493874 0.09691001 0.07918125 0.06694679 0.05799195
## [8] 0.05115252 0.04575749
# Pair each digit with its probability for display and plotting.
visualization <- data.frame(digits, Probability = test_vector_benford)
visualization
## digits Probability
## 1 1 0.30103000
## 2 2 0.17609126
## 3 3 0.12493874
## 4 4 0.09691001
## 5 5 0.07918125
## 6 6 0.06694679
## 7 7 0.05799195
## 8 8 0.05115252
## 9 9 0.04575749
#Question 2
library(ggplot2)
# Horizontal bar chart of the Benford probabilities.
# `visualization` (columns `digits` and `Probability`) is the table built
# earlier in this script, so it is reused here rather than rebuilt.
# A single geom_col() layer draws the bars; adding geom_bar(stat = "identity")
# on top of it would only paint the same bars a second time.
ggBarplot <- ggplot(
  visualization,
  aes(
    x = reorder(factor(digits), Probability),  # bars ordered by probability
    y = Probability,
    fill = Probability
  )
) +
  geom_col(width = 0.9) +
  labs(title = "The Benford's law", x = "Digits", y = "Probability") +
  coord_flip()
ggBarplot