library(nycflights13)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Compact overview of the flights table: one line per column with its type
# and the first few values.
flights %>%
  glimpse()
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
# Data issues ----
# Count the missing values (NA) in every column; the result is a one-row
# tibble with one count per column.
# Author's note: most of the NAs are in the delay columns.
flights %>%
  summarise(across(everything(), function(col) sum(is.na(col))))
## # A tibble: 1 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 8255 0 8255 8713 0
## # ℹ 11 more variables: arr_delay <int>, carrier <int>, flight <int>,
## # tailnum <int>, origin <int>, dest <int>, air_time <int>, distance <int>,
## # hour <int>, minute <int>, time_hour <int>
# Open the spreadsheet-style data viewer only when a user is present.
# View() needs a GUI/IDE, so calling it unconditionally can fail or block
# when the script is run non-interactively (e.g. via Rscript or knitting).
if (interactive()) {
  View(flights)
}
# Total number of missing values (NA) across the whole dataset:
# is.na() flags every cell, sum() counts the TRUE flags.
flights %>%
  is.na() %>%
  sum()
## [1] 46595
# Check for duplicate data: count the rows that are exact repeats of an
# earlier row.
sum(duplicated(flights))
## [1] 0
# Show the duplicated rows themselves (rows that repeat an earlier row).
filter(flights, duplicated(flights))
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## # sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# How to fix the dataset ----
# Cleaning: keep a single copy of each row, then discard every row that
# contains an NA in any column.
flights_cleaning <- drop_na(distinct(flights))
# Per-column descriptive statistics of the cleaned table.
summary(flights_cleaning)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 500
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 905
## Median :2013 Median : 7.000 Median :16.00 Median :1400 Median :1355
## Mean :2013 Mean : 6.565 Mean :15.74 Mean :1349 Mean :1340
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1122 1st Qu.: -17.000
## Median : -2.00 Median :1535 Median :1554 Median : -5.000
## Mean : 12.56 Mean :1502 Mean :1533 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1944 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## carrier flight tailnum origin
## Length:327346 Min. : 1 Length:327346 Length:327346
## Class :character 1st Qu.: 544 Class :character Class :character
## Mode :character Median :1467 Mode :character Mode :character
## Mean :1943
## 3rd Qu.:3412
## Max. :8500
## dest air_time distance hour
## Length:327346 Min. : 20.0 Min. : 80 Min. : 5.00
## Class :character 1st Qu.: 82.0 1st Qu.: 509 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 888 Median :13.00
## Mean :150.7 Mean :1048 Mean :13.14
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00.00
## 1st Qu.: 8.00 1st Qu.:2013-04-05 06:00:00.00
## Median :29.00 Median :2013-07-04 09:00:00.00
## Mean :26.23 Mean :2013-07-03 17:56:45.44
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 18:00:00.00
## Max. :59.00 Max. :2013-12-31 23:00:00.00
# Bar chart: number of rows per `origin` value, each bar filled with its own
# colour.
ggplot(data = flights_cleaning, mapping = aes(x = origin, fill = origin)) +
  geom_bar() +
  labs(title = "Jumlah Daerah")

# Box plot of the `flight` column for each `origin` value; outlying points
# are drawn in blue.
ggplot(data = flights_cleaning, mapping = aes(x = origin, y = flight)) +
  geom_boxplot(outlier.color = "blue") +
  labs(
    title = "Distribusi penerbangan tiap daerah",
    x = "origin",
    y = "flight"
  )

# Distribution of the categorical `origin` column, shown as the share of rows
# per origin value.
# A kernel density (geom_density) is only meaningful for a continuous
# variable; on a character column it is computed on the integer position
# codes of the categories, so the resulting curve is an artifact. A
# proportion bar chart is the appropriate display for a categorical variable.
# `group = 1` makes the proportions relative to the whole table instead of
# to each bar (which would give 1 everywhere).
ggplot(data = flights_cleaning, mapping = aes(x = origin)) +
  geom_bar(aes(y = after_stat(prop), group = 1), color = "red") +
  labs(title = "Distribution Origin", x = "Origin", y = "Proportion")

# Box plot of the `distance` column for each `origin` value; outlying points
# are drawn in red.
ggplot(data = flights_cleaning, mapping = aes(x = origin, y = distance)) +
  geom_boxplot(outlier.color = "red") +
  labs(title = "Distribusi jarak tiap daerah")
