#Choice of questions for analysis
We start by installing the relevant packages and loading the libraries as below
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(RMySQL)
## Loading required package: DBI
library(DBI)
library(readr)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(tidyr)
The flight database is created in MySQL. The sql that creates the database can be found on my github
Connect to database and read query
db_host <- "localhost"
db_port <- 5432
db_user <- "root"
db_password <- "7!8Kas33!4"
movies <- "tidydata"
# Create a connection object
con <- dbConnect(MySQL(),
user = "root",
password = "7!8Kas33!4",
host = "localhost",
name = "tidydata")
cat("Connected to database successfully.")
## Connected to database successfully.
dbGetQuery(con, "USE tidydata")
## data frame with 0 columns and 0 rows
query <- readLines("https://raw.githubusercontent.com/hawa1983/WK5Assignment/main/TidyingTransforming_Data.sql")
dbGetQuery(con, query)
## data frame with 0 columns and 0 rows
query1 <- "SELECT
a.AirlineName AS 'Airline',
s.StatusName AS 'Status',
SUM(IF(d.DestName = 'Los Angeles', fd.FlightCount, 0)) AS 'Los Angeles',
SUM(IF(d.DestName = 'Phoenix', fd.FlightCount, 0)) AS 'Phoenix',
SUM(IF(d.DestName = 'San Diego', fd.FlightCount, 0)) AS 'San Diego',
SUM(IF(d.DestName = 'San Francisco', fd.FlightCount, 0)) AS 'San Francisco',
SUM(IF(d.DestName = 'Seattle', fd.FlightCount, 0)) AS 'Seattle'
FROM FlightData fd
JOIN Airlines a ON fd.AirlineId = a.AirlineId
JOIN Destinations d ON fd.DestId = d.DestId
JOIN Statuses s ON fd.StatusId = s.StatusId
GROUP BY a.AirlineName, s.StatusName;"
#airline_data <- dbGetQuery(con, query1) # use this line after TidyingTransforming_Data.sql is run on your database
#write_csv(airline_data, "airline_data.csv") # use this line after TidyingTransforming_Data.sql is run on your database
airline_data <- read_csv("https://raw.githubusercontent.com/hawa1983/WK5Assignment/main/airline_data.csv")
## Rows: 4 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Airline, Status
## dbl (5): Los Angeles, Phoenix, San Diego, San Francisco, Seattle
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
airline_data
## # A tibble: 4 × 7
## Airline Status `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AMWEST delayed 117 415 65 129 61
## 2 ALASKA delayed 62 12 20 102 305
## 3 AMWEST on time 694 4840 383 320 201
## 4 ALASKA on time 497 221 212 503 1841
dbDisconnect(con)
## [1] TRUE
#2 Reshape the data to long format
# Reshape the data from wide to long format
data_long <- airline_data %>%
pivot_longer(cols = -c(Airline, Status), names_to = "destination", values_to = "count")
data_long
## # A tibble: 20 × 4
## Airline Status destination count
## <chr> <chr> <chr> <dbl>
## 1 AMWEST delayed Los Angeles 117
## 2 AMWEST delayed Phoenix 415
## 3 AMWEST delayed San Diego 65
## 4 AMWEST delayed San Francisco 129
## 5 AMWEST delayed Seattle 61
## 6 ALASKA delayed Los Angeles 62
## 7 ALASKA delayed Phoenix 12
## 8 ALASKA delayed San Diego 20
## 9 ALASKA delayed San Francisco 102
## 10 ALASKA delayed Seattle 305
## 11 AMWEST on time Los Angeles 694
## 12 AMWEST on time Phoenix 4840
## 13 AMWEST on time San Diego 383
## 14 AMWEST on time San Francisco 320
## 15 AMWEST on time Seattle 201
## 16 ALASKA on time Los Angeles 497
## 17 ALASKA on time Phoenix 221
## 18 ALASKA on time San Diego 212
## 19 ALASKA on time San Francisco 503
## 20 ALASKA on time Seattle 1841
#3 Calculate total column and proportion of flights delay by carrier
# Calculate the "Total" column
data_long <- data_long %>%
group_by(Airline, Status, destination) %>%
summarise(Total = sum(count))
## `summarise()` has grouped output by 'Airline', 'Status'. You can override using
## the `.groups` argument.
# Calculate the "prop_delayed" column
data_long <- data_long %>%
pivot_wider(names_from = Status, values_from = Total) %>%
mutate(, total = `on time` + delayed, prop_delayed = paste0(format(100 * delayed / (delayed + `on time`), digits = 2), "%"))
# Rename the Status column to match the desired output
data_long <- data_long %>%
rename(on_time = `on time`)
print(data_long)
## # A tibble: 10 × 6
## # Groups: Airline [2]
## Airline destination delayed on_time total prop_delayed
## <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 ALASKA Los Angeles 62 497 559 "11.1%"
## 2 ALASKA Phoenix 12 221 233 " 5.2%"
## 3 ALASKA San Diego 20 212 232 " 8.6%"
## 4 ALASKA San Francisco 102 503 605 "16.9%"
## 5 ALASKA Seattle 305 1841 2146 "14.2%"
## 6 AMWEST Los Angeles 117 694 811 "14.4%"
## 7 AMWEST Phoenix 415 4840 5255 " 7.9%"
## 8 AMWEST San Diego 65 383 448 "14.5%"
## 9 AMWEST San Francisco 129 320 449 "28.7%"
## 10 AMWEST Seattle 61 201 262 "23.3%"
On average and by destination, the bar plots below shows that ALASKA airline has the lower proportion of flights delayed compared with AMWEST. Therefore, ALASKA should be the airline of choice if you want to reach your destination on time.
# Create the bar chart
ggplot(data_long, aes(x = destination, y = prop_delayed, fill = Airline)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Proportion of Delayed Flights by Destination and Airline",
x = "Destination",
y = "Proportion Delayed") +
theme_minimal() +
theme(legend.position = "right")
data_long %>%
mutate(prop_delayed = as.numeric(gsub("%", "", prop_delayed))) %>% # Convert prop_delayed to numeric
ggplot(aes(x = Airline, y = prop_delayed)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Proportion of Delayed Flights by Destination and Airline",
x = "Carrier",
y = "Proportion Delayed",
fill = "Airline"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#4: Waht carrier provides the most service and what is the proportion of
delayed arrivals per carries
Based on the summary below, AMWEST provides the most flights and has the higher proportion of delayed arrivals.
data_long %>%
group_by(Airline) %>%
summarise(total_fights = sum(total), mean_prop_delayed = mean(as.numeric(gsub("%", "", prop_delayed))))
## # A tibble: 2 × 3
## Airline total_fights mean_prop_delayed
## <chr> <dbl> <dbl>
## 1 ALASKA 3775 11.2
## 2 AMWEST 7225 17.8
#5: What airport experiences the most delays. Plot a bar chart to illustrate the total arrivals and the proportion of total delay per destination.
Based on the summary and plots below, San Francisco is the destination that experience the most delays even though the number of arrivals is third to last. This is also illustrated in the following plot
destination_delay <- data_long %>%
group_by(destination) %>%
summarise(total_delayed = sum(delayed), total_arrivals = sum(total), prop_delayed = round(sum(delayed)/sum(total), 2))
destination_delay
## # A tibble: 5 × 4
## destination total_delayed total_arrivals prop_delayed
## <chr> <dbl> <dbl> <dbl>
## 1 Los Angeles 179 1370 0.13
## 2 Phoenix 427 5488 0.08
## 3 San Diego 85 680 0.12
## 4 San Francisco 231 1054 0.22
## 5 Seattle 366 2408 0.15
# Create the bar chart
p1 <- ggplot(destination_delay, aes(x = destination)) +
geom_bar(aes(y = prop_delayed), stat = "identity", position = "dodge", fill = "blue", width = 0.4) +
labs(title = "Total of Delayed Flights by Destination",
x = "Destination",
y = "Total Delayed") +
theme_minimal()
p1
p2 <- ggplot(destination_delay, aes(x = destination)) +
geom_bar(aes(y = total_arrivals), stat = "identity", position = "dodge", fill = "red", width = 0.4) +
labs(title = "Total of Delayed Flights by Destination",
x = "Destination",
y = "Total Delayed") +
theme_minimal()
p2
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(p1, p2, ncol = 1)
t