#Choice of questions for analysis

What is the total number of flights by each airline to a destination?
What proportion of flights are delayed for each airline per destination? Illustrate using a bar chart.
If you want to get to your destination on time, which airlines will you use?
What carrier provides the most service and what is the proportion of delayed arrivals per carries
What airport experiences the most delays. Plot a bar chart to illustrate the total arrivals and the proportion of total delay per destination.

Load the relevant libraries

We start by installing the relevant packages and loading the libraries as below

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)
library(RMySQL)

## Loading required package: DBI

library(DBI)
library(readr)
library(data.table)

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose

library(tidyr)

Create database in sql

The flight database is created in MySQL. The sql that creates the database can be found on my github

Connect to database and read query

db_host <- "localhost"
db_port <- 5432
db_user <- "root"
db_password <- "7!8Kas33!4"
movies <- "tidydata"


# Create a connection object
con <- dbConnect(MySQL(), 
                 user = "root",
                 password = "7!8Kas33!4",
                 host = "localhost",
                 name = "tidydata")

cat("Connected to database successfully.")

## Connected to database successfully.

dbGetQuery(con, "USE tidydata")

## data frame with 0 columns and 0 rows

query <- readLines("https://raw.githubusercontent.com/hawa1983/WK5Assignment/main/TidyingTransforming_Data.sql")
dbGetQuery(con, query)

## data frame with 0 columns and 0 rows

query1 <- "SELECT 
    a.AirlineName AS 'Airline',
    s.StatusName AS 'Status',
    SUM(IF(d.DestName = 'Los Angeles', fd.FlightCount, 0)) AS 'Los Angeles',
    SUM(IF(d.DestName = 'Phoenix', fd.FlightCount, 0)) AS 'Phoenix',
    SUM(IF(d.DestName = 'San Diego', fd.FlightCount, 0)) AS 'San Diego',
    SUM(IF(d.DestName = 'San Francisco', fd.FlightCount, 0)) AS 'San Francisco',
    SUM(IF(d.DestName = 'Seattle', fd.FlightCount, 0)) AS 'Seattle'
FROM FlightData fd
JOIN Airlines a ON fd.AirlineId = a.AirlineId
JOIN Destinations d ON fd.DestId = d.DestId
JOIN Statuses s ON fd.StatusId = s.StatusId
GROUP BY a.AirlineName, s.StatusName;"

#airline_data <- dbGetQuery(con, query1)  # use this line after TidyingTransforming_Data.sql is run on your database 
#write_csv(airline_data, "airline_data.csv") # use this line after TidyingTransforming_Data.sql is run on your database 
airline_data <- read_csv("https://raw.githubusercontent.com/hawa1983/WK5Assignment/main/airline_data.csv")

## Rows: 4 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Airline, Status
## dbl (5): Los Angeles, Phoenix, San Diego, San Francisco, Seattle
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

airline_data

## # A tibble: 4 × 7
##   Airline Status  `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>           <dbl>   <dbl>       <dbl>           <dbl>   <dbl>
## 1 AMWEST  delayed           117     415          65             129      61
## 2 ALASKA  delayed            62      12          20             102     305
## 3 AMWEST  on time           694    4840         383             320     201
## 4 ALASKA  on time           497     221         212             503    1841

dbDisconnect(con)

## [1] TRUE

#2 Reshape the data to long format

# Reshape the data from wide to long format
data_long <- airline_data %>%
  pivot_longer(cols = -c(Airline, Status), names_to = "destination", values_to = "count")

data_long

## # A tibble: 20 × 4
##    Airline Status  destination   count
##    <chr>   <chr>   <chr>         <dbl>
##  1 AMWEST  delayed Los Angeles     117
##  2 AMWEST  delayed Phoenix         415
##  3 AMWEST  delayed San Diego        65
##  4 AMWEST  delayed San Francisco   129
##  5 AMWEST  delayed Seattle          61
##  6 ALASKA  delayed Los Angeles      62
##  7 ALASKA  delayed Phoenix          12
##  8 ALASKA  delayed San Diego        20
##  9 ALASKA  delayed San Francisco   102
## 10 ALASKA  delayed Seattle         305
## 11 AMWEST  on time Los Angeles     694
## 12 AMWEST  on time Phoenix        4840
## 13 AMWEST  on time San Diego       383
## 14 AMWEST  on time San Francisco   320
## 15 AMWEST  on time Seattle         201
## 16 ALASKA  on time Los Angeles     497
## 17 ALASKA  on time Phoenix         221
## 18 ALASKA  on time San Diego       212
## 19 ALASKA  on time San Francisco   503
## 20 ALASKA  on time Seattle        1841

#3 Calculate total column and proportion of flights delay by carrier

# Calculate the "Total" column
data_long <- data_long %>%
  group_by(Airline, Status, destination) %>%
  summarise(Total = sum(count))

## `summarise()` has grouped output by 'Airline', 'Status'. You can override using
## the `.groups` argument.

# Calculate the "prop_delayed" column
data_long <- data_long %>%
  pivot_wider(names_from = Status, values_from = Total) %>%
  mutate(, total  = `on time` + delayed, prop_delayed = paste0(format(100 * delayed / (delayed + `on time`), digits = 2), "%"))

# Rename the Status column to match the desired output
data_long <- data_long %>%
  rename(on_time = `on time`)


print(data_long)

## # A tibble: 10 × 6
## # Groups:   Airline [2]
##    Airline destination   delayed on_time total prop_delayed
##    <chr>   <chr>           <dbl>   <dbl> <dbl> <chr>       
##  1 ALASKA  Los Angeles        62     497   559 "11.1%"     
##  2 ALASKA  Phoenix            12     221   233 " 5.2%"     
##  3 ALASKA  San Diego          20     212   232 " 8.6%"     
##  4 ALASKA  San Francisco     102     503   605 "16.9%"     
##  5 ALASKA  Seattle           305    1841  2146 "14.2%"     
##  6 AMWEST  Los Angeles       117     694   811 "14.4%"     
##  7 AMWEST  Phoenix           415    4840  5255 " 7.9%"     
##  8 AMWEST  San Diego          65     383   448 "14.5%"     
##  9 AMWEST  San Francisco     129     320   449 "28.7%"     
## 10 AMWEST  Seattle            61     201   262 "23.3%"

3. If you want to get to your destination on time, which airlines will you use?

On average and by destination, the bar plots below shows that ALASKA airline has the lower proportion of flights delayed compared with AMWEST. Therefore, ALASKA should be the airline of choice if you want to reach your destination on time.

# Create the bar chart
ggplot(data_long, aes(x = destination, y = prop_delayed, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Proportion of Delayed Flights by Destination and Airline",
       x = "Destination",
       y = "Proportion Delayed") +
  theme_minimal() +
  theme(legend.position = "right")

data_long %>%
  mutate(prop_delayed = as.numeric(gsub("%", "", prop_delayed))) %>%  # Convert prop_delayed to numeric
  ggplot(aes(x = Airline, y = prop_delayed)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Proportion of Delayed Flights by Destination and Airline",
    x = "Carrier",
    y = "Proportion Delayed",
    fill = "Airline"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#4: Waht carrier provides the most service and what is the proportion of delayed arrivals per carries

Based on the summary below, AMWEST provides the most flights and has the higher proportion of delayed arrivals.

data_long %>%
  group_by(Airline) %>%
  summarise(total_fights = sum(total), mean_prop_delayed = mean(as.numeric(gsub("%", "", prop_delayed))))

## # A tibble: 2 × 3
##   Airline total_fights mean_prop_delayed
##   <chr>          <dbl>             <dbl>
## 1 ALASKA          3775              11.2
## 2 AMWEST          7225              17.8

#5: What airport experiences the most delays. Plot a bar chart to illustrate the total arrivals and the proportion of total delay per destination.

Based on the summary and plots below, San Francisco is the destination that experience the most delays even though the number of arrivals is third to last. This is also illustrated in the following plot

destination_delay <- data_long %>%
  group_by(destination) %>%
  summarise(total_delayed = sum(delayed), total_arrivals = sum(total), prop_delayed = round(sum(delayed)/sum(total), 2))

destination_delay

## # A tibble: 5 × 4
##   destination   total_delayed total_arrivals prop_delayed
##   <chr>                 <dbl>          <dbl>        <dbl>
## 1 Los Angeles             179           1370         0.13
## 2 Phoenix                 427           5488         0.08
## 3 San Diego                85            680         0.12
## 4 San Francisco           231           1054         0.22
## 5 Seattle                 366           2408         0.15

# Create the bar chart
p1 <- ggplot(destination_delay, aes(x = destination)) +
  geom_bar(aes(y = prop_delayed), stat = "identity", position = "dodge", fill = "blue", width = 0.4) +
  labs(title = "Total of Delayed Flights by Destination",
       x = "Destination",
       y = "Total Delayed") +
  theme_minimal()

p1

p2 <- ggplot(destination_delay, aes(x = destination)) +
  geom_bar(aes(y = total_arrivals), stat = "identity", position = "dodge", fill = "red", width = 0.4) +
  labs(title = "Total of Delayed Flights by Destination",
       x = "Destination",
       y = "Total Delayed") +
  theme_minimal()

p2

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

grid.arrange(p1, p2, ncol = 1)

Conclusion

AMWEST provides the most flights and has the highest proportion of delayed flights for all destinations.
Phoenix is the destination with the most arrivals but with the lowest proportion of delayed arrivals.
San Francisco, though it comes third to last in arrivals flight has the highest proportion of delayed arrivals.
For any given destination, ALASKA has the lowest proportion of delayed flights.

Tidying and Transforming Data

Fomba Kassoh, Group Member: Souleymane Doumbia

2023-10-08

Load the relevant libraries

Create database in sql

3. If you want to get to your destination on time, which airlines will you use?

Conclusion