options(repos = c(CRAN = "https://cran.rstudio.com"))
req_packages <- c("DBI","RMySQL","dplyr","dbplyr","knitr","tidyr", "readr", "stringr","tibble", "rmarkdown", "purrr", "lubridate", "here", "httr2", "RCurl","ggplot2")
for (pkg in req_packages) {
if (!require(pkg, character.only = TRUE)) {
message(paste("Installing package:", pkg))
install.packages(pkg, dependencies = TRUE)
} else {
message(paste(pkg, " already installed."))
}
library(pkg, character.only = TRUE)
}
## Loading required package: DBI
## DBI already installed.
## Loading required package: RMySQL
## RMySQL already installed.
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## dplyr already installed.
## Loading required package: dbplyr
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
## dbplyr already installed.
## Loading required package: knitr
## knitr already installed.
## Loading required package: tidyr
## tidyr already installed.
## Loading required package: readr
## readr already installed.
## Loading required package: stringr
## stringr already installed.
## Loading required package: tibble
## tibble already installed.
## Loading required package: rmarkdown
## rmarkdown already installed.
## Loading required package: purrr
## purrr already installed.
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## lubridate already installed.
## Loading required package: here
## here() starts at /Users/paulabrown/Documents/CUNY SPS- Data 607/Week 5 Assignments
## here already installed.
## Loading required package: httr2
## httr2 already installed.
## Loading required package: RCurl
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
## RCurl already installed.
## Loading required package: ggplot2
## ggplot2 already installed.
# Create a matrix that mimics the untidy structure
untidy_data <- data.frame(
X = c("ALASKA", "", "", "AM WEST",""),
Y = c("on time","delayed","","on time","delayed"),
`Los Angeles` = c(497, 62, "", 694, 117),
Phoenix = c(221, 12, "", 4840, 415),
`San Diego` = c(212, 20, "", 383, 65),
`San Francisco` = c(503, 102, "", 320, 129),
Seattle = c(1841, 305, "", 201, 61),
stringsAsFactors = FALSE
)
# Rename the first column to blank (like in the image)
names(untidy_data)[1] <- ""
We need a CSV of untidy data so that we can practice tidying and transforming the data with R.
# Save the untidy data to a CSV file
write.csv(untidy_data, here::here("untidy_flight_data.csv"), row.names = FALSE)
After creating the CSV containing untidy data, we must read it in R so we can begin tidying and transforming.
# Read the CSV
untidy_delays <- read.csv(here("untidy_flight_data.csv"), na.strings = c("", "NA"))
## na.strings = c("", "NA")) tells R to treat blanks and NA as missing values
##Summarize the data in the .csv file Let’s take a look at the untidy data we created. Check to see that it reflects the data in the “Airline Delay” image from Source: Numbersense, Kaiser Fung, McGraw Hill, 2013
knitr::kable(untidy_delays)
X | Y | Los.Angeles | Phoenix | San.Diego | San.Francisco | Seattle |
---|---|---|---|---|---|---|
ALASKA | on time | 497 | 221 | 212 | 503 | 1841 |
NA | delayed | 62 | 12 | 20 | 102 | 305 |
NA | NA | NA | NA | NA | NA | NA |
AM WEST | on time | 694 | 4840 | 383 | 320 | 201 |
NA | delayed | 117 | 415 | 65 | 129 | 61 |
# kable() = creates a nicely formatted table from a data frame or tibble.
We only want to see the rows with data - remove/filter out blank/NA rows.
## Remove rows with all NA values
untidy_delays <- untidy_delays %>%
filter(rowSums(is.na(.)) != ncol(.)) ## Filter to show rows that are NOT "NA"
Associate the airline with the row containing data for the “delayed” status.
## Fill missing airline names
tidy_delays <- untidy_delays %>%
tidyr::fill(X, .direction = "down") ##fill() tells R to fill in the column "X" and fill it going in a downward direction.
tidy_delays
## X Y Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
Transform table from wide to long by combining City names into 1 column named “City” and their values in another column named “Delay”
## Pivot city columns into long format
tidy_delays <- tidy_delays %>%
pivot_longer(cols = -c(X, Y), names_to = "City", values_to = "Delay")
Rename for clarity.
tidy_delays <- tidy_delays %>%
rename(Airline = X, Status = Y)
knitr::kable(tidy_delays)
Airline | Status | City | Delay |
---|---|---|---|
ALASKA | on time | Los.Angeles | 497 |
ALASKA | on time | Phoenix | 221 |
ALASKA | on time | San.Diego | 212 |
ALASKA | on time | San.Francisco | 503 |
ALASKA | on time | Seattle | 1841 |
ALASKA | delayed | Los.Angeles | 62 |
ALASKA | delayed | Phoenix | 12 |
ALASKA | delayed | San.Diego | 20 |
ALASKA | delayed | San.Francisco | 102 |
ALASKA | delayed | Seattle | 305 |
AM WEST | on time | Los.Angeles | 694 |
AM WEST | on time | Phoenix | 4840 |
AM WEST | on time | San.Diego | 383 |
AM WEST | on time | San.Francisco | 320 |
AM WEST | on time | Seattle | 201 |
AM WEST | delayed | Los.Angeles | 117 |
AM WEST | delayed | Phoenix | 415 |
AM WEST | delayed | San.Diego | 65 |
AM WEST | delayed | San.Francisco | 129 |
AM WEST | delayed | Seattle | 61 |
Based on the overall averages below, we can conclude that both airlines are mostly on time.
# Summary statistics
summary_stats <- tidy_delays %>%
group_by(Airline,Status) %>%
summarise(Average_Delay = mean(Delay), .groups = "drop")
print(summary_stats)
## # A tibble: 4 × 3
## Airline Status Average_Delay
## <chr> <chr> <dbl>
## 1 ALASKA delayed 100.
## 2 ALASKA on time 655.
## 3 AM WEST delayed 157.
## 4 AM WEST on time 1288.
# Summarize total flights per airline and status
overall_summary <- tidy_delays %>%
group_by(Airline, Status) %>%
summarise(Total = sum(Delay), .groups = "drop")
# Calculate percentage of each status per airline
overall_summary <- overall_summary %>%
group_by(Airline) %>%
mutate(Percentage = round((Total / sum(Total)) * 100, 2))
print(overall_summary)
## # A tibble: 4 × 4
## # Groups: Airline [2]
## Airline Status Total Percentage
## <chr> <chr> <int> <dbl>
## 1 ALASKA delayed 501 13.3
## 2 ALASKA on time 3274 86.7
## 3 AM WEST delayed 787 10.9
## 4 AM WEST on time 6438 89.1
On average both airlines perform well as they are often on time than and rarely delayed. It appears that there are more flights to Phoenix via AM West airline, and more flights to Seattle via Alaska airline. Overall, it seems AM West is a more popular airline and may have more flights which can skew the averages in its favor. Since we do not know the number of flights for each airline’s city we cannot really compare or give a recommendation. Another thing to factor in is how close in proximity each airport is to the destinations.
# Summary statistics
summary_stats <- tidy_delays %>%
group_by(Airline,Status,City) %>%
summarise(Average_Delay = mean(Delay), .groups = "drop")
print(summary_stats)
## # A tibble: 20 × 4
## Airline Status City Average_Delay
## <chr> <chr> <chr> <dbl>
## 1 ALASKA delayed Los.Angeles 62
## 2 ALASKA delayed Phoenix 12
## 3 ALASKA delayed San.Diego 20
## 4 ALASKA delayed San.Francisco 102
## 5 ALASKA delayed Seattle 305
## 6 ALASKA on time Los.Angeles 497
## 7 ALASKA on time Phoenix 221
## 8 ALASKA on time San.Diego 212
## 9 ALASKA on time San.Francisco 503
## 10 ALASKA on time Seattle 1841
## 11 AM WEST delayed Los.Angeles 117
## 12 AM WEST delayed Phoenix 415
## 13 AM WEST delayed San.Diego 65
## 14 AM WEST delayed San.Francisco 129
## 15 AM WEST delayed Seattle 61
## 16 AM WEST on time Los.Angeles 694
## 17 AM WEST on time Phoenix 4840
## 18 AM WEST on time San.Diego 383
## 19 AM WEST on time San.Francisco 320
## 20 AM WEST on time Seattle 201
# Plot overall percentages
ggplot(overall_summary, aes(x = Airline, y = Percentage, fill = Status)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Overall Arrival vs Delay Percentages by Airline",
y = "Percentage (%)", x = "Airline") +
scale_fill_manual(values = c("on time" = "lightblue", "delayed" = "pink")) +
theme_minimal()
## Summarize flights per airline, status, and city
city_summary <- tidy_delays %>%
group_by(Airline, City, Status) %>%
summarise(Total = sum(Delay), .groups = "drop")
## Calculate percentage of each status per airline per city
city_summary <- city_summary %>%
group_by(Airline, City) %>%
mutate(Percentage = round((Total / sum(Total)) * 100, 2))
print(city_summary)
## # A tibble: 20 × 5
## # Groups: Airline, City [10]
## Airline City Status Total Percentage
## <chr> <chr> <chr> <int> <dbl>
## 1 ALASKA Los.Angeles delayed 62 11.1
## 2 ALASKA Los.Angeles on time 497 88.9
## 3 ALASKA Phoenix delayed 12 5.15
## 4 ALASKA Phoenix on time 221 94.8
## 5 ALASKA San.Diego delayed 20 8.62
## 6 ALASKA San.Diego on time 212 91.4
## 7 ALASKA San.Francisco delayed 102 16.9
## 8 ALASKA San.Francisco on time 503 83.1
## 9 ALASKA Seattle delayed 305 14.2
## 10 ALASKA Seattle on time 1841 85.8
## 11 AM WEST Los.Angeles delayed 117 14.4
## 12 AM WEST Los.Angeles on time 694 85.6
## 13 AM WEST Phoenix delayed 415 7.9
## 14 AM WEST Phoenix on time 4840 92.1
## 15 AM WEST San.Diego delayed 65 14.5
## 16 AM WEST San.Diego on time 383 85.5
## 17 AM WEST San.Francisco delayed 129 28.7
## 18 AM WEST San.Francisco on time 320 71.3
## 19 AM WEST Seattle delayed 61 23.3
## 20 AM WEST Seattle on time 201 76.7
# Plot overall percentages
ggplot(city_summary, aes(x = City, y = Percentage, fill = Status)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "City Arrival vs City Percentages by Airline",
y = "Percentage (%)", x = "Airline") +
scale_fill_manual(values = c("on time" = "gray", "delayed" ="orange")) +
theme_minimal()