# Upload the libraries needed.
library(tidyr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ dplyr 1.0.9
## ✔ tibble 3.1.8 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
# Import the data from github.
# Link is provided to the csv file below:
# https://github.com/enidroman/data_607_data_acquisition_and_management
urlfile <- "https://raw.githubusercontent.com/enidroman/data_607_data_acquisition_and_management/main/Tidying%20and%20Transforming%20Data.csv"
table <- read.csv(urlfile)
table
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA
## 4 AM WEST on time 694 4,840 383 320 201
## 5 delayed 117 415 65 129 61
table2 <- table %>%
rename(AIRLINE = X, STATUS = X.1) # Renamed column X = AIRLINE and X.1 = STATUS
table2 <- drop_na(table2) # Removed blank row that seperates the airlines.
table2[table2==""]<-NA # Bring down the Airlines name to be aligned with the Status Column.
table2 <- fill(table2, AIRLINE)
table2
## AIRLINE STATUS Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4,840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
table2$Phoenix <- as.integer(gsub(",","",table2$Phoenix)) # Removed comma in 4,480 in Phoenix to convert to Phoenix column to integer.
sapply(table2, class)
## AIRLINE STATUS Los.Angeles Phoenix San.Diego
## "character" "character" "integer" "integer" "integer"
## San.Francisco Seattle
## "integer" "character"
table2$Seattle <- as.integer(gsub(",","",table2$Seattle)) # Removed comma in 1,841 in Seattle to convert to Seattle column to integer.
sapply(table2, class)
## AIRLINE STATUS Los.Angeles Phoenix San.Diego
## "character" "character" "integer" "integer" "integer"
## San.Francisco Seattle
## "integer" "integer"
# Combined all city in City Column while aligning the cities with the airline names. Created a Delayed and On Time column while aligning the numbers with the cities.
table2 <- table2 %>%
gather(CITY,NUM_FLIGHTS, -AIRLINE, -STATUS) %>%
spread(STATUS, NUM_FLIGHTS)
colnames(table2) <- c('AIRLINE', 'CITY', 'DELAYED', 'ON_TIME')
table2$CITY <- str_replace_all(table2$CITY, "\\.", " ") # Replaced "," between the cities with a space.
table2
## AIRLINE CITY DELAYED ON_TIME
## 1 ALASKA Los Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San Diego 20 212
## 4 ALASKA San Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San Diego 65 383
## 9 AM WEST San Francisco 129 320
## 10 AM WEST Seattle 61 201
table3 <- table2 # Created a dataframe with just Airline, City, and Delayed.
select(table2, AIRLINE, CITY, DELAYED)
## AIRLINE CITY DELAYED
## 1 ALASKA Los Angeles 62
## 2 ALASKA Phoenix 12
## 3 ALASKA San Diego 20
## 4 ALASKA San Francisco 102
## 5 ALASKA Seattle 305
## 6 AM WEST Los Angeles 117
## 7 AM WEST Phoenix 415
## 8 AM WEST San Diego 65
## 9 AM WEST San Francisco 129
## 10 AM WEST Seattle 61
flights_delayed <- table3
ggplot(data = flights_delayed, mapping = aes(x = CITY, y = DELAYED, fill = AIRLINE, position = "dodge")) +
geom_point()
#### Here you can see that AM WEST had 286 more delays then Alaska. That
is an average of 57.2 more then Alaska. AM WEST had a median of 117
delays vs 62 delays of Alaska and a minimum of 61 delays vs 12 delays of
Alaska, and max of 415 delays vs 305 delays of Alaska. More
investigation has to be done to find the reason behind the delays in AM
West.
ggplot(flights_delayed, aes(x = CITY, y = DELAYED, fill = AIRLINE)) +
geom_col(position = "dodge")
#### Here you can see that AM WEST had 286 more delays then Alaska. That
is an average of 57.2 more then Alaska. AM WEST had a median of 117
delays vs 62 delays of Alaska and a minimum of 61 delays vs 12 delays of
Alaska, and max of 415 delays vs 305 delays of Alaska. More
investigation has to be done to find the reason behind the delays in AM
West.
flights_delayed %>% # Summary of delays for each airline.
group_by(AIRLINE) %>%
summarise(TOTAL_DELAYS = sum(DELAYED), # Total sum of delays for each airline.
AVG_NUM_DELAYS = mean(DELAYED), # Average of delays for each airline.
MEDIAN_DELAYS = median(DELAYED), # The median of delays for each airline.
MIN_DELAYS = min(DELAYED), # The minimum of delays for each airline.
MAX_DELAYS = max(DELAYED)) # The maximum of delays for each airline.
## # A tibble: 2 × 6
## AIRLINE TOTAL_DELAYS AVG_NUM_DELAYS MEDIAN_DELAYS MIN_DELAYS MAX_DELAYS
## <chr> <int> <dbl> <int> <int> <int>
## 1 ALASKA 501 100. 62 12 305
## 2 AM WEST 787 157. 117 61 415