Assignment – Tidying and Transforming Data

Load data and libraries
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(readr)



uncleanflights <- read_csv("/Users/leslie/607/607data.csv")
## New names:
## • `` -> `...1`
## • `` -> `...2`
## Rows: 4 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): ...1, ...2
## dbl (5): Los Angeles, Phoenix, San Diego, San Francisco, Seattle
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
uncleanflights
## # A tibble: 4 × 7
##   ...1    ...2    `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>           <dbl>   <dbl>       <dbl>           <dbl>   <dbl>
## 1 ALASKA  on time           487     221         212             503    1841
## 2 <NA>    delayed            62      12          21             102     305
## 3 AM WEST on time           694    4840         383             320     201
## 4 <NA>    delayed           117     415          65             129      61
First, I will change the names of the first and second column to “airline” and “status”. This way any one who looks at my data knows what each column is for. Then I will add the airline name to the delayed rows 2 and 4. This will help me later when I switch from wide to long data. I will also omit any NA values and convert data to lowercase to enhance data intergerity and reduce errors.
names(uncleanflights)
## [1] "...1"          "...2"          "Los Angeles"   "Phoenix"      
## [5] "San Diego"     "San Francisco" "Seattle"
names(uncleanflights) <- c("airline", "status", "los_angeles", "phoenix", "san_diego", "san_francisco", "seattle")
print(uncleanflights)
## # A tibble: 4 × 7
##   airline status  los_angeles phoenix san_diego san_francisco seattle
##   <chr>   <chr>         <dbl>   <dbl>     <dbl>         <dbl>   <dbl>
## 1 ALASKA  on time         487     221       212           503    1841
## 2 <NA>    delayed          62      12        21           102     305
## 3 AM WEST on time         694    4840       383           320     201
## 4 <NA>    delayed         117     415        65           129      61
uncleanflights[2, "airline"] <- "ALASKA"
uncleanflights[4, "airline"] <- "AM WEST"

uncleanflights <- uncleanflights %>%
na.omit(uncleanflights)

uncleanflights <- uncleanflights %>%
mutate(airline = tolower(airline))

uncleanflights
## # A tibble: 4 × 7
##   airline status  los_angeles phoenix san_diego san_francisco seattle
##   <chr>   <chr>         <dbl>   <dbl>     <dbl>         <dbl>   <dbl>
## 1 alaska  on time         487     221       212           503    1841
## 2 alaska  delayed          62      12        21           102     305
## 3 am west on time         694    4840       383           320     201
## 4 am west delayed         117     415        65           129      61
Now, I will reshape my data from wide to long format.
long_flights <-
  pivot_longer(uncleanflights, cols = -c(airline, status), names_to = "destination", values_to = "flights")


long_flights
## # A tibble: 20 × 4
##    airline status  destination   flights
##    <chr>   <chr>   <chr>           <dbl>
##  1 alaska  on time los_angeles       487
##  2 alaska  on time phoenix           221
##  3 alaska  on time san_diego         212
##  4 alaska  on time san_francisco     503
##  5 alaska  on time seattle          1841
##  6 alaska  delayed los_angeles        62
##  7 alaska  delayed phoenix            12
##  8 alaska  delayed san_diego          21
##  9 alaska  delayed san_francisco     102
## 10 alaska  delayed seattle           305
## 11 am west on time los_angeles       694
## 12 am west on time phoenix          4840
## 13 am west on time san_diego         383
## 14 am west on time san_francisco     320
## 15 am west on time seattle           201
## 16 am west delayed los_angeles       117
## 17 am west delayed phoenix           415
## 18 am west delayed san_diego          65
## 19 am west delayed san_francisco     129
## 20 am west delayed seattle            61
The question I want to answer is: “Which airline experiences the highest number of flight delays?”. Here, I create a table isolating the number of delayed flights by the airline and destination. Then, I use this data to create a histogram. From the histogram, it seems like AM West experiences the highest number of delays. However, is it possible that AM West flies more frequently then Alaska?
delayed_flights <- long_flights %>%
                    filter(long_flights$status == "delayed")
delayed_flights
## # A tibble: 10 × 4
##    airline status  destination   flights
##    <chr>   <chr>   <chr>           <dbl>
##  1 alaska  delayed los_angeles        62
##  2 alaska  delayed phoenix            12
##  3 alaska  delayed san_diego          21
##  4 alaska  delayed san_francisco     102
##  5 alaska  delayed seattle           305
##  6 am west delayed los_angeles       117
##  7 am west delayed phoenix           415
##  8 am west delayed san_diego          65
##  9 am west delayed san_francisco     129
## 10 am west delayed seattle            61
library(ggplot2)

    ggp <- ggplot(data=delayed_flights, aes(x=destination, y=flights, fill=airline))
    ggp <- ggp +  ggtitle('Delayed Flights') + theme(plot.title = element_text(hjust = 0.5))
    ggp <- ggp + geom_text(aes(label=flights), vjust=-0.2,
                            position = position_dodge(0.9), size=3.5) +
                            scale_fill_brewer(palette="Paired") +
            geom_bar(stat="identity", position=position_dodge()) 
ggp

I think I need to continue to analyze this. I am going to try to make separate columns for delayed and on time. Then I want to group the data by airline, then find the percentage of delayed flight for each airline, then compare.
key_longdata <- gather(uncleanflights, "city", "count", 3:7)
key_longdata
## # A tibble: 20 × 4
##    airline status  city          count
##    <chr>   <chr>   <chr>         <dbl>
##  1 alaska  on time los_angeles     487
##  2 alaska  delayed los_angeles      62
##  3 am west on time los_angeles     694
##  4 am west delayed los_angeles     117
##  5 alaska  on time phoenix         221
##  6 alaska  delayed phoenix          12
##  7 am west on time phoenix        4840
##  8 am west delayed phoenix         415
##  9 alaska  on time san_diego       212
## 10 alaska  delayed san_diego        21
## 11 am west on time san_diego       383
## 12 am west delayed san_diego        65
## 13 alaska  on time san_francisco   503
## 14 alaska  delayed san_francisco   102
## 15 am west on time san_francisco   320
## 16 am west delayed san_francisco   129
## 17 alaska  on time seattle        1841
## 18 alaska  delayed seattle         305
## 19 am west on time seattle         201
## 20 am west delayed seattle          61
library(DT)
transformed <- spread(key_longdata,status,count)
datatable(transformed, extensions = 'Scroller', options = list(scrollY = 200, scroller = TRUE ))
I want to compare the percentage of delayed flights by city and airline. I will plot this to see if the percentages of delays for each city are similar for both airlines.
In this histogram, AM West has a higher percentage of delayed flights for every single city! I think this histogram is better for data analysis then my previous one because here I take into consideration the total amount of flights for each airline. In this histogram, we can see that although AM West has a higher percentage of flight delays per city, Alaska Airline also has similar percentages of flight delays.
Both airlines have the highest chance of flight delays when traveling to San Fransisco. If we had more data, perhaps on weather or air traffic we could analyze why does San Francisco has the highest delays.
percentage_delays <- transformed %>%
  mutate(
    total_flights = `on time` + delayed,  # Calculate total flights
    percentage_delayed = (delayed / total_flights) * 100  # Calculate percentage
  ) %>%
  select(airline, city, total_flights, delayed, percentage_delayed)  # Select relevant columns

# Display the results
print(percentage_delays)
## # A tibble: 10 × 5
##    airline city          total_flights delayed percentage_delayed
##    <chr>   <chr>                 <dbl>   <dbl>              <dbl>
##  1 alaska  los_angeles             549      62              11.3 
##  2 alaska  phoenix                 233      12               5.15
##  3 alaska  san_diego               233      21               9.01
##  4 alaska  san_francisco           605     102              16.9 
##  5 alaska  seattle                2146     305              14.2 
##  6 am west los_angeles             811     117              14.4 
##  7 am west phoenix                5255     415               7.90
##  8 am west san_diego               448      65              14.5 
##  9 am west san_francisco           449     129              28.7 
## 10 am west seattle                 262      61              23.3
ggplot(percentage_delays, aes(x = city, y = percentage_delayed, fill = airline)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(
    title = "Percentage of Delayed Flights by Airline and City",
    x = "City",
    y = "Percentage of Delayed Flights (%)"
  ) +
  theme_minimal() +
  scale_fill_brewer(palette = "Set1")  # Optional: Use a color palette

I am isolating the Alaska Airline data to look at it more closely.
alaska_delays <- transformed %>%
  filter(airline == "alaska")
print(alaska_delays)
## # A tibble: 5 × 4
##   airline city          delayed `on time`
##   <chr>   <chr>           <dbl>     <dbl>
## 1 alaska  los_angeles        62       487
## 2 alaska  phoenix            12       221
## 3 alaska  san_diego          21       212
## 4 alaska  san_francisco     102       503
## 5 alaska  seattle           305      1841
Now, I will do the same for AM WEST, isolate the data to look at it more closely
amwest_delays <- transformed %>%
  filter(airline == "am west")
print(amwest_delays)
## # A tibble: 5 × 4
##   airline city          delayed `on time`
##   <chr>   <chr>           <dbl>     <dbl>
## 1 am west los_angeles       117       694
## 2 am west phoenix           415      4840
## 3 am west san_diego          65       383
## 4 am west san_francisco     129       320
## 5 am west seattle            61       201
Lastly, I want to compare the total percentage of delayed flights for Alaska Airline and AM West Airline. AM WEST has a total delay percentage of 10.89%.Alaska has a total percentage of delayed flights of 13.33%.
total_delay_am_west <- percentage_delays %>%
  filter(airline == "am west") %>%  # Adjust the airline code if needed
  summarise(
    total_delayed = sum(delayed),
    total_flights = sum(total_flights),
    percentage_delayed = (total_delayed / total_flights) * 100
  )

# Display the result
print(total_delay_am_west)
## # A tibble: 1 × 3
##   total_delayed total_flights percentage_delayed
##           <dbl>         <dbl>              <dbl>
## 1           787          7225               10.9
total_delay_alaska <- percentage_delays %>%
  filter(airline == "alaska") %>%  # Adjust the airline code if needed
  summarise(
    total_delayed = sum(delayed),
    total_flights = sum(total_flights),
    percentage_delayed = (total_delayed / total_flights) * 100
  )

# Display the result
print(total_delay_alaska)
## # A tibble: 1 × 3
##   total_delayed total_flights percentage_delayed
##           <dbl>         <dbl>              <dbl>
## 1           502          3766               13.3
In conclusion, AM West Airlines has a lower percentage of delayed flights than initially perceived. I’m pleased that I continued to analyze the data, as my earlier analysis did not reveal this insight. Although AM West has a higher total number of flights, which might suggest higher delays, our recent findings clarify that the actual percentage of delayed flights is lower than expected.