# Load the raw flight-status table; text columns stay as character vectors
rawdata1 <- read.csv(
  file = "ASGMNT5RAW.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
rawdata1
## X X.1 LAX PHX SAN SFO SEA
## 1 ALASKA On-Time 497 221 212 503 1,841
## 2 Delayed 62 12 20 102 305
## 3 NA NA NA
## 4 AM WEST On-Time 694 4,840 383 320 201
## 5 Delayed 117 415 65 129 61
# Drop row 3, the blank separator line between the two airlines; the
# remaining rows keep their original row names (1, 2, 4, 5)
rawdata2 <- rawdata1[seq_len(nrow(rawdata1)) != 3, ]
rawdata2
## X X.1 LAX PHX SAN SFO SEA
## 1 ALASKA On-Time 497 221 212 503 1,841
## 2 Delayed 62 12 20 102 305
## 4 AM WEST On-Time 694 4,840 383 320 201
## 5 Delayed 117 415 65 129 61
# Fill in the airline (first column) on the rows where the source left it
# blank, and write "AM WEST" as "AM_WEST" on both of its rows
rawdata2[2:4, 1] <- c("ALASKA", "AM_WEST", "AM_WEST")
rawdata2
## X X.1 LAX PHX SAN SFO SEA
## 1 ALASKA On-Time 497 221 212 503 1,841
## 2 ALASKA Delayed 62 12 20 102 305
## 4 AM_WEST On-Time 694 4,840 383 320 201
## 5 AM_WEST Delayed 117 415 65 129 61
# Give the seven columns descriptive names: the two unnamed key columns, then
# the destinations in the same left-to-right order as the source file
colnames(rawdata2)[1:7] <- c(
  "airline", "flightStatus",
  "Los_Angeles", "Phoenix", "San_Diego", "San_Francisco", "Seattle"
)
rawdata2
## airline flightStatus Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 ALASKA On-Time 497 221 212 503 1,841
## 2 ALASKA Delayed 62 12 20 102 305
## 4 AM_WEST On-Time 694 4,840 383 320 201
## 5 AM_WEST Delayed 117 415 65 129 61
# Reshape from wide to long with tidyr's gather(): the five destination columns
# collapse into a "destination" key column and a "tally" value column, giving
# one row per airline / flight status / destination.
# Approach sourced from
# https://rstudio.com/resources/webinars/data-wrangling-with-r-and-rstudio/
# The call is namespace-qualified so it does not depend on tidyr having been
# attached earlier with library().
rawdata3 <- tidyr::gather(
  rawdata2,
  key = "destination",
  value = "tally",
  Los_Angeles:Seattle
)
rawdata3
## airline flightStatus destination tally
## 1 ALASKA On-Time Los_Angeles 497
## 2 ALASKA Delayed Los_Angeles 62
## 3 AM_WEST On-Time Los_Angeles 694
## 4 AM_WEST Delayed Los_Angeles 117
## 5 ALASKA On-Time Phoenix 221
## 6 ALASKA Delayed Phoenix 12
## 7 AM_WEST On-Time Phoenix 4,840
## 8 AM_WEST Delayed Phoenix 415
## 9 ALASKA On-Time San_Diego 212
## 10 ALASKA Delayed San_Diego 20
## 11 AM_WEST On-Time San_Diego 383
## 12 AM_WEST Delayed San_Diego 65
## 13 ALASKA On-Time San_Francisco 503
## 14 ALASKA Delayed San_Francisco 102
## 15 AM_WEST On-Time San_Francisco 320
## 16 AM_WEST Delayed San_Francisco 129
## 17 ALASKA On-Time Seattle 1,841
## 18 ALASKA Delayed Seattle 305
## 19 AM_WEST On-Time Seattle 201
## 20 AM_WEST Delayed Seattle 61
# Replace the "flightStatus" column with one column per status ("Delayed" and
# "On-Time") using tidyr's spread(), so each airline / destination pair is a
# single row holding both tallies.
# Approach sourced from
# https://rstudio.com/resources/webinars/data-wrangling-with-r-and-rstudio/
# The call is namespace-qualified so it does not depend on tidyr having been
# attached earlier with library().
rawdata4 <- tidyr::spread(rawdata3, key = "flightStatus", value = "tally")
rawdata4
## airline destination Delayed On-Time
## 1 ALASKA Los_Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San_Diego 20 212
## 4 ALASKA San_Francisco 102 503
## 5 ALASKA Seattle 305 1,841
## 6 AM_WEST Los_Angeles 117 694
## 7 AM_WEST Phoenix 415 4,840
## 8 AM_WEST San_Diego 65 383
## 9 AM_WEST San_Francisco 129 320
## 10 AM_WEST Seattle 61 201
# Switch to lower-case, syntactic column names. The rename is positional, so it
# relies on the spread output listing "Delayed" before "On-Time".
names(rawdata4) <- replace(
  names(rawdata4),
  1:4,
  c("airline", "destination", "delayed", "ontime")
)
rawdata4
## airline destination delayed ontime
## 1 ALASKA Los_Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San_Diego 20 212
## 4 ALASKA San_Francisco 102 503
## 5 ALASKA Seattle 305 1,841
## 6 AM_WEST Los_Angeles 117 694
## 7 AM_WEST Phoenix 415 4,840
## 8 AM_WEST San_Diego 65 383
## 9 AM_WEST San_Francisco 129 320
## 10 AM_WEST Seattle 61 201
# Build the final analysis table. After the reshaping steps the count columns
# are still character strings, and some carry a thousands separator (for
# example "1,841"), so strip the commas and convert them to integers; without
# this, arithmetic on the counts would fail.
finaldata <- data.frame(rawdata4)
finaldata[c("delayed", "ontime")] <- lapply(
  finaldata[c("delayed", "ontime")],
  function(counts) as.integer(gsub(",", "", counts, fixed = TRUE))
)
# Confirm the class of the object that was just created
class(finaldata)
## [1] "data.frame"