Section1: Read my CSV file from Github and Reassign missing value for column x

myurl <- "https://raw.githubusercontent.com/BanuB/Week5AssignmentDATA607/master/Sample4.csv"
csvdata <- read.csv(file=myurl, header=TRUE,sep=",",stringsAsFactors = FALSE,na.strings=c("NA"))
filename <- "flightanalysis.csv"

str(csvdata)
## 'data.frame':    5 obs. of  7 variables:
##  $ X            : chr  "Alaska" "" "" "AM WEST" ...
##  $ X.1          : chr  "on time" "delayed" "" "on time" ...
##  $ Los.Angeles  : int  497 62 NA 694 117
##  $ Phoenix      : chr  "221" "12" "" "4,840" ...
##  $ San.Diego    : int  212 20 NA 383 65
##  $ San.Fransisco: int  503 102 NA 320 129
##  $ Seattle      : chr  "1,841" "305" "" "201" ...
head(csvdata)
##         X     X.1 Los.Angeles Phoenix San.Diego San.Fransisco Seattle
## 1  Alaska on time         497     221       212           503   1,841
## 2         delayed          62      12        20           102     305
## 3                          NA                NA            NA        
## 4 AM WEST on time         694   4,840       383           320     201
## 5         delayed         117     415        65           129      61
#Reassign missing value for column X
csvdata[2,1] <- csvdata$X[1]
csvdata[5,1] <- csvdata$X[4]

Section2: Tidy Step 1 to gather, mutate and omit to remove NA and rename column names Airline Flight_Status

#This csvdata appears to be wide since variable (represented as destinationcity) is such that each city is a variable. We can convert the values appearing in each column as one column variable and bring the values to be populated for that column variable.
#change layout of the dataset from wide to long format using tidyr. Remove NA rows
long_data <- csvdata %>% gather(Destination_City, "Num_of_flights", Los.Angeles:Seattle)%>% 
  mutate(Num_of_flights = ifelse(Num_of_flights == "", NA, Num_of_flights))%>% 
  na.omit 

#assign column names
colnames(long_data)[1] <- "Airline" 
colnames(long_data)[2] <- "Flight_Status"

Section3: Tidy Step 2 change type of column to Numeric for Num_of_flights, use spread() Flight_Status, replace space in “on time” to aid in easy reading

#change type to numeric 
long_data
##    Airline Flight_Status Destination_City Num_of_flights
## 1   Alaska       on time      Los.Angeles            497
## 2   Alaska       delayed      Los.Angeles             62
## 4  AM WEST       on time      Los.Angeles            694
## 5  AM WEST       delayed      Los.Angeles            117
## 6   Alaska       on time          Phoenix            221
## 7   Alaska       delayed          Phoenix             12
## 9  AM WEST       on time          Phoenix          4,840
## 10 AM WEST       delayed          Phoenix            415
## 11  Alaska       on time        San.Diego            212
## 12  Alaska       delayed        San.Diego             20
## 14 AM WEST       on time        San.Diego            383
## 15 AM WEST       delayed        San.Diego             65
## 16  Alaska       on time    San.Fransisco            503
## 17  Alaska       delayed    San.Fransisco            102
## 19 AM WEST       on time    San.Fransisco            320
## 20 AM WEST       delayed    San.Fransisco            129
## 21  Alaska       on time          Seattle          1,841
## 22  Alaska       delayed          Seattle            305
## 24 AM WEST       on time          Seattle            201
## 25 AM WEST       delayed          Seattle             61
long_data[,"Num_of_flights"]  <- as.numeric(parse_number(long_data$Num_of_flights))
sapply(long_data,mode)
##          Airline    Flight_Status Destination_City   Num_of_flights 
##      "character"      "character"      "character"        "numeric"
#use tidyR to spread the flight status
long_data1 <- long_data %>% spread(Flight_Status, Num_of_flights)
long_data1
##    Airline Destination_City delayed on time
## 1   Alaska      Los.Angeles      62     497
## 2   Alaska          Phoenix      12     221
## 3   Alaska        San.Diego      20     212
## 4   Alaska    San.Fransisco     102     503
## 5   Alaska          Seattle     305    1841
## 6  AM WEST      Los.Angeles     117     694
## 7  AM WEST          Phoenix     415    4840
## 8  AM WEST        San.Diego      65     383
## 9  AM WEST    San.Fransisco     129     320
## 10 AM WEST          Seattle      61     201
#replace the on time with _ so there is no space in the column name
names(long_data1) <- names(long_data1) %>% str_replace_all("\\s","_") 
sapply(long_data1,mode)
##          Airline Destination_City          delayed          on_time 
##      "character"      "character"        "numeric"        "numeric"
long_data1 %>% kable() %>% kable_styling(bootstrap_options = c("striped","hover","condensed"))
Airline Destination_City delayed on_time
Alaska Los.Angeles 62 497
Alaska Phoenix 12 221
Alaska San.Diego 20 212
Alaska San.Fransisco 102 503
Alaska Seattle 305 1841
AM WEST Los.Angeles 117 694
AM WEST Phoenix 415 4840
AM WEST San.Diego 65 383
AM WEST San.Fransisco 129 320
AM WEST Seattle 61 201

Section5: Now we have long_data2 in our tidy format for analysis. Group by Airline to find percent delay and percent on time value

#Arrange By Destination_City
long_data2 <- long_data1 %>% arrange(Destination_City)


long_data3 <- long_data2 %>%
  group_by(Airline) %>%
  mutate(cum_delayed = cumsum(delayed), cum_ontime = cumsum(on_time)) %>%
  mutate (total = delayed + on_time, percentdelay = round(delayed/(delayed+on_time)*100,2), percentontime = round(on_time/(delayed+on_time)*100,2))
long_data3 %>% kable() %>% kable_styling(bootstrap_options = c("striped","hover","condensed"))
Airline Destination_City delayed on_time cum_delayed cum_ontime total percentdelay percentontime
Alaska Los.Angeles 62 497 62 497 559 11.09 88.91
AM WEST Los.Angeles 117 694 117 694 811 14.43 85.57
Alaska Phoenix 12 221 74 718 233 5.15 94.85
AM WEST Phoenix 415 4840 532 5534 5255 7.90 92.10
Alaska San.Diego 20 212 94 930 232 8.62 91.38
AM WEST San.Diego 65 383 597 5917 448 14.51 85.49
Alaska San.Fransisco 102 503 196 1433 605 16.86 83.14
AM WEST San.Fransisco 129 320 726 6237 449 28.73 71.27
Alaska Seattle 305 1841 501 3274 2146 14.21 85.79
AM WEST Seattle 61 201 787 6438 262 23.28 76.72
#report percentdelay by DestinationCity and Airline :AM WEST HAS HIGHER DELAY THAN ALASKA FOR EACH CITY
ggplot(long_data3,aes(x=Destination_City,y=percentdelay))+geom_bar(aes(fill=Airline),stat="identity",position="dodge")+geom_text(aes(label=..y..),stat="identity",vjust=-.5,hjust=0)+labs(x="Destination_City",y="percentdelay",fill="Airline") + ggtitle("percentdelay by DestinationCity: AM WEST HAS HIGHER DELAY THAN ALASKA FOR EACH CITY")

#report percentontime by DestinationCity and Airline 
ggplot(long_data3,aes(x=Destination_City,y=percentontime))+geom_bar(aes(fill=Airline),stat="identity",position="dodge")+geom_text(aes(label=..y..),check_overlap = FALSE,size =3, stat="identity",vjust=-.5,hjust=.5)+labs(x="Destination_City",y="percentontime",fill="Airline")  +ggtitle("percentontime by DestinationCity")

#report flightsflown to each city by Airline : AM WEST HAS FLOWN THE MOST TO PHOENIX WHEREAS ALASKA IS FLYING THE MOST TO SEATTLE
ggplot(long_data3)+ geom_bar(aes(x=Airline, y=total,fill=Airline),stat = "identity") + facet_wrap(~Destination_City,nrow=2)+ ggtitle("flights flown to each city by Airline: AM WEST HAS FLOWN THE MOST TO \n\ PHOENIX WHEREAS ALASKA IS FLYING THE MOST TO SEATTLE")

Section6: Perform summary report stats

#summarize Analysis 1 - group by Airline, get total flights delayed and on time
summarystats <- long_data1 %>% 
  group_by(Airline) %>% 
  summarize(total_flights_delayed = sum(delayed), total_flights_ontime = sum(on_time))
summarystats
## # A tibble: 2 x 3
##   Airline total_flights_delayed total_flights_ontime
##   <chr>                   <dbl>                <dbl>
## 1 Alaska                    501                 3274
## 2 AM WEST                   787                 6438
summarystats %>% kable() %>% kable_styling(bootstrap_options = c("striped","hover","condensed"))
Airline total_flights_delayed total_flights_ontime
Alaska 501 3274
AM WEST 787 6438
#summarize Analysis 2 - group by Destination_City, get total flights delayed and on time
summarystats1 <- long_data1 %>% 
  group_by(Destination_City) %>% 
  summarize(total_flights_delayed = sum(delayed), total_flights_ontime = sum(on_time))
summarystats
## # A tibble: 2 x 3
##   Airline total_flights_delayed total_flights_ontime
##   <chr>                   <dbl>                <dbl>
## 1 Alaska                    501                 3274
## 2 AM WEST                   787                 6438
summarystats1 %>% kable() %>% kable_styling(bootstrap_options = c("striped","hover","condensed"))
Destination_City total_flights_delayed total_flights_ontime
Los.Angeles 179 1191
Phoenix 427 5061
San.Diego 85 595
San.Fransisco 231 823
Seattle 366 2042
#summarize the airline and the delayed% of an airline and delayed% of airlineontime
summarystats2 <- summarystats %>% mutate(totalflights = total_flights_delayed + total_flights_ontime, 
                        airlinedelay = round((total_flights_delayed / totalflights) * 100, 2),
                        airlineontime = round((total_flights_ontime / totalflights) * 100, 2))
summarystats2 %>% kable() %>% kable_styling(bootstrap_options = c("striped","hover","condensed"))
Airline total_flights_delayed total_flights_ontime totalflights airlinedelay airlineontime
Alaska 501 3274 3775 13.27 86.73
AM WEST 787 6438 7225 10.89 89.11
#summarize the airline and the delayed% and ontime% of flying to a city
summarystats3 <- summarystats1   %>% mutate(totalflights = total_flights_delayed + total_flights_ontime, 
                         citydelay = round((total_flights_delayed / totalflights) * 100, 2),
                         cityontime = round((total_flights_ontime/ totalflights) * 100, 2))
summarystats3 %>% kable() %>% kable_styling(bootstrap_options = c("striped","hover","condensed"))
Destination_City total_flights_delayed total_flights_ontime totalflights citydelay cityontime
Los.Angeles 179 1191 1370 13.07 86.93
Phoenix 427 5061 5488 7.78 92.22
San.Diego 85 595 680 12.50 87.50
San.Fransisco 231 823 1054 21.92 78.08
Seattle 366 2042 2408 15.20 84.80

Section7: Plot using melt and select and ANALYSIS

  1. AM WEST has more flights into PHOENIX (TOTAL = 5255). Phoenix overall city delay is (7.78%).
  2. ALASKA has more flights into SEATTLE (TOTAL = 2146). Seattle overall city delay is (15.20%).
  3. While AM WEST HAS HIGHER DELAY THAN ALASKA FOR EACH CITY, the overall Airline delay for all destinations is higher on ALASKA(13.27) AND AM WEST IS (10.89). Since the ontime arrival% is comparable across each city for both airline, Alaska is disadvantaged because of a smaller fleet flying as AM WEST overall %delay is better due to its larger fleet or total flights flown. A traveller could rely on ontimearrival%/CITY as a good predictor when selecting flights to fly through either Alaska or AMWEST or simple Alaska could guide a customer to its lower delay% across each city through its Airline.
#AM WEST has MORE FLIGHTS INTO PHOENIX THAN ALASKA WHEREAS ALASKA IS FLYING TO SEATTLE THE MOST
showcounts1 <- select(long_data3,Airline, Destination_City,total) %>% 
  melt() %>%
  ggplot(aes(x = Destination_City, y = value, fill = variable))+
  theme(axis.text.x=element_text(angle=90,hjust=1))+
  geom_bar(stat = 'identity')+
  geom_text(size =5, aes(label=value), position = position_stack(vjust = 0.5)) +
  labs(y = "flights") +
  facet_grid(~Airline)
## Using Airline, Destination_City as id variables
showcounts2 <- long_data2 %>% 
  melt() %>%
  ggplot(aes(x = Destination_City, y = value, fill = variable))+ 
  theme(axis.text.x=element_text(angle=90,hjust=1))+
  geom_bar(stat = 'identity')+
  geom_text(size =4, aes(label=value), position = position_stack(vjust = 0.5)) +
  labs(y = "flights") +
  facet_grid(~Airline) 
## Using Airline, Destination_City as id variables
showcounts1

showcounts2