Section1: Read my CSV file from Github and Reassign missing value for column x
myurl <- "https://raw.githubusercontent.com/BanuB/Week5AssignmentDATA607/master/Sample4.csv"
csvdata <- read.csv(file=myurl, header=TRUE,sep=",",stringsAsFactors = FALSE,na.strings=c("NA"))
filename <- "flightanalysis.csv"
str(csvdata)
## 'data.frame': 5 obs. of 7 variables:
## $ X : chr "Alaska" "" "" "AM WEST" ...
## $ X.1 : chr "on time" "delayed" "" "on time" ...
## $ Los.Angeles : int 497 62 NA 694 117
## $ Phoenix : chr "221" "12" "" "4,840" ...
## $ San.Diego : int 212 20 NA 383 65
## $ San.Fransisco: int 503 102 NA 320 129
## $ Seattle : chr "1,841" "305" "" "201" ...
head(csvdata)
## X X.1 Los.Angeles Phoenix San.Diego San.Fransisco Seattle
## 1 Alaska on time 497 221 212 503 1,841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA
## 4 AM WEST on time 694 4,840 383 320 201
## 5 delayed 117 415 65 129 61
#Reassign missing value for column X
csvdata[2,1] <- csvdata$X[1]
csvdata[5,1] <- csvdata$X[4]
Section2: Tidy Step 1 to gather, mutate and omit to remove NA and rename column names Airline Flight_Status
#This csvdata appears to be wide since variable (represented as destinationcity) is such that each city is a variable. We can convert the values appearing in each column as one column variable and bring the values to be populated for that column variable.
#change layout of the dataset from wide to long format using tidyr. Remove NA rows
long_data <- csvdata %>% gather(Destination_City, "Num_of_flights", Los.Angeles:Seattle)%>%
mutate(Num_of_flights = ifelse(Num_of_flights == "", NA, Num_of_flights))%>%
na.omit
#assign column names
colnames(long_data)[1] <- "Airline"
colnames(long_data)[2] <- "Flight_Status"
Section3: Tidy Step 2 change type of column to Numeric for Num_of_flights, use spread() Flight_Status, replace space in “on time” to aid in easy reading
#change type to numeric
long_data
## Airline Flight_Status Destination_City Num_of_flights
## 1 Alaska on time Los.Angeles 497
## 2 Alaska delayed Los.Angeles 62
## 4 AM WEST on time Los.Angeles 694
## 5 AM WEST delayed Los.Angeles 117
## 6 Alaska on time Phoenix 221
## 7 Alaska delayed Phoenix 12
## 9 AM WEST on time Phoenix 4,840
## 10 AM WEST delayed Phoenix 415
## 11 Alaska on time San.Diego 212
## 12 Alaska delayed San.Diego 20
## 14 AM WEST on time San.Diego 383
## 15 AM WEST delayed San.Diego 65
## 16 Alaska on time San.Fransisco 503
## 17 Alaska delayed San.Fransisco 102
## 19 AM WEST on time San.Fransisco 320
## 20 AM WEST delayed San.Fransisco 129
## 21 Alaska on time Seattle 1,841
## 22 Alaska delayed Seattle 305
## 24 AM WEST on time Seattle 201
## 25 AM WEST delayed Seattle 61
long_data[,"Num_of_flights"] <- as.numeric(parse_number(long_data$Num_of_flights))
sapply(long_data,mode)
## Airline Flight_Status Destination_City Num_of_flights
## "character" "character" "character" "numeric"
#use tidyR to spread the flight status
long_data1 <- long_data %>% spread(Flight_Status, Num_of_flights)
long_data1
## Airline Destination_City delayed on time
## 1 Alaska Los.Angeles 62 497
## 2 Alaska Phoenix 12 221
## 3 Alaska San.Diego 20 212
## 4 Alaska San.Fransisco 102 503
## 5 Alaska Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Fransisco 129 320
## 10 AM WEST Seattle 61 201
#replace the on time with _ so there is no space in the column name
names(long_data1) <- names(long_data1) %>% str_replace_all("\\s","_")
sapply(long_data1,mode)
## Airline Destination_City delayed on_time
## "character" "character" "numeric" "numeric"
long_data1 %>% kable() %>% kable_styling(bootstrap_options = c("striped","hover","condensed"))
|
Airline
|
Destination_City
|
delayed
|
on_time
|
|
Alaska
|
Los.Angeles
|
62
|
497
|
|
Alaska
|
Phoenix
|
12
|
221
|
|
Alaska
|
San.Diego
|
20
|
212
|
|
Alaska
|
San.Fransisco
|
102
|
503
|
|
Alaska
|
Seattle
|
305
|
1841
|
|
AM WEST
|
Los.Angeles
|
117
|
694
|
|
AM WEST
|
Phoenix
|
415
|
4840
|
|
AM WEST
|
San.Diego
|
65
|
383
|
|
AM WEST
|
San.Fransisco
|
129
|
320
|
|
AM WEST
|
Seattle
|
61
|
201
|
Section5: Now we have long_data2 in our tidy format for analysis. Group by Airline to find percent delay and percent on time value
#Arrange By Destination_City
long_data2 <- long_data1 %>% arrange(Destination_City)
long_data3 <- long_data2 %>%
group_by(Airline) %>%
mutate(cum_delayed = cumsum(delayed), cum_ontime = cumsum(on_time)) %>%
mutate (total = delayed + on_time, percentdelay = round(delayed/(delayed+on_time)*100,2), percentontime = round(on_time/(delayed+on_time)*100,2))
long_data3 %>% kable() %>% kable_styling(bootstrap_options = c("striped","hover","condensed"))
|
Airline
|
Destination_City
|
delayed
|
on_time
|
cum_delayed
|
cum_ontime
|
total
|
percentdelay
|
percentontime
|
|
Alaska
|
Los.Angeles
|
62
|
497
|
62
|
497
|
559
|
11.09
|
88.91
|
|
AM WEST
|
Los.Angeles
|
117
|
694
|
117
|
694
|
811
|
14.43
|
85.57
|
|
Alaska
|
Phoenix
|
12
|
221
|
74
|
718
|
233
|
5.15
|
94.85
|
|
AM WEST
|
Phoenix
|
415
|
4840
|
532
|
5534
|
5255
|
7.90
|
92.10
|
|
Alaska
|
San.Diego
|
20
|
212
|
94
|
930
|
232
|
8.62
|
91.38
|
|
AM WEST
|
San.Diego
|
65
|
383
|
597
|
5917
|
448
|
14.51
|
85.49
|
|
Alaska
|
San.Fransisco
|
102
|
503
|
196
|
1433
|
605
|
16.86
|
83.14
|
|
AM WEST
|
San.Fransisco
|
129
|
320
|
726
|
6237
|
449
|
28.73
|
71.27
|
|
Alaska
|
Seattle
|
305
|
1841
|
501
|
3274
|
2146
|
14.21
|
85.79
|
|
AM WEST
|
Seattle
|
61
|
201
|
787
|
6438
|
262
|
23.28
|
76.72
|
#report percentdelay by DestinationCity and Airline :AM WEST HAS HIGHER DELAY THAN ALASKA FOR EACH CITY
ggplot(long_data3,aes(x=Destination_City,y=percentdelay))+geom_bar(aes(fill=Airline),stat="identity",position="dodge")+geom_text(aes(label=..y..),stat="identity",vjust=-.5,hjust=0)+labs(x="Destination_City",y="percentdelay",fill="Airline") + ggtitle("percentdelay by DestinationCity: AM WEST HAS HIGHER DELAY THAN ALASKA FOR EACH CITY")

#report percentontime by DestinationCity and Airline
ggplot(long_data3,aes(x=Destination_City,y=percentontime))+geom_bar(aes(fill=Airline),stat="identity",position="dodge")+geom_text(aes(label=..y..),check_overlap = FALSE,size =3, stat="identity",vjust=-.5,hjust=.5)+labs(x="Destination_City",y="percentontime",fill="Airline") +ggtitle("percentontime by DestinationCity")

#report flightsflown to each city by Airline : AM WEST HAS FLOWN THE MOST TO PHOENIX WHEREAS ALASKA IS FLYING THE MOST TO SEATTLE
ggplot(long_data3)+ geom_bar(aes(x=Airline, y=total,fill=Airline),stat = "identity") + facet_wrap(~Destination_City,nrow=2)+ ggtitle("flights flown to each city by Airline: AM WEST HAS FLOWN THE MOST TO \n\ PHOENIX WHEREAS ALASKA IS FLYING THE MOST TO SEATTLE")

Section6: Perform summary report stats
#summarize Analysis 1 - group by Airline, get total flights delayed and on time
summarystats <- long_data1 %>%
group_by(Airline) %>%
summarize(total_flights_delayed = sum(delayed), total_flights_ontime = sum(on_time))
summarystats
## # A tibble: 2 x 3
## Airline total_flights_delayed total_flights_ontime
## <chr> <dbl> <dbl>
## 1 Alaska 501 3274
## 2 AM WEST 787 6438
summarystats %>% kable() %>% kable_styling(bootstrap_options = c("striped","hover","condensed"))
|
Airline
|
total_flights_delayed
|
total_flights_ontime
|
|
Alaska
|
501
|
3274
|
|
AM WEST
|
787
|
6438
|
#summarize Analysis 2 - group by Destination_City, get total flights delayed and on time
summarystats1 <- long_data1 %>%
group_by(Destination_City) %>%
summarize(total_flights_delayed = sum(delayed), total_flights_ontime = sum(on_time))
summarystats
## # A tibble: 2 x 3
## Airline total_flights_delayed total_flights_ontime
## <chr> <dbl> <dbl>
## 1 Alaska 501 3274
## 2 AM WEST 787 6438
summarystats1 %>% kable() %>% kable_styling(bootstrap_options = c("striped","hover","condensed"))
|
Destination_City
|
total_flights_delayed
|
total_flights_ontime
|
|
Los.Angeles
|
179
|
1191
|
|
Phoenix
|
427
|
5061
|
|
San.Diego
|
85
|
595
|
|
San.Fransisco
|
231
|
823
|
|
Seattle
|
366
|
2042
|
#summarize the airline and the delayed% of an airline and delayed% of airlineontime
summarystats2 <- summarystats %>% mutate(totalflights = total_flights_delayed + total_flights_ontime,
airlinedelay = round((total_flights_delayed / totalflights) * 100, 2),
airlineontime = round((total_flights_ontime / totalflights) * 100, 2))
summarystats2 %>% kable() %>% kable_styling(bootstrap_options = c("striped","hover","condensed"))
|
Airline
|
total_flights_delayed
|
total_flights_ontime
|
totalflights
|
airlinedelay
|
airlineontime
|
|
Alaska
|
501
|
3274
|
3775
|
13.27
|
86.73
|
|
AM WEST
|
787
|
6438
|
7225
|
10.89
|
89.11
|
#summarize the airline and the delayed% and ontime% of flying to a city
summarystats3 <- summarystats1 %>% mutate(totalflights = total_flights_delayed + total_flights_ontime,
citydelay = round((total_flights_delayed / totalflights) * 100, 2),
cityontime = round((total_flights_ontime/ totalflights) * 100, 2))
summarystats3 %>% kable() %>% kable_styling(bootstrap_options = c("striped","hover","condensed"))
|
Destination_City
|
total_flights_delayed
|
total_flights_ontime
|
totalflights
|
citydelay
|
cityontime
|
|
Los.Angeles
|
179
|
1191
|
1370
|
13.07
|
86.93
|
|
Phoenix
|
427
|
5061
|
5488
|
7.78
|
92.22
|
|
San.Diego
|
85
|
595
|
680
|
12.50
|
87.50
|
|
San.Fransisco
|
231
|
823
|
1054
|
21.92
|
78.08
|
|
Seattle
|
366
|
2042
|
2408
|
15.20
|
84.80
|
Section7: Plot using melt and select and ANALYSIS
- AM WEST has more flights into PHOENIX (TOTAL = 5255). Phoenix overall city delay is (7.78%).
- ALASKA has more flights into SEATTLE (TOTAL = 2146). Seattle overall city delay is (15.20%).
- While AM WEST HAS HIGHER DELAY THAN ALASKA FOR EACH CITY, the overall Airline delay for all destinations is higher on ALASKA(13.27) AND AM WEST IS (10.89). Since the ontime arrival% is comparable across each city for both airline, Alaska is disadvantaged because of a smaller fleet flying as AM WEST overall %delay is better due to its larger fleet or total flights flown. A traveller could rely on ontimearrival%/CITY as a good predictor when selecting flights to fly through either Alaska or AMWEST or simple Alaska could guide a customer to its lower delay% across each city through its Airline.
#AM WEST has MORE FLIGHTS INTO PHOENIX THAN ALASKA WHEREAS ALASKA IS FLYING TO SEATTLE THE MOST
showcounts1 <- select(long_data3,Airline, Destination_City,total) %>%
melt() %>%
ggplot(aes(x = Destination_City, y = value, fill = variable))+
theme(axis.text.x=element_text(angle=90,hjust=1))+
geom_bar(stat = 'identity')+
geom_text(size =5, aes(label=value), position = position_stack(vjust = 0.5)) +
labs(y = "flights") +
facet_grid(~Airline)
## Using Airline, Destination_City as id variables
showcounts2 <- long_data2 %>%
melt() %>%
ggplot(aes(x = Destination_City, y = value, fill = variable))+
theme(axis.text.x=element_text(angle=90,hjust=1))+
geom_bar(stat = 'identity')+
geom_text(size =4, aes(label=value), position = position_stack(vjust = 0.5)) +
labs(y = "flights") +
facet_grid(~Airline)
## Using Airline, Destination_City as id variables
showcounts1

showcounts2
