The following steps must be implemented:
Install packages tidyr and dplyr.
Read csv file.
flights <- read.csv("/Users/olga/desktop/flights.csv")
flights
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
flights_modified <- subset(flights,flights$X.1!="")
flights_modified
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
for (i in 1:nrow(flights_modified)){
if (flights_modified[i,1]==" "){
flights_modified[i,1]<-flights_modified[i-1,1]
}
}
flights_modified
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
dim(flights_modified)
## [1] 4 7
names(flights_modified[3:7])
## [1] "Los.Angeles" "Phoenix" "San.Diego" "San.Francisco"
## [5] "Seattle"
flights_long <- flights_modified %>% gather(X, sX.1, Los.Angeles:Seattle)
flights_long
## X X.1 X sX.1
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AM WEST on time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AM WEST on time San.Diego 383
## 12 AM WEST delayed San.Diego 65
## 13 ALASKA on time San.Francisco 503
## 14 ALASKA delayed San.Francisco 102
## 15 AM WEST on time San.Francisco 320
## 16 AM WEST delayed San.Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
fligts_sum<-aggregate(flights_long$sX.1, by=list(flights_long$X), FUN=sum)
fligts_sum
## Group.1 x
## 1 ALASKA 3775
## 2 AM WEST 7225
for (i in 1:nrow(flights_long)){
if (flights_long[i,1]=="ALASKA"){
flights_long[i,4]<-(flights_long[i,4]/3775)*100
}
else{
flights_long[i,4]<-(flights_long[i,4]/7225)*100
}
}
flights_long
## X X.1 X sX.1
## 1 ALASKA on time Los.Angeles 13.1655629
## 2 ALASKA delayed Los.Angeles 1.6423841
## 3 AM WEST on time Los.Angeles 9.6055363
## 4 AM WEST delayed Los.Angeles 1.6193772
## 5 ALASKA on time Phoenix 5.8543046
## 6 ALASKA delayed Phoenix 0.3178808
## 7 AM WEST on time Phoenix 66.9896194
## 8 AM WEST delayed Phoenix 5.7439446
## 9 ALASKA on time San.Diego 5.6158940
## 10 ALASKA delayed San.Diego 0.5298013
## 11 AM WEST on time San.Diego 5.3010381
## 12 AM WEST delayed San.Diego 0.8996540
## 13 ALASKA on time San.Francisco 13.3245033
## 14 ALASKA delayed San.Francisco 2.7019868
## 15 AM WEST on time San.Francisco 4.4290657
## 16 AM WEST delayed San.Francisco 1.7854671
## 17 ALASKA on time Seattle 48.7682119
## 18 ALASKA delayed Seattle 8.0794702
## 19 AM WEST on time Seattle 2.7820069
## 20 AM WEST delayed Seattle 0.8442907
colnames(flights_long)<-c("airline","status","city","number_of_flights")
colnames(flights_long)
## [1] "airline" "status" "city"
## [4] "number_of_flights"
flights_wide <- spread(flights_long, status, number_of_flights)
flights_wide
## airline city delayed on time
## 1 ALASKA Los.Angeles 1.6423841 13.165563
## 2 ALASKA Phoenix 0.3178808 5.854305
## 3 ALASKA San.Diego 0.5298013 5.615894
## 4 ALASKA San.Francisco 2.7019868 13.324503
## 5 ALASKA Seattle 8.0794702 48.768212
## 6 AM WEST Los.Angeles 1.6193772 9.605536
## 7 AM WEST Phoenix 5.7439446 66.989619
## 8 AM WEST San.Diego 0.8996540 5.301038
## 9 AM WEST San.Francisco 1.7854671 4.429066
## 10 AM WEST Seattle 0.8442907 2.782007
group_by(flights_wide, airline)%>% summarise(mean_delay = mean(delayed), mean_on_time=mean(`on time`),std_delayed = sd(delayed),std_on_time=sd(`on time`))
## # A tibble: 2 x 5
## airline mean_delay mean_on_time std_delayed std_on_time
## <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA 2.654305 17.34570 3.179271 17.96287
## 2 AM WEST 2.178547 17.82145 2.036851 27.60123
Analysis table shows that at average ALASKA Airlines has 2.65% plus/minus 3.18% delayed flights while AM WEST has 2.17% plus/minus 2.04% delayed flights. ALASKA has 17.35 plus/minus 17.96 on time flights than AM WEST has 17.82% plus/minus 27.6% on time flights.
group_by(flights_wide, airline)%>% summarise(min_delay=min(delayed),max_delayed=max(delayed),min_on_time=min(`on time`),max_on_time=max(`on time`))
## # A tibble: 2 x 5
## airline min_delay max_delayed min_on_time max_on_time
## <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA 0.3178808 8.079470 5.615894 48.76821
## 2 AM WEST 0.8442907 5.743945 2.782007 66.98962
Also, ALASKA’s minimum for delayed flights is smaller than AM WEST’s and ALASKA’s maximum for delayed flights is greater than AM WEST’s maximum. Moreover, ALASKA’s minimum for on time flights is greater that AM WEST’s mininimum and ALASKA’s max for on_time flights is smaller that AM WEST’s maximum for on time flights.