# load require packages
require(tidyr)
## Loading required package: tidyr
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
# Import data from a.csv file
source <-tbl_df(read.csv('D:/CUNY_SPS_DA/607_Data_Aq/week5/a.csv', stringsAsFactors = FALSE))
source
## # A tibble: 5 x 7
## ï.. X Los.Angeles Phoenix San.Diego San.Francisco Seattle
## <chr> <chr> <int> <int> <int> <int> <int>
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
# Check data typesraw()
str(source)
## Classes 'tbl_df', 'tbl' and 'data.frame': 5 obs. of 7 variables:
## $ ï.. : chr "ALASKA" "" "" "AM WEST" ...
## $ X : chr "on time" "delayed" "" "on time" ...
## $ Los.Angeles : int 497 62 NA 694 117
## $ Phoenix : int 221 12 NA 4840 415
## $ San.Diego : int 212 20 NA 383 65
## $ San.Francisco: int 503 102 NA 320 129
## $ Seattle : int 1841 305 NA 201 61
# only select rows with complete data in all columns and show data
# renames first two columns
a <- rename(na.omit(source),c("ï.." ="airlines", "X"="statues"))
# insert missing values in first columns
a[2,1] <- "ALASKA"
a[4,1] <- "AM WEST"
# to shorten the string values to character in second column
#a$statues <- strtrim(a$statues,1)
a
## # A tibble: 4 x 7
## airlines statues Los.Angeles Phoenix San.Diego San.Francisco Seattle
## <chr> <chr> <int> <int> <int> <int> <int>
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
# long-formatted-data
a2 <- a %>%
gather(cities, n, Los.Angeles:Seattle, na.rm = TRUE)
a2
## # A tibble: 20 x 4
## airlines statues cities n
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AM WEST on time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AM WEST on time San.Diego 383
## 12 AM WEST delayed San.Diego 65
## 13 ALASKA on time San.Francisco 503
## 14 ALASKA delayed San.Francisco 102
## 15 AM WEST on time San.Francisco 320
## 16 AM WEST delayed San.Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
# wide-formatted-data
a3 <- a2 %>%
spread(statues,n)
# correct the column name "on time" to "on_time"
colnames(a3)[4] <- "on_time"
a3
## # A tibble: 10 x 4
## airlines cities delayed on_time
## * <chr> <chr> <int> <int>
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Francisco 129 320
## 10 AM WEST Seattle 61 201
# load requried packages for string value
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Conflicts with tidy packages ----------------------------------------------
## arrange(): dplyr, plyr
## compact(): purrr, plyr
## count(): dplyr, plyr
## failwith(): dplyr, plyr
## filter(): dplyr, stats
## id(): dplyr, plyr
## lag(): dplyr, stats
## mutate(): dplyr, plyr
## rename(): dplyr, plyr
## summarise(): dplyr, plyr
## summarize(): dplyr, plyr
library(stringr)
# filter data for airline "AM WEST"
am <- a2 %>%
filter(str_detect(airlines, 'AM WEST'))%>%
spread(statues,n)
am
## # A tibble: 5 x 4
## airlines cities delayed `on time`
## * <chr> <chr> <int> <int>
## 1 AM WEST Los.Angeles 117 694
## 2 AM WEST Phoenix 415 4840
## 3 AM WEST San.Diego 65 383
## 4 AM WEST San.Francisco 129 320
## 5 AM WEST Seattle 61 201
# correct the column name "on time" to "on_time"
colnames(am)[4] <- "on_time"
# unite columns airlines and cities
am2<- am %>%
unite(AM.city, airlines, cities,sep="." ) %>%
select(AM.city,delayed,on_time)
# tranfrom rows to columns
am2<- as.data.frame(t(am2))
# first row as the header in the table a4
colnames(am2) <- unlist(am2[1,])
am2=am2[-1,]
am2
## AM WEST.Los.Angeles AM WEST.Phoenix AM WEST.San.Diego
## delayed 117 415 65
## on_time 694 4840 383
## AM WEST.San.Francisco AM WEST.Seattle
## delayed 129 61
## on_time 320 201
sub.a3 <- a3%>% select(airlines,delayed)
sub.a3 %>% summarise(Min = min(delayed, na.rm=TRUE),
Median = median(delayed, na.rm=TRUE),
Mean = mean(delayed, na.rm=TRUE),
Var = var(delayed, na.rm=TRUE),
SD = sd(delayed, na.rm=TRUE),
Max = max(delayed, na.rm=TRUE))
## Min Median Mean Var SD Max
## 1 12 83.5 128.8 16935.96 130.1382 415
a3
## # A tibble: 10 x 4
## airlines cities delayed on_time
## * <chr> <chr> <int> <int>
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Francisco 129 320
## 10 AM WEST Seattle 61 201
df <- a3 %>%
mutate(total = on_time + delayed)%>%
mutate(delayed = delayed/total)%>%
select(airlines,cities,delayed)%>%
arrange(cities,airlines)
df
## # A tibble: 10 x 3
## airlines cities delayed
## <chr> <chr> <dbl>
## 1 ALASKA Los.Angeles 0.11091234
## 2 AM WEST Los.Angeles 0.14426634
## 3 ALASKA Phoenix 0.05150215
## 4 AM WEST Phoenix 0.07897241
## 5 ALASKA San.Diego 0.08620690
## 6 AM WEST San.Diego 0.14508929
## 7 ALASKA San.Francisco 0.16859504
## 8 AM WEST San.Francisco 0.28730512
## 9 ALASKA Seattle 0.14212488
## 10 AM WEST Seattle 0.23282443
al_delay <- df%>%
filter(str_detect(airlines, 'ALASKA'))%>%
select(cities,delayed)%>%
arrange(cities)
#al_delay
colnames(al_delay)[2] <-"ALASKA.delayed"
am_delay <- df%>%
filter(str_detect(airlines, 'AM WEST'))%>%
select(cities,delayed)%>%
arrange(cities)
#am_delay
colnames(am_delay)[2] <-"AM_WEST.delayed"
delayed<- al_delay %>%
join(am_delay, by="cities") #%>%
# mutate(diff = AM_WEST.delayed - ALASKA.delayed )
delayed
## cities ALASKA.delayed AM_WEST.delayed
## 1 Los.Angeles 0.11091234 0.14426634
## 2 Phoenix 0.05150215 0.07897241
## 3 San.Diego 0.08620690 0.14508929
## 4 San.Francisco 0.16859504 0.28730512
## 5 Seattle 0.14212488 0.23282443
library ('ggplot2')
barplot(delayed$ALASKA.delayed, names.arg=delayed$cities, ylim=c(0,0.30), main="ALASKA Airline Delayed (%)", ylab="blah", xlab="cities", col="darkmagenta")
barplot(delayed$AM_WEST.delayed, names.arg=delayed$cities, ylim=c(0,0.30), main="AM WEST Airline Delayed (%)", ylab="blah", xlab="cities", col="orange")
matplot(delayed, type = c("b"),pch=1,col = 2:3) #plot
## Warning in xy.coords(x, y, xlabel, ylabel, log = log): NAs introduced by
## coercion
## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion
legend("topleft", legend =1:2, col=1:2, pch=1) # optional legend
write.table(delayed, "D:/CUNY_SPS_DA/607_Data_Aq/week5/delayed.csv", sep="\t")
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.