IS 607 Assignment –Tidying and Transforming DataPage 1Assignment–TidyingandTransformingDataSource: Numbersense, Kaiser Fung, McGraw Hill, 2013
library("readxl")
library("tidyr")
library("dplyr")
library("Hmisc")
library("ggplot2")
#Read from excel file
dfAirlinesDataXl <- read_excel("airlines_data_5.xlsx", na="")
dfAirlinesDataXl
## # A tibble: 5 x 7
## X__1 X__2 `Los Angeles` Phoenix `San Diego` `San Franciso` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
#Find the datatype of dfAirlinesData before we do cleanup
typeof(dfAirlinesDataXl)
## [1] "list"
#Convert to data frame
dfAirlinesData <- data.frame(dfAirlinesDataXl)
#View the data to see what has to be cleaned up
dfAirlinesData
## X__1 X__2 Los.Angeles Phoenix San.Diego San.Franciso Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
#Rename columns to readable names
dfAirlinesData <- dfAirlinesData %>%
filter(!is.na(n)) %>%
rename(airline = X__1, status = X__2, los_angeles = Los.Angeles, phoenix = Phoenix, san_diego = San.Diego, san_franciso = San.Franciso, seattle = Seattle)
dfAirlinesData
## airline status los_angeles phoenix san_diego san_franciso seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
#Remove NA and smooth data further
dfAirlinesData <- dfAirlinesData[-c(3),]
#Add missing values
dfAirlinesData[2, "airline"] <- "ALASKA"
dfAirlinesData[4, "airline"] <- "AM WEST"
#Reset index
rownames(dfAirlinesData) <- NULL
#Add factor
dfAirlinesData$airline <- factor(dfAirlinesData$airline)
dfAirlinesData
## airline status los_angeles phoenix san_diego san_franciso seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
#View column names
names(dfAirlinesData)
## [1] "airline" "status" "los_angeles" "phoenix"
## [5] "san_diego" "san_franciso" "seattle"
#View datatypes
sapply(dfAirlinesData, typeof)
## airline status los_angeles phoenix san_diego
## "integer" "character" "double" "double" "double"
## san_franciso seattle
## "double" "double"
#Lets see the stats of the data structure before saving it to csv
describe(dfAirlinesData)
## dfAirlinesData
##
## 7 Variables 4 Observations
## ---------------------------------------------------------------------------
## airline
## n missing distinct
## 4 0 2
##
## Value ALASKA AM WEST
## Frequency 2 2
## Proportion 0.5 0.5
## ---------------------------------------------------------------------------
## status
## n missing distinct
## 4 0 2
##
## Value delayed on time
## Frequency 2 2
## Proportion 0.5 0.5
## ---------------------------------------------------------------------------
## los_angeles
## n missing distinct Info Mean Gmd
## 4 0 4 1 342.5 379.3
##
## Value 62 117 497 694
## Frequency 1 1 1 1
## Proportion 0.25 0.25 0.25 0.25
## ---------------------------------------------------------------------------
## phoenix
## n missing distinct Info Mean Gmd
## 4 0 4 1 1372 2446
##
## Value 12 221 415 4840
## Frequency 1 1 1 1
## Proportion 0.25 0.25 0.25 0.25
## ---------------------------------------------------------------------------
## san_diego
## n missing distinct Info Mean Gmd
## 4 0 4 1 170 206
##
## Value 20 65 212 383
## Frequency 1 1 1 1
## Proportion 0.25 0.25 0.25 0.25
## ---------------------------------------------------------------------------
## san_franciso
## n missing distinct Info Mean Gmd
## 4 0 4 1 263.5 232.3
##
## Value 102 129 320 503
## Frequency 1 1 1 1
## Proportion 0.25 0.25 0.25 0.25
## ---------------------------------------------------------------------------
## seattle
## n missing distinct Info Mean Gmd
## 4 0 4 1 602 907.3
##
## Value 61 201 305 1841
## Frequency 1 1 1 1
## Proportion 0.25 0.25 0.25 0.25
## ---------------------------------------------------------------------------
write.csv(dfAirlinesData, "./dfAirlinesData.csv", row.names=FALSE)
dfAirlinesCSVStore <- read.csv("./dfAirlinesData.csv",header=TRUE,stringsAsFactors = FALSE)
#Store data in temp variable for reuse
dfAirlinesCSV <- dfAirlinesCSVStore
#List columns we are going to gather
names(dfAirlinesCSV)[3:7]
## [1] "los_angeles" "phoenix" "san_diego" "san_franciso"
## [5] "seattle"
#Apply and view output
dfAirlinesAn1 <- gather(dfAirlinesCSV, "city", "num", names(dfAirlinesCSV)[3:7], factor_key=TRUE)
dfAirlinesAn1
## airline status city num
## 1 ALASKA on time los_angeles 497
## 2 ALASKA delayed los_angeles 62
## 3 AM WEST on time los_angeles 694
## 4 AM WEST delayed los_angeles 117
## 5 ALASKA on time phoenix 221
## 6 ALASKA delayed phoenix 12
## 7 AM WEST on time phoenix 4840
## 8 AM WEST delayed phoenix 415
## 9 ALASKA on time san_diego 212
## 10 ALASKA delayed san_diego 20
## 11 AM WEST on time san_diego 383
## 12 AM WEST delayed san_diego 65
## 13 ALASKA on time san_franciso 503
## 14 ALASKA delayed san_franciso 102
## 15 AM WEST on time san_franciso 320
## 16 AM WEST delayed san_franciso 129
## 17 ALASKA on time seattle 1841
## 18 ALASKA delayed seattle 305
## 19 AM WEST on time seattle 201
## 20 AM WEST delayed seattle 61
dfAirlinesAn1 <- dfAirlinesAn1 %>%
spread(status, `num`)
dfAirlinesAn1
## airline city delayed on time
## 1 ALASKA los_angeles 62 497
## 2 ALASKA phoenix 12 221
## 3 ALASKA san_diego 20 212
## 4 ALASKA san_franciso 102 503
## 5 ALASKA seattle 305 1841
## 6 AM WEST los_angeles 117 694
## 7 AM WEST phoenix 415 4840
## 8 AM WEST san_diego 65 383
## 9 AM WEST san_franciso 129 320
## 10 AM WEST seattle 61 201
dfAirlinesAn1 <- dfAirlinesAn1 %>%
mutate(total = delayed + `on time`, percent_delayed = delayed/ total, percent_ontime = `on time`/ total)
dfAirlinesAn1
## airline city delayed on time total percent_delayed
## 1 ALASKA los_angeles 62 497 559 0.11091234
## 2 ALASKA phoenix 12 221 233 0.05150215
## 3 ALASKA san_diego 20 212 232 0.08620690
## 4 ALASKA san_franciso 102 503 605 0.16859504
## 5 ALASKA seattle 305 1841 2146 0.14212488
## 6 AM WEST los_angeles 117 694 811 0.14426634
## 7 AM WEST phoenix 415 4840 5255 0.07897241
## 8 AM WEST san_diego 65 383 448 0.14508929
## 9 AM WEST san_franciso 129 320 449 0.28730512
## 10 AM WEST seattle 61 201 262 0.23282443
## percent_ontime
## 1 0.8890877
## 2 0.9484979
## 3 0.9137931
## 4 0.8314050
## 5 0.8578751
## 6 0.8557337
## 7 0.9210276
## 8 0.8549107
## 9 0.7126949
## 10 0.7671756
ggplot(data=dfAirlinesAn1, aes(x=airline, y=percent_ontime)) +
geom_bar(stat="identity", width=0.5, fill="#52823f") +
xlab("Airlines") + ylab("On Time") +
ggtitle("On Time")
ggplot(data=dfAirlinesAn1, aes(x=airline, y=percent_delayed)) +
geom_bar(stat="identity", width=0.5, fill="#82423f") +
xlab("Airlines") + ylab("Delays") +
ggtitle("Delays")
ggplot(data=dfAirlinesAn1, aes(x = airline, y=percent_ontime, fill = city)) +
geom_bar(stat="identity",position="dodge") +
xlab("Airlines") + ylab("On Time") + ggtitle("On Time per Airport")
ggplot(data=dfAirlinesAn1, aes(x = airline, y=percent_delayed, fill = city)) +
geom_bar(stat="identity",position="dodge") +
xlab("Airlines") + ylab("Delays") + ggtitle("On Time per Airport")
#Apply and view output
dfAirlinesAn2 <- gather(dfAirlinesCSV, "city", "num", names(dfAirlinesCSV)[3:7], factor_key=TRUE)
dfAirlinesAn2
## airline status city num
## 1 ALASKA on time los_angeles 497
## 2 ALASKA delayed los_angeles 62
## 3 AM WEST on time los_angeles 694
## 4 AM WEST delayed los_angeles 117
## 5 ALASKA on time phoenix 221
## 6 ALASKA delayed phoenix 12
## 7 AM WEST on time phoenix 4840
## 8 AM WEST delayed phoenix 415
## 9 ALASKA on time san_diego 212
## 10 ALASKA delayed san_diego 20
## 11 AM WEST on time san_diego 383
## 12 AM WEST delayed san_diego 65
## 13 ALASKA on time san_franciso 503
## 14 ALASKA delayed san_franciso 102
## 15 AM WEST on time san_franciso 320
## 16 AM WEST delayed san_franciso 129
## 17 ALASKA on time seattle 1841
## 18 ALASKA delayed seattle 305
## 19 AM WEST on time seattle 201
## 20 AM WEST delayed seattle 61
ggplot(data=dfAirlinesAn2, aes(x = airline, y=num, fill = city)) +
geom_bar(stat="identity",position="dodge") +
xlab("Airlines") + ylab("Number of Flights") + ggtitle("Number of Flights per Airport")