Tidying up the flights raw dataset which will be imported from mySQL database on Google Cloud platform.
Some observations about the source dataset
suppressMessages(library(tidyr))
suppressMessages(library(dplyr))
suppressMessages(library(DT))
suppressMessages(library(RMySQL))
suppressMessages(library(DBI))
suppressMessages(library(stringr))
The file is uploaded on GitHub here
(https://raw.githubusercontent.com/DataScienceAR/silver-octo-funicular/master/Netflix%20Shows.csv)
theurl <- url('https://raw.githubusercontent.com/DataScienceAR/Cuny-Assignments/master/Data-607/Airlines_Time.csv')
airlines.df <- tbl_df(read.csv(theurl))
datatable(head(airlines.df,3))
A new “Flights” table is created in Google Cloud. The .csv files available on GitHub is used a source to feed the table creation.
Calling the SQL in R
con <- dbConnect(RMySQL::MySQL(),
dbname ="flight",
host = "34.73.184.91",
port = 3306,
user = "Ruser",
password = "Ruser" )
dbListTables(con)
## [1] "Airlines_Time" "airlines" "airports" "flights"
## [5] "planes" "weather"
dbGetQuery(con,"select * from `Airlines_Time` limit 5 ")
## V1 V2 Los Angeles Phoenix San Diego San Francisco Seattle
## 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 ALASKA on time 497 221 212 503 1841
## 3 <NA> delayed 62 12 20 102 305
## 4 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 5 AM WEST on time 694 4840 383 320 201
untidy_dataset <- dbGetQuery(con,"select * from `Airlines_Time` ")
datatable(untidy_dataset)
names(untidy_dataset)[c(1,2)] <- c("Airlines","Time_Category")
names(untidy_dataset)
## [1] "Airlines" "Time_Category" "Los Angeles" "Phoenix"
## [5] "San Diego" "San Francisco" "Seattle"
untidy_dataset_step1 <-untidy_dataset[-c(1,4),]
datatable(untidy_dataset_step1)
untidy_dataset_step2<- tibble::rowid_to_column(untidy_dataset_step1,var = "rowid")
datatable(untidy_dataset_step2)
untidy_dataset_step3<- gather(untidy_dataset_step2,key ="City",value = "Time",c("Los Angeles" ,"Phoenix","San Diego" ,"San Francisco" ,"Seattle" ),na.rm = FALSE)
datatable(untidy_dataset_step3)
### Step 5: spreading the Time Category variable values into variable names
untidy_dataset_step4<- spread(untidy_dataset_step3,Time_Category,Time)
datatable(untidy_dataset_step4)
### Step 6: Remove the 1st variabe named rowid
untidy_dataset_step5<- untidy_dataset_step4[,-1]
datatable(untidy_dataset_step5)
### Step 7: Replacing the “NA” with 0 for time under variables “delayed” and “on time”
missing_delayed <- is.na(untidy_dataset_step5$delayed)
missing_delayed
## [1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE
## [12] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
untidy_dataset_step5$delayed[missing_delayed] <- 0
datatable(untidy_dataset_step5)
missing_on_time <- is.na(untidy_dataset_step5$`on time` )
untidy_dataset_step5$`on time`[missing_on_time] <- 0
untidy_dataset_step5$Airlines[c(6:10,16:20)] <- c("ALASKA","AM WEST")
#untidy_dataset_step5$Airlines[16:20] <-
datatable(untidy_dataset_step5,options = list(pageLength = 20, initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color': '#000', 'color': '#fff'});",
"}")))