Github Link: https://github.com/asmozo24/Data607_Assignment5.git

R Packages

library(tidyverse) #loading all library needed for this assignment
library(openintro)
#head(fastfood)
library(readxl)
library(readr)
library(plyr)
library(dplyr)
#library(dice)
# #library(VennDiagram)
# #library(help = "dice")
library(DBI)
library(dbplyr)
library(data.table)
library(rstudioapi)
#rstudioapi::isAvailable()
library(RJDBC)
library(odbc)
library(RSQLite)
library(rvest)
library(stringr)
library(readtext)
library(ggpubr)
#library(fitdistrplus)
library(ggplot2)
library(moments)
library(qualityTools)
library(normalp)
library(utils)
library(MASS)
library(qqplotr)
#library(DATA606)
library(RCurl)

#connecting to my github respository to import data

# pulling raw data from my github respository, Data607_Assignment5
# https://github.com/asmozo24/Data607_Assignment5.git

airlines_arrival1 <- read.table(url("https://raw.githubusercontent.com/asmozo24/Data607_Assignment5/master/Arrival_Delays_02_airlines%20.csv"), sep = ",")

airlines_arrival1  ## header not clean
##       V1      V2          V3      V4        V5            V6      V7
## 1                Los Angeles Phoenix San Diego San Francisco Seattle
## 2 ALASKA on time         497     221       212           503    1841
## 3        delayed          62      12        20           102     305
## 4                                                                   
## 5 AMWEST on time         694    4840       383           320     201
## 6        delayed         117     415        65           129      61
View(airlines_arrival1)

# This access from github to Rstudio also work 
 airlines_arrival2 <- read.csv("https://raw.githubusercontent.com/asmozo24/Data607_Assignment5/master/Arrival_Delays_02_airlines%20.csv",header=TRUE,stringsAsFactors=FALSE)
 airlines_arrival2 # this is to check it is the actual dataframe I am looking for ....this works too.
##        X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time         497     221       212           503    1841
## 2        delayed          62      12        20           102     305
## 3                         NA      NA        NA            NA      NA
## 4 AMWEST on time         694    4840       383           320     201
## 5        delayed         117     415        65           129      61

connection to working directory and import data from local drive

setwd("~/R/DATA607_Assg5") #set the working directory to access dataframe
  # display dataframe from local drive
# newDataName <- read.csv("https://raw.githubusercontent.com/asmozo24/Data607_Assignment5/master/Arrival_Delays_02_airlines%20.csv", sep = ",", na.strings = "NA", strip.white = TRUE, stringsAsFactors = FALSE)

Arrival_Delays_02_airlines_ <- read.csv("C:/Users/Petit Mandela/Documents/R/DATA607_Assg5/Arrival_Delays_02_airlines .csv")

Arrival_Delays_02_airlines_
##        X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time         497     221       212           503    1841
## 2        delayed          62      12        20           102     305
## 3                         NA      NA        NA            NA      NA
## 4 AMWEST on time         694    4840       383           320     201
## 5        delayed         117     415        65           129      61

Connecting to SQL server

–this connection works but requires password, I am going to comment this block because preventing the knit process. –Now we succeeded in reading Arrival_Delays_02_airlines full table into R dataframe…..

{r connection to sql} #sql connection = con # # con <- dbConnect(odbc(), # Driver = "SQL Server", # Server = "ATM\\ATMSERVER", # Database = "Data607_Airline_Delays_Db", # UID = "Alex", # PWD = rstudioapi::askForPassword("Database password"), # Port = 1433) # # dbListFields(con, "Arrival_Delays_02_airlines") # airlines_arrival <- dbReadTable(con, "Arrival_Delays_02_airlines") # airlines_arrival # # #

Handling missings data

I will use Github link

is.na(airlines_arrival2) # checking if there is a missing data in the dataset, return is yes
##          X   X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## [1,] FALSE FALSE       FALSE   FALSE     FALSE         FALSE   FALSE
## [2,] FALSE FALSE       FALSE   FALSE     FALSE         FALSE   FALSE
## [3,] FALSE FALSE        TRUE    TRUE      TRUE          TRUE    TRUE
## [4,] FALSE FALSE       FALSE   FALSE     FALSE         FALSE   FALSE
## [5,] FALSE FALSE       FALSE   FALSE     FALSE         FALSE   FALSE
sum(is.na(airlines_arrival2)) # file to big, checking the sum of all missing data (return is 09 missing data)
## [1] 5
airlines_arrival4 <- na.omit(airlines_arrival2) # delete/remove the missings data because it is an imcomplete observation
airlines_arrival4
##        X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time         497     221       212           503    1841
## 2        delayed          62      12        20           102     305
## 4 AMWEST on time         694    4840       383           320     201
## 5        delayed         117     415        65           129      61
View(airlines_arrival4)

Tidying data

I will use Github link What city has the most arrivals all airlines combined? Answer. Phoenix appears to be have most arrivals or the most destination all airlines combined. Thus, Phoenix is the most visited city and probably collecting more dividends than other cities.

What city has the least arrivals all airlines combined.? Answer: San Diego appears to have the least arrivals or the least destination all airlines combined. San Diego is the least visited, the mayor needs to work on making the city more attractive. Maybe trying to find out why most people go to Phoenix.

row.names(airlines_arrival4) <- NULL
airlines_arrival4
##        X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time         497     221       212           503    1841
## 2        delayed          62      12        20           102     305
## 3 AMWEST on time         694    4840       383           320     201
## 4        delayed         117     415        65           129      61
#renaming the no name columns
names(airlines_arrival4)[1] <- 'airlines'
names(airlines_arrival4)[2] <- 'arrival_status'

# Another way of renaming
# airlines_arrival4 <- airlines_arrival4 %>% 
#   rename(airlines = X,  arrival_status = X.1)

airlines_arrival4
##   airlines arrival_status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1   ALASKA        on time         497     221       212           503    1841
## 2                 delayed          62      12        20           102     305
## 3   AMWEST        on time         694    4840       383           320     201
## 4                 delayed         117     415        65           129      61
view(airlines_arrival4)

#filling  airlines companies
airlines_arrival4$airlines[2] <- c("ALASKA")
airlines_arrival4$airlines[4] <- c("AMWEST")
airlines_arrival4
##   airlines arrival_status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1   ALASKA        on time         497     221       212           503    1841
## 2   ALASKA        delayed          62      12        20           102     305
## 3   AMWEST        on time         694    4840       383           320     201
## 4   AMWEST        delayed         117     415        65           129      61
summary(airlines_arrival4)
##    airlines         arrival_status      Los.Angeles       Phoenix      
##  Length:4           Length:4           Min.   : 62.0   Min.   :  12.0  
##  Class :character   Class :character   1st Qu.:103.2   1st Qu.: 168.8  
##  Mode  :character   Mode  :character   Median :307.0   Median : 318.0  
##                                        Mean   :342.5   Mean   :1372.0  
##                                        3rd Qu.:546.2   3rd Qu.:1521.2  
##                                        Max.   :694.0   Max.   :4840.0  
##    San.Diego      San.Francisco      Seattle    
##  Min.   : 20.00   Min.   :102.0   Min.   :  61  
##  1st Qu.: 53.75   1st Qu.:122.2   1st Qu.: 166  
##  Median :138.50   Median :224.5   Median : 253  
##  Mean   :170.00   Mean   :263.5   Mean   : 602  
##  3rd Qu.:254.75   3rd Qu.:365.8   3rd Qu.: 689  
##  Max.   :383.00   Max.   :503.0   Max.   :1841
# What city has the most arrivals all airlines combined? Answer. Phoenix appears to be have most arrivals or the most destination all airlines combined.
# Thus, Phoenix is the most visited city and probably collecting more dividends than other cities.

# What city has the least arrivals all airlines combined.? Answer: San Diego appears to have the least arrivals or the least destination all airlines combined.
# San Diego is the least visited, the mayor needs to work on making the city more attractive. Maybe trying to find out why most people go to Phoenix.

total_Los_Angeles <- sum(airlines_arrival4$Los.Angeles)     # 1370
total_Phoenix <- sum(airlines_arrival4$Phoenix)             # 5488
total_San.Diego <- sum(airlines_arrival4$San.Diego)         # 680
total_San.Francisco <- sum(airlines_arrival4$San.Francisco) # 1054
total_Seattle <- sum(airlines_arrival4$Seattle)             # 2408

total_arrivals <- c(total_Los_Angeles, total_Phoenix, total_San.Diego, total_San.Francisco, total_Seattle)
cities <- c('Los Angeles', 'Phoenix', 'San Diego', 'San Francisco', 'Seattle')

# Barplot to visualize the number 
Cities_Arrivals <- data.frame(cities, total_arrivals)
View(Cities_Arrivals)
barplot(Cities_Arrivals$total_arrivals, names.arg = Cities_Arrivals$cities, main= "Total Arrivals per City", xlab = "Cities", ylab = "Total Arrivals",  col = rainbow(6))

# another way to do a barplot 
 ggplot(data = Cities_Arrivals, aes(x= cities, y=total_arrivals)) +
    geom_bar(stat="identity", fill = rainbow(5), width = 0.5)

Let’s look at the data a little bit different

#let call gather data in columns

airlines_arrival5 <- airlines_arrival4 %>%
  gather(key = city, value = "total_arrivals", 3:7)


# looking at the view (airlines_arrival5), by filtering descending order, I can say Alaska has the least number of delay arrivals which is Phoenix, AMWEST has  the most number of arrivals in time in Phoenix.