Assignment 5

Gehad Gad

March 1st, 2020

Assignment 5

#Import libraries and/or Packages:

library (tidyr)
library (dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(funModeling)

## Loading required package: Hmisc

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## Loading required package: ggplot2

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

## funModeling v.1.9.3 :)
## Examples and tutorials at livebook.datascienceheroes.com
##  / Now in Spanish: librovivodecienciadedatos.ai

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v tibble  2.1.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## v purrr   0.3.3

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x Hmisc::src()       masks dplyr::src()
## x Hmisc::summarize() masks dplyr::summarize()

library(Hmisc)
library(ggpubr)

## Loading required package: magrittr

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

library(modeest)

## Registered S3 method overwritten by 'rmutil':
##   method         from
##   print.response httr

#Import the data from csv file:

Airlines <- read.csv ("Airlines.csv")

#To have access to the data from GitHub:

#Airlines_GitHub <- read.csv ("https://github.com/GehadGad/Airlines-data-Assignment5/raw/master/Airlines.csv")


#Display the data.

head (Airlines)

##   Airline ArrivalStatus Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA       on time         497     221       212           503   1,841
## 2    <NA>       delayed          62      12        20           102     305
## 3    <NA>          <NA>          NA    <NA>        NA            NA    <NA>
## 4  AMWEST       on time         694   4,840       383           320     201
## 5    <NA>       delayed         117     415        65           129      61

#Replace the NA values for the Airlines names.

for(i in 2:nrow(Airlines)){
  if(is.na(Airlines$Airline[i])){
    Airlines$Airline[i] <- Airlines$Airline[i-1]
  }
}


#Remove the third row.
Airlines = Airlines[-c(3),]
#Display the data.
head (Airlines)

##   Airline ArrivalStatus Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA       on time         497     221       212           503   1,841
## 2  ALASKA       delayed          62      12        20           102     305
## 4  AMWEST       on time         694   4,840       383           320     201
## 5  AMWEST       delayed         117     415        65           129      61

head (Airlines)

##   Airline ArrivalStatus Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA       on time         497     221       212           503   1,841
## 2  ALASKA       delayed          62      12        20           102     305
## 4  AMWEST       on time         694   4,840       383           320     201
## 5  AMWEST       delayed         117     415        65           129      61

#Create a new data frame called gather_df to tide the data and have all cities into one column.

gather_df <- gather(Airlines, City, Number, 3:7)

## Warning: attributes are not identical across measure variables;
## they will be dropped

#Display the tidy data.
head (gather_df)

##   Airline ArrivalStatus        City Number
## 1  ALASKA       on time Los.Angeles    497
## 2  ALASKA       delayed Los.Angeles     62
## 3  AMWEST       on time Los.Angeles    694
## 4  AMWEST       delayed Los.Angeles    117
## 5  ALASKA       on time     Phoenix    221
## 6  ALASKA       delayed     Phoenix     12

#Create a new data frame called spread_df to spread the two rows (on time and delayed) into two columns.

spread_df <- spread(gather_df, ArrivalStatus, Number, 3:7)

## Warning in if (!is.na(fill)) {: the condition has length > 1 and only the first
## element will be used

arrange(Airlines)

##   Airline ArrivalStatus Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA       on time         497     221       212           503   1,841
## 2  ALASKA       delayed          62      12        20           102     305
## 3  AMWEST       on time         694   4,840       383           320     201
## 4  AMWEST       delayed         117     415        65           129      61

spread_df = rename (spread_df, "on_time" = "on time")

#Display the spread data.

head(spread_df)

##   Airline          City delayed on_time
## 1  ALASKA   Los.Angeles      62     497
## 2  ALASKA       Phoenix      12     221
## 3  ALASKA     San.Diego      20     212
## 4  ALASKA San.Francisco     102     503
## 5  ALASKA       Seattle     305   1,841
## 6  AMWEST   Los.Angeles     117     694

#Remove the (,) because it change values over 1,000 to NA values.

spread_df []= lapply(spread_df, gsub, pattern = ",", replacement = "")

#Change the two columns (on_timev and delayed) from char to numeric. changing it to numeric so that I can add the two columns and we cann't add chr values.

spread_df$on_time <- as.numeric (as.character(spread_df$on_time))

spread_df$delayed <- as.numeric (spread_df$delayed)

#Add new column to add the total number of on_time and delayed flights.

spread_df %>%
  group_by(Airline, City) %>%
  mutate(Total = sum(on_time, delayed))

## # A tibble: 10 x 5
## # Groups:   Airline, City [10]
##    Airline City          delayed on_time Total
##    <chr>   <chr>           <dbl>   <dbl> <dbl>
##  1 ALASKA  Los.Angeles        62     497   559
##  2 ALASKA  Phoenix            12     221   233
##  3 ALASKA  San.Diego          20     212   232
##  4 ALASKA  San.Francisco     102     503   605
##  5 ALASKA  Seattle           305    1841  2146
##  6 AMWEST  Los.Angeles       117     694   811
##  7 AMWEST  Phoenix           415    4840  5255
##  8 AMWEST  San.Diego          65     383   448
##  9 AMWEST  San.Francisco     129     320   449
## 10 AMWEST  Seattle            61     201   262

Perform analysis to compare the arrival delays for the two airlines.

summary(spread_df)

##    Airline              City              delayed          on_time      
##  Length:10          Length:10          Min.   : 12.00   Min.   : 201.0  
##  Class :character   Class :character   1st Qu.: 61.25   1st Qu.: 245.8  
##  Mode  :character   Mode  :character   Median : 83.50   Median : 440.0  
##                                        Mean   :128.80   Mean   : 971.2  
##                                        3rd Qu.:126.00   3rd Qu.: 646.2  
##                                        Max.   :415.00   Max.   :4840.0

The Summary shows the minimum and maximum number of delayed and on time flights. It also shows the mean and median for delayed and on time flights.

#Get the Variance. The formula for Variance is: 1/(n-1)*sum((x-mean)**2)

ontime= spread_df$on_time

var(ontime)

## [1] 2083057

#Find the correlation between on_time and delayed flights.

cor(spread_df$on_time,spread_df$delayed)

## [1] 0.9274815

Correlation shows how strongly the variables are related. The correlation ranges from -1.0 to +1.0. The closer the correlation (r) to +1 or -1, the more closely the two variables are related. The cor between delayed and on time is very high and close to 1. However, we need to keep in mind that this data is very small.

ggscatter(spread_df, x= "on_time", y= "delayed", add = "reg.line", cor.coef = TRUE, conf.int = TRUE)

This is a scatter with a regression line and including the Correlation and p values.

#Graphs

freq(spread_df)

##   Airline frequency percentage cumulative_perc
## 1  ALASKA         5         50              50
## 2  AMWEST         5         50             100

##            City frequency percentage cumulative_perc
## 1   Los.Angeles         2         20              20
## 2       Phoenix         2         20              40
## 3     San.Diego         2         20              60
## 4 San.Francisco         2         20              80
## 5       Seattle         2         20             100

## [1] "Variables processed: Airline, City"

This graph shows the distribution of all the cities.

plot_num (spread_df)

This graph shows the distribution of on_time and delayed flights.

ggboxplot(spread_df ,  y = 'ontime')

Boxplot displays the distribution of on_time flights. The two dots are considered as outliers!

ggboxplot(spread_df ,  y = 'delayed')

Boxplot displays the distribution of delayed flights. The two dots are considered as outliers!

ggbarplot(spread_df ,  x= "Airline", y= "on_time", color = "City" , position = position_dodge())

The graph above displays distribution of on_time flight in the two airline and cities.