Github Link: https://github.com/asmozo24/Data607_Assignment5.git
library(tidyverse) #loading all library needed for this assignment
library(openintro)
#head(fastfood)
library(readxl)
library(readr)
library(plyr)
library(dplyr)
#library(dice)
# #library(VennDiagram)
# #library(help = "dice")
library(DBI)
library(dbplyr)
library(data.table)
library(rstudioapi)
#rstudioapi::isAvailable()
library(RJDBC)
library(odbc)
library(RSQLite)
library(rvest)
library(stringr)
library(readtext)
library(ggpubr)
#library(fitdistrplus)
library(ggplot2)
library(moments)
library(qualityTools)
library(normalp)
library(utils)
library(MASS)
library(qqplotr)
#library(DATA606)
library(RCurl)
#connecting to my github respository to import data
# pulling raw data from my github respository, Data607_Assignment5
# https://github.com/asmozo24/Data607_Assignment5.git
airlines_arrival1 <- read.table(url("https://raw.githubusercontent.com/asmozo24/Data607_Assignment5/master/Arrival_Delays_02_airlines%20.csv"), sep = ",")
airlines_arrival1 ## header not clean
## V1 V2 V3 V4 V5 V6 V7
## 1 Los Angeles Phoenix San Diego San Francisco Seattle
## 2 ALASKA on time 497 221 212 503 1841
## 3 delayed 62 12 20 102 305
## 4
## 5 AMWEST on time 694 4840 383 320 201
## 6 delayed 117 415 65 129 61
View(airlines_arrival1)
# This access from github to Rstudio also work
airlines_arrival2 <- read.csv("https://raw.githubusercontent.com/asmozo24/Data607_Assignment5/master/Arrival_Delays_02_airlines%20.csv",header=TRUE,stringsAsFactors=FALSE)
airlines_arrival2 # this is to check it is the actual dataframe I am looking for ....this works too.
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AMWEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
setwd("~/R/DATA607_Assg5") #set the working directory to access dataframe
# display dataframe from local drive
# newDataName <- read.csv("https://raw.githubusercontent.com/asmozo24/Data607_Assignment5/master/Arrival_Delays_02_airlines%20.csv", sep = ",", na.strings = "NA", strip.white = TRUE, stringsAsFactors = FALSE)
Arrival_Delays_02_airlines_ <- read.csv("C:/Users/Petit Mandela/Documents/R/DATA607_Assg5/Arrival_Delays_02_airlines .csv")
Arrival_Delays_02_airlines_
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AMWEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
–this connection works but requires password, I am going to comment this block because preventing the knit process. –Now we succeeded in reading Arrival_Delays_02_airlines full table into R dataframe…..
{r connection to sql} #sql connection = con # # con <- dbConnect(odbc(), # Driver = "SQL Server", # Server = "ATM\\ATMSERVER", # Database = "Data607_Airline_Delays_Db", # UID = "Alex", # PWD = rstudioapi::askForPassword("Database password"), # Port = 1433) # # dbListFields(con, "Arrival_Delays_02_airlines") # airlines_arrival <- dbReadTable(con, "Arrival_Delays_02_airlines") # airlines_arrival # # #
I will use Github link
is.na(airlines_arrival2) # checking if there is a missing data in the dataset, return is yes
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## [1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [3,] FALSE FALSE TRUE TRUE TRUE TRUE TRUE
## [4,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [5,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
sum(is.na(airlines_arrival2)) # file to big, checking the sum of all missing data (return is 09 missing data)
## [1] 5
airlines_arrival4 <- na.omit(airlines_arrival2) # delete/remove the missings data because it is an imcomplete observation
airlines_arrival4
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 4 AMWEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
View(airlines_arrival4)
I will use Github link What city has the most arrivals all airlines combined? Answer. Phoenix appears to be have most arrivals or the most destination all airlines combined. Thus, Phoenix is the most visited city and probably collecting more dividends than other cities.
What city has the least arrivals all airlines combined.? Answer: San Diego appears to have the least arrivals or the least destination all airlines combined. San Diego is the least visited, the mayor needs to work on making the city more attractive. Maybe trying to find out why most people go to Phoenix.
row.names(airlines_arrival4) <- NULL
airlines_arrival4
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 AMWEST on time 694 4840 383 320 201
## 4 delayed 117 415 65 129 61
#renaming the no name columns
names(airlines_arrival4)[1] <- 'airlines'
names(airlines_arrival4)[2] <- 'arrival_status'
# Another way of renaming
# airlines_arrival4 <- airlines_arrival4 %>%
# rename(airlines = X, arrival_status = X.1)
airlines_arrival4
## airlines arrival_status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 AMWEST on time 694 4840 383 320 201
## 4 delayed 117 415 65 129 61
view(airlines_arrival4)
#filling airlines companies
airlines_arrival4$airlines[2] <- c("ALASKA")
airlines_arrival4$airlines[4] <- c("AMWEST")
airlines_arrival4
## airlines arrival_status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AMWEST on time 694 4840 383 320 201
## 4 AMWEST delayed 117 415 65 129 61
summary(airlines_arrival4)
## airlines arrival_status Los.Angeles Phoenix
## Length:4 Length:4 Min. : 62.0 Min. : 12.0
## Class :character Class :character 1st Qu.:103.2 1st Qu.: 168.8
## Mode :character Mode :character Median :307.0 Median : 318.0
## Mean :342.5 Mean :1372.0
## 3rd Qu.:546.2 3rd Qu.:1521.2
## Max. :694.0 Max. :4840.0
## San.Diego San.Francisco Seattle
## Min. : 20.00 Min. :102.0 Min. : 61
## 1st Qu.: 53.75 1st Qu.:122.2 1st Qu.: 166
## Median :138.50 Median :224.5 Median : 253
## Mean :170.00 Mean :263.5 Mean : 602
## 3rd Qu.:254.75 3rd Qu.:365.8 3rd Qu.: 689
## Max. :383.00 Max. :503.0 Max. :1841
# What city has the most arrivals all airlines combined? Answer. Phoenix appears to be have most arrivals or the most destination all airlines combined.
# Thus, Phoenix is the most visited city and probably collecting more dividends than other cities.
# What city has the least arrivals all airlines combined.? Answer: San Diego appears to have the least arrivals or the least destination all airlines combined.
# San Diego is the least visited, the mayor needs to work on making the city more attractive. Maybe trying to find out why most people go to Phoenix.
total_Los_Angeles <- sum(airlines_arrival4$Los.Angeles) # 1370
total_Phoenix <- sum(airlines_arrival4$Phoenix) # 5488
total_San.Diego <- sum(airlines_arrival4$San.Diego) # 680
total_San.Francisco <- sum(airlines_arrival4$San.Francisco) # 1054
total_Seattle <- sum(airlines_arrival4$Seattle) # 2408
total_arrivals <- c(total_Los_Angeles, total_Phoenix, total_San.Diego, total_San.Francisco, total_Seattle)
cities <- c('Los Angeles', 'Phoenix', 'San Diego', 'San Francisco', 'Seattle')
# Barplot to visualize the number
Cities_Arrivals <- data.frame(cities, total_arrivals)
View(Cities_Arrivals)
barplot(Cities_Arrivals$total_arrivals, names.arg = Cities_Arrivals$cities, main= "Total Arrivals per City", xlab = "Cities", ylab = "Total Arrivals", col = rainbow(6))
# another way to do a barplot
ggplot(data = Cities_Arrivals, aes(x= cities, y=total_arrivals)) +
geom_bar(stat="identity", fill = rainbow(5), width = 0.5)
#let call gather data in columns
airlines_arrival5 <- airlines_arrival4 %>%
gather(key = city, value = "total_arrivals", 3:7)
# looking at the view (airlines_arrival5), by filtering descending order, I can say Alaska has the least number of delay arrivals which is Phoenix, AMWEST has the most number of arrivals in time in Phoenix.