library(tidyverse)
library(openintro)
library(tidyr)
library(dplyr)
library(tidyverse)
library(ggplot2)
library(readr)
# Load the airline arrival counts straight from the public GitHub copy so the
# script runs on any machine. The earlier read from a local Windows path was
# not portable, had a trailing space in the file name, and its result was
# overwritten by this read anyway.
flights <- read.csv("https://raw.githubusercontent.com/tanzil64/Data-607-Assignment-05/main/tanzildata.csv")
# Display the dataset
print(flights)
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA
## 4 AMWEST on time 694 4,840 383 320 201
## 5 delayed 117 415 65 129 61
# Name the first two columns (read in as X and X.1): carrier and arrival status
colnames(flights)[1:2] <- c("airline", "status")
# Keep a data.frame copy to work on, then show it
df <- data.frame(flights)
df
## airline status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA
## 4 AMWEST on time 694 4,840 383 320 201
## 5 delayed 117 415 65 129 61
## airline status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 delayed 62 12 20 102 305
## 3 AMWEST on time 694 4,840 383 320 201
## 4 delayed 117 415 65 129 61
## airline status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AMWEST on time 694 4,840 383 320 201
## 4 AMWEST delayed 117 415 65 129 61
## airline status city frequency
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AMWEST on time Los.Angeles 694
## 4 AMWEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AMWEST on time Phoenix 4,840
## 8 AMWEST delayed Phoenix 415
## 9 ALASKA on time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AMWEST on time San.Diego 383
## 12 AMWEST delayed San.Diego 65
## 13 ALASKA on time San.Francisco 503
## 14 ALASKA delayed San.Francisco 102
## 15 AMWEST on time San.Francisco 320
## 16 AMWEST delayed San.Francisco 129
## 17 ALASKA on time Seattle 1,841
## 18 ALASKA delayed Seattle 305
## 19 AMWEST on time Seattle 201
## 20 AMWEST delayed Seattle 61
## airline city delayed on time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1,841
## 6 AMWEST Los.Angeles 117 694
## 7 AMWEST Phoenix 415 4,840
## 8 AMWEST San.Diego 65 383
## 9 AMWEST San.Francisco 129 320
## 10 AMWEST Seattle 61 201
## airline city delayed on time
## 1 ALASKA Los Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San Diego 20 212
## 4 ALASKA San Francisco 102 503
## 5 ALASKA Seattle 305 1,841
## 6 AMWEST Los Angeles 117 694
## 7 AMWEST Phoenix 415 4,840
## 8 AMWEST San Diego 65 383
## 9 AMWEST San Francisco 129 320
## 10 AMWEST Seattle 61 201
library(dplyr)
library(ggplot2)
library(dplyr)
library(ggplot2)
# Coerce the delayed counts to numeric before aggregating
df1[["delayed"]] <- as.numeric(df1[["delayed"]])
# Total delayed flights per airline (missing values are ignored in the sum)
delayed_summary <- summarise(
  group_by(df1, airline),
  total_delayed = sum(delayed, na.rm = TRUE)
)
# Show the per-airline totals
print(delayed_summary)
## # A tibble: 2 × 2
## airline total_delayed
## <chr> <dbl>
## 1 ALASKA 501
## 2 AMWEST 787
# Bar chart: one bar per airline, bar height = total delayed flights
ggplot(
  delayed_summary,
  aes(x = airline, y = total_delayed, fill = airline)
) +
  geom_col() +
  labs(
    title = "Total Number of Delayed Flights by Airline",
    x = "Airline",
    y = "Total Delayed Flights"
  ) +
  theme_minimal()
## airline city delayed on_time
## 1 ALASKA Los Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San Diego 20 212
## 4 ALASKA San Francisco 102 503
## 5 ALASKA Seattle 305 1,841
## 6 AMWEST Los Angeles 117 694
## 7 AMWEST Phoenix 415 4,840
## 8 AMWEST San Diego 65 383
## 9 AMWEST San Francisco 129 320
## 10 AMWEST Seattle 61 201
library(dplyr)
library(ggplot2)
# Convert the 'on_time' counts to numeric. The raw values use a comma as the
# thousands separator (e.g. "1,841"), so the commas must be stripped first;
# otherwise as.numeric() returns NA for those rows (with a coercion warning)
# and the per-airline totals come out too low.
df1$on_time <- as.numeric(gsub(",", "", df1$on_time, fixed = TRUE))
# Summarize the total number of on-time flights by airline
on_time_summary <- df1 %>%
  group_by(airline) %>%
  summarise(total_on_time = sum(on_time, na.rm = TRUE))
# Print the summary
print(on_time_summary)
## # A tibble: 2 × 2
## airline total_on_time
## <chr> <dbl>
## 1 ALASKA 3274
## 2 AMWEST 6438
# Bar chart: one bar per airline, bar height = total on-time flights
ggplot(
  on_time_summary,
  aes(x = airline, y = total_on_time, fill = airline)
) +
  geom_col() +
  labs(
    title = "Total Number of On-Time Flights by Airline",
    x = "Airline",
    y = "Total On-Time Flights"
  ) +
  theme_minimal()
## airline city delayed on_time
## Length:10 Length:10 Min. : 12.00 Min. :201.0
## Class :character Class :character 1st Qu.: 61.25 1st Qu.:218.8
## Mode :character Mode :character Median : 83.50 Median :351.5
## Mean :128.80 Mean :378.9
## 3rd Qu.:126.00 3rd Qu.:498.5
## Max. :415.00 Max. :694.0
## NA's :2
library(ggplot2)
library(tidyr)
# Sample data frame of flight counts per airline, arrival status and city.
# Column `x` is the airline and `x_1` the arrival status. The all-NA row
# presumably mirrors the blank separator row of the raw file.
df1 <- data.frame(
  # Airline name on every row; the delayed rows previously carried the status
  # string "delayed" here instead of the airline they belong to.
  x = c("AMWEST", "AMWEST", "ALASKA", NA, "ALASKA"),
  x_1 = c("on time", "delayed", "on time", NA, "delayed"),
  los_angeles = c(694, 117, 497, NA, 62),
  phoenix = c(4840, 415, 221, NA, 12),
  san_diego = c(383, 65, 212, NA, 20),
  san_francisco = c(320, 129, 503, NA, 102),
  seattle = c(201, 61, 1841, NA, 305)
)
# Reshape the data frame to long format: one row per airline/status/city.
# The value column keeps the name `delay`, although it holds flight counts
# for both statuses.
# NOTE(review): phoenix and seattle are not pivoted and therefore do not
# appear in the chart -- confirm this is intended.
df_long <- df1 %>%
  pivot_longer(
    cols = c(los_angeles, san_diego, san_francisco),
    names_to = "city",
    values_to = "delay"
  )
# Filter out rows with NA values in 'x_1' or 'delay' (drops the separator row)
df_long <- df_long %>%
  filter(!is.na(x_1) & !is.na(delay))
# Create the ggplot. Every city/status pair occurs once per airline, so the
# chart is faceted by airline; without the facets the dodged bars of the two
# airlines are drawn on top of each other at the same position.
ggplot(df_long, aes(x = city, y = delay, fill = x_1)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~x) +
  labs(
    title = "Flights by City and Status",
    x = "City",
    y = "Number of Flights",
    fill = "Status"
  ) +
  theme_minimal()
## x x_1 los_angeles phoenix
## Length:5 Length:5 Min. : 62.0 Min. : 12.0
## Class :character Class :character 1st Qu.:103.2 1st Qu.: 168.8
## Mode :character Mode :character Median :307.0 Median : 318.0
## Mean :342.5 Mean :1372.0
## 3rd Qu.:546.2 3rd Qu.:1521.2
## Max. :694.0 Max. :4840.0
## NA's :1 NA's :1
## san_diego san_francisco seattle
## Min. : 20.00 Min. :102.0 Min. : 61
## 1st Qu.: 53.75 1st Qu.:122.2 1st Qu.: 166
## Median :138.50 Median :224.5 Median : 253
## Mean :170.00 Mean :263.5 Mean : 602
## 3rd Qu.:254.75 3rd Qu.:365.8 3rd Qu.: 689
## Max. :383.00 Max. :503.0 Max. :1841
## NA's :1 NA's :1 NA's :1