#THE GOAL IS TO: #(1) Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. #You’re encouraged to use a “wide” structure similar to how the information appears above, so #that you can practice tidying and transformations as described below. #(2) Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy #and transform your data. #(3) Perform analysis to compare the arrival delays for the two airlines. #(4) Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative #descriptions of your data cleanup work, analysis, and conclusions. Please include in your #homework submission: # The URL to the .Rmd file in your GitHub repository. and # The URL for your rpubs.com web page.
MY PLAN:
B)IF this was python you would use describe to summarizes the count, mean Standard dev and min and max for the numeric variables
C)Look for discrepancies Min vs max, mean vs median, wide or small std dev
Indenitfy unique values in both columns and rows D)CLEANING YOUR DATASET - Remove redundant variables - Remove Variable selection - Remove outliers
PACKAGES
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ ggplot2 3.4.1 ✔ purrr 1.0.1
## ✔ tibble 3.2.1 ✔ stringr 1.5.0
## ✔ tidyr 1.3.0 ✔ forcats 1.0.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(stats)
READ IN DATA
flightinfo = read_csv("flightinfo_Data607_copy.csv")
## New names:
## Rows: 7 Columns: 8
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): ...2, ...3, ...4, ...5, ...6, ...7, ...8 lgl (1): Table 1
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
head(flightinfo)
## # A tibble: 6 × 8
## `Table 1` ...2 ...3 ...4 ...5 ...6 ...7 ...8
## <lgl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 NA <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 NA <NA> <NA> Los Angeles Phoenix San Diego San Francisco Seattle
## 3 NA ALASKA On time 497 221 212 503 1841
## 4 NA <NA> Delayed 62 12 20 102 305
## 5 NA <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 6 NA AM WEST On time 694 4840 383 320 201
head(flightinfo)
## # A tibble: 6 × 8
## `Table 1` ...2 ...3 ...4 ...5 ...6 ...7 ...8
## <lgl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 NA <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 NA <NA> <NA> Los Angeles Phoenix San Diego San Francisco Seattle
## 3 NA ALASKA On time 497 221 212 503 1841
## 4 NA <NA> Delayed 62 12 20 102 305
## 5 NA <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 6 NA AM WEST On time 694 4840 383 320 201
GET THE NUMBER THE NUMBER OF COLMUNS #I have 8 columuns and 6 rows
##I just realized I can not summarize Delayed by Airlines beacuse it its in a fdifferent lines
DESCRIBE/SUMMMARIES THE DATA
#SHORTCUT, I NEED TO LABLED THE ROW
rownames(flightinfo) = (LETTERS[1:7] )
## Warning: Setting row names on a tibble is deprecated.
flightinfo
## # A tibble: 7 × 8
## `Table 1` ...2 ...3 ...4 ...5 ...6 ...7 ...8
## * <lgl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 NA <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 NA <NA> <NA> Los Angeles Phoenix San Diego San Francisco Seattle
## 3 NA ALASKA On time 497 221 212 503 1841
## 4 NA <NA> Delayed 62 12 20 102 305
## 5 NA <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 6 NA AM WEST On time 694 4840 383 320 201
## 7 NA <NA> Delayed 117 415 65 129 61
#apparently tibble can not have custom names.We are still going to try. If I can’t change the row name, then I will have to convert df to a dataframe using as.data.frame
row.names(flightinfo) [2] = "Destination"
## Warning: Setting row names on a tibble is deprecated.
flightinfo
## # A tibble: 7 × 8
## `Table 1` ...2 ...3 ...4 ...5 ...6 ...7 ...8
## * <lgl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 NA <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 NA <NA> <NA> Los Angeles Phoenix San Diego San Francisco Seattle
## 3 NA ALASKA On time 497 221 212 503 1841
## 4 NA <NA> Delayed 62 12 20 102 305
## 5 NA <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 6 NA AM WEST On time 694 4840 383 320 201
## 7 NA <NA> Delayed 117 415 65 129 61
flight311 = as.data.frame(flightinfo)
flight311
## Table 1 ...2 ...3 ...4 ...5 ...6 ...7 ...8
## 1 NA <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 NA <NA> <NA> Los Angeles Phoenix San Diego San Francisco Seattle
## 3 NA ALASKA On time 497 221 212 503 1841
## 4 NA <NA> Delayed 62 12 20 102 305
## 5 NA <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 6 NA AM WEST On time 694 4840 383 320 201
## 7 NA <NA> Delayed 117 415 65 129 61
row.names(flight311) [2] = "Destination"
flight311
## Table 1 ...2 ...3 ...4 ...5 ...6 ...7
## 1 NA <NA> <NA> <NA> <NA> <NA> <NA>
## Destination NA <NA> <NA> Los Angeles Phoenix San Diego San Francisco
## 3 NA ALASKA On time 497 221 212 503
## 4 NA <NA> Delayed 62 12 20 102
## 5 NA <NA> <NA> <NA> <NA> <NA> <NA>
## 6 NA AM WEST On time 694 4840 383 320
## 7 NA <NA> Delayed 117 415 65 129
## ...8
## 1 <NA>
## Destination Seattle
## 3 1841
## 4 305
## 5 <NA>
## 6 201
## 7 61
#row.names(flight311) [3] = "ALASKA Airline"
#row.names(flight311) [4] = "ALASKA Airline"
#row.names(flight311) [6] = "AM WEST Airline"
#row.names(flight311) [7] = "AM WEST Airline"
didn’t work
flight311 = as.data.frame(t(flight311))
flight311
## 1 Destination 3 4 5 6 7
## Table 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## ...2 <NA> <NA> ALASKA <NA> <NA> AM WEST <NA>
## ...3 <NA> <NA> On time Delayed <NA> On time Delayed
## ...4 <NA> Los Angeles 497 62 <NA> 694 117
## ...5 <NA> Phoenix 221 12 <NA> 4840 415
## ...6 <NA> San Diego 212 20 <NA> 383 65
## ...7 <NA> San Francisco 503 102 <NA> 320 129
## ...8 <NA> Seattle 1841 305 <NA> 201 61
colnames(flight311)[4] = "ALASKA Airline"
flight311
## 1 Destination 3 ALASKA Airline 5 6 7
## Table 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## ...2 <NA> <NA> ALASKA <NA> <NA> AM WEST <NA>
## ...3 <NA> <NA> On time Delayed <NA> On time Delayed
## ...4 <NA> Los Angeles 497 62 <NA> 694 117
## ...5 <NA> Phoenix 221 12 <NA> 4840 415
## ...6 <NA> San Diego 212 20 <NA> 383 65
## ...7 <NA> San Francisco 503 102 <NA> 320 129
## ...8 <NA> Seattle 1841 305 <NA> 201 61
colnames(flight311)[6] = "AM WEST Airline"
colnames(flight311)[7] = "AM WEST Airline"
row.names(flight311)[3] = "Scheduled Gate arrival"
flight311
## 1 Destination 3 ALASKA Airline 5
## Table 1 <NA> <NA> <NA> <NA> <NA>
## ...2 <NA> <NA> ALASKA <NA> <NA>
## Scheduled Gate arrival <NA> <NA> On time Delayed <NA>
## ...4 <NA> Los Angeles 497 62 <NA>
## ...5 <NA> Phoenix 221 12 <NA>
## ...6 <NA> San Diego 212 20 <NA>
## ...7 <NA> San Francisco 503 102 <NA>
## ...8 <NA> Seattle 1841 305 <NA>
## AM WEST Airline AM WEST Airline
## Table 1 <NA> <NA>
## ...2 AM WEST <NA>
## Scheduled Gate arrival On time Delayed
## ...4 694 117
## ...5 4840 415
## ...6 383 65
## ...7 320 129
## ...8 201 61
colnames(flight311)[3] = "ALASKA Airline_ONTIME"
colnames(flight311)[4] = "ALASKA Airline_DELAY"
colnames(flight311)[6] = "AM WEST Airline_ONTIME"
colnames(flight311)[7] = "AM WEST Airline_DELAY"
rownames(flight311) = c(1:8)
flight311
## 1 Destination ALASKA Airline_ONTIME ALASKA Airline_DELAY 5
## 1 <NA> <NA> <NA> <NA> <NA>
## 2 <NA> <NA> ALASKA <NA> <NA>
## 3 <NA> <NA> On time Delayed <NA>
## 4 <NA> Los Angeles 497 62 <NA>
## 5 <NA> Phoenix 221 12 <NA>
## 6 <NA> San Diego 212 20 <NA>
## 7 <NA> San Francisco 503 102 <NA>
## 8 <NA> Seattle 1841 305 <NA>
## AM WEST Airline_ONTIME AM WEST Airline_DELAY
## 1 <NA> <NA>
## 2 AM WEST <NA>
## 3 On time Delayed
## 4 694 117
## 5 4840 415
## 6 383 65
## 7 320 129
## 8 201 61
flight311 = flight311[-1,]
#I need to make a Carrier Columns and put the airlines under that and double the Destinations #first let’s see if I can find speific data
ALASKAflights=flight311[,c(2,3,4)]
ALASKAflights
## Destination ALASKA Airline_ONTIME ALASKA Airline_DELAY
## 2 <NA> ALASKA <NA>
## 3 <NA> On time Delayed
## 4 Los Angeles 497 62
## 5 Phoenix 221 12
## 6 San Diego 212 20
## 7 San Francisco 503 102
## 8 Seattle 1841 305
AMWESTflights=flight311[,c(2,6,7)]
AMWESTflights
## Destination AM WEST Airline_ONTIME AM WEST Airline_DELAY
## 2 <NA> AM WEST <NA>
## 3 <NA> On time Delayed
## 4 Los Angeles 694 117
## 5 Phoenix 4840 415
## 6 San Diego 383 65
## 7 San Francisco 320 129
## 8 Seattle 201 61
#df3 = rbind.data.frame(AMWESTflights,ALASKAflights)
#I am trying to create a data frame that basically have a column with that has 1) ontime arrivals, #2)delays arrivals, , 3) carrier ( says AMWEST or ALASKA) 4) DESINTATIONS and match # SO right here I am imaging that I am trying to combined AM WEST FLIGHTS on top of ALASKA. I imagine each destination would pop up twice correlating the correct ontime flights and delays flights for each respective airline to its respective destination
AMWESTflights$CARRIER = c("AM WEST","AM WEST","AM WEST","AM WEST","AM WEST","AM WEST","")
AMWESTflights
## Destination AM WEST Airline_ONTIME AM WEST Airline_DELAY CARRIER
## 2 <NA> AM WEST <NA> AM WEST
## 3 <NA> On time Delayed AM WEST
## 4 Los Angeles 694 117 AM WEST
## 5 Phoenix 4840 415 AM WEST
## 6 San Diego 383 65 AM WEST
## 7 San Francisco 320 129 AM WEST
## 8 Seattle 201 61
ALASKAflights$CARRIER = c("ALASKA","ALASKA","ALASKA","ALASKA","ALASKA","ALASKA","")
ALASKAflights
## Destination ALASKA Airline_ONTIME ALASKA Airline_DELAY CARRIER
## 2 <NA> ALASKA <NA> ALASKA
## 3 <NA> On time Delayed ALASKA
## 4 Los Angeles 497 62 ALASKA
## 5 Phoenix 221 12 ALASKA
## 6 San Diego 212 20 ALASKA
## 7 San Francisco 503 102 ALASKA
## 8 Seattle 1841 305
colnames(ALASKAflights)[2] = "ARRIVALS_ONTIME"
colnames(ALASKAflights)[3] = "ARRIVAL_DELAY"
ALASKAflights
## Destination ARRIVALS_ONTIME ARRIVAL_DELAY CARRIER
## 2 <NA> ALASKA <NA> ALASKA
## 3 <NA> On time Delayed ALASKA
## 4 Los Angeles 497 62 ALASKA
## 5 Phoenix 221 12 ALASKA
## 6 San Diego 212 20 ALASKA
## 7 San Francisco 503 102 ALASKA
## 8 Seattle 1841 305
colnames(AMWESTflights)[2] = "ARRIVALS_ONTIME"
colnames(AMWESTflights)[3] = "ARRIVAL_DELAY"
AMWESTflights
## Destination ARRIVALS_ONTIME ARRIVAL_DELAY CARRIER
## 2 <NA> AM WEST <NA> AM WEST
## 3 <NA> On time Delayed AM WEST
## 4 Los Angeles 694 117 AM WEST
## 5 Phoenix 4840 415 AM WEST
## 6 San Diego 383 65 AM WEST
## 7 San Francisco 320 129 AM WEST
## 8 Seattle 201 61
#Try rbind again
df3 = rbind.data.frame(AMWESTflights,ALASKAflights)
df3
## Destination ARRIVALS_ONTIME ARRIVAL_DELAY CARRIER
## 2 <NA> AM WEST <NA> AM WEST
## 3 <NA> On time Delayed AM WEST
## 4 Los Angeles 694 117 AM WEST
## 5 Phoenix 4840 415 AM WEST
## 6 San Diego 383 65 AM WEST
## 7 San Francisco 320 129 AM WEST
## 8 Seattle 201 61
## 21 <NA> ALASKA <NA> ALASKA
## 31 <NA> On time Delayed ALASKA
## 41 Los Angeles 497 62 ALASKA
## 51 Phoenix 221 12 ALASKA
## 61 San Diego 212 20 ALASKA
## 71 San Francisco 503 102 ALASKA
## 81 Seattle 1841 305
df3[c(1,7),]
## Destination ARRIVALS_ONTIME ARRIVAL_DELAY CARRIER
## 2 <NA> AM WEST <NA> AM WEST
## 8 Seattle 201 61
#Ok I just checked the rows I need to take out
df3[-c(1,7),]
## Destination ARRIVALS_ONTIME ARRIVAL_DELAY CARRIER
## 3 <NA> On time Delayed AM WEST
## 4 Los Angeles 694 117 AM WEST
## 5 Phoenix 4840 415 AM WEST
## 6 San Diego 383 65 AM WEST
## 7 San Francisco 320 129 AM WEST
## 21 <NA> ALASKA <NA> ALASKA
## 31 <NA> On time Delayed ALASKA
## 41 Los Angeles 497 62 ALASKA
## 51 Phoenix 221 12 ALASKA
## 61 San Diego 212 20 ALASKA
## 71 San Francisco 503 102 ALASKA
## 81 Seattle 1841 305
df4 = df3[-c(1,7),]
df4
## Destination ARRIVALS_ONTIME ARRIVAL_DELAY CARRIER
## 3 <NA> On time Delayed AM WEST
## 4 Los Angeles 694 117 AM WEST
## 5 Phoenix 4840 415 AM WEST
## 6 San Diego 383 65 AM WEST
## 7 San Francisco 320 129 AM WEST
## 21 <NA> ALASKA <NA> ALASKA
## 31 <NA> On time Delayed ALASKA
## 41 Los Angeles 497 62 ALASKA
## 51 Phoenix 221 12 ALASKA
## 61 San Diego 212 20 ALASKA
## 71 San Francisco 503 102 ALASKA
## 81 Seattle 1841 305
print(sapply(df4,class))
## Destination ARRIVALS_ONTIME ARRIVAL_DELAY CARRIER
## "character" "character" "character" "character"
#transform(df4$ARRIVALS_ONTIME)= as.numeric(df4$ARRIVALS_ONTIME)
#df4
#install.packages("skimr")
library(skimr)
skim(df4)
| Name | df4 |
| Number of rows | 12 |
| Number of columns | 4 |
| _______________________ | |
| Column type frequency: | |
| character | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Destination | 3 | 0.75 | 7 | 13 | 0 | 5 | 0 |
| ARRIVALS_ONTIME | 0 | 1.00 | 3 | 7 | 0 | 11 | 0 |
| ARRIVAL_DELAY | 1 | 0.92 | 2 | 7 | 0 | 10 | 0 |
| CARRIER | 0 | 1.00 | 0 | 7 | 1 | 3 | 0 |
skim(df4$ARRIVALS_ONTIME)
| Name | df4$ARRIVALS_ONTIME |
| Number of rows | 12 |
| Number of columns | 1 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| data | 0 | 1 | 3 | 7 | 0 | 11 | 0 |
skim(df4$ARRIVAL_DELAY)
| Name | df4$ARRIVAL_DELAY |
| Number of rows | 12 |
| Number of columns | 1 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| data | 1 | 0.92 | 2 | 7 | 0 | 10 | 0 |
df4
## Destination ARRIVALS_ONTIME ARRIVAL_DELAY CARRIER
## 3 <NA> On time Delayed AM WEST
## 4 Los Angeles 694 117 AM WEST
## 5 Phoenix 4840 415 AM WEST
## 6 San Diego 383 65 AM WEST
## 7 San Francisco 320 129 AM WEST
## 21 <NA> ALASKA <NA> ALASKA
## 31 <NA> On time Delayed ALASKA
## 41 Los Angeles 497 62 ALASKA
## 51 Phoenix 221 12 ALASKA
## 61 San Diego 212 20 ALASKA
## 71 San Francisco 503 102 ALASKA
## 81 Seattle 1841 305
df5 = df3[-c(1,7),]
df5
## Destination ARRIVALS_ONTIME ARRIVAL_DELAY CARRIER
## 3 <NA> On time Delayed AM WEST
## 4 Los Angeles 694 117 AM WEST
## 5 Phoenix 4840 415 AM WEST
## 6 San Diego 383 65 AM WEST
## 7 San Francisco 320 129 AM WEST
## 21 <NA> ALASKA <NA> ALASKA
## 31 <NA> On time Delayed ALASKA
## 41 Los Angeles 497 62 ALASKA
## 51 Phoenix 221 12 ALASKA
## 61 San Diego 212 20 ALASKA
## 71 San Francisco 503 102 ALASKA
## 81 Seattle 1841 305
skim(df5)
| Name | df5 |
| Number of rows | 12 |
| Number of columns | 4 |
| _______________________ | |
| Column type frequency: | |
| character | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Destination | 3 | 0.75 | 7 | 13 | 0 | 5 | 0 |
| ARRIVALS_ONTIME | 0 | 1.00 | 3 | 7 | 0 | 11 | 0 |
| ARRIVAL_DELAY | 1 | 0.92 | 2 | 7 | 0 | 10 | 0 |
| CARRIER | 0 | 1.00 | 0 | 7 | 1 | 3 | 0 |
skim(df5$ARRIVALS_ONTIME)
| Name | df5$ARRIVALS_ONTIME |
| Number of rows | 12 |
| Number of columns | 1 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| data | 0 | 1 | 3 | 7 | 0 | 11 | 0 |
df5[,c(2,3)] = sapply(df5[,c(2,3)], as.numeric)
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
df5
## Destination ARRIVALS_ONTIME ARRIVAL_DELAY CARRIER
## 3 <NA> NA NA AM WEST
## 4 Los Angeles 694 117 AM WEST
## 5 Phoenix 4840 415 AM WEST
## 6 San Diego 383 65 AM WEST
## 7 San Francisco 320 129 AM WEST
## 21 <NA> NA NA ALASKA
## 31 <NA> NA NA ALASKA
## 41 Los Angeles 497 62 ALASKA
## 51 Phoenix 221 12 ALASKA
## 61 San Diego 212 20 ALASKA
## 71 San Francisco 503 102 ALASKA
## 81 Seattle 1841 305
summary(df5$ARRIVALS_ONTIME)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 212 320 497 1057 694 4840 3
summary(df5$ARRIVAL_DELAY)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 12.0 62.0 102.0 136.3 129.0 415.0 3
Q- get a summary per airline? and/or per place
df5 %>%
group_by(df5$ARRIVAL_DELAY) %>%
summarize( mean_a= mean(df5$ARRIVAL_DELAY,
sum_a = sum(df5$ARRIVAL_DELAY)
))
## # A tibble: 10 × 2
## `df5$ARRIVAL_DELAY` mean_a
## <dbl> <dbl>
## 1 12 NA
## 2 20 NA
## 3 62 NA
## 4 65 NA
## 5 102 NA
## 6 117 NA
## 7 129 NA
## 8 305 NA
## 9 415 NA
## 10 NA NA
df5 %>%
group_by(df5$ARRIVAL_DELAY,df5$CARRIER) %>%
summarize()
## `summarise()` has grouped output by 'df5$ARRIVAL_DELAY'. You can override using
## the `.groups` argument.
## # A tibble: 11 × 2
## # Groups: df5$ARRIVAL_DELAY [10]
## `df5$ARRIVAL_DELAY` `df5$CARRIER`
## <dbl> <chr>
## 1 12 "ALASKA"
## 2 20 "ALASKA"
## 3 62 "ALASKA"
## 4 65 "AM WEST"
## 5 102 "ALASKA"
## 6 117 "AM WEST"
## 7 129 "AM WEST"
## 8 305 ""
## 9 415 "AM WEST"
## 10 NA "ALASKA"
## 11 NA "AM WEST"
skim(df5)
| Name | df5 |
| Number of rows | 12 |
| Number of columns | 4 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Destination | 3 | 0.75 | 7 | 13 | 0 | 5 | 0 |
| CARRIER | 0 | 1.00 | 0 | 7 | 1 | 3 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| ARRIVALS_ONTIME | 3 | 0.75 | 1056.78 | 1503.68 | 212 | 320 | 497 | 694 | 4840 | ▇▁▁▁▁ |
| ARRIVAL_DELAY | 3 | 0.75 | 136.33 | 135.70 | 12 | 62 | 102 | 129 | 415 | ▇▆▁▂▂ |
df5 %>%
group_by(CARRIER)
## # A tibble: 12 × 4
## # Groups: CARRIER [3]
## Destination ARRIVALS_ONTIME ARRIVAL_DELAY CARRIER
## <chr> <dbl> <dbl> <chr>
## 1 <NA> NA NA "AM WEST"
## 2 Los Angeles 694 117 "AM WEST"
## 3 Phoenix 4840 415 "AM WEST"
## 4 San Diego 383 65 "AM WEST"
## 5 San Francisco 320 129 "AM WEST"
## 6 <NA> NA NA "ALASKA"
## 7 <NA> NA NA "ALASKA"
## 8 Los Angeles 497 62 "ALASKA"
## 9 Phoenix 221 12 "ALASKA"
## 10 San Diego 212 20 "ALASKA"
## 11 San Francisco 503 102 "ALASKA"
## 12 Seattle 1841 305 ""
df5 %>%
group_by(CARRIER) %>%
group_by(ARRIVAL_DELAY) %>%
summarize( mean = mean(df5$ARRIVAL_DELAY), max = max(df5$ARRIVAL_DELAY), std = sd(df5$ARRIVAL_DELAY))
## # A tibble: 10 × 4
## ARRIVAL_DELAY mean max std
## <dbl> <dbl> <dbl> <dbl>
## 1 12 NA NA NA
## 2 20 NA NA NA
## 3 62 NA NA NA
## 4 65 NA NA NA
## 5 102 NA NA NA
## 6 117 NA NA NA
## 7 129 NA NA NA
## 8 305 NA NA NA
## 9 415 NA NA NA
## 10 NA NA NA NA
ggplot(df5, aes(x = df5$ARRIVAL_DELAY))+
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).
#df5$ARRIVAL_DELAY %>%
# group_by(CARRIER) %>%
# ggplot(df5$ARRIVAL_DELAY, x = CARRIER)
CLEAN UP DATA
ANAYLYZE RELATIONSHIP BETWEEN VARIABLEs - scatterplot = correlation matrix - histogram
#Create the csv on excel
#DOWNLOAD PACKAGES
#Read in the data
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.