library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(nycflights23)
3.1 I could use the c() combine function to create a vector containing “BTV” and “SEA”. I could then test whether each flight’s destination is in this vector using “dest %in%”. Finally, I could put the “not” operator “!” in front of that condition inside filter() to keep only the rows that are going to neither “BTV” nor “SEA”.
# Keep every flight whose destination is neither "BTV" nor "SEA".
# `%in%` marks rows whose dest is one of the two codes; `!` inverts that mask.
# NOTE(review): despite its name, this object holds the flights that do NOT go
# to BTV or SEA, and no season filter is applied here.
btv_sea_flights_fall <- flights |>
  filter(!(dest %in% c("BTV", "SEA")))
glimpse(btv_sea_flights_fall)
## Rows: 426,667
## Columns: 19
## $ year <int> 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 1, 18, 31, 33, 36, 503, 520, 524, 537, 547, 549, 551, 5…
## $ sched_dep_time <int> 2038, 2300, 2344, 2140, 2048, 500, 510, 530, 520, 545, …
## $ dep_delay <dbl> 203, 78, 47, 173, 228, 3, 10, -6, 17, 2, -10, -9, -7, -…
## $ arr_time <int> 328, 228, 500, 238, 223, 808, 948, 645, 926, 845, 905, …
## $ sched_arr_time <int> 3, 135, 426, 2352, 2252, 815, 949, 710, 818, 852, 901, …
## $ arr_delay <dbl> 205, 53, 34, 166, 211, -7, -1, -25, 68, -7, 4, -13, -14…
## $ carrier <chr> "UA", "DL", "B6", "B6", "UA", "AA", "B6", "AA", "UA", "…
## $ flight <int> 628, 393, 371, 1053, 219, 499, 996, 981, 206, 225, 800,…
## $ tailnum <chr> "N25201", "N830DN", "N807JB", "N265JB", "N17730", "N925…
## $ origin <chr> "EWR", "JFK", "JFK", "JFK", "EWR", "EWR", "JFK", "EWR",…
## $ dest <chr> "SMF", "ATL", "BQN", "CHS", "DTW", "MIA", "BQN", "ORD",…
## $ air_time <dbl> 367, 108, 190, 108, 80, 154, 192, 119, 258, 157, 164, 1…
## $ distance <dbl> 2500, 760, 1576, 636, 488, 1085, 1576, 719, 1400, 1065,…
## $ hour <dbl> 20, 23, 23, 21, 20, 5, 5, 5, 5, 5, 5, 6, 5, 6, 6, 6, 6,…
## $ minute <dbl> 38, 0, 44, 40, 48, 0, 10, 30, 20, 45, 59, 0, 59, 0, 0, …
## $ time_hour <dttm> 2023-01-01 20:00:00, 2023-01-01 23:00:00, 2023-01-01 2…
3.2 The doctor’s data is biased because it only includes patients who are still alive (survivorship bias). This exclusion could ignore people who died of lung cancer, who are exactly the patients most relevant to the study. The remaining sample is therefore biased toward healthier patients, which would understate the effect being studied.
3.3
# One-row summary of `weather`: mean and standard deviation of wind speed
# (missing values removed before computing), plus the row count from n().
summary_windspeed <- weather |>
  summarize(
    mean = mean(wind_speed, na.rm = TRUE),
    std_dev = sd(wind_speed, na.rm = TRUE),
    count = n()
  )
summary_windspeed
## # A tibble: 1 × 3
## mean std_dev count
## <dbl> <dbl> <int>
## 1 9.43 5.27 26207
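The returned value corresponds to the number of rows (observations) in the weather data frame, here 26,207. Because n() counts rows rather than values, this total includes any rows where wind_speed is missing.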
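3.4 The code doesn’t work because it splits the summary across 2 separate summarize() calls chained together. The code works fine when you run only the mean, but breaks when you add the standard deviation line. This is because the first summarize() collapses the data into a single row containing only the mean column, so the original variable no longer exists when the second summarize() tries to compute its standard deviation. Both summaries need to be computed inside the same summarize() call.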
3.6
# Mean and standard deviation of wind speed for each value of `day`
# (missing values removed before computing).
# NOTE(review): `day` is the day of the month, so this yields one row per
# day number and pools the same day number across months. If one row per
# calendar date is wanted, group by month and day instead. Confirm intent.
day_speed <- weather |>
  group_by(day) |>
  summarize(
    mean = mean(wind_speed, na.rm = TRUE),
    std_dev = sd(wind_speed, na.rm = TRUE)
  )
day_speed
## # A tibble: 31 × 3
## day mean std_dev
## <int> <dbl> <dbl>
## 1 1 8.97 4.47
## 2 2 8.12 4.61
## 3 3 9.53 6.18
## 4 4 9.13 5.93
## 5 5 7.79 4.62
## 6 6 8.18 4.94
## 7 7 10.3 5.16
## 8 8 10.1 5.89
## 9 9 8.74 4.91
## 10 10 7.94 4.72
## # ℹ 21 more rows
3.7
# Number of flights for every month/origin combination.
# Grouping by month first means the result stays grouped by `month` after
# summarize() peels off the last grouping level (see the message below).
# NOTE(review): the object name is spelled `montly`; kept unchanged here.
by_origin_montly <- flights |>
  group_by(month, origin) |>
  summarize(count = n())
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
by_origin_montly
## # A tibble: 36 × 3
## # Groups: month [12]
## month origin count
## <int> <chr> <int>
## 1 1 EWR 11623
## 2 1 JFK 10918
## 3 1 LGA 13479
## 4 2 EWR 10991
## 5 2 JFK 10567
## 6 2 LGA 13203
## 7 3 EWR 12593
## 8 3 JFK 12158
## 9 3 LGA 14763
## 10 4 EWR 12022
## # ℹ 26 more rows
The resulting data frame is now ordered by month first (ascending) and then by origin, so each month’s flight counts for the three airports appear together.
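3.9 filter() decides which rows are kept based on conditions on their values, and it leaves the remaining rows unchanged. group_by() followed by summarize() instead splits the rows into groups and collapses each group into a single row of summary statistics. In other words, filter() decides which data to show, while group_by() with summarize() decides how the data are grouped and aggregated.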
3.11
# Naively recreate the delay columns by subtracting the scheduled clock time
# from the actual clock time, for comparison with the provided delay columns.
# NOTE(review): this reassigns `flights` in the workspace to a copy that
# carries the two extra columns.
flights <- flights |>
  mutate(
    new_dep_delay = dep_time - sched_dep_time,
    new_arr_delay = arr_time - sched_arr_time
  )
# Put each recreated column next to the original it is meant to reproduce.
newdelay <- flights |>
  select(new_dep_delay, dep_delay, new_arr_delay, arr_delay)
glimpse(newdelay)
## Rows: 435,352
## Columns: 4
## $ new_dep_delay <int> -2037, -2282, -2313, -2107, -2012, 3, 10, -6, 17, 2, -10…
## $ dep_delay <dbl> 203, 78, 47, 173, 228, 3, 10, -6, 17, 2, -10, -9, -7, -6…
## $ new_arr_delay <int> 325, 93, 74, -2114, -2029, -7, -1, -65, 108, -7, 4, -13,…
## $ arr_delay <dbl> 205, 53, 34, 166, 211, -7, -1, -25, 68, -7, 4, -13, -14,…
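From the data I can see, my recreated columns do very poorly when wrapping around midnight and a change in day (for example, -2037 instead of 203). They match the given columns only when the scheduled and actual times fall within the same hour. The times are stored as clock values (HHMM) rather than minutes, so subtracting them is also wrong whenever the two times fall in different hours: 645 - 710 gives -65, while the true delay is -25 minutes.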
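3.13 Because all of those other variables exist in both flights and weather. Excluding any of them would lead to them being included twice in the new joined data frame. More importantly, no single one of these variables uniquely identifies a weather record, so leaving any of them out would match each flight to weather rows from the wrong time or airport. You need to account for all key variables when joining data sets.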
3.20
# Attach plane details (via tail number) and the carrier's full name (via
# carrier code) to each flight, then keep only the columns needed below.
# Inner joins keep only flights that have a match in both lookup tables.
needed_data <- flights |>
  inner_join(planes, by = "tailnum") |>
  inner_join(airlines, by = "carrier") |>
  select(name, distance, seats)

# Seat miles for a single flight: distance flown times seats on the plane.
raw_seat_miles <- needed_data |>
  mutate(seat_miles = distance * seats) |>
  select(name, seat_miles)

# Total seat miles per carrier.
per_carrier <- raw_seat_miles |>
  group_by(name) |>
  summarize(seat_miles = sum(seat_miles))

# Print the carriers from most to fewest seat miles.
per_carrier |>
  arrange(desc(seat_miles))
## # A tibble: 13 × 2
## name seat_miles
## <chr> <dbl>
## 1 United Air Lines Inc. 18753552904
## 2 JetBlue Airways 18302094316
## 3 Delta Air Lines Inc. 15282765973
## 4 American Airlines Inc. 11173950579
## 5 Spirit Air Lines 3880566315
## 6 Republic Airline 3512338372
## 7 Alaska Airlines Inc. 2974953367
## 8 Endeavor Air Inc. 2501279760
## 9 Southwest Airlines Co. 1986242879
## 10 Hawaiian Airlines Inc. 683807124
## 11 SkyWest Airlines Inc. 325167024
## 12 Frontier Airlines Inc. 230428926
## 13 Envoy Air 16311790