Libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(nycflights13)
Use the flights data from package nycflights13
to
answer the following questions
How many flights arrived late each month? The late arrival is the
flight arriving more than 5 minutes after its scheduled arrival
time.
lateflights <- flights %>%
filter(arr_delay>5) %>%
group_by(month) %>%
summarise(lateflights = n()) %>%
ungroup()
lateflights
## # A tibble: 12 × 2
## month lateflights
## <int> <int>
## 1 1 8988
## 2 2 8119
## 3 3 9033
## 4 4 10544
## 5 5 8490
## 6 6 10739
## 7 7 11518
## 8 8 9649
## 9 9 5347
## 10 10 7628
## 11 11 7485
## 12 12 12291
What percentage of traffic did each carrier represent, by
month?
#Calculate total number of flights by carrier and month
total_flights_per_carrier <- flights %>%
group_by(month, carrier) %>%
summarise(total_flights_by_carrier = n())
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
#Calculate total number of flights per month
total_flights_per_month <- flights %>%
group_by(month) %>%
summarise(total_flights_per_month = n())
#Join the two dataset and calculate the percentage
percentage_traffic_per_carrier <- total_flights_per_carrier %>%
left_join(total_flights_per_month, by = "month") %>%
mutate(percentage_traffic = (total_flights_by_carrier / total_flights_per_month)) %>%
mutate(percentage_traffic = scales::percent(percentage_traffic)) %>%
select(month, carrier, percentage_traffic)
#Transpose and print the result
percentage_traffic_per_carrier %>% spread(key = month, value = percentage_traffic)
## # A tibble: 16 × 13
## carrier `1` `2` `3` `4` `5` `6` `7` `8` `9` `10` `11`
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 9E 5.825% 5.8475% 5.64… 5.33… 5.07… 5.08… 5.07… 4.96… 5.58… 5.79… 5.84…
## 2 AA 10.347% 10.087… 9.66… 9.60… 9.73… 9.76… 9.79… 9.73… 9.47… 9.39… 9.45…
## 3 AS 0.230% 0.2244% 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.19…
## 4 B6 16.394% 16.444… 16.5… 15.9… 15.8… 16.3… 16.9… 16.8… 15.5… 15.0… 15.7…
## 5 DL 13.665% 13.803… 14.5… 14.4… 14.1… 14.6… 14.4… 14.7… 14.0… 14.1… 14.1…
## 6 EV 15.446% 15.338… 16.3… 16.1… 16.7… 15.7… 15.7… 15.5… 17.1… 16.9… 16.3…
## 7 F9 0.218% 0.1964% 0.19… 0.20… 0.20… 0.19… 0.19… 0.18… 0.21… 0.19… 0.22…
## 8 FL 1.215% 1.1863% 1.09… 1.09… 1.12… 0.89… 0.89… 0.89… 0.92… 0.81… 0.74…
## 9 HA 0.115% 0.1122% 0.10… 0.10… 0.10… 0.10… 0.10… 0.10… 0.09… 0.07… 0.09…
## 10 MQ 8.410% 8.1921% 7.82… 7.80… 7.93… 7.71… 7.68… 7.71… 8.00… 7.71… 7.54…
## 11 OO 0.004% <NA> <NA> <NA> <NA> 0.00… <NA> 0.01… 0.07… <NA> 0.01…
## 12 UA 17.172% 17.418… 17.2… 17.8… 17.2… 17.6… 17.2… 17.4… 17.0… 17.5… 17.8…
## 13 US 5.932% 6.2202% 5.96… 6.09… 6.19… 6.14… 6.07… 6.06… 6.15… 6.39… 6.23…
## 14 VX 1.170% 1.0861% 1.05… 1.64… 1.72… 1.70… 1.66… 1.66… 1.64… 1.63… 1.65…
## 15 WN 3.688% 3.6512% 3.46… 3.45… 3.49… 3.64… 3.65… 3.57… 3.66… 3.77… 3.78…
## 16 YV 0.170% 0.1924% 0.06… 0.13… 0.17… 0.17… 0.27… 0.22… 0.15… 0.22… 0.18…
## # ℹ 1 more variable: `12` <chr>
What was the latest flight to depart each month?
flights %>%
group_by(month) %>%
filter(dep_delay == max(dep_delay, na.rm = TRUE)) %>%
arrange(-desc(month))
## # A tibble: 12 × 19
## # Groups: month [12]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 2 10 2243 830 853 100 1106
## 3 2013 3 17 2321 810 911 135 1020
## 4 2013 4 10 1100 1900 960 1342 2211
## 5 2013 5 3 1133 2055 878 1250 2215
## 6 2013 6 15 1432 1935 1137 1607 2120
## 7 2013 7 22 845 1600 1005 1044 1815
## 8 2013 8 8 2334 1454 520 120 1710
## 9 2013 9 20 1139 1845 1014 1457 2210
## 10 2013 10 14 2042 900 702 2255 1127
## 11 2013 11 3 603 1645 798 829 1913
## 12 2013 12 5 756 1700 896 1058 2020
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
Use the multipleChoiceResponses1.cvs
to answer the
following questions:
Import dataset
library(readr)
multipleChoiceResponses <- read_csv("C:/Users/Admin/OneDrive - 亞洲大學[Asia University]/Financial Database Mana & Application/Data/multipleChoiceResponses1.csv")
## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl (1): Age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Compute the number of total reponses and number of responses which
are at least useful
#Calculate the number of total responses by learning platform
total_usefulness_by_platform <- usefulness_by_platform %>% group_by(learning_platform) %>% summarise(tot = n())
#Calculate the number of useful responses by learning platform
usefulness_count <- usefulness_by_platform %>% filter(!grepl("Not Useful",usefulness,ignore.case = TRUE)) %>%
group_by(learning_platform) %>% summarise(count = n())
#Calculate the percentage of usefulness for each learning platform
perc_usefulness <- usefulness_count %>% left_join(total_usefulness_by_platform, by = "learning_platform") %>%
mutate(perc_usefulness = count/tot) %>%
mutate(perc_usefulness = round(perc_usefulness, digits = 3))
#Print the result
perc_usefulness
## # A tibble: 18 × 4
## learning_platform count tot perc_usefulness
## <chr> <int> <int> <dbl>
## 1 Arxiv 2354 2391 0.985
## 2 Blogs 4720 4765 0.991
## 3 College 3258 3359 0.97
## 4 Communities 1126 1142 0.986
## 5 Company 940 981 0.958
## 6 Conferences 2063 2182 0.945
## 7 Courses 5945 5992 0.992
## 8 Documentation 2279 2321 0.982
## 9 Friends 1530 1581 0.968
## 10 Kaggle 6527 6583 0.991
## 11 Newsletters 1033 1089 0.949
## 12 Podcasts 1090 1214 0.898
## 13 Projects 4755 4794 0.992
## 14 SO 5576 5640 0.989
## 15 Textbook 4112 4181 0.983
## 16 TradeBook 324 333 0.973
## 17 Tutoring 1394 1426 0.978
## 18 YouTube 5125 5229 0.98
Visualization
#Change platforms into factors
perc_usefulness_fct <- perc_usefulness %>%
mutate(learning_platform = fct(learning_platform) %>%
fct_reorder(perc_usefulness) %>%
fct_rev() %>%
fct_relevel("Courses", after = 0))
#Plot the percentage of usefulness of each platform
perc_usefulness_fct %>% ggplot(aes(x = learning_platform, y = perc_usefulness))+
geom_segment(aes(xend = learning_platform, yend=0))+
geom_point()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(
x = "Learning Platform",
y = "Percent finding at least somewhat useful"
)+
scale_y_continuous(labels = scales::percent_format(scale = 100, suffix = "%"))
