final

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(nycflights13)

## Warning: package 'nycflights13' was built under R version 4.3.2

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(scales)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3     ✔ tibble  3.2.1
## ✔ purrr   1.0.2     ✔ tidyr   1.3.0
## ✔ readr   2.1.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#1
# Keep a separately named reference to the flights table for this question
flights_data <- flights

# Per month, count the flights whose arr_delay is greater than 5.
# Rows with a missing arr_delay compare to NA and are ignored by na.rm = TRUE.
late_flights_summary <- summarize(
  group_by(flights_data, month),
  lateflights = sum(arr_delay > 5, na.rm = TRUE)
)

# Show the monthly counts
late_flights_summary

## # A tibble: 12 × 2
##    month lateflights
##    <int>       <int>
##  1     1        8988
##  2     2        8119
##  3     3        9033
##  4     4       10544
##  5     5        8490
##  6     6       10739
##  7     7       11518
##  8     8        9649
##  9     9        5347
## 10    10        7628
## 11    11        7485
## 12    12       12291

#2
# Total number of flights in each month (one row per month, column `total`).
total_flights_per_month <- flights %>%
  count(month, name = "total")

# Number of flights per carrier and month (column `count`).
# count() returns an ungrouped tibble, so no leftover `carrier` grouping is
# carried into later steps and no `.groups` message is emitted, unlike
# group_by(carrier, month) %>% summarise(n()).
carrier_flights_per_month <- flights %>%
  count(carrier, month, name = "count")

## `summarise()` has grouped output by 'carrier'. You can override using the
## `.groups` argument.

# Attach the monthly totals to the per-carrier counts and express each
# carrier's share of that month's flights as a percentage string,
# rounded to 3 decimals (e.g. "5.825%").
# ungroup() makes the result independent of any grouping on the input.
carrier_percentage <- carrier_flights_per_month %>%
  ungroup() %>%
  left_join(total_flights_per_month, by = "month") %>%
  mutate(percentage = paste0(round((count / total) * 100, 3), "%"))

# Reshape to wide format: one row per carrier, one column per month.
# pivot_wider() replaces the superseded spread().
#   names_sort  = TRUE  keeps the month columns in ascending order.
#   values_fill = "0%"  shows carrier/month pairs with no flights as a
#                       0% share instead of NA.
spread_data <- carrier_percentage %>%
  select(-count, -total) %>%
  pivot_wider(
    names_from = month,
    values_from = percentage,
    names_sort = TRUE,
    values_fill = "0%"
  )

print(spread_data)

## # A tibble: 16 × 13
## # Groups:   carrier [16]
##    carrier `1`     `2`     `3`   `4`   `5`   `6`   `7`   `8`   `9`   `10`  `11` 
##    <chr>   <chr>   <chr>   <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 9E      5.825%  5.847%  5.64… 5.33… 5.07… 5.08… 5.07… 4.96… 5.58… 5.79… 5.84…
##  2 AA      10.347% 10.088% 9.66… 9.60… 9.73… 9.76… 9.79… 9.73… 9.48% 9.39… 9.45…
##  3 AS      0.23%   0.224%  0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.19…
##  4 B6      16.394% 16.444% 16.5… 15.9… 15.8… 16.3… 16.9… 16.8… 15.5… 15.0… 15.7…
##  5 DL      13.665% 13.803% 14.5… 14.4… 14.1… 14.6… 14.4… 14.7… 14.0… 14.1… 14.1…
##  6 EV      15.446% 15.338% 16.3… 16.1% 16.7… 15.7… 15.7… 15.5… 17.1… 16.9… 16.3…
##  7 F9      0.218%  0.196%  0.19… 0.20… 0.20… 0.19… 0.19… 0.18… 0.21% 0.19… 0.22…
##  8 FL      1.215%  1.186%  1.09… 1.09… 1.12… 0.89… 0.89… 0.89… 0.92… 0.81… 0.74…
##  9 HA      0.115%  0.112%  0.10… 0.10… 0.10… 0.10… 0.10… 0.10… 0.09… 0.07… 0.09…
## 10 MQ      8.41%   8.192%  7.82… 7.80… 7.93… 7.71… 7.68… 7.71… 8%    7.71… 7.54%
## 11 OO      0.004%  <NA>    <NA>  <NA>  <NA>  0.00… <NA>  0.01… 0.07… <NA>  0.01…
## 12 UA      17.172% 17.418% 17.2… 17.8… 17.2… 17.6… 17.2… 17.4… 17.0… 17.5… 17.8…
## 13 US      5.932%  6.22%   5.96… 6.09… 6.19… 6.14… 6.07% 6.06… 6.15… 6.39% 6.23…
## 14 VX      1.17%   1.086%  1.05… 1.64… 1.72… 1.7%  1.66… 1.66… 1.64… 1.63… 1.65…
## 15 WN      3.688%  3.651%  3.46… 3.45… 3.49… 3.64% 3.65… 3.57% 3.66… 3.77… 3.78…
## 16 YV      0.17%   0.192%  0.06… 0.13… 0.17% 0.17… 0.27… 0.22… 0.15… 0.22… 0.18%
## # ℹ 1 more variable: `12` <chr>

#3
# Add `delay` as an alias of dep_delay. This rebinds `flights` to a copy that
# carries the extra column; it is kept so code relying on `flights$delay`
# continues to work.
flights <- flights %>%
  mutate(delay = dep_delay)

# For each month, keep the single flight with the largest departure delay.
# slice_max(..., with_ties = FALSE) returns exactly one row per month even
# when several flights share the maximum, and it does not need a separate
# max(..., na.rm = TRUE) comparison. ungroup() returns a plain tibble so the
# month grouping does not leak into later operations.
most_delayed_flights <- flights %>%
  group_by(month) %>%
  slice_max(delay, n = 1, with_ties = FALSE) %>%
  ungroup()

print(most_delayed_flights)

## # A tibble: 12 × 20
## # Groups:   month [12]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     2    10     2243            830       853      100           1106
##  3  2013     3    17     2321            810       911      135           1020
##  4  2013     4    10     1100           1900       960     1342           2211
##  5  2013     5     3     1133           2055       878     1250           2215
##  6  2013     6    15     1432           1935      1137     1607           2120
##  7  2013     7    22      845           1600      1005     1044           1815
##  8  2013     8     8     2334           1454       520      120           1710
##  9  2013     9    20     1139           1845      1014     1457           2210
## 10  2013    10    14     2042            900       702     2255           1127
## 11  2013    11     3      603           1645       798      829           1913
## 12  2013    12     5      756           1700       896     1058           2020
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, delay <dbl>

#4
# Read the survey responses.
# NOTE(review): this is an absolute, machine-specific path; the chunk will not
# run elsewhere unless the file is placed at the same location. Consider a
# project-relative path.
responses <- read.csv(
  "C:/Users/daavka/Desktop/multipleChoiceResponses1.csv"
)

# Count how often each usefulness answer was given for each learning platform.
# - Only the LearningPlatformUsefulness* columns are used.
# - pivot_longer() (replacing the superseded gather()) stacks them into
#   (learning_platform, usefulness) pairs; names_prefix strips the common
#   "LearningPlatformUsefulness" prefix from the platform names in the same
#   step.
# - Missing answers are dropped before counting.
# - count() returns an ungrouped tibble with the tally in column `count`.
# NOTE(review): the knitted output of this chunk shows very long comma-joined
# `usefulness` strings ("NA,NA,NA,...") that each occur once, which suggests
# some rows of the CSV are not being split into fields correctly - verify the
# file and how it is parsed.
usefulness_count <- responses %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness"
  ) %>%
  filter(!is.na(usefulness)) %>%
  count(learning_platform, usefulness, name = "count")

print(usefulness_count)

## # A tibble: 340 × 3
##    learning_platform usefulness                                            count
##    <chr>             <chr>                                                 <int>
##  1 Arxiv             "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,…     1
##  2 Arxiv             "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,…     1
##  3 Arxiv             "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,…     1
##  4 Arxiv             "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,…     1
##  5 Arxiv             "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,…     1
##  6 Arxiv             "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,…     1
##  7 Arxiv             "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,Somewhat …     1
##  8 Arxiv             "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,Very usef…     1
##  9 Arxiv             "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,Very usef…     1
## 10 Arxiv             "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,Very useful,…     1
## # ℹ 330 more rows

#5
# Restrict the survey to the LearningPlatformUsefulness* columns
selected_data <- select(responses, starts_with("LearningPlatformUsefulness"))

# Stack every column into (learning_platform, usefulness) pairs, discard
# missing answers, and strip the "LearningPlatformUsefulness" prefix from the
# platform names
long_data <- pivot_longer(
  selected_data,
  cols = everything(),
  names_to = "learning_platform",
  values_to = "usefulness"
)
long_data <- filter(long_data, !is.na(usefulness))
long_data <- mutate(
  long_data,
  learning_platform = sub("LearningPlatformUsefulness", "", learning_platform)
)

# Per platform: number of answers (count), number of answers other than
# "Not Useful" (tot), and the ratio of the two (perc_usefulness)
result <- summarise(
  group_by(long_data, learning_platform),
  count = n(),
  tot = sum(usefulness != "Not Useful"),
  perc_usefulness = tot / count
)

print(result)

## # A tibble: 18 × 4
##    learning_platform count   tot perc_usefulness
##    <chr>             <int> <int>           <dbl>
##  1 Arxiv              2603  2566           0.986
##  2 Blogs              4895  4850           0.991
##  3 College            3547  3448           0.972
##  4 Communities        1378  1362           0.988
##  5 Company            1218  1177           0.966
##  6 Conferences        2400  2284           0.952
##  7 Courses            6130  6084           0.992
##  8 Documentation      2540  2500           0.984
##  9 Friends            1822  1771           0.972
## 10 Kaggle             6675  6619           0.992
## 11 Newsletters        1329  1274           0.959
## 12 Podcasts           1456  1333           0.916
## 13 Projects           4945  4907           0.992
## 14 SO                 5761  5697           0.989
## 15 Textbook           4363  4294           0.984
## 16 TradeBook           597   588           0.985
## 17 Tutoring           1663  1631           0.981
## 18 YouTube            5392  5290           0.981

final

Dagvadorj

2024-01-02

R Markdown

Including Plots