final exam

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(nycflights13)

## Warning: package 'nycflights13' was built under R version 4.3.2

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.2

## Warning: package 'ggplot2' was built under R version 4.3.2

## Warning: package 'dplyr' was built under R version 4.3.2

## Warning: package 'forcats' was built under R version 4.3.2

## Warning: package 'lubridate' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
# Count, per month, the flights that arrived more than 5 minutes late.
# Flights with a missing arrival delay are skipped by `na.rm = TRUE`.
late_flights_by_month <- flights %>%
  group_by(month) %>%
  summarise(lateflights = sum(arr_delay > 5, na.rm = TRUE))

late_flights_by_month

## # A tibble: 12 × 2
##    month lateflights
##    <int>       <int>
##  1     1        8988
##  2     2        8119
##  3     3        9033
##  4     4       10544
##  5     5        8490
##  6     6       10739
##  7     7       11518
##  8     8        9649
##  9     9        5347
## 10    10        7628
## 11    11        7485
## 12    12       12291

# Share (in percent) of each month's flights operated by each carrier,
# reshaped to one row per carrier and one column per month.
traffic_percentage <- flights %>%
  count(month, carrier, name = "n_flights") %>%
  group_by(month) %>%
  # Same operation order as n / total * 100, so results match exactly.
  mutate(percentage = n_flights / sum(n_flights) * 100) %>%
  ungroup() %>%
  select(-n_flights) %>%
  pivot_wider(
    names_from = month,
    values_from = percentage,
    names_prefix = ""
  )

traffic_percentage %>%
  head(6) %>%
  print()

## # A tibble: 6 × 13
##   carrier    `1`    `2`    `3`    `4`    `5`    `6`    `7`    `8`    `9`   `10`
##   <chr>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 9E       5.83   5.85   5.64   5.33   5.08   5.09   5.08   4.96   5.58   5.79 
## 2 AA      10.3   10.1    9.67   9.61   9.73   9.76   9.79   9.74   9.48   9.40 
## 3 AS       0.230  0.224  0.215  0.212  0.215  0.212  0.211  0.211  0.218  0.215
## 4 B6      16.4   16.4   16.5   15.9   15.9   16.4   16.9   16.9   15.6   15.1  
## 5 DL      13.7   13.8   14.5   14.4   14.2   14.6   14.4   14.7   14.1   14.2  
## 6 EV      15.4   15.3   16.4   16.1   16.7   15.8   15.8   15.6   17.1   17.0  
## # ℹ 2 more variables: `11` <dbl>, `12` <dbl>

# For every month, keep the flight(s) with the largest departure delay.
# Ties within a month are all kept; rows with a missing `dep_delay` are
# dropped because `filter()` discards rows whose condition is NA.
latest_dep <- flights %>%
  group_by(month) %>%
  filter(dep_delay == max(dep_delay, na.rm = TRUE)) %>%
  # Drop the grouping so later verbs do not silently operate per month.
  ungroup() %>%
  # Plain ascending sort (the original `-desc(month)` was a double negation).
  arrange(month)

latest_dep

## # A tibble: 12 × 19
## # Groups:   month [12]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     2    10     2243            830       853      100           1106
##  3  2013     3    17     2321            810       911      135           1020
##  4  2013     4    10     1100           1900       960     1342           2211
##  5  2013     5     3     1133           2055       878     1250           2215
##  6  2013     6    15     1432           1935      1137     1607           2120
##  7  2013     7    22      845           1600      1005     1044           1815
##  8  2013     8     8     2334           1454       520      120           1710
##  9  2013     9    20     1139           1845      1014     1457           2210
## 10  2013    10    14     2042            900       702     2255           1127
## 11  2013    11     3      603           1645       798      829           1913
## 12  2013    12     5      756           1700       896     1058           2020
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

library(tidyverse)
library(lubridate)
library(dplyr)
library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(ggplot2)
library(tidyquant)

## Warning: package 'tidyquant' was built under R version 4.3.2

## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(zoo)
# Load the survey responses.
# NOTE(review): this is a machine-specific absolute path; the document only
# knits on a machine where this file exists at this location.
blah <- read.csv("C:/Users/Badamkhand/Downloads/multipleChoiceResponses1.csv")

# Number of responses per learning platform and usefulness rating.
# The "LearningPlatformUsefulness" prefix is stripped from the column names to
# get the platform, and unanswered (NA) ratings are dropped. `count()` returns
# an ungrouped result already sorted by platform and rating, so no stray
# grouping (or `.groups` message) is left behind.
usefulness_count <- blah %>%
  pivot_longer(
    cols = starts_with("LearningPlatformUsefulness"),
    names_to = "learning_platform",
    values_to = "usefulness",
    names_prefix = "LearningPlatformUsefulness",
    values_drop_na = TRUE
  ) %>%
  count(learning_platform, usefulness, name = "n")

## `summarise()` has grouped output by 'learning_platform'. You can override using
## the `.groups` argument.

# Per learning platform:
#   tot             - responses rating the platform anything but "Not Useful"
#   count           - all non-missing responses for the platform
#   perc_usefulness - tot / count, as a fraction between 0 and 1
#
# Both counts come from one grouped summary. A platform with no
# at-least-useful responses therefore gets tot = 0; the previous
# join-and-fallback version set tot to `count` (100%) in that case.
atleastuseful <- blah %>%
  pivot_longer(
    cols = starts_with("LearningPlatformUsefulness"),
    names_to = "learning_platform",
    values_to = "usefulness",
    names_prefix = "LearningPlatformUsefulness",
    values_drop_na = TRUE
  ) %>%
  group_by(learning_platform) %>%
  summarise(
    tot = sum(usefulness != "Not Useful"),
    count = n(),
    perc_usefulness = tot / count
  ) %>%
  # Return a plain data.frame, as the join-based version did.
  as.data.frame()

atleastuseful

##    learning_platform  tot count perc_usefulness
## 1              Arxiv 2354  2391       0.9845253
## 2              Blogs 4720  4765       0.9905561
## 3            College 3258  3359       0.9699315
## 4        Communities 1126  1142       0.9859895
## 5            Company  940   981       0.9582059
## 6        Conferences 2063  2182       0.9454629
## 7            Courses 5945  5992       0.9921562
## 8      Documentation 2279  2321       0.9819044
## 9            Friends 1530  1581       0.9677419
## 10            Kaggle 6527  6583       0.9914932
## 11       Newsletters 1033  1089       0.9485767
## 12          Podcasts 1090  1214       0.8978583
## 13          Projects 4755  4794       0.9918648
## 14                SO 5576  5640       0.9886525
## 15          Textbook 4112  4181       0.9834968
## 16         TradeBook  324   333       0.9729730
## 17          Tutoring 1394  1426       0.9775596
## 18           YouTube 5125  5229       0.9801109

# Lollipop chart of the share of respondents who found each learning platform
# at least somewhat useful, with platforms ordered by decreasing share.
atleastuseful %>%
  mutate(
    learning_platform = fct_reorder(
      learning_platform,
      perc_usefulness,
      .desc = TRUE
    ),
    perc_usefulness = as.numeric(perc_usefulness)
  ) %>%
  ggplot(aes(x = perc_usefulness, y = learning_platform)) +
  # Stem of each lollipop: from 0 to the platform's share.
  geom_segment(
    aes(x = 0, xend = perc_usefulness, yend = learning_platform),
    color = "black"
  ) +
  # Head of each lollipop.
  geom_point(color = "black", size = 3) +
  scale_x_continuous(labels = scales::percent_format()) +
  # Flip so the platforms run along the horizontal axis.
  coord_flip() +
  labs(
    title = "Percentage of Usefulness by Learning Platform",
    x = "Percent findings at least somewhat useful",
    y = "Learning platform"
  )

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

final exam

erdembileg/112035150

2024-01-02

R Markdown