R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Per-column descriptive statistics for the built-in `cars` data set.
summary(object = cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Install and load the nycflights13 package

# Install nycflights13 only when it is missing, so re-running the document
# does not reinstall the package (and need a CRAN mirror) on every run.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}

# Attach the data package and dplyr: the analysis below relies on `flights`,
# `%>%` and the dplyr verbs, and nothing in the visible part of this document
# attaches them.
library(nycflights13)
library(dplyr)

Load the flights dataset from nycflights13

# Load `flights` straight from nycflights13 so this works whether or not the
# package has been attached with library().
data("flights", package = "nycflights13")

# Fail early, with a clear error, if the dataset did not load as a non-empty
# data frame.
stopifnot(is.data.frame(flights), nrow(flights) > 0)

Question 1.1: How many flights arrived late each month?

# Count, per month, the flights whose arrival delay exceeded 5
# (presumably minutes -- confirm against the nycflights13 documentation).
# Rows with a missing arr_delay are dropped by filter(), because the
# comparison evaluates to NA for them.
late_flights <- flights %>%
  filter(arr_delay > 5) %>%
  group_by(month) %>%
  summarize(lateflights = n(), .groups = "drop")

print(late_flights)

Question 1.2: What percentage of traffic did each carrier represent, by month?

# Share of each month's flights operated by each carrier, in percent.
# Step 1 counts flights per (month, carrier); step 2 divides each count by
# that month's total. The result is ungrouped so later verbs are unaffected.
traffic_percentage <- flights %>%
  group_by(month, carrier) %>%
  summarize(total_flights = n(), .groups = "drop") %>%
  group_by(month) %>%
  mutate(traffic_percent = (total_flights / sum(total_flights)) * 100) %>%
  ungroup()

print(traffic_percentage)

Question 1.3: What was the latest flight to depart each month?

# For every month, keep the flight(s) with the largest recorded dep_time.
# Ties are all kept.
#
# Filtering first avoids building a helper column with if_else(), which in
# older dplyr releases refuses to mix an integer column with NA_real_.
# Departures with a missing dep_time, or one of 2400 or more, are excluded,
# exactly as before.
# NOTE(review): dep_time looks like HHMM local time, so 2400 presumably
# encodes midnight -- confirm that excluding it is intended.
latest_flights <- flights %>%
  filter(dep_time < 2400) %>%
  group_by(month) %>%
  filter(dep_time == max(dep_time)) %>%
  ungroup() %>%
  select(
    year, month, day, dep_time, sched_dep_time, dep_delay,
    arr_time, sched_arr_time, carrier
  )

print(latest_flights)

Load dataset

# Read the survey responses into a data frame.
# NOTE(review): this absolute path is hard-coded -- confirm it exists wherever
# the document is run, or switch to a project-relative path.
responses <- read.csv("/mnt/data/multipleChoiceResponses1.csv")

Question 2.1: Count the usefulness by learning platform

# pivot_longer() and drop_na() come from tidyr, which nothing visible in this
# document attaches.
library(tidyr)

# Reshape the LearningPlatformUsefulness* columns to one row per
# (respondent, platform) and count each usefulness answer per platform.
# `names_prefix` strips the shared column-name prefix, leaving the platform.
usefulness_count <- responses %>%
  pivot_longer(
    cols = starts_with("LearningPlatformUsefulness"),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness"
  ) %>%
  drop_na(usefulness) %>%
  # read.csv() keeps blank cells as "" rather than NA, so drop_na() alone
  # would let unanswered questions through as their own usefulness level.
  filter(usefulness != "") %>%
  count(learning_platform, usefulness)

print(usefulness_count)

Question 2.2: Compute the total responses and responses which are at least useful

# Per learning platform: total number of answers (`tot`), number of answers
# other than "Not Useful" (`count`), and their ratio (`perc_usefulness`,
# a proportion between 0 and 1).
usefulness_summary <- usefulness_count %>%
  group_by(learning_platform) %>%
  summarize(
    tot = sum(n),
    count = sum(ifelse(usefulness != "Not Useful", n, 0)),
    perc_usefulness = count / tot
  )

print(usefulness_summary)

# Load twitter data
# (This call previously sat behind the `#` on the print() line, so it never
# ran and `twitter_data` was undefined for the steps that follow.)
twitter_data <- readRDS("/mnt/data/twitter_data.rds")

Question 3.1: Group by complaint and compute mean, min, and max follower counts

# Follower-count statistics per complaint label: mean, smallest and largest
# value, ignoring missing follower counts.
followers_stats <- twitter_data %>%
  group_by(complaint_label) %>%
  summarize(
    avg_followers = mean(followers_count, na.rm = TRUE),
    min_followers = min(followers_count, na.rm = TRUE),
    max_followers = max(followers_count, na.rm = TRUE)
  )

print(followers_stats)

Question 3.2: Tokenize twitter data and count word frequencies

# unnest_tokens() is provided by tidytext.
library(tidytext)

# Split the `text` column into one token per row (column `word`), then tally
# how often each token occurs, most frequent first.
word_counts <- twitter_data %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)

print(word_counts)

Final Project

MATTHEW RICHARD PITANA

2024-12-31