library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)


1. Regular expressions a) Use the words data set to find all the words that match each of the following patterns:

# Words that are exactly four characters long: the regex is anchored at both
# ends, so the whole word must consist of four characters.
four_letters <- words %>%
  str_subset("^.{4}$")
print(four_letters)
##   [1] "able" "also" "area" "away" "baby" "back" "ball" "bank" "base" "bear"
##  [11] "beat" "best" "bill" "blow" "blue" "boat" "body" "book" "both" "busy"
##  [21] "cake" "call" "card" "care" "case" "cent" "chap" "city" "club" "cold"
##  [31] "come" "cook" "copy" "cost" "date" "dead" "deal" "dear" "deep" "door"
##  [41] "down" "draw" "drop" "each" "east" "easy" "else" "even" "ever" "face"
##  [51] "fact" "fair" "fall" "farm" "fast" "feed" "feel" "file" "fill" "film"
##  [61] "find" "fine" "fire" "fish" "five" "flat" "food" "foot" "form" "four"
##  [71] "free" "from" "full" "fund" "game" "girl" "give" "good" "grow" "hair"
##  [81] "half" "hall" "hand" "hang" "hard" "hate" "have" "head" "hear" "heat"
##  [91] "hell" "help" "here" "high" "hold" "home" "hope" "hour" "idea" "into"
## [101] "item" "join" "jump" "just" "keep" "kill" "kind" "king" "know" "lady"
## [111] "land" "last" "late" "lead" "left" "less" "life" "like" "line" "link"
## [121] "list" "live" "load" "lock" "long" "look" "lord" "lose" "love" "luck"
## [131] "main" "make" "many" "mark" "mean" "meet" "mile" "milk" "mind" "miss"
## [141] "more" "most" "move" "much" "must" "name" "near" "need" "news" "next"
## [151] "nice" "nine" "none" "note" "okay" "once" "only" "open" "over" "pack"
## [161] "page" "pair" "park" "part" "pass" "past" "pick" "plan" "play" "plus"
## [171] "poor" "post" "pull" "push" "quid" "rail" "rate" "read" "real" "rest"
## [181] "ring" "rise" "road" "role" "roll" "room" "rule" "safe" "sale" "same"
## [191] "save" "seat" "seem" "self" "sell" "send" "shoe" "shop" "show" "shut"
## [201] "sick" "side" "sign" "sing" "site" "size" "slow" "some" "soon" "sort"
## [211] "stay" "step" "stop" "such" "suit" "sure" "take" "talk" "tape" "team"
## [221] "tell" "tend" "term" "test" "than" "then" "they" "this" "thou" "time"
## [231] "town" "tree" "true" "turn" "type" "unit" "upon" "very" "view" "vote"
## [241] "wage" "wait" "walk" "wall" "want" "warm" "wash" "wear" "week" "well"
## [251] "west" "what" "when" "wide" "wife" "will" "wind" "wish" "with" "wood"
## [261] "word" "work" "year"


four or five letters long:

# Words whose full length is four or five characters (`{4,5}` between the
# start and end anchors).
four_or_five_letters <- words %>%
  str_subset("^.{4,5}$")
print(four_or_five_letters)
##   [1] "able"  "about" "admit" "after" "again" "agent" "agree" "allow" "along"
##  [10] "also"  "apart" "apply" "area"  "argue" "aware" "away"  "awful" "baby" 
##  [19] "back"  "ball"  "bank"  "base"  "basis" "bear"  "beat"  "begin" "best" 
##  [28] "bill"  "birth" "black" "bloke" "blood" "blow"  "blue"  "board" "boat" 
##  [37] "body"  "book"  "both"  "break" "brief" "bring" "build" "busy"  "cake" 
##  [46] "call"  "card"  "care"  "carry" "case"  "catch" "cause" "cent"  "chair"
##  [55] "chap"  "cheap" "check" "child" "city"  "claim" "class" "clean" "clear"
##  [64] "clock" "close" "club"  "cold"  "come"  "cook"  "copy"  "cost"  "could"
##  [73] "count" "court" "cover" "cross" "date"  "dead"  "deal"  "dear"  "deep" 
##  [82] "door"  "doubt" "down"  "draw"  "dress" "drink" "drive" "drop"  "each" 
##  [91] "early" "east"  "easy"  "eight" "elect" "else"  "enjoy" "enter" "equal"
## [100] "even"  "ever"  "every" "exact" "exist" "extra" "face"  "fact"  "fair" 
## [109] "fall"  "farm"  "fast"  "feed"  "feel"  "field" "fight" "file"  "fill" 
## [118] "film"  "final" "find"  "fine"  "fire"  "first" "fish"  "five"  "flat" 
## [127] "floor" "food"  "foot"  "force" "form"  "four"  "free"  "from"  "front"
## [136] "full"  "fund"  "game"  "girl"  "give"  "glass" "good"  "grand" "grant"
## [145] "great" "green" "group" "grow"  "guess" "hair"  "half"  "hall"  "hand" 
## [154] "hang"  "happy" "hard"  "hate"  "have"  "head"  "hear"  "heart" "heat" 
## [163] "heavy" "hell"  "help"  "here"  "high"  "hold"  "home"  "hope"  "horse"
## [172] "hour"  "house" "hullo" "idea"  "into"  "issue" "item"  "jesus" "join" 
## [181] "judge" "jump"  "just"  "keep"  "kill"  "kind"  "king"  "knock" "know" 
## [190] "lady"  "land"  "large" "last"  "late"  "laugh" "lead"  "learn" "leave"
## [199] "left"  "less"  "level" "life"  "light" "like"  "limit" "line"  "link" 
## [208] "list"  "live"  "load"  "local" "lock"  "long"  "look"  "lord"  "lose" 
## [217] "love"  "luck"  "lunch" "main"  "major" "make"  "many"  "mark"  "marry"
## [226] "match" "maybe" "mean"  "meet"  "might" "mile"  "milk"  "mind"  "minus"
## [235] "miss"  "money" "month" "more"  "most"  "move"  "much"  "music" "must" 
## [244] "name"  "near"  "need"  "never" "news"  "next"  "nice"  "night" "nine" 
## [253] "none"  "north" "note"  "offer" "often" "okay"  "once"  "only"  "open" 
## [262] "order" "other" "ought" "over"  "pack"  "page"  "paint" "pair"  "paper"
## [271] "park"  "part"  "party" "pass"  "past"  "pence" "pick"  "piece" "place"
## [280] "plan"  "play"  "plus"  "point" "poor"  "post"  "pound" "power" "press"
## [289] "price" "print" "pull"  "push"  "quick" "quid"  "quiet" "quite" "radio"
## [298] "rail"  "raise" "range" "rate"  "read"  "ready" "real"  "refer" "rest" 
## [307] "right" "ring"  "rise"  "road"  "role"  "roll"  "room"  "round" "rule" 
## [316] "safe"  "sale"  "same"  "save"  "score" "seat"  "seem"  "self"  "sell" 
## [325] "send"  "sense" "serve" "seven" "shall" "share" "sheet" "shoe"  "shoot"
## [334] "shop"  "short" "show"  "shut"  "sick"  "side"  "sign"  "since" "sing" 
## [343] "site"  "size"  "sleep" "slow"  "small" "smoke" "some"  "soon"  "sorry"
## [352] "sort"  "sound" "south" "space" "speak" "speed" "spell" "spend" "staff"
## [361] "stage" "stand" "start" "state" "stay"  "step"  "stick" "still" "stop" 
## [370] "story" "study" "stuff" "such"  "suit"  "sure"  "table" "take"  "talk" 
## [379] "tape"  "teach" "team"  "tell"  "tend"  "term"  "test"  "than"  "thank"
## [388] "then"  "there" "they"  "thing" "think" "this"  "thou"  "three" "throw"
## [397] "time"  "today" "total" "touch" "town"  "trade" "train" "treat" "tree" 
## [406] "true"  "trust" "turn"  "type"  "under" "union" "unit"  "unite" "until"
## [415] "upon"  "usual" "value" "very"  "video" "view"  "visit" "vote"  "wage" 
## [424] "wait"  "walk"  "wall"  "want"  "warm"  "wash"  "waste" "watch" "water"
## [433] "wear"  "week"  "weigh" "well"  "west"  "what"  "when"  "where" "which"
## [442] "while" "white" "whole" "wide"  "wife"  "will"  "wind"  "wish"  "with" 
## [451] "woman" "wood"  "word"  "work"  "world" "worry" "worse" "worth" "would"
## [460] "write" "wrong" "year"  "young"


the second letter is “s” or “t”:

# Words whose second character is "s" or "t": skip one character after the
# start anchor, then require a member of the class [st].
second_letter <- words %>%
  str_subset("^.[st]")
print(second_letter)
##  [1] "as"        "ask"       "associate" "assume"    "at"        "attend"   
##  [7] "especial"  "issue"     "it"        "item"      "other"     "otherwise"
## [13] "staff"     "stage"     "stairs"    "stand"     "standard"  "start"    
## [19] "state"     "station"   "stay"      "step"      "stick"     "still"    
## [25] "stop"      "story"     "straight"  "strategy"  "street"    "strike"   
## [31] "strong"    "structure" "student"   "study"     "stuff"     "stupid"   
## [37] "use"       "usual"


contains the pattern like “oxx” where “o” is one letter and “x” is another letter:

# Words containing an "oxx" shape: one character (group 1), a second character
# (group 2), and then group 2 repeated immediately via the `\\2` backreference.
oxx_pattern <- words %>%
  str_subset("(.)(.)\\2")
print(oxx_pattern)
##   [1] "accept"      "account"     "across"      "add"         "address"    
##   [6] "affect"      "afford"      "afternoon"   "agree"       "all"        
##  [11] "allow"       "apparent"    "appear"      "apply"       "appoint"    
##  [16] "approach"    "appropriate" "arrange"     "associate"   "assume"     
##  [21] "attend"      "ball"        "between"     "bill"        "blood"      
##  [26] "book"        "bottle"      "bottom"      "brilliant"   "business"   
##  [31] "call"        "carry"       "choose"      "class"       "coffee"     
##  [36] "colleague"   "collect"     "college"     "comment"     "commit"     
##  [41] "committee"   "common"      "community"   "cook"        "correct"    
##  [46] "cross"       "current"     "deep"        "degree"      "difference" 
##  [51] "difficult"   "dinner"      "discuss"     "door"        "dress"      
##  [56] "effect"      "egg"         "express"     "fall"        "feed"       
##  [61] "feel"        "fill"        "floor"       "follow"      "food"       
##  [66] "foot"        "free"        "full"        "glass"       "good"       
##  [71] "goodbye"     "green"       "guess"       "hall"        "happen"     
##  [76] "happy"       "hell"        "hullo"       "indeed"      "issue"      
##  [81] "keep"        "kill"        "less"        "letter"      "little"     
##  [86] "look"        "marry"       "matter"      "meet"        "middle"     
##  [91] "million"     "miss"        "necessary"   "need"        "occasion"   
##  [96] "odd"         "off"         "offer"       "office"      "opportunity"
## [101] "oppose"      "pass"        "poor"        "possible"    "press"      
## [106] "pressure"    "pretty"      "proceed"     "process"     "programme"  
## [111] "pull"        "really"      "recommend"   "roll"        "room"       
## [116] "school"      "see"         "seem"        "sell"        "settle"     
## [121] "shall"       "sheet"       "shoot"       "sleep"       "small"      
## [126] "soon"        "sorry"       "speed"       "spell"       "staff"      
## [131] "still"       "street"      "stuff"       "succeed"     "sudden"     
## [136] "suggest"     "summer"      "supply"      "support"     "suppose"    
## [141] "tell"        "terrible"    "thirteen"    "three"       "tomorrow"   
## [146] "too"         "traffic"     "tree"        "unless"      "village"    
## [151] "wall"        "wee"         "week"        "well"        "will"       
## [156] "wood"        "worry"


contains “a”, “e” and “o” at the same time:

# Words containing "a", "e" and "o" at the same time: narrow the vector one
# letter at a time, so only words matching all three patterns survive.
a_e_o <- words %>%
  str_subset("a") %>%
  str_subset("e") %>%
  str_subset("o")
print(a_e_o)
##  [1] "absolute"    "afternoon"   "another"     "appropriate" "associate"  
##  [6] "colleague"   "compare"     "encourage"   "operate"     "organize"   
## [11] "probable"    "programme"   "reason"      "relation"


b) Use the sentences data set, make the following plot
a bar plot counting sentences with and without “the” (or “The”):

# One row per sentence; `df` is reused by the scatterplot further down.
df <- tibble(sentence = sentences)

# Label each sentence by whether it contains the standalone word "the"/"The"
# (word boundaries on both sides keep "then", "other", etc. from matching).
df_plot1 <- mutate(
  df,
  has_the = ifelse(
    str_detect(sentence, "\\b[tT]he\\b"),
    "With 'the'",
    "Without 'the'"
  )
)

# Bar plot of the number of sentences in each group.
df_plot1 %>%
  ggplot(aes(x = has_the, fill = has_the)) +
  geom_bar()


a scatterplot with 𝑥 being the average length of words in a sentence, and 𝑦 being the number of words starting with “a” or “e” or “i” or “o” or “u” in the sentence.

# For each sentence, count the words that start with a vowel and compute the
# average word length, then plot one against the other.
df_plot2 <- df %>%
  mutate(
    # Case-insensitive: a word boundary, a vowel, then any further letters.
    vowel_start_words = str_count(sentence, "(?i)\\b[aeiou][a-z]*\\b"),
    # Apostrophes are allowed inside a word so that a contraction such as
    # "It's" counts as one word instead of two ("It" and "s"), which would
    # otherwise deflate the average word length.
    total_words = str_count(sentence, "\\b[A-Za-z']+\\b"),
    # Only letters contribute to the length; punctuation is ignored.
    total_letters = str_count(sentence, "[A-Za-z]"),
    avg_word_length = total_letters / total_words
  )

# Scatterplot: x = average word length, y = number of vowel-initial words.
ggplot(df_plot2, aes(x = avg_word_length, y = vowel_start_words)) +
  geom_point(alpha = 0.5, color = "blue")


c) Application

# Read the dictionary file line by line.
dict_lines <- read_lines("/Users/HoangDucVinh/Downloads/Oxford_English_Dictionary.txt")

# Drop empty lines and pull out the leading run of letters/hyphens on each
# line as the candidate word (NA when a line does not start with one).
dict_tibble <- tibble(line = dict_lines) %>%
  filter(line != "") %>%
  mutate(words = str_extract(line, "^[A-Za-z\\-]+"))

# Keep the words that contain every one of a, e, i, o, u and y, ignoring case.
# The word is lower-cased once into a helper column that is dropped again by
# the final select().
words_with_all_vowels_and_y <- dict_tibble %>%
  filter(!is.na(words)) %>%
  mutate(lowered = str_to_lower(words)) %>%
  filter(
    str_detect(lowered, "a") &
      str_detect(lowered, "e") &
      str_detect(lowered, "i") &
      str_detect(lowered, "o") &
      str_detect(lowered, "u") &
      str_detect(lowered, "y")
  ) %>%
  select(words)
print(words_with_all_vowels_and_y)
## # A tibble: 6 × 1
##   words          
##   <chr>          
## 1 Byelorussian   
## 2 Fully-fashioned
## 3 Immunotherapy  
## 4 Praseodymium   
## 5 Revolutionary  
## 6 Uncomplimentary


2. Factors
a) Use the BankChurners.csv to answer the following questions:
Which features can be regarded as a factor?

Attrition_Flag, Gender, Marital_Status, Education_Level, Income_Category, Card_Category


Which features can be regarded as an ordered factor (ordinal)?

Education_Level, Income_Category, Card_Category
# Load the bank churn data; readr guesses the column types and reports them.
bank_data <- read_csv(file = "/Users/HoangDucVinh/Downloads/BankChurners.csv")
## Rows: 10127 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): Attrition_Flag, Gender, Education_Level, Marital_Status, Income_Ca...
## dbl (14): Customer_Age, Dependent_count, Months_on_book, Total_Relationship_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert the categorical columns to factors.
# Nominal columns keep their levels in order of first appearance (as_factor);
# ordinal columns get an explicit level order, with "Unknown" placed last
# where it occurs.
bank_data <- bank_data %>%
  mutate(
    across(c(Attrition_Flag, Gender, Marital_Status), as_factor),
    Education_Level = ordered(
      Education_Level,
      levels = c(
        "Uneducated", "High School", "College",
        "Graduate", "Post-Graduate", "Doctorate", "Unknown"
      )
    ),
    Income_Category = ordered(
      Income_Category,
      levels = c(
        "Less than $40K", "$40K - $60K", "$60K - $80K",
        "$80K - $120K", "$120K +", "Unknown"
      )
    ),
    Card_Category = ordered(
      Card_Category,
      levels = c("Blue", "Silver", "Gold", "Platinum")
    )
  )

# Box plot of the average utilization ratio for each education level.
bank_data %>%
  ggplot(
    aes(
      x = Education_Level,
      y = Avg_Utilization_Ratio,
      fill = Education_Level
    )
  ) +
  geom_boxplot()


b) Use the gss_cat data set

    # Inspect the existing levels of `marital` before collapsing them.
    gss_cat %>% pull(marital) %>% levels()
## [1] "No answer"     "Never married" "Separated"     "Divorced"     
## [5] "Widowed"       "Married"
# Merge the "Separated", "Divorced" and "Widowed" levels into a single
# "Once Married" level.
gss_modified <- mutate(
  gss_cat,
  marital = fct_collapse(
    marital,
    "Once Married" = c("Separated", "Divorced", "Widowed")
  )
)

# Average TV hours per (collapsed) marital status, ignoring missing values.
gss_summary <- summarise(
  group_by(gss_modified, marital),
  avg_tvhours = mean(tvhours, na.rm = TRUE)
)

# Dot plot with the marital statuses ordered by their average TV hours.
gss_summary %>%
  ggplot(aes(x = avg_tvhours, y = fct_reorder(marital, avg_tvhours))) +
  geom_point(size = 4, color = "blue")


3. Date and Time - nycflights13 data set

# Count the distinct time zones among the destinations in `flights` that have
# a matching row in `airports` and a non-missing `tzone`.
inner_join(
  filter(flights, !is.na(dest)),
  airports,
  by = c("dest" = "faa")
) %>%
  filter(!is.na(tzone)) %>%
  summarise(unique_timezones = n_distinct(tzone))
## # A tibble: 1 × 1
##   unique_timezones
##              <int>
## 1                7


There are 7 time zones.

# A sample of destination airports to compare against New York.
different_airports <- c("ORD", "DFW", "DEN", "SEA", "ANC", "HNL")

# Fixed hour offset used for New York.
nyc_tz <- -5

# For each sampled airport, show its `tz` offset and the difference in hours
# relative to New York.
filter(airports, faa %in% different_airports) %>%
  mutate(time_diff_hours = tz - nyc_tz) %>%
  select(faa, name, tz, time_diff_hours)
## # A tibble: 6 × 4
##   faa   name                          tz time_diff_hours
##   <chr> <chr>                      <dbl>           <dbl>
## 1 ANC   Ted Stevens Anchorage Intl    -9              -4
## 2 DEN   Denver Intl                   -7              -2
## 3 DFW   Dallas Fort Worth Intl        -6              -1
## 4 HNL   Honolulu Intl                -10              -5
## 5 ORD   Chicago Ohare Intl            -6              -1
## 6 SEA   Seattle Tacoma Intl           -8              -3
# Hour offset of a destination airport relative to New York.
#
# Looks up the `tz` column of `airports` for the row(s) whose `faa` code equals
# `destination` and subtracts the fixed New York offset of -5.
# NOTE(review): the offset is a constant, so this presumably ignores daylight
# saving time -- confirm that is acceptable.
#
# destination: FAA code of the destination airport.
# Returns: `tz` of the matching airport minus -5; empty if no airport matches.
time_difference_NYC <- function(destination) {
  nyc_offset <- -5
  matching_airport <- filter(airports, faa == destination)
  matching_airport$tz - nyc_offset
}
# Actual flight duration in minutes, with the arrival time converted to
# New York time.
#
# dep_time: departure time as an HHMM number (e.g. 930 for 09:30), local to
#           New York.
# arr_time: arrival time as an HHMM number, local to the destination.
# origin:   origin airport code (currently unused; kept for the interface).
# dest:     FAA code of the destination airport, passed to
#           time_difference_NYC() to obtain the hour offset from New York.
# Returns:  flight time in minutes. If the converted arrival time is earlier
#           than the departure time, the flight is assumed to have landed on
#           the next day and 24 hours are added.
flight_time <- function(dep_time, arr_time, origin, dest) {
  # Convert an HHMM number into minutes since midnight.
  to_minutes <- function(hhmm) {
    (hhmm %/% 100) * 60 + (hhmm %% 100)
  }

  dep_min <- to_minutes(dep_time)
  arr_min <- to_minutes(arr_time)

  # R is case-sensitive: the helper is `time_difference_NYC`, not
  # `Time_difference_NYC`.
  tz_diff_hours <- time_difference_NYC(dest)
  if (length(tz_diff_hours) != 1L) {
    stop("Could not determine a unique time zone offset for destination: ",
         dest, call. = FALSE)
  }

  # Express the arrival time on the New York clock.
  arr_min_nyc <- arr_min - tz_diff_hours * 60

  # ifelse() instead of a scalar `if` so that vectors of times (and NA times)
  # are handled element-wise rather than raising an error.
  arr_min_nyc <- ifelse(
    arr_min_nyc < dep_min,
    arr_min_nyc + 24 * 60,
    arr_min_nyc
  )

  arr_min_nyc - dep_min
}