library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)
1. Regular expressions
a) Using the words data set, find all the words that match each of the following patterns:
exactly four letters long:
# Words made of exactly four characters (anchored at both ends).
four_letters <- words %>%
  str_subset("^.{4}$")
four_letters
## [1] "able" "also" "area" "away" "baby" "back" "ball" "bank" "base" "bear"
## [11] "beat" "best" "bill" "blow" "blue" "boat" "body" "book" "both" "busy"
## [21] "cake" "call" "card" "care" "case" "cent" "chap" "city" "club" "cold"
## [31] "come" "cook" "copy" "cost" "date" "dead" "deal" "dear" "deep" "door"
## [41] "down" "draw" "drop" "each" "east" "easy" "else" "even" "ever" "face"
## [51] "fact" "fair" "fall" "farm" "fast" "feed" "feel" "file" "fill" "film"
## [61] "find" "fine" "fire" "fish" "five" "flat" "food" "foot" "form" "four"
## [71] "free" "from" "full" "fund" "game" "girl" "give" "good" "grow" "hair"
## [81] "half" "hall" "hand" "hang" "hard" "hate" "have" "head" "hear" "heat"
## [91] "hell" "help" "here" "high" "hold" "home" "hope" "hour" "idea" "into"
## [101] "item" "join" "jump" "just" "keep" "kill" "kind" "king" "know" "lady"
## [111] "land" "last" "late" "lead" "left" "less" "life" "like" "line" "link"
## [121] "list" "live" "load" "lock" "long" "look" "lord" "lose" "love" "luck"
## [131] "main" "make" "many" "mark" "mean" "meet" "mile" "milk" "mind" "miss"
## [141] "more" "most" "move" "much" "must" "name" "near" "need" "news" "next"
## [151] "nice" "nine" "none" "note" "okay" "once" "only" "open" "over" "pack"
## [161] "page" "pair" "park" "part" "pass" "past" "pick" "plan" "play" "plus"
## [171] "poor" "post" "pull" "push" "quid" "rail" "rate" "read" "real" "rest"
## [181] "ring" "rise" "road" "role" "roll" "room" "rule" "safe" "sale" "same"
## [191] "save" "seat" "seem" "self" "sell" "send" "shoe" "shop" "show" "shut"
## [201] "sick" "side" "sign" "sing" "site" "size" "slow" "some" "soon" "sort"
## [211] "stay" "step" "stop" "such" "suit" "sure" "take" "talk" "tape" "team"
## [221] "tell" "tend" "term" "test" "than" "then" "they" "this" "thou" "time"
## [231] "town" "tree" "true" "turn" "type" "unit" "upon" "very" "view" "vote"
## [241] "wage" "wait" "walk" "wall" "want" "warm" "wash" "wear" "week" "well"
## [251] "west" "what" "when" "wide" "wife" "will" "wind" "wish" "with" "wood"
## [261] "word" "work" "year"
four or five letters long:
# Words whose total length is four or five characters.
four_or_five_letters <- str_subset(
  string = words,
  pattern = "^.{4,5}$"
)
four_or_five_letters
## [1] "able" "about" "admit" "after" "again" "agent" "agree" "allow" "along"
## [10] "also" "apart" "apply" "area" "argue" "aware" "away" "awful" "baby"
## [19] "back" "ball" "bank" "base" "basis" "bear" "beat" "begin" "best"
## [28] "bill" "birth" "black" "bloke" "blood" "blow" "blue" "board" "boat"
## [37] "body" "book" "both" "break" "brief" "bring" "build" "busy" "cake"
## [46] "call" "card" "care" "carry" "case" "catch" "cause" "cent" "chair"
## [55] "chap" "cheap" "check" "child" "city" "claim" "class" "clean" "clear"
## [64] "clock" "close" "club" "cold" "come" "cook" "copy" "cost" "could"
## [73] "count" "court" "cover" "cross" "date" "dead" "deal" "dear" "deep"
## [82] "door" "doubt" "down" "draw" "dress" "drink" "drive" "drop" "each"
## [91] "early" "east" "easy" "eight" "elect" "else" "enjoy" "enter" "equal"
## [100] "even" "ever" "every" "exact" "exist" "extra" "face" "fact" "fair"
## [109] "fall" "farm" "fast" "feed" "feel" "field" "fight" "file" "fill"
## [118] "film" "final" "find" "fine" "fire" "first" "fish" "five" "flat"
## [127] "floor" "food" "foot" "force" "form" "four" "free" "from" "front"
## [136] "full" "fund" "game" "girl" "give" "glass" "good" "grand" "grant"
## [145] "great" "green" "group" "grow" "guess" "hair" "half" "hall" "hand"
## [154] "hang" "happy" "hard" "hate" "have" "head" "hear" "heart" "heat"
## [163] "heavy" "hell" "help" "here" "high" "hold" "home" "hope" "horse"
## [172] "hour" "house" "hullo" "idea" "into" "issue" "item" "jesus" "join"
## [181] "judge" "jump" "just" "keep" "kill" "kind" "king" "knock" "know"
## [190] "lady" "land" "large" "last" "late" "laugh" "lead" "learn" "leave"
## [199] "left" "less" "level" "life" "light" "like" "limit" "line" "link"
## [208] "list" "live" "load" "local" "lock" "long" "look" "lord" "lose"
## [217] "love" "luck" "lunch" "main" "major" "make" "many" "mark" "marry"
## [226] "match" "maybe" "mean" "meet" "might" "mile" "milk" "mind" "minus"
## [235] "miss" "money" "month" "more" "most" "move" "much" "music" "must"
## [244] "name" "near" "need" "never" "news" "next" "nice" "night" "nine"
## [253] "none" "north" "note" "offer" "often" "okay" "once" "only" "open"
## [262] "order" "other" "ought" "over" "pack" "page" "paint" "pair" "paper"
## [271] "park" "part" "party" "pass" "past" "pence" "pick" "piece" "place"
## [280] "plan" "play" "plus" "point" "poor" "post" "pound" "power" "press"
## [289] "price" "print" "pull" "push" "quick" "quid" "quiet" "quite" "radio"
## [298] "rail" "raise" "range" "rate" "read" "ready" "real" "refer" "rest"
## [307] "right" "ring" "rise" "road" "role" "roll" "room" "round" "rule"
## [316] "safe" "sale" "same" "save" "score" "seat" "seem" "self" "sell"
## [325] "send" "sense" "serve" "seven" "shall" "share" "sheet" "shoe" "shoot"
## [334] "shop" "short" "show" "shut" "sick" "side" "sign" "since" "sing"
## [343] "site" "size" "sleep" "slow" "small" "smoke" "some" "soon" "sorry"
## [352] "sort" "sound" "south" "space" "speak" "speed" "spell" "spend" "staff"
## [361] "stage" "stand" "start" "state" "stay" "step" "stick" "still" "stop"
## [370] "story" "study" "stuff" "such" "suit" "sure" "table" "take" "talk"
## [379] "tape" "teach" "team" "tell" "tend" "term" "test" "than" "thank"
## [388] "then" "there" "they" "thing" "think" "this" "thou" "three" "throw"
## [397] "time" "today" "total" "touch" "town" "trade" "train" "treat" "tree"
## [406] "true" "trust" "turn" "type" "under" "union" "unit" "unite" "until"
## [415] "upon" "usual" "value" "very" "video" "view" "visit" "vote" "wage"
## [424] "wait" "walk" "wall" "want" "warm" "wash" "waste" "watch" "water"
## [433] "wear" "week" "weigh" "well" "west" "what" "when" "where" "which"
## [442] "while" "white" "whole" "wide" "wife" "will" "wind" "wish" "with"
## [451] "woman" "wood" "word" "work" "world" "worry" "worse" "worth" "would"
## [460] "write" "wrong" "year" "young"
the second letter is “s” or “t”:
# Words whose second character is "s" or "t" (any first character).
second_letter <- words %>%
  str_subset("^.[st]")
second_letter
## [1] "as" "ask" "associate" "assume" "at" "attend"
## [7] "especial" "issue" "it" "item" "other" "otherwise"
## [13] "staff" "stage" "stairs" "stand" "standard" "start"
## [19] "state" "station" "stay" "step" "stick" "still"
## [25] "stop" "story" "straight" "strategy" "street" "strike"
## [31] "strong" "structure" "student" "study" "stuff" "stupid"
## [37] "use" "usual"
contains a pattern like “oxx”, where “o” is one letter and “x” is a
different letter that appears twice in a row:
# Words containing an "oxx" pattern: one character followed by a doubled,
# *different* character. The negative lookahead (?!\\1) rejects the case
# where the second character equals the first (e.g. "aaa"), which the plain
# "(.)(.)\\2" pattern would accept even though "o" and "x" must differ.
oxx_pattern <- str_subset(words, "(.)(?!\\1)(.)\\2")
oxx_pattern
## [1] "accept" "account" "across" "add" "address"
## [6] "affect" "afford" "afternoon" "agree" "all"
## [11] "allow" "apparent" "appear" "apply" "appoint"
## [16] "approach" "appropriate" "arrange" "associate" "assume"
## [21] "attend" "ball" "between" "bill" "blood"
## [26] "book" "bottle" "bottom" "brilliant" "business"
## [31] "call" "carry" "choose" "class" "coffee"
## [36] "colleague" "collect" "college" "comment" "commit"
## [41] "committee" "common" "community" "cook" "correct"
## [46] "cross" "current" "deep" "degree" "difference"
## [51] "difficult" "dinner" "discuss" "door" "dress"
## [56] "effect" "egg" "express" "fall" "feed"
## [61] "feel" "fill" "floor" "follow" "food"
## [66] "foot" "free" "full" "glass" "good"
## [71] "goodbye" "green" "guess" "hall" "happen"
## [76] "happy" "hell" "hullo" "indeed" "issue"
## [81] "keep" "kill" "less" "letter" "little"
## [86] "look" "marry" "matter" "meet" "middle"
## [91] "million" "miss" "necessary" "need" "occasion"
## [96] "odd" "off" "offer" "office" "opportunity"
## [101] "oppose" "pass" "poor" "possible" "press"
## [106] "pressure" "pretty" "proceed" "process" "programme"
## [111] "pull" "really" "recommend" "roll" "room"
## [116] "school" "see" "seem" "sell" "settle"
## [121] "shall" "sheet" "shoot" "sleep" "small"
## [126] "soon" "sorry" "speed" "spell" "staff"
## [131] "still" "street" "stuff" "succeed" "sudden"
## [136] "suggest" "summer" "supply" "support" "suppose"
## [141] "tell" "terrible" "thirteen" "three" "tomorrow"
## [146] "too" "traffic" "tree" "unless" "village"
## [151] "wall" "wee" "week" "well" "will"
## [156] "wood" "worry"
contains “a”, “e” and “o” at the same time:
# Words containing "a", "e" and "o" all at once: narrow the list one letter
# at a time, which keeps the original word order.
a_e_o <- words %>%
  str_subset("a") %>%
  str_subset("e") %>%
  str_subset("o")
a_e_o
## [1] "absolute" "afternoon" "another" "appropriate" "associate"
## [6] "colleague" "compare" "encourage" "operate" "organize"
## [11] "probable" "programme" "reason" "relation"
b) Using the sentences data set, make the following plots:
a bar plot counting sentences with and without “the” (or
“The”):
# One row per sentence.
df <- tibble(sentence = sentences)

# Label every sentence by whether it contains the standalone word
# "the" / "The" (word boundaries keep "then", "other", ... from matching).
df_plot1 <- mutate(
  df,
  has_the = if_else(
    str_detect(sentence, "\\b[tT]he\\b"),
    "With 'the'",
    "Without 'the'"
  )
)

# Bar chart of the two groups.
ggplot(data = df_plot1, mapping = aes(x = has_the, fill = has_the)) +
  geom_bar()
a scatterplot with 𝑥 being the average length of words in a
sentence, and 𝑦 being the number of words starting with “a” or “e” or
“i” or “o” or “u” in the sentence.
# Per-sentence features for the scatterplot:
#   vowel_start_words - number of words beginning with a, e, i, o or u
#   avg_word_length   - letters per word
df_plot2 <- df %>%
  mutate(
    # (?i) makes the vowel match case-insensitive, so "An" and "an" both count.
    vowel_start_words = str_count(sentence, "(?i)\\b[aeiou][a-z]*\\b"),
    # Treat a contraction such as "It's" as ONE word. The plain
    # "\\b[A-Za-z]+\\b" pattern splits it at the apostrophe ("It" + "s"),
    # which inflates the word count and deflates the average word length.
    total_words = str_count(sentence, "\\b[A-Za-z]+(?:'[A-Za-z]+)*\\b"),
    # Only letters contribute to word length (apostrophes are not counted).
    total_letters = str_count(sentence, "[A-Za-z]"),
    avg_word_length = total_letters / total_words
  )

ggplot(df_plot2, aes(x = avg_word_length, y = vowel_start_words)) +
  geom_point(alpha = 0.5, color = "blue")
c) Application
# Read the dictionary as raw lines.
dict_lines <- read_lines("/Users/HoangDucVinh/Downloads/Oxford_English_Dictionary.txt")

# Drop blank lines, then take the leading run of letters/hyphens on each
# line as the headword (NA when a line does not start with one).
dict_tibble <- tibble(line = dict_lines) %>%
  filter(line != "") %>%
  mutate(words = str_extract(line, "^[A-Za-z\\-]+"))

# Headwords that contain every one of a, e, i, o, u and y.
# The word is lower-cased once so capitalised entries are matched too.
words_with_all_vowels_and_y <- dict_tibble %>%
  filter(!is.na(words)) %>%
  mutate(lowered = str_to_lower(words)) %>%
  filter(
    str_detect(lowered, "a") &
      str_detect(lowered, "e") &
      str_detect(lowered, "i") &
      str_detect(lowered, "o") &
      str_detect(lowered, "u") &
      str_detect(lowered, "y")
  ) %>%
  select(words)

print(words_with_all_vowels_and_y)
## # A tibble: 6 × 1
## words
## <chr>
## 1 Byelorussian
## 2 Fully-fashioned
## 3 Immunotherapy
## 4 Praseodymium
## 5 Revolutionary
## 6 Uncomplimentary
2. Factors
a) Use the BankChurners.csv to answer
the following questions:
Which features can be regarded as a
factor?
Attrition_Flag, Gender, Marital_Status, Education_Level, Income_Category, Card_Category
Which features can be regarded as an ordered factor
(ordinal)?
Education_Level, Income_Category, Card_Category
# Load the bank churn data; column types are guessed by readr.
bank_data <- read_csv(
  file = "/Users/HoangDucVinh/Downloads/BankChurners.csv"
)
## Rows: 10127 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Attrition_Flag, Gender, Education_Level, Marital_Status, Income_Ca...
## dbl (14): Customer_Age, Dependent_count, Months_on_book, Total_Relationship_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert the categorical columns to factors. Columns with a natural ranking
# become ordered factors with explicitly listed levels ("Unknown" is placed
# last where it occurs).
bank_data <- bank_data %>%
  mutate(
    # Nominal columns: levels follow order of first appearance.
    across(c(Attrition_Flag, Gender, Marital_Status), as_factor),
    Education_Level = ordered(
      Education_Level,
      levels = c(
        "Uneducated", "High School", "College",
        "Graduate", "Post-Graduate", "Doctorate", "Unknown"
      )
    ),
    Income_Category = ordered(
      Income_Category,
      levels = c(
        "Less than $40K", "$40K - $60K", "$60K - $80K",
        "$80K - $120K", "$120K +", "Unknown"
      )
    ),
    Card_Category = ordered(
      Card_Category,
      levels = c("Blue", "Silver", "Gold", "Platinum")
    )
  )

# Boxplot of Avg_Utilization_Ratio for each education level.
ggplot(
  data = bank_data,
  mapping = aes(
    x = Education_Level,
    y = Avg_Utilization_Ratio,
    fill = Education_Level
  )
) +
  geom_boxplot()
b) Use the gss_cat data set
# Inspect the existing marital-status levels before collapsing them.
gss_cat %>%
  pull(marital) %>%
  levels()
## [1] "No answer" "Never married" "Separated" "Divorced"
## [5] "Widowed" "Married"
# Merge the three "previously married" statuses into a single level.
gss_modified <- mutate(
  gss_cat,
  marital = fct_collapse(
    marital,
    "Once Married" = c("Separated", "Divorced", "Widowed")
  )
)

# Mean TV hours per marital status, ignoring missing values.
gss_summary <- summarise(
  group_by(gss_modified, marital),
  avg_tvhours = mean(tvhours, na.rm = TRUE)
)

# Dot plot with the statuses ordered by their mean TV hours.
ggplot(
  data = gss_summary,
  mapping = aes(x = avg_tvhours, y = fct_reorder(marital, avg_tvhours))
) +
  geom_point(size = 4, color = "blue")
3. Date and Time - nycflights13 data set
# Count the distinct time zones among destinations that can be matched to
# a row of `airports` (the inner join drops unmatched destinations).
flights %>%
  drop_na(dest) %>%
  inner_join(airports, by = join_by(dest == faa)) %>%
  drop_na(tzone) %>%
  summarise(unique_timezones = n_distinct(tzone))
## # A tibble: 1 × 1
## unique_timezones
## <int>
## 1 7
There are 7 distinct time zones among the destination airports.
# Sample of destination airports to compare against New York.
different_airports <- c("ORD", "DFW", "DEN", "SEA", "ANC", "HNL")
# Offset used for New York; it is subtracted from the airports `tz` column.
nyc_tz <- -5

# Offset of each selected airport relative to New York, in hours.
airports %>%
  filter(is.element(faa, different_airports)) %>%
  mutate(time_diff_hours = tz - nyc_tz) %>%
  select(faa, name, tz, time_diff_hours)
## # A tibble: 6 × 4
## faa name tz time_diff_hours
## <chr> <chr> <dbl> <dbl>
## 1 ANC Ted Stevens Anchorage Intl -9 -4
## 2 DEN Denver Intl -7 -2
## 3 DFW Dallas Fort Worth Intl -6 -1
## 4 HNL Honolulu Intl -10 -5
## 5 ORD Chicago Ohare Intl -6 -1
## 6 SEA Seattle Tacoma Intl -8 -3
#' Time difference between destination airports and New York City
#'
#' Looks up the `tz` value of each destination in an airports table and
#' subtracts New York's value, giving the offset in hours relative to NYC
#' (negative = destination clock is behind New York).
#'
#' The lookup uses `match()`, so it is vectorised over `destination`; the
#' previous `filter(faa == destination)` form only worked for one code at a
#' time and returned a zero-length result for an unknown code.
#'
#' NOTE(review): the `tz` column appears to be a fixed offset, so daylight
#' saving differences are presumably not reflected - confirm against the
#' data documentation.
#'
#' @param destination Character vector of FAA airport codes.
#' @param airport_data Data frame with columns `faa` and `tz`. Defaults to
#'   the `airports` table already used in this script.
#' @param nyc_tz New York's `tz` value in hours (default -5).
#' @return Numeric vector the same length as `destination`; `NA` for codes
#'   not present in `airport_data`.
time_difference_NYC <- function(destination, airport_data = airports,
                                nyc_tz = -5) {
  dest_tz <- airport_data$tz[match(destination, airport_data$faa)]
  dest_tz - nyc_tz
}
#' Actual flight time in minutes, corrected for the time-zone difference
#'
#' `dep_time` and `arr_time` are clock times encoded as HHMM numbers
#' (e.g. 1730 for 17:30). The arrival time is shifted onto the departure
#' (New York) clock using the destination's offset, and 24 hours are added
#' when the shifted arrival falls before the departure (overnight flight).
#'
#' Fixes relative to the earlier version: the helper was called as
#' `Time_difference_NYC`, which does not exist (R is case-sensitive), and the
#' scalar `if` failed on vector input; the wrap-around is now vectorised.
#'
#' @param dep_time Numeric vector of departure times, HHMM, origin clock.
#' @param arr_time Numeric vector of arrival times, HHMM, destination clock.
#' @param origin Origin airport code. Currently unused; the offset is always
#'   taken relative to New York.
#' @param dest Destination airport code(s), used to look up the offset.
#' @param tz_diff_hours Offset of the destination relative to New York, in
#'   hours. Defaults to `time_difference_NYC(dest)`; supply it directly to
#'   skip the lookup.
#' @return Numeric vector of flight durations in minutes (`NA` where an
#'   input is `NA`).
flight_time <- function(dep_time, arr_time, origin, dest,
                        tz_diff_hours = time_difference_NYC(dest)) {
  minutes_per_day <- 24 * 60

  # HHMM -> minutes since midnight.
  dep_min <- (dep_time %/% 100) * 60 + (dep_time %% 100)
  arr_min <- (arr_time %/% 100) * 60 + (arr_time %% 100)

  # Express the arrival on the New York clock.
  arr_min_nyc <- arr_min - tz_diff_hours * 60

  # An arrival "before" the departure means the flight landed the next day.
  arr_min_nyc <- arr_min_nyc +
    ifelse(arr_min_nyc < dep_min, minutes_per_day, 0)

  arr_min_nyc - dep_min
}