1. Regular Expressions

1a) Patterns on the `words` data set

The words vector ships with the stringr package.

length(words)

## [1] 980

head(words)

## [1] "a"        "able"     "about"    "absolute" "accept"   "account"

Words that are exactly four letters long

four <- words[str_detect(words, "^.{4}$")]
length(four)

## [1] 263

head(four, 20)

##  [1] "able" "also" "area" "away" "baby" "back" "ball" "bank" "base" "bear"
## [11] "beat" "best" "bill" "blow" "blue" "boat" "body" "book" "both" "busy"

Words that are either four or five letters long

four_five <- words[str_detect(words, "^.{4,5}$")]
length(four_five)

## [1] 463

head(four_five, 20)

##  [1] "able"  "about" "admit" "after" "again" "agent" "agree" "allow" "along"
## [10] "also"  "apart" "apply" "area"  "argue" "aware" "away"  "awful" "baby" 
## [19] "back"  "ball"

Words whose second letter is “s” or “t”

second_st <- words[str_detect(words, "^.[st]")]
length(second_st)

## [1] 38

head(second_st, 20)

##  [1] "as"        "ask"       "associate" "assume"    "at"        "attend"   
##  [7] "especial"  "issue"     "it"        "item"      "other"     "otherwise"
## [13] "staff"     "stage"     "stairs"    "stand"     "standard"  "start"    
## [19] "state"     "station"

Words that contain a pattern like “oxx” (one letter, then a different letter repeated twice)

oxx <- words[str_detect(words, "(.)(.)\\2")]
length(oxx)

## [1] 157

head(oxx, 20)

##  [1] "accept"      "account"     "across"      "add"         "address"    
##  [6] "affect"      "afford"      "afternoon"   "agree"       "all"        
## [11] "allow"       "apparent"    "appear"      "apply"       "appoint"    
## [16] "approach"    "appropriate" "arrange"     "associate"   "assume"

Words that contain “a”, “e”, and “o” at the same time

aeo <- words[str_detect(words, "a") &
             str_detect(words, "e") &
             str_detect(words, "o")]
aeo

##  [1] "absolute"    "afternoon"   "another"     "appropriate" "associate"  
##  [6] "colleague"   "compare"     "encourage"   "operate"     "organize"   
## [11] "probable"    "programme"   "reason"      "relation"

1b) Plots on the `sentences` data set

length(sentences)

## [1] 720

head(sentences, 3)

## [1] "The birch canoe slid on the smooth planks." 
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."

Bar plot — sentences with vs. without “the” / “The”

sent_df <- tibble(sentence = sentences) %>%
  mutate(has_the = str_detect(sentence, "\\b[Tt]he\\b"))

ggplot(sent_df, aes(x = has_the, fill = has_the)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.3) +
  scale_x_discrete(labels = c("FALSE" = "Without 'the'",
                              "TRUE"  = "With 'the'")) +
  labs(title = "Sentences containing 'the' (or 'The')",
       x = NULL, y = "Count") +
  theme_minimal() +
  theme(legend.position = "none")

Scatterplot — average word length vs. number of vowel-starting words

sent_scatter <- tibble(sentence = sentences) %>%
  mutate(
    word_list      = str_extract_all(sentence, "[A-Za-z]+"),
    avg_word_len   = map_dbl(word_list, ~ mean(str_length(.x))),
    n_vowel_start  = map_int(word_list,
                             ~ sum(str_detect(.x, "^[aeiouAEIOU]")))
  )

ggplot(sent_scatter, aes(avg_word_len, n_vowel_start)) +
  geom_point(alpha = 0.5, colour = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, colour = "tomato") +
  labs(title = "Sentence structure: avg word length vs. vowel-starting words",
       x = "Average word length in the sentence",
       y = "Number of words starting with a vowel") +
  theme_minimal()

1c) Application — Oxford English Dictionary

i–iii. Read the file, make a tibble, drop blank lines

oed_raw <- read_lines("Oxford_English_Dictionary.txt")

oed <- tibble(line = oed_raw) %>%
  filter(str_detect(line, "\\S"))   # keep only lines containing non-whitespace

nrow(oed)

## [1] 36692

head(oed, 5)

## # A tibble: 5 × 1
##   line                                                                          
##   <chr>                                                                         
## 1 "A "                                                                          
## 2 "A-  prefix (also an- before a vowel sound) not, without (amoral). [greek]"   
## 3 "Aa  abbr. 1 automobile association. 2 alcoholics anonymous. 3 anti-aircraft."
## 4 "Aardvark  n. Mammal with a tubular snout and a long tongue, feeding on termi…
## 5 "Ab-  prefix off, away, from (abduct). [latin]"

iv. Extract the headword for each entry into a column `words`

oed <- oed %>%
  mutate(words = str_extract(line, "^\\S+"))

oed %>% select(words, line) %>% head(10)

## # A tibble: 10 × 2
##    words     line                                                               
##    <chr>     <chr>                                                              
##  1 A         "A "                                                               
##  2 A-        "A-  prefix (also an- before a vowel sound) not, without (amoral).…
##  3 Aa        "Aa  abbr. 1 automobile association. 2 alcoholics anonymous. 3 ant…
##  4 Aardvark  "Aardvark  n. Mammal with a tubular snout and a long tongue, feedi…
##  5 Ab-       "Ab-  prefix off, away, from (abduct). [latin]"                    
##  6 Aback     "Aback  adv. \u007f take aback surprise, disconcert. [old english:…
##  7 Abacus    "Abacus  n. (pl. -cuses) 1 frame with wires along which beads are …
##  8 Abaft     "Abaft  naut. —adv. In the stern half of a ship. —prep. Nearer the…
##  9 Abandon   "Abandon  —v. 1 give up. 2 forsake, desert. 3 (often foll. By to; …
## 10 Abandoned "Abandoned  adj. 1 deserted, forsaken. 2 unrestrained, profligate."

v. Words containing “a”, “e”, “i”, “o”, “u”, and “y” all at the same time

all_vowels_y <- oed %>%
  mutate(w = str_to_lower(words)) %>%
  filter(str_detect(w, "a"),
         str_detect(w, "e"),
         str_detect(w, "i"),
         str_detect(w, "o"),
         str_detect(w, "u"),
         str_detect(w, "y")) %>%
  distinct(words)

all_vowels_y

## # A tibble: 6 × 1
##   words          
##   <chr>          
## 1 Byelorussian   
## 2 Fully-fashioned
## 3 Immunotherapy  
## 4 Praseodymium   
## 5 Revolutionary  
## 6 Uncomplimentary

2. Factors

2a) `BankChurners.csv`

bank <- read_csv("BankChurners.csv")
glimpse(bank)

## Rows: 10,127
## Columns: 23
## $ CLIENTNUM                                                                                                                          <dbl> …
## $ Attrition_Flag                                                                                                                     <chr> …
## $ Customer_Age                                                                                                                       <dbl> …
## $ Gender                                                                                                                             <chr> …
## $ Dependent_count                                                                                                                    <dbl> …
## $ Education_Level                                                                                                                    <chr> …
## $ Marital_Status                                                                                                                     <chr> …
## $ Income_Category                                                                                                                    <chr> …
## $ Card_Category                                                                                                                      <chr> …
## $ Months_on_book                                                                                                                     <dbl> …
## $ Total_Relationship_Count                                                                                                           <dbl> …
## $ Months_Inactive_12_mon                                                                                                             <dbl> …
## $ Contacts_Count_12_mon                                                                                                              <dbl> …
## $ Credit_Limit                                                                                                                       <dbl> …
## $ Total_Revolving_Bal                                                                                                                <dbl> …
## $ Avg_Open_To_Buy                                                                                                                    <dbl> …
## $ Total_Amt_Chng_Q4_Q1                                                                                                               <dbl> …
## $ Total_Trans_Amt                                                                                                                    <dbl> …
## $ Total_Trans_Ct                                                                                                                     <dbl> …
## $ Total_Ct_Chng_Q4_Q1                                                                                                                <dbl> …
## $ Avg_Utilization_Ratio                                                                                                              <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 <dbl> …

Which features can be regarded as a (nominal) factor?

These columns hold a small set of unordered categories:

Attrition_Flag — Existing Customer / Attrited Customer
Gender — M / F
Marital_Status — Married, Single, Divorced, Unknown

Which features can be regarded as an ordered factor (ordinal)?

These columns have a meaningful ordering:

Education_Level — Uneducated < High School < College < Graduate < Post-Graduate < Doctorate (plus Unknown)
Income_Category — Less than $40K < $40K - $60K < $60K - $80K < $80K - $120K < $120K + (plus Unknown)
Card_Category — Blue < Silver < Gold < Platinum

Convert the relevant columns to factors / ordered factors

bank <- bank %>%
  mutate(
    Attrition_Flag  = factor(Attrition_Flag),
    Gender          = factor(Gender),
    Marital_Status  = factor(Marital_Status),

    Education_Level = factor(Education_Level,
                             levels = c("Uneducated", "High School", "College",
                                        "Graduate", "Post-Graduate",
                                        "Doctorate", "Unknown"),
                             ordered = TRUE),

    Income_Category = factor(Income_Category,
                             levels = c("Less than $40K", "$40K - $60K",
                                        "$60K - $80K",   "$80K - $120K",
                                        "$120K +",       "Unknown"),
                             ordered = TRUE),

    Card_Category   = factor(Card_Category,
                             levels = c("Blue", "Silver", "Gold", "Platinum"),
                             ordered = TRUE)
  )

bank %>% select(Education_Level, Income_Category, Card_Category) %>% summary()

##       Education_Level       Income_Category  Card_Category 
##  Uneducated   :1487   Less than $40K:3561   Blue    :9436  
##  High School  :2013   $40K - $60K   :1790   Silver  : 555  
##  College      :1013   $60K - $80K   :1402   Gold    : 116  
##  Graduate     :3128   $80K - $120K  :1535   Platinum:  20  
##  Post-Graduate: 516   $120K +       : 727                  
##  Doctorate    : 451   Unknown       :1112                  
##  Unknown      :1519

Effect of education level on average utilization ratio

bank_edu <- bank %>%
  filter(Education_Level != "Unknown") %>%
  group_by(Education_Level) %>%
  summarise(mean_util = mean(Avg_Utilization_Ratio, na.rm = TRUE),
            se        = sd(Avg_Utilization_Ratio,  na.rm = TRUE) / sqrt(n()),
            .groups   = "drop")

ggplot(bank_edu, aes(Education_Level, mean_util, fill = Education_Level)) +
  geom_col(show.legend = FALSE) +
  geom_errorbar(aes(ymin = mean_util - se, ymax = mean_util + se),
                width = 0.2) +
  labs(title = "Average utilization ratio by education level",
       x = "Education level",
       y = "Mean Avg_Utilization_Ratio") +
  theme_minimal()

ggplot(filter(bank, Education_Level != "Unknown"),
       aes(Education_Level, Avg_Utilization_Ratio, fill = Education_Level)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Distribution of utilization ratio by education level",
       x = "Education level", y = "Avg_Utilization_Ratio") +
  theme_minimal()

The mean utilization ratio is roughly similar (~0.27–0.28) across education levels, suggesting education has only a small effect on how heavily customers use their available credit.

2b) `gss_cat` data set

gss_cat

## # A tibble: 21,483 × 9
##     year marital         age race  rincome        partyid    relig denom tvhours
##    <int> <fct>         <int> <fct> <fct>          <fct>      <fct> <fct>   <int>
##  1  2000 Never married    26 White $8000 to 9999  Ind,near … Prot… Sout…      12
##  2  2000 Divorced         48 White $8000 to 9999  Not str r… Prot… Bapt…      NA
##  3  2000 Widowed          67 White Not applicable Independe… Prot… No d…       2
##  4  2000 Never married    39 White Not applicable Ind,near … Orth… Not …       4
##  5  2000 Divorced         25 White Not applicable Not str d… None  Not …       1
##  6  2000 Married          25 White $20000 - 24999 Strong de… Prot… Sout…      NA
##  7  2000 Never married    36 White $25000 or more Not str r… Chri… Not …       3
##  8  2000 Divorced         44 White $7000 to 7999  Ind,near … Prot… Luth…      NA
##  9  2000 Married          44 White $25000 or more Not str d… Prot… Other       0
## 10  2000 Married          47 White $25000 or more Strong re… Prot… Sout…       3
## # ℹ 21,473 more rows

Levels of `marital`

levels(gss_cat$marital)

## [1] "No answer"     "Never married" "Separated"     "Divorced"     
## [5] "Widowed"       "Married"

Collapse “Separated”, “Divorced”, “Widowed” into “Once Married”

gss2 <- gss_cat %>%
  mutate(marital = fct_collapse(marital,
                                "Once Married" = c("Separated",
                                                   "Divorced",
                                                   "Widowed")))

levels(gss2$marital)

## [1] "No answer"     "Never married" "Once Married"  "Married"

gss2 %>% count(marital)

## # A tibble: 4 × 2
##   marital           n
##   <fct>         <int>
## 1 No answer        17
## 2 Never married  5416
## 3 Once Married   5933
## 4 Married       10117

Does marital status affect `tvhours`?

gss2 %>%
  filter(!is.na(tvhours), marital != "No answer") %>%
  group_by(marital) %>%
  summarise(mean_tv = mean(tvhours), n = n(), .groups = "drop") %>%
  ggplot(aes(fct_reorder(marital, mean_tv), mean_tv, fill = marital)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = round(mean_tv, 2)), vjust = -0.3) +
  labs(title = "Average daily TV hours by (collapsed) marital status",
       x = "Marital status", y = "Mean tvhours") +
  theme_minimal()

gss2 %>%
  filter(!is.na(tvhours), marital != "No answer") %>%
  ggplot(aes(marital, tvhours, fill = marital)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "TV hours distribution by marital status",
       x = "Marital status", y = "tvhours") +
  theme_minimal()

gss_anova <- gss2 %>% filter(!is.na(tvhours), marital != "No answer")
summary(aov(tvhours ~ marital, data = gss_anova))

##                Df Sum Sq Mean Sq F value Pr(>F)    
## marital         2   1179   589.3   89.36 <2e-16 ***
## Residuals   11325  74685     6.6                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The “Once Married” and “Never married” groups watch noticeably more TV on average than “Married” respondents; the ANOVA p-value is essentially zero, so the effect is statistically significant.

3. Date and Time — `nycflights13`

library(nycflights13)
head(flights, 3)

## # A tibble: 3 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

head(airports, 3)

## # A tibble: 3 × 8
##   faa   name                            lat   lon   alt    tz dst   tzone       
##   <chr> <chr>                         <dbl> <dbl> <dbl> <dbl> <chr> <chr>       
## 1 04G   Lansdowne Airport              41.1 -80.6  1044    -5 A     America/New…
## 2 06A   Moton Field Municipal Airport  32.5 -85.7   264    -6 A     America/Chi…
## 3 06C   Schaumburg Regional            42.0 -88.1   801    -6 A     America/Chi…

How many timezones cover all destination airports (excluding NA)?

dest_tz <- flights %>%
  distinct(dest) %>%
  left_join(airports, by = c("dest" = "faa")) %>%
  filter(!is.na(tzone)) %>%
  distinct(tzone)

dest_tz

## # A tibble: 7 × 1
##   tzone              
##   <chr>              
## 1 America/Chicago    
## 2 America/New_York   
## 3 America/Los_Angeles
## 4 America/Phoenix    
## 5 America/Denver     
## 6 Pacific/Honolulu   
## 7 America/Anchorage

nrow(dest_tz)

## [1] 7

Time difference (in hours) between NYC and selected cities

nyc_tz <- airports %>%
  filter(faa %in% c("JFK", "LGA", "EWR")) %>%
  summarise(tz = unique(tz)) %>% pull(tz)
nyc_tz

## [1] -5

target <- tibble(
  city = c("Chicago", "Dallas", "Denver", "Seattle", "Anchorage", "Honolulu"),
  faa  = c("ORD",     "DFW",    "DEN",    "SEA",     "ANC",       "HNL")
)

target %>%
  left_join(airports, by = "faa") %>%
  mutate(time_diff_from_NYC = tz - nyc_tz) %>%
  select(city, faa, name, tz, time_diff_from_NYC)

## # A tibble: 6 × 5
##   city      faa   name                          tz time_diff_from_NYC
##   <chr>     <chr> <chr>                      <dbl>              <dbl>
## 1 Chicago   ORD   Chicago Ohare Intl            -6                 -1
## 2 Dallas    DFW   Dallas Fort Worth Intl        -6                 -1
## 3 Denver    DEN   Denver Intl                   -7                 -2
## 4 Seattle   SEA   Seattle Tacoma Intl           -8                 -3
## 5 Anchorage ANC   Ted Stevens Anchorage Intl    -9                 -4
## 6 Honolulu  HNL   Honolulu Intl                -10                 -5

A negative value means the destination is behind New York City.

Function — `Time_difference_NYC(dest)`

Time_difference_NYC <- function(dest) {
  nyc_tz  <- airports %>% filter(faa == "JFK") %>% pull(tz)
  dest_tz <- airports %>% filter(faa == dest) %>% pull(tz)

  if (length(dest_tz) == 0) {
    warning("FAA code '", dest, "' not found in airports.")
    return(NA_real_)
  }
  dest_tz - nyc_tz
}

# Tests
Time_difference_NYC("LAX")   # Los Angeles

## [1] -3

Time_difference_NYC("ORD")   # Chicago

## [1] -1

Time_difference_NYC("HNL")   # Honolulu

## [1] -5

Time_difference_NYC("ANC")   # Anchorage

## [1] -4

Function — `flight_time(dep_time, arr_time, origin, dest)`

flight_time <- function(dep_time, arr_time, origin, dest) {

  hhmm_to_hours <- function(x) (x %/% 100) + (x %% 100) / 60

  dep_local <- hhmm_to_hours(dep_time)
  arr_local <- hhmm_to_hours(arr_time)

  origin_tz <- airports %>% filter(faa == origin) %>% pull(tz)
  dest_tz   <- airports %>% filter(faa == dest)   %>% pull(tz)

  if (length(origin_tz) == 0 || length(dest_tz) == 0) {
    warning("Unknown FAA code for origin or destination.")
    return(NA_real_)
  }

  dep_utc <- dep_local - origin_tz
  arr_utc <- arr_local - dest_tz

  if (arr_utc < dep_utc) arr_utc <- arr_utc + 24

  arr_utc - dep_utc
}

flights %>%
  filter(!is.na(dep_time), !is.na(arr_time)) %>%
  slice(1:5) %>%
  rowwise() %>%
  mutate(computed_hours = flight_time(dep_time, arr_time, origin, dest),
         air_time_hrs   = air_time / 60) %>%
  select(origin, dest, dep_time, arr_time,
         computed_hours, air_time_hrs)

## # A tibble: 5 × 6
## # Rowwise: 
##   origin dest  dep_time arr_time computed_hours air_time_hrs
##   <chr>  <chr>    <int>    <int>          <dbl>        <dbl>
## 1 EWR    IAH        517      830           4.22         3.78
## 2 LGA    IAH        533      850           4.28         3.78
## 3 JFK    MIA        542      923           3.68         2.67
## 4 JFK    BQN        544     1004          NA            3.05
## 5 LGA    ATL        554      812           2.3          1.93

The computed elapsed time should be slightly larger than air_time (which is in-flight only and excludes taxi time), confirming the function works.

Assignment: Regex, Factors, Dates & Times

Charles

2026-05-13

1. Regular Expressions

1a) Patterns on the `words` data set

Words that are exactly four letters long

Words that are either four or five letters long

Words whose second letter is “s” or “t”

Words that contain a pattern like “oxx” (one letter, then a different letter repeated twice)

Words that contain “a”, “e”, and “o” at the same time

1b) Plots on the `sentences` data set

Bar plot — sentences with vs. without “the” / “The”

Scatterplot — average word length vs. number of vowel-starting words

1c) Application — Oxford English Dictionary

i–iii. Read the file, make a tibble, drop blank lines

iv. Extract the headword for each entry into a column `words`

v. Words containing “a”, “e”, “i”, “o”, “u”, and “y” all at the same time

2. Factors

2a) `BankChurners.csv`

Which features can be regarded as a (nominal) factor?

Which features can be regarded as an ordered factor (ordinal)?

Convert the relevant columns to factors / ordered factors

Effect of education level on average utilization ratio

2b) `gss_cat` data set

Levels of `marital`

Collapse “Separated”, “Divorced”, “Widowed” into “Once Married”

Does marital status affect `tvhours`?

3. Date and Time — `nycflights13`

How many timezones cover all destination airports (excluding NA)?

Time difference (in hours) between NYC and selected cities

Function — `Time_difference_NYC(dest)`

Function — `flight_time(dep_time, arr_time, origin, dest)`

Assignment: Regex, Factors, Dates & Times

Charles

2026-05-13

1. Regular Expressions

1a) Patterns on the words data set

Words that are exactly four letters long

Words that are either four or five letters long

Words whose second letter is “s” or “t”

Words that contain a pattern like “oxx” (one letter, then a different letter repeated twice)

Words that contain “a”, “e”, and “o” at the same time

1b) Plots on the sentences data set

Bar plot — sentences with vs. without “the” / “The”

Scatterplot — average word length vs. number of vowel-starting words

1c) Application — Oxford English Dictionary

i–iii. Read the file, make a tibble, drop blank lines

iv. Extract the headword for each entry into a column words

v. Words containing “a”, “e”, “i”, “o”, “u”, and “y” all at the same time

2. Factors

2a) BankChurners.csv

Which features can be regarded as a (nominal) factor?

Which features can be regarded as an ordered factor (ordinal)?

Convert the relevant columns to factors / ordered factors

Effect of education level on average utilization ratio

2b) gss_cat data set

Levels of marital

Collapse “Separated”, “Divorced”, “Widowed” into “Once Married”

Does marital status affect tvhours?

3. Date and Time — nycflights13

How many timezones cover all destination airports (excluding NA)?

Time difference (in hours) between NYC and selected cities

Function — Time_difference_NYC(dest)

Function — flight_time(dep_time, arr_time, origin, dest)

1a) Patterns on the `words` data set

1b) Plots on the `sentences` data set

iv. Extract the headword for each entry into a column `words`

2a) `BankChurners.csv`

2b) `gss_cat` data set

Levels of `marital`

Does marital status affect `tvhours`?

3. Date and Time — `nycflights13`

Function — `Time_difference_NYC(dest)`

Function — `flight_time(dep_time, arr_time, origin, dest)`