1. Regular Expressions

1a) Patterns on the words data set

The words vector ships with the stringr package.

length(words)
## [1] 980
head(words)
## [1] "a"        "able"     "about"    "absolute" "accept"   "account"

Words that are exactly four letters long

four <- words[str_detect(words, "^.{4}$")]
length(four)
## [1] 263
head(four, 20)
##  [1] "able" "also" "area" "away" "baby" "back" "ball" "bank" "base" "bear"
## [11] "beat" "best" "bill" "blow" "blue" "boat" "body" "book" "both" "busy"

Words that are either four or five letters long

four_five <- words[str_detect(words, "^.{4,5}$")]
length(four_five)
## [1] 463
head(four_five, 20)
##  [1] "able"  "about" "admit" "after" "again" "agent" "agree" "allow" "along"
## [10] "also"  "apart" "apply" "area"  "argue" "aware" "away"  "awful" "baby" 
## [19] "back"  "ball"

Words whose second letter is “s” or “t”

second_st <- words[str_detect(words, "^.[st]")]
length(second_st)
## [1] 38
head(second_st, 20)
##  [1] "as"        "ask"       "associate" "assume"    "at"        "attend"   
##  [7] "especial"  "issue"     "it"        "item"      "other"     "otherwise"
## [13] "staff"     "stage"     "stairs"    "stand"     "standard"  "start"    
## [19] "state"     "station"

Words that contain a pattern like “oxx” (one letter, then a different letter repeated twice)

oxx <- words[str_detect(words, "(.)(.)\\2")]
length(oxx)
## [1] 157
head(oxx, 20)
##  [1] "accept"      "account"     "across"      "add"         "address"    
##  [6] "affect"      "afford"      "afternoon"   "agree"       "all"        
## [11] "allow"       "apparent"    "appear"      "apply"       "appoint"    
## [16] "approach"    "appropriate" "arrange"     "associate"   "assume"

Words that contain “a”, “e”, and “o” at the same time

aeo <- words[str_detect(words, "a") &
             str_detect(words, "e") &
             str_detect(words, "o")]
aeo
##  [1] "absolute"    "afternoon"   "another"     "appropriate" "associate"  
##  [6] "colleague"   "compare"     "encourage"   "operate"     "organize"   
## [11] "probable"    "programme"   "reason"      "relation"

1b) Plots on the sentences data set

length(sentences)
## [1] 720
head(sentences, 3)
## [1] "The birch canoe slid on the smooth planks." 
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."

Bar plot — sentences with vs. without “the” / “The”

sent_df <- tibble(sentence = sentences) %>%
  mutate(has_the = str_detect(sentence, "\\b[Tt]he\\b"))

ggplot(sent_df, aes(x = has_the, fill = has_the)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.3) +
  scale_x_discrete(labels = c("FALSE" = "Without 'the'",
                              "TRUE"  = "With 'the'")) +
  labs(title = "Sentences containing 'the' (or 'The')",
       x = NULL, y = "Count") +
  theme_minimal() +
  theme(legend.position = "none")

Scatterplot — average word length vs. number of vowel-starting words

sent_scatter <- tibble(sentence = sentences) %>%
  mutate(
    word_list      = str_extract_all(sentence, "[A-Za-z]+"),
    avg_word_len   = map_dbl(word_list, ~ mean(str_length(.x))),
    n_vowel_start  = map_int(word_list,
                             ~ sum(str_detect(.x, "^[aeiouAEIOU]")))
  )

ggplot(sent_scatter, aes(avg_word_len, n_vowel_start)) +
  geom_point(alpha = 0.5, colour = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, colour = "tomato") +
  labs(title = "Sentence structure: avg word length vs. vowel-starting words",
       x = "Average word length in the sentence",
       y = "Number of words starting with a vowel") +
  theme_minimal()

1c) Application — Oxford English Dictionary

i–iii. Read the file, make a tibble, drop blank lines

oed_raw <- read_lines("Oxford_English_Dictionary.txt")

oed <- tibble(line = oed_raw) %>%
  filter(str_detect(line, "\\S"))   # keep only lines containing non-whitespace

nrow(oed)
## [1] 36692
head(oed, 5)
## # A tibble: 5 × 1
##   line                                                                          
##   <chr>                                                                         
## 1 "A "                                                                          
## 2 "A-  prefix (also an- before a vowel sound) not, without (amoral). [greek]"   
## 3 "Aa  abbr. 1 automobile association. 2 alcoholics anonymous. 3 anti-aircraft."
## 4 "Aardvark  n. Mammal with a tubular snout and a long tongue, feeding on termi…
## 5 "Ab-  prefix off, away, from (abduct). [latin]"

iv. Extract the headword for each entry into a column words

oed <- oed %>%
  mutate(words = str_extract(line, "^\\S+"))

oed %>% select(words, line) %>% head(10)
## # A tibble: 10 × 2
##    words     line                                                               
##    <chr>     <chr>                                                              
##  1 A         "A "                                                               
##  2 A-        "A-  prefix (also an- before a vowel sound) not, without (amoral).…
##  3 Aa        "Aa  abbr. 1 automobile association. 2 alcoholics anonymous. 3 ant…
##  4 Aardvark  "Aardvark  n. Mammal with a tubular snout and a long tongue, feedi…
##  5 Ab-       "Ab-  prefix off, away, from (abduct). [latin]"                    
##  6 Aback     "Aback  adv. \u007f take aback surprise, disconcert. [old english:…
##  7 Abacus    "Abacus  n. (pl. -cuses) 1 frame with wires along which beads are …
##  8 Abaft     "Abaft  naut. —adv. In the stern half of a ship. —prep. Nearer the…
##  9 Abandon   "Abandon  —v. 1 give up. 2 forsake, desert. 3 (often foll. By to; …
## 10 Abandoned "Abandoned  adj. 1 deserted, forsaken. 2 unrestrained, profligate."

v. Words containing “a”, “e”, “i”, “o”, “u”, and “y” all at the same time

all_vowels_y <- oed %>%
  mutate(w = str_to_lower(words)) %>%
  filter(str_detect(w, "a"),
         str_detect(w, "e"),
         str_detect(w, "i"),
         str_detect(w, "o"),
         str_detect(w, "u"),
         str_detect(w, "y")) %>%
  distinct(words)

all_vowels_y
## # A tibble: 6 × 1
##   words          
##   <chr>          
## 1 Byelorussian   
## 2 Fully-fashioned
## 3 Immunotherapy  
## 4 Praseodymium   
## 5 Revolutionary  
## 6 Uncomplimentary

2. Factors

2a) BankChurners.csv

bank <- read_csv("BankChurners.csv")
glimpse(bank)
## Rows: 10,127
## Columns: 23
## $ CLIENTNUM                                                                                                                          <dbl> …
## $ Attrition_Flag                                                                                                                     <chr> …
## $ Customer_Age                                                                                                                       <dbl> …
## $ Gender                                                                                                                             <chr> …
## $ Dependent_count                                                                                                                    <dbl> …
## $ Education_Level                                                                                                                    <chr> …
## $ Marital_Status                                                                                                                     <chr> …
## $ Income_Category                                                                                                                    <chr> …
## $ Card_Category                                                                                                                      <chr> …
## $ Months_on_book                                                                                                                     <dbl> …
## $ Total_Relationship_Count                                                                                                           <dbl> …
## $ Months_Inactive_12_mon                                                                                                             <dbl> …
## $ Contacts_Count_12_mon                                                                                                              <dbl> …
## $ Credit_Limit                                                                                                                       <dbl> …
## $ Total_Revolving_Bal                                                                                                                <dbl> …
## $ Avg_Open_To_Buy                                                                                                                    <dbl> …
## $ Total_Amt_Chng_Q4_Q1                                                                                                               <dbl> …
## $ Total_Trans_Amt                                                                                                                    <dbl> …
## $ Total_Trans_Ct                                                                                                                     <dbl> …
## $ Total_Ct_Chng_Q4_Q1                                                                                                                <dbl> …
## $ Avg_Utilization_Ratio                                                                                                              <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 <dbl> …

Which features can be regarded as a (nominal) factor?

These columns hold a small set of unordered categories:

  • Attrition_FlagExisting Customer / Attrited Customer
  • GenderM / F
  • Marital_StatusMarried, Single, Divorced, Unknown

Which features can be regarded as an ordered factor (ordinal)?

These columns have a meaningful ordering:

  • Education_LevelUneducated < High School < College < Graduate < Post-Graduate < Doctorate (plus Unknown)
  • Income_CategoryLess than $40K < $40K - $60K < $60K - $80K < $80K - $120K < $120K + (plus Unknown)
  • Card_CategoryBlue < Silver < Gold < Platinum

Convert the relevant columns to factors / ordered factors

bank <- bank %>%
  mutate(
    Attrition_Flag  = factor(Attrition_Flag),
    Gender          = factor(Gender),
    Marital_Status  = factor(Marital_Status),

    Education_Level = factor(Education_Level,
                             levels = c("Uneducated", "High School", "College",
                                        "Graduate", "Post-Graduate",
                                        "Doctorate", "Unknown"),
                             ordered = TRUE),

    Income_Category = factor(Income_Category,
                             levels = c("Less than $40K", "$40K - $60K",
                                        "$60K - $80K",   "$80K - $120K",
                                        "$120K +",       "Unknown"),
                             ordered = TRUE),

    Card_Category   = factor(Card_Category,
                             levels = c("Blue", "Silver", "Gold", "Platinum"),
                             ordered = TRUE)
  )

bank %>% select(Education_Level, Income_Category, Card_Category) %>% summary()
##       Education_Level       Income_Category  Card_Category 
##  Uneducated   :1487   Less than $40K:3561   Blue    :9436  
##  High School  :2013   $40K - $60K   :1790   Silver  : 555  
##  College      :1013   $60K - $80K   :1402   Gold    : 116  
##  Graduate     :3128   $80K - $120K  :1535   Platinum:  20  
##  Post-Graduate: 516   $120K +       : 727                  
##  Doctorate    : 451   Unknown       :1112                  
##  Unknown      :1519

Effect of education level on average utilization ratio

bank_edu <- bank %>%
  filter(Education_Level != "Unknown") %>%
  group_by(Education_Level) %>%
  summarise(mean_util = mean(Avg_Utilization_Ratio, na.rm = TRUE),
            se        = sd(Avg_Utilization_Ratio,  na.rm = TRUE) / sqrt(n()),
            .groups   = "drop")

ggplot(bank_edu, aes(Education_Level, mean_util, fill = Education_Level)) +
  geom_col(show.legend = FALSE) +
  geom_errorbar(aes(ymin = mean_util - se, ymax = mean_util + se),
                width = 0.2) +
  labs(title = "Average utilization ratio by education level",
       x = "Education level",
       y = "Mean Avg_Utilization_Ratio") +
  theme_minimal()

ggplot(filter(bank, Education_Level != "Unknown"),
       aes(Education_Level, Avg_Utilization_Ratio, fill = Education_Level)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Distribution of utilization ratio by education level",
       x = "Education level", y = "Avg_Utilization_Ratio") +
  theme_minimal()

The mean utilization ratio is roughly similar (~0.27–0.28) across education levels, suggesting education has only a small effect on how heavily customers use their available credit.

2b) gss_cat data set

gss_cat
## # A tibble: 21,483 × 9
##     year marital         age race  rincome        partyid    relig denom tvhours
##    <int> <fct>         <int> <fct> <fct>          <fct>      <fct> <fct>   <int>
##  1  2000 Never married    26 White $8000 to 9999  Ind,near … Prot… Sout…      12
##  2  2000 Divorced         48 White $8000 to 9999  Not str r… Prot… Bapt…      NA
##  3  2000 Widowed          67 White Not applicable Independe… Prot… No d…       2
##  4  2000 Never married    39 White Not applicable Ind,near … Orth… Not …       4
##  5  2000 Divorced         25 White Not applicable Not str d… None  Not …       1
##  6  2000 Married          25 White $20000 - 24999 Strong de… Prot… Sout…      NA
##  7  2000 Never married    36 White $25000 or more Not str r… Chri… Not …       3
##  8  2000 Divorced         44 White $7000 to 7999  Ind,near … Prot… Luth…      NA
##  9  2000 Married          44 White $25000 or more Not str d… Prot… Other       0
## 10  2000 Married          47 White $25000 or more Strong re… Prot… Sout…       3
## # ℹ 21,473 more rows

Levels of marital

levels(gss_cat$marital)
## [1] "No answer"     "Never married" "Separated"     "Divorced"     
## [5] "Widowed"       "Married"

Collapse “Separated”, “Divorced”, “Widowed” into “Once Married”

gss2 <- gss_cat %>%
  mutate(marital = fct_collapse(marital,
                                "Once Married" = c("Separated",
                                                   "Divorced",
                                                   "Widowed")))

levels(gss2$marital)
## [1] "No answer"     "Never married" "Once Married"  "Married"
gss2 %>% count(marital)
## # A tibble: 4 × 2
##   marital           n
##   <fct>         <int>
## 1 No answer        17
## 2 Never married  5416
## 3 Once Married   5933
## 4 Married       10117

Does marital status affect tvhours?

gss2 %>%
  filter(!is.na(tvhours), marital != "No answer") %>%
  group_by(marital) %>%
  summarise(mean_tv = mean(tvhours), n = n(), .groups = "drop") %>%
  ggplot(aes(fct_reorder(marital, mean_tv), mean_tv, fill = marital)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = round(mean_tv, 2)), vjust = -0.3) +
  labs(title = "Average daily TV hours by (collapsed) marital status",
       x = "Marital status", y = "Mean tvhours") +
  theme_minimal()

gss2 %>%
  filter(!is.na(tvhours), marital != "No answer") %>%
  ggplot(aes(marital, tvhours, fill = marital)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "TV hours distribution by marital status",
       x = "Marital status", y = "tvhours") +
  theme_minimal()

gss_anova <- gss2 %>% filter(!is.na(tvhours), marital != "No answer")
summary(aov(tvhours ~ marital, data = gss_anova))
##                Df Sum Sq Mean Sq F value Pr(>F)    
## marital         2   1179   589.3   89.36 <2e-16 ***
## Residuals   11325  74685     6.6                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The “Once Married” and “Never married” groups watch noticeably more TV on average than “Married” respondents; the ANOVA p-value is essentially zero, so the effect is statistically significant.


3. Date and Time — nycflights13

library(nycflights13)
head(flights, 3)
## # A tibble: 3 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
head(airports, 3)
## # A tibble: 3 × 8
##   faa   name                            lat   lon   alt    tz dst   tzone       
##   <chr> <chr>                         <dbl> <dbl> <dbl> <dbl> <chr> <chr>       
## 1 04G   Lansdowne Airport              41.1 -80.6  1044    -5 A     America/New…
## 2 06A   Moton Field Municipal Airport  32.5 -85.7   264    -6 A     America/Chi…
## 3 06C   Schaumburg Regional            42.0 -88.1   801    -6 A     America/Chi…

How many timezones cover all destination airports (excluding NA)?

dest_tz <- flights %>%
  distinct(dest) %>%
  left_join(airports, by = c("dest" = "faa")) %>%
  filter(!is.na(tzone)) %>%
  distinct(tzone)

dest_tz
## # A tibble: 7 × 1
##   tzone              
##   <chr>              
## 1 America/Chicago    
## 2 America/New_York   
## 3 America/Los_Angeles
## 4 America/Phoenix    
## 5 America/Denver     
## 6 Pacific/Honolulu   
## 7 America/Anchorage
nrow(dest_tz)
## [1] 7

Time difference (in hours) between NYC and selected cities

nyc_tz <- airports %>%
  filter(faa %in% c("JFK", "LGA", "EWR")) %>%
  summarise(tz = unique(tz)) %>% pull(tz)
nyc_tz
## [1] -5
target <- tibble(
  city = c("Chicago", "Dallas", "Denver", "Seattle", "Anchorage", "Honolulu"),
  faa  = c("ORD",     "DFW",    "DEN",    "SEA",     "ANC",       "HNL")
)

target %>%
  left_join(airports, by = "faa") %>%
  mutate(time_diff_from_NYC = tz - nyc_tz) %>%
  select(city, faa, name, tz, time_diff_from_NYC)
## # A tibble: 6 × 5
##   city      faa   name                          tz time_diff_from_NYC
##   <chr>     <chr> <chr>                      <dbl>              <dbl>
## 1 Chicago   ORD   Chicago Ohare Intl            -6                 -1
## 2 Dallas    DFW   Dallas Fort Worth Intl        -6                 -1
## 3 Denver    DEN   Denver Intl                   -7                 -2
## 4 Seattle   SEA   Seattle Tacoma Intl           -8                 -3
## 5 Anchorage ANC   Ted Stevens Anchorage Intl    -9                 -4
## 6 Honolulu  HNL   Honolulu Intl                -10                 -5

A negative value means the destination is behind New York City.

Function — Time_difference_NYC(dest)

Time_difference_NYC <- function(dest) {
  nyc_tz  <- airports %>% filter(faa == "JFK") %>% pull(tz)
  dest_tz <- airports %>% filter(faa == dest) %>% pull(tz)

  if (length(dest_tz) == 0) {
    warning("FAA code '", dest, "' not found in airports.")
    return(NA_real_)
  }
  dest_tz - nyc_tz
}

# Tests
Time_difference_NYC("LAX")   # Los Angeles
## [1] -3
Time_difference_NYC("ORD")   # Chicago
## [1] -1
Time_difference_NYC("HNL")   # Honolulu
## [1] -5
Time_difference_NYC("ANC")   # Anchorage
## [1] -4

Function — flight_time(dep_time, arr_time, origin, dest)

flight_time <- function(dep_time, arr_time, origin, dest) {

  hhmm_to_hours <- function(x) (x %/% 100) + (x %% 100) / 60

  dep_local <- hhmm_to_hours(dep_time)
  arr_local <- hhmm_to_hours(arr_time)

  origin_tz <- airports %>% filter(faa == origin) %>% pull(tz)
  dest_tz   <- airports %>% filter(faa == dest)   %>% pull(tz)

  if (length(origin_tz) == 0 || length(dest_tz) == 0) {
    warning("Unknown FAA code for origin or destination.")
    return(NA_real_)
  }

  dep_utc <- dep_local - origin_tz
  arr_utc <- arr_local - dest_tz

  if (arr_utc < dep_utc) arr_utc <- arr_utc + 24

  arr_utc - dep_utc
}

flights %>%
  filter(!is.na(dep_time), !is.na(arr_time)) %>%
  slice(1:5) %>%
  rowwise() %>%
  mutate(computed_hours = flight_time(dep_time, arr_time, origin, dest),
         air_time_hrs   = air_time / 60) %>%
  select(origin, dest, dep_time, arr_time,
         computed_hours, air_time_hrs)
## # A tibble: 5 × 6
## # Rowwise: 
##   origin dest  dep_time arr_time computed_hours air_time_hrs
##   <chr>  <chr>    <int>    <int>          <dbl>        <dbl>
## 1 EWR    IAH        517      830           4.22         3.78
## 2 LGA    IAH        533      850           4.28         3.78
## 3 JFK    MIA        542      923           3.68         2.67
## 4 JFK    BQN        544     1004          NA            3.05
## 5 LGA    ATL        554      812           2.3          1.93

The computed elapsed time should be slightly larger than air_time (which is in-flight only and excludes taxi time), confirming the function works.