1.Regular expressions

a) Use the words data set, find all the words that match each of the following patterns:

are exactly four letters long

Answer:

four_letters <- str_subset(words, "^.{4}$")

head(four_letters)

## [1] "able" "also" "area" "away" "baby" "back"

are either four or five letters long

Answer:

fourOrFive_letters <- str_subset(words, "^.{4,5}$")

head(fourOrFive_letters)

## [1] "able"  "about" "admit" "after" "again" "agent"

the second letter is “s” or “t”

Answer:

sec_letterST <- str_subset(words, "^.[st]")

head(sec_letterST)

## [1] "as"        "ask"       "associate" "assume"    "at"        "attend"

contains the pattern like “oxx” where “o” is one letter and “x” is another letter

Answer:

special_pattern <- str_subset(words, "([a-z])([a-z])\\2")

head(special_pattern)

## [1] "accept"  "account" "across"  "add"     "address" "affect"

contains “a”, “e” and “o” at the same time

Answer: lookaheads

contain_aeo <- str_subset(words, "(?=.*a)(?=.*e)(?=.*o)")

head(contain_aeo)

## [1] "absolute"    "afternoon"   "another"     "appropriate" "associate"  
## [6] "colleague"

b) Use the sentences data set, make the following plot

a bar plot counting sentences with and without “the” (or “The”).

Answer:

sentences_df <- tibble(
  sentence = sentences
)

summary_the <- sentences_df %>%
  mutate(if_the = str_detect(str_to_lower(sentence), "(the)"))

ggplot(summary_the) +
  geom_bar(aes(if_the, fill = if_the)) + 
  labs(
    title = "Frequency of the Word 'the' in Sentences",
    x = "Have 'the' or not",
    y = "Number of the sentences"
  )

a scatterplot with 𝑥 being the average length of words in a sentence, and 𝑦 being the number of words starting with “a” or “e” or “i” or “o” or “u” in the sentence.

Answer:

summary_length <- sentences_df %>%
  mutate(
    words_length = str_count(str_to_lower(sentence), "[a-z]")/(str_count(sentence, "\\s")+1),
    swords_count = str_count(str_to_lower(sentence), "(\\b[aeiou][a-z])*\\b")
  )

ggplot(summary_length) +
  geom_point(aes(words_length, swords_count)) +
  labs(
    title = "Scatterplot of average length of words and words start with aeiou count",
    x = "Words that started with a,e,i,o,u",
    y = "Average length of words"
  )

c) Applicarion

Download the Oxford English Dictionary as a “.txt” file from https://drive.google.com/file/d/1r0MrJDUGVv_Xh1I1cZ4ldMJedYOsiIAH/view?usp=sharing
Read it into RStudio with read_lines() function (check how to use it by yourself)

Answer:

Oxford_dictionary <- read_lines("/Users/yuhe/Downloads/Oxford_English_Dictionary.txt")

Turn the dictionary into a tibble and remove all blank lines

Answer:

Oxford_dictionary_df <- tibble(content = Oxford_dictionary) %>%
  filter(str_detect(content, "\\S"))

glimpse(Oxford_dictionary_df)

## Rows: 36,692
## Columns: 1
## $ content <chr> "A ", "A-  prefix (also an- before a vowel sound) not, without…

Use regular expression to extract all words for each item in a separate column named “words”

Answer:

Oxford_dictionary_df<- Oxford_dictionary_df %>%
  mutate(words = str_extract(content, "^[A-Za-z\\-]+")) %>%
  filter(!is.na(words))

glimpse(Oxford_dictionary_df)

## Rows: 36,661
## Columns: 2
## $ content <chr> "A ", "A-  prefix (also an- before a vowel sound) not, without…
## $ words   <chr> "A", "A-", "Aa", "Aardvark", "Ab-", "Aback", "Abacus", "Abaft"…

Find all words in the dictionary that contain “a”, “e”, “i”, “o”, “u” and “y” at the same time.

Answer:

matchwords <- str_subset(Oxford_dictionary_df$words, "^(?=.*a)(?=.*e)(?=.*i)(?=.*o)(?=.*u)(?=.*y)")

head(matchwords)

## [1] "Byelorussian"    "Fully-fashioned" "Praseodymium"    "Revolutionary"

Factors

a) Use the BankChurners.csv to answer the following questions:

Which features can be regarded as a factor?

Answer: The “Attrition_Flag”, “Gender”, “Marital_Status” can be regarded as factors.

Which features can be regarded as an ordered factor (ordinal)?

Answer: The “Education_Level”, “Income_Category”, “Card_Category” can be regarded ass ordered factors.

Read BankChurners.csv into RStudio, then change the columns that you answered above into factors or ordered factors.

bank_churners <- read_csv("/Users/yuhe/Downloads/BankChurners 2.csv")

## Rows: 10127 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): Attrition_Flag, Gender, Education_Level, Marital_Status, Income_Ca...
## dbl (17): CLIENTNUM, Customer_Age, Dependent_count, Months_on_book, Total_Re...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

bank_churners_2 <- bank_churners %>%
  mutate(
    Attrition_Flag = as.factor(Attrition_Flag),
    Gender = as.factor(Gender),
    Marital_Status = as.factor(Marital_Status),
    
    Education_Level = factor(Education_Level,
                             levels = c("Unknown", "Uneducated", "High School", "College", "Graduate", "Post-Graduate", "Doctorate"), 
                             ordered = TRUE),
    Category = factor(Income_Category, 
                      levels = c("Unknown", "Less than $40K", "$40K - $60K", "$60K - $80K", "$80K - $120K", "$120K +"), 
                      ordered = TRUE),
    Card_Category = factor(Card_Category, 
                           levels = c("Blue", "Silver", "Gold", "Platinum"), 
                           ordered = TRUE)
  )

glimpse(bank_churners_2)

## Rows: 10,127
## Columns: 24
## $ CLIENTNUM                                                                                                                          <dbl> …
## $ Attrition_Flag                                                                                                                     <fct> …
## $ Customer_Age                                                                                                                       <dbl> …
## $ Gender                                                                                                                             <fct> …
## $ Dependent_count                                                                                                                    <dbl> …
## $ Education_Level                                                                                                                    <ord> …
## $ Marital_Status                                                                                                                     <fct> …
## $ Income_Category                                                                                                                    <chr> …
## $ Card_Category                                                                                                                      <ord> …
## $ Months_on_book                                                                                                                     <dbl> …
## $ Total_Relationship_Count                                                                                                           <dbl> …
## $ Months_Inactive_12_mon                                                                                                             <dbl> …
## $ Contacts_Count_12_mon                                                                                                              <dbl> …
## $ Credit_Limit                                                                                                                       <dbl> …
## $ Total_Revolving_Bal                                                                                                                <dbl> …
## $ Avg_Open_To_Buy                                                                                                                    <dbl> …
## $ Total_Amt_Chng_Q4_Q1                                                                                                               <dbl> …
## $ Total_Trans_Amt                                                                                                                    <dbl> …
## $ Total_Trans_Ct                                                                                                                     <dbl> …
## $ Total_Ct_Chng_Q4_Q1                                                                                                                <dbl> …
## $ Avg_Utilization_Ratio                                                                                                              <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 <dbl> …
## $ Category                                                                                                                           <ord> …

Visualize the effect of education level on average utilization ratio

ggplot(bank_churners_2, aes(x = Education_Level, y = Avg_Utilization_Ratio, fill = Education_Level)) +
  geom_boxplot(show.legend = FALSE) +
  labs(
    title = "Effect of Education Level on Average Utilization Ratio",
    x = "Education Level",
    y = "Average Utilization Ratio"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

b) Use the gss_cat data set

What are the levels of marital variable?

Answer:

levels(gss_cat$marital)

## [1] "No answer"     "Never married" "Separated"     "Divorced"     
## [5] "Widowed"       "Married"

Combine “Separated”, “Divorced”, “Widowed” into a new category “Once Married”

gss_cat_2 <- gss_cat %>%
  mutate(marital = fct_collapse(marital, 
                               "Once Married" = c("Separated", "Divorced", "Widowed")
                                    ))

levels(gss_cat_2$marital)

## [1] "No answer"     "Never married" "Once Married"  "Married"

Use the new levels, explore whether there is an effect of martial status on tvhours.

summary_1 <- gss_cat_2 %>%
  group_by(marital) %>%
  summarise(
    avg_tvhours = mean(tvhours, na.rm = TRUE), 
    count = n()
  ) %>%
  arrange(desc(avg_tvhours))

ggplot(summary_1, aes(marital, avg_tvhours, color = marital)) +
  geom_point() +
  labs(
    title = "Effect of Marital Status on average TV hours",
    x = "Marital",
    y = "Average TV Hours"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Answer: Once married people have highest average tv hours. Married people have second lowest average tv hours. Never married stay between them. So there is an effect of martial status on tv hours.

3. Date and Time - nycflights13 data set

How many timezones are there for all the destination airports (excluding NA)?

num_timezones <- flights %>%
  distinct(dest) %>%
  inner_join(airports, by = c("dest" = "faa")) %>%
  summarise(timezone_counts = n_distinct(tz))

num_timezones

## # A tibble: 1 × 1
##   timezone_counts
##             <int>
## 1               6

Answer: There are 6 timezones for all destination airports.

Use the data in flights and airports only along with R code to show the time difference (in hours) between New York City and the following cities:
- Chicago
- Dallas
- Denver
- Seattle
- Anchorage
- Honolulu

city_df <- data.frame(
  city = c("Chicago", "Dallas", "Denver", "Seattle", "Anchorage", "Honolulu"),
  faa = c("ORD", "DFW", "DEN", "SEA", "ANC", "HNL")
)

nyc <- airports %>%
  filter(faa == "JFK")

city_tz_diff <- city_df %>%
  left_join(airports, by = "faa") %>%
  mutate(
    nyc_tz = nyc$tz,
    time_diff_hours = nyc_tz - tz
  ) %>%
  select(city, faa, dest_tz = tz, nyc_tz, time_diff_hours)

city_tz_diff

##        city faa dest_tz nyc_tz time_diff_hours
## 1   Chicago ORD      -6     -5               1
## 2    Dallas DFW      -6     -5               1
## 3    Denver DEN      -7     -5               2
## 4   Seattle SEA      -8     -5               3
## 5 Anchorage ANC      -9     -5               4
## 6  Honolulu HNL     -10     -5               5

Write a function Time_difference_NYC(dest) to compute the time difference (in hours) between any destination airport (in faa code) and New York City.

Time_difference_NYC <- function(dest) {
  nyc_tz <- -5
  
  dest <- airports %>%
    filter(faa == dest)
  
  if(length(dest) == 0) {
    return("Airport code not found in dataset.")
  }
  
  time_diff <- nyc_tz - dest$tz
  
  return(time_diff)
}

Time_difference_NYC("ORD")

## [1] 1

Write a function flight_time(dep_time, arr_time, origin, dest) to compute the actual flight time from the dep_time at the origin airport to the arr_time in the dest airport. dep_time and arr_time should be in the form of an integer such as 830 (8:30am); origin and dest should be in faa codes.

flight_time <- function(dep_time, arr_time, origin, dest) {
  origin_tz <- airports %>%
    filter(faa == origin) %>%
    pull(tz)
  dest_tz <- airports %>%
    filter(faa == dest) %>%
    pull(tz)
  
  time_fixer <- function(time) {
    hours <- time %/% 100
    mins <- time %% 100
    return(hours*60 + mins)
  }
  
  time_diff_min <- (dest_tz - origin_tz)*60
  
  adjusted_dep_time <- dep_time + time_diff_min
  
  flight_time_raw <- time_fixer(arr_time) - time_fixer(adjusted_dep_time)
  
  flight_time_final <- flight_time_raw %/% 100 + (flight_time_raw %% 100)/60
  
  return(paste0("flight time (hour):", flight_time_final))
}

flight_time(830, 1120, "JFK", "ORD")

## [1] "flight time (hour):2.5"

HW#4_VariousDataType

Yuhe Luo

2026-04-17

1.Regular expressions

a) Use the words data set, find all the words that match each of the following patterns:

b) Use the sentences data set, make the following plot

c) Applicarion

Factors

a) Use the BankChurners.csv to answer the following questions:

b) Use the gss_cat data set

3. Date and Time - nycflights13 data set