1.Regular expressions

a) Use the words data set, find all the words that match each of the following patterns:

  • are exactly four letters long

Answer:

four_letters <- str_subset(words, "^.{4}$")

head(four_letters)
## [1] "able" "also" "area" "away" "baby" "back"
  • are either four or five letters long

Answer:

fourOrFive_letters <- str_subset(words, "^.{4,5}$")

head(fourOrFive_letters)
## [1] "able"  "about" "admit" "after" "again" "agent"
  • the second letter is “s” or “t”

Answer:

sec_letterST <- str_subset(words, "^.[st]")

head(sec_letterST)
## [1] "as"        "ask"       "associate" "assume"    "at"        "attend"
  • contains the pattern like “oxx” where “o” is one letter and “x” is another letter

Answer:

special_pattern <- str_subset(words, "([a-z])([a-z])\\2")

head(special_pattern)
## [1] "accept"  "account" "across"  "add"     "address" "affect"
  • contains “a”, “e” and “o” at the same time

Answer: lookaheads

contain_aeo <- str_subset(words, "(?=.*a)(?=.*e)(?=.*o)")

head(contain_aeo)
## [1] "absolute"    "afternoon"   "another"     "appropriate" "associate"  
## [6] "colleague"

b) Use the sentences data set, make the following plot

  • a bar plot counting sentences with and without “the” (or “The”).

Answer:

sentences_df <- tibble(
  sentence = sentences
)

summary_the <- sentences_df %>%
  mutate(if_the = str_detect(str_to_lower(sentence), "(the)"))

ggplot(summary_the) +
  geom_bar(aes(if_the, fill = if_the)) + 
  labs(
    title = "Frequency of the Word 'the' in Sentences",
    x = "Have 'the' or not",
    y = "Number of the sentences"
  )

  • a scatterplot with 𝑥 being the average length of words in a sentence, and 𝑦 being the number of words starting with “a” or “e” or “i” or “o” or “u” in the sentence.

Answer:

summary_length <- sentences_df %>%
  mutate(
    words_length = str_count(str_to_lower(sentence), "[a-z]")/(str_count(sentence, "\\s")+1),
    swords_count = str_count(str_to_lower(sentence), "(\\b[aeiou][a-z])*\\b")
  )

ggplot(summary_length) +
  geom_point(aes(words_length, swords_count)) +
  labs(
    title = "Scatterplot of average length of words and words start with aeiou count",
    x = "Words that started with a,e,i,o,u",
    y = "Average length of words"
  )

c) Applicarion

  1. Download the Oxford English Dictionary as a “.txt” file from https://drive.google.com/file/d/1r0MrJDUGVv_Xh1I1cZ4ldMJedYOsiIAH/view?usp=sharing

  2. Read it into RStudio with read_lines() function (check how to use it by yourself)

Answer:

Oxford_dictionary <- read_lines("/Users/yuhe/Downloads/Oxford_English_Dictionary.txt")
  1. Turn the dictionary into a tibble and remove all blank lines

Answer:

Oxford_dictionary_df <- tibble(content = Oxford_dictionary) %>%
  filter(str_detect(content, "\\S"))

glimpse(Oxford_dictionary_df)
## Rows: 36,692
## Columns: 1
## $ content <chr> "A ", "A-  prefix (also an- before a vowel sound) not, without…
  1. Use regular expression to extract all words for each item in a separate column named “words”

Answer:

Oxford_dictionary_df<- Oxford_dictionary_df %>%
  mutate(words = str_extract(content, "^[A-Za-z\\-]+")) %>%
  filter(!is.na(words))

glimpse(Oxford_dictionary_df)
## Rows: 36,661
## Columns: 2
## $ content <chr> "A ", "A-  prefix (also an- before a vowel sound) not, without…
## $ words   <chr> "A", "A-", "Aa", "Aardvark", "Ab-", "Aback", "Abacus", "Abaft"…
  1. Find all words in the dictionary that contain “a”, “e”, “i”, “o”, “u” and “y” at the same time.

Answer:

matchwords <- str_subset(Oxford_dictionary_df$words, "^(?=.*a)(?=.*e)(?=.*i)(?=.*o)(?=.*u)(?=.*y)")

head(matchwords)
## [1] "Byelorussian"    "Fully-fashioned" "Praseodymium"    "Revolutionary"

Factors

a) Use the BankChurners.csv to answer the following questions:

  • Which features can be regarded as a factor?

Answer: The “Attrition_Flag”, “Gender”, “Marital_Status” can be regarded as factors.

  • Which features can be regarded as an ordered factor (ordinal)?

Answer: The “Education_Level”, “Income_Category”, “Card_Category” can be regarded ass ordered factors.

  • Read BankChurners.csv into RStudio, then change the columns that you answered above into factors or ordered factors.
bank_churners <- read_csv("/Users/yuhe/Downloads/BankChurners 2.csv")
## Rows: 10127 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): Attrition_Flag, Gender, Education_Level, Marital_Status, Income_Ca...
## dbl (17): CLIENTNUM, Customer_Age, Dependent_count, Months_on_book, Total_Re...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bank_churners_2 <- bank_churners %>%
  mutate(
    Attrition_Flag = as.factor(Attrition_Flag),
    Gender = as.factor(Gender),
    Marital_Status = as.factor(Marital_Status),
    
    Education_Level = factor(Education_Level,
                             levels = c("Unknown", "Uneducated", "High School", "College", "Graduate", "Post-Graduate", "Doctorate"), 
                             ordered = TRUE),
    Category = factor(Income_Category, 
                      levels = c("Unknown", "Less than $40K", "$40K - $60K", "$60K - $80K", "$80K - $120K", "$120K +"), 
                      ordered = TRUE),
    Card_Category = factor(Card_Category, 
                           levels = c("Blue", "Silver", "Gold", "Platinum"), 
                           ordered = TRUE)
  )

glimpse(bank_churners_2)
## Rows: 10,127
## Columns: 24
## $ CLIENTNUM                                                                                                                          <dbl> …
## $ Attrition_Flag                                                                                                                     <fct> …
## $ Customer_Age                                                                                                                       <dbl> …
## $ Gender                                                                                                                             <fct> …
## $ Dependent_count                                                                                                                    <dbl> …
## $ Education_Level                                                                                                                    <ord> …
## $ Marital_Status                                                                                                                     <fct> …
## $ Income_Category                                                                                                                    <chr> …
## $ Card_Category                                                                                                                      <ord> …
## $ Months_on_book                                                                                                                     <dbl> …
## $ Total_Relationship_Count                                                                                                           <dbl> …
## $ Months_Inactive_12_mon                                                                                                             <dbl> …
## $ Contacts_Count_12_mon                                                                                                              <dbl> …
## $ Credit_Limit                                                                                                                       <dbl> …
## $ Total_Revolving_Bal                                                                                                                <dbl> …
## $ Avg_Open_To_Buy                                                                                                                    <dbl> …
## $ Total_Amt_Chng_Q4_Q1                                                                                                               <dbl> …
## $ Total_Trans_Amt                                                                                                                    <dbl> …
## $ Total_Trans_Ct                                                                                                                     <dbl> …
## $ Total_Ct_Chng_Q4_Q1                                                                                                                <dbl> …
## $ Avg_Utilization_Ratio                                                                                                              <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 <dbl> …
## $ Category                                                                                                                           <ord> …
  • Visualize the effect of education level on average utilization ratio
ggplot(bank_churners_2, aes(x = Education_Level, y = Avg_Utilization_Ratio, fill = Education_Level)) +
  geom_boxplot(show.legend = FALSE) +
  labs(
    title = "Effect of Education Level on Average Utilization Ratio",
    x = "Education Level",
    y = "Average Utilization Ratio"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

b) Use the gss_cat data set

Answer:

levels(gss_cat$marital)
## [1] "No answer"     "Never married" "Separated"     "Divorced"     
## [5] "Widowed"       "Married"
gss_cat_2 <- gss_cat %>%
  mutate(marital = fct_collapse(marital, 
                               "Once Married" = c("Separated", "Divorced", "Widowed")
                                    ))

levels(gss_cat_2$marital)
## [1] "No answer"     "Never married" "Once Married"  "Married"
summary_1 <- gss_cat_2 %>%
  group_by(marital) %>%
  summarise(
    avg_tvhours = mean(tvhours, na.rm = TRUE), 
    count = n()
  ) %>%
  arrange(desc(avg_tvhours))

ggplot(summary_1, aes(marital, avg_tvhours, color = marital)) +
  geom_point() +
  labs(
    title = "Effect of Marital Status on average TV hours",
    x = "Marital",
    y = "Average TV Hours"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Answer: Once married people have highest average tv hours. Married people have second lowest average tv hours. Never married stay between them. So there is an effect of martial status on tv hours.

3. Date and Time - nycflights13 data set

num_timezones <- flights %>%
  distinct(dest) %>%
  inner_join(airports, by = c("dest" = "faa")) %>%
  summarise(timezone_counts = n_distinct(tz))

num_timezones
## # A tibble: 1 × 1
##   timezone_counts
##             <int>
## 1               6

Answer: There are 6 timezones for all destination airports.

city_df <- data.frame(
  city = c("Chicago", "Dallas", "Denver", "Seattle", "Anchorage", "Honolulu"),
  faa = c("ORD", "DFW", "DEN", "SEA", "ANC", "HNL")
)

nyc <- airports %>%
  filter(faa == "JFK")

city_tz_diff <- city_df %>%
  left_join(airports, by = "faa") %>%
  mutate(
    nyc_tz = nyc$tz,
    time_diff_hours = nyc_tz - tz
  ) %>%
  select(city, faa, dest_tz = tz, nyc_tz, time_diff_hours)

city_tz_diff
##        city faa dest_tz nyc_tz time_diff_hours
## 1   Chicago ORD      -6     -5               1
## 2    Dallas DFW      -6     -5               1
## 3    Denver DEN      -7     -5               2
## 4   Seattle SEA      -8     -5               3
## 5 Anchorage ANC      -9     -5               4
## 6  Honolulu HNL     -10     -5               5
Time_difference_NYC <- function(dest) {
  nyc_tz <- -5
  
  dest <- airports %>%
    filter(faa == dest)
  
  if(length(dest) == 0) {
    return("Airport code not found in dataset.")
  }
  
  time_diff <- nyc_tz - dest$tz
  
  return(time_diff)
}

Time_difference_NYC("ORD")
## [1] 1
flight_time <- function(dep_time, arr_time, origin, dest) {
  origin_tz <- airports %>%
    filter(faa == origin) %>%
    pull(tz)
  dest_tz <- airports %>%
    filter(faa == dest) %>%
    pull(tz)
  
  time_fixer <- function(time) {
    hours <- time %/% 100
    mins <- time %% 100
    return(hours*60 + mins)
  }
  
  time_diff_min <- (dest_tz - origin_tz)*60
  
  adjusted_dep_time <- dep_time + time_diff_min
  
  flight_time_raw <- time_fixer(arr_time) - time_fixer(adjusted_dep_time)
  
  flight_time_final <- flight_time_raw %/% 100 + (flight_time_raw %% 100)/60
  
  return(paste0("flight time (hour):", flight_time_final))
}

flight_time(830, 1120, "JFK", "ORD")
## [1] "flight time (hour):2.5"