Answer:
four_letters <- str_subset(words, "^.{4}$")
head(four_letters)
## [1] "able" "also" "area" "away" "baby" "back"
Answer:
fourOrFive_letters <- str_subset(words, "^.{4,5}$")
head(fourOrFive_letters)
## [1] "able" "about" "admit" "after" "again" "agent"
Answer:
sec_letterST <- str_subset(words, "^.[st]")
head(sec_letterST)
## [1] "as" "ask" "associate" "assume" "at" "attend"
Answer:
special_pattern <- str_subset(words, "([a-z])([a-z])\\2")
head(special_pattern)
## [1] "accept" "account" "across" "add" "address" "affect"
Answer: lookaheads
contain_aeo <- str_subset(words, "(?=.*a)(?=.*e)(?=.*o)")
head(contain_aeo)
## [1] "absolute" "afternoon" "another" "appropriate" "associate"
## [6] "colleague"
Answer:
sentences_df <- tibble(
sentence = sentences
)
summary_the <- sentences_df %>%
mutate(if_the = str_detect(str_to_lower(sentence), "(the)"))
ggplot(summary_the) +
geom_bar(aes(if_the, fill = if_the)) +
labs(
title = "Frequency of the Word 'the' in Sentences",
x = "Have 'the' or not",
y = "Number of the sentences"
)
Answer:
summary_length <- sentences_df %>%
mutate(
words_length = str_count(str_to_lower(sentence), "[a-z]")/(str_count(sentence, "\\s")+1),
swords_count = str_count(str_to_lower(sentence), "(\\b[aeiou][a-z])*\\b")
)
ggplot(summary_length) +
geom_point(aes(words_length, swords_count)) +
labs(
title = "Scatterplot of average length of words and words start with aeiou count",
x = "Words that started with a,e,i,o,u",
y = "Average length of words"
)
Download the Oxford English Dictionary as a “.txt” file from https://drive.google.com/file/d/1r0MrJDUGVv_Xh1I1cZ4ldMJedYOsiIAH/view?usp=sharing
Read it into RStudio with read_lines() function (check how to use it by yourself)
Answer:
Oxford_dictionary <- read_lines("/Users/yuhe/Downloads/Oxford_English_Dictionary.txt")
Answer:
Oxford_dictionary_df <- tibble(content = Oxford_dictionary) %>%
filter(str_detect(content, "\\S"))
glimpse(Oxford_dictionary_df)
## Rows: 36,692
## Columns: 1
## $ content <chr> "A ", "A- prefix (also an- before a vowel sound) not, without…
Answer:
Oxford_dictionary_df<- Oxford_dictionary_df %>%
mutate(words = str_extract(content, "^[A-Za-z\\-]+")) %>%
filter(!is.na(words))
glimpse(Oxford_dictionary_df)
## Rows: 36,661
## Columns: 2
## $ content <chr> "A ", "A- prefix (also an- before a vowel sound) not, without…
## $ words <chr> "A", "A-", "Aa", "Aardvark", "Ab-", "Aback", "Abacus", "Abaft"…
Answer:
matchwords <- str_subset(Oxford_dictionary_df$words, "^(?=.*a)(?=.*e)(?=.*i)(?=.*o)(?=.*u)(?=.*y)")
head(matchwords)
## [1] "Byelorussian" "Fully-fashioned" "Praseodymium" "Revolutionary"
Answer: The “Attrition_Flag”, “Gender”, “Marital_Status” can be regarded as factors.
Answer: The “Education_Level”, “Income_Category”, “Card_Category” can be regarded ass ordered factors.
bank_churners <- read_csv("/Users/yuhe/Downloads/BankChurners 2.csv")
## Rows: 10127 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Attrition_Flag, Gender, Education_Level, Marital_Status, Income_Ca...
## dbl (17): CLIENTNUM, Customer_Age, Dependent_count, Months_on_book, Total_Re...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bank_churners_2 <- bank_churners %>%
mutate(
Attrition_Flag = as.factor(Attrition_Flag),
Gender = as.factor(Gender),
Marital_Status = as.factor(Marital_Status),
Education_Level = factor(Education_Level,
levels = c("Unknown", "Uneducated", "High School", "College", "Graduate", "Post-Graduate", "Doctorate"),
ordered = TRUE),
Category = factor(Income_Category,
levels = c("Unknown", "Less than $40K", "$40K - $60K", "$60K - $80K", "$80K - $120K", "$120K +"),
ordered = TRUE),
Card_Category = factor(Card_Category,
levels = c("Blue", "Silver", "Gold", "Platinum"),
ordered = TRUE)
)
glimpse(bank_churners_2)
## Rows: 10,127
## Columns: 24
## $ CLIENTNUM <dbl> …
## $ Attrition_Flag <fct> …
## $ Customer_Age <dbl> …
## $ Gender <fct> …
## $ Dependent_count <dbl> …
## $ Education_Level <ord> …
## $ Marital_Status <fct> …
## $ Income_Category <chr> …
## $ Card_Category <ord> …
## $ Months_on_book <dbl> …
## $ Total_Relationship_Count <dbl> …
## $ Months_Inactive_12_mon <dbl> …
## $ Contacts_Count_12_mon <dbl> …
## $ Credit_Limit <dbl> …
## $ Total_Revolving_Bal <dbl> …
## $ Avg_Open_To_Buy <dbl> …
## $ Total_Amt_Chng_Q4_Q1 <dbl> …
## $ Total_Trans_Amt <dbl> …
## $ Total_Trans_Ct <dbl> …
## $ Total_Ct_Chng_Q4_Q1 <dbl> …
## $ Avg_Utilization_Ratio <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 <dbl> …
## $ Category <ord> …
ggplot(bank_churners_2, aes(x = Education_Level, y = Avg_Utilization_Ratio, fill = Education_Level)) +
geom_boxplot(show.legend = FALSE) +
labs(
title = "Effect of Education Level on Average Utilization Ratio",
x = "Education Level",
y = "Average Utilization Ratio"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Answer:
levels(gss_cat$marital)
## [1] "No answer" "Never married" "Separated" "Divorced"
## [5] "Widowed" "Married"
gss_cat_2 <- gss_cat %>%
mutate(marital = fct_collapse(marital,
"Once Married" = c("Separated", "Divorced", "Widowed")
))
levels(gss_cat_2$marital)
## [1] "No answer" "Never married" "Once Married" "Married"
summary_1 <- gss_cat_2 %>%
group_by(marital) %>%
summarise(
avg_tvhours = mean(tvhours, na.rm = TRUE),
count = n()
) %>%
arrange(desc(avg_tvhours))
ggplot(summary_1, aes(marital, avg_tvhours, color = marital)) +
geom_point() +
labs(
title = "Effect of Marital Status on average TV hours",
x = "Marital",
y = "Average TV Hours"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Answer: Once married people have highest average tv hours. Married people have second lowest average tv hours. Never married stay between them. So there is an effect of martial status on tv hours.
num_timezones <- flights %>%
distinct(dest) %>%
inner_join(airports, by = c("dest" = "faa")) %>%
summarise(timezone_counts = n_distinct(tz))
num_timezones
## # A tibble: 1 × 1
## timezone_counts
## <int>
## 1 6
Answer: There are 6 timezones for all destination airports.
Use the data in flights and airports only along with R code to show the time difference (in hours) between New York City and the following cities:
Chicago
Dallas
Denver
Seattle
Anchorage
Honolulu
city_df <- data.frame(
city = c("Chicago", "Dallas", "Denver", "Seattle", "Anchorage", "Honolulu"),
faa = c("ORD", "DFW", "DEN", "SEA", "ANC", "HNL")
)
nyc <- airports %>%
filter(faa == "JFK")
city_tz_diff <- city_df %>%
left_join(airports, by = "faa") %>%
mutate(
nyc_tz = nyc$tz,
time_diff_hours = nyc_tz - tz
) %>%
select(city, faa, dest_tz = tz, nyc_tz, time_diff_hours)
city_tz_diff
## city faa dest_tz nyc_tz time_diff_hours
## 1 Chicago ORD -6 -5 1
## 2 Dallas DFW -6 -5 1
## 3 Denver DEN -7 -5 2
## 4 Seattle SEA -8 -5 3
## 5 Anchorage ANC -9 -5 4
## 6 Honolulu HNL -10 -5 5
Time_difference_NYC <- function(dest) {
nyc_tz <- -5
dest <- airports %>%
filter(faa == dest)
if(length(dest) == 0) {
return("Airport code not found in dataset.")
}
time_diff <- nyc_tz - dest$tz
return(time_diff)
}
Time_difference_NYC("ORD")
## [1] 1
flight_time <- function(dep_time, arr_time, origin, dest) {
origin_tz <- airports %>%
filter(faa == origin) %>%
pull(tz)
dest_tz <- airports %>%
filter(faa == dest) %>%
pull(tz)
time_fixer <- function(time) {
hours <- time %/% 100
mins <- time %% 100
return(hours*60 + mins)
}
time_diff_min <- (dest_tz - origin_tz)*60
adjusted_dep_time <- dep_time + time_diff_min
flight_time_raw <- time_fixer(arr_time) - time_fixer(adjusted_dep_time)
flight_time_final <- flight_time_raw %/% 100 + (flight_time_raw %% 100)/60
return(paste0("flight time (hour):", flight_time_final))
}
flight_time(830, 1120, "JFK", "ORD")
## [1] "flight time (hour):2.5"