Hint: “NA” is an actual word. It means “no” or “not”.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.6
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Read the word list. `na = character()` supplies an empty set of
# missing-value markers, so the literal word "NA" is kept as text.
text_mania <- read_tsv(
  "https://dcgerard.github.io/stat_412_612/data/words.txt",
  na = character()
)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## word = col_character()
## )
# Look at ten randomly drawn rows of the word list.
sample_n(text_mania, size = 10)

# Keep the words in which the regex "(AEI).?" finds a match.
filter(text_mania, str_detect(word, "(AEI).?"))
_ test for tutoring student
# Tally how many words there are of each length.
count(text_mania, wordsLen = str_length(word))

# Check: list any rows whose word is a missing value.
filter(text_mania, is.na(word))
Ans: 885 units
# Count the words that start or end with "X".
# str_detect() returns the logical vector that filter() expects, instead of
# relying on str_count()'s integer counts being coerced by `|`; the single
# alternation "^X|X$" covers both the leading and the trailing case.
text_mania %>%
  filter(str_detect(word, "^X|X$")) %>%
  nrow()
## [1] 885
Ans: 3476 units
# Words containing every one of the vowels A, E, I, O and U.
# Separate conditions passed to filter() are combined with a logical AND.
text_vowels <- text_mania %>%
  filter(
    str_detect(word, "A"),
    str_detect(word, "E"),
    str_detect(word, "I"),
    str_detect(word, "O"),
    str_detect(word, "U")
  )
nrow(text_vowels)
## [1] 3476
Ans: DOULEIA, EULOGIA, MIAOUED, MOINEAU and SEQUOIA
# Shortest words that contain all five vowels.
# The minimum length is computed from the data rather than hard-coded (it is
# seven for this word list), so the chunk stays correct if the list changes.
# Every surviving row has the same length, so no sorting step is needed.
text_vowels %>%
  mutate(count = str_length(word)) %>%
  filter(count == min(count, na.rm = TRUE))
Ans: 21287 units
# Swap the first and last letter of every word and keep the words whose
# swapped form is itself present in the word list.
still_words <- text_mania %>%
  mutate(
    # Groups: 1 = first letter, 2 = middle, 3 = last letter; emit them as 3-2-1.
    convert_word = str_replace_all(word, "^([A-Z])(.*)([A-Z])$", "\\3\\2\\1"),
    is_word = convert_word %in% word
  ) %>%
  filter(is_word)
head(still_words)
count(still_words)
Ans: 1696 units
# Among the swappable words, keep those whose first and last letters differ.
# Either the "word" or the "convert_word" column could be used for this test.
still_words_firstlast_different <- still_words %>%
  mutate(same_first_last = str_sub(word, -1, -1) == str_sub(word, 1, 1)) %>%
  filter(!same_first_last)
count(still_words_firstlast_different)

# Show the six longest of those words.
head(
  arrange(
    mutate(still_words_firstlast_different, length = str_length(word)),
    desc(length)
  ),
  6
)
# Load the bank ratings table and preview ten random rows.
US_commercial_banks <- read.csv("./data/fed_large_c_bank_ratings.csv")
sample_n(US_commercial_banks, size = 10)
# Split `name` at "/" into the primary name and an alternate name.
US_commercial_banks_V02 <- US_commercial_banks %>%
  separate(name, into = c("name", "alternate_name"), sep = "/")
## Warning: Expected 2 pieces. Additional pieces discarded in 1 rows [15].
head(US_commercial_banks_V02)
# recheck if they have additional names
# US_commercial_banks_V02 %>%
# mutate(add_name = str_detect(alternate_name, "/")) %>%
# filter(add_name == TRUE)
Ans: 2
# Number of bank names that begin with a digit.
US_commercial_banks_V02 %>%
  pull(name) %>%
  str_detect("^\\d") %>%
  sum(na.rm = TRUE)
## [1] 2
Ans: 41
# Number of bank names that contain the text "BANK".
US_commercial_banks_V02 %>%
  pull(name) %>%
  str_detect("BANK") %>%
  sum(na.rm = TRUE)
## [1] 41
# Expand the abbreviation "BK" to "BANK", then label where "BANK" sits in each
# name. The conditions are tried in order: first, last, middle, otherwise none.
US_commercial_banks_V03 <- US_commercial_banks_V02 %>%
  mutate(
    name = str_replace_all(name, "BK", "BANK"),
    position_banks = case_when(
      is.na(name) ~ NA_character_, # a missing name stays missing
      str_detect(name, "^BANK ") ~ "first",
      str_detect(name, " BANK$") ~ "last",
      str_detect(name, " BANK ") ~ "middle",
      TRUE ~ "none"
    )
  )

# Share of all banks that falls into each position category.
US_commercial_banks_V03 %>%
  group_by(position_banks) %>%
  summarise(proportions = n() / nrow(US_commercial_banks_V02))
# traditional method: names whose first word is "BANK"
first_banks <- US_commercial_banks_V02 %>%
  mutate(name = str_replace_all(name, "BK", "BANK")) %>%
  filter(str_detect(name, "^BANK "))
nrow(first_banks) # 21 have “BANK” as the first word
## [1] 21
# Expressed as a percentage of all banks.
prop_first <- nrow(first_banks) / nrow(US_commercial_banks_V02) * 100
prop_first
## [1] 5.6
# traditional method: names whose last word is "BANK"
last_banks <- US_commercial_banks_V02 %>%
  mutate(name = str_replace_all(name, "BK", "BANK")) %>%
  filter(str_detect(name, " BANK$"))
nrow(last_banks) # 249 have “BANK” as the last word
## [1] 249
# Expressed as a percentage of all banks.
prop_last <- nrow(last_banks) / nrow(US_commercial_banks_V02) * 100
prop_last
## [1] 66.4
# traditional method: names with "BANK" as an interior word
middle_banks <- US_commercial_banks_V02 %>%
  mutate(name = str_replace_all(name, "BK", "BANK")) %>%
  filter(str_detect(name, " BANK "))
nrow(middle_banks) # 36 have “BANK” neither in the first nor in the last word
## [1] 36
# Expressed as a percentage of all banks.
prop_middle <- nrow(middle_banks) / nrow(US_commercial_banks_V02) * 100
prop_middle
## [1] 9.6
# traditional method: whatever is left after removing the first / last /
# middle groups. No `by` is given, so each anti_join() matches on all shared
# columns and reports them in the messages below.
no_banks <- US_commercial_banks_V02 %>%
  mutate(name = str_replace_all(name, "BK", "BANK")) %>%
  anti_join(first_banks) %>%
  anti_join(last_banks) %>%
  anti_join(middle_banks)
## Joining, by = c("name", "alternate_name", "rank", "charter", "consolidated_assets")
## Joining, by = c("name", "alternate_name", "rank", "charter", "consolidated_assets")
## Joining, by = c("name", "alternate_name", "rank", "charter", "consolidated_assets")
nrow(no_banks) # 69 have no bank words
## [1] 69
# Expressed as a percentage of all banks.
prop_no <- nrow(no_banks) / nrow(US_commercial_banks_V02) * 100
prop_no
## [1] 18.4
Ans: The position of the word “BANK” in a bank’s name does not appear to have a noticeable effect on its total assets.
# Box plots of consolidated assets (log10 y-axis) for each position category.
ggplot(
  US_commercial_banks_V03,
  aes(x = position_banks, y = consolidated_assets, fill = position_banks)
) +
  geom_boxplot() +
  scale_y_log10() +
  theme_bw() +
  labs(title = "The relevance between bank name and assets")
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
# Note:
# 1e+03 = 1 * 10^3 = 1000
# 1e+04 = 1 * 10^4 = 10000
# 1e+05 = 1 * 10^5 = 100000
# 1e-03 = 1 * 10^(-3) = 0.001
– I used the following functions: str_replace(), str_locate(), rank(), c(), str_match(), as.numeric().
– Thinking in reverse order, you need to create a regex pattern, based on the order of the terms in the input format, that you can use to parse the input string into a vector of year, month, and day.
– Start by using regex to convert the input format (#Y, #m, #d) into a new regex string
* As an example, use regex to replace “#Y” with “([0-9]{4})”
– Create a vector with the order of the elements in the input format.
– Create a vector of the matches for the input string with the new parsing regex
– Use the vector you created for the order of the input format to return the elements matching year, month, and day.
– Make sure each element of the output vector is a number.
• You are not allowed to use any pre-built date parsers.
• You have to use regular expressions.
• Test out your parser on the following three inputs.
# Parse a date string into c(year, month, day) by position.
#
# Arguments:
#   string  - the date text, e.g. "05/29/2017".
#   pattern - the layout of `string`, written with the tokens "#Y" (four-digit
#             year), "#m" (two-digit month) and "#d" (two-digit day); any
#             other character stands for a separator.
#
# Returns a numeric vector ordered year, month, day. A component is NA when
# its token is absent from `pattern`, when `string` is not exactly as wide as
# the expanded pattern, or when the characters at the component's position are
# not all digits. Previously such misaligned input was sliced anyway and could
# yield plausible-looking wrong numbers.
my_d_parser <- function(string, pattern) {
  # Widen every token to the width of the field it stands for, so character
  # offsets in the template coincide with character offsets in `string`.
  template <- pattern %>%
    str_replace("#Y", "YYYY") %>%
    str_replace("#m", "mm") %>%
    str_replace("#d", "dd")

  # Positional slicing is only meaningful when both have the same width.
  aligned <- str_length(string) == str_length(template)

  extract_field <- function(placeholder) {
    span <- str_locate(template, placeholder)
    field <- str_sub(string, span[1], span[2])
    # Accept the slice only if it consists of digits; anything else (or a
    # missing token, which gives an NA slice) becomes NA.
    ok <- aligned & str_detect(field, "^[0-9]+$")
    as.numeric(ifelse(ok, field, NA_character_))
  }

  c(extract_field("YYYY"), extract_field("mm"), extract_field("dd"))
}
# Example 1: year, then day, then month.
pattern <- "#Y, #d, #m"
string <- "2021, 12, 02"
my_d_parser(string = string, pattern = pattern)
## [1] 2021 2 12

# Example 2: day, then year, then month.
pattern <- "#d-#Y,#m"
string <- "01-2020,05"
my_d_parser(string = string, pattern = pattern)
## [1] 2020 5 1

# Example 3: month, then day, then year.
pattern <- "#m/#d/#Y"
string <- "05/29/2017"
my_d_parser(string = string, pattern = pattern)
## [1] 2017 5 29
#another solution
# Parse a date string into c(year, month, day) with one capturing regex.
#
# Arguments:
#   string  - the date text, e.g. "05/29/2017".
#   pattern - the layout of `string`, written with the tokens "#Y" (four
#             digits), "#m" (two digits) and "#d" (two digits); every other
#             character is a literal separator.
#
# Returns a numeric vector ordered year, month, day; all NA when `string` does
# not match the layout from its first to its last character.
my_d_parser_anotherway <- function(string, pattern) {
  # Backslash-escape regex metacharacters in the separators first, so that
  # e.g. "." in the layout only matches a literal dot; then turn each token
  # into a capture group of the right width.
  parse_regex <- pattern %>%
    str_replace_all("([.^$\\\\|*+?{}\\[\\]()])", "\\\\\\1") %>%
    str_replace("#Y", "([0-9]{4})") %>% # {n}: exactly n digits
    str_replace("#m", "([0-9]{2})") %>%
    str_replace("#d", "([0-9]{2})")
  # Anchor at both ends so text around a date-like substring is rejected
  # instead of being skipped silently.
  parse_regex <- str_c("^", parse_regex, "$")

  # Where each token starts in the layout, listed as year, month, day. Ranking
  # the starts gives, for each component, the number of its capture group.
  token_start <- c(
    str_locate(pattern, "#Y")[1],
    str_locate(pattern, "#m")[1],
    str_locate(pattern, "#d")[1]
  )
  group_of <- rank(token_start)

  # Column 1 of str_match() is the whole match; the rest are the groups.
  captured <- as.numeric(str_match(string, parse_regex)[, -1])
  captured[group_of]
}
# Run the regex-based parser on the current `string` and `pattern` values.
my_d_parser_anotherway(string = string, pattern = pattern)
## [1] 2017 5 29