library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(stringr)
library(htmlwidgets)
Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors that contain either “DATA” or “STATISTICS”
The readr package is a great way to load data from github
# readr is already attached by library(tidyverse) above; the explicit call is
# harmless and just makes the dependency of read_csv() visible.
library(readr)
# Read the majors list directly from the raw CSV hosted in the
# fivethirtyeight GitHub repository.
major_data <- read_csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv")
## Rows: 174 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): FOD1P, Major, Major_Category
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
retrieve_majors_with_x <- function(major_data, words) {
  # Return the entries of major_data$Major that contain at least one of `words`.
  #
  # major_data: object with a `Major` element holding the major names.
  # words:      character vector of search terms; each term is used as a
  #             regular expression, so metacharacters are not escaped.
  #
  # The terms are joined with "|" so a single regex alternation does the search.
  alternation <- str_c(words, collapse= "|")
  str_subset(major_data$Major, alternation)
}
# Exercise case: majors whose name contains "DATA" or "STATISTICS".
input_major_string <- c("DATA","STATISTICS")
final_value <- retrieve_majors_with_x(major_data, input_major_string)
# Print the matching majors.
final_value
## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "COMPUTER PROGRAMMING AND DATA PROCESSING"
## [3] "STATISTICS AND DECISION SCIENCE"
# Same helper with a longer word list: a major is returned when it contains
# at least one of the four words.
input_major_string <- c("BUSINESS","DATA","STATISTICS","PSYCHOLOGY")
final_value <- retrieve_majors_with_x(major_data, input_major_string)
# Print the matching majors.
final_value
## [1] "COGNITIVE SCIENCE AND BIOPSYCHOLOGY"
## [2] "GENERAL BUSINESS"
## [3] "BUSINESS MANAGEMENT AND ADMINISTRATION"
## [4] "BUSINESS ECONOMICS"
## [5] "INTERNATIONAL BUSINESS"
## [6] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [7] "MISCELLANEOUS BUSINESS & MEDICAL ADMINISTRATION"
## [8] "COMPUTER PROGRAMMING AND DATA PROCESSING"
## [9] "STATISTICS AND DECISION SCIENCE"
## [10] "PSYCHOLOGY"
## [11] "EDUCATIONAL PSYCHOLOGY"
## [12] "CLINICAL PSYCHOLOGY"
## [13] "COUNSELING PSYCHOLOGY"
## [14] "INDUSTRIAL AND ORGANIZATIONAL PSYCHOLOGY"
## [15] "SOCIAL PSYCHOLOGY"
## [16] "MISCELLANEOUS PSYCHOLOGY"
Write code that transforms ‘input_data’ (in the form of a .CSV file) into ‘expected_output’
# header = FALSE makes read.csv() treat the first line as data rather than as
# column names; the columns then get the default names V1, V2, ...
input_data <- read.csv("homework_2_q_2.csv", header = FALSE)
## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on 'homework_2_q_2.csv'
# Target result: the plain character vector the conversion functions below
# must reproduce from input_data.
expected_output <- c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")
# Print the raw rows as read from the CSV file.
input_data
My approach is to first concatenate all of the rows of the csv file into one string. Following this I will use regex to remove the [##] values. At this point, I can extract any text that is surrounded by quotation marks. Following this extraction, I remove the quotation marks, and the result will match the expected data.
format_conversion <- function(input_data) {
  # Convert a data frame holding the printed form of a character vector
  # (rows such as: [1] "bell pepper"  "bilberry") back into that vector.
  #
  # input_data: data frame whose first column contains the printed rows.
  # Returns:    character vector of the quoted items without their quotation
  #             marks; character(0) when there are no rows.

  # Nothing to parse. The previous 1:nrow() loop iterated over rows 1 and 0
  # in this case and produced NA instead of an empty result.
  if (nrow(input_data) == 0 || ncol(input_data) == 0) {
    return(character(0))
  }

  # Join all rows of the first column into a single string in one vectorised
  # call instead of growing the string row by row.
  concat <- str_c(as.character(input_data[[1]]), collapse = "")

  # Remove the "[n] " index markers, then extract every run of lowercase
  # letters/whitespace enclosed in quotation marks, e.g. '"apple sauce"'.
  extracted_values <- concat %>%
    str_replace_all("\\[\\d+\\]\\s", "") %>%
    str_extract_all("\"[a-z\\s]+\"")

  # str_extract_all() returns a list with one element per input string; there
  # is exactly one string here, so take that element and strip the quotes.
  str_replace_all(extracted_values[[1]], "\"", "")
}
Another, simpler function I made after some more thought:
convert_input <- function(input_data) {
  # Recover the item names from the printed vector stored in column V1.
  #
  # input_data: data frame with a V1 column holding the printed rows.
  # Returns:    character vector of the extracted names.

  # Fuse every row of V1 into one long string.
  joined <- str_c(input_data$V1, collapse="")
  # A hit is a run of letters, optionally one arbitrary character (the space
  # inside a two-word name), then another run of letters. Index markers and
  # quotation marks contain no letters, so they are skipped.
  hits <- str_extract_all(joined, '[A-Za-z]+.?[A-Za-z]+')
  # One input string gives a one-element list; flatten it to a plain vector.
  unlist(hits[1])
}
# Run both implementations on the same input.
output <- format_conversion(input_data)
output2 <- convert_input(input_data)
# identical() checks length, order and content in a single pass/fail result.
# `==` compares elementwise and recycles the shorter vector, so it cannot
# report a length mismatch as one clear answer.
identical(output, expected_output)
## [1] TRUE
identical(output2, expected_output)
## [1] TRUE
Without quotation marks, (.)\1\1 should be read as a regular expression. As such, it matches any single character that occurs three times in a row within a string.
But if you enter this into R as a quoted string, you will match something entirely different and unexpected. Inside an R string literal, \1 is an escape for the control character with code 1, so the pattern matches any character followed by two of those control characters:
# In an R string literal "\1" is an octal escape for the control character with
# code 1 (printed as \001 in the output), so the regex engine never sees a
# backreference: the pattern is "any character followed by two \001 characters".
example <- c("brrr\1\1r","hmmm\1","assass\2\2in")
str_extract(example,"(.)\1\1")
## [1] "r\001\001" NA NA
If you want to match the same character three times in a row, you need to write the regular expression as an R string by escaping the backslashes: “(.)\\1\\1”
The regex “(.)\\1\\1” matches substrings in which one character is repeated three times in a row.
# "\\1" in R source reaches the regex engine as \1, a backreference to group 1,
# so this finds any character repeated three times in a row.
# str_extract_all() returns every non-overlapping match for each string.
example <- c("brrrr","hmmmattt","assassin")
str_extract_all(example,"(.)\\1\\1")
## [[1]]
## [1] "rrr"
##
## [[2]]
## [1] "mmm" "ttt"
##
## [[3]]
## character(0)
This will match four-character palindrome substrings: two characters followed by the same two characters in reverse order.
# (.)(.) captures two characters and \\2\\1 requires them again in reverse
# order (an "abba" shape). str_extract() returns the first match per string,
# or NA when there is none.
example <- c("racecar","woowzers","rocks")
str_extract(example,"(.)(.)\\2\\1")
## [1] NA "woow" NA
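This is similar to the first one. Written as the R string “(..)\1”, it will match any two characters followed by the control character produced by the escape \1, unless you escape the backslash so that the regex engine sees a backreference.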
# With a single backslash, "\1" is the control character with code 1 (shown as
# \001 in the output), so this matches any two characters followed by that
# control character rather than a repeated pair.
example <- c("hello\1","h\1","bingo")
str_extract(example, "(..)\1")
## [1] "lo\001" NA NA
When using double backslashes, as in “(..)\\1”, it matches a four-character substring made of a two-character pair that is immediately repeated.
# (..) captures a two-character pair and \\1 requires the same pair to follow
# immediately; the first such position in each string is returned.
example <- c("banana","titilating","money")
str_extract(example, "(..)\\1")
## [1] "anan" "titi" NA
This will match a character X, followed by any single character, followed by X, followed by any single character, followed by X.
# The captured character must reappear as the 3rd and 5th character of the
# match; each `.` in between accepts any single character.
example <- c("T-T6T Freight Engine", "Monopoly","battleship")
str_extract(example, "(.).\\1.\\1")
## [1] "T-T6T" "onopo" NA
This will match a substring whose last three characters are the reverse of its first three characters, with any number of characters (possibly none) between the first three and the last three.
# Three captured characters, then .* (any run of characters, possibly empty),
# then the same three characters in reverse order.
example <- c("123thisisnotmypassword321","hello_mister_sim","dogs")
str_extract(example, "(.)(.)(.).*\\3\\2\\1")
## [1] "123thisisnotmypassword321" "mister_sim"
## [3] NA
# Unanchored pattern: finds the first substring that begins and ends with the
# same character, with anything (or nothing) in between. It is not limited to
# whole strings that start and end alike, as the third result shows.
pattern <- "(.).*\\1"
example <- c("nylon","velcro","Mr. Greenberg")
str_extract(example, pattern)
## [1] "nylon" NA "r. Greenber"
# (..) captures a pair of characters that must occur again later in the
# string; the leading and trailing .* make the match span the whole string.
pattern <- ".*(..).*\\1.*"
example <- c("church","bigbird","other")
str_extract(example, pattern)
## [1] "church" "bigbird" NA
# One captured character must occur at least three times in the string; the
# surrounding .* make the match span the whole string.
pattern <- ".*(.).*\\1.*\\1.*"
example <- c("eleven","bananana","mount sinai")
str_extract(example, pattern)
## [1] "eleven" "bananana" NA