#knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
#1. Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors that contain either “DATA” or “STATISTICS”
# Location of the FiveThirtyEight college-majors list (raw CSV on GitHub).
raw_file <- 'https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv'

# Download the CSV and convert the resulting data frame to a tibble.
majors <- raw_file %>%
  read.csv() %>%
  as_tibble()

# Keep only the majors whose name contains "DATA" or "STATISTICS"; the
# leading/trailing `.*` make the match cover the whole major name.
majors_sub <- majors %>%
  mutate(matched_name = str_match(Major, '.*DATA.*|.*STATISTICS.*')) %>%
  filter(!is.na(matched_name)) %>%
  select(Major, matched_name)

majors_sub
## # A tibble: 3 × 2
## Major matched_name[,1]
## <chr> <chr>
## 1 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS MANAGEMENT INFORMATION SYSTEMS …
## 2 COMPUTER PROGRAMMING AND DATA PROCESSING COMPUTER PROGRAMMING AND DATA P…
## 3 STATISTICS AND DECISION SCIENCE STATISTICS AND DECISION SCIENCE
#2 Write code that transforms the data below:
[1] “bell pepper” “bilberry” “blackberry” “blood orange”
[5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”
[9] “elderberry” “lime” “lychee” “mulberry”
[13] “olive” “salal berry”
c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)
#2 Write code that transforms the data below:
# The four lines of console output, stored verbatim (index markers, quotes and
# trailing spaces included) so they can be parsed back into a vector.
str1 <- '[1] "bell pepper" "bilberry" "blackberry" "blood orange"'
str2 <- '[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry" '
str3 <- '[9] "elderberry" "lime" "lychee" "mulberry" '
str4 <- '[13] "olive" "salal berry"'
#Into a format like this: character vector
clean_str_vec <- function(str_ex) {
  # Pull the double-quoted items out of the text of a printed character vector
  # (e.g. '[1] "bell pepper" "bilberry"') and return them as one vector.
  #
  # str_ex:  character vector holding one or more lines of printed output.
  # Returns: character vector with one element per quoted item, in order of
  #          appearance; character(0) when no quoted item is present.
  #
  # Items are captured exactly as they appear between their quotes instead of
  # deleting every character outside a-z, so capital letters, digits and
  # punctuation inside an item are preserved and the "[n]" index markers are
  # ignored without any special handling.

  # 1) Locate every "..." run (a quote, any non-quote characters, a quote).
  quoted_item <- '"[^"]*"'
  hits <- gregexpr(quoted_item, str_ex)

  # 2) Extract the runs and flatten the per-line list into one atomic vector.
  tokens <- unlist(regmatches(str_ex, hits))

  # 3) Drop the surrounding quote characters.
  gsub('"', "", tokens, fixed = TRUE)
}
# Clean each printed line, pool the extracted items into one vector, and sort
# the result alphabetically.
new_format_vec <- list(str1, str2, str3, str4) %>%
  lapply(clean_str_vec) %>%
  unlist() %>%
  str_sort()
new_format_vec
## [1] "bell pepper" "bilberry" "blackberry" "blood orange" "blueberry"
## [6] "cantaloupe" "chili pepper" "cloudberry" "elderberry" "lime"
## [11] "lychee" "mulberry" "olive" "salal berry"
I added a more detailed description of each step the function takes to clean and prepare the string for this transformation. It will also serve as a helpful reference for reorienting myself with the flow of each part of the syntax after some time away from the code.
#3 Describe, in words, what these expressions will match:
(.)\1\1 A regular expression (not written as an R string) that matches the same character three times in a row, e.g. "aaa".
“(.)(.)\\2\\1” # Matches two characters immediately followed by the same two characters in reverse order, e.g. "abba".
(..)\1 A regular expression (not written as an R string) that matches a pair of characters immediately repeated in the same order, e.g. "yxyx".
“(.).\\1.\\1” # Matches a five-character sequence in which the first character recurs in the 3rd and 5th positions; the 2nd and 4th positions may hold any character except a newline, e.g. "abaza".
“(.)(.)(.).*\\3\\2\\1” # Matches three characters, followed by zero or more characters, followed by the same three characters in reverse order, e.g. "abccba".
# Sample inputs for the back-reference patterns: two strings per pattern,
# listed in the same order as the patterns are tested.
test_vec <- c(
  'aaa', 'bbbght',
  'abba', '1221',
  'yxyx', 'zrzrjlk',
  '13171', 'abaza',
  '123djakfl;jdafkjad321', 'abccba'
)
# Pattern 1: one character followed by two more copies of itself.
str_match(string = test_vec, pattern = "(.)\\1\\1")
## [,1] [,2]
## [1,] "aaa" "a"
## [2,] "bbb" "b"
## [3,] NA NA
## [4,] NA NA
## [5,] NA NA
## [6,] NA NA
## [7,] NA NA
## [8,] NA NA
## [9,] NA NA
## [10,] NA NA
# Pattern 2: two characters followed by the same two in reverse order.
str_match(string = test_vec, pattern = "(.)(.)\\2\\1")
## [,1] [,2] [,3]
## [1,] NA NA NA
## [2,] NA NA NA
## [3,] "abba" "a" "b"
## [4,] "1221" "1" "2"
## [5,] NA NA NA
## [6,] NA NA NA
## [7,] NA NA NA
## [8,] NA NA NA
## [9,] NA NA NA
## [10,] "bccb" "b" "c"
# Pattern 3: a pair of characters immediately repeated in the same order.
str_match(string = test_vec, pattern = "(..)\\1")
## [,1] [,2]
## [1,] NA NA
## [2,] NA NA
## [3,] NA NA
## [4,] NA NA
## [5,] "yxyx" "yx"
## [6,] "zrzr" "zr"
## [7,] NA NA
## [8,] NA NA
## [9,] NA NA
## [10,] NA NA
# Pattern 4: the first character recurs in the 3rd and 5th positions, with
# any single character in the 2nd and 4th.
str_match(string = test_vec, pattern = "(.).\\1.\\1")
## [,1] [,2]
## [1,] NA NA
## [2,] NA NA
## [3,] NA NA
## [4,] NA NA
## [5,] NA NA
## [6,] NA NA
## [7,] "13171" "1"
## [8,] "abaza" "a"
## [9,] NA NA
## [10,] NA NA
# Pattern 5: three characters, then anything, then the same three characters
# in reverse order.
str_match(string = test_vec, pattern = "(.)(.)(.).*\\3\\2\\1")
## [,1] [,2] [,3] [,4]
## [1,] NA NA NA NA
## [2,] NA NA NA NA
## [3,] NA NA NA NA
## [4,] NA NA NA NA
## [5,] NA NA NA NA
## [6,] NA NA NA NA
## [7,] NA NA NA NA
## [8,] NA NA NA NA
## [9,] "123djakfl;jdafkjad321" "1" "2" "3"
## [10,] "abccba" "a" "b" "c"
The test vector contains two example strings for each pattern to verify that each regex behaves as described. The expressions that were not given in string form were also rewritten as R strings (with escaped backslashes) so that R can evaluate them.
#4 Construct regular expressions to match words that:
# Sample words used to exercise the three regular expressions below.
reg_test <- c(
  'abracadabra',
  'racecar',
  'noon ',
  'zoo',
  'churches',
  'mississippi',
  'missuszi',
  'eleven',
  'baaaseball'
)
#1. Start and end with the same character.
# The tail `.*\\1` is optional so that a one-character word (e.g. "a") also
# matches: it trivially starts and ends with the same character, but a pattern
# that always demands a second copy of the first character can never match it.
# The optional group is non-capturing, so the result keeps the same two columns
# (full match, first character).
str_match(reg_test, '^(.)(?:.*\\1)?$')
## [,1] [,2]
## [1,] "abracadabra" "a"
## [2,] "racecar" "r"
## [3,] NA NA
## [4,] NA NA
## [5,] NA NA
## [6,] NA NA
## [7,] NA NA
## [8,] NA NA
## [9,] NA NA
#2 Contain a repeated pair of letters (e.g. "church" contains "ch" repeated twice.)
# Capture any two characters, then require the same pair again later on.
str_match(string = reg_test, pattern = '(..).*\\1')
## [,1] [,2]
## [1,] "abracadab" "ab"
## [2,] NA NA
## [3,] NA NA
## [4,] NA NA
## [5,] "church" "ch"
## [6,] "issis" "is"
## [7,] NA NA
## [8,] NA NA
## [9,] "baaaseba" "ba"
#3 Contain one letter repeated in at least three places (e.g. "eleven" contains three "e"s.)
# Capture one character, then require it twice more with anything in between.
str_match(string = reg_test, pattern = '(.).*\\1.*\\1')
## [,1] [,2]
## [1,] "abracadabra" "a"
## [2,] NA NA
## [3,] NA NA
## [4,] NA NA
## [5,] NA NA
## [6,] "ississippi" "i"
## [7,] "ssus" "s"
## [8,] "eleve" "e"
## [9,] "aaaseba" "a"
The test vector includes a few different example words that satisfy each of the conditions specified in the homework.