library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the FiveThirtyEight "all ages" college-majors table directly from GitHub.
mjrs <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/all-ages.csv",
  sep = ","
)
# Preview the first six rows of the imported data frame.
head(mjrs)
## Major_code Major
## 1 1100 GENERAL AGRICULTURE
## 2 1101 AGRICULTURE PRODUCTION AND MANAGEMENT
## 3 1102 AGRICULTURAL ECONOMICS
## 4 1103 ANIMAL SCIENCES
## 5 1104 FOOD SCIENCE
## 6 1105 PLANT SCIENCE AND AGRONOMY
## Major_category Total Employed Employed_full_time_year_round
## 1 Agriculture & Natural Resources 128148 90245 74078
## 2 Agriculture & Natural Resources 95326 76865 64240
## 3 Agriculture & Natural Resources 33955 26321 22810
## 4 Agriculture & Natural Resources 103549 81177 64937
## 5 Agriculture & Natural Resources 24280 17281 12722
## 6 Agriculture & Natural Resources 79409 63043 51077
## Unemployed Unemployment_rate Median P25th P75th
## 1 2423 0.02614711 50000 34000 80000
## 2 2266 0.02863606 54000 36000 80000
## 3 821 0.03024832 63000 40000 98000
## 4 3619 0.04267890 46000 30000 72000
## 5 894 0.04918845 62000 38500 90000
## 6 2070 0.03179089 50000 35000 75000
# Highlight every major whose name contains DATA or STATISTICS and keep the
# resulting str_view() object so it can be displayed afterwards.
Data_And_Stat_Majors <- mjrs$Major |>
  str_view(pattern = "DATA|STATISTICS")
Data_And_Stat_Majors
## [20] │ COMPUTER PROGRAMMING AND <DATA> PROCESSING
## [93] │ <STATISTICS> AND DECISION SCIENCE
## [170] │ MANAGEMENT INFORMATION SYSTEMS AND <STATISTICS>
# Rebuild the exercise's input vector: keep the entries of stringr's `fruit`
# that match any of the alternatives listed in the pattern.
original_fruit <- str_subset(
  fruit,
  pattern = "bell pepper|bilberry|blackberry|blood orange|blueberry|cantaloupe|chili pepper|cloudberry|elderberry|lime|lychee|mulberry|olive|salal berry"
)
# Display the selected fruit.
original_fruit
## [1] "bell pepper" "bilberry" "blackberry" "blood orange" "blueberry"
## [6] "cantaloupe" "chili pepper" "cloudberry" "elderberry" "lime"
## [11] "lychee" "mulberry" "olive" "salal berry"
# Collapse the fruit vector into a single comma-separated string.
# Passing the separator through `collapse` places ", " only between elements,
# so no trailing separator has to be trimmed off afterwards.
new_fruit <- str_flatten(original_fruit, collapse = ", ")
# View the new data.
new_fruit
## [1] "bell pepper, bilberry, blackberry, blood orange, blueberry, cantaloupe, chili pepper, cloudberry, elderberry, lime, lychee, mulberry, olive, salal berry"
(.)\1\1 This matches any character that appears three times in a row. The (.) captures any single character, and each \1 is a backreference to that first captured value. For example, ddd and 888. This assumes the pattern is read as the regular expression itself; written as an R string it would be "(.)\\1\\1".
"(.)(.)\\2\\1" This matches any sequence of 4 characters in which the 1st and 4th characters are the same and the 2nd and 3rd characters are the same. For example, abba, 6226, and 5555.
(..)\1 This matches any pair of characters that is immediately repeated, i.e. the same 2 characters occur again, in the same order, directly after their first occurrence. For instance, 5656 and haha. This assumes the pattern is read as the regular expression itself; written as an R string it would be "(..)\\1".
"(.).\\1.\\1" This matches a sequence of 5 characters in which the 1st, 3rd, and 5th characters are all the same. The 2nd and 4th characters can have any value. Ex: pipop and 39303.
"(.)(.)(.).*\\3\\2\\1" This matches any 3 characters, followed by any number of any characters (including none), followed by the same 3 characters in reverse order. Ex: racecar, striperts, and 6789202876.
# Work on a copy of stringr's built-in `words` vector.
wdf <- stringr::words
Words that start and end with the same character:
# Show the words whose first and last characters are identical.
# The group after the first character is optional so that a one-letter word
# such as "a" also counts; "^(.).*\\1$" alone needs at least two characters.
wdf |>
  str_view("^(.)(.*\\1)?$")
## [1] │ <a>
## [36] │ <america>
## [49] │ <area>
## [209] │ <dad>
## [213] │ <dead>
## [223] │ <depend>
## [258] │ <educate>
## [266] │ <else>
## [268] │ <encourage>
## [270] │ <engine>
## [278] │ <europe>
## [283] │ <evidence>
## [285] │ <example>
## [287] │ <excuse>
## [288] │ <exercise>
## [291] │ <expense>
## [292] │ <experience>
## [296] │ <eye>
## [386] │ <health>
## [394] │ <high>
## ... and 17 more
Words that contain a repeated pair of letters:
# Capture any two adjacent characters, then require the same pair to occur
# again later in the word.
str_view(wdf, pattern = "(..).*\\1")
## [48] │ ap<propr>iate
## [152] │ <church>
## [181] │ c<ondition>
## [217] │ <decide>
## [275] │ <environmen>t
## [487] │ l<ondon>
## [598] │ pa<ragra>ph
## [603] │ p<articular>
## [617] │ <photograph>
## [638] │ p<repare>
## [641] │ p<ressure>
## [696] │ r<emem>ber
## [698] │ <repre>sent
## [699] │ <require>
## [739] │ <sense>
## [858] │ the<refore>
## [903] │ u<nderstand>
## [946] │ w<hethe>r
Words that contain one letter repeated in at least three places:
# Capture one character and require it to reappear at least twice more,
# with anything (or nothing) in between the occurrences.
str_view(wdf, pattern = "(.).*\\1.*\\1")
## [48] │ a<pprop>riate
## [62] │ <availa>ble
## [86] │ b<elieve>
## [90] │ b<etwee>n
## [119] │ bu<siness>
## [221] │ d<egree>
## [229] │ diff<erence>
## [233] │ di<scuss>
## [265] │ <eleve>n
## [275] │ e<nvironmen>t
## [283] │ <evidence>
## [288] │ <exercise>
## [291] │ <expense>
## [292] │ <experience>
## [423] │ <indivi>dual
## [598] │ p<aragra>ph
## [684] │ r<eceive>
## [696] │ r<emembe>r
## [698] │ r<eprese>nt
## [845] │ t<elephone>
## ... and 2 more