data607_assignment3

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

#1. Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors that contain either “DATA” or “STATISTICS”

# Load the FiveThirtyEight graduate-student college-majors table straight
# from GitHub into a base data frame.
csv_data <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/grad-students.csv"
)

# Preview every column's name, type and leading values.
glimpse(csv_data)

## Rows: 173
## Columns: 22
## $ Major_code                   <int> 5601, 6004, 6211, 2201, 2001, 3201, 6206,…
## $ Major                        <chr> "CONSTRUCTION SERVICES", "COMMERCIAL ART …
## $ Major_category               <chr> "Industrial Arts & Consumer Services", "A…
## $ Grad_total                   <int> 9173, 53864, 24417, 5411, 9109, 1542, 190…
## $ Grad_sample_size             <int> 200, 882, 437, 72, 171, 22, 3738, 386, 98…
## $ Grad_employed                <int> 7098, 40492, 18368, 3590, 7512, 1008, 151…
## $ Grad_full_time_year_round    <int> 6511, 29553, 14784, 2701, 5622, 860, 1230…
## $ Grad_unemployed              <int> 681, 2482, 1465, 316, 466, 0, 8324, 473, …
## $ Grad_unemployment_rate       <dbl> 0.08754339, 0.05775585, 0.07386679, 0.080…
## $ Grad_median                  <dbl> 75000, 60000, 65000, 47000, 57000, 75000,…
## $ Grad_P25                     <int> 53000, 40000, 45000, 24500, 40600, 55000,…
## $ Grad_P75                     <dbl> 110000, 89000, 100000, 85000, 83700, 1200…
## $ Nongrad_total                <int> 86062, 461977, 179335, 37575, 53819, 8921…
## $ Nongrad_employed             <int> 73607, 347166, 145597, 29738, 43163, 6967…
## $ Nongrad_full_time_year_round <int> 62435, 250596, 113579, 23249, 34231, 6063…
## $ Nongrad_unemployed           <int> 3928, 25484, 7409, 1661, 3389, 518, 45519…
## $ Nongrad_unemployment_rate    <dbl> 0.05066099, 0.06838588, 0.04842294, 0.052…
## $ Nongrad_median               <dbl> 65000, 48000, 50000, 41600, 52000, 50000,…
## $ Nongrad_P25                  <int> 47000, 34000, 35000, 29000, 36000, 34000,…
## $ Nongrad_P75                  <dbl> 98000, 71000, 75000, 60000, 78000, 75000,…
## $ Grad_share                   <dbl> 0.09631963, 0.10441977, 0.11983686, 0.125…
## $ Grad_premium                 <dbl> 0.15384615, 0.25000000, 0.30000000, 0.129…

# Keep every major whose name contains "DATA" or "STATISTICS".
# The pattern spells out the full word asked for in the question, and the
# result is not passed through head(): truncating to six rows would silently
# drop matches if the majors list ever produced more than six hits.
string_match <- csv_data %>%
  filter(str_detect(Major, "DATA|STATISTICS"))

string_match

##   Major_code                                         Major
## 1       2101      COMPUTER PROGRAMMING AND DATA PROCESSING
## 2       6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS
## 3       3702               STATISTICS AND DECISION SCIENCE
##            Major_category Grad_total Grad_sample_size Grad_employed
## 1 Computers & Mathematics       5611               98          4716
## 2                Business      41970              963         36227
## 3 Computers & Mathematics      21973              429         16979
##   Grad_full_time_year_round Grad_unemployed Grad_unemployment_rate Grad_median
## 1                      3981             119             0.02461220       85000
## 2                     32121            1459             0.03871464       89000
## 3                     14113             751             0.04235759       92000
##   Grad_P25 Grad_P75 Nongrad_total Nongrad_employed Nongrad_full_time_year_round
## 1    56000   114000         28314            22024                        18381
## 2    64500   116000        150110           129179                       115177
## 3    64000   125000         22210            17024                        13665
##   Nongrad_unemployed Nongrad_unemployment_rate Nongrad_median Nongrad_P25
## 1               2222                0.09164398          60000       40000
## 2               5690                0.04218909          72000       50000
## 3                874                0.04883227          75000       45000
##   Nongrad_P75 Grad_share Grad_premium
## 1       85000  0.1653943    0.4166667
## 2      100000  0.2185027    0.2361111
## 3      106000  0.4973180    0.2266667

#2 Write code that transforms the data below:

[1] “bell pepper” “bilberry” “blackberry” “blood orange” [5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”
[9] “elderberry” “lime” “lychee” “mulberry” [13] “olive” “salal berry”

Into a format like this:

c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)

# Raw console printout of the fruit vector, kept verbatim (index markers,
# double quotes and column padding included).
str_fruits <- '[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"

[5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  

[9] "elderberry"   "lime"         "lychee"       "mulberry"    

[13] "olive"        "salal berry"'

# A fruit name is one or more runs of letters joined by single spaces.
# Spelling that out (instead of letting `.?` stand in for the space) avoids
# gluing two letter runs together across an arbitrary character, and also
# accepts one-letter or three-word names.
str_fruits_pattern <- str_extract_all(
  str_fruits,
  pattern = "[A-Za-z]+(?: [A-Za-z]+)*"
)

# str_extract_all() returns a list with one element per input string;
# flatten it to a plain character vector.
str_fruits_pattern <- unlist(str_fruits_pattern)

str_fruits_pattern

##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
##  [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
## [11] "lychee"       "mulberry"     "olive"        "salal berry"

# Render the character vector as the R source text that would recreate it:
#   c("a", "b", ...)
# The inner str_c() joins the names with `", "` so every name gets its closing
# and the next name its opening quote; the outer str_c() adds the `c("` prefix
# and `")` suffix. str_c() inserts no separator of its own, so no stray spaces
# have to be patched out afterwards.
newformat <- str_c(
  "c(\"",
  str_c(str_fruits_pattern, collapse = "\", \""),
  "\")"
)
cat(newformat)

## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")

#3 Describe, in words, what these expressions will match:

# Sample words used to try out each back-reference pattern below.
test_string <- c(
  "aaa", "bbb", "cat", "doll", "aba", "cda",
  "kind", "noon", "moon", "byby", "axaxa", "racecar"
)

(.)\1\1

Matches any single character that appears three times in a row (for example "aaa" or "bbb").

# One captured character followed by two more copies of itself.
test_string %>%
  str_match(pattern = "(.)\\1\\1")

##       [,1]  [,2]
##  [1,] "aaa" "a" 
##  [2,] "bbb" "b" 
##  [3,] NA    NA  
##  [4,] NA    NA  
##  [5,] NA    NA  
##  [6,] NA    NA  
##  [7,] NA    NA  
##  [8,] NA    NA  
##  [9,] NA    NA  
## [10,] NA    NA  
## [11,] NA    NA  
## [12,] NA    NA

(.)(.)\2\1

Matches two characters followed by the same two characters in reverse order, i.e. a four-character palindrome such as "noon" or "abba".

# Two captured characters, then the second one again, then the first one again.
test_string %>%
  str_match(pattern = "(.)(.)\\2\\1")

##       [,1]   [,2] [,3]
##  [1,] NA     NA   NA  
##  [2,] NA     NA   NA  
##  [3,] NA     NA   NA  
##  [4,] NA     NA   NA  
##  [5,] NA     NA   NA  
##  [6,] NA     NA   NA  
##  [7,] NA     NA   NA  
##  [8,] "noon" "n"  "o" 
##  [9,] NA     NA   NA  
## [10,] NA     NA   NA  
## [11,] NA     NA   NA  
## [12,] NA     NA   NA

(..)\1

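This is the regular expression itself, not its R string form (which would be "(..)\\1"). It matches any two characters that are immediately repeated in the same order, such as "byby" or the "axax" in "axaxa".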

# A captured pair of characters immediately followed by the same pair.
test_string %>%
  str_match(pattern = "(..)\\1")

##       [,1]   [,2]
##  [1,] NA     NA  
##  [2,] NA     NA  
##  [3,] NA     NA  
##  [4,] NA     NA  
##  [5,] NA     NA  
##  [6,] NA     NA  
##  [7,] NA     NA  
##  [8,] NA     NA  
##  [9,] NA     NA  
## [10,] "byby" "by"
## [11,] "axax" "ax"
## [12,] NA     NA

“(.).\1.\1”

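Matches a five-character sequence in which the first, third and fifth characters are the same: a character, then any character, then the first character again, then any character, then the first character a third time (for example "axaxa" or "abaca").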

# A captured character that recurs at every other position, three times in all.
test_string %>%
  str_match(pattern = "(.).\\1.\\1")

##       [,1]    [,2]
##  [1,] NA      NA  
##  [2,] NA      NA  
##  [3,] NA      NA  
##  [4,] NA      NA  
##  [5,] NA      NA  
##  [6,] NA      NA  
##  [7,] NA      NA  
##  [8,] NA      NA  
##  [9,] NA      NA  
## [10,] NA      NA  
## [11,] "axaxa" "a" 
## [12,] NA      NA

“(.)(.)(.).*\3\2\1”

Matches any three characters, followed by zero or more characters of any kind, followed by the same three characters in reverse order (for example "racecar" or "abccba").

# Three captured characters, any filler, then the same three in reverse order.
test_string %>%
  str_match(pattern = "(.)(.)(.).*\\3\\2\\1")

##       [,1]      [,2] [,3] [,4]
##  [1,] NA        NA   NA   NA  
##  [2,] NA        NA   NA   NA  
##  [3,] NA        NA   NA   NA  
##  [4,] NA        NA   NA   NA  
##  [5,] NA        NA   NA   NA  
##  [6,] NA        NA   NA   NA  
##  [7,] NA        NA   NA   NA  
##  [8,] NA        NA   NA   NA  
##  [9,] NA        NA   NA   NA  
## [10,] NA        NA   NA   NA  
## [11,] NA        NA   NA   NA  
## [12,] "racecar" "r"  "a"  "c"

#4 Construct regular expressions to match words that:

# Sample words for the three patterns constructed in this question.
pr4_str <- c(
  "wow", "ada", "peep", "mama",
  "kaka", "eleven", "rarer"
)

Start and end with the same character.

# Whole word whose first and last characters are the same.
# The tail `.*\\1` is optional so that a one-character word, which trivially
# starts and ends with the same character, also matches. The tail is a
# non-capturing group, so the result still has exactly one capture column.
str_match(pr4_str, "^(.)(?:.*\\1)?$")

##      [,1]    [,2]
## [1,] "wow"   "w" 
## [2,] "ada"   "a" 
## [3,] "peep"  "p" 
## [4,] NA      NA  
## [5,] NA      NA  
## [6,] NA      NA  
## [7,] "rarer" "r"

Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)

# A captured pair of characters that shows up again later in the word.
pr4_str %>%
  str_match(pattern = "(..).*\\1")

##      [,1]   [,2]
## [1,] NA     NA  
## [2,] NA     NA  
## [3,] NA     NA  
## [4,] "mama" "ma"
## [5,] "kaka" "ka"
## [6,] NA     NA  
## [7,] NA     NA

Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

# A captured character that occurs at least two more times later in the word.
pr4_str %>%
  str_match(pattern = "(.).*\\1.*\\1")

##      [,1]    [,2]
## [1,] NA      NA  
## [2,] NA      NA  
## [3,] NA      NA  
## [4,] NA      NA  
## [5,] NA      NA  
## [6,] "eleve" "e" 
## [7,] "rarer" "r"