library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(stringr)
library(htmltools)

Week 3 assignment Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to the problems below. You may work in a small group, but please submit separately with names of all group participants in your submission.

1 Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors that contain either “DATA” or “STATISTICS”

# Load the FiveThirtyEight college-majors list from a URL pinned to a specific
# commit, keeping text columns as character vectors, then show its structure.
df <- read.csv(
  url("https://github.com/fivethirtyeight/data/raw/e48bfdad04d909610cecb01d5a4ba2c99cb997f3/college-majors/majors-list.csv"),
  stringsAsFactors = FALSE
)
str(df)
## 'data.frame':    174 obs. of  3 variables:
##  $ FOD1P         : chr  "1100" "1101" "1102" "1103" ...
##  $ Major         : chr  "GENERAL AGRICULTURE" "AGRICULTURE PRODUCTION AND MANAGEMENT" "AGRICULTURAL ECONOMICS" "ANIMAL SCIENCES" ...
##  $ Major_Category: chr  "Agriculture & Natural Resources" "Agriculture & Natural Resources" "Agriculture & Natural Resources" "Agriculture & Natural Resources" ...
# Keep the major names that contain "DATA" or "STATISTICS"; `value = TRUE`
# makes grep() return the matching strings rather than their positions.
data_statistics_major <- grep("DATA|STATISTICS", df$Major, value = TRUE)
data_statistics_major
## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "COMPUTER PROGRAMMING AND DATA PROCESSING"     
## [3] "STATISTICS AND DECISION SCIENCE"

2 Write code that transforms the data below:

[1] “bell pepper” “bilberry” “blackberry” “blood orange”

[5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”

[9] “elderberry” “lime” “lychee” “mulberry”

[13] “olive” “salal berry”

Into a format like this:

c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)

# Printed console representation of the fruit/vegetable vector, held as one
# string. Rows are separated by a blank line ("\n\n"); the trailing spaces on
# the second and third rows are part of the text and are kept inside the quotes
# so they stay visible.
fruits_veg <- paste(
  '[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"',
  '[5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  ',
  '[9] "elderberry"   "lime"         "lychee"       "mulberry"    ',
  '[13] "olive"        "salal berry"',
  sep = "\n\n"
)
fruits_veg
## [1] "[1] \"bell pepper\"  \"bilberry\"     \"blackberry\"   \"blood orange\"\n\n[5] \"blueberry\"    \"cantaloupe\"   \"chili pepper\" \"cloudberry\"  \n\n[9] \"elderberry\"   \"lime\"         \"lychee\"       \"mulberry\"    \n\n[13] \"olive\"        \"salal berry\""
# Extract every double-quoted token (quotes included) from the printed text and
# flatten the per-string list returned by str_extract_all() into one vector.
fruits_veg_wo <- fruits_veg %>%
  str_extract_all("\"([a-z]+.[a-z]+)\"") %>%
  unlist()
fruits_veg_wo
##  [1] "\"bell pepper\""  "\"bilberry\""     "\"blackberry\""   "\"blood orange\""
##  [5] "\"blueberry\""    "\"cantaloupe\""   "\"chili pepper\"" "\"cloudberry\""  
##  [9] "\"elderberry\""   "\"lime\""         "\"lychee\""       "\"mulberry\""    
## [13] "\"olive\""        "\"salal berry\""
# Strip the surrounding double quotes, leaving a plain character vector of
# names (str_remove_all() already returns a character vector).
fruits_veg_wo_cl <- fruits_veg_wo %>%
  str_remove_all("\"")
fruits_veg_wo_cl
##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
##  [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
## [11] "lychee"       "mulberry"     "olive"        "salal berry"

The two exercises below are taken from R for Data Science, 14.3.5.1 in the on-line version:

#3 Describe, in words, what these expressions will match:

# Sample word list for the regex exercises, stored as the text of a printed
# character vector. Rows are separated by a blank line ("\n\n"); trailing
# spaces on the second and third rows are part of the text and are kept inside
# the quotes so they stay visible.
words <- paste(
  '[1] "bell pepper"  "bilberry"     "church"   "pressure"',
  '[5] "blueberry"    "eleven"   "chili pepper" "apple"  ',
  '[9] "abcdfdfbcafgfdg"   "papaya"         "dad"       "mulberry"    ',
  '[13] "believe"        "salal berry" "Banana"  "coconut"',
  sep = "\n\n"
)

# Extract the double-quoted tokens from `words`, then strip the quotes.
# NOTE(review): `[a-z]+` requires a lowercase letter right after the opening
# quote, so the capitalised entry "Banana" appears not to be extracted -- confirm
# whether that is intended before relying on the word list.
# The name `wrods_wo_cl` (sic) is kept because the code below refers to it.
words_wo <- words %>%
  str_extract_all("\"([a-z]+.[a-z]+)\"") %>%
  unlist()
wrods_wo_cl <- words_wo %>%
  str_remove_all("\"")

(.)\1\1

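Matches any string that contains the same character three times in a row (e.g. "aaa"): the group captures one character and each `\1` requires that same character again.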

# `(.)\1\1` as a regex: one captured character followed by two more copies of
# it. The backslashes must be doubled inside an R string; a single "\1" is
# parsed as the octal escape for the control character U+0001, so the original
# pattern never contained a backreference at all.
expression1 <- "(.)\\1\\1"
str_subset(wrods_wo_cl, expression1)
## character(0)

“(.)(.)\2\1”

Matches a pair of characters immediately followed by the same pair in reverse order, i.e. a four-character palindrome such as "abba" (or "eppe" in "pepper").

# Two captured characters followed by the same two in reverse order ("abba").
expression2 <- "(.)(.)\\2\\1"
str_subset(wrods_wo_cl, expression2)
## [1] "bell pepper"  "chili pepper"

(..)\1

Matches any two characters that are immediately repeated, e.g. "coco" in "coconut" or "papa" in "papaya".

# `(..)\1` as a regex: any two characters immediately followed by the same two
# characters (e.g. "coco"). The backslash must be doubled inside an R string;
# a single "\1" is parsed as the octal escape for the control character U+0001,
# which is why the original pattern matched nothing.
expression3 <- "(..)\\1"
str_subset(wrods_wo_cl, expression3)
## [1] "abcdfdfbcafgfdg" "papaya"          "salal berry"     "coconut"

“(.).\1.\1”

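Matches a character that occurs three times with exactly one arbitrary character between each occurrence, e.g. "eleve" in "eleven" or "apaya" in "papaya".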

# One captured character that reappears twice more, each time after exactly one
# arbitrary character (pattern shape: x?x?x).
expression4 <- "(.).\\1.\\1"
str_subset(wrods_wo_cl, expression4)
## [1] "eleven" "papaya"

"(.)(.)(.).*\3\2\1"

Matches any three characters followed, after zero or more other characters, by the same three characters in reverse order, e.g. "abcdfdfdfcba" ("abc" ... "cba").

# Three captured characters, then anything (possibly nothing), then the same
# three characters in reverse order.
expression5 <- "(.)(.)(.).*\\3\\2\\1"
str_subset(wrods_wo_cl, expression5)
## character(0)

#4 Construct regular expressions to match words that:

Start and end with the same character.

# Words whose first and last characters are the same. The optional group lets a
# single-character word match as well, since it trivially starts and ends with
# the same character; `^(.).*\\1$` on its own needs at least two characters.
expression9 <- "^(.)(.*\\1)?$"
str_subset(wrods_wo_cl, expression9)
## [1] "dad"

Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)

# A captured pair of characters that appears again later in the same string,
# with any amount of text (including none) in between.
expression6 <- "(..).*\\1"
str_subset(wrods_wo_cl, expression6)
## [1] "bell pepper"     "church"          "pressure"        "chili pepper"   
## [5] "abcdfdfbcafgfdg" "papaya"          "salal berry"     "coconut"

Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

# A captured character that occurs at least two more times later in the string,
# with any amount of text between the occurrences.
expression7 <- "(.).*\\1.*\\1"
str_subset(wrods_wo_cl, expression7)
## [1] "bell pepper"     "eleven"          "chili pepper"    "abcdfdfbcafgfdg"
## [5] "papaya"          "believe"