library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)

# Location of the FiveThirtyEight college-majors list (raw CSV on GitHub).
theLink <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"

# Download and parse the CSV. read.csv() already assumes a header row and a
# comma separator, so neither needs to be spelled out.
df_majors <- read.csv(theLink)

# Show the first rows as a quick sanity check of the import.
df_majors %>%
  head()

##   FOD1P                                 Major
## 1  1100                   GENERAL AGRICULTURE
## 2  1101 AGRICULTURE PRODUCTION AND MANAGEMENT
## 3  1102                AGRICULTURAL ECONOMICS
## 4  1103                       ANIMAL SCIENCES
## 5  1104                          FOOD SCIENCE
## 6  1105            PLANT SCIENCE AND AGRONOMY
##                    Major_Category
## 1 Agriculture & Natural Resources
## 2 Agriculture & Natural Resources
## 3 Agriculture & Natural Resources
## 4 Agriculture & Natural Resources
## 5 Agriculture & Natural Resources
## 6 Agriculture & Natural Resources

# Keep only the majors whose name contains "DATA" or "STATISTICS".
# The match is case-sensitive.
filter(
  df_majors,
  str_detect(Major, "DATA|STATISTICS")
)

##   FOD1P                                         Major
## 1  6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS
## 2  2101      COMPUTER PROGRAMMING AND DATA PROCESSING
## 3  3702               STATISTICS AND DECISION SCIENCE
##            Major_Category
## 1                Business
## 2 Computers & Mathematics
## 3 Computers & Mathematics

Problem 2

# The fruit vector exactly as the console printed it, held as ONE string.
# Each element below is one printed row; the rows are joined with newlines.
# The trailing spaces at the end of rows two and three are part of the data.
given_str <- paste(
  c(
    '[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"',
    '[5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  ',
    '[9] "elderberry"   "lime"         "lychee"       "mulberry"    ',
    '[13] "olive"        "salal berry"'
  ),
  collapse = "\n"
)

given_str

## [1] "[1] \"bell pepper\"  \"bilberry\"     \"blackberry\"   \"blood orange\"\n[5] \"blueberry\"    \"cantaloupe\"   \"chili pepper\" \"cloudberry\"  \n[9] \"elderberry\"   \"lime\"         \"lychee\"       \"mulberry\"    \n[13] \"olive\"        \"salal berry\""

# Pull every fruit name out of the printed text. The first alternative matches
# two lowercase words separated by one whitespace character ("bell pepper");
# the second matches a single lowercase word ("lime"). Index markers such as
# [1] and the quote characters are skipped because they are not in [a-z].
# str_extract_all() returns a list holding one character vector per input.
extracted_str <- given_str %>%
  str_extract_all('[a-z]+\\s[a-z]+|[a-z]+')
extracted_str

## [[1]]
##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"
##  [5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  
##  [9] "elderberry"   "lime"         "lychee"       "mulberry"    
## [13] "olive"        "salal berry"

# Render the extracted names as the text of a c(...) call, with no quotes
# around the individual names. The names are joined explicitly instead of
# handing the whole list to gsub(): that only worked because gsub() coerces
# its input with as.character(), which deparses a list element to c("...").
# NOTE(review): if the quoted form c("bell pepper", ...) is what is wanted,
# wrap each name in quotes before collapsing; confirm against the problem
# statement.
paste0("c(", paste(extracted_str[[1]], collapse = ", "), ")")

## [1] "c(bell pepper, bilberry, blackberry, blood orange, blueberry, cantaloupe, chili pepper, cloudberry, elderberry, lime, lychee, mulberry, olive, salal berry)"

The two exercises below are taken from R for Data Science, 14.3.5.1 in the on-line version:

#3 Describe, in words, what these expressions will match:

(.)\1\1

(.) is the first capturing group; . matches any single character
the first \1 matches the same text that was matched by the 1st capturing group
the second \1 matches that same text once more
Overall: the same character three times in a row, e.g. "aaa"

(.)(.)\2\1

(.) is the first capturing group; . matches any single character
(.) is the second capturing group; . matches any single character
\2 matches the same text that was matched by the 2nd capturing group
\1 matches the same text that was matched by the 1st capturing group
Overall: two characters followed by the same two characters in reverse order, e.g. "abba"

(..)\1

(..) is the first capturing group; each . matches any single character, so the group holds two characters
\1 matches the same text that was matched by the 1st capturing group
Overall: any two characters immediately repeated, e.g. "abab"

(.).\1.\1

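(.) is the first capturing group; . matches any single character
. matches any single character
\1 matches the same text that was matched by the 1st capturing group
. matches any single character
\1 matches the same text that was matched by the 1st capturing group
Overall: five characters in which the 1st, 3rd and 5th are identical, e.g. "abaca"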

(.)(.)(.).*\3\2\1

" matches the character "
(.) is the first capturing group; . matches any single character
the next (.) is the second capturing group; . matches any single character
the next (.) is the third capturing group; . matches any single character
.* matches any character, zero or more times
\3 matches the same text that was matched by the 3rd capturing group
\2 matches the same text that was matched by the 2nd capturing group
\1 matches the same text that was matched by the 1st capturing group
Overall: three characters, then any run of characters (possibly none), then the same three characters in reverse order, e.g. "abccba" or "abc123cba"

#4 Construct regular expressions to match words that:

Start and end with the same character.

^(.)(.*\1)?$

# Sample words; only "SAS" and "MOM" start and end with the same character.
str.4.1 <- c("ZOO", "YAML", "SAS", "TOY", "MOM")
# ^(.) captures the first character. The optional group (.*\1)? lets the word
# either stop right there (a one-character word starts and ends with the same
# character) or continue until it ends with the captured character again.
pattern.4.1 <- "^(.)(.*\\1)?$"
str.4.1 %>%
  str_subset(pattern.4.1)

## [1] "SAS" "MOM"

Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)

(..).*\1

# Sample words; "church" repeats "ch" and "papa" repeats "pa".
str.4.2 <- c("church", "papa", "apple")
# (..) captures two adjacent characters as ONE group, and \1 requires that same
# pair to appear again later in the word. Capturing the two characters
# separately and back-referencing only the first would also accept words such
# as "eleven", which repeat a single letter but no pair.
pattern.4.2 <- "(..).*\\1"
str.4.2 %>%
  str_subset(pattern.4.2)

## [1] "church" "papa"

Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

(.).*\1.*\1

# Sample words; "eleven" has three "e"s and "kjkjkjkj" has four of each letter.
str.4.3 <- c("eleven", "kjkjkjkj", "apple")
# (.) captures a character and each \1 demands another occurrence of it. The
# .* between them allows any distance between occurrences, so a word such as
# "available" (a at positions 1, 3 and 6) matches too, not only words whose
# repeats sit exactly two positions apart.
pattern.4.3 <- "(.).*\\1.*\\1"
str.4.3 %>%
  str_subset(pattern.4.3)

## [1] "eleven"   "kjkjkjkj"

Data 607 - Assignment 3

Amit Kapoor

2/10/2020

Problem 2