DATA607 Week3 Assignment

Let’s load the required libraries in R for data analysis
library(dplyr)
library(downloader)
library(stringr)
library(htmlTable)


Get Majors data from csv file

# Location of the college majors list published in the fivethirtyeight/data
# repository on GitHub.
url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"

# Name the CSV is saved under; a relative path, so it lands in the working
# directory.
download_file <- "majors_list.csv"

# Fetch the remote CSV and store it locally.
downloader::download(url = url, destfile = download_file)

# Load the saved copy; the first row holds column names and text columns stay
# character vectors rather than factors.
majors_list <- read.csv(
  file = download_file,
  header = TRUE,
  stringsAsFactors = FALSE
)

1) Identify Majors with ‘DATA’ OR ‘STATISTICS’

# Words to look for in the Major column. Their order does not matter because
# they are combined into a single alternation ("DATA|STATISTICS").
keywords <- c("DATA", "STATISTICS")
keyword_regex <- str_c(keywords, collapse = "|")

# Keep the rows whose Major contains at least one keyword, then render them.
has_keyword <- str_detect(majors_list$Major, keyword_regex)
filtered_majors <- majors_list[has_keyword, ]
htmlTable(filtered_majors)
FOD1P Major Major_Category
44 6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS Business
52 2101 COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
59 3702 STATISTICS AND DECISION SCIENCE Computers & Mathematics


2) Code to transform the data format

# Console printout of a character vector: "[n]" index markers followed by
# single-quoted items separated by a varying amount of whitespace.
strdata <- "[1] 'bell pepper'  'bilberry'     'blackberry'   'blood orange' [5] 'blueberry' 'cantaloupe'   'chili pepper'   'cloudberry'  [9] 'elderberry'   'lime'         'lychee'       'mulberry' [13] 'olive'  'salal berry'"

# Pull out the text between each pair of single quotes. Extracting the quoted
# items directly, rather than stripping every digit and punctuation character
# from the whole string, leaves the index markers and spacing behind without
# touching the items themselves, so an item containing a digit or a hyphen
# would survive unchanged.
# str_match_all() returns one matrix per input string; column 2 holds the
# first capture group.
quoted_items <- str_match_all(strdata, "'([^']*)'")[[1]][, 2]

# Wrap every item in double quotes, join with commas, and surround the list
# with c( ... ) so the result reads as R source for a character vector.
strdata8 <- str_c("c(", str_c("\"", quoted_items, "\"", collapse = ","), ")")
htmlTable(strdata8)
c(“bell pepper”,“bilberry”,“blackberry”,“blood orange”,“blueberry”,“cantaloupe”,“chili pepper”,“cloudberry”,“elderberry”,“lime”,“lychee”,“mulberry”,“olive”,“salal berry”)


3) Describe, in words, what these expressions will match:

3.1) (.)\1\1

. matches any character except line breaks (\n, and usually \r).
\1 is a back-reference to the first group captured by (), so it must match the same character again; the second \1 repeats it once more. In total the captured character must appear three times in a row.

This would match any of the following strings: “aaa” “bbb” “ccc”


3.2) “(.)(.)\2\1”

. matches any character except line breaks (\n, and usually \r).
There are two characters at the start
Back-references to previously matched subexpressions, grouped by (), \2 means the second match, followed by the first match

This would match any of the following strings: “abba” “poop” “cddc”


3.3) “(..)\1”

(..) captures any two characters, and \1 requires that same pair to appear again immediately afterwards.

Given fruits like “salal berry”, “banana”,“papaya”, str_match(fruits,above_pattern) will return

[1,] “alal” “al” [2,] “anan” “an” [3,] “papa” “pa”


3.4) “(.).\1.\1”

(.) Match first character, . means the Second character can be any character
\1 means repeating the match, .\1 means repeat the first match with second character as wild character

Given fruits like “salal berry”, “banana”,“papaya”, str_match(fruits,above_pattern) will return

[2,] “anana” “a” [3,] “apaya” “a”


3.5) “(.)(.)(.).*\3\2\1”

(.)(.)(.) captures three consecutive characters, .* then allows any number of characters, and \3\2\1 requires the three captured characters to appear again in reverse order (third, then second, then first). Example: “tomapotatomot”

[Result] “tomapotatomot” “t” “o” “m”


# Words used to try out the back-reference patterns.
fruits <- c(
  "bell pepper", "bilberry", "blackberry", "blood orange", "blueberry",
  "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime",
  "lychee", "mulberry", "olive", "salal berry",
  "banana", "apapaya", "tomapotatomotot", "church"
)

# All patterns explored in this assignment, keyed by exercise number.
# Backslashes are doubled because these are R string literals.
candidate_patterns <- c(
  q3_1 = "(.)\\1\\1",
  q3_2 = "(.)(.)\\2\\1",
  q3_3 = "(..)\\1",
  q3_4 = "(.).\\1.\\1",
  q3_5 = "(.)(.)(.).*\\3\\2\\1",
  q4_1 = "^(.).*\\1$",
  q4_2 = "(..).*\\1",
  q4_3 = "([a-z]).*\\1.*\\1",
  literal_dots = "(.)\\..\\..\\.."
)

# Pattern under test: one lowercase letter occurring in at least three places.
pattern <- candidate_patterns[["q4_3"]]

# One row per word: the full match followed by the captured group, or NA when
# the word does not match.
fruits %>%
  str_match(pattern)
##       [,1]              [,2]
##  [1,] "ell peppe"       "e" 
##  [2,] NA                NA  
##  [3,] NA                NA  
##  [4,] "ood o"           "o" 
##  [5,] NA                NA  
##  [6,] NA                NA  
##  [7,] "pepp"            "p" 
##  [8,] NA                NA  
##  [9,] "elderbe"         "e" 
## [10,] NA                NA  
## [11,] NA                NA  
## [12,] NA                NA  
## [13,] NA                NA  
## [14,] NA                NA  
## [15,] "anana"           "a" 
## [16,] "apapaya"         "a" 
## [17,] "tomapotatomotot" "t" 
## [18,] NA                NA


4) Construct regular expressions to match words that:

4.1)Start and end with the same character.

pattern <- "^(.).*\\1$"
[16,] “apapaya” “a”
[17,] “tomapotatomotot” “t”


4.2)Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)

pattern <- "(..).*\\1"
[18,] “church” “ch”


4.3)Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

pattern <- "([a-z]).*\\1.*\\1"