DATA_607_Week3_Assignment

DATA607 Week3 Assignment

Let’s load the required libraries in R for data analysis

library(dplyr)
library(downloader)
library(stringr)
library(htmlTable)

Get Majors data from csv file

# Remote CSV of college majors (fivethirtyeight GitHub repository) and the
# file name it is stored under locally.
url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
download_file <- "majors_list.csv"

# Fetch the CSV; a relative file name means it lands in the working directory.
downloader::download(url, download_file)

# Load the CSV into a data frame, keeping text columns as character vectors.
majors_list <- read.csv(
  file = download_file,
  header = TRUE,
  stringsAsFactors = FALSE
)

1) Identify Majors with ‘DATA’ OR ‘STATISTICS’

# Alternate keyword order kept for testing: c("STATISTICS", "DATA")
keywords <- c("DATA", "STATISTICS")

# Join the keywords into a single alternation regex ("DATA|STATISTICS") and
# keep the rows whose Major contains either keyword. Base indexing is used so
# the original row numbers are retained in the output table.
keyword_regex <- paste(keywords, collapse = "|")
has_keyword <- str_detect(majors_list$Major, keyword_regex)
filtered_majors <- majors_list[has_keyword, ]
htmlTable(filtered_majors)

	FOD1P	Major	Major_Category
44	6212	MANAGEMENT INFORMATION SYSTEMS AND STATISTICS	Business
52	2101	COMPUTER PROGRAMMING AND DATA PROCESSING	Computers & Mathematics
59	3702	STATISTICS AND DECISION SCIENCE	Computers & Mathematics

2) Code to transform the data format

# Raw input: a character vector as R prints it, i.e. single-quoted items
# interleaved with "[n]" index markers.
strdata <- "[1] 'bell pepper'  'bilberry'     'blackberry'   'blood orange' [5] 'blueberry' 'cantaloupe'   'chili pepper'   'cloudberry'  [9] 'elderberry'   'lime'         'lychee'       'mulberry' [13] 'olive'  'salal berry'"

# Extract the text between each pair of single quotes. Matching the quoted
# items directly, instead of deleting digits and punctuation step by step,
# keeps items intact even if they contain digits, hyphens or other punctuation.
# str_match_all() returns one matrix per input string; column 2 is the capture.
quoted_items <- str_match_all(strdata, "'([^']*)'")[[1]][, 2]

# Rebuild the items as the text of an R c() call: c("item1","item2",...)
strdata8 <- str_c("c(", str_c("\"", quoted_items, "\"", collapse = ","), ")")
htmlTable(strdata8)

c(“bell pepper”,“bilberry”,“blackberry”,“blood orange”,“blueberry”,“cantaloupe”,“chili pepper”,“cloudberry”,“elderberry”,“lime”,“lychee”,“mulberry”,“olive”,“salal berry”)

3) Describe, in words, what these expressions will match:

3.1) (.)\1\1

. matches any single character except a line break (\n, and usually \r).
Back-references refer to previously matched subexpressions, which are grouped by (). (.) captures one character as group 1, and each \1 must match that same character again, so the pattern matches the same character three times in a row.

This would match any of the following strings: “aaa” “bbb” “ccc”

3.2) “(.)(.)\2\1”

. matches any single character except a line break (\n, and usually \r).
There are two characters at the start
Back-references to previously matched subexpressions, grouped by (), \2 means the second match, followed by the first match

This would match any of the following strings: “abba” “poop” “cddc”

3.3) “(..)\1”

.. matches any two characters, and (..) captures them as group 1; \1 then requires the same two characters to repeat immediately afterwards.

Given fruits like “salal berry”, “banana”,“papaya”, str_match(fruits,above_pattern) will return

[1,] “alal” “al” [2,] “anan” “an” [3,] “papa” “pa”

3.4) “(.).\1.\1”

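(.) captures one character as group 1, and the following . matches any single character.
\1 then repeats the captured character, the next . matches any single character again, and the final \1 repeats the captured character once more, so the captured character appears three times with one arbitrary character between each occurrence.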

Given fruits like “salal berry”, “banana”,“papaya”, str_match(fruits,above_pattern) will return

[2,] “anana” “a” [3,] “apaya” “a”

3.5) "(.)(.)(.).*\3\2\1"

(.)(.)(.) Match first three characters, Fourth onwards can be any number of characters, then followed by the third, then second and followed by the first character Example: “tomapotatomot”

[Result] “tomapotatomot” “t” “o” “m”

# Test words: the fruit names from part 2 plus a few extra words chosen to
# exercise the patterns below.
fruits <- c(
  "bell pepper", "bilberry", "blackberry", "blood orange", "blueberry",
  "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime",
  "lychee", "mulberry", "olive", "salal berry", "banana", "apapaya",
  "tomapotatomotot", "church"
)

# Candidate patterns; uncomment exactly one to try it.
#pattern <- "(.)\\1\\1"
#pattern <- "(.)(.)\\2\\1"
#pattern <- "(..)\\1"
#pattern <- "(.).\\1.\\1"
#pattern <- "(.)(.)(.).*\\3\\2\\1"
#pattern <- "^(.).*\\1$"
#pattern <- "(..).*\\1"
pattern <- "([a-z]).*\\1.*\\1"
#pattern <- "(.)\\..\\..\\.."

# Column 1 is the full match and column 2 the captured group; words that do
# not match yield NA in both columns.
str_match(string = fruits, pattern = pattern)

##       [,1]              [,2]
##  [1,] "ell peppe"       "e" 
##  [2,] NA                NA  
##  [3,] NA                NA  
##  [4,] "ood o"           "o" 
##  [5,] NA                NA  
##  [6,] NA                NA  
##  [7,] "pepp"            "p" 
##  [8,] NA                NA  
##  [9,] "elderbe"         "e" 
## [10,] NA                NA  
## [11,] NA                NA  
## [12,] NA                NA  
## [13,] NA                NA  
## [14,] NA                NA  
## [15,] "anana"           "a" 
## [16,] "apapaya"         "a" 
## [17,] "tomapotatomotot" "t" 
## [18,] NA                NA

4) Construct regular expressions to match words that:

4.1)Start and end with the same character.

pattern <- "^(.).*\\1$"
[16,] “apapaya” “a”
[17,] “tomapotatomotot” “t”

4.2)Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)

pattern <- "(..).*\\1"
[18,] “church” “ch”

4.3)Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

pattern <- "([a-z]).*\\1.*\\1"

DATA_607_Week3_Assignment

Bikram Barua

9/8/2021