HW_Assignment_3_Ahmed

filtering data for the majors with Data or Statistics in the major’s name

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.3.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Read FiveThirtyEight's list of college majors directly from GitHub.
majors_url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
college_majors <- read.csv(majors_url)

# Flag the majors whose name mentions "data" or "statistics" (any case),
# then keep only those rows.
is_data_or_stats <- grepl("DATA|STATISTICS", college_majors$Major, ignore.case = TRUE)
data_stats_majors <- filter(college_majors, is_data_or_stats)

print(data_stats_majors)

##   FOD1P                                         Major          Major_Category
## 1  6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS                Business
## 2  2101      COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 3  3702               STATISTICS AND DECISION SCIENCE Computers & Mathematics

transform fruit data into a vector

# The fruit names from the printed output, rebuilt as a character vector.
fruits <- c(
  "bell pepper", "bilberry", "blackberry", "blood orange", "blueberry",
  "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime",
  "lychee", "mulberry", "olive", "salal berry"
)
print(fruits)

##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
##  [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
## [11] "lychee"       "mulberry"     "olive"        "salal berry"

# Shared pool of sample strings that the regex experiments are run against.
test_strings <- c(
  "aaa", "aaaa", "aaaab", "aaaabbb", "aaaabbbbccccc", "aaabbbccc",
  "ababacabcabc", "abba", "abab", "abaca", "fijtidkwptof", "abab", "1212",
  "abcdabcd", "aaaa", "abcabc", "abac", "a", "aaaaaaaaaabbbbbbbbb", "abcba",
  "hellolleh", "helloolleh", "aaaaabbbbccccc", "abacadddddddefege", "aaaaaa",
  "aaabaaa", "aaabbaaa", "aaabcdefaaa",
  "aaa14rtfterweiotje4riyjw4tyjiow4yjwtpiojwtaaa", "abcdcba", "abcdabc"
)

# Testing (.)\1\1 typed with single backslashes inside the R string.
# Locate every hit first, then pull the matching substrings out of string_a.
string_a <- "aaaaaaaaaabbbbbbbbb"
hits_a <- gregexpr(pattern = "(.)\1\1", text = string_a)
output_a <- regmatches(x = string_a, m = hits_a)
print(output_a)

## [[1]]
## character(0)

# Same single-backslash pattern, this time only asking whether string_b
# contains a match at all.
string_b <- "aaaab"
output_b <- grepl(pattern = "(.)\1\1", x = string_b)
print(output_b)

## [1] FALSE

# One more yes/no check of the single-backslash pattern, on string_c.
string_c <- "aabaa"
output_c <- grepl(pattern = "(.)\1\1", x = string_c)
print(output_c)

## [1] FALSE

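#After doing some research on Stack Overflow, I learned that inside an R string literal a single backslash starts an escape sequence, so "\1" is read as the control character U+0001 instead of a regex back-reference. The regex engine therefore never sees "\1", which is why all my test cases fail. The backslash itself has to be escaped, so the regex must be written as "(.)\\1\\1".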


#Lets do some testing by passing in my test_strings to this new regex and see the behavior of this new regex. 

# Escaped version of the pattern: any character followed by two more copies
# of itself. Keep the test strings in which the pattern is found.
modified_regex1 <- "(.)\\1\\1"
has_triple <- grepl(modified_regex1, test_strings)
modified_regex1_output <- test_strings[has_triple]
print(modified_regex1_output)

##  [1] "aaa"                                          
##  [2] "aaaa"                                         
##  [3] "aaaab"                                        
##  [4] "aaaabbb"                                      
##  [5] "aaaabbbbccccc"                                
##  [6] "aaabbbccc"                                    
##  [7] "aaaa"                                         
##  [8] "aaaaaaaaaabbbbbbbbb"                          
##  [9] "aaaaabbbbccccc"                               
## [10] "abacadddddddefege"                            
## [11] "aaaaaa"                                       
## [12] "aaabaaa"                                      
## [13] "aaabbaaa"                                     
## [14] "aaabcdefaaa"                                  
## [15] "aaa14rtfterweiotje4riyjw4tyjiow4yjwtpiojwtaaa"

#looks like what this new regex does is return the entire string that contains any character repeated three times consecutively based on my results.

# testing "(.)(.)\\2\\1"

# Find every occurrence in string_a, then extract the matched substrings.
mirror_hits_a <- gregexpr(pattern = "(.)(.)\\2\\1", text = string_a)
outputaa <- regmatches(x = string_a, m = mirror_hits_a)
print(outputaa)

## [[1]]
## [1] "aaaa" "aaaa" "bbbb" "bbbb"

# Run the mirrored-pair regex on string_b. The text handed to regmatches()
# must be the same string that gregexpr() searched, because the match
# positions and lengths are only meaningful for that string.
outputbb <- regmatches(string_b, gregexpr("(.)(.)\\2\\1", string_b))
print(outputbb)

## [[1]]
## [1] "aaaa"

# Run the mirrored-pair regex on string_c. Both the search and the extraction
# use string_c itself (not the logical grepl() result output_c, and not
# string_a), so the match data and the text always belong together.
outputcc <- regmatches(string_c, gregexpr("(.)(.)\\2\\1", string_c))

# Apply the same extraction to every test string; each list element holds the
# matches found in the corresponding string.
outputs <- lapply(test_strings, function(x) regmatches(x, gregexpr("(.)(.)\\2\\1", x)))
print(outputs)

## [[1]]
## [[1]][[1]]
## character(0)
## 
## 
## [[2]]
## [[2]][[1]]
## [1] "aaaa"
## 
## 
## [[3]]
## [[3]][[1]]
## [1] "aaaa"
## 
## 
## [[4]]
## [[4]][[1]]
## [1] "aaaa"
## 
## 
## [[5]]
## [[5]][[1]]
## [1] "aaaa" "bbbb" "cccc"
## 
## 
## [[6]]
## [[6]][[1]]
## character(0)
## 
## 
## [[7]]
## [[7]][[1]]
## character(0)
## 
## 
## [[8]]
## [[8]][[1]]
## [1] "abba"
## 
## 
## [[9]]
## [[9]][[1]]
## character(0)
## 
## 
## [[10]]
## [[10]][[1]]
## character(0)
## 
## 
## [[11]]
## [[11]][[1]]
## character(0)
## 
## 
## [[12]]
## [[12]][[1]]
## character(0)
## 
## 
## [[13]]
## [[13]][[1]]
## character(0)
## 
## 
## [[14]]
## [[14]][[1]]
## character(0)
## 
## 
## [[15]]
## [[15]][[1]]
## [1] "aaaa"
## 
## 
## [[16]]
## [[16]][[1]]
## character(0)
## 
## 
## [[17]]
## [[17]][[1]]
## character(0)
## 
## 
## [[18]]
## [[18]][[1]]
## character(0)
## 
## 
## [[19]]
## [[19]][[1]]
## [1] "aaaa" "aaaa" "bbbb" "bbbb"
## 
## 
## [[20]]
## [[20]][[1]]
## character(0)
## 
## 
## [[21]]
## [[21]][[1]]
## character(0)
## 
## 
## [[22]]
## [[22]][[1]]
## [1] "lool"
## 
## 
## [[23]]
## [[23]][[1]]
## [1] "aaaa" "bbbb" "cccc"
## 
## 
## [[24]]
## [[24]][[1]]
## [1] "dddd"
## 
## 
## [[25]]
## [[25]][[1]]
## [1] "aaaa"
## 
## 
## [[26]]
## [[26]][[1]]
## character(0)
## 
## 
## [[27]]
## [[27]][[1]]
## [1] "abba"
## 
## 
## [[28]]
## [[28]][[1]]
## character(0)
## 
## 
## [[29]]
## [[29]][[1]]
## character(0)
## 
## 
## [[30]]
## [[30]][[1]]
## character(0)
## 
## 
## [[31]]
## [[31]][[1]]
## character(0)

# Three palindromes used as extra probes for the mirrored-pair pattern.
string_d <- "abcba"
string_e <- "hellolleh"
string_f <- "helloolleh"


# Locate the matches in each probe, then extract the matched substrings.
mirror_hits_d <- gregexpr(pattern = "(.)(.)\\2\\1", text = string_d)
mirror_hits_e <- gregexpr(pattern = "(.)(.)\\2\\1", text = string_e)
mirror_hits_f <- gregexpr(pattern = "(.)(.)\\2\\1", text = string_f)
output_d <- regmatches(x = string_d, m = mirror_hits_d)
output_e <- regmatches(x = string_e, m = mirror_hits_e)
output_f <- regmatches(x = string_f, m = mirror_hits_f)

print(output_d)

## [[1]]
## character(0)

print(output_e)

## [[1]]
## character(0)

print(output_f)

## [[1]]
## [1] "lool"

#Looks like (.)(.)\\2\\1 returns a total of 4 characters and they have to be mirrors of each other half way for example the first two characters have to be equal to the reverse of the second two characters in the string
# for example the string abba satisfies this condition because ab = ba

# Testing (..)\1 typed with a single backslash inside the R string.
pair_hits_a <- gregexpr(pattern = "(..)\1", text = string_a)
output_aaa <- regmatches(x = string_a, m = pair_hits_a)
print(output_aaa)

## [[1]]
## character(0)

# Same single-backslash pattern, applied to string_b.
pair_hits_b <- gregexpr(pattern = "(..)\1", text = string_b)
output_bbb <- regmatches(x = string_b, m = pair_hits_b)
print(output_bbb)

## [[1]]
## character(0)

# Run the single-backslash pattern over every test string, collecting the
# matched substrings for each one.
extract_single_slash_pairs <- function(s) {
  found <- gregexpr(pattern = "(..)\1", text = s)
  regmatches(x = s, m = found)
}
output_results <- lapply(test_strings, extract_single_slash_pairs)

print(output_results)

## [[1]]
## [[1]][[1]]
## character(0)
## 
## 
## [[2]]
## [[2]][[1]]
## character(0)
## 
## 
## [[3]]
## [[3]][[1]]
## character(0)
## 
## 
## [[4]]
## [[4]][[1]]
## character(0)
## 
## 
## [[5]]
## [[5]][[1]]
## character(0)
## 
## 
## [[6]]
## [[6]][[1]]
## character(0)
## 
## 
## [[7]]
## [[7]][[1]]
## character(0)
## 
## 
## [[8]]
## [[8]][[1]]
## character(0)
## 
## 
## [[9]]
## [[9]][[1]]
## character(0)
## 
## 
## [[10]]
## [[10]][[1]]
## character(0)
## 
## 
## [[11]]
## [[11]][[1]]
## character(0)
## 
## 
## [[12]]
## [[12]][[1]]
## character(0)
## 
## 
## [[13]]
## [[13]][[1]]
## character(0)
## 
## 
## [[14]]
## [[14]][[1]]
## character(0)
## 
## 
## [[15]]
## [[15]][[1]]
## character(0)
## 
## 
## [[16]]
## [[16]][[1]]
## character(0)
## 
## 
## [[17]]
## [[17]][[1]]
## character(0)
## 
## 
## [[18]]
## [[18]][[1]]
## character(0)
## 
## 
## [[19]]
## [[19]][[1]]
## character(0)
## 
## 
## [[20]]
## [[20]][[1]]
## character(0)
## 
## 
## [[21]]
## [[21]][[1]]
## character(0)
## 
## 
## [[22]]
## [[22]][[1]]
## character(0)
## 
## 
## [[23]]
## [[23]][[1]]
## character(0)
## 
## 
## [[24]]
## [[24]][[1]]
## character(0)
## 
## 
## [[25]]
## [[25]][[1]]
## character(0)
## 
## 
## [[26]]
## [[26]][[1]]
## character(0)
## 
## 
## [[27]]
## [[27]][[1]]
## character(0)
## 
## 
## [[28]]
## [[28]][[1]]
## character(0)
## 
## 
## [[29]]
## [[29]][[1]]
## character(0)
## 
## 
## [[30]]
## [[30]][[1]]
## character(0)
## 
## 
## [[31]]
## [[31]][[1]]
## character(0)

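# After doing some research on Stack Overflow, I learned that inside an R string literal a single backslash starts an escape sequence, so "\1" becomes the control character U+0001 rather than a regex back-reference, which is why all my test cases fail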


#lets fix this regex by adding one more back slash so that our new regex would be (..)\\1
#lets test this new regex

# Escaped version of the pattern: any two characters immediately followed by
# the same two characters. Return the test strings that contain a match.
modified_regex2 <- "(..)\\1"
modified_regex2_output <- grep(
  pattern = modified_regex2,
  x = test_strings,
  value = TRUE
)
print(modified_regex2_output)

##  [1] "aaaa"                "aaaab"               "aaaabbb"            
##  [4] "aaaabbbbccccc"       "ababacabcabc"        "abab"               
##  [7] "abab"                "1212"                "aaaa"               
## [10] "aaaaaaaaaabbbbbbbbb" "aaaaabbbbccccc"      "abacadddddddefege"  
## [13] "aaaaaa"

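#What this new regex does is look for any two-character sequence that is immediately followed by the same two characters. For example, "abab" matches because "ab" is directly repeated, whereas "abcabc" does not match (its repeated unit is three characters long), which is why "abcabc" is absent from the output above.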

# Testing "(.).\\1.\\1"

# For each test string, locate every match and extract the matched text.
extract_alternating <- function(s) {
  found <- gregexpr(pattern = "(.).\\1.\\1", text = s)
  regmatches(x = s, m = found)
}
output_results_reg_exp_4 <- lapply(test_strings, extract_alternating)
print(output_results_reg_exp_4)

## [[1]]
## [[1]][[1]]
## character(0)
## 
## 
## [[2]]
## [[2]][[1]]
## character(0)
## 
## 
## [[3]]
## [[3]][[1]]
## character(0)
## 
## 
## [[4]]
## [[4]][[1]]
## character(0)
## 
## 
## [[5]]
## [[5]][[1]]
## [1] "ccccc"
## 
## 
## [[6]]
## [[6]][[1]]
## character(0)
## 
## 
## [[7]]
## [[7]][[1]]
## [1] "ababa"
## 
## 
## [[8]]
## [[8]][[1]]
## character(0)
## 
## 
## [[9]]
## [[9]][[1]]
## character(0)
## 
## 
## [[10]]
## [[10]][[1]]
## [1] "abaca"
## 
## 
## [[11]]
## [[11]][[1]]
## character(0)
## 
## 
## [[12]]
## [[12]][[1]]
## character(0)
## 
## 
## [[13]]
## [[13]][[1]]
## character(0)
## 
## 
## [[14]]
## [[14]][[1]]
## character(0)
## 
## 
## [[15]]
## [[15]][[1]]
## character(0)
## 
## 
## [[16]]
## [[16]][[1]]
## character(0)
## 
## 
## [[17]]
## [[17]][[1]]
## character(0)
## 
## 
## [[18]]
## [[18]][[1]]
## character(0)
## 
## 
## [[19]]
## [[19]][[1]]
## [1] "aaaaa" "aaaaa" "bbbbb"
## 
## 
## [[20]]
## [[20]][[1]]
## character(0)
## 
## 
## [[21]]
## [[21]][[1]]
## character(0)
## 
## 
## [[22]]
## [[22]][[1]]
## character(0)
## 
## 
## [[23]]
## [[23]][[1]]
## [1] "aaaaa" "ccccc"
## 
## 
## [[24]]
## [[24]][[1]]
## [1] "abaca" "ddddd" "efege"
## 
## 
## [[25]]
## [[25]][[1]]
## [1] "aaaaa"
## 
## 
## [[26]]
## [[26]][[1]]
## [1] "aaaba"
## 
## 
## [[27]]
## [[27]][[1]]
## character(0)
## 
## 
## [[28]]
## [[28]][[1]]
## character(0)
## 
## 
## [[29]]
## [[29]][[1]]
## character(0)
## 
## 
## [[30]]
## [[30]][[1]]
## character(0)
## 
## 
## [[31]]
## [[31]][[1]]
## character(0)

# This regex captures every five-character substring whose first, third, and fifth characters are identical; the second and fourth characters can be anything. The input string can be any length n, and one string can yield any number of matches (zero, one, two, three, etc.). Each match is made of five consecutive characters taken from the string itself.


# testing "(.)(.)(.).*\\3\\2\\1"
#output_results_reg_exp_4 <- lapply(test_strings, function(x) regmatches(x, gregexpr("(.)(.)(.).*\\3\\2\\1", x)))

# Return every test string that contains three characters followed, after any
# (possibly empty) run of characters, by the same three characters reversed.
# perl = TRUE selects the PCRE engine: with the default engine this pattern
# (".*" combined with back-references) missed strings whose match does not
# begin at the first character, e.g. "abacadddddddefege" ("ddd" then "ddd")
# and "ababacabcabc" ("bac" ... "cab").
# NOTE: this reuses (overwrites) the variable that held the previous regex's
# results.
output_results_reg_exp_4 <- grep("(.)(.)(.).*\\3\\2\\1", test_strings, value = TRUE, perl = TRUE)

print(output_results_reg_exp_4)

##  [1] "ababacabcabc"                                 
##  [2] "aaaaaaaaaabbbbbbbbb"                          
##  [3] "hellolleh"                                    
##  [4] "helloolleh"                                   
##  [5] "abacadddddddefege"                            
##  [6] "aaaaaa"                                       
##  [7] "aaabaaa"                                      
##  [8] "aaabbaaa"                                     
##  [9] "aaabcdefaaa"                                  
## [10] "aaa14rtfterweiotje4riyjw4tyjiow4yjwtpiojwtaaa"
## [11] "abcdcba"

# This regex needs a match that is at least 6 characters long: three captured characters, then any run of characters (possibly empty), then the same three characters in REVERSE order. For example, "abc" followed later by "cba" matches, but "abc" followed by "abc" does not. It does not matter which characters sit between the two triples or what order they are in. Because grep() is called with value = TRUE, the output shows each whole string that contains such a match, not just the matched part.
#

Construct regular expressions to match words that:

Start and end with the same character.

#Start and end with the same character.

# A word starts and ends with the same character when it is a single
# character, or when its first character (group 1) reappears as its last one.
# The ^ and $ anchors are essential: without them the pattern matches any
# string that merely CONTAINS a repeated character (e.g. "aabx"). The optional
# group lets a one-character word such as "a" match as well.
regex1 <- "^(.)(.*\\1)?$"

test_cases1 <- c("noon", "nonn", "aa", "a", "ab", "aba", "abba", "aabx", "arftkehjoegja", "ABCDEFGH")

# perl = TRUE selects the PCRE engine, which handles ".*" combined with
# back-references reliably.
output_results_regex1 <- grep(regex1, test_cases1, value = TRUE, perl = TRUE)

print(output_results_regex1)

## [1] "noon"          "nonn"          "aa"            "a"            
## [5] "aba"           "abba"          "arftkehjoegja"

Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)

# Two captured characters that occur again, in the same order, anywhere later
# in the string.
regex2 <- "(.)(.).*\\1\\2"

test_cases2 <- c("church", "abab", "hello", "abcabc", "mississippi", "aabbaabb")


# perl = TRUE selects the PCRE engine. With the default engine "mississippi"
# was not reported even though it repeats the pairs "is" and "ss".
output_results_regex2 <- grep(regex2, test_cases2, value = TRUE, perl = TRUE)

print(output_results_regex2)

## [1] "church"      "abab"        "mississippi" "abcabc"      "aabbaabb"

Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

test_cases3 <- c("eleven", "helllo", "mississippi", "applpe", "banana", "aabcbcdc")
# One captured character that shows up at least two more times later in the
# string, with anything (or nothing) in between the occurrences.
regex3 <- "(.).*\\1.*\\1"

# perl = TRUE selects the PCRE engine, which handles ".*" combined with
# back-references reliably.
output_results_regex3 <- grep(regex3, test_cases3, value = TRUE, perl = TRUE)
print(output_results_regex3)

## [1] "eleven"      "helllo"      "mississippi" "applpe"      "banana"     
## [6] "aabcbcdc"

#this one was complicated and I did use AI for it. Specifically Meta AI from whatsapp. I got three different solutions at first, where the first returned eleven, the second returned mississipp, and the third returned applpe, and my first version simply joined all three with "|": "((.)\\2\\1\\2)|(.)\\1.*\\1|(.)(.*\\4){2,}". In that combined regex the back-references in the first two alternatives do not point at the captured letter (\\1 sits inside group 1 itself, and the second alternative captures into group 3 but refers to \\1), so only the idea of the last alternative is needed: capture one character and require it to appear at least two more times. The regex above keeps just that idea and satisfies all of the test cases.

HW_Assignment_3_Ahmed_Hassan.RMD

Ahmed Hassan

2024-09-15