DATA 607 HW #3

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#1

# Pull the FiveThirtyEight college-majors lookup table straight from GitHub.
majors_list <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
)

# A major qualifies when its name contains either word. The match below is
# case-insensitive, so the upper-case spelling here is only a convention.
pattern <- "DATA|STATISTICS"

# One TRUE/FALSE per row of majors_list, flagging names that match the pattern.
matching_rows <- grepl(
  pattern,
  x = majors_list$Major,
  ignore.case = TRUE
)

# Keep all columns, but only the flagged rows.
majors <- majors_list[matching_rows, ]

print(majors)

##    FOD1P                                         Major          Major_Category
## 44  6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS                Business
## 52  2101      COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 59  3702               STATISTICS AND DECISION SCIENCE Computers & Mathematics

#2


# Raw console printout of a character vector, with the index markers
# ([1], [5], ...) and line breaks still embedded in the text.
strIn <- '[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"
[5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  
[9] "elderberry"   "lime"         "lychee"       "mulberry"    
[13] "olive"        "salal berry"'

# Locate every run of non-quote characters that sits between double quotes.
matches <- gregexpr('"[^"]+"', strIn)

# Pull out the matched substrings (quotes still attached). regmatches() returns
# a one-element list for the single input string, so flatten it to a vector.
extracted_strings <- unlist(regmatches(strIn, matches), use.names = FALSE)

# Join the quoted pieces into one string, separated by ", ".
strOut <- toString(extracted_strings)

print(strOut)

## [1] "\"bell pepper\", \"bilberry\", \"blackberry\", \"blood orange\", \"blueberry\", \"cantaloupe\", \"chili pepper\", \"cloudberry\", \"elderberry\", \"lime\", \"lychee\", \"mulberry\", \"olive\", \"salal berry\""

# Strip every double-quote character from strOut; the ", " separators that
# strOut already contains are left in place.
commas <- gsub("\"", "", strOut, fixed = TRUE)

print(commas)

## [1] "bell pepper, bilberry, blackberry, blood orange, blueberry, cantaloupe, chili pepper, cloudberry, elderberry, lime, lychee, mulberry, olive, salal berry"

# Second quote-stripping pass. `commas` has already had its double quotes
# removed, so this leaves the text unchanged and simply binds it to `quotes`.
quotes <- gsub("\"", "", commas, fixed = TRUE)

print(quotes)

## [1] "bell pepper, bilberry, blackberry, blood orange, blueberry, cantaloupe, chili pepper, cloudberry, elderberry, lime, lychee, mulberry, olive, salal berry"

# Break the comma-separated text into pieces and store each piece as its own
# list element. Splitting on a bare "," keeps the space that followed each
# comma, so every element after the first starts with a blank.
# NOTE(review): the name `list` shadows base::list(); it is kept because a
# later step reads this variable.
list <- quotes %>%
  str_split(pattern = ",") %>%
  .[[1]] %>%
  as.list()

print(list)

## [[1]]
## [1] "bell pepper"
## 
## [[2]]
## [1] " bilberry"
## 
## [[3]]
## [1] " blackberry"
## 
## [[4]]
## [1] " blood orange"
## 
## [[5]]
## [1] " blueberry"
## 
## [[6]]
## [1] " cantaloupe"
## 
## [[7]]
## [1] " chili pepper"
## 
## [[8]]
## [1] " cloudberry"
## 
## [[9]]
## [1] " elderberry"
## 
## [[10]]
## [1] " lime"
## 
## [[11]]
## [1] " lychee"
## 
## [[12]]
## [1] " mulberry"
## 
## [[13]]
## [1] " olive"
## 
## [[14]]
## [1] " salal berry"

# Trim the leading blank left on every element after the first so the result
# matches the expected answer. The list is flattened first so trimws() receives
# a plain character vector, which is also what it returns.
final <- trimws(unlist(list), which = "both")

print(final)

##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
##  [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
## [11] "lychee"       "mulberry"     "olive"        "salal berry"

# Reference answer that the cleaned vector is expected to reproduce.
y <- c(
  "bell pepper", "bilberry", "blackberry", "blood orange", "blueberry",
  "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime",
  "lychee", "mulberry", "olive", "salal berry"
)

# Element-by-element comparison: one TRUE per position where the computed
# vector agrees with the reference answer.
y == final

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

#3

#(.)\1\1 = a character that appears three times in a row such as aaa
#"(.)(.)\\2\\1" = two characters followed by the same two characters in reverse order, such as 1221
#(..)\1 = any two characters immediately repeated, such as 1212
#"(.).\\1.\\1" = a character, then any character, the original character again, any character, and then the original character once more, such as 12131
#"(.)(.)(.).*\\3\\2\\1" = three characters, then zero or more of any characters, and then the same three characters in reverse order,
# such as 1234321


#4


# Words whose first and last characters are the same. The alternation has two
# arms: ".*\\1$" covers words where the captured first character reappears at
# the very end, and "\\1?$" covers a one-character word (or that character
# doubled).
# NOTE(review): `words` is not defined in this file; presumably it is the word
# vector shipped with stringr. Confirm.
words %>%
  str_subset(pattern = "^(.)((.*\\1$)|\\1?$)")

##  [1] "a"          "america"    "area"       "dad"        "dead"      
##  [6] "depend"     "educate"    "else"       "encourage"  "engine"    
## [11] "europe"     "evidence"   "example"    "excuse"     "exercise"  
## [16] "expense"    "experience" "eye"        "health"     "high"      
## [21] "knock"      "level"      "local"      "nation"     "non"       
## [26] "rather"     "refer"      "remember"   "serious"    "stairs"    
## [31] "test"       "tonight"    "transport"  "treat"      "trust"     
## [36] "window"     "yesterday"

# Words in which some pair of adjacent letters occurs again later in the word,
# e.g. the "ch" in "church".
words %>%
  str_subset(pattern = "([A-Za-z][A-Za-z]).*\\1")

##  [1] "appropriate" "church"      "condition"   "decide"      "environment"
##  [6] "london"      "paragraph"   "particular"  "photograph"  "prepare"    
## [11] "pressure"    "remember"    "represent"   "require"     "sense"      
## [16] "therefore"   "understand"  "whether"

# Words in which a single lower-case letter occurs at least three times, not
# necessarily next to each other, e.g. the three "e"s in "eleven".
words %>%
  str_subset(pattern = "([a-z]).*\\1.*\\1")

##  [1] "appropriate" "available"   "believe"     "between"     "business"   
##  [6] "degree"      "difference"  "discuss"     "eleven"      "environment"
## [11] "evidence"    "exercise"    "expense"     "experience"  "individual" 
## [16] "paragraph"   "receive"     "remember"    "represent"   "telephone"  
## [21] "therefore"   "tomorrow"

DATA 607 HW #3

2024-02-11