data 607 assignment 3

library(plyr)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange()   masks plyr::arrange()
## ✖ purrr::compact()   masks plyr::compact()
## ✖ dplyr::count()     masks plyr::count()
## ✖ dplyr::desc()      masks plyr::desc()
## ✖ dplyr::failwith()  masks plyr::failwith()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::id()        masks plyr::id()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::mutate()    masks plyr::mutate()
## ✖ dplyr::rename()    masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)
library(ggpubr)

## 
## Attaching package: 'ggpubr'
## 
## The following object is masked from 'package:plyr':
## 
##     mutate

library (readr)
library(RCurl)

## 
## Attaching package: 'RCurl'
## 
## The following object is masked from 'package:tidyr':
## 
##     complete

library(rvest)

## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding

Introduction:

Loaded what was believed to be relevant libraries.
Homework questions and answers:
Question 1
Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors that contain either “DATA” or “STATISTICS”

# Download the FiveThirtyEight majors list as raw CSV text. The text is stored
# in `majors_csv` rather than `c` so the name of base R's c() is not reused.
majors_csv <- getURL("https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv")
# Parse the CSV text; read.csv() already returns a data frame, so no extra
# data.frame() wrapper is needed.
major_list_df <- read.csv(text = majors_csv)
# Put the second column (the major names) into a list, one major per element.
major_list <- as.list(major_list_df[[2]])
glimpse(major_list)

## List of 174
##  $ : chr "GENERAL AGRICULTURE"
##  $ : chr "AGRICULTURE PRODUCTION AND MANAGEMENT"
##  $ : chr "AGRICULTURAL ECONOMICS"
##  $ : chr "ANIMAL SCIENCES"
##  $ : chr "FOOD SCIENCE"
##  $ : chr "PLANT SCIENCE AND AGRONOMY"
##  $ : chr "SOIL SCIENCE"
##  $ : chr "MISCELLANEOUS AGRICULTURE"
##  $ : chr "FORESTRY"
##  $ : chr "NATURAL RESOURCES MANAGEMENT"
##  $ : chr "FINE ARTS"
##  $ : chr "DRAMA AND THEATER ARTS"
##  $ : chr "MUSIC"
##  $ : chr "VISUAL AND PERFORMING ARTS"
##  $ : chr "COMMERCIAL ART AND GRAPHIC DESIGN"
##  $ : chr "FILM VIDEO AND PHOTOGRAPHIC ARTS"
##  $ : chr "STUDIO ARTS"
##  $ : chr "MISCELLANEOUS FINE ARTS"
##  $ : chr "ENVIRONMENTAL SCIENCE"
##  $ : chr "BIOLOGY"
##  $ : chr "BIOCHEMICAL SCIENCES"
##  $ : chr "BOTANY"
##  $ : chr "MOLECULAR BIOLOGY"
##  $ : chr "ECOLOGY"
##  $ : chr "GENETICS"
##  $ : chr "MICROBIOLOGY"
##  $ : chr "PHARMACOLOGY"
##  $ : chr "PHYSIOLOGY"
##  $ : chr "ZOOLOGY"
##  $ : chr "NEUROSCIENCE"
##  $ : chr "MISCELLANEOUS BIOLOGY"
##  $ : chr "COGNITIVE SCIENCE AND BIOPSYCHOLOGY"
##  $ : chr "GENERAL BUSINESS"
##  $ : chr "ACCOUNTING"
##  $ : chr "ACTUARIAL SCIENCE"
##  $ : chr "BUSINESS MANAGEMENT AND ADMINISTRATION"
##  $ : chr "OPERATIONS LOGISTICS AND E-COMMERCE"
##  $ : chr "BUSINESS ECONOMICS"
##  $ : chr "MARKETING AND MARKETING RESEARCH"
##  $ : chr "FINANCE"
##  $ : chr "HUMAN RESOURCES AND PERSONNEL MANAGEMENT"
##  $ : chr "INTERNATIONAL BUSINESS"
##  $ : chr "HOSPITALITY MANAGEMENT"
##  $ : chr "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
##  $ : chr "MISCELLANEOUS BUSINESS & MEDICAL ADMINISTRATION"
##  $ : chr "COMMUNICATIONS"
##  $ : chr "JOURNALISM"
##  $ : chr "MASS MEDIA"
##  $ : chr "ADVERTISING AND PUBLIC RELATIONS"
##  $ : chr "COMMUNICATION TECHNOLOGIES"
##  $ : chr "COMPUTER AND INFORMATION SYSTEMS"
##  $ : chr "COMPUTER PROGRAMMING AND DATA PROCESSING"
##  $ : chr "COMPUTER SCIENCE"
##  $ : chr "INFORMATION SCIENCES"
##  $ : chr "COMPUTER ADMINISTRATION MANAGEMENT AND SECURITY"
##  $ : chr "COMPUTER NETWORKING AND TELECOMMUNICATIONS"
##  $ : chr "MATHEMATICS"
##  $ : chr "APPLIED MATHEMATICS"
##  $ : chr "STATISTICS AND DECISION SCIENCE"
##  $ : chr "MATHEMATICS AND COMPUTER SCIENCE"
##  $ : chr "GENERAL EDUCATION"
##  $ : chr "EDUCATIONAL ADMINISTRATION AND SUPERVISION"
##  $ : chr "SCHOOL STUDENT COUNSELING"
##  $ : chr "ELEMENTARY EDUCATION"
##  $ : chr "MATHEMATICS TEACHER EDUCATION"
##  $ : chr "PHYSICAL AND HEALTH EDUCATION TEACHING"
##  $ : chr "EARLY CHILDHOOD EDUCATION"
##  $ : chr "SCIENCE AND COMPUTER TEACHER EDUCATION"
##  $ : chr "SECONDARY TEACHER EDUCATION"
##  $ : chr "SPECIAL NEEDS EDUCATION"
##  $ : chr "SOCIAL SCIENCE OR HISTORY TEACHER EDUCATION"
##  $ : chr "TEACHER EDUCATION: MULTIPLE LEVELS"
##  $ : chr "LANGUAGE AND DRAMA EDUCATION"
##  $ : chr "ART AND MUSIC EDUCATION"
##  $ : chr "MISCELLANEOUS EDUCATION"
##  $ : chr "LIBRARY SCIENCE"
##  $ : chr "ARCHITECTURE"
##  $ : chr "GENERAL ENGINEERING"
##  $ : chr "AEROSPACE ENGINEERING"
##  $ : chr "BIOLOGICAL ENGINEERING"
##  $ : chr "ARCHITECTURAL ENGINEERING"
##  $ : chr "BIOMEDICAL ENGINEERING"
##  $ : chr "CHEMICAL ENGINEERING"
##  $ : chr "CIVIL ENGINEERING"
##  $ : chr "COMPUTER ENGINEERING"
##  $ : chr "ELECTRICAL ENGINEERING"
##  $ : chr "ENGINEERING MECHANICS PHYSICS AND SCIENCE"
##  $ : chr "ENVIRONMENTAL ENGINEERING"
##  $ : chr "GEOLOGICAL AND GEOPHYSICAL ENGINEERING"
##  $ : chr "INDUSTRIAL AND MANUFACTURING ENGINEERING"
##  $ : chr "MATERIALS ENGINEERING AND MATERIALS SCIENCE"
##  $ : chr "MECHANICAL ENGINEERING"
##  $ : chr "METALLURGICAL ENGINEERING"
##  $ : chr "MINING AND MINERAL ENGINEERING"
##  $ : chr "NAVAL ARCHITECTURE AND MARINE ENGINEERING"
##  $ : chr "NUCLEAR ENGINEERING"
##  $ : chr "PETROLEUM ENGINEERING"
##  $ : chr "MISCELLANEOUS ENGINEERING"
##  $ : chr "ENGINEERING TECHNOLOGIES"
##   [list output truncated]

#puts the majors column into a list of majors

Question 1 continued

The CSV file was downloaded from the web and read into a data frame. The majors column of the data frame was then converted into a list.

“provide code that identifies the majors that contain either ‘DATA’ or ‘STATISTICS’”

# Majors whose name contains "data" (case-insensitive). value = TRUE returns
# the matching names themselves rather than their positions.
data_maj <- grep(pattern = "data", major_list, value = TRUE, ignore.case = TRUE)
# Majors whose name contains "statistics" (case-insensitive).
stats_maj <- grep(pattern = "statistics", major_list, value = TRUE, ignore.case = TRUE)
# Combine both sets. unique() keeps a major that matches both patterns from
# being listed twice.
data_or_stat <- unique(c(data_maj, stats_maj))
print(data_or_stat)

## [1] "COMPUTER PROGRAMMING AND DATA PROCESSING"     
## [2] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [3] "STATISTICS AND DECISION SCIENCE"

Question1 continued

The data majors were pulled from the list, then the statistics majors were pulled. The two results were then combined into one vector.

Question2:

Write code that transforms the data below
[1] “bell pepper” “bilberry” “blackberry” “blood orange” [5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”
[9] “elderberry” “lime” “lychee” “mulberry”
[13] “olive” “salal berry”

Into a format like this: c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)

# Expected result for Question 2, typed out by hand so that each attempt can
# be compared against it.
answer <- c(
  "bell pepper", "bilberry", "blackberry", "blood orange", "blueberry",
  "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime",
  "lychee", "mulberry", "olive", "salal berry"
)
print(answer)

##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
##  [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
## [11] "lychee"       "mulberry"     "olive"        "salal berry"

#the answer to compare results to

# Attempt 1: treat the "data" as one string that is a copy of the console
# printout, including the "[1]", "[5]", ... index markers and the quotes.
fruity <- '[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" [5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"   [9] "elderberry"   "lime"         "lychee"       "mulberry"     [13] "olive"        "salal berry"'
print(fruity)

## [1] "[1] \"bell pepper\"  \"bilberry\"     \"blackberry\"   \"blood orange\" [5] \"blueberry\"    \"cantaloupe\"   \"chili pepper\" \"cloudberry\"   [9] \"elderberry\"   \"lime\"         \"lychee\"       \"mulberry\"     [13] \"olive\"        \"salal berry\""

# Remove only the index markers, i.e. digits enclosed in square brackets.
# Matching the whole "[n]" token leaves digits inside a value untouched and
# does not depend on the digits having been stripped beforehand.
fruity2 <- gsub("\\[[0-9]+\\]", "", fruity)
# A closing quote is the only quote followed by a space, so turning that pair
# into quote + comma puts a comma after every item except the last.
fruity_comma <- gsub('" ', '",', fruity2, fixed = TRUE)
# Drop the quote characters; the items are now separated by commas only.
fruity_comma1 <- gsub('"', "", fruity_comma, fixed = TRUE)
print(fruity_comma1)

## [1] " bell pepper, bilberry,    blackberry,  blood orange, blueberry,   cantaloupe,  chili pepper,cloudberry,   elderberry,  lime,        lychee,      mulberry,     olive,       salal berry"

#check quotes were removed
# Split on the commas and keep the pieces as a list, one item per element.
fruity_list <- as.list(strsplit(fruity_comma1, ",", fixed = TRUE)[[1]])
# Flatten the list to a character vector explicitly (instead of relying on
# trimws() coercing it) and strip the leading/trailing padding of each item.
fruity_list2 <- trimws(unlist(fruity_list, use.names = FALSE))
print(fruity_list2)

##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
##  [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
## [11] "lychee"       "mulberry"     "olive"        "salal berry"

#check list

# Keep the string-based result under the name used for the comparison.
attempt1 <- fruity_list2
# Element-wise comparison with the expected vector: one TRUE/FALSE per fruit.
print(attempt1 == answer)

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

#list successfully matches answer

# Attempt 2: assume the data arrives as a data frame. Each unnamed argument
# becomes its own column, giving a single row with one fruit per column.
fruity1 <- data.frame(
  "bell pepper", "bilberry", "blackberry", "blood orange", "blueberry",
  "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime",
  "lychee", "mulberry", "olive", "salal berry"
)

# Transpose so the single row becomes a column, then drop the matrix
# attributes to get a plain character vector in left-to-right order.
fruitys <- as.vector(t(fruity1))
# Store the result under the name used for the comparison with `answer`.
attempt2 <- fruitys
print(attempt2)

##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
##  [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
## [11] "lychee"       "mulberry"     "olive"        "salal berry"

# Element-wise comparison of the data-frame-based result with the expected
# vector: one TRUE/FALSE per fruit.
print(attempt2 == answer)

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

#list successfully matches answer

Question 3:

Describe, in words, what these expressions will match: (.)\1\1
(.) captures any single character and each \1 must match that same character again, so this is equivalent to (.)(.)(.) with all three characters equal. It matches three identical characters in a row, such as “aaa” or “111”.
“(.)(.)\2\1”
returns parts of a string that has this pattern (1char)(2char)(2char)(1char) like “abba”
(..)\1
returns any two characters that repeat together (char1char2)(char1char2) like “toto” or “1111”
“(.).\1.\1”
returns three of the same character with any character in between each of the same characters or char1 anycharacter char1 anycharacter char1 like “c1chc”
“(.)(.)(.).*\3\2\1”
returns any three characters followed by any length string of characters followed by the same initial three characters in reverse order char1 char2 char3 anycharacters char3 char2 char1 such as “123abcde321”

# Sample strings for trying out the backreference patterns.
randomstr <- c(
  "aaa", "a111b", "fjdkfjd", "ckkc", "ckck", "abeethcba", "abbae1221",
  "tttt1t1t1", "b3b6b567", "abb222bba222", "abc123456cba"
)
# First expression: one character followed by two more copies of itself.
# The pattern is stored once so the searched and printed text cannot differ.
triple_char_re <- '(.)\\1\\1'
x <- str_view(randomstr, triple_char_re)
print(triple_char_re)

## [1] "(.)\\1\\1"

print(x)

##  [1] │ <aaa>
##  [2] │ a<111>b
##  [8] │ <ttt>t1t1t1
## [10] │ abb<222>bba<222>

#examples of the first expression
# Second expression: two characters followed by the same two in reverse order.
mirrored_pair_re <- "(.)(.)\\2\\1"
x1 <- str_view(randomstr, mirrored_pair_re)
print(mirrored_pair_re)

## [1] "(.)(.)\\2\\1"

print(x1)

## [4] │ <ckkc>
## [7] │ <abba>e<1221>
## [8] │ <tttt>1t1t1

#examples of the second expression
# Third expression: a two-character sequence that is immediately repeated.
doubled_pair_re <- "(..)\\1"
x2 <- str_view(randomstr, doubled_pair_re)
print(doubled_pair_re)

## [1] "(..)\\1"

print(x2)

## [5] │ <ckck>
## [8] │ <tttt><1t1t>1

#examples of the third expression
# Fourth expression: the same character in the 1st, 3rd and 5th position of a
# five-character run, with any character in the 2nd and 4th position.
alternating_re <- "(.).\\1.\\1"
x3 <- str_view(randomstr, alternating_re)
print(alternating_re)

## [1] "(.).\\1.\\1"

print(x3)

## [8] │ t<ttt1t>1t1
## [9] │ <b3b6b>567

#examples of the fourth expression
# Fifth expression: three characters, then any run of characters (possibly
# empty), then the same three characters in reverse order.
mirrored_triple_re <- "(.)(.)(.).*\\3\\2\\1"
x4 <- str_view(randomstr, mirrored_triple_re)
print(mirrored_triple_re)

## [1] "(.)(.)(.).*\\3\\2\\1"

print(x4)

## [10] │ <abb222bba>222
## [11] │ <abc123456cba>

#examples of the fifth expression

Question 4:

Construct regular expressions to match words that:
Start and end with the same character.
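“^(.).*\1$” matches strings of two or more characters that start and end with the same character. (A one-character string is not matched, because (.) and \1 must each consume a character.)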

Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)
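“(.*)(\w)(\w).*\2\3.*” matches a string of any length in which a pair of adjacent word characters appears again later in the string. Note that \w also matches digits and underscores, not only letters.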

Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)
“(.*)(\w).*\2.*\2.*” matches a string of any length in which the same word character (letter, digit or underscore) occurs in at least three places anywhere in the word.

# Sample strings for the Question 4 patterns: the Question 3 set plus three
# ordinary words.
randomstr <- c(
  "aaa", "a111b", "fjdkfjd", "ckkc", "ckck", "abeethcba", "abbae1221",
  "tttt1t1t1", "b3b6b567", "abb222bba222", "abc123456cba",
  "church", "eleven", "beeen"
)
# Whole string whose first character is also its last character.
same_ends_re <- "^(.).*\\1$"
x5 <- str_view(randomstr, same_ends_re)
print(same_ends_re)

## [1] "^(.).*\\1$"

print(x5)

##  [1] │ <aaa>
##  [4] │ <ckkc>
##  [6] │ <abeethcba>
## [11] │ <abc123456cba>

# A pair of adjacent word characters (groups 2 and 3) that occurs again later
# in the string; the leading and trailing .* extend the match to the whole
# string.
repeated_pair_re <- "(.*)(\\w)(\\w).*\\2\\3.*"
x6 <- str_view(randomstr, repeated_pair_re)
print(repeated_pair_re)

## [1] "(.*)(\\w)(\\w).*\\2\\3.*"

print(x6)

##  [3] │ <fjdkfjd>
##  [5] │ <ckck>
##  [8] │ <tttt1t1t1>
## [10] │ <abb222bba222>
## [12] │ <church>

# One word character (group 2) that occurs at least two more times later in
# the string, with anything in between.
thrice_re <- "(.*)(\\w).*\\2.*\\2.*"
x7 <- str_view(randomstr, thrice_re)
print(thrice_re)

## [1] "(.*)(\\w).*\\2.*\\2.*"

print(x7)

##  [1] │ <aaa>
##  [2] │ <a111b>
##  [8] │ <tttt1t1t1>
##  [9] │ <b3b6b567>
## [10] │ <abb222bba222>
## [13] │ <eleven>
## [14] │ <beeen>

Conclusion:

Answered questions pertaining to selecting segments from strings.

data 607 assignment 3

Keith DeNivo

2024-02-10