A. Working with Strings

Regular expressions

fruits <- c('Apple', 'Banana', 'Orange', 'Grape', 'Pineapple', 'Kiwi', 'Peach', 'Mango', 'Strawberry', 'Guava', 'Cherry', 'Apple', 'banana')

fruits =="Banana" #exact match
##  [1] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE
which(fruits == "Banana") #use which to get position
## [1] 2
breakfast = c("Apple", "Banana", "Apple", "banana")
fruits %in% breakfast 
##  [1]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [13]  TRUE
match(breakfast, fruits) #notice that only the first match is returned
## [1]  1  2  1 13
fruits == "bana" #how do we search for pattern?
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE

grep: Identifying strings that match a particular criterion

grep(pattern = 'bana', fruits, value = TRUE, ignore.case = TRUE)
## [1] "Banana" "banana"

gsub: Replacing instances of a string with another of your choosing

gsub(pattern = "Ch", replacement = "B", fruits[11])
## [1] "Berry"
breakfast.ab = c("App", "bana") 
pmatch(breakfast.ab, fruits) #notice that Apple is not unique so it won't work
## [1] NA 13
grep("nana", fruits) #grep works but one pattern at a time
## [1]  2 13

gregexpr: Identifying positions of a string of interest

(positions_a <- gregexpr(pattern = "a", text = fruits, ignore.case = TRUE))
## [[1]]
## [1] 1
## attr(,"match.length")
## [1] 1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
## 
## [[2]]
## [1] 2 4 6
## attr(,"match.length")
## [1] 1 1 1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
## 
## [[3]]
## [1] 3
## attr(,"match.length")
## [1] 1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
## 
## [[4]]
## [1] 3
## attr(,"match.length")
## [1] 1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
## 
## [[5]]
## [1] 5
## attr(,"match.length")
## [1] 1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
## 
## [[6]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
## 
## [[7]]
## [1] 3
## attr(,"match.length")
## [1] 1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
## 
## [[8]]
## [1] 2
## attr(,"match.length")
## [1] 1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
## 
## [[9]]
## [1] 4
## attr(,"match.length")
## [1] 1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
## 
## [[10]]
## [1] 3 5
## attr(,"match.length")
## [1] 1 1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
## 
## [[11]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
## 
## [[12]]
## [1] 1
## attr(,"match.length")
## [1] 1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
## 
## [[13]]
## [1] 2 4 6
## attr(,"match.length")
## [1] 1 1 1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE

positions_a

Paste

Use separators to combine strings with other characters or numbers

paste("X", 1:5, sep = ".")
## [1] "X.1" "X.2" "X.3" "X.4" "X.5"

Use collapse to combine multiple string outputs together

paste("X", 1:5, sep = ".", collapse = "")
## [1] "X.1X.2X.3X.4X.5"

paste0 is the same as paste with an empty separator

paste0("X", 1:5)
## [1] "X1" "X2" "X3" "X4" "X5"

NOTE: paste0 does not have a “sep” option that can be modified by the user; an argument named sep is simply treated as one more string to concatenate

paste0("a", "b", sep = "c") == paste0("a", "b", "c")

paste0(rep(c("A","C","G","T"), each=4), c("A","C","G","T"), collapse = "")

Extra material: Other string manipulations

Changing the case of strings

string1 <- 'Data Science'
tolower(string1) 
## [1] "data science"
toupper(string1)
## [1] "DATA SCIENCE"

Exercise 1: Exercise with Strings

Create this string ‘A&1B&2C&3’ using a paste function

paste(c("A","B","C"), 1:3, sep = "&", collapse = "")
## [1] "A&1B&2C&3"

B. Working with Dates

dates <- c('11/14/2011', '12/04/2012', '03/01/2013', '02/09/2019')
class(dates)
## [1] "character"
real_dates <- as.Date(dates, format = '%m/%d/%Y')
class(real_dates)
## [1] "Date"
other_format <- format(real_dates, '%A %B %d, %Y')
class(other_format)
## [1] "character"

For the codes used to identify and format dates:

?strptime
## starting httpd help server ... done

Identifying how long ago dates occurred

today <- Sys.Date()

(dif <- today - real_dates)
## Time differences in days
## [1] 4111 3725 3638 1467
class(dif)
## [1] "difftime"

To compute a time difference in a particular unit of interest, use difftime

difftime(today, real_dates, units = "hours")
## Time differences in hours
## [1] 98664 89400 87312 35208

Extra Materials: Lubridate

The lubridate package contains a powerful set of tools that can be used to extract and interact with dates.

There are functions like mdy that allow for simpler extraction of date information from strings.

#install.packages("lubridate")
library(lubridate, quietly = TRUE)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
(lubri_dates <- mdy(dates))
## [1] "2011-11-14" "2012-12-04" "2013-03-01" "2019-02-09"

Extracting specific date information from a date object

year(lubri_dates)
## [1] 2011 2012 2013 2019
month(lubri_dates)
## [1] 11 12  3  2
day(lubri_dates)
## [1] 14  4  1  9

Exercise 2: Exercise with Dates

  1. Take the following date (November 11, 2011) and turn it into a date vector in R
  2. Display the date vector in the format ‘month.day.year’