# How many characters does a piece of text contain?
sentence <- "Not a very long sentence."
nchar(x = sentence)
## [1] 25
# Pull out characters 1 through 3:
substr(x = sentence, start = 1, stop = 3)
## [1] "Not"
# substr() is vectorized: it is applied to every element of a vector
substr(
  x = c("good", "good riddance", "good on ya"),
  start = 1,
  stop = 4
)
## [1] "good" "good" "good"
# nchar() is vectorized as well: one count per element
nchar(x = c("hey", "hi", "how", "ya", "doin"))
## [1] 3 2 3 2 4
# Append a suffix to every element of a character vector.
# Note that paste() separates its pieces with a single space by default.
txt <- c("apple", "pear", "banana")
paste(txt, "-fruit", sep = " ")
## [1] "apple -fruit" "pear -fruit" "banana -fruit"
# The collapse argument joins all elements into one single string
paste(txt, collapse = "-")
## [1] "apple-pear-banana"
# Numbers and text can be mixed; the numbers are converted to text:
paste("Question", seq_len(3))
## [1] "Question 1" "Question 2" "Question 3"
# paste() can be of use to make new variables in a data frame,
# as in this example where we combine two factors to create a new one:
dataset <- read.csv("att.csv")
dataset$campaign <- with(dataset, paste(reachout, card, sep = "-"))
head(dataset$campaign)
## [1] "NO-NO" "NO-NO" "NO-YES" "NO-NO" "NO-NO" "NO-NO"
# Change the names of a data frame:
names(dataset) # first print the old names
## [1] "pick" "income" "moves" "age" "education"
## [6] "employment" "usage" "nonpub" "reachout" "card"
## [11] "campaign"
# Then change the names. The data frame has 11 columns at this point (the
# 'campaign' column was added above), so 11 names are supplied; with only
# 10 names the last column would end up with the name NA.
names(dataset) <- c(
  "pick1", "income1", "moves1", "age1", "education1",
  "employment1", "usage1", "nonpub1", "reachout1", "card1",
  "campaign1"
)
# Change only the first name (you can index names() just like you can a vector!)
names(dataset)[1] <- "provider"
# Find out which columns have particular names
match(c("reachout1", "card1"), names(dataset))
## [1] 9 10
# When reading a dataset, R automatically converts any column of numbers to
# numeric and, by default in R versions before 4.0, any other column to a
# factor. Sometimes we want to keep a variable as text. The argument
# "stringsAsFactors = FALSE" prevents the automatic conversion of character
# variables to factors:
# Read data, then convert the last variable ('card') to character
# (as.character() turns a factor into plain text; on a column that is
# already character it changes nothing)
dataset <- read.csv("att.csv")
dataset$card <- as.character(dataset$card)
# Read data, tell R to treat all non-numeric variables as character
dataset <- read.csv("att.csv", stringsAsFactors = FALSE)
# Make sure that 'pick' is really a character vector:
is.character(dataset$pick)
## [1] TRUE
# Check the types of all variables in the dataset.
# str() prints the structure itself and returns NULL invisibly, so it is
# called directly; wrapping it in print() would only add a stray "NULL" line.
str(dataset)
## 'data.frame': 1000 obs. of 10 variables:
## $ pick : chr "OCC" "ATT" "OCC" "OCC" ...
## $ income : chr "<7.5" "45-75" "" "" ...
## $ moves : chr "0" "2" "0" "2" ...
## $ age : chr "35-44" "25-34" "" "65+" ...
## $ education : chr "HS" "HS" "" "<HS" ...
## $ employment: chr "F" "F" "" "R" ...
## $ usage : int 9 2 6 7 0 0 3 1 0 2 ...
## $ nonpub : chr "YES" "YES" "NO" "NO" ...
## $ reachout : chr "NO" "NO" "NO" "NO" ...
## $ card : chr "NO" "NO" "YES" "NO" ...
# Create a new variable to use with the pattern-matching functions below
dataset$campaign <- with(dataset, paste(reachout, card, sep = "-"))
# Append the row number to each value. seq_len(nrow(dataset)) always matches
# the number of rows, instead of assuming the file has exactly 1000 of them.
dataset$campaign <- paste(dataset$campaign, seq_len(nrow(dataset)))
# Extract campaign (for convenience).
campaign <- dataset$campaign
# Find the observations that have "YES-YES" in them.
# grep() returns the index of values that contain 'YES-YES'
grep("YES-YES", campaign)
## [1] 31 235 362 410 424 441 471 484 489 537 552 575 599 604 605 625 626
## [18] 678 721 749 770 773 793 817 845 883 887 891 931 935
# grepl("YES-YES", campaign) # returns TRUE or FALSE for every element instead
# That result just gives you the indices of the vector that have "YES-YES" in them.
# These are the corresponding values:
campaign[grep("YES-YES", campaign)]
## [1] "YES-YES 31" "YES-YES 235" "YES-YES 362" "YES-YES 410" "YES-YES 424"
## [6] "YES-YES 441" "YES-YES 471" "YES-YES 484" "YES-YES 489" "YES-YES 537"
## [11] "YES-YES 552" "YES-YES 575" "YES-YES 599" "YES-YES 604" "YES-YES 605"
## [16] "YES-YES 625" "YES-YES 626" "YES-YES 678" "YES-YES 721" "YES-YES 749"
## [21] "YES-YES 770" "YES-YES 773" "YES-YES 793" "YES-YES 817" "YES-YES 845"
## [26] "YES-YES 883" "YES-YES 887" "YES-YES 891" "YES-YES 931" "YES-YES 935"
# Now find the campaigns whose value starts with "YES-YES 3".
# The ^ symbol is part of a 'regular expression', it indicates 'starts with':
grep("^YES-YES 3", campaign)
## [1] 31 362
# Or end with "29"
# The $ symbol is part of a 'regular expression', and indicates 'ends with':
grep("29$", campaign)
## [1] 29 129 229 329 429 529 629 729 829 929
# grepl() returns TRUE where the value starts with "YES-YES", FALSE otherwise
dataset$twoCampaign <- grepl("^YES-YES", campaign)
# Quick summary:
summary(dataset$twoCampaign)
## Mode FALSE TRUE
## logical 970 30