Tidyverse vignette with stringr

The stringr package is the package of choice for working with character strings in Tidyverse.

We will be using the pipe operator `%>%` throughout this vignette.

library(tidyverse)

Load the data into R. The data set contains information about breweries in the United States and was originally published on Kaggle: https://www.kaggle.com/brkurzawa/us-breweries/data . The code below reads a CSV copy of it hosted on GitHub.

# Read the brewery CSV from its GitHub location and preview the first rows.
url <- "https://raw.githubusercontent.com/Vthomps000/DATA607_VT/master/beer.csv"
beer <- read.csv(file = url)
head(beer)
##            brewery_name         type
## 1    Valley Brewing Co.      Brewpub
## 2    Valley Brewing Co.      Brewpub
## 3     Valley Brewing Co Microbrewery
## 4 Ukiah Brewing Company      Brewpub
## 5    Tustin Brewing Co.      Brewpub
## 6       Trumer Brauerei Microbrewery
##                                              address
## 1           PO Box 4653, Stockton, California, 95204
## 2         157 Adams St., Stockton, California, 95204
## 3       1950 W Freemont, Stockton, California, 95203
## 4         102 S. State St., Ukiah, California, 95482
## 5 13011 Newport Ave. #100, Tustin, California, 92780
## 6          1404 4th St., Berkeley, California, 94608
##                                website      state state_breweries
## 1           http://www.valleybrew.com/ california             284
## 2           http://www.valleybrew.com/ california             284
## 3           http://www.valleybrew.com/ california             284
## 4       http://www.ukiahbrewingco.com/ california             284
## 5        http://www.tustinbrewery.com/ california             284
## 6 http://www.trumer-international.com/ california             284

Detecting Matches

Detecting pattern matches in a string using str_detect, which returns a logical (TRUE/FALSE) value for each element.

# One logical value per address: TRUE where the address contains the text "PO".
b0 <- str_detect(beer$address, "PO")
head(b0)
## [1]  TRUE FALSE FALSE FALSE FALSE FALSE
# Use the logical vector b0 to keep the matching rows, then show the first few.
beer[b0, ] %>% head()
##                   brewery_name            type
## 1           Valley Brewing Co.         Brewpub
## 28  Spanish Peaks Brewing Co.  ContractBrewery
## 33        Snowshoe Brewing Co.         Brewpub
## 153      Half Moon Bay Brewing         Brewpub
## 196                  Chau Tien ContractBrewery
## 251           Bison Brewing Co    Microbrewery
##                                        address                         website
## 1     PO Box 4653, Stockton, California, 95204      http://www.valleybrew.com/
## 28    PO Box 820, King City, California, 93930    http://www.blackdogales.com/
## 33      PO Box 2224, Arnold, California, 95223 http://www.snowshoebrewing.com/
## 153 PO Box 879, El Granada , California, 94018    http://www.hmbbrewingco.com/
## 196   PO Box 221185, Carmel, California, 93923         http://www.paleale.com/
## 251   PO Box 4821, Berkeley, California, 94704       http://www.bisonbrew.com/
##          state state_breweries
## 1   california             284
## 28  california             284
## 33  california             284
## 153 california             284
## 196 california             284
## 251 california             284

Detecting matches using str_which, which returns the integer positions of the matching elements instead of a logical vector.

# Integer positions of the addresses that contain the text "PO".
b1 <- str_which(beer$address, "PO")
head(b1)
## [1]   1  28  33 153 196 251

We obtain the same result as above.

# Index the rows by the positions in b1, then show the first few.
beer[b1, ] %>% head()
##                   brewery_name            type
## 1           Valley Brewing Co.         Brewpub
## 28  Spanish Peaks Brewing Co.  ContractBrewery
## 33        Snowshoe Brewing Co.         Brewpub
## 153      Half Moon Bay Brewing         Brewpub
## 196                  Chau Tien ContractBrewery
## 251           Bison Brewing Co    Microbrewery
##                                        address                         website
## 1     PO Box 4653, Stockton, California, 95204      http://www.valleybrew.com/
## 28    PO Box 820, King City, California, 93930    http://www.blackdogales.com/
## 33      PO Box 2224, Arnold, California, 95223 http://www.snowshoebrewing.com/
## 153 PO Box 879, El Granada , California, 94018    http://www.hmbbrewingco.com/
## 196   PO Box 221185, Carmel, California, 93923         http://www.paleale.com/
## 251   PO Box 4821, Berkeley, California, 94704       http://www.bisonbrew.com/
##          state state_breweries
## 1   california             284
## 28  california             284
## 33  california             284
## 153 california             284
## 196 california             284
## 251 california             284

Mutating Strings

Modifying/mutating strings.

# Positions of brewery names that contain the standalone word "Inc" in any
# letter case, with or without a trailing period.
# The word boundaries (\\b) stop the pattern from matching "inc" inside longer
# words such as "Lincoln", and the period is escaped so it is a literal dot
# rather than the regex "any character" wildcard.
inc <- beer$brewery_name %>%
  str_which(regex("\\binc\\b\\.?", ignore_case = TRUE))
head(inc)
## [1]  94 108 177 273 278 343
# Brewery names found at the row positions stored in inc.
t1 <- beer %>%
  select(brewery_name) %>%
  slice(inc)
head(t1)
##                                   brewery_name
## 1                  New English Brewing Co. Inc
## 2                        Marauder Brewing, Inc
## 3                   Etna Brewing Company, Inc.
## 4             Anheuser-Busch Inc.- Los Angeles
## 5                        American Beerguy Inc.
## 6 St. Louis Brewery, Inc./Schlafly Bottleworks
# Normalise the first standalone "Inc" / "INC" / "inc" (with or without a
# trailing period) in each brewery name to "Inc.".
# The pattern is word-bounded and the period is an escaped, optional literal,
# so the replacement cannot consume a following space or rewrite letters
# inside words such as "Lincoln".
beer$brewery_name <- beer$brewery_name %>%
  str_replace(regex("\\binc\\b\\.?", ignore_case = TRUE), "Inc.")

# Show the same rows again to compare with the names before the replacement.
t2 <- beer %>%
  select(brewery_name) %>%
  slice(inc)
head(t2)
##                                   brewery_name
## 1                 New English Brewing Co. Inc.
## 2                       Marauder Brewing, Inc.
## 3                   Etna Brewing Company, Inc.
## 4             Anheuser-Busch Inc.- Los Angeles
## 5                        American Beerguy Inc.
## 6 St. Louis Brewery, Inc./Schlafly Bottleworks
# Upper-case every address. str_to_upper is vectorised, so it is applied to
# the whole column at once; no element-wise sapply wrapper is needed, and the
# result is always a plain, unnamed character vector.
t4 <- str_to_upper(beer$address)
head(t4)
## [1] "PO BOX 4653, STOCKTON, CALIFORNIA, 95204"          
## [2] "157 ADAMS ST., STOCKTON, CALIFORNIA, 95204"        
## [3] "1950 W FREEMONT, STOCKTON, CALIFORNIA, 95203"      
## [4] "102 S. STATE ST., UKIAH, CALIFORNIA, 95482"        
## [5] "13011 NEWPORT AVE. #100, TUSTIN, CALIFORNIA, 92780"
## [6] "1404 4TH ST., BERKELEY, CALIFORNIA, 94608"

Subsetting Strings

# Pull out a run of five digits anchored at the end of each address.
# str_extract_all returns a list with one element per address.
t5 <- str_extract_all(beer$address, "([:digit:]{5})$")
head(t5)
## [[1]]
## [1] "95204"
## 
## [[2]]
## [1] "95204"
## 
## [[3]]
## [1] "95203"
## 
## [[4]]
## [1] "95482"
## 
## [[5]]
## [1] "92780"
## 
## [[6]]
## [1] "94608"

Joining and splitting strings.

# Break each address into its comma-separated pieces (a list with one
# character vector per address).
t6 <- str_split(beer$address, ",")
head(t6)
## [[1]]
## [1] "PO Box 4653" " Stockton"   " California" " 95204"     
## 
## [[2]]
## [1] "157 Adams St." " Stockton"     " California"   " 95204"       
## 
## [[3]]
## [1] "1950 W Freemont" " Stockton"       " California"     " 95203"         
## 
## [[4]]
## [1] "102 S. State St." " Ukiah"           " California"      " 95482"          
## 
## [[5]]
## [1] "13011 Newport Ave. #100" " Tustin"                
## [3] " California"             " 92780"                 
## 
## [[6]]
## [1] "1404 4th St." " Berkeley"    " California"  " 94608"

Next, I concatenate the individual pieces of each address back into one single string.

# Re-join the pieces of each split address into a single string, using "," as
# the separator. vapply with character(1) guarantees a character vector with
# exactly one string per address; sapply would silently return a list if t6
# were empty.
t7 <- vapply(
  t6,
  function(parts) str_c(parts, collapse = ","),
  character(1)
)
head(t7)
## [1] "PO Box 4653, Stockton, California, 95204"          
## [2] "157 Adams St., Stockton, California, 95204"        
## [3] "1950 W Freemont, Stockton, California, 95203"      
## [4] "102 S. State St., Ukiah, California, 95482"        
## [5] "13011 Newport Ave. #100, Tustin, California, 92780"
## [6] "1404 4th St., Berkeley, California, 94608"

Conclusion

The stringr package of the Tidyverse was used to demonstrate the manipulation of strings.

Citations

Information came from the stringr cheatsheet on GitHub.