library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## Warning: package 'tibble' was built under R version 4.0.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)

Getting started

First we need to load these packages:

  • tidyverse
  • stringr
  • dplyr - used for subsetting data in our analysis
  • rmdformats - used for styling the HTML document
    We’re going to load a dataset from fivethirtyeight.com to help us show examples of stringr at work. Our data shows the number of murders in American cities in 2014 and 2015.

For simplicity’s sake, we’ll preview only the first few rows of the data with head() in most of the examples.

# Location of the FiveThirtyEight CSV with city-level murder counts.
url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/murder_2016/murder_2015_final.csv"
# Download and parse the CSV into a tibble.
murder_raw <- read_csv(file = url)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   city = col_character(),
##   state = col_character(),
##   `2014_murders` = col_double(),
##   `2015_murders` = col_double(),
##   change = col_double()
## )

Ordering Strings

str_sort(character vector,decreasing = X) Purpose: Order a character vector alphabetically.

Input: character vector - what you want to order; X - whether to sort in decreasing order (FALSE - alphabetically, from A to Z; TRUE - reverse alphabetically, from Z to A)

Output: An ordered character vector

Example: We’ll order the column ‘city’ from our dataframe ‘murder_raw’.

# Sort the city names alphabetically (A to Z).
murder_raw %>%
  pull(city) %>%
  str_sort(decreasing = FALSE)
##  [1] "Albuquerque"           "Anaheim"               "Anchorage"            
##  [4] "Arlington"             "Atlanta"               "Aurora"               
##  [7] "Austin"                "Bakersfield"           "Baltimore"            
## [10] "Boston"                "Buffalo"               "Chandler"             
## [13] "Charlotte-Mecklenburg" "Chicago"               "Chula Vista"          
## [16] "Cincinnati"            "Cleveland"             "Colorado Springs"     
## [19] "Columbus"              "Corpus Christi"        "Dallas"               
## [22] "Denver"                "Detroit"               "Durham"               
## [25] "El Paso"               "Fort Wayne"            "Fort Worth"           
## [28] "Fresno"                "Greensboro"            "Henderson"            
## [31] "Honolulu"              "Houston"               "Indianapolis"         
## [34] "Irvine"                "Jacksonville"          "Jersey City"          
## [37] "Kansas City"           "Laredo"                "Las Vegas"            
## [40] "Lexington"             "Lincoln"               "Long Beach"           
## [43] "Los Angeles"           "Louisville"            "Memphis"              
## [46] "Mesa"                  "Miami"                 "Milwaukee"            
## [49] "Minneapolis"           "Mobile"                "Nashville"            
## [52] "New Orleans"           "New York"              "Newark"               
## [55] "Oakland"               "Oklahoma City"         "Omaha"                
## [58] "Orlando"               "Philadelphia"          "Phoenix"              
## [61] "Pittsburgh"            "Plano"                 "Portland"             
## [64] "Raleigh"               "Riverside"             "Sacramento"           
## [67] "San Antonio"           "San Diego"             "San Francisco"        
## [70] "San Jose"              "Santa Ana"             "Seattle"              
## [73] "St. Louis"             "St. Paul"              "St. Petersburg"       
## [76] "Stockton"              "Tampa"                 "Toledo"               
## [79] "Tucson"                "Tulsa"                 "Virginia Beach"       
## [82] "Washington"            "Wichita"

Combining Strings

str_c(String1,String2,…Stringn) Purpose: The function takes in strings or vectors of strings and concatenates them together.

Input: String or vector of strings separated by comma

Output: Single string or vector of combined strings

Example: You can combine as many strings as you want together at once

Let’s see how we can combine two vectors of strings together from our dataframe: the city and the state

# Join city and state with nothing in between ("" is str_c()'s default
# separator) and preview the first few results.
head(str_c(murder_raw$city, murder_raw$state, sep = ""))
## [1] "BaltimoreMaryland"  "ChicagoIllinois"    "HoustonTexas"      
## [4] "ClevelandOhio"      "WashingtonD.C."     "MilwaukeeWisconsin"
# Join city and state with a comma between them via the `sep` argument,
# and store the result as a new column.
murder_raw <- mutate(
  murder_raw,
  City_State = str_c(city, state, sep = ",")
)
murder_raw %>%
  pull(City_State) %>%
  head()
## [1] "Baltimore,Maryland"  "Chicago,Illinois"    "Houston,Texas"      
## [4] "Cleveland,Ohio"      "Washington,D.C."     "Milwaukee,Wisconsin"

Replacing Strings

str_replace_all(string, pattern, replacement) Purpose: This function will replace all instances of a pattern with the given replacement

Input: String or vector of strings; Pattern - what to look for (you can use regular expressions here); Replacement - the string to put in place of each match

Output: String or vector of strings with every match replaced

Example: Suppose we wanted to replace every comma in the column ‘City_State’ with an asterisk. We can easily do this with str_replace_all()

# Swap every comma in City_State for an asterisk. fixed() matches the comma
# literally, so no regex character class or escaping is needed.
murder_raw$City_State <- str_replace_all(
  murder_raw$City_State,
  pattern = fixed(","),
  replacement = "*"
)
# Preview the first few updated values.
head(murder_raw$City_State)
## [1] "Baltimore*Maryland"  "Chicago*Illinois"    "Houston*Texas"      
## [4] "Cleveland*Ohio"      "Washington*D.C."     "Milwaukee*Wisconsin"

Get the Length of a String

str_length(string) Purpose: Find out the length of a string or a vector of strings

Input: String or vector of strings

Output: Integer vector giving the length of each input string

Example: Let’s find out how long each city name is

# Number of characters in each city name, in row order.
str_length(murder_raw[["city"]])
##  [1]  9  7  7  9 10  9 12 11  9  9 13 10  6 11  6  8  7 11  5 10  9 21 11 11  6
## [26] 10 10  6 12  6  5  8 13 10  5 11 16  9  7  9  8  7 10 11  4 10 14  6  7  9
## [51] 12  7  7  8  5  8  6 11  7  9  8  7  7  7  6  9  9  7 14  8  8  6  7  5  9
## [76]  6  6  6 11 14 10  6  7
# Keep only the rows whose city name is longer than 9 characters, using
# filter() from dplyr. str_length() counts every character, so spaces and
# hyphens in a name are included.
# The column is referenced by its bare name: inside dplyr verbs,
# `murder_raw$city` bypasses data masking and breaks on grouped or piped data.
filter(murder_raw, str_length(city) > 9)
## # A tibble: 28 x 6
##    city        state    `2014_murders` `2015_murders` change City_State         
##    <chr>       <chr>             <dbl>          <dbl>  <dbl> <chr>              
##  1 Washington  D.C.                105            162     57 Washington*D.C.    
##  2 Philadelph~ Pennsyl~            248            280     32 Philadelphia*Penns~
##  3 Kansas City Missouri             78            109     31 Kansas City*Missou~
##  4 Oklahoma C~ Oklahoma             45             73     28 Oklahoma City*Okla~
##  5 Louisville  Kentucky             56             81     25 Louisville*Kentucky
##  6 Los Angeles Califor~            260            282     22 Los Angeles*Califo~
##  7 Minneapolis Minneso~             31             47     16 Minneapolis*Minnes~
##  8 Sacramento  Califor~             28             43     15 Sacramento*Califor~
##  9 Charlotte-~ North C~             47             61     14 Charlotte-Mecklenb~
## 10 New Orleans Louisia~            150            164     14 New Orleans*Louisi~
## # ... with 18 more rows

Conclusion

Beyond stringr and dplyr, the tidyverse offers many other verbs, such as tidyr’s pivot_longer() and pivot_wider(), which help us reshape the data into the specific view a task requires.