## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## Warning: package 'tibble' was built under R version 4.0.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
First we need to load these packages:
We’ll take the first 10 rows of the data for simplicity’s sake.
url <- 'https://raw.githubusercontent.com/fivethirtyeight/data/master/murder_2016/murder_2015_final.csv'
murder_raw <- read_csv(url)##
## -- Column specification --------------------------------------------------------
## cols(
## city = col_character(),
## state = col_character(),
## `2014_murders` = col_double(),
## `2015_murders` = col_double(),
## change = col_double()
## )
str_sort(character vector,decreasing = X) Purpose: Order a character vector alphabetically.
Input: character vector - what you want to order; X - logical indicating the sort direction (FALSE, the default - alphabetical order from A to Z; TRUE - reverse order from Z to A)
Output: An ordered character vector
Example: We’ll order the column ‘city’ from our dataframe ‘murder_raw’
## [1] "Albuquerque" "Anaheim" "Anchorage"
## [4] "Arlington" "Atlanta" "Aurora"
## [7] "Austin" "Bakersfield" "Baltimore"
## [10] "Boston" "Buffalo" "Chandler"
## [13] "Charlotte-Mecklenburg" "Chicago" "Chula Vista"
## [16] "Cincinnati" "Cleveland" "Colorado Springs"
## [19] "Columbus" "Corpus Christi" "Dallas"
## [22] "Denver" "Detroit" "Durham"
## [25] "El Paso" "Fort Wayne" "Fort Worth"
## [28] "Fresno" "Greensboro" "Henderson"
## [31] "Honolulu" "Houston" "Indianapolis"
## [34] "Irvine" "Jacksonville" "Jersey City"
## [37] "Kansas City" "Laredo" "Las Vegas"
## [40] "Lexington" "Lincoln" "Long Beach"
## [43] "Los Angeles" "Louisville" "Memphis"
## [46] "Mesa" "Miami" "Milwaukee"
## [49] "Minneapolis" "Mobile" "Nashville"
## [52] "New Orleans" "New York" "Newark"
## [55] "Oakland" "Oklahoma City" "Omaha"
## [58] "Orlando" "Philadelphia" "Phoenix"
## [61] "Pittsburgh" "Plano" "Portland"
## [64] "Raleigh" "Riverside" "Sacramento"
## [67] "San Antonio" "San Diego" "San Francisco"
## [70] "San Jose" "Santa Ana" "Seattle"
## [73] "St. Louis" "St. Paul" "St. Petersburg"
## [76] "Stockton" "Tampa" "Toledo"
## [79] "Tucson" "Tulsa" "Virginia Beach"
## [82] "Washington" "Wichita"
str_c(String1,String2,…Stringn) Purpose: The function takes in strings or vectors of strings and concatenates them together
Input: String or vector of strings separated by comma
Output: Single string or vector of combined strings
Example: You can combine as many strings as you want together at once
Let’s see how we can combine two vectors of strings together from our dataframe: the city and the state
## [1] "BaltimoreMaryland" "ChicagoIllinois" "HoustonTexas"
## [4] "ClevelandOhio" "WashingtonD.C." "MilwaukeeWisconsin"
#separate city and state with a comma using the sep = ',' argument
murder_raw$City_State <- str_c(murder_raw$city,murder_raw$state,sep=",")
head(murder_raw$City_State)## [1] "Baltimore,Maryland" "Chicago,Illinois" "Houston,Texas"
## [4] "Cleveland,Ohio" "Washington,D.C." "Milwaukee,Wisconsin"
str_replace_all(string, pattern, replacement) Purpose: This function will replace all instances of a pattern with the given replacement
Input: String or vector of strings; Pattern - the pattern to look for (you can use regular expressions here); Replacement - the string to substitute for each match
Output: String or vector of strings with every match of the pattern replaced
Example: Suppose we wanted to replace all appearances of ',' in the column ‘City_State’ with '*'. We can easily do this with str_replace_all()
murder_raw$City_State <- str_replace_all(murder_raw$City_State,'[\\,]','*')
head(murder_raw$City_State)## [1] "Baltimore*Maryland" "Chicago*Illinois" "Houston*Texas"
## [4] "Cleveland*Ohio" "Washington*D.C." "Milwaukee*Wisconsin"
str_length(string) Purpose: Find out the length of a string or a vector of strings
Input: String or vector of strings
Output: Integer vector giving the number of characters in each string
Example: Let’s find out how long each city name is
## [1] 9 7 7 9 10 9 12 11 9 9 13 10 6 11 6 8 7 11 5 10 9 21 11 11 6
## [26] 10 10 6 12 6 5 8 13 10 5 11 16 9 7 9 8 7 10 11 4 10 14 6 7 9
## [51] 12 7 7 8 5 8 6 11 7 9 8 7 7 7 6 9 9 7 14 8 8 6 7 5 9
## [76] 6 6 6 11 14 10 6 7
#Let’s only view the rows in the dataframe where the city has more than 9 characters in the name (spaces count as characters too). To do this we’ll also use the filter function from the package dplyr.
filter(murder_raw,str_length(murder_raw$city) > 9)## # A tibble: 28 x 6
## city state `2014_murders` `2015_murders` change City_State
## <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 Washington D.C. 105 162 57 Washington*D.C.
## 2 Philadelph~ Pennsyl~ 248 280 32 Philadelphia*Penns~
## 3 Kansas City Missouri 78 109 31 Kansas City*Missou~
## 4 Oklahoma C~ Oklahoma 45 73 28 Oklahoma City*Okla~
## 5 Louisville Kentucky 56 81 25 Louisville*Kentucky
## 6 Los Angeles Califor~ 260 282 22 Los Angeles*Califo~
## 7 Minneapolis Minneso~ 31 47 16 Minneapolis*Minnes~
## 8 Sacramento Califor~ 28 43 15 Sacramento*Califor~
## 9 Charlotte-~ North C~ 47 61 14 Charlotte-Mecklenb~
## 10 New Orleans Louisia~ 150 164 14 New Orleans*Louisi~
## # ... with 18 more rows
The tidyverse also offers many other verbs, such as pivot_longer() and pivot_wider() from the tidyr package, which help reshape the data into the view required for a specific task.