This report is a summary of a DataCamp lesson.
library(tidyverse)
theme_set(theme_bw())
double quotes(") & single quotes(')
double quotes(")를 우선적으로 사용해라.
문자열에 double quotes(")가 포함되어 있는 경우에는 single quotes(')를 사용해라.
double quotes(")와 single quotes(')가 모두 포함되어 있는 경우에는 우선 double quotes(")를 사용하고, 문자열 안의 double quotes(")에는 escape(\)를 사용해라.
# Define line1
line1 <- "The table was a large one, but the three were all crowded together at one corner of it:"
# Define line2
line2 <- '"No room! No room!" they cried out when they saw Alice coming.'
# Define line3
line3 <- "\"There's plenty of room!\" said Alice indignantly, and she sat down in a large arm-chair at one end of the table.'"
writeLines(): escape(\)를 제외하고 출력
lines <- c(line1, line2, line3)
# quotes가 포함
print(lines)
## [1] "The table was a large one, but the three were all crowded together at one corner of it:"
## [2] "\"No room! No room!\" they cried out when they saw Alice coming."
## [3] "\"There's plenty of room!\" said Alice indignantly, and she sat down in a large arm-chair at one end of the table.'"
# quotes가 미포함
writeLines(lines)
## The table was a large one, but the three were all crowded together at one corner of it:
## "No room! No room!" they cried out when they saw Alice coming.
## "There's plenty of room!" said Alice indignantly, and she sat down in a large arm-chair at one end of the table.'
# 서로 이어져서 출력
writeLines(lines, sep = " ")
## The table was a large one, but the three were all crowded together at one corner of it: "No room! No room!" they cried out when they saw Alice coming. "There's plenty of room!" said Alice indignantly, and she sat down in a large arm-chair at one end of the table.'
writeLines("hello\n\U1F30D")
## hello
## 🌍
# Fixed -> Scientific 자동 변환
19890000000000000000000000000000000000000
## [1] 1.989e+40
R은 자동으로 Scientific으로 변환한다.
format(digits = ..., nsmall = ...)
digits: 숫자를 문자로 변환 시 유효숫자(significant digits, 의미 있는 모든 숫자 자릿수)의 개수 설정
nsmall: 소수점 이하 최소 자릿수 고정
x <- c(19890000000000000000000000000000, 0.00000000000000008)
format(x, scientific = TRUE)
## [1] "1.989e+31" "8.000e-17"
format(x, scientific = FALSE)
## [1] "19889999999999999010228444808460.00000000000000000"
## [2] " 0.00000000000000008"
# Some vectors of numbers
percent_change <- c(4, -1.91, 3.00, -5.002)
income <- c(72.19, 1030.18, 10291.93, 1189192.18)
p_values <- c(0.12, 0.98, 0.0000191, 0.00000000002)
## 예시1
format(c(0.0011, 0.011, 1), digits = 1)
## [1] "0.001" "0.011" "1.000"
## 1. 숫자 벡터 내 가장 작은 수는 0.0011
## 2. 유효숫자 1개이므로 0.001 반환
## 3. 나머지 숫자 벡터 기준도 소수점 셋째자리로 통일
## 예시2
format(c(1.0011, 2.011, 1), digits = 1)
## [1] "1" "2" "1"
format()함수는 기본적으로 숫자들 앞에 빈 공간을 padding함으로써 수직 나열 시 표현하기 적합하도록 만들어준다.
income <- c(72.19, 1030.18, 10291.93, 1189192.18)
formatted_income <- format(income, digits = 2)
print(formatted_income)
## [1] "     72" "   1030" "  10292" "1189192"
# 좌측부터 공백이 생겨 숫자 벡터들이 가지런히 우측정렬이 된 모습
writeLines(formatted_income)
##      72
##    1030
##   10292
## 1189192
trim: 공백 제거
trimmed_income <- format(income, digits = 2, trim = TRUE)
# 공백이 제거되어 좌측 정렬된 모습
writeLines(trimmed_income)
## 72
## 1030
## 10292
## 1189192
big.mark: 천 단위 구분 기호 지정
pretty_income <- format(income, digits = 2, big.mark = ",")
writeLines(pretty_income)
##        72
##     1,030
##    10,292
## 1,189,192
formatC(format = ...)
f: Fixed
digits: 소수점 이하 자리수 의미
e: Scientific
digits: f와 마찬가지로 소수점 이하 자리수 의미
g: Fixed unless Scientific saves space
digits: format()처럼 유효숫자 개수 의미
flag
+: 부호 표시
-: 왼쪽 정렬
0: 앞쪽부터 0 padding
x <- c(0.0011, 0.011, 1)
y <- c(1.0011, 2.011, 1)
# format = "f"이므로 digits는 소수점 이하 자리수
formatC(x, format = "f", digits = 1)
## [1] "0.0" "0.0" "1.0"
formatC(y, format = "f", digits = 1)
## [1] "1.0" "2.0" "1.0"
formatC(percent_change, format = "f", digits = 1)
## [1] "4.0" "-1.9" "3.0" "-5.0"
paste()
collapse를 통해 single string 생성 가능
animal_goes <- "moo"
paste(c("Here", "There", "Everywhere"), "a", animal_goes, collapse = ", ")
## [1] "Here a moo, There a moo, Everywhere a moo"
pretty_income2 <- format(income, digits = 2, big.mark = ",", trim = TRUE)
pretty_percent2 <- formatC(percent_change, format = "f", digits = 1, flag = "+")
years <- c(2010, 2011, 2012, 2013)
# Add $ to pretty_income
paste("$", pretty_income2, sep = "")
## [1] "$72" "$1,030" "$10,292" "$1,189,192"
# Add % to pretty_percent
paste(pretty_percent2, "%", sep = "")
## [1] "+4.0%" "-1.9%" "+3.0%" "-5.0%"
# Create vector with elements like 2010: +4.0%
paste(years, ": ", paste(pretty_percent2, "%", sep = ""), sep = "")
## [1] "2010: +4.0%" "2011: -1.9%" "2012: +3.0%" "2013: -5.0%"
# Collapse all years into single string
paste(years, ": ", paste(pretty_percent2, "%", sep = ""), sep = "", collapse = ", ")
## [1] "2010: +4.0%, 2011: -1.9%, 2012: +3.0%, 2013: -5.0%"
# Build a two-column text table of incomes by period and print it.
# `income` is the numeric vector defined earlier in these notes.

# Define the names vector (one label per element of `income`)
income_names <- c("Year 0", "Year 1", "Year 2", "Project Lifetime")
# Create pretty_income: common-width strings with thousands separators
pretty_income <- format(income, digits = 2, big.mark = ",")
# Create dollar_income: paste0() is the idiomatic form of paste(..., sep = "")
dollar_income <- paste0("$", pretty_income)
# Create formatted_names: pad on the left so the labels are right-justified
formatted_names <- format(income_names, justify = "right")
# Create rows: paste() already separates its arguments with a single space
rows <- paste(formatted_names, dollar_income)
# Write rows, one per line and without surrounding quotes
writeLines(rows)
##           Year 0 $       72
##           Year 1 $    1,030
##           Year 2 $   10,292
## Project Lifetime $1,189,192
stringr
str_c(): paste()와 유사 - sep(구분기호) 기본값이 ""임
str_length(): 텍스트 길이 반환
str_sub(): 시작/끝 위치를 기준으로 부분 문자열 추출
library(babynames)
head(babynames)
## # A tibble: 6 × 5
## year sex name n prop
## <dbl> <chr> <chr> <int> <dbl>
## 1 1880 F Mary 7065 0.0724
## 2 1880 F Anna 2604 0.0267
## 3 1880 F Emma 2003 0.0205
## 4 1880 F Elizabeth 1939 0.0199
## 5 1880 F Minnie 1746 0.0179
## 6 1880 F Margaret 1578 0.0162
# Extracting vectors for boys' and girls' names
babynames_2014 <- filter(babynames, year == 2014)
boy_names <- filter(babynames_2014, sex == "M")$name
girl_names <- filter(babynames_2014, sex == "F")$name
# Extract the first letter in girl_names, then tabulate
girl_first_letter <- str_sub(girl_names, 1, 1)
table(girl_first_letter)
## girl_first_letter
## A B C D E F G H I J K L M N O P
## 3101 699 946 810 933 209 345 469 373 1430 1694 1122 1746 752 143 303
## Q R S T U V W X Y Z
## 38 831 1369 683 28 214 85 62 294 502
# Extract the last letter in girl_names, then tabulate
girl_last_letter <- str_sub(girl_names, -1, -1)
table(girl_last_letter)
## girl_last_letter
## a b c d e f g h i j k l m n o p
## 6632 20 13 81 3114 8 21 1942 1581 12 31 450 115 2608 105 3
## q r s t u v w x y z
## 2 291 326 208 59 6 17 50 1435 51
str_detect(pattern = ...): TRUE/FALSE로 포함여부 반환
pattern = fixed(): 패턴을 정규식이 아닌 문자 그대로(literal) 일치하는 경우만 매칭
str_subset(): 패턴이 포함된 문자열 자체를 반환
str_count(): 문자열에 패턴이 포함된 횟수 반환
str_extract(): 패턴이 포함될 경우 매칭된 부분을 반환, 나머지는 NA 반환
str_subset(girl_names, fixed("U"))
## [1] "Unique" "Uma" "Unknown" "Una" "Uriah" "Ursula" "Unity"
## [8] "Umaiza" "Urvi" "Ulyana" "Ula" "Udy" "Urwa" "Ulani"
## [15] "Umaima" "Umme" "Ugochi" "Ulyssa" "Umika" "Uriyah" "Ubah"
## [22] "Umaira" "Umi" "Ume" "Urenna" "Uriel" "Urijah" "Uyen"
str_split(pattern = ..., n = ..., simplify = TRUE/FALSE)
n: 반환할 최대 조각 수
simplify = TRUE: matrix 반환
str_split(string = "Tom & Jerry", pattern = " & ")
## [[1]]
## [1] "Tom" "Jerry"
chars <- c("Tom & Jerry", "Alvin & Simon & Theodore")
str_split(string = chars, pattern = " & ")
## [[1]]
## [1] "Tom" "Jerry"
##
## [[2]]
## [1] "Alvin" "Simon" "Theodore"
str_split(string = chars, pattern = " & ", simplify = TRUE)
## [,1] [,2] [,3]
## [1,] "Tom" "Jerry" ""
## [2,] "Alvin" "Simon" "Theodore"
str_replace(): 처음 등장한 패턴만 변경
str_replace_all(): 모든 패턴 변경
# Use all names in babynames_2014
all_names <- babynames_2014$name
# Get the last two letters of all_names
last_two_letters <- str_sub(all_names, -2, -1)
# Does the name end in "ee"?
ends_in_ee <- str_detect(last_two_letters, "ee")
# Extract rows and "sex" column
sex <- babynames_2014$sex[ends_in_ee]
# Display result as a table
table(sex)
## sex
## F M
## 572 84
str_view
rebus 패키지를 통해 기존 정규식보다 더 직관적인 패턴식 작성 가능
%R% 연산자로 패턴 연결 가능
DGT - 숫자 한자리 - \\d
WRD - 단어 문자(영문자, 숫자, _) 한 개 - \\w
SPC - 공백 - \\s
ANY_CHAR - 아무 문자 하나 - .
repeated(x, n) - x를 n번 반복 - x{n}
START, END - 문자열의 시작/끝 - ^, $
DOT, CARAT, DOLLAR - \., \^, \$
optional() - ?
zero_or_more() - *
one_or_more() - +
exactly() - 패턴이 문자열 전체와 정확히 일치하는 경우
library(rebus)
# Some strings to practice with
x <- c("cat", "coat", "scotland", "tic toc")
str_view(x, pattern = START %R% "c")
## [1] │ <c>at
## [2] │ <c>oat
str_view(x, pattern = START %R% "cat" %R% END)
## [1] │ <cat>
# Match a string with exactly three characters
str_view(x, pattern = START %R% ANY_CHAR %R% ANY_CHAR %R% ANY_CHAR %R% END)
## [1] │ <cat>
or(): alternation
or("dog", "cat")
## <regex> (?:dog|cat)
char_class("Aa")
## <regex> [Aa]
negated_char_class("Aa")
## <regex> [^Aa]
# Match names that start with Cath or Kath
ckath <- START %R% or("C", "K") %R% "ath"
str_view(girl_names, pattern = ckath, match = TRUE)
## [83] │ <Kath>erine
## [170] │ <Cath>erine
## [293] │ <Kath>ryn
## [674] │ <Kath>leen
## [1535] │ <Kath>y
## [2146] │ <Kath>arine
## [2203] │ <Kath>eryn
## [2929] │ <Cath>ryn
## [3293] │ <Kath>erin
## [3334] │ <Cath>y
## [4597] │ <Kath>ia
## [4598] │ <Kath>rine
## [5041] │ <Cath>leen
## [5642] │ <Cath>arine
## [6094] │ <Kath>arina
## [6843] │ <Kath>ya
## [7456] │ <Kath>alina
## [7853] │ <Kath>erina
## [7854] │ <Kath>rynn
## [8257] │ <Kath>ryne
## ... and 16 more
char_class(): 포함하고자하는 문자 패턴 생성
# Vowels from last exercise
vowels <- char_class("aeiouAEIOU")
# See names with only vowels
# one_or_more -> 모음이 1개 이상인 경우
# exactly -> 오직 모음으로만 이루어진 경우
str_view(boy_names,
pattern = exactly(one_or_more(vowels)),
match = TRUE)
## [10019] │ <Io>
negated_char_class(): 지정한 문자들을 제외한 나머지 문자 패턴 생성
# Vowels from last exercise
not_vowels <- negated_char_class("aeiouAEIOU")
# See names with no vowels
# one_or_more -> 모음이 아닌 문자가 1개 이상인 경우
# exactly -> 오직 모음이 아닌 문자(자음)로만 이루어진 경우
str_view(boy_names,
pattern = exactly(one_or_more(not_vowels)),
match = TRUE)
## [425] │ <Ty>
## [485] │ <Rhys>
## [658] │ <Flynn>
## [1385] │ <Sky>
## [1679] │ <Fynn>
## [1769] │ <Kyng>
## [2004] │ <Cy>
## [2254] │ <Wynn>
## [2624] │ <Cj>
## [2974] │ <Tj>
## [3404] │ <Jc>
## [3656] │ <Ky>
## [3837] │ <Jr>
## [3861] │ <Rhythm>
## [4496] │ <Rj>
## [4628] │ <Md>
## [4775] │ <Kc>
## [5351] │ <Jj>
## [5606] │ <Lynn>
## [5921] │ <Rylyn>
## ... and 33 more
DGT: 숫자 1개 - NOT_DGT
dgt(): 숫자 여러개 매칭 가능
WRD: 문자(숫자 포함) 1개 - NOT_WRD
SPC: 공백 - NOT_SPC
contact <- c("Call me at 555-555-0191",
"123 Main St",
"(555) 555 0191",
"Phone: 555.555.0191 Mobile: 555.555.0192")
# Use this pattern
three_digits <- DGT %R% DGT %R% DGT
four_digits <- three_digits %R% DGT
separator <- char_class("-.() ")
phone_pattern <- optional(OPEN_PAREN) %R%
three_digits %R%
zero_or_more(separator) %R%
three_digits %R%
zero_or_more(separator) %R%
four_digits
# Extract phone numbers / 동일한 길이를 가진 벡터 반환
str_extract(contact, pattern = phone_pattern)
## [1] "555-555-0191" NA "(555) 555 0191" "555.555.0191"
# Extract ALL phone numbers / 동일한 길이를 가진 리스트 반환
str_extract_all(contact, pattern = phone_pattern)
## [[1]]
## [1] "555-555-0191"
##
## [[2]]
## character(0)
##
## [[3]]
## [1] "(555) 555 0191"
##
## [[4]]
## [1] "555.555.0191" "555.555.0192"
narratives <- readRDS("narratives.rds")
# Use these patterns
age <- DGT %R% optional(DGT)
unit <- optional(SPC) %R% or("YO", "YR", "MO")
# 1. Test pattern with age then units
str_view(narratives, pattern = age %R% unit)
## [1] │ <19YO>M-SHOULDER STRAIN-WAS TACKLED WHILE PLAYING FOOTBALL W/ FRIENDS
## [2] │ <31 YO>F FELL FROM TOILET HITITNG HEAD SUSTAINING A CHI
## [3] │ ANKLE STR. <82 YO>M STRAINED ANKLE GETTING OUT OF BED
## [4] │ TRIPPED OVER CAT AND LANDED ON HARDWOOD FLOOR. LACERATION ELBOW, LEFT. <33 YO>F*
## [5] │ <10YO>M CUT THUMB ON METAL TRASH CAN DX AVULSION OF SKIN OF THUMB
## [6] │ <53 YO> F TRIPPED ON CARPET AT HOME. DX HIP CONTUSION
## [7] │ <13 MO>F TRYING TO STAND UP HOLDING ONTO BED FELL AND HIT FOREHEAD ON RADIATOR DX LACERATION
## [8] │ <14YR> M PLAYING FOOTBALL; DX KNEE SPRAIN
## [9] │ <55YO>M RIDER OF A BICYCLE AND FELL OFF SUSTAINED A CONTUSION TO KNEE
## [10] │ <5 YO>M ROLLING ON FLOOR DOING A SOMERSAULT AND SUSTAINED A CERVICAL STRA IN
# Pattern to match gender
gender <- optional(SPC) %R% or("M", "F")
# 2. Test pattern with age then units then gender
str_view(narratives, pattern = age %R% unit %R% gender)
## [1] │ <19YOM>-SHOULDER STRAIN-WAS TACKLED WHILE PLAYING FOOTBALL W/ FRIENDS
## [2] │ <31 YOF> FELL FROM TOILET HITITNG HEAD SUSTAINING A CHI
## [3] │ ANKLE STR. <82 YOM> STRAINED ANKLE GETTING OUT OF BED
## [4] │ TRIPPED OVER CAT AND LANDED ON HARDWOOD FLOOR. LACERATION ELBOW, LEFT. <33 YOF>*
## [5] │ <10YOM> CUT THUMB ON METAL TRASH CAN DX AVULSION OF SKIN OF THUMB
## [6] │ <53 YO F> TRIPPED ON CARPET AT HOME. DX HIP CONTUSION
## [7] │ <13 MOF> TRYING TO STAND UP HOLDING ONTO BED FELL AND HIT FOREHEAD ON RADIATOR DX LACERATION
## [8] │ <14YR M> PLAYING FOOTBALL; DX KNEE SPRAIN
## [9] │ <55YOM> RIDER OF A BICYCLE AND FELL OFF SUSTAINED A CONTUSION TO KNEE
## [10] │ <5 YOM> ROLLING ON FLOOR DOING A SOMERSAULT AND SUSTAINED A CERVICAL STRA IN
# 3. Extract age, unit, gender
age_gender <- str_extract(narratives, pattern = age %R% unit %R% gender)
age_gender
## [1] "19YOM" "31 YOF" "82 YOM" "33 YOF" "10YOM" "53 YO F" "13 MOF"
## [8] "14YR M" "55YOM" "5 YOM"
str_remove(): 매칭된 패턴을 제거
# Extract age and make numeric
as.numeric(str_extract(age_gender, pattern = age))
## [1] 19 31 82 33 10 53 13 14 55 5
# Replace age and units with ""
genders <- str_remove(age_gender, pattern = age %R% unit)
# Replace extra spaces
str_remove(genders, " ")
## [1] "M" "F" "M" "F" "M" "F" "F" "M" "M" "M"
# Numeric ages, from previous step
ages_numeric <- as.numeric(str_extract(age_gender, age))
# Extract units
time_units <- str_extract(age_gender, unit)
# Extract first word character
time_units_clean <- str_extract(time_units, WRD)
# Turn ages in months to years
ifelse(time_units_clean == "Y", ages_numeric, ages_numeric / 12)
## [1] 19.000000 31.000000 82.000000 33.000000 10.000000 53.000000 1.083333
## [8] 14.000000 55.000000 5.000000
capture(): 단순히 문자열 패턴을 찾는 것을 넘어서 패턴의 일부를 추출하고 싶을 때 사용
str_match(): str_extract와 마찬가지로 매칭된 전체 문자열을 반환할 뿐만 아니라 캡쳐 그룹도 별도로 반환 / matrix 형태로 반환
str_match(c("Fat", "cat"),
pattern = capture(ANY_CHAR) %R% "a")
## [,1] [,2]
## [1,] "Fa" "F"
## [2,] "ca" "c"
pattern <- DOLLAR %R%
capture(DGT %R% optional(DGT)) %R%
DOT %R%
capture(dgt(2))
str_match(c("$5.50", "$32.00"), pattern = pattern)
## [,1] [,2] [,3]
## [1,] "$5.50" "5" "50"
## [2,] "$32.00" "32" "00"
# <regex>의 ?: 는 비캡쳐그룹 의미
# BUT need parentheses(괄호) to distinguish
# (dog|cat) do(g|c)at
or("dog", "cat")
## <regex> (?:dog|cat)
# 아래와 같은 2가지 방법으로 capture 가능
or("dog", "cat", capture = TRUE)
## <regex> (dog|cat)
capture(or("dog", "cat"))
## <regex> ((?:dog|cat))
# narratives has been pre-defined
narratives
## [1] "19YOM-SHOULDER STRAIN-WAS TACKLED WHILE PLAYING FOOTBALL W/ FRIENDS "
## [2] "31 YOF FELL FROM TOILET HITITNG HEAD SUSTAINING A CHI "
## [3] "ANKLE STR. 82 YOM STRAINED ANKLE GETTING OUT OF BED "
## [4] "TRIPPED OVER CAT AND LANDED ON HARDWOOD FLOOR. LACERATION ELBOW, LEFT. 33 YOF*"
## [5] "10YOM CUT THUMB ON METAL TRASH CAN DX AVULSION OF SKIN OF THUMB "
## [6] "53 YO F TRIPPED ON CARPET AT HOME. DX HIP CONTUSION "
## [7] "13 MOF TRYING TO STAND UP HOLDING ONTO BED FELL AND HIT FOREHEAD ON RADIATOR DX LACERATION"
## [8] "14YR M PLAYING FOOTBALL; DX KNEE SPRAIN "
## [9] "55YOM RIDER OF A BICYCLE AND FELL OFF SUSTAINED A CONTUSION TO KNEE "
## [10] "5 YOM ROLLING ON FLOOR DOING A SOMERSAULT AND SUSTAINED A CERVICAL STRA IN"
# Add capture() to get age, unit and sex
pattern <- capture(optional(DGT) %R% DGT) %R%
optional(SPC) %R% capture(or("YO", "YR", "MO")) %R%
optional(SPC) %R% capture(or("M", "F"))
# Pull out from narratives
str_match(narratives, pattern)
## [,1] [,2] [,3] [,4]
## [1,] "19YOM" "19" "YO" "M"
## [2,] "31 YOF" "31" "YO" "F"
## [3,] "82 YOM" "82" "YO" "M"
## [4,] "33 YOF" "33" "YO" "F"
## [5,] "10YOM" "10" "YO" "M"
## [6,] "53 YO F" "53" "YO" "F"
## [7,] "13 MOF" "13" "MO" "F"
## [8,] "14YR M" "14" "YR" "M"
## [9,] "55YOM" "55" "YO" "M"
## [10,] "5 YOM" "5" "YO" "M"
# Edit to capture just Y and M in units
pattern2 <- capture(optional(DGT) %R% DGT) %R%
optional(SPC) %R% capture(or("Y", "M")) %R% optional(or("O","R")) %R%
optional(SPC) %R% capture(or("M", "F"))
# Check pattern
str_view(narratives, pattern2)
## [1] │ <19YOM>-SHOULDER STRAIN-WAS TACKLED WHILE PLAYING FOOTBALL W/ FRIENDS
## [2] │ <31 YOF> FELL FROM TOILET HITITNG HEAD SUSTAINING A CHI
## [3] │ ANKLE STR. <82 YOM> STRAINED ANKLE GETTING OUT OF BED
## [4] │ TRIPPED OVER CAT AND LANDED ON HARDWOOD FLOOR. LACERATION ELBOW, LEFT. <33 YOF>*
## [5] │ <10YOM> CUT THUMB ON METAL TRASH CAN DX AVULSION OF SKIN OF THUMB
## [6] │ <53 YO F> TRIPPED ON CARPET AT HOME. DX HIP CONTUSION
## [7] │ <13 MOF> TRYING TO STAND UP HOLDING ONTO BED FELL AND HIT FOREHEAD ON RADIATOR DX LACERATION
## [8] │ <14YR M> PLAYING FOOTBALL; DX KNEE SPRAIN
## [9] │ <55YOM> RIDER OF A BICYCLE AND FELL OFF SUSTAINED A CONTUSION TO KNEE
## [10] │ <5 YOM> ROLLING ON FLOOR DOING A SOMERSAULT AND SUSTAINED A CERVICAL STRA IN
# Pull out pieces
str_match(narratives, pattern2)
## [,1] [,2] [,3] [,4]
## [1,] "19YOM" "19" "Y" "M"
## [2,] "31 YOF" "31" "Y" "F"
## [3,] "82 YOM" "82" "Y" "M"
## [4,] "33 YOF" "33" "Y" "F"
## [5,] "10YOM" "10" "Y" "M"
## [6,] "53 YO F" "53" "Y" "F"
## [7,] "13 MOF" "13" "M" "F"
## [8,] "14YR M" "14" "Y" "M"
## [9,] "55YOM" "55" "Y" "M"
## [10,] "5 YOM" "5" "Y" "M"
capture함수에 해당하는 패턴 순서대로 REF1 ~ REF9로 참조(backreference) 가능
x <- "Paris in the the spring"
pattern <- SPC %R%
capture(one_or_more(WRD)) %R%
SPC %R%
REF1
str_view(x, pattern)
## [1] │ Paris in< the the> spring
str_replace(x,
pattern = pattern,
replacement = str_c(" ", REF1))
## [1] "Paris in the spring"
# Names with three repeated letters
repeated_three_times <- capture(WRD) %R% REF1 %R% REF1
# Test it
str_view(boy_names, pattern = repeated_three_times, match = TRUE)
## [13940] │ Wi<lll>iam
# Build pattern to match words ending in "ING"
pattern <- capture(one_or_more(WRD) %R% "ING")
str_view(narratives, pattern)
## [1] │ 19YOM-SHOULDER STRAIN-WAS TACKLED WHILE <PLAYING> FOOTBALL W/ FRIENDS
## [2] │ 31 YOF FELL FROM TOILET HITITNG HEAD <SUSTAINING> A CHI
## [3] │ ANKLE STR. 82 YOM STRAINED ANKLE <GETTING> OUT OF BED
## [7] │ 13 MOF <TRYING> TO STAND UP <HOLDING> ONTO BED FELL AND HIT FOREHEAD ON RADIATOR DX LACERATION
## [8] │ 14YR M <PLAYING> FOOTBALL; DX KNEE SPRAIN
## [10] │ 5 YOM <ROLLING> ON FLOOR <DOING> A SOMERSAULT AND SUSTAINED A CERVICAL STRA IN
# Test replacement
str_replace(narratives, pattern, str_c("CARELESSLY", REF1, sep = " "))
## [1] "19YOM-SHOULDER STRAIN-WAS TACKLED WHILE CARELESSLY PLAYING FOOTBALL W/ FRIENDS "
## [2] "31 YOF FELL FROM TOILET HITITNG HEAD CARELESSLY SUSTAINING A CHI "
## [3] "ANKLE STR. 82 YOM STRAINED ANKLE CARELESSLY GETTING OUT OF BED "
## [4] "TRIPPED OVER CAT AND LANDED ON HARDWOOD FLOOR. LACERATION ELBOW, LEFT. 33 YOF*"
## [5] "10YOM CUT THUMB ON METAL TRASH CAN DX AVULSION OF SKIN OF THUMB "
## [6] "53 YO F TRIPPED ON CARPET AT HOME. DX HIP CONTUSION "
## [7] "13 MOF CARELESSLY TRYING TO STAND UP HOLDING ONTO BED FELL AND HIT FOREHEAD ON RADIATOR DX LACERATION"
## [8] "14YR M CARELESSLY PLAYING FOOTBALL; DX KNEE SPRAIN "
## [9] "55YOM RIDER OF A BICYCLE AND FELL OFF SUSTAINED A CONTUSION TO KNEE "
## [10] "5 YOM CARELESSLY ROLLING ON FLOOR DOING A SOMERSAULT AND SUSTAINED A CERVICAL STRA IN"
Unicode 코드 포인트(16진수) 앞에 \u 또는 \U를 붙여서 표현 가능
as.hexmode(utf8ToInt())함수로 문자의 Unicode 코드 포인트(16진수) 확인 가능
"\u03BC"
## [1] "μ"
"\U1F449"
## [1] "👉"
as.hexmode(utf8ToInt("a"))
## [1] "61"
\p followed by {name}
str_view_all(x, greek_and_coptic())
?Unicode
?unicode_property
?unicode_general_category
è는 유니코드로 2가지 방법으로 표현이 가능
"\u00e8"
"\u0065\u0300" -> 일반 e와 결합 악센트
stringi패키지의 다음 함수들이 이를 해결
stri_trans_nfc: 문자와 결합 악센트를 한 개 문자로 결합(compose)
stri_trans_nfd: 문자와 결합 악센트를 분해(decompose)
library(stringi)
# 코드는 다르지만 같은 문자를 의미
x <- c("\u00e8", "\u0065\u0300")
writeLines(x)
## è
## è
# Names with builtin accents
tay_son_builtin <- c(
"Nguy\u1ec5n Nh\u1ea1c",
"Nguy\u1ec5n Hu\u1ec7",
"Nguy\u1ec5n Quang To\u1ea3n"
)
writeLines(tay_son_builtin)
## Nguyễn Nhạc
## Nguyễn Huệ
## Nguyễn Quang Toản
# Convert to separate accents
tay_son_separate <- stri_trans_nfd(tay_son_builtin)
# 겉으로는 같은 문자열로 보이지만 내부적으로는 서로 다른 코드를 사용한다.
writeLines(tay_son_separate)
## Nguyễn Nhạc
## Nguyễn Huệ
## Nguyễn Quang Toản
ANY_CHAR: 하나의 code point만 매칭 가능
GRAPHEME: ANY_CHAR와 달리 사람 눈에 보이는 하나의 글자(시각 문자) 매칭 가능
str_view_all(tay_son_separate, ANY_CHAR)
## [1] │ <N><g><u><y><e><̂><̃><n>< ><N><h><a><̣><c>
## [2] │ <N><g><u><y><e><̂><̃><n>< ><H><u><e><̣><̂>
## [3] │ <N><g><u><y><e><̂><̃><n>< ><Q><u><a><n><g>< ><T><o><a><̉><n>
str_view_all(tay_son_separate, GRAPHEME)
## [1] │ <N><g><u><y><ễ><n>< ><N><h><ạ><c>
## [2] │ <N><g><u><y><ễ><n>< ><H><u><ệ>
## [3] │ <N><g><u><y><ễ><n>< ><Q><u><a><n><g>< ><T><o><ả><n>
tay_son_builtin <- stri_trans_nfc(tay_son_separate)
str_view_all(tay_son_builtin, GRAPHEME)
## [1] │ <N><g><u><y><ễ><n>< ><N><h><ạ><c>
## [2] │ <N><g><u><y><ễ><n>< ><H><u><ệ>
## [3] │ <N><g><u><y><ễ><n>< ><Q><u><a><n><g>< ><T><o><ả><n>
readLines(): 텍스트 파일을 줄 단위로 읽기
stringi::stri_read_lines(): 더 큰 텍스트파일에 적합
str_which(): 패턴이 매칭되는 원소의 인덱스 반환
earnest <- stri_read_lines("importance-of-being-earnest.txt")
# Split the raw Project Gutenberg text into front matter and the play itself.
# `earnest` is the character vector of lines read from the text file.

# Detect start and end lines; fixed() matches the marker text literally
start <- str_which(earnest, fixed("START OF THE PROJECT"))
end <- str_which(earnest, fixed("END OF THE PROJECT"))
# Get rid of gutenberg intro text by keeping only the lines strictly between
# the two markers (assumes each marker matches exactly one line -- TODO confirm)
earnest_sub <- earnest[(start + 1):(end - 1)]
# Detect first act; fail loudly if the heading is missing and use the first
# occurrence if it appears more than once
first_act <- str_which(earnest_sub, fixed("FIRST ACT"))
stopifnot(length(first_act) > 0)
lines_start <- first_act[1]
# Set up index of the lines that precede the first act. A logical mask is used
# instead of 1:(lines_start - 1), which counts down to c(1, 0) when the heading
# is on the very first line and would then misplace that line.
is_intro <- seq_along(earnest_sub) < lines_start
intro_line_index <- which(is_intro)
# Split play into intro and play
intro_text <- earnest_sub[is_intro]
play_text <- earnest_sub[!is_intro]
stri_isempty(): 빈 문자열("") 여부를 TRUE/FALSE로 반환
# Get rid of empty strings
empty <- stri_isempty(play_text)
play_lines <- play_text[!empty]
# Pattern for start, word then .
pattern_1 <- START %R% one_or_more(WRD) %R% DOT
# Test pattern_1
str_view(play_lines, pattern_1, match = TRUE)
## [8] │ <Algernon.> Did you hear what I was playing, Lane?
## [9] │ <Lane.> I didn't think it polite to listen, sir.
## [10] │ <Algernon.> I'm sorry for that, for your sake. I don't play
## [12] │ <expression.> As far as the piano is concerned, sentiment is my forte. I
## [14] │ <Lane.> Yes, sir.
## [15] │ <Algernon.> And, speaking of the science of Life, have you got the
## [17] │ <Lane.> Yes, sir. [Hands them on a salver.]
## [18] │ <Algernon.> [Inspects them, takes two, and sits down on the sofa.] Oh! . . .
## [22] │ <Lane.> Yes, sir; eight bottles and a pint.
## [23] │ <Algernon.> Why is it that at a bachelor's establishment the servants
## [25] │ <Lane.> I attribute it to the superior quality of the wine, sir. I have
## [28] │ <Algernon.> Good heavens! Is marriage so demoralising as that?
## [29] │ <Lane.> I believe it _is_ a very pleasant state, sir. I have had very
## [33] │ <Algernon.> [Languidly_._] I don't know that I am much interested in your
## [35] │ <Lane.> No, sir; it is not a very interesting subject. I never think of
## [37] │ <Algernon.> Very natural, I am sure. That will do, Lane, thank you.
## [38] │ <Lane.> Thank you, sir. [Lane goes out.]
## [39] │ <Algernon.> Lane's views on marriage seem somewhat lax. Really, if the
## [42] │ <responsibility.>
## [44] │ <Lane.> Mr. Ernest Worthing.
## ... and 875 more
# Pattern for start, capital, word then .
# ascii_upper()는 영문 대문자만 포함
pattern_2 <- START %R% ascii_upper() %R% one_or_more(WRD) %R% DOT
# Test pattern_2
str_view(play_lines, pattern_2, match = TRUE)
## [8] │ <Algernon.> Did you hear what I was playing, Lane?
## [9] │ <Lane.> I didn't think it polite to listen, sir.
## [10] │ <Algernon.> I'm sorry for that, for your sake. I don't play
## [14] │ <Lane.> Yes, sir.
## [15] │ <Algernon.> And, speaking of the science of Life, have you got the
## [17] │ <Lane.> Yes, sir. [Hands them on a salver.]
## [18] │ <Algernon.> [Inspects them, takes two, and sits down on the sofa.] Oh! . . .
## [22] │ <Lane.> Yes, sir; eight bottles and a pint.
## [23] │ <Algernon.> Why is it that at a bachelor's establishment the servants
## [25] │ <Lane.> I attribute it to the superior quality of the wine, sir. I have
## [28] │ <Algernon.> Good heavens! Is marriage so demoralising as that?
## [29] │ <Lane.> I believe it _is_ a very pleasant state, sir. I have had very
## [33] │ <Algernon.> [Languidly_._] I don't know that I am much interested in your
## [35] │ <Lane.> No, sir; it is not a very interesting subject. I never think of
## [37] │ <Algernon.> Very natural, I am sure. That will do, Lane, thank you.
## [38] │ <Lane.> Thank you, sir. [Lane goes out.]
## [39] │ <Algernon.> Lane's views on marriage seem somewhat lax. Really, if the
## [44] │ <Lane.> Mr. Ernest Worthing.
## [47] │ <Algernon.> How are you, my dear Ernest? What brings you up to town?
## [48] │ <Jack.> Oh, pleasure, pleasure! What else should bring one anywhere?
## ... and 752 more
# Get subset of lines that match
lines <- str_subset(play_lines, pattern_2)
# Extract match from lines
who <- str_extract(lines, pattern_2)
# Let's see what we have
unique(who)
## [1] "Algernon." "Lane." "Jack." "Cecily." "Ernest."
## [6] "University." "Gwendolen." "July." "Chasuble." "Merriman."
## [11] "Sunday." "Mr." "London." "Cardew." "Opera."
## [16] "Markby." "Oxonian."
or1(): 여러개의 패턴을 벡터 형태로 전달 가능(or은 벡터로 전달 못함)
# Variables from previous step
characters <- c("Algernon", "Jack", "Lane", "Cecily", "Gwendolen", "Chasuble",
"Merriman", "Lady Bracknell", "Miss Prism")
pattern_3 <- START %R% or1(characters) %R% DOT
# Pull out matches
lines <- str_subset(play_lines, pattern_3)
# Extract match from lines
who <- str_extract(lines, pattern_3)
# Let's see what we have
unique(who)
## [1] "Algernon." "Lane." "Jack." "Cecily."
## [5] "Gwendolen." "Lady Bracknell." "Miss Prism." "Chasuble."
## [9] "Merriman."
# Count lines per character
table(who)
## who
## Algernon. Cecily. Chasuble. Gwendolen. Jack.
## 201 154 42 102 219
## Lady Bracknell. Lane. Merriman. Miss Prism.
## 84 21 17 41
str_to_lower() / str_to_upper(): 소문자 / 대문자로 변환
whole_word(): 정확히 일치하는 단어만 매칭
# 대소문자가 엉망진창으로 섞여 있는 데이터
catcidents <- readRDS("catcidents.rds")
# Construct pattern of DOG in boundaries: whole_word() (rebus) wraps the text
# in word boundaries so only the standalone word "DOG" matches
whole_dog_pattern <- whole_word("DOG")
# See matches to word DOG
str_view(catcidents, whole_dog_pattern, match = TRUE)
## [4] │ bLUNT CHest trAUma, R/o RIb fX, R/O CartiLAgE InJ To RIB cAge; 32YOM walKiNG <DOG>, dog took OfF aFtER cAt,FelL,stRucK CHest oN STepS,hiT rIbS
## [17] │ 67 YO F WENT TO WALK <DOG>, IT STARTED TO CHASE CAT JERKED LEASH PULLED H ER OFF PATIO, FELL HURT ANKLES. DX BILATERAL ANKLE FRACTURES
## [24] │ PUSHING HER UTD WITH SHOTS <DOG> AWAY FROM THE CAT'S BOWL&BITTEN TO FINGE R>>PW/<DOG> BITE
## [30] │ DX R SH PN: 27YOF W/ R SH PN X 5D. STATES WAS YANK' BY HER <DOG> ON LEASH W <DOG> RAN AFTER CAT; WORSE' PN SINCE. FULL ROM BUT VERY PAINFUL TO MOVE
# Transform catcidents to upper case
catcidents_upper <- str_to_upper(catcidents)
# View matches to word "DOG" again
str_view(catcidents_upper, whole_dog_pattern, match = TRUE)
## [4] │ BLUNT CHEST TRAUMA, R/O RIB FX, R/O CARTILAGE INJ TO RIB CAGE; 32YOM WALKING <DOG>, <DOG> TOOK OFF AFTER CAT,FELL,STRUCK CHEST ON STEPS,HIT RIBS
## [6] │ 4YOF <DOG> JUST HAD PUPPIES, CAT TRIED 2 GET PUPPIES, PT THRU CAT DWN STA IRS, LOST FOOTING & FELL DOWN ~12 STEPS; MINOR HEAD INJURY
## [7] │ UNHELMETED 14YOF RIDING HER BIKE WITH HER <DOG> WHEN SHE SAW A CAT AND SW ERVED C/O HEAD/SHOULDER/ELBOW PAIN.DX: MINOR HEAD INJURY,LEFT SHOULDER
## [10] │ RT SHOULDER STRAIN.26YOF WAS WALKING <DOG> ON LEASH AND DOT SAW A CAT AND PULLED LEASH.
## [17] │ 67 YO F WENT TO WALK <DOG>, IT STARTED TO CHASE CAT JERKED LEASH PULLED H ER OFF PATIO, FELL HURT ANKLES. DX BILATERAL ANKLE FRACTURES
## [19] │ 46YOF TAKING <DOG> OUTSIDE, <DOG> BENT HER FINGERS BACK ON A DOOR. <DOG> JERK ED WHEN SAW CAT. HAND HOLDING LEASH CAUGHT ON DOOR JAMB/CT HAND
## [24] │ PUSHING HER UTD WITH SHOTS <DOG> AWAY FROM THE CAT'S BOWL&BITTEN TO FINGE R>>PW/<DOG> BITE
## [30] │ DX R SH PN: 27YOF W/ R SH PN X 5D. STATES WAS YANK' BY HER <DOG> ON LEASH W <DOG> RAN AFTER CAT; WORSE' PN SINCE. FULL ROM BUT VERY PAINFUL TO MOVE
## [34] │ 39YOF <DOG> PULLED HER DOWN THE STAIRS WHILE CHASING A CAT DX: RT ANKLE INJ
## [36] │ 44YOF WALKING <DOG> AND THE DOF TOOK OFF AFTER A CAT AND PULLED PT DOWN B Y THE LEASH STRAINED NECK
# 엉망진창 대소문자 모두 포함시켜서 매칭
# 우선 모든 문자를 대문자로 변환 후 기존 데이터와 매칭
# Which strings match?
has_dog <- str_detect(catcidents_upper, whole_dog_pattern)
# Pull out matching strings in original
catcidents[has_dog]
## [1] "bLUNT CHest trAUma, R/o RIb fX, R/O CartiLAgE InJ To RIB cAge; 32YOM walKiNG DOG, dog took OfF aFtER cAt,FelL,stRucK CHest oN STepS,hiT rIbS"
## [2] "4YOf DOg jUst hAd PUpPieS, Cat TRIED 2 get PuPpIes, pT THru CaT dwn stA Irs, LoST foOTING & FELl down ~12 stePS; MInor hEaD iNJuRY"
## [3] "unhelmeted 14yof riding her bike with her dog when she saw a cat and sw erved c/o head/shoulder/elbow pain.dx: minor head injury,left shoulder"
## [4] "Rt Shoulder Strain.26Yof Was Walking Dog On Leash And Dot Saw A Cat And Pulled Leash."
## [5] "67 YO F WENT TO WALK DOG, IT STARTED TO CHASE CAT JERKED LEASH PULLED H ER OFF PATIO, FELL HURT ANKLES. DX BILATERAL ANKLE FRACTURES"
## [6] "46yof taking dog outside, dog bent her fingers back on a door. dog jerk ed when saw cat. hand holding leash caught on door jamb/ct hand"
## [7] "PUSHING HER UTD WITH SHOTS DOG AWAY FROM THE CAT'S BOWL&BITTEN TO FINGE R>>PW/DOG BITE"
## [8] "DX R SH PN: 27YOF W/ R SH PN X 5D. STATES WAS YANK' BY HER DOG ON LEASH W DOG RAN AFTER CAT; WORSE' PN SINCE. FULL ROM BUT VERY PAINFUL TO MOVE"
## [9] "39Yof dog pulled her down the stairs while chasing a cat dx: rt ankle inj"
## [10] "44Yof Walking Dog And The Dof Took Off After A Cat And Pulled Pt Down B Y The Leash Strained Neck"
regex()함수를 통해 매칭
stringr::regex(ignore_case = TRUE/FALSE)
# View matches to "TRIP"
str_view(catcidents, pattern = "TRIP", match = TRUE)
## [3] │ 87YOF <TRIP>PED OVER CAT, HIT LEG ON STEP. DX LOWER LEG CONTUSION
## [12] │ 31 YOM SUSTAINED A CONTUSION OF A HAND BY <TRIP>PING ON CAT & FALLING ON STAIRS.
## [25] │ DX CALF STRAIN R CALF: 15YOF R CALF PN AFTER FALL ON CARPETED STEPS, TR YING TO STEP OVER CAT, <TRIP>PED ON STAIRS, HIT LEG
## [26] │ DISLOCATION TOE - 80 YO FEMALE REPORTS SHE FELL AT HOME - <TRIP>PED OVER THE CAT LITTER BOX & FELL STRIKING TOE ON DOOR JAMB - ALSO SHOULDER INJ
## [27] │ 73YOF-RADIUS FX-<TRIP>PED OVER CAT LITTER BOX-FELL-@ HOME
## [33] │ FOREHEAD LAC.46YOM <TRIP>PED OVER CAT AND FELL INTO A DOOR FRAME.
## [39] │ PT OPENING HER REFRIGERATOR AND <TRIP>PED OVER A CAT AND FELL ONTO SHOULD ER FRACTURED HUMERUS
# Construct case insensitive pattern
trip_pattern <- stringr::regex("TRIP", ignore_case = TRUE)
# View case insensitive matches to "TRIP"
str_view(catcidents, pattern = trip_pattern, match = TRUE)
## [1] │ 79yOf Fractured fingeR <tRiP>PED ovER cAT ANd fell to FlOOr lAst nIGHT AT HOME*
## [3] │ 87YOF <TRIP>PED OVER CAT, HIT LEG ON STEP. DX LOWER LEG CONTUSION
## [12] │ 31 YOM SUSTAINED A CONTUSION OF A HAND BY <TRIP>PING ON CAT & FALLING ON STAIRS.
## [20] │ 19 YOF-FelL whIle WALKINg DOWn THE sTAIrS & <TRiP>pEd over a caT-fell oNT o "TaIlBoNe" dx coNtusIon LUMBaR, uti *
## [22] │ lEFT KNEE cOntusioN.78YOf <triP>PEd OVEr CaT aND fell and hIt knEE ON the fLoOr.
## [25] │ DX CALF STRAIN R CALF: 15YOF R CALF PN AFTER FALL ON CARPETED STEPS, TR YING TO STEP OVER CAT, <TRIP>PED ON STAIRS, HIT LEG
## [26] │ DISLOCATION TOE - 80 YO FEMALE REPORTS SHE FELL AT HOME - <TRIP>PED OVER THE CAT LITTER BOX & FELL STRIKING TOE ON DOOR JAMB - ALSO SHOULDER INJ
## [27] │ 73YOF-RADIUS FX-<TRIP>PED OVER CAT LITTER BOX-FELL-@ HOME
## [28] │ 57Yom-Back Pain-<Trip>ped Over A Cat-Fell Down 4 Steps-@ Home
## [32] │ 77 Y/o f <trip>ped over cat-c/o shoulder and upper arm pain. Fell to floo r at home. Dx proximal humerus fx
## [33] │ FOREHEAD LAC.46YOM <TRIP>PED OVER CAT AND FELL INTO A DOOR FRAME.
## [39] │ PT OPENING HER REFRIGERATOR AND <TRIP>PED OVER A CAT AND FELL ONTO SHOULD ER FRACTURED HUMERUS
stringi::stri_trans_totitle(type = ...)
type: "word"(기본값), "sentence"(문장 단위로 적용)
stringr::str_to_title()
# Get first five catcidents
cat5 <- catcidents[1:5]
# Take a look at original
writeLines(cat5)
## 79yOf Fractured fingeR tRiPPED ovER cAT ANd fell to FlOOr lAst nIGHT AT HOME*
## 21 YOF REPORTS SUS LACERATION OF HER LEFT HAND WHEN SHE WAS OPENING A CAN OF CAT FOOD JUST PTA. DX HAND LACERATION%
## 87YOF TRIPPED OVER CAT, HIT LEG ON STEP. DX LOWER LEG CONTUSION
## bLUNT CHest trAUma, R/o RIb fX, R/O CartiLAgE InJ To RIB cAge; 32YOM walKiNG DOG, dog took OfF aFtER cAt,FelL,stRucK CHest oN STepS,hiT rIbS
## 42YOF TO ER FOR BACK PAIN AFTER PUTTING DOWN SOME CAT LITTER DX: BACK PAIN, SCIATICA
# Transform to title case
writeLines(str_to_title(cat5))
## 79yof Fractured Finger Tripped Over Cat And Fell To Floor Last Night At Home*
## 21 Yof Reports Sus Laceration Of Her Left Hand When She Was Opening A Can Of Cat Food Just Pta. Dx Hand Laceration%
## 87yof Tripped Over Cat, Hit Leg On Step. Dx Lower Leg Contusion
## Blunt Chest Trauma, R/O Rib Fx, R/O Cartilage Inj To Rib Cage; 32yom Walking Dog, Dog Took Off After Cat,Fell,Struck Chest On Steps,Hit Ribs
## 42yof To Er For Back Pain After Putting Down Some Cat Litter Dx: Back Pain, Sciatica
# Transform to title case with stringi
writeLines(stri_trans_totitle(cat5))
## 79yof Fractured Finger Tripped Over Cat And Fell To Floor Last Night At Home*
## 21 Yof Reports Sus Laceration Of Her Left Hand When She Was Opening A Can Of Cat Food Just Pta. Dx Hand Laceration%
## 87yof Tripped Over Cat, Hit Leg On Step. Dx Lower Leg Contusion
## Blunt Chest Trauma, R/O Rib Fx, R/O Cartilage Inj To Rib Cage; 32yom Walking Dog, Dog Took Off After Cat,Fell,Struck Chest On Steps,Hit Ribs
## 42yof To Er For Back Pain After Putting Down Some Cat Litter Dx: Back Pain, Sciatica
# Transform to sentence case with stringi
writeLines(stri_trans_totitle(cat5, type = "sentence"))
## 79yof fractured finger tripped over cat and fell to floor last night at home*
## 21 yof reports sus laceration of her left hand when she was opening a can of cat food just pta. Dx hand laceration%
## 87yof tripped over cat, hit leg on step. Dx lower leg contusion
## Blunt chest trauma, r/o rib fx, r/o cartilage inj to rib cage; 32yom walking dog, dog took off after cat,fell,struck chest on steps,hit ribs
## 42yof to er for back pain after putting down some cat litter dx: back pain, sciatica
stringr functions
stringi when stringr doesn't solve your problem
stri_ (stringi 함수 접두사)
Regular expressions