# Install any missing course packages, then reset the workspace and locale.
packages <- c(
  "dplyr", "ggplot2", "stringr", "dslabs", "readr", "tidyr", "purrr",
  "lubridate", "rvest"
)
# requireNamespace() checks one package at a time without building the full
# installed.packages() matrix, which R's documentation warns can be slow.
is_installed <- vapply(packages, requireNamespace, logical(1), quietly = TRUE)
if (!all(is_installed)) {
  # install.packages() is vectorised, so one call covers every missing package.
  install.packages(packages[!is_installed])
}
# Remove every object, including dot-prefixed ones, from the global
# environment. `all.names = TRUE` is spelled out because `all=T` depended on
# partial argument matching and on `T`, which is an ordinary variable that can
# be reassigned.
rm(list = ls(all.names = TRUE))
# Switch every locale category to "C".
Sys.setlocale("LC_ALL", "C")
[1] "C"
options(digits=4, scipen=12)
library(rvest)
library(readr)
library(dplyr)
library(ggplot2)
library(stringr)
library(lubridate)
library(tidyr)
library(dslabs)
library(rvest)
# Download the Wikipedia page and convert its second <table> node into a data
# frame with short column names.
url <- "https://en.wikipedia.org/wiki/Murder_in_the_United_States_by_state"
h <- read_html(url)
tab <- setNames(
  html_table(html_nodes(h, "table")[[2]]),
  c(
    "state", "population", "total", "murders", "gun_murders",
    # NOTE(review): "gun_ownersjip" looks like a typo for "gun_ownership";
    # left unchanged here because it is a runtime column name.
    "gun_ownersjip", "total_rate", "murder_rate", "gun_murder_rate"
  )
)
Q1: Which of the following is NOT an application of string parsing?
Q1: Which of the following commands would not give you an error in R?
cat(" LeBron James is 6'8\" ")
LeBron James is 6'8"
stringr Package
Q1: Which of the following are advantages of the stringr package over string processing functions in base R? Select all that apply.
sapply(tab, str_detect, ",") %>% colSums
state population total murders
0 51 3 2
gun_murders gun_ownersjip total_rate murder_rate
1 0 0 0
gun_murder_rate
0
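# A vector is a collection (column) of values of one type: character, numeric, logical, etc.
# sapply() applies str_detect() to each column of tab (a data frame) and returns a logical matrix; colSums() then counts the matches per column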
tab2 = tab %>% mutate_at(2:3, parse_number)
# mutate_at() transforms the selected columns # parse_number() drops the commas and converts the strings to numbers
sapply(tab2, str_detect, ",") %>% colSums
state population total murders
0 0 0 2
gun_murders gun_ownersjip total_rate murder_rate
1 0 0 0
gun_murder_rate
0
Q1: You have a dataframe of monthly sales and profits in R
# Read the sales table; sep = "" splits fields on whitespace, and text columns
# are kept as character. TRUE/FALSE are written out because T and F are
# ordinary variables that can be reassigned.
dat <- read.table(
  "data/sales.txt",
  header = TRUE, sep = "", stringsAsFactors = FALSE
)
dat
Month Sales Profit
1 January $128,568 $16,234
2 February $109,523 $12,876
3 March $115,468 $17,920
4 April $122,274 $15,825
5 May $117,921 $15,437
Which of the following commands could convert the sales and profits columns to numeric? Select all that apply.
dat %>% mutate_at(2:3, parse_number)
Month Sales Profit
1 January 128568 16234
2 February 109523 12876
3 March 115468 17920
4 April 122274 15825
5 May 117921 15437
# Strip "$" and "," from columns 2-3. A formula lambda replaces funs(), which
# has been deprecated since dplyr 0.8.0. The columns remain character.
dat %>% mutate_at(2:3, ~ str_replace_all(., "\\$|,", ""))
Month Sales Profit
1 January 128568 16234
2 February 109523 12876
3 March 115468 17920
4 April 122274 15825
5 May 117921 15437
dat %>% mutate_at(2:3, str_replace_all, "\\$|,", "")
Month Sales Profit
1 January 128568 16234
2 February 109523 12876
3 March 115468 17920
4 April 122274 15825
5 May 117921 15437
# Replace every "$" or "," with "" (the empty string)
dat %>% mutate_all(2:3, parse_number)
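# mutate_all() applies a function to every column and does not take a column selection, so this call fails: 2:3 lands in the function argument. Even without it, the Month column contains no number for parse_number() to parse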
dat$Profit <- str_replace_all(dat$Profit, c("\\$|,"), "")
dat$Sales <- parse_number(dat$Sales)
dat
Month Sales Profit
1 January 128568 16234
2 February 109523 12876
3 March 115468 17920
4 April 122274 15825
5 May 117921 15437
library(dslabs)
data(reported_heights)
reported_heights %>% head
time_stamp sex height
1 2014-09-02 13:40:36 Male 75
2 2014-09-02 13:46:59 Male 70
3 2014-09-02 13:59:20 Male 68
4 2014-09-02 14:51:53 Male 74
5 2014-09-02 15:16:15 Male 61
6 2014-09-02 15:16:16 Female 65
reported_heights %>%
mutate(new_height = as.numeric(height)) %>%
filter(is.na(new_height)) %>%
getElement("height")
NAs introduced by coercion
[1] "5' 4\"" "165cm"
[3] "5'7" ">9000"
[5] "5'7\"" "5'3\""
[7] "5 feet and 8.11 inches" "5'11"
[9] "5'9''" "5'10''"
[11] "5,3" "6'"
[13] "6,8" "5' 10"
[15] "Five foot eight inches" "5'5\""
[17] "5'2\"" "5,4"
[19] "5'3" "5'10''"
[21] "5'3''" "5'7''"
[23] "5'12" "2'33"
[25] "5'11" "5'3\""
[27] "5,8" "5'6''"
[29] "5'4" "1,70"
[31] "5'7.5''" "5'7.5''"
[33] "5'2\"" "5' 7.78\""
[35] "yyy" "5'5"
[37] "5'8" "5'6"
[39] "5 feet 7inches" "6*12"
[41] "5 .11" "5 11"
[43] "5'4" "5'8\""
[45] "5'5" "5'7"
[47] "5'6" "5'11\""
[49] "5'7\"" "5'7"
[51] "5'8" "5' 11\""
[53] "6'1\"" "69\""
[55] "5' 7\"" "5'10''"
[57] "5'10" "5'10"
[59] "5ft 9 inches" "5 ft 9 inches"
[61] "5'2" "5'11"
[63] "5'11''" "5'8\""
[65] "708,661" "5 feet 6 inches"
[67] "5'10''" "5'8"
[69] "6'3\"" "649,606"
[71] "728,346" "6 04"
[73] "5'9" "5'5''"
[75] "5'7\"" "6'4\""
[77] "5'4" "170 cm"
[79] "7,283,465" "5'6"
[81] "5'6"
# mutate是加一個新的變數 # 轉不成功的會變成NA
reported_heights %>%
filter(is.na(as.numeric(height))) %>%
.$height
NAs introduced by coercion
[1] "5' 4\"" "165cm"
[3] "5'7" ">9000"
[5] "5'7\"" "5'3\""
[7] "5 feet and 8.11 inches" "5'11"
[9] "5'9''" "5'10''"
[11] "5,3" "6'"
[13] "6,8" "5' 10"
[15] "Five foot eight inches" "5'5\""
[17] "5'2\"" "5,4"
[19] "5'3" "5'10''"
[21] "5'3''" "5'7''"
[23] "5'12" "2'33"
[25] "5'11" "5'3\""
[27] "5,8" "5'6''"
[29] "5'4" "1,70"
[31] "5'7.5''" "5'7.5''"
[33] "5'2\"" "5' 7.78\""
[35] "yyy" "5'5"
[37] "5'8" "5'6"
[39] "5 feet 7inches" "6*12"
[41] "5 .11" "5 11"
[43] "5'4" "5'8\""
[45] "5'5" "5'7"
[47] "5'6" "5'11\""
[49] "5'7\"" "5'7"
[51] "5'8" "5' 11\""
[53] "6'1\"" "69\""
[55] "5' 7\"" "5'10''"
[57] "5'10" "5'10"
[59] "5ft 9 inches" "5 ft 9 inches"
[61] "5'2" "5'11"
[63] "5'11''" "5'8\""
[65] "708,661" "5 feet 6 inches"
[67] "5'10''" "5'8"
[69] "6'3\"" "649,606"
[71] "728,346" "6 04"
[73] "5'9" "5'5''"
[75] "5'7\"" "6'4\""
[77] "5'4" "170 cm"
[79] "7,283,465" "5'6"
[81] "5'6"
reported_heights$height %>% .[is.na(as.numeric(.))]
NAs introduced by coercion
[1] "5' 4\"" "165cm"
[3] "5'7" ">9000"
[5] "5'7\"" "5'3\""
[7] "5 feet and 8.11 inches" "5'11"
[9] "5'9''" "5'10''"
[11] "5,3" "6'"
[13] "6,8" "5' 10"
[15] "Five foot eight inches" "5'5\""
[17] "5'2\"" "5,4"
[19] "5'3" "5'10''"
[21] "5'3''" "5'7''"
[23] "5'12" "2'33"
[25] "5'11" "5'3\""
[27] "5,8" "5'6''"
[29] "5'4" "1,70"
[31] "5'7.5''" "5'7.5''"
[33] "5'2\"" "5' 7.78\""
[35] "yyy" "5'5"
[37] "5'8" "5'6"
[39] "5 feet 7inches" "6*12"
[41] "5 .11" "5 11"
[43] "5'4" "5'8\""
[45] "5'5" "5'7"
[47] "5'6" "5'11\""
[49] "5'7\"" "5'7"
[51] "5'8" "5' 11\""
[53] "6'1\"" "69\""
[55] "5' 7\"" "5'10''"
[57] "5'10" "5'10"
[59] "5ft 9 inches" "5 ft 9 inches"
[61] "5'2" "5'11"
[63] "5'11''" "5'8\""
[65] "708,661" "5 feet 6 inches"
[67] "5'10''" "5'8"
[69] "6'3\"" "649,606"
[71] "728,346" "6 04"
[73] "5'9" "5'5''"
[75] "5'7\"" "6'4\""
[77] "5'4" "170 cm"
[79] "7,283,465" "5'6"
[81] "5'6"
# Flag entries that are not plausible heights in inches.
# x: vector of reported heights; coerced with as.numeric().
# smallest, tallest: inclusive bounds of the accepted range.
# Returns a logical vector with one element per entry of x: TRUE when the
# entry does not coerce to a number or lies outside [smallest, tallest].
not_inches <- function(x, smallest = 50, tallest = 84) {
  # Entries that cannot be coerced become NA; that warning is expected here.
  height_num <- suppressWarnings(as.numeric(x))
  too_small <- height_num < smallest
  too_large <- height_num > tallest
  is.na(height_num) | too_small | too_large
}
# Heights are accepted only inside the plausible 50-84 inch range.
problems = reported_heights$height %>% .[not_inches(.)]
problems
[1] "6" "5' 4\""
[3] "5.3" "165cm"
[5] "511" "6"
[7] "2" "5'7"
[9] ">9000" "5'7\""
[11] "5'3\"" "5 feet and 8.11 inches"
[13] "5.25" "5'11"
[15] "5.5" "11111"
[17] "5'9''" "6"
[19] "6.5" "150"
[21] "5'10''" "103.2"
[23] "5.8" "19"
[25] "5" "5.6"
[27] "175" "177"
[29] "300" "5,3"
[31] "6'" "6"
[33] "5.9" "6,8"
[35] "5' 10" "5.5"
[37] "178" "163"
[39] "6.2" "175"
[41] "Five foot eight inches" "6.2"
[43] "5.8" "5.1"
[45] "178" "165"
[47] "5.11" "5'5\""
[49] "165" "180"
[51] "5'2\"" "5.75"
[53] "169" "5,4"
[55] "7" "5.4"
[57] "157" "6.1"
[59] "169" "5'3"
[61] "5.6" "214"
[63] "183" "5.6"
[65] "6" "162"
[67] "178" "180"
[69] "5'10''" "170"
[71] "5'3''" "178"
[73] "0.7" "190"
[75] "5.4" "184"
[77] "5'7''" "5.9"
[79] "5'12" "5.6"
[81] "5.6" "184"
[83] "6" "167"
[85] "2'33" "5'11"
[87] "5'3\"" "5.5"
[89] "5.2" "180"
[91] "5.5" "5.5"
[93] "6.5" "5,8"
[95] "180" "183"
[97] "170" "5'6''"
[99] "172" "612"
[101] "5.11" "168"
[103] "5'4" "1,70"
[105] "172" "87"
[107] "5.5" "176"
[109] "5'7.5''" "5'7.5''"
[111] "111" "5'2\""
[113] "173" "174"
[115] "176" "175"
[117] "5' 7.78\"" "6.7"
[119] "12" "6"
[121] "5.1" "5.6"
[123] "5.5" "yyy"
[125] "5.2" "5'5"
[127] "5'8" "5'6"
[129] "5 feet 7inches" "89"
[131] "5.6" "5.7"
[133] "183" "172"
[135] "34" "25"
[137] "6" "5.9"
[139] "168" "6.5"
[141] "170" "175"
[143] "6" "22"
[145] "5.11" "684"
[147] "6" "1"
[149] "1" "6*12"
[151] "5 .11" "87"
[153] "162" "165"
[155] "184" "6"
[157] "173" "1.6"
[159] "172" "170"
[161] "5.7" "5.5"
[163] "174" "170"
[165] "160" "120"
[167] "120" "23"
[169] "192" "5 11"
[171] "167" "150"
[173] "1.7" "174"
[175] "5.8" "6"
[177] "5'4" "5'8\""
[179] "5'5" "5.8"
[181] "5.1" "5.11"
[183] "5.7" "5'7"
[185] "5'6" "5'11\""
[187] "5'7\"" "5'7"
[189] "172" "5'8"
[191] "180" "5' 11\""
[193] "5" "180"
[195] "180" "6'1\""
[197] "5.9" "5.2"
[199] "5.5" "69\""
[201] "5' 7\"" "5'10''"
[203] "5.51" "5'10"
[205] "5'10" "5ft 9 inches"
[207] "5 ft 9 inches" "5'2"
[209] "5'11" "5.8"
[211] "5.7" "167"
[213] "168" "6"
[215] "6.1" "5'11''"
[217] "5.69" "178"
[219] "182" "164"
[221] "5'8\"" "185"
[223] "6" "86"
[225] "5.7" "708,661"
[227] "5.25" "5.5"
[229] "5 feet 6 inches" "5'10''"
[231] "172" "6"
[233] "5'8" "160"
[235] "6'3\"" "649,606"
[237] "10000" "5.1"
[239] "152" "1"
[241] "180" "728,346"
[243] "175" "158"
[245] "173" "164"
[247] "6 04" "169"
[249] "0" "185"
[251] "168" "5'9"
[253] "169" "5'5''"
[255] "174" "6.3"
[257] "179" "5'7\""
[259] "5.5" "6"
[261] "6" "170"
[263] "6" "172"
[265] "158" "100"
[267] "159" "190"
[269] "5.7" "170"
[271] "158" "6'4\""
[273] "180" "5.57"
[275] "5'4" "210"
[277] "88" "6"
[279] "162" "170 cm"
[281] "5.7" "170"
[283] "157" "186"
[285] "170" "7,283,465"
[287] "5" "5"
[289] "34" "161"
[291] "5'6" "5'6"
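# A vector can be filtered with .[ ]; "." refers to the value piped in from the left (wrapping the [ ] expression in { } is safer)
# Shorthand for reported_heights$height[ not_inches(reported_heights$height) ]
# The result is a character vector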
str_subset(problems, "cm|inches")
[1] "165cm" "5 feet and 8.11 inches"
[3] "Five foot eight inches" "5 feet 7inches"
[5] "5ft 9 inches" "5 ft 9 inches"
[7] "5 feet 6 inches" "170 cm"
str_subset(problems, "cm|inches") %>% str_extract("cm|inches")
[1] "cm" "inches" "inches" "inches" "inches" "inches" "inches"
[8] "cm"
str_extract(problems, "cm|inches")
[1] NA NA NA "cm" NA NA NA
[8] NA NA NA NA "inches" NA NA
[15] NA NA NA NA NA NA NA
[22] NA NA NA NA NA NA NA
[29] NA NA NA NA NA NA NA
[36] NA NA NA NA NA "inches" NA
[43] NA NA NA NA NA NA NA
[50] NA NA NA NA NA NA NA
[57] NA NA NA NA NA NA NA
[64] NA NA NA NA NA NA NA
[71] NA NA NA NA NA NA NA
[78] NA NA NA NA NA NA NA
[85] NA NA NA NA NA NA NA
[92] NA NA NA NA NA NA NA
[99] NA NA NA NA NA NA NA
[106] NA NA NA NA NA NA NA
[113] NA NA NA NA NA NA NA
[120] NA NA NA NA NA NA NA
[127] NA NA "inches" NA NA NA NA
[134] NA NA NA NA NA NA NA
[141] NA NA NA NA NA NA NA
[148] NA NA NA NA NA NA NA
[155] NA NA NA NA NA NA NA
[162] NA NA NA NA NA NA NA
[169] NA NA NA NA NA NA NA
[176] NA NA NA NA NA NA NA
[183] NA NA NA NA NA NA NA
[190] NA NA NA NA NA NA NA
[197] NA NA NA NA NA NA NA
[204] NA NA "inches" "inches" NA NA NA
[211] NA NA NA NA NA NA NA
[218] NA NA NA NA NA NA NA
[225] NA NA NA NA "inches" NA NA
[232] NA NA NA NA NA NA NA
[239] NA NA NA NA NA NA NA
[246] NA NA NA NA NA NA NA
[253] NA NA NA NA NA NA NA
[260] NA NA NA NA NA NA NA
[267] NA NA NA NA NA NA NA
[274] NA NA NA NA NA NA "cm"
[281] NA NA NA NA NA NA NA
[288] NA NA NA NA NA
Q1: In the video, we use the function not_inches to identify heights that were incorrectly entered
# Returns a logical vector marking the entries of x that are not usable as heights in inches.
not_inches <- function(x, smallest = 50, tallest = 84) {
# Entries that cannot be coerced to numbers become NA; the coercion warning is suppressed.
inches <- suppressWarnings(as.numeric(x))
# TRUE when the entry is NA after coercion or lies outside [smallest, tallest].
ind <- is.na(inches) | inches < smallest | inches > tallest
ind
}
In this function, what TWO types of values are identified as not being correctly formatted in inches?
Q2: Which of the following arguments, when passed to the function not_inches, would return the vector c(FALSE)?
c(70) %>% not_inches
[1] FALSE
Q3: Our function not_inches returns the object ind. Which answer correctly describes ind?
ind is a logical vector of TRUE and FALSE, equal in length to the vector x (in the arguments list). TRUE indicates that a height entry is incorrectly formatted.
Q1: Given the following code
s = c("70" ,"5 ft", "4'11", "", ".", "Six feet")
What pattern vector yields the following result?
pattern = "\\d|ft"
# 對應阿拉伯數字:\d,但由於在R裡面要在""裡打\,就要打兩次 ex.cat("\\") = "\"
# \' 是 ' \" 是 " \. 是 .
str_subset(s, pattern)
[1] "70" "5 ft" "4'11"
Character Classes - []
yes = as.character(4:7)
no = as.character(1:3)
str_detect(c(yes,no), "[4-7]")
[1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE
# [4-7]是4到7 # [A-Z]是全部大寫字母 # [a-zA-Z]是所有英文字母
# []裡面都是character,所以[1-20]是0,1,2而非1~20
# [0-9] == \\d
Anchors - ^ and $ (^是開始/$是結束的條件)
yes = c("1","5","9")
no = c("12","123"," 1","a4","b")
str_detect(c(yes,no), "^\\d$")
[1] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
# 選擇開始第一個字是阿拉伯數字就結束
# str_detect(c(yes,no), "\\d$") 會變成只要阿拉伯數字結尾就好
Qualifiers - {} ({}前面的字元有幾個)
yes = c("1","5","9","12")
no = c("123","a4","b")
str_detect(c(yes,no), "^\\d{1,2}$")
[1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE
# 開始和結尾是阿拉伯數字,可以有1~2個數字
Pattern of Feet & Inches
pattern = "^[4-7]'\\d{1,2}\"$"
yes = c("5'7\"", "6'2\"", "5'12\"")
no = c("6,2\"", "6.2\"", "I am 5'11\"", "3'2\"", "64")
str_detect(c(yes,no), pattern)
[1] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
# 開頭要是4-7,加個',再來1~2個阿拉伯數字,加個",結束 ex. 5'7"
Q1: You enter the following set of commands into your R console. What is your printed result?
animals <- c("cat", "puppy", "Moose", "MONKEY")
pattern <- "[a-z]"
str_detect(animals, pattern)
[1] TRUE TRUE TRUE FALSE
# 要有小寫字母
Q2: You enter the following set of commands into your R console. What is your printed result?
animals <- c("cat", "puppy", "Moose", "MONKEY")
pattern <- "[A-Z]$"
str_detect(animals, pattern)
[1] FALSE FALSE FALSE TRUE
# 結尾要是大寫字母
Q3: You enter the following set of commands into your R console. What is your printed result?
animals <- c("cat", "puppy", "Moose", "MONKEY")
pattern <- "[a-z]{4,5}"
str_detect(animals, pattern)
[1] FALSE TRUE TRUE FALSE
# 要包含4到5個小寫字母
Initial Pattern
pattern = "^[4-7]'\\d{1,2}$"
str_subset(problems, pattern) # 24
[1] "5'7" "5'11" "5'3" "5'12" "5'11" "5'4" "5'5" "5'8" "5'6"
[10] "5'4" "5'5" "5'7" "5'6" "5'7" "5'8" "5'10" "5'10" "5'2"
[19] "5'11" "5'8" "5'9" "5'4" "5'6" "5'6"
Replace Feet and Inches
pattern = "^[4-7]'\\d{1,2}$"
problems %>%
str_replace("feet|ft|foot","'") %>%
str_replace("inches|in|''|\"","") %>%
str_subset(pattern)
[1] "5'7" "5'7" "5'3" "5'11" "5'9" "5'10" "5'5" "5'2" "5'3"
[10] "5'10" "5'3" "5'7" "5'12" "5'11" "5'3" "5'6" "5'4" "5'2"
[19] "5'5" "5'8" "5'6" "5'4" "5'8" "5'5" "5'7" "5'6" "5'11"
[28] "5'7" "5'7" "5'8" "6'1" "5'10" "5'10" "5'10" "5'2" "5'11"
[37] "5'11" "5'8" "5'10" "5'8" "6'3" "5'9" "5'5" "5'7" "6'4"
[46] "5'4" "5'6" "5'6"
# str_detect(pattern) %>% sum # 48
More Qualifiers
* : 0 or more
+ : 1 or more
? : 0 or 1
yes = c("AB","A1B","A11B","A111B","A1111B")
no = c("A2B","A21B")
str_detect(c(yes,no), "A1*B")
[1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE
# 在A後面可以有0或更多的1
Space - \\s
pattern = "^[4-7]\\s*'\\s*\\d{1,2}$"
problems %>%
str_replace("feet|ft|foot","'") %>%
str_replace("inches|in|''|\"","") %>%
str_detect(pattern) %>%
sum # 53
[1] 53
# 空白鍵: \s , 在R就要: \\s
# 用星號代表不限空白鍵的多寡
Q1: Given the following code, which TWO pattern vectors would yield the following result?
animals <- c("moose", "monkey", "meerkat", "mountain lion")
pattern = c("mo*","mo?","mo+","moo*")
sapply(pattern, function(p) str_detect(animals, p)) %>% t
[,1] [,2] [,3] [,4]
mo* TRUE TRUE TRUE TRUE
mo? TRUE TRUE TRUE TRUE
mo+ TRUE TRUE FALSE TRUE
moo* TRUE TRUE FALSE TRUE
# moo*是在mo後面有0或更多的o
Q2: You are working on some data from different universities. You have the following vector
schools = c(
"U. Kentucky","Univ New Hampshire","Univ. of Massachusetts",
"University Georgia","U California","California State University"
)
You want to clean this data to match the full names of each university. Which of the following commands could accomplish this?
schools %>%
str_replace("^Univ\\.?\\s|^U\\.?\\s", "University ") %>%
str_replace("^University of |^University ", "University of ")
[1] "University of Kentucky" "University of New Hampshire"
[3] "University of Massachusetts" "University of Georgia"
[5] "University of California" "California State University"
Define Groups ()
pattern_no_group = "^[4-7],\\d*$"
pattern_group = "^([4-7]),(\\d*)$"
yes = c("5,9","5,11","6,","6,1")
no = c("5'9",",","2,8","6.1.1")
s = c(yes, no)
# 括號把他們分group
Groups do not affect pattern detection
str_detect(s, pattern_no_group)
[1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
str_detect(s, pattern_group)
[1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
The difference between
str_match(), str_extract(), str_subset(), str_detect()
str_match(s, pattern_group)
[,1] [,2] [,3]
[1,] "5,9" "5" "9"
[2,] "5,11" "5" "11"
[3,] "6," "6" ""
[4,] "6,1" "6" "1"
[5,] NA NA NA
[6,] NA NA NA
[7,] NA NA NA
[8,] NA NA NA
str_extract(s, pattern_group)
[1] "5,9" "5,11" "6," "6,1" NA NA NA NA
str_subset(s, pattern_group)
[1] "5,9" "5,11" "6," "6,1"
str_detect(s, pattern_group)
[1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
Replace with Group
pattern = "^([4-7])\\s*[,\\.\\s+]\\s*(\\d*)$"
str_subset(problems, pattern) %>% str_match(pattern)
[,1] [,2] [,3]
[1,] "5.3" "5" "3"
[2,] "5.25" "5" "25"
[3,] "5.5" "5" "5"
[4,] "6.5" "6" "5"
[5,] "5.8" "5" "8"
[6,] "5.6" "5" "6"
[7,] "5,3" "5" "3"
[8,] "5.9" "5" "9"
[9,] "6,8" "6" "8"
[10,] "5.5" "5" "5"
[11,] "6.2" "6" "2"
[12,] "6.2" "6" "2"
[13,] "5.8" "5" "8"
[14,] "5.1" "5" "1"
[15,] "5.11" "5" "11"
[16,] "5.75" "5" "75"
[17,] "5,4" "5" "4"
[18,] "5.4" "5" "4"
[19,] "6.1" "6" "1"
[20,] "5.6" "5" "6"
[21,] "5.6" "5" "6"
[22,] "5.4" "5" "4"
[23,] "5.9" "5" "9"
[24,] "5.6" "5" "6"
[25,] "5.6" "5" "6"
[26,] "5.5" "5" "5"
[27,] "5.2" "5" "2"
[28,] "5.5" "5" "5"
[29,] "5.5" "5" "5"
[30,] "6.5" "6" "5"
[31,] "5,8" "5" "8"
[32,] "5.11" "5" "11"
[33,] "5.5" "5" "5"
[34,] "6.7" "6" "7"
[35,] "5.1" "5" "1"
[36,] "5.6" "5" "6"
[37,] "5.5" "5" "5"
[38,] "5.2" "5" "2"
[39,] "5.6" "5" "6"
[40,] "5.7" "5" "7"
[41,] "5.9" "5" "9"
[42,] "6.5" "6" "5"
[43,] "5.11" "5" "11"
[44,] "5 .11" "5" "11"
[45,] "5.7" "5" "7"
[46,] "5.5" "5" "5"
[47,] "5 11" "5" "11"
[48,] "5.8" "5" "8"
[49,] "5.8" "5" "8"
[50,] "5.1" "5" "1"
[51,] "5.11" "5" "11"
[52,] "5.7" "5" "7"
[53,] "5.9" "5" "9"
[54,] "5.2" "5" "2"
[55,] "5.5" "5" "5"
[56,] "5.51" "5" "51"
[57,] "5.8" "5" "8"
[58,] "5.7" "5" "7"
[59,] "6.1" "6" "1"
[60,] "5.69" "5" "69"
[61,] "5.7" "5" "7"
[62,] "5.25" "5" "25"
[63,] "5.5" "5" "5"
[64,] "5.1" "5" "1"
[65,] "6 04" "6" "04"
[66,] "6.3" "6" "3"
[67,] "5.5" "5" "5"
[68,] "5.7" "5" "7"
[69,] "5.57" "5" "57"
[70,] "5.7" "5" "7"
str_subset(problems, pattern) %>%
str_replace(pattern, "\\1'\\2")
[1] "5'3" "5'25" "5'5" "6'5" "5'8" "5'6" "5'3" "5'9" "6'8"
[10] "5'5" "6'2" "6'2" "5'8" "5'1" "5'11" "5'75" "5'4" "5'4"
[19] "6'1" "5'6" "5'6" "5'4" "5'9" "5'6" "5'6" "5'5" "5'2"
[28] "5'5" "5'5" "6'5" "5'8" "5'11" "5'5" "6'7" "5'1" "5'6"
[37] "5'5" "5'2" "5'6" "5'7" "5'9" "6'5" "5'11" "5'11" "5'7"
[46] "5'5" "5'11" "5'8" "5'8" "5'1" "5'11" "5'7" "5'9" "5'2"
[55] "5'5" "5'51" "5'8" "5'7" "6'1" "5'69" "5'7" "5'25" "5'5"
[64] "5'1" "6'04" "6'3" "5'5" "5'7" "5'57" "5'7"
# \\1代表group1;\\2代表group2
# str_replace(pattern, "\\1 feets and \\2 inches")
# 挖出來再重新定義
Q1: Rather than using the pattern_with_groups vector from the video, you accidentally write in the following code. What is your result?
pattern_w_groups = "^([4-7])\\s*[,\\.\\s+]\\s*(\\d*)$"
problems1 <- c("5.3", "5,5", "6 1", "5 .11", "5, 12")
pattern_with_groups <- "^([4-7])[,\\.](\\d*)$"
str_replace(problems1, pattern_with_groups, "\\1'\\2")
[1] "5'3" "5'5" "6 1" "5 .11" "5, 12"
Q2: You notice your mistake and correct your pattern regex to the following. What is your result?
problems1 <- c("5.3", "5,5", "6 1", "5 .11", "5, 12")
pattern_with_groups <- "^([4-7])[,\\.\\s](\\d*)$"
str_replace(problems1, pattern_with_groups, "\\1'\\2")
[1] "5'3" "5'5" "6'1" "5 .11" "5, 12"
I think what it intends to do is …
problems1 <- c("5.3", "5,5", "6 1", "5 .11", "5, 12")
pattern_with_groups <- "^([4-7])\\s*[,\\.\\s]\\s*(\\d*)$"
str_replace(problems1, pattern_with_groups, "\\1'\\2")
[1] "5'3" "5'5" "6'1" "5'11" "5'12"
converted <- problems %>%
str_replace("feet|foot|ft", "'") %>%
str_replace("inches|in|''|\"", "") %>%
str_replace("^([4-7])\\s*[,\\.\\s+]\\s*(\\d*)$", "\\1'\\2")
pattern <- "^[4-7]\\s*'\\s*\\d{1,2}$"
index <- str_detect(converted, pattern)
mean(index) # 0.42123
[1] 0.4212
converted[!index]
[1] "6" "165cm" "511" "6"
[5] "2" ">9000" "5 ' and 8.11 " "11111"
[9] "6" "150" "103.2" "19"
[13] "5" "175" "177" "300"
[17] "6'" "6" "178" "163"
[21] "175" "Five ' eight " "178" "165"
[25] "165" "180" "169" "7"
[29] "157" "169" "214" "183"
[33] "6" "162" "178" "180"
[37] "170" "178" "0.7" "190"
[41] "184" "184" "6" "167"
[45] "2'33" "180" "180" "183"
[49] "170" "172" "612" "168"
[53] "1,70" "172" "87" "176"
[57] "5'7.5" "5'7.5" "111" "173"
[61] "174" "176" "175" "5' 7.78"
[65] "12" "6" "yyy" "89"
[69] "183" "172" "34" "25"
[73] "6" "168" "170" "175"
[77] "6" "22" "684" "6"
[81] "1" "1" "6*12" "87"
[85] "162" "165" "184" "6"
[89] "173" "1.6" "172" "170"
[93] "174" "170" "160" "120"
[97] "120" "23" "192" "167"
[101] "150" "1.7" "174" "6"
[105] "172" "180" "5" "180"
[109] "180" "69" "5' 9 " "5 ' 9 "
[113] "167" "168" "6" "178"
[117] "182" "164" "185" "6"
[121] "86" "708,661" "5 ' 6 " "172"
[125] "6" "160" "649,606" "10000"
[129] "152" "1" "180" "728,346"
[133] "175" "158" "173" "164"
[137] "169" "0" "185" "168"
[141] "169" "174" "179" "6"
[145] "6" "170" "6" "172"
[149] "158" "100" "159" "190"
[153] "170" "158" "180" "210"
[157] "88" "6" "162" "170 cm"
[161] "170" "157" "186" "170"
[165] "7,283,465" "5" "5" "34"
[169] "161"
Q1: In our example, we use the following code to detect height entries that do not match our pattern of x’y”.
problems1 <- c("5.3", "5,5", "6 1", "5 .11", "5, 12")
converted1 <- problems1 %>%
str_replace("feet|foot|ft", "'") %>%
str_replace("inches|in|''|\"", "") %>%
str_replace("^([4-7])\\s*[,\\.\\s+]\\s*(\\d*)$", "\\1'\\2")
pattern <- "^[4-7]\\s*'\\s*\\d{1,2}$"
index <- str_detect(converted1, pattern)
converted1[!index]
Which answer best describes the differences between the regex string we use as an argument in
str_replace("^([4-7])\\s*[,\\.\\s+]\\s*(\\d*)$", "\\1'\\2")
And the regex string in
pattern <- "^[4-7]\\s*'\\s*\\d{1,2}$"?
Q2: You notice a few entries that are not being properly converted using your str_replace and str_detect code
yes <- c("5 feet 7inches")
no <- c("5ft 9 inches", "5 ft 9 inches")
s <- c(yes, no)
converted <- s %>%
str_replace("feet|foot|ft", "'") %>%
str_replace("inches|in|''|\"", "") %>%
str_replace("^([4-7])\\s*[,\\.\\s+]\\s*(\\d*)$", "\\1'\\2")
converted
[1] "5 ' 7" "5' 9 " "5 ' 9 "
pattern <- "^[4-7]\\s*'\\s*\\d{1,2}$"
str_detect(converted, pattern)
[1] TRUE FALSE FALSE
It seems like the problem may be due to spaces around the words feet|foot|ft and inches|in. What is another way you could fix this problem?
# Normalise unit words to the x'y form before pattern detection.
# Each alternation is wrapped in a group so the surrounding \\s* applies to
# every alternative. Ungrouped, "\\s*feet|foot|ft\\s*" parses as
# (\\s*feet)|(foot)|(ft\\s*), which trims whitespace on one side only and
# leaves results such as "5 '9".
converted <- s %>%
  str_replace("\\s*(feet|foot|ft)\\s*", "'") %>%
  str_replace("\\s*(inches|in|''|\")\\s*", "") %>%
  str_replace("^([4-7])\\s*[,\\.\\s+]\\s*(\\d*)$", "\\1'\\2")
converted
[1] "5' 7" "5'9" "5 '9"
pattern <- "^[4-7]\\s*'\\s*\\d{1,2}$"
str_detect(converted, pattern)
[1] TRUE TRUE TRUE
s = c("5'10", "6'1")
tab = data.frame(x = s)
separate(tab, x, c("feet", "inches"), sep="'")
feet inches
1 5 10
2 6 1
extract(tab, x, c("feet", "inches"), regex="(\\d)'(\\d{1,2})")
feet inches
1 5 10
2 6 1
s = c("5'10", "6'1\"","5'8inches")
tab = data.frame(x = s)
separate(tab, x, c("feet", "inches"), sep="'")
feet inches
1 5 10
2 6 1"
3 5 8inches
extract(tab, x, c("feet", "inches"), regex="(\\d)'(\\d{1,2})")
feet inches
1 5 10
2 6 1
3 5 8
Q1: If you use the extract code from our video, the decimal point is dropped. What modification of the code would allow you to put the decimals in a third column called “decimal”?
library(tidyr)
s <- c("5'10", "6'1\"", "5'8inches", "5'7.5")
tab <- data.frame(x = s)
rx = c("(\\d)'(\\d{1,2})(\\.)?",
"(\\d)'(\\d{1,2})(\\.\\d+)",
"(\\d)'(\\d{1,2})\\.\\d+?",
"(\\d)'(\\d{1,2})(\\.\\d+)?")
extract(tab, x, into=c("feet", "inches", "decimal"), regex=rx[4])
feet inches decimal
1 5 10 <NA>
2 6 1 <NA>
3 5 8 <NA>
4 5 7 .5
filename = system.file("extdata/murders.csv", package="dslabs")
lines = readLines(filename)
head(lines)
[1] "state,abb,region,population,total"
[2] "Alabama,AL,South,4779736,135"
[3] "Alaska,AK,West,710231,19"
[4] "Arizona,AZ,West,6392017,232"
[5] "Arkansas,AR,South,2915918,93"
[6] "California,CA,West,37253956,1257"
# Split every line on commas; simplify = TRUE returns a character matrix
# instead of a list. TRUE is written out because T can be reassigned.
x <- str_split(lines, ",", simplify = TRUE)
head(x)
[,1] [,2] [,3] [,4] [,5]
[1,] "state" "abb" "region" "population" "total"
[2,] "Alabama" "AL" "South" "4779736" "135"
[3,] "Alaska" "AK" "West" "710231" "19"
[4,] "Arizona" "AZ" "West" "6392017" "232"
[5,] "Arkansas" "AR" "South" "2915918" "93"
[6,] "California" "CA" "West" "37253956" "1257"
as.data.frame(x[-1,]) %>%
setNames(x[1,]) %>%
mutate_all(parse_guess) %>%
head(10)
state abb region population total
1 Alabama AL South 4779736 135
2 Alaska AK West 710231 19
3 Arizona AZ West 6392017 232
4 Arkansas AR South 2915918 93
5 California CA West 37253956 1257
6 Colorado CO West 5029196 65
7 Connecticut CT Northeast 3574097 97
8 Delaware DE South 897934 38
9 District of Columbia DC South 601723 99
10 Florida FL South 19687653 669
Q1: You have the following table
schedule = data.frame(
day = c("Monday", "Tuesday"),
staff = c("Mandy, Chris and Laura", "Steve, Ruth and Frank"))
schedule
day staff
1 Monday Mandy, Chris and Laura
2 Tuesday Steve, Ruth and Frank
Which two commands would properly split the text in the “Staff” column into each individual name? Check all that apply.
# Try each candidate separator on the staff column and return one split
# matrix per pattern. TRUE is written out because T can be reassigned.
lapply(
  c(",|and", ", | and ", ",\\s|\\sand\\s", "\\s?(,|and)\\s?"),
  function(r) str_split(schedule$staff, r, simplify = TRUE)
)
[[1]]
[,1] [,2] [,3] [,4]
[1,] "M" "y" " Chris " " Laura"
[2,] "Steve" " Ruth " " Frank" ""
[[2]]
[,1] [,2] [,3]
[1,] "Mandy" "Chris" "Laura"
[2,] "Steve" "Ruth" "Frank"
[[3]]
[,1] [,2] [,3]
[1,] "Mandy" "Chris" "Laura"
[2,] "Steve" "Ruth" "Frank"
[[4]]
[,1] [,2] [,3] [,4]
[1,] "M" "y" "Chris" "Laura"
[2,] "Steve" "Ruth" "Frank" ""
Q2: What code would successfully turn your “Schedule” table into the following tidy table
# str_split() turns staff into a list-column; unnest() then expands it to one
# row per name. The list-column is named explicitly: a bare unnest() warns in
# tidyr >= 1.0.0 that `cols` is required, while unnest(staff) works in both
# older and newer tidyr.
schedule %>%
  mutate(staff = str_split(staff, ", | and ")) %>%
  unnest(staff)
day staff
1 Monday Mandy
2 Monday Chris
3 Monday Laura
4 Tuesday Steve
5 Tuesday Ruth
6 Tuesday Frank
# split出來變成一個list,再用unnest
library(ggplot2)
data("gapminder")
gapminder %>% filter(region == "Caribbean") %>%
ggplot(aes(year, life_expectancy, color=country)) +
geom_line()
gapminder %>% filter(region == "Caribbean") %>%
filter(str_length(country) >= 12) %>%
distinct(country)
country
1 Antigua and Barbuda
2 Dominican Republic
3 St. Vincent and the Grenadines
4 Trinidad and Tobago
gapminder %>% filter(region == "Caribbean") %>%
mutate(country = recode(
country,
`Antigua and Barbuda` = "Barbuda",
`Dominican Republic` = "DR",
`St. Vincent and the Grenadines` = "St. Vincent",
`Trinidad and Tobago` = "Trinidad"
)) %>%
ggplot(aes(year, life_expectancy, color=country)) +
geom_line()
Q1: Using the gapminder data, you want to recode countries longer than 12 letters in the region Middle Africa to their abbreviations in a new column, country_short. Which code would accomplish this?
library(dslabs)
data(gapminder)
gapminder %>% filter(region == "Middle Africa") %>%
filter(nchar(as.character(country)) >= 12) %>%
select(region, country) %>% distinct() %>%
mutate(country_short = recode(country,
"Central African Republic" = "CAR",
"Congo, Dem. Rep." = "DRC",
"Equatorial Guinea" = "Eq. Guinea"
) )
region country country_short
1 Middle Africa Central African Republic CAR
2 Middle Africa Congo, Dem. Rep. DRC
3 Middle Africa Equatorial Guinea Eq. Guinea
Q1: Which of the following is the standard ISO 8601 format for dates?
Q2: Which of the following commands could convert this string into the correct date format?
library(lubridate)
dates <- c("09-01-02", "01-12-07", "02-03-04")
ymd(dates)
[1] "2009-01-02" "2001-12-07" "2002-03-04"
mdy(dates)
[1] "2002-09-01" "2007-01-12" "2004-02-03"
dmy(dates)
[1] "2002-01-09" "2007-12-01" "2004-03-02"