packages = c(
"dplyr","ggplot2","stringr", "dslabs", "readr", "tidyr", "purrr",
"lubridate", "rvest"
)
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
rm(list=ls(all=T))
Sys.setlocale("LC_ALL","C")
[1] "C"
options(digits=4, scipen=12)
library(rvest)
library(readr)
library(dplyr)
library(ggplot2)
library(stringr)
library(lubridate)
library(tidyr)
library(dslabs)
url = "https://en.wikipedia.org/wiki/Murder_in_the_United_States_by_state"
h = read_html(url)
tab = html_nodes(h, "table")[[2]] %>%
html_table %>%
setNames(c(
"state","population","total","murders","gun_murders",
"gun_ownersjip","total_rate","murder_rate","gun_murder_rate"))
Q1: Which of the following is NOT an application of string parsing?
Q1: Which of the following commands would not give you an error in R?
cat(" LeBron James is 6'8\" ")
LeBron James is 6'8"
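For context, a minimal sketch of the two quoting styles and how each escapes the other quote character (s1 and s2 are illustrative names):
s1 = 'LeBron James is 6\'8"'    # single-quoted string: escape the single quote
s2 = "LeBron James is 6'8\""    # double-quoted string: escape the double quote
identical(s1, s2)               # TRUE: both produce the same string
cat(s1)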
stringr Package
Q1: Which of the following are advantages of the stringr package over string processing functions in base R? Select all that apply.
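A small comparison sketch of base R versus stringr on the same task; stringr takes the strings as the first argument and uses a consistent str_ prefix (x is just an illustrative vector):
x = c("120", "5'8\"", "6,1")
grepl(",", x)        # base R: pattern first, then the strings
str_detect(x, ",")   # stringr: strings first, then the pattern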
sapply(tab, str_detect, ",") %>% colSums
state population total murders gun_murders
0 51 3 2 1
gun_ownership total_rate murder_rate gun_murder_rate
0 0 0 0
tab = tab %>% mutate_at(2:3, parse_number)
sapply(tab, str_detect, ",") %>% colSums
state population total murders gun_murders
0 0 0 2 1
gun_ownership total_rate murder_rate gun_murder_rate
0 0 0 0
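Since mutate_at() is superseded, the same conversion can be written with across(); a sketch of a drop-in replacement for the mutate_at() line above (to be run on the original character columns):
tab %>% mutate(across(2:3, parse_number))   # equivalent to mutate_at(2:3, parse_number)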
Q1: You have a data frame of monthly sales and profits in R:
dat = read.table("data/sales.txt", header=T, sep="", stringsAsFactors=F)
dat
Month Sales Profit
1 January $128,568 $16,234
2 February $109,523 $12,876
3 March $115,468 $17,920
4 April $122,274 $15,825
5 May $117,921 $15,437
Which of the following commands could convert the sales and profits columns to numeric? Select all that apply.
dat %>% mutate_at(2:3, parse_number)
Month Sales Profit
1 January 128568 16234
2 February 109523 12876
3 March 115468 17920
4 April 122274 15825
5 May 117921 15437
dat %>% mutate_at(2:3, funs(str_replace_all(., c("\\$|,"), "")))
Month Sales Profit
1 January 128568 16234
2 February 109523 12876
3 March 115468 17920
4 April 122274 15825
5 May 117921 15437
dat %>% mutate_all(2:3, parse_number)
dat$Profit <- str_replace_all(dat$Profit, c("\\$|,"), "")
dat$Sales <- parse_number(dat$Sales)
dat
Month Sales Profit
1 January 128568 16234
2 February 109523 12876
3 March 115468 17920
4 April 122274 15825
5 May 117921 15437
library(dslabs)
data(reported_heights)
reported_heights %>% head
time_stamp sex height
1 2014-09-02 13:40:36 Male 75
2 2014-09-02 13:46:59 Male 70
3 2014-09-02 13:59:20 Male 68
4 2014-09-02 14:51:53 Male 74
5 2014-09-02 15:16:15 Male 61
6 2014-09-02 15:16:16 Female 65
reported_heights %>%
mutate(new_height = as.numeric(height)) %>%
filter(is.na(new_height)) %>%
getElement("height")
NAs introduced by coercion
[1] "5' 4\"" "165cm" "5'7"
[4] ">9000" "5'7\"" "5'3\""
[7] "5 feet and 8.11 inches" "5'11" "5'9''"
[10] "5'10''" "5,3" "6'"
[13] "6,8" "5' 10" "Five foot eight inches"
[16] "5'5\"" "5'2\"" "5,4"
[19] "5'3" "5'10''" "5'3''"
[22] "5'7''" "5'12" "2'33"
[25] "5'11" "5'3\"" "5,8"
[28] "5'6''" "5'4" "1,70"
[31] "5'7.5''" "5'7.5''" "5'2\""
[34] "5' 7.78\"" "yyy" "5'5"
[37] "5'8" "5'6" "5 feet 7inches"
[40] "6*12" "5 .11" "5 11"
[43] "5'4" "5'8\"" "5'5"
[46] "5'7" "5'6" "5'11\""
[49] "5'7\"" "5'7" "5'8"
[52] "5' 11\"" "6'1\"" "69\""
[55] "5' 7\"" "5'10''" "5'10"
[58] "5'10" "5ft 9 inches" "5 ft 9 inches"
[61] "5'2" "5'11" "5'11''"
[64] "5'8\"" "708,661" "5 feet 6 inches"
[67] "5'10''" "5'8" "6'3\""
[70] "649,606" "728,346" "6 04"
[73] "5'9" "5'5''" "5'7\""
[76] "6'4\"" "5'4" "170 cm"
[79] "7,283,465" "5'6" "5'6"
not_inches <- function(x, smallest = 50, tallest = 84) {
inches <- suppressWarnings(as.numeric(x))
ind <- is.na(inches) | inches < smallest | inches > tallest
ind
}
problems = reported_heights$height %>% .[not_inches(.)]
problems
[1] "6" "5' 4\"" "5.3"
[4] "165cm" "511" "6"
[7] "2" "5'7" ">9000"
[10] "5'7\"" "5'3\"" "5 feet and 8.11 inches"
[13] "5.25" "5'11" "5.5"
[16] "11111" "5'9''" "6"
[19] "6.5" "150" "5'10''"
[22] "103.2" "5.8" "19"
[25] "5" "5.6" "175"
[28] "177" "300" "5,3"
[31] "6'" "6" "5.9"
[34] "6,8" "5' 10" "5.5"
[37] "178" "163" "6.2"
[40] "175" "Five foot eight inches" "6.2"
[43] "5.8" "5.1" "178"
[46] "165" "5.11" "5'5\""
[49] "165" "180" "5'2\""
[52] "5.75" "169" "5,4"
[55] "7" "5.4" "157"
[58] "6.1" "169" "5'3"
[61] "5.6" "214" "183"
[64] "5.6" "6" "162"
[67] "178" "180" "5'10''"
[70] "170" "5'3''" "178"
[73] "0.7" "190" "5.4"
[76] "184" "5'7''" "5.9"
[79] "5'12" "5.6" "5.6"
[82] "184" "6" "167"
[85] "2'33" "5'11" "5'3\""
[88] "5.5" "5.2" "180"
[91] "5.5" "5.5" "6.5"
[94] "5,8" "180" "183"
[97] "170" "5'6''" "172"
[100] "612" "5.11" "168"
[103] "5'4" "1,70" "172"
[106] "87" "5.5" "176"
[109] "5'7.5''" "5'7.5''" "111"
[112] "5'2\"" "173" "174"
[115] "176" "175" "5' 7.78\""
[118] "6.7" "12" "6"
[121] "5.1" "5.6" "5.5"
[124] "yyy" "5.2" "5'5"
[127] "5'8" "5'6" "5 feet 7inches"
[130] "89" "5.6" "5.7"
[133] "183" "172" "34"
[136] "25" "6" "5.9"
[139] "168" "6.5" "170"
[142] "175" "6" "22"
[145] "5.11" "684" "6"
[148] "1" "1" "6*12"
[151] "5 .11" "87" "162"
[154] "165" "184" "6"
[157] "173" "1.6" "172"
[160] "170" "5.7" "5.5"
[163] "174" "170" "160"
[166] "120" "120" "23"
[169] "192" "5 11" "167"
[172] "150" "1.7" "174"
[175] "5.8" "6" "5'4"
[178] "5'8\"" "5'5" "5.8"
[181] "5.1" "5.11" "5.7"
[184] "5'7" "5'6" "5'11\""
[187] "5'7\"" "5'7" "172"
[190] "5'8" "180" "5' 11\""
[193] "5" "180" "180"
[196] "6'1\"" "5.9" "5.2"
[199] "5.5" "69\"" "5' 7\""
[202] "5'10''" "5.51" "5'10"
[205] "5'10" "5ft 9 inches" "5 ft 9 inches"
[208] "5'2" "5'11" "5.8"
[211] "5.7" "167" "168"
[214] "6" "6.1" "5'11''"
[217] "5.69" "178" "182"
[220] "164" "5'8\"" "185"
[223] "6" "86" "5.7"
[226] "708,661" "5.25" "5.5"
[229] "5 feet 6 inches" "5'10''" "172"
[232] "6" "5'8" "160"
[235] "6'3\"" "649,606" "10000"
[238] "5.1" "152" "1"
[241] "180" "728,346" "175"
[244] "158" "173" "164"
[247] "6 04" "169" "0"
[250] "185" "168" "5'9"
[253] "169" "5'5''" "174"
[256] "6.3" "179" "5'7\""
[259] "5.5" "6" "6"
[262] "170" "6" "172"
[265] "158" "100" "159"
[268] "190" "5.7" "170"
[271] "158" "6'4\"" "180"
[274] "5.57" "5'4" "210"
[277] "88" "6" "162"
[280] "170 cm" "5.7" "170"
[283] "157" "186" "170"
[286] "7,283,465" "5" "5"
[289] "34" "161" "5'6"
[292] "5'6"
str_subset(problems, "cm|inches")
[1] "165cm" "5 feet and 8.11 inches" "Five foot eight inches"
[4] "5 feet 7inches" "5ft 9 inches" "5 ft 9 inches"
[7] "5 feet 6 inches" "170 cm"
str_subset(problems, "cm|inches") %>% str_extract("cm|inches")
[1] "cm" "inches" "inches" "inches" "inches" "inches" "inches" "cm"
Q1: In the video, we use the function not_inches to identify heights that were incorrectly entered:
not_inches <- function(x, smallest = 50, tallest = 84) {
inches <- suppressWarnings(as.numeric(x))
ind <- is.na(inches) | inches < smallest | inches > tallest
ind
}
In this function, what TWO types of values are identified as not being correctly formatted in inches?
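A quick sketch of both cases, assuming not_inches() as defined above:
not_inches("5'8\"")   # TRUE: cannot be coerced to a number, so inches is NA
not_inches("120")     # TRUE: numeric, but outside the 50-84 inch range
not_inches("70")      # FALSE: a plausible height in inches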
Q2: Which of the following arguments, when passed to the function not_inches, would return the vector c(FALSE)?
c(70) %>% not_inches
[1] FALSE
Q3: Our function not_inches returns the object ind. Which answer correctly describes ind?
ind is a logical vector of TRUE and FALSE, equal in length to the vector x (in the arguments list). TRUE indicates that a height entry is incorrectly formatted.
Q1: Given the following code
s = c("70" ,"5 ft", "4'11", "", ".", "Six feet"); s
[1] "70" "5 ft" "4'11" "" "." "Six feet"
What pattern vector yields the following result?
pattern = "\\d|ft"
str_subset(s, pattern)
[1] "70" "5 ft" "4'11"
Character Classes - []
yes = as.character(4:7)
no = as.character(1:3)
str_detect(c(yes,no), "[4-7]")
[1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE
Anchors - ^ and $
yes = c("1","5","9")
no = c("12","123"," 1","a4","b")
str_detect(c(yes,no), "^\\d$")
[1] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
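Dropping the anchors shows why they matter: "\\d" alone also matches strings that merely contain a digit somewhere (a quick check):
str_detect(c(yes,no), "\\d")   # TRUE for everything except "b"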
Quantifiers - {}
yes = c("1","5","9","12")
no = c("123","a4","b")
str_detect(c(yes,no), "^\\d{1,2}$")
[1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE
Pattern of Feet & Inches
pattern = "^[4-7]'\\d{1,2}\"$"
yes = c("5'7\"", "6'2\"", "5'12\"")
no = c("6,2\"", "6.2\"", "I am 5'11\"", "3'2\"", "64")
str_detect(c(yes,no), pattern)
[1] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
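When building a pattern like this, str_view() highlights where the regex matches, which helps with debugging (a sketch; the display depends on the stringr version):
str_view(c(yes,no), pattern)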
Q1: You enter the following set of commands into your R console. What is your printed result?
animals <- c("cat", "puppy", "Moose", "MONKEY")
pattern <- "[a-z]"
str_detect(animals, pattern)
[1] TRUE TRUE TRUE FALSE
Q2: You enter the following set of commands into your R console. What is your printed result?
animals <- c("cat", "puppy", "Moose", "MONKEY")
pattern <- "[A-Z]$"
str_detect(animals, pattern)
[1] FALSE FALSE FALSE TRUE
Q3: You enter the following set of commands into your R console. What is your printed result?
animals <- c("cat", "puppy", "Moose", "MONKEY")
pattern <- "[a-z]{4,5}"
str_detect(animals, pattern)
[1] FALSE TRUE TRUE FALSE
Initial Pattern
pattern = "^[4-7]'\\d{1,2}$"
str_subset(problems, pattern) # 24 matches
[1] "5'7" "5'11" "5'3" "5'12" "5'11" "5'4" "5'5" "5'8" "5'6" "5'4" "5'5"
[12] "5'7" "5'6" "5'7" "5'8" "5'10" "5'10" "5'2" "5'11" "5'8" "5'9" "5'4"
[23] "5'6" "5'6"
Replace Feet and Inches
pattern = "^[4-7]'\\d{1,2}$"
problems %>%
str_replace("feet|ft|foot","'") %>%
str_replace("inches|in|''|\"","") %>%
str_detect(pattern) %>%
sum # 48
[1] 48
More Quantifiers - *, + and ?
* : 0 or more
+ : 1 or more
? : 0 or 1
yes = c("AB","A1B","A11B","A111B","A1111B")
no = c("A2B","A21B")
str_detect(c(yes,no), "A1*B")
[1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE
Space - \\s
pattern = "^[4-7]\\s*'\\s*\\d{1,2}$"
problems %>%
str_replace("feet|ft|foot","'") %>%
str_replace("inches|in|''|\"","") %>%
str_detect(pattern) %>%
sum # 53
[1] 53
Q1: Given the following code, which TWO pattern vectors would yield the following result?
animals <- c("moose", "monkey", "meerkat", "mountain lion")
pattern = c("mo*","mo?","mo+","moo*")
sapply(pattern, function(p) str_detect(animals, p)) %>% t
[,1] [,2] [,3] [,4]
mo* TRUE TRUE TRUE TRUE
mo? TRUE TRUE TRUE TRUE
mo+ TRUE TRUE FALSE TRUE
moo* TRUE TRUE FALSE TRUE
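The reason "mo?" matches "meerkat" while "mo+" does not: ? allows zero "o"s after the "m", whereas + requires at least one (a quick check):
str_detect("meerkat", "mo?")   # TRUE
str_detect("meerkat", "mo+")   # FALSE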
Q2: You are working on some data from different universities. You have the following vector:
schools = c(
"U. Kentucky","Univ New Hampshire","Univ. of Massachusetts",
"University Georgia","U California","California State University"
)
You want to clean this data to match the full names of each university. Which of the following commands could accomplish this?
schools %>%
str_replace("^Univ\\.?\\s|^U\\.?\\s", "University ") %>%
str_replace("^University of |^University ", "University of ")
[1] "University of Kentucky" "University of New Hampshire"
[3] "University of Massachusetts" "University of Georgia"
[5] "University of California" "California State University"
Groups - ()
pattern_no_group = "^[4-7],\\d*$"
pattern_group = "^([4-7]),(\\d*)$"
yes = c("5,9","5,11","6,","6,1")
no = c("5'9",",","2,8","6.1.1")
s = c(yes, no)
Groups do not affect pattern detection
str_detect(s, pattern_no_group)
[1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
str_detect(s, pattern_group)
[1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
The difference between str_match(), str_extract(), str_subset(), and str_detect()
str_match(s, pattern_group)
[,1] [,2] [,3]
[1,] "5,9" "5" "9"
[2,] "5,11" "5" "11"
[3,] "6," "6" ""
[4,] "6,1" "6" "1"
[5,] NA NA NA
[6,] NA NA NA
[7,] NA NA NA
[8,] NA NA NA
str_extract(s, pattern_group)
[1] "5,9" "5,11" "6," "6,1" NA NA NA NA
str_subset(s, pattern_group)
[1] "5,9" "5,11" "6," "6,1"
str_detect(s, pattern_group)
[1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
Replace with Group
pattern = "^([4-7])\\s*[,\\.\\s+]\\s*(\\d*)$"
str_subset(problems, pattern)
[1] "5.3" "5.25" "5.5" "6.5" "5.8" "5.6" "5,3" "5.9" "6,8" "5.5"
[11] "6.2" "6.2" "5.8" "5.1" "5.11" "5.75" "5,4" "5.4" "6.1" "5.6"
[21] "5.6" "5.4" "5.9" "5.6" "5.6" "5.5" "5.2" "5.5" "5.5" "6.5"
[31] "5,8" "5.11" "5.5" "6.7" "5.1" "5.6" "5.5" "5.2" "5.6" "5.7"
[41] "5.9" "6.5" "5.11" "5 .11" "5.7" "5.5" "5 11" "5.8" "5.8" "5.1"
[51] "5.11" "5.7" "5.9" "5.2" "5.5" "5.51" "5.8" "5.7" "6.1" "5.69"
[61] "5.7" "5.25" "5.5" "5.1" "6 04" "6.3" "5.5" "5.7" "5.57" "5.7"
str_subset(problems, pattern) %>%
str_replace(pattern, "\\1'\\2")
[1] "5'3" "5'25" "5'5" "6'5" "5'8" "5'6" "5'3" "5'9" "6'8" "5'5" "6'2"
[12] "6'2" "5'8" "5'1" "5'11" "5'75" "5'4" "5'4" "6'1" "5'6" "5'6" "5'4"
[23] "5'9" "5'6" "5'6" "5'5" "5'2" "5'5" "5'5" "6'5" "5'8" "5'11" "5'5"
[34] "6'7" "5'1" "5'6" "5'5" "5'2" "5'6" "5'7" "5'9" "6'5" "5'11" "5'11"
[45] "5'7" "5'5" "5'11" "5'8" "5'8" "5'1" "5'11" "5'7" "5'9" "5'2" "5'5"
[56] "5'51" "5'8" "5'7" "6'1" "5'69" "5'7" "5'25" "5'5" "5'1" "6'04" "6'3"
[67] "5'5" "5'7" "5'57" "5'7"
Q1: Rather than using the pattern_with_groups vector from the video, you accidentally write in the following code. What is your result?
pattern_w_groups = "^([4-7])\\s*[,\\.\\s+]\\s*(\\d*)$"  # the pattern used in the video, kept here for comparison
problems1 <- c("5.3", "5,5", "6 1", "5 .11", "5, 12")
pattern_with_groups <- "^([4-7])[,\\.](\\d*)$"
str_replace(problems1, pattern_with_groups, "\\1'\\2")
[1] "5'3" "5'5" "6 1" "5 .11" "5, 12"
Q2: You notice your mistake and correct your pattern regex to the following. What is your result?
problems1 <- c("5.3", "5,5", "6 1", "5 .11", "5, 12")
pattern_with_groups <- "^([4-7])[,\\.\\s](\\d*)$"
str_replace(problems1, pattern_with_groups, "\\1'\\2")
[1] "5'3" "5'5" "6'1" "5 .11" "5, 12"
Presumably, what the exercise intends is the following:
problems1 <- c("5.3", "5,5", "6 1", "5 .11", "5, 12")
pattern_with_groups <- "^([4-7])\\s*[,\\.\\s]\\s*(\\d*)$"
str_replace(problems1, pattern_with_groups, "\\1'\\2")
[1] "5'3" "5'5" "6'1" "5'11" "5'12"
converted <- problems %>%
str_replace("feet|foot|ft", "'") %>%
str_replace("inches|in|''|\"", "") %>%
str_replace("^([4-7])\\s*[,\\.\\s+]\\s*(\\d*)$", "\\1'\\2")
pattern <- "^[4-7]\\s*'\\s*\\d{1,2}$"
index <- str_detect(converted, pattern)
mean(index) # 0.42123
[1] 0.4212
converted[!index]
[1] "6" "165cm" "511" "6"
[5] "2" ">9000" "5 ' and 8.11 " "11111"
[9] "6" "150" "103.2" "19"
[13] "5" "175" "177" "300"
[17] "6'" "6" "178" "163"
[21] "175" "Five ' eight " "178" "165"
[25] "165" "180" "169" "7"
[29] "157" "169" "214" "183"
[33] "6" "162" "178" "180"
[37] "170" "178" "0.7" "190"
[41] "184" "184" "6" "167"
[45] "2'33" "180" "180" "183"
[49] "170" "172" "612" "168"
[53] "1,70" "172" "87" "176"
[57] "5'7.5" "5'7.5" "111" "173"
[61] "174" "176" "175" "5' 7.78"
[65] "12" "6" "yyy" "89"
[69] "183" "172" "34" "25"
[73] "6" "168" "170" "175"
[77] "6" "22" "684" "6"
[81] "1" "1" "6*12" "87"
[85] "162" "165" "184" "6"
[89] "173" "1.6" "172" "170"
[93] "174" "170" "160" "120"
[97] "120" "23" "192" "167"
[101] "150" "1.7" "174" "6"
[105] "172" "180" "5" "180"
[109] "180" "69" "5' 9 " "5 ' 9 "
[113] "167" "168" "6" "178"
[117] "182" "164" "185" "6"
[121] "86" "708,661" "5 ' 6 " "172"
[125] "6" "160" "649,606" "10000"
[129] "152" "1" "180" "728,346"
[133] "175" "158" "173" "164"
[137] "169" "0" "185" "168"
[141] "169" "174" "179" "6"
[145] "6" "170" "6" "172"
[149] "158" "100" "159" "190"
[153] "170" "158" "180" "210"
[157] "88" "6" "162" "170 cm"
[161] "170" "157" "186" "170"
[165] "7,283,465" "5" "5" "34"
[169] "161"
Q1: In our example, we use the following code to detect height entries that do not match our pattern of x’y”.
problems1 <- c("5.3", "5,5", "6 1", "5 .11", "5, 12")
converted1 <- problems1 %>%
str_replace("feet|foot|ft", "'") %>%
str_replace("inches|in|''|\"", "") %>%
str_replace("^([4-7])\\s*[,\\.\\s+]\\s*(\\d*)$", "\\1'\\2")
pattern <- "^[4-7]\\s*'\\s*\\d{1,2}$"
index <- str_detect(converted1, pattern)
converted1[!index]
Which answer best describes the differences between the regex string we use as an argument in
str_replace("^([4-7])\\s*[,\\.\\s+]\\s*(\\d*)$", "\\1'\\2")
and the regex string in
pattern <- "^[4-7]\\s*'\\s*\\d{1,2}$"?
Q2: You notice a few entries that are not being properly converted using your str_replace and str_detect code:
yes <- c("5 feet 7inches")
no <- c("5ft 9 inches", "5 ft 9 inches")
s <- c(yes, no)
converted <- s %>%
str_replace("feet|foot|ft", "'") %>%
str_replace("inches|in|''|\"", "") %>%
str_replace("^([4-7])\\s*[,\\.\\s+]\\s*(\\d*)$", "\\1'\\2")
converted
[1] "5 ' 7" "5' 9 " "5 ' 9 "
pattern <- "^[4-7]\\s*'\\s*\\d{1,2}$"
str_detect(converted, pattern)
[1] TRUE FALSE FALSE
It seems like the problem may be due to spaces around the words feet|foot|ft and inches|in. What is another way you could fix this problem?
converted <- s %>%
str_replace("\\s*feet|foot|ft\\s*", "'") %>%
str_replace("\\s*inches|in|''|\"\\s*", "") %>%
str_replace("^([4-7])\\s*[,\\.\\s+]\\s*(\\d*)$", "\\1'\\2")
converted
[1] "5' 7" "5'9" "5 '9"
pattern <- "^[4-7]\\s*'\\s*\\d{1,2}$"
str_detect(converted, pattern)
[1] TRUE TRUE TRUE
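Another option is to strip all whitespace up front with str_replace_all() (a sketch); note this would also collapse entries like "5 11" into "511", so it is not a drop-in fix for the full problems vector:
s %>%
str_replace_all("\\s", "") %>%
str_replace("feet|foot|ft", "'") %>%
str_replace("inches|in|''|\"", "") %>%
str_detect("^[4-7]'\\d{1,2}$")   # expected: TRUE TRUE TRUE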
s = c("5'10", "6'1")
tab = data.frame(x = s)
separate(tab, x, c("feet", "inches"), sep="'")
feet inches
1 5 10
2 6 1
extract(tab, x, c("feet", "inches"), regex="(\\d)'(\\d{1,2})")
feet inches
1 5 10
2 6 1
s = c("5'10", "6'1\"","5'8inches")
tab = data.frame(x = s)
separate(tab, x, c("feet", "inches"), sep="'")
feet inches
1 5 10
2 6 1"
3 5 8inches
extract(tab, x, c("feet", "inches"), regex="(\\d)'(\\d{1,2})")
feet inches
1 5 10
2 6 1
3 5 8
Q1: If you use the extract code from our video, the decimal point is dropped. What modification of the code would allow you to put the decimals in a third column called “decimal”?
library(tidyr)
s <- c("5'10", "6'1\"", "5'8inches", "5'7.5")
tab <- data.frame(x = s)
rx = c("(\\d)'(\\d{1,2})(\\.)?",
"(\\d)'(\\d{1,2})(\\.\\d+)",
"(\\d)'(\\d{1,2})\\.\\d+?",
"(\\d)'(\\d{1,2})(\\.\\d+)?")
extract(tab, x, into=c("feet", "inches", "decimal"), regex=rx[4])
feet inches decimal
1 5 10 <NA>
2 6 1 <NA>
3 5 8 <NA>
4 5 7 .5
filename = system.file("extdata/murders.csv", package="dslabs")
lines = readLines(filename)
head(lines)
[1] "state,abb,region,population,total" "Alabama,AL,South,4779736,135"
[3] "Alaska,AK,West,710231,19" "Arizona,AZ,West,6392017,232"
[5] "Arkansas,AR,South,2915918,93" "California,CA,West,37253956,1257"
x = str_split(lines, ",", simplify=T)
head(x)
[,1] [,2] [,3] [,4] [,5]
[1,] "state" "abb" "region" "population" "total"
[2,] "Alabama" "AL" "South" "4779736" "135"
[3,] "Alaska" "AK" "West" "710231" "19"
[4,] "Arizona" "AZ" "West" "6392017" "232"
[5,] "Arkansas" "AR" "South" "2915918" "93"
[6,] "California" "CA" "West" "37253956" "1257"
as.data.frame(x[-1,]) %>%
setNames(x[1,]) %>%
mutate_all(parse_guess) %>%
head(10)
state abb region population total
1 Alabama AL South 4779736 135
2 Alaska AK West 710231 19
3 Arizona AZ West 6392017 232
4 Arkansas AR South 2915918 93
5 California CA West 37253956 1257
6 Colorado CO West 5029196 65
7 Connecticut CT Northeast 3574097 97
8 Delaware DE South 897934 38
9 District of Columbia DC South 601723 99
10 Florida FL South 19687653 669
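For a well-formed CSV like this one, readr performs all of these steps in a single call (a sketch using the same file; murders_raw is just an illustrative name):
murders_raw = read_csv(filename)
head(murders_raw)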
Q1: You have the following table:
schedule = data.frame(
day = c("Monday", "Tuesday"),
staff = c("Mandy, Chris and Laura", "Steve, Ruth and Frank"))
schedule
day staff
1 Monday Mandy, Chris and Laura
2 Tuesday Steve, Ruth and Frank
Which two commands would properly split the text in the “Staff” column into each individual name? Check all that apply.
lapply(c(",|and", ", | and ", ",\\s|\\sand\\s", "\\s?(,|and)\\s?"),
function(r) str_split(schedule$staff, r, simplify=T))
[[1]]
[,1] [,2] [,3] [,4]
[1,] "M" "y" " Chris " " Laura"
[2,] "Steve" " Ruth " " Frank" ""
[[2]]
[,1] [,2] [,3]
[1,] "Mandy" "Chris" "Laura"
[2,] "Steve" "Ruth" "Frank"
[[3]]
[,1] [,2] [,3]
[1,] "Mandy" "Chris" "Laura"
[2,] "Steve" "Ruth" "Frank"
[[4]]
[,1] [,2] [,3] [,4]
[1,] "M" "y" "Chris" "Laura"
[2,] "Steve" "Ruth" "Frank" ""
Q2: What code would successfully turn your “Schedule” table into the following tidy table?
schedule %>%
mutate(staff = str_split(staff, ", | and ")) %>%
unnest()
day staff
1 Monday Mandy
2 Monday Chris
3 Monday Laura
4 Tuesday Steve
5 Tuesday Ruth
6 Tuesday Frank
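tidyr::separate_rows() reaches the same tidy table in one step (a sketch; note that current tidyr prefers an explicit unnest(cols = staff) in the answer above):
schedule %>% separate_rows(staff, sep = ", | and ")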
library(ggplot2)
data("gapminder")
gapminder %>% filter(region == "Caribbean") %>%
ggplot(aes(year, life_expectancy, color=country)) +
geom_line()
gapminder %>% filter(region == "Caribbean") %>%
filter(str_length(country) >= 12) %>%
distinct(country)
country
1 Antigua and Barbuda
2 Dominican Republic
3 St. Vincent and the Grenadines
4 Trinidad and Tobago
gapminder %>% filter(region == "Caribbean") %>%
mutate(country = recode(
country,
`Antigua and Barbuda` = "Barbuda",
`Dominican Republic` = "DR",
`St. Vincent and the Grenadines` = "St. Vincent",
`Trinidad and Tobago` = "Trinidad"
)) %>%
ggplot(aes(year, life_expectancy, color=country)) +
geom_line()
Q1: Using the gapminder data, you want to recode countries longer than 12 letters in the region Middle Africa to their abbreviations in a new column, country_short. Which code would accomplish this?
library(dslabs)
data(gapminder)
gapminder %>% filter(region == "Middle Africa") %>%
filter(nchar(as.character(country)) >= 12) %>%
select(region, country) %>% distinct() %>%
mutate(country_short = recode(country,
"Central African Republic" = "CAR",
"Congo, Dem. Rep." = "DRC",
"Equatorial Guinea" = "Eq. Guinea"
) )
region country country_short
1 Middle Africa Central African Republic CAR
2 Middle Africa Congo, Dem. Rep. DRC
3 Middle Africa Equatorial Guinea Eq. Guinea
Q1: Which of the following is the standard ISO 8601 format for dates?
Q2: Which of the following commands could convert this string into the correct date format?
library(lubridate)
dates <- c("09-01-02", "01-12-07", "02-03-04")
ymd(dates)
[1] "2009-01-02" "2001-12-07" "2002-03-04"
mdy(dates)
[1] "2002-09-01" "2007-01-12" "2004-02-03"
dmy(dates)
[1] "2002-01-09" "2007-12-01" "2004-03-02"