[Video]
# ---- Quotes, escape sequences, and print() vs writeLines() ----
# Define line1 (no quotes inside, so plain double quotes work)
line1 <- "The table was a large one, but the three were all crowded together at one corner of it:"
# Define line2 (wrapped in single quotes so the inner double quotes need no escaping)
line2 <- '"No room! No room!" they cried out when they saw Alice coming.'
# Define line3 (contains both ' and ", so the double quotes are escaped as \")
line3 <- "\"There's plenty of room!\" said Alice indignantly, and she sat down in a large arm-chair at one end of the table."
# Putting lines in a vector
lines <- c(line1, line2, line3)
# Print lines: printing shows the escaped form (note the \" in the output)
lines
## [1] "The table was a large one, but the three were all crowded together at one corner of it:"
## [2] "\"No room! No room!\" they cried out when they saw Alice coming."
## [3] "\"There's plenty of room!\" said Alice indignantly, and she sat down in a large arm-chair at one end of the table."
# Use writeLines() on lines: shows the strings as written, one element per line
writeLines(lines)
## The table was a large one, but the three were all crowded together at one corner of it:
## "No room! No room!" they cried out when they saw Alice coming.
## "There's plenty of room!" said Alice indignantly, and she sat down in a large arm-chair at one end of the table.
# Write lines with a space separator (all elements end up on a single line)
writeLines(lines, sep = " ")
## The table was a large one, but the three were all crowded together at one corner of it: "No room! No room!" they cried out when they saw Alice coming. "There's plenty of room!" said Alice indignantly, and she sat down in a large arm-chair at one end of the table.
# Use writeLines() on the string "hello\n\U1F30D"
# (\n is a newline; \U1F30D is a Unicode escape that renders as the globe below)
writeLines("hello\n\U1F30D")
## hello
## 🌍
# Should display: To have a \ you need \\
# (each literal backslash has to be written as \\ in the source string)
writeLines("To have a \\ you need \\\\")
## To have a \ you need \\
# Should display:
# This is a really
# really really
# long string
writeLines("This is a really \nreally really \nlong string")
## This is a really
## really really
## long string
# Use writeLines() with
# "\u0928\u092e\u0938\u094d\u0924\u0947 \u0926\u0941\u0928\u093f\u092f\u093e"
# (a sequence of \u Unicode escapes; it renders as the text shown below)
writeLines("\u0928\u092e\u0938\u094d\u0924\u0947 \u0926\u0941\u0928\u093f\u092f\u093e")
## नमस्ते दुनिया
[Video]
# ---- Turning numbers into strings: format() and formatC() ----
# Some vectors of numbers
percent_change <- c(4, -1.91, 3.00, -5.002)
income <- c(72.19, 1030.18, 10291.93, 1189192.18)
p_values <- c(0.12, 0.98, 0.0000191, 0.00000000002)
# Format c(0.0011, 0.011, 1) with digits = 1
# (every element is printed with the same number of decimals)
format(c(0.0011, 0.011, 1), digits = 1)
## [1] "0.001" "0.011" "1.000"
# Format c(1.0011, 2.011, 1) with digits = 1
format(c(1.0011, 2.011, 1), digits = 1)
## [1] "1" "2" "1"
# Format percent_change to one place after the decimal point
format(percent_change, digits = 2)
## [1] " 4.0" "-1.9" " 3.0" "-5.0"
# Format income to whole numbers
# (results are left-padded with spaces to a common width)
format(income, digits = 2)
## [1] "     72" "   1030" "  10292" "1189192"
# Format p_values in fixed format
format(p_values, scientific = FALSE)
## [1] "0.12000000000" "0.98000000000" "0.00001910000" "0.00000000002"
formatted_income <- format(income, digits = 2)
# Print formatted_income
formatted_income
## [1] "     72" "   1030" "  10292" "1189192"
# Call writeLines() on the formatted income (the padding right-aligns the numbers)
writeLines(formatted_income)
##      72
##    1030
##   10292
## 1189192
# Define trimmed_income (trim = TRUE drops the leading padding)
trimmed_income <- format(income, digits = 2, trim = TRUE)
# Call writeLines() on the trimmed_income
writeLines(trimmed_income)
## 72
## 1030
## 10292
## 1189192
# Define pretty_income (big.mark inserts a thousands separator)
pretty_income <- format(income, digits = 2, big.mark = ",")
# Call writeLines() on the pretty_income
writeLines(pretty_income)
##        72
##     1,030
##    10,292
## 1,189,192
# From the format() exercise
x <- c(0.0011, 0.011, 1)
y <- c(1.0011, 2.011, 1)
# formatC() on x with format = "f", digits = 1
# (here digits is the number of places after the decimal point)
formatC(x, format = "f", digits = 1)
## [1] "0.0" "0.0" "1.0"
# formatC() on y with format = "f", digits = 1
formatC(y, format = "f", digits = 1)
## [1] "1.0" "2.0" "1.0"
# Format percent_change to one place after the decimal point
formatC(percent_change, format = "f", digits = 1)
## [1] "4.0" "-1.9" "3.0" "-5.0"
# percent_change with flag = "+" (always show the sign)
formatC(percent_change, format = "f", digits = 1, flag = "+")
## [1] "+4.0" "-1.9" "+3.0" "-5.0"
# Format p_values using format = "g" and digits = 2
# (small values switch to scientific notation, as the output shows)
formatC(p_values, format = "g", digits = 2)
## [1] "0.12" "0.98" "1.9e-05" "2e-11"
[Video]
# ---- Combining strings with paste() ----
# Add $ to pretty_income (pretty_income is padded, so the padding follows the $)
paste("$", pretty_income, sep = "")
## [1] "$       72" "$    1,030" "$   10,292" "$1,189,192"
# pretty_percent and years are used below but are not defined anywhere in this
# file (presumably they were preloaded by the course workspace). They are
# rebuilt here from the results printed below so the snippet runs on its own:
# the "+4.0" "-1.9" "+3.0" "-5.0" values match formatC() with flag = "+".
pretty_percent <- formatC(percent_change, format = "f", digits = 1, flag = "+")
years <- 2010:2013
# Add % to pretty_percent
paste(pretty_percent, "%", sep = "")
## [1] "+4.0%" "-1.9%" "+3.0%" "-5.0%"
# Create vector with elements like 2010: +4.0%
year_percent <- paste(years, ": ", pretty_percent, "%", sep = "")
# Collapse all years into single string
paste(year_percent, collapse = ", ")
## [1] "2010: +4.0%, 2011: -1.9%, 2012: +3.0%, 2013: -5.0%"
# Build a small right-aligned two-column table: label, then dollar amount
# Row labels
income_names <- c("Year 0", "Year 1", "Year 2", "Project Lifetime")
# Amounts as whole numbers with thousands separators (padded to a common width)
pretty_income <- format(income, big.mark = ",", digits = 2)
# Prefix each amount with a dollar sign
dollar_income <- paste0("$", pretty_income)
# Right-justify the labels so they all have the same width
formatted_names <- format(income_names, justify = "right")
# One row per label/amount pair, separated by a single space
rows <- paste(formatted_names, dollar_income)
# Write rows
writeLines(rows)
##           Year 0 $       72
##           Year 1 $    1,030
##           Year 2 $   10,292
## Project Lifetime $1,189,192
# Build a pizza order sentence from three randomly chosen toppings.
# `toppings` is not defined anywhere in this file (presumably preloaded by the
# course workspace). Fall back to a small stand-in menu so the example runs.
# NOTE(review): replace the stand-in with the real topping list if available.
if (!exists("toppings")) {
  toppings <- c(
    "anchovies", "bacon", "cheese", "ham", "mushrooms",
    "olives", "onions", "pepperoni", "pineapple", "sun-dried tomato"
  )
}
# Randomly sample 3 toppings
my_toppings <- sample(toppings, size = 3)
# Print my_toppings
# (sample() is random: the output below is one recorded draw, not a fixed result)
my_toppings
## [1] "onions" "sun-dried tomato" "ham"
# Paste "and " to last element: my_toppings_and
# (the prefix vector is hard-coded for exactly three toppings)
my_toppings_and <- paste(c("", "", "and "), my_toppings, sep = "")
# Collapse with comma space: these_toppings
these_toppings <- paste(my_toppings_and, collapse = ", ")
# Add rest of sentence: my_order
my_order <- paste("I want to order a pizza with ", these_toppings, ".", sep = "")
# Order pizza with writeLines()
writeLines(my_order)
## I want to order a pizza with onions, sun-dried tomato, and ham.
[Video]
library(stringr)
# Compare how paste() and str_c() treat missing values
my_toppings <- c("cheese", NA, NA)
# paste() turns NA into the literal text "NA"
my_toppings_and <- paste(c("", "", "and "), my_toppings, sep = "")
# Print my_toppings_and
my_toppings_and
## [1] "cheese" "NA" "and NA"
# Use str_c() instead of paste(): my_toppings_str
# (str_c() keeps a missing input missing instead of pasting "NA")
my_toppings_str <- str_c(c("", "", "and "), my_toppings)
# Print my_toppings_str
my_toppings_str
## [1] "cheese" NA NA
# paste() my_toppings_and with collapse = ", "
paste(my_toppings_and, collapse = ", ")
## [1] "cheese, NA, and NA"
# str_c() my_toppings_str with collapse = ", "
# (a single NA element makes the whole collapsed result NA)
str_c(my_toppings_str, collapse = ", ")
## [1] NA
library(stringr)
library(babynames)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep the 2014 records, then pull out the name column separately by sex
babynames_2014 <- babynames %>% filter(year == 2014)
boy_names <- (babynames_2014 %>% filter(sex == "M"))[["name"]]
girl_names <- (babynames_2014 %>% filter(sex == "F"))[["name"]]
# Peek at the first few boy_names
head(boy_names)
## [1] "Noah" "Liam" "Mason" "Jacob" "William" "Ethan"
# Number of characters in each boy name
boy_length <- boy_names %>% str_length()
# Peek at the first few lengths
head(boy_length)
## [1] 4 4 5 5 7 5
# Number of characters in each girl name
girl_length <- girl_names %>% str_length()
# Difference in mean name length (girls minus boys)
mean(girl_length) - mean(boy_length)
## [1] 0.3374758
# str_length() gives the same answers when the names are a factor
boy_names %>% factor() %>% str_length() %>% head()
## [1] 4 4 5 5 7 5
# First letter of every boy name
boy_first_letter <- str_sub(boy_names, start = 1, end = 1)
# Frequency of each first letter
table(boy_first_letter)
## boy_first_letter
## A B C D E F G H I J K L M N O P
## 1454 651 770 998 549 185 334 403 235 1390 1291 537 914 424 207 230
## Q R S T U V W X Y Z
## 56 778 806 771 43 160 174 56 252 379
# Last letter of every boy name (negative positions count from the end), tabulated
boy_last_letter <- str_sub(boy_names, start = -1, end = -1)
table(boy_last_letter)
## boy_last_letter
## a b c d e f g h i j k l m n o p
## 421 104 92 436 1148 66 82 583 705 57 349 945 389 4672 730 32
## q r s t u v w x y z
## 19 1011 826 292 81 71 34 86 697 119
# First letter of every girl name, tabulated
girl_first_letter <- str_sub(girl_names, start = 1, end = 1)
table(girl_first_letter)
## girl_first_letter
## A B C D E F G H I J K L M N O P
## 3101 699 946 810 933 209 345 469 373 1430 1694 1122 1746 752 143 303
## Q R S T U V W X Y Z
## 38 831 1369 683 28 214 85 62 294 502
# Last letter of every girl name, tabulated
girl_last_letter <- str_sub(girl_names, start = -1, end = -1)
table(girl_last_letter)
## girl_last_letter
## a b c d e f g h i j k l m n o p
## 6632 20 13 81 3114 8 21 1942 1581 12 31 450 115 2608 105 3
## q r s t u v w x y z
## 2 291 326 208 59 6 17 50 1435 51
[Video]
# ---- Detecting matches with str_detect() ----
# Look for pattern "zz" in boy_names (fixed() matches the literal text)
contains_zz <- str_detect(boy_names, fixed("zz"))
# Examine str() of contains_zz: one TRUE/FALSE per name
str(contains_zz)
## logi [1:14047] FALSE FALSE FALSE FALSE FALSE FALSE ...
# How many names contain "zz"? (TRUE counts as 1 when summed)
sum(contains_zz)
## [1] 16
# Which names contain "zz"?
boy_names[contains_zz]
## [1] "Uzziah" "Ozzie" "Ozzy" "Jazz" "Uzziel" "Chazz"
## [7] "Izzy" "Azzam" "Izzac" "Izzak" "Fabrizzio" "Jazziel"
## [13] "Azzan" "Izzaiah" "Muizz" "Yazziel"
# boy_df is used below but is not created anywhere in this file (presumably it
# was preloaded by the course workspace). If it is missing, rebuild it as a
# plain data frame of the 2014 boys' records so its rows line up with boy_names.
# NOTE(review): reconstructed from the printed output below (columns
# year/sex/name/n/prop, row labels equal to positions in boy_names) — confirm.
if (!exists("boy_df")) {
  boy_df <- as.data.frame(filter(babynames_2014, sex == "M"))
}
# Which rows in boy_df have names that contain "zz"?
boy_df[contains_zz, ]
## year sex name n prop
## 2091 2014 M Uzziah 67 3.28e-05
## 2191 2014 M Ozzie 62 3.03e-05
## 2312 2014 M Ozzy 57 2.79e-05
## 4601 2014 M Jazz 21 1.03e-05
## 4676 2014 M Uzziel 21 1.03e-05
## 5280 2014 M Chazz 17 8.32e-06
## 5558 2014 M Izzy 16 7.83e-06
## 6009 2014 M Azzam 14 6.85e-06
## 6425 2014 M Izzac 13 6.36e-06
## 9095 2014 M Izzak 8 3.91e-06
## 9950 2014 M Fabrizzio 7 3.42e-06
## 11316 2014 M Jazziel 6 2.93e-06
## 12270 2014 M Azzan 5 2.45e-06
## 12780 2014 M Izzaiah 5 2.45e-06
## 13385 2014 M Muizz 5 2.45e-06
## 13966 2014 M Yazziel 5 2.45e-06
# Boy names containing "zz", returned directly as the matching strings
boy_names %>% str_subset(pattern = fixed("zz"))
## [1] "Uzziah" "Ozzie" "Ozzy" "Jazz" "Uzziel" "Chazz"
## [7] "Izzy" "Azzam" "Izzac" "Izzak" "Fabrizzio" "Jazziel"
## [13] "Azzan" "Izzaiah" "Muizz" "Yazziel"
# Girl names containing "zz"
girl_names %>% str_subset(pattern = fixed("zz"))
## [1] "Izzabella" "Jazzlyn" "Jazzlynn" "Lizzie" "Izzy"
## [6] "Lizzy" "Mazzy" "Izzabelle" "Jazzmine" "Jazzmyn"
## [11] "Jazzelle" "Jazzmin" "Izzah" "Jazzalyn" "Jazzmyne"
## [16] "Izzabell" "Jazz" "Mazzie" "Alyzza" "Izza"
## [21] "Izzie" "Jazzlene" "Lizzeth" "Jazzalynn" "Jazzy"
## [26] "Alizzon" "Elizzabeth" "Jazzilyn" "Jazzlynne" "Jizzelle"
## [31] "Izzabel" "Izzabellah" "Izzibella" "Jazzabella" "Jazzabelle"
## [36] "Jazzel" "Jazzie" "Jazzlin" "Jazzlyne" "Aizza"
## [41] "Brizza" "Ezzah" "Fizza" "Izzybella" "Rozzlyn"
# Girl names containing an uppercase "U" (the match is case-sensitive)
starts_U <- girl_names %>% str_subset(pattern = fixed("U"))
starts_U
## [1] "Unique" "Uma" "Unknown" "Una" "Uriah" "Ursula" "Unity"
## [8] "Umaiza" "Urvi" "Ulyana" "Ula" "Udy" "Urwa" "Ulani"
## [15] "Umaima" "Umme" "Ugochi" "Ulyssa" "Umika" "Uriyah" "Ubah"
## [22] "Umaira" "Umi" "Ume" "Urenna" "Uriel" "Urijah" "Uyen"
# Narrow those down to names that also contain "z"
starts_U %>% str_subset(pattern = fixed("z"))
## [1] "Umaiza"
# Per-name count of lowercase "a"
number_as <- str_count(string = girl_names, pattern = fixed("a"))
# Per-name count of uppercase "A"
number_As <- str_count(string = girl_names, pattern = fixed("A"))
# Histograms of number_as and number_As
hist(number_as)
hist(number_As)
# Total a's per name, regardless of case
total_as <- number_As + number_as
# girl_names with more than 4 a's
girl_names[total_as > 4]
## [1] "Aaradhana"
[Video]
# ---- Splitting strings with str_split() ----
# Some date data
date_ranges <- c("23.01.2017 - 29.01.2017", "30.01.2017 - 06.02.2017")
# Split dates using " - " (returns a list with one character vector per input)
split_dates <- str_split(date_ranges, fixed(" - "))
split_dates
## [[1]]
## [1] "23.01.2017" "29.01.2017"
##
## [[2]]
## [1] "30.01.2017" "06.02.2017"
# Split dates with n and simplify specified
# (simplify = TRUE returns a character matrix: one row per input, n columns)
split_dates_n <- str_split(date_ranges, fixed(" - "), n = 2, simplify = TRUE)
split_dates_n
## [,1] [,2]
## [1,] "23.01.2017" "29.01.2017"
## [2,] "30.01.2017" "06.02.2017"
# Subset split_dates_n into start_dates and end_dates
# (column 1 holds the start of each range, column 2 the end)
start_dates <- split_dates_n[, 1]
end_dates <- split_dates_n[, 2]
# Split start_dates into day, month and year pieces
str_split(start_dates, fixed("."), n = 3, simplify = TRUE)
## [,1] [,2] [,3]
## [1,] "23" "01" "2017"
## [2,] "30" "01" "2017"
# Names written as "Last, First"
both_names <- c("Box, George", "Cox, David")
# Split on ", " into a two-column matrix: last name, then first name
both_names_split <- both_names %>%
  str_split(pattern = fixed(", "), n = 2, simplify = TRUE)
# First names are in the second column
first_names <- both_names_split[, 2]
# Last names are in the first column
last_names <- both_names_split[, 1]
# Break each line into words at single spaces
words <- lines %>% str_split(pattern = " ")
# Word count for each line
words %>% lapply(length)
## [[1]]
## [1] 18
##
## [[2]]
## [1] 12
##
## [[3]]
## [1] 21
# Character count of every word, line by line
word_lengths <- words %>% lapply(str_length)
# Mean word length for each line
word_lengths %>% lapply(mean)
## [[1]]
## [1] 3.888889
##
## [[2]]
## [1] 4.25
##
## [[3]]
## [1] 4.380952
[Video]
# ---- Replacing matches with str_replace() / str_replace_all() ----
# Some IDs
ids <- c("ID#: 192", "ID#: 118", "ID#: 001")
# Strip the literal "ID#: " prefix by replacing it with an empty string
id_nums <- ids %>% str_replace(pattern = fixed("ID#: "), replacement = "")
# Convert the remaining digit strings to numbers
id_ints <- as.numeric(id_nums)
# Some (fake) phone numbers
phone_numbers <- c("510-555-0123", "541-555-0167")
# str_replace() only changes the first "-" in each string
phone_numbers %>% str_replace(pattern = fixed("-"), replacement = " ")
## [1] "510 555-0123" "541 555-0167"
# str_replace_all() changes every "-"
phone_numbers %>% str_replace_all(pattern = fixed("-"), replacement = " ")
## [1] "510 555 0123" "541 555 0167"
# Turn phone numbers into the format xxx.xxx.xxxx
phone_numbers %>% str_replace_all(pattern = fixed("-"), replacement = ".")
## [1] "510.555.0123" "541.555.0167"
# NOTE(review): `genes` is not defined anywhere in this file — presumably a
# character vector of DNA sequences preloaded by the course workspace; confirm.
# Length (number of nucleotides) of each sequence
genes %>% str_length()
## [1] 441 462 993
# Count how many A's occur in each sequence
genes %>% str_count(pattern = fixed("A"))
## [1] 118 117 267
# Keep only the sequences that contain "TTTTTT"
genes %>% str_subset(pattern = fixed("TTTTTT"))
## [1] "TTAAGGAACGATCGTACGCATGATAGGGTTTTGCAGTGATATTAGTGTCTCGGTTGACTGGATCTCATCAATAGTCTGGATTTTGTTGATAAGTACCTGCTGCAATGCATCAATGGATTTACACATCACTTTAATAAATATGCTGTAGTGGCCAGTGGTGTAATAGGCCTCAACCACTTCTTCTAAGCTTTCCAATTTTTTCAAGGCGGAAGGGTAATCTTTGGCACTTTTCAAGATTATGCCAATAAAGCAGCAAACGTCGTAACCCAGTTGTTTTGGGTTAACGTGTACACAAGCTGCGGTAATGATCCCTGCTTGCCGCATCTTTTCTACTCTTACATGAATAGTTCCGGGGCTAACAGCGAGGTTTTTGGCTAATTCAGCATAGGGTGTGCGTGCATTTTCCATTAATGCTTTCAGGATGCTGCGATCGAGATTATCGATCTGATAAATTTCACTCAT"
# Mask every "A" in each sequence with an underscore
genes %>% str_replace_all(pattern = fixed("A"), replacement = "_")
## [1] "TT_G_GT___TT__TCC__TCTTTG_CCC___TCTCTGCTGG_TCCTCTGGT_TTTC_TGTTGG_TG_CGTC__TTTCT__T_TTTC_CCC__CCGTTG_GC_CCTTGTGCG_TC__TTGTTG_TCC_GTTTT_TG_TTGC_CCGC_G___GTGTC_T_TTCTG_GCTGCCT___CC__CCGCCCC___GCGT_CTTGGG_T___TC_GGCTTTTGTTGTTCG_TCTGTTCT__T__TGGCTGC__GTT_TC_GGT_G_TCCCCGGC_CC_TG_GTGG_TGTC_CG_TT__CC_C_GGCC_TTC_GCGT__GTTCGTCC__CTCTGGGCC_TG__GT_TTTCTGT_G____CCC_GCTTCTTCT__TTT_TCCGCT___TGTTC_GC__C_T_TTC_GC_CT_CC__GCGT_CTGCC_CTT_TC__CGTT_TGTC_GCC_T"
## [2] "TT__GG__CG_TCGT_CGC_TG_T_GGGTTTTGC_GTG_T_TT_GTGTCTCGGTTG_CTGG_TCTC_TC__T_GTCTGG_TTTTGTTG_T__GT_CCTGCTGC__TGC_TC__TGG_TTT_C_C_TC_CTTT__T___T_TGCTGT_GTGGCC_GTGGTGT__T_GGCCTC__CC_CTTCTTCT__GCTTTCC__TTTTTTC__GGCGG__GGGT__TCTTTGGC_CTTTTC__G_TT_TGCC__T___GC_GC___CGTCGT__CCC_GTTGTTTTGGGTT__CGTGT_C_C__GCTGCGGT__TG_TCCCTGCTTGCCGC_TCTTTTCT_CTCTT_C_TG__T_GTTCCGGGGCT__C_GCG_GGTTTTTGGCT__TTC_GC_T_GGGTGTGCGTGC_TTTTCC_TT__TGCTTTC_GG_TGCTGCG_TCG_G_TT_TCG_TCTG_T___TTTC_CTC_T"
## [3] "_TG______C__TTT_TCC_____C__C__C___TC_GCTTCGT____TC_TTCTTTTCCCGCC__TT_G_GC__C__CTTGGCTTG_TCG__GTCC_GGCTCCT_TTTTG_GCCGTGTGGGTG_TGG__CCC__G_T__CCTTTCTGGTTCTG_G___GCGGT_C_GGT____GTT__GTC_TTGCCGG_TTC__CTTTTG__GTTGT_C_TTC_TT_GCG__GTGG___CGT____CCTT_GGGCGTTTTG_TTTTGGTGCTG_CC__GGGGTGT_T_CCC_T_TG___GC_TTGCGCCC_G_TG__G_TCGCCTG_GTGCT_TTC_TTCTGT_T_TGT_G_TC_GTGGG_TTGGG__CGGGTT_TGGGGG_CGGTG__CGT__CCTGGCTT_CCTG___TCG_CTGTT__C__G_TTT_TGC_GCG_TT___G___CTG__GCGGCG_TC_GTGCTG_GTTTGGTGTG__GCCTTTCCTGCCGG_TC_T_TTC_GTTT_TCC_C_GTG___GCCTGCGGGCC_G_TTCCCTG_TTT_G_TGCT___GGCCGTG__CGTGC__TTGCC___G_GTT_GGTGCTGTCTTCCTT_T_GGG_TTGGTGGC___TTGGC_G_TGGTC__TCCC_TG_TGTTCGTGCGCC_G_TT_TG_TG_TTGG_CCTCTCCG_GTGCGG__GGTTTCTCTGG_TT___CGGCG_C_TT_TTGTCTGG__CCC__T_TTGG__G_TGCCTTTG_G_T_TCTTCT_TGGG__TTCGTGTTG_TGCCG__GCTCTT__GCGTC_GTT_GCCCTG_CTGGCG_TG__G_CCGCTTGG__CTGG__TGGC_TC__TC_CTGTTGCGCGGTG___TGCC_C___CT_TCGGGGG_GGT_TTGGTC_GTCCCGCTT_GTG_TGTT_TTGCTGC_G___C__C_T_TTGGTC_GGTGC__TGTGGTGTTTGGGGCCCTG___TC_GCG_G___GTTG_TGGCCTGCTGT__"
# Abbreviate full names to "F. Lastname"
names <- c("Diana Prince", "Clark Kent")
# Split on the space into a matrix: first name in column 1, last name in column 2
names_split <- names %>% str_split(pattern = fixed(" "), simplify = TRUE)
# Initial of each first name
abb_first <- names_split[, 1] %>% str_sub(start = 1, end = 1)
# Join initial and last name with ". " in between
str_c(abb_first, names_split[, 2], sep = ". ")
## [1] "D. Prince" "C. Kent"
Michael is a hybrid thinker and doer—a byproduct of being a CliftonStrengths “Learner” over time. With 20+ years of engineering, design, and product experience, he helps organizations identify market needs, mobilize internal and external resources, and deliver delightful digital customer experiences that align with business goals. He has been entrusted with problem-solving for brands—ranging from Fortune 500 companies to early-stage startups to not-for-profit organizations.
Michael earned his BS in Computer Science from New York Institute of Technology and his MBA from the University of Maryland, College Park. He is also a candidate to receive his MS in Applied Analytics from Columbia University.
LinkedIn | Twitter | www.michaelmallari.com/data | www.columbia.edu/~mm5470