# 1 Combining Strings Together

# With no `sep`, paste() joins its arguments with a single space.
paste("Hello", "Naimish", "Agarwal")
## [1] "Hello Naimish Agarwal"
# `sep` sets the separator placed between the arguments.
paste("Hello", "Naimish", "Agarwal", sep = "/")
## [1] "Hello/Naimish/Agarwal"
x <- c("Hello", "Hi", "Holla")

y <- c("N1", "N2", "N3")

# Vector arguments are pasted together element by element.
paste(x, y)
## [1] "Hello N1" "Hi N2"    "Holla N3"
# A length-one argument is recycled against every element of the vector.
paste("Hello", y)
## [1] "Hello N1" "Hello N2" "Hello N3"
# Shorter vectors are recycled too: the two-element vector wraps around to
# supply the third result.
paste("Hello", y, c("Good Bye", "Bye"))
## [1] "Hello N1 Good Bye" "Hello N2 Bye"      "Hello N3 Good Bye"
y <- c("Hello", "Naimish", "Agarwal")

# A single vector argument comes back element by element, not joined.
paste(y)
## [1] "Hello"   "Naimish" "Agarwal"
# `collapse` joins the elements of the result into one string.
paste(y, collapse = " ")
## [1] "Hello Naimish Agarwal"
x <- "Naimish"

# Build a sentence around a variable. paste0() concatenates with no separator,
# which is the idiomatic form of paste(..., sep = "").
paste0("Hello ", x, " you are doing awesome! ", "Bye ")
## [1] "Hello Naimish you are doing awesome! Bye "
# sprintf() substitutes x for the %s placeholder, keeping the template in one
# readable string.
sprintf("Hello %s, you are doing awesome!", x)
## [1] "Hello Naimish, you are doing awesome!"

# 2 Extracting Information out of Text

library(XML)

# Page whose third table lists presidents, first ladies and vice presidents
# by year range.
congURL <- "http://www.loc.gov/rr/print/list/057_chron.html"

# Read that table into a data frame, keeping the text columns as character
# vectors rather than factors.
usPresidents <- readHTMLTable(
  doc = congURL,
  which = 3,
  header = TRUE,
  skip.rows = 1,
  as.data.frame = TRUE,
  stringsAsFactors = FALSE
)

head(usPresidents)
##        YEAR         PRESIDENT
## 1 1789-1797 George Washington
## 2 1797-1801        John Adams
## 3 1801-1805  Thomas Jefferson
## 4 1805-1809  Thomas Jefferson
## 5 1809-1812     James Madison
## 6 1812-1813     James Madison
##                                       FIRST LADY   VICE PRESIDENT
## 1                              Martha Washington       John Adams
## 2                                  Abigail Adams Thomas Jefferson
## 3 Martha Wayles Skelton Jefferson\n   (no image)       Aaron Burr
## 4 Martha Wayles Skelton Jefferson\n   (no image)   George Clinton
## 5                                 Dolley Madison   George Clinton
## 6                                 Dolley Madison    office vacant
tail(usPresidents)
##                                                                                                                                                                                                                                                                                              YEAR
## 63                                                                                                                                                                                                                                                                                      2001-2009
## 64                                                                                                                                                                                                                                                                                          2009-
## 65 Presidents: Introduction (Rights/Ordering\n        Info.) | Adams\n      - Cleveland | Clinton - Harding Harrison\n      - Jefferson | Johnson - McKinley | Monroe\n                        - Roosevelt | Taft - Truman | Tyler\n                        - WilsonList of names, Alphabetically
## 66                       First Ladies: Introduction\n                  (Rights/Ordering Info.) | Adams\n                  - Coolidge | Eisenhower - HooverJackson\n                  - Pierce  | \n                                Polk - Wilson | List\n                of names, Alphabetically
## 67                                                                                                                            Vice Presidents: Introduction (Rights/Ordering Info.) | Adams - Coolidge | Curtis - Hobart Humphrey - Rockefeller | Roosevelt - WilsonList of names, Alphabetically
## 68                                                                                                                                                                                                                                                                     Top\n              of Page
##         PRESIDENT     FIRST LADY  VICE PRESIDENT
## 63 George W. Bush     Laura Bush  Richard Cheney
## 64   Barack Obama Michelle Obama Joseph R. Biden
## 65           <NA>           <NA>            <NA>
## 66           <NA>           <NA>            <NA>
## 67           <NA>           <NA>            <NA>
## 68           <NA>           <NA>            <NA>
tail(x = usPresidents$YEAR)
## [1] "2001-2009"                                                                                                                                                                                                                                                                                     
## [2] "2009-"                                                                                                                                                                                                                                                                                         
## [3] "Presidents: Introduction (Rights/Ordering\n        Info.) | Adams\n      - Cleveland | Clinton - Harding Harrison\n      - Jefferson | Johnson - McKinley | Monroe\n                        - Roosevelt | Taft - Truman | Tyler\n                        - WilsonList of names, Alphabetically"
## [4] "First Ladies: Introduction\n                  (Rights/Ordering Info.) | Adams\n                  - Coolidge | Eisenhower - HooverJackson\n                  - Pierce  | \n                                Polk - Wilson | List\n                of names, Alphabetically"                      
## [5] "Vice Presidents: Introduction (Rights/Ordering Info.) | Adams - Coolidge | Curtis - Hobart Humphrey - Rockefeller | Roosevelt - WilsonList of names, Alphabetically"                                                                                                                           
## [6] "Top\n              of Page"
# Keep only the rows that describe a presidency. The trailing rows scraped from
# the page are navigation links whose PRESIDENT field is NA, so filter on that
# instead of hard-coding the row count.
usPresidents <- usPresidents[!is.na(usPresidents$PRESIDENT), ]

head(usPresidents)
##        YEAR         PRESIDENT
## 1 1789-1797 George Washington
## 2 1797-1801        John Adams
## 3 1801-1805  Thomas Jefferson
## 4 1805-1809  Thomas Jefferson
## 5 1809-1812     James Madison
## 6 1812-1813     James Madison
##                                       FIRST LADY   VICE PRESIDENT
## 1                              Martha Washington       John Adams
## 2                                  Abigail Adams Thomas Jefferson
## 3 Martha Wayles Skelton Jefferson\n   (no image)       Aaron Burr
## 4 Martha Wayles Skelton Jefferson\n   (no image)   George Clinton
## 5                                 Dolley Madison   George Clinton
## 6                                 Dolley Madison    office vacant
tail(usPresidents)
##         YEAR      PRESIDENT             FIRST LADY    VICE PRESIDENT
## 59 1977-1981   Jimmy Carter        Rosalynn Carter Walter F. Mondale
## 60 1981-1989  Ronald Reagan           Nancy Reagan       George Bush
## 61 1989-1993    George Bush           Barbara Bush        Dan Quayle
## 62 1993-2001   Bill Clinton Hillary Rodham Clinton       Albert Gore
## 63 2001-2009 George W. Bush             Laura Bush    Richard Cheney
## 64     2009-   Barack Obama         Michelle Obama   Joseph R. Biden
library(stringr)

# Split every YEAR value on the hyphen. The result is a list holding one
# character vector of pieces per row.
years <- str_split(string = usPresidents$YEAR, pattern = "-")

head(years)
## [[1]]
## [1] "1789" "1797"
## 
## [[2]]
## [1] "1797" "1801"
## 
## [[3]]
## [1] "1801" "1805"
## 
## [[4]]
## [1] "1805" "1809"
## 
## [[5]]
## [1] "1809" "1812"
## 
## [[6]]
## [1] "1812" "1813"
library(plyr)

# Label the first two pieces of each split range as Begin and End; ldply()
# stacks the labelled pairs into a data frame with one row per range.
ldply(.data = years, .fun = function(parts) {
  c(Begin = parts[1], End = parts[2])
})
##    Begin  End
## 1   1789 1797
## 2   1797 1801
## 3   1801 1805
## 4   1805 1809
## 5   1809 1812
## 6   1812 1813
## 7   1813 1814
## 8   1814 1817
## 9   1817 1825
## 10  1825 1829
## 11  1829 1832
## 12  1833 1837
## 13  1837 1841
## 14  1841 <NA>
## 15  1841 1845
## 16  1845 1849
## 17  1849 1850
## 18  1850 1853
## 19  1853 <NA>
## 20  1853 1857
## 21  1857 1861
## 22  1861 1865
## 23  1865 <NA>
## 24  1865 1869
## 25  1869 1873
## 26  1873 1875
## 27  1875 1877
## 28  1877 1881
## 29  1881 <NA>
## 30  1881 1885
## 31  1885 <NA>
## 32  1885 1889
## 33  1889 1893
## 34  1893 1897
## 35  1897 1899
## 36  1899 1901
## 37  1901 <NA>
## 38  1901 1905
## 39  1905 1909
## 40  1909 1912
## 41  1912 1913
## 42  1913 1921
## 43  1921 1923
## 44  1923 1925
## 45  1925 1929
## 46  1929 1933
## 47  1933 1941
## 48  1941 1945
## 49  1945 <NA>
## 50  1945 1949
## 51  1949 1953
## 52  1953 1961
## 53  1961 1963
## 54  1963 1965
## 55  1963 1969
## 56  1969 1973
## 57  1973 1974
## 58  1974 1977
## 59  1977 1981
## 60  1981 1989
## 61  1989 1993
## 62  1993 2001
## 63  2001 2009
## 64  2009
# Bind the split pieces row-wise in a single call and convert to a data frame.
# do.call(rbind, ...) leaves no row names on the matrix, so data.frame() has no
# duplicated row.names to warn about (Reduce(rbind, ...) labelled the rows
# "init", "", "", ... and triggered that warning), and it avoids re-copying the
# growing matrix once per element.
# Entries with a single piece (e.g. a YEAR of "1841") are recycled by rbind(),
# so their begin and end columns hold the same year.
yearMatrix <- data.frame(
  do.call(what = rbind, args = years),
  stringsAsFactors = FALSE
)
head(yearMatrix)
##     X1   X2
## 1 1789 1797
## 2 1797 1801
## 3 1801 1805
## 4 1805 1809
## 5 1809 1812
## 6 1812 1813
names(yearMatrix) <- c("Begin", "End")

head(yearMatrix)
##   Begin  End
## 1  1789 1797
## 2  1797 1801
## 3  1801 1805
## 4  1805 1809
## 5  1809 1812
## 6  1812 1813
usPresidents <- cbind(usPresidents, yearMatrix)

head(usPresidents)
##        YEAR         PRESIDENT
## 1 1789-1797 George Washington
## 2 1797-1801        John Adams
## 3 1801-1805  Thomas Jefferson
## 4 1805-1809  Thomas Jefferson
## 5 1809-1812     James Madison
## 6 1812-1813     James Madison
##                                       FIRST LADY   VICE PRESIDENT Begin
## 1                              Martha Washington       John Adams  1789
## 2                                  Abigail Adams Thomas Jefferson  1797
## 3 Martha Wayles Skelton Jefferson\n   (no image)       Aaron Burr  1801
## 4 Martha Wayles Skelton Jefferson\n   (no image)   George Clinton  1805
## 5                                 Dolley Madison   George Clinton  1809
## 6                                 Dolley Madison    office vacant  1812
##    End
## 1 1797
## 2 1801
## 3 1805
## 4 1809
## 5 1812
## 6 1813
# Extract characters 1 through 3 of every president's name.
str_sub(string = usPresidents$PRESIDENT, start = 1, end = 3)
##  [1] "Geo" "Joh" "Tho" "Tho" "Jam" "Jam" "Jam" "Jam" "Jam" "Joh" "And"
## [12] "And" "Mar" "Wil" "Joh" "Jam" "Zac" "Mil" "Fra" "Fra" "Jam" "Abr"
## [23] "Abr" "And" "Uly" "Uly" "Uly" "Rut" "Jam" "Che" "Gro" "Gro" "Ben"
## [34] "Gro" "Wil" "Wil" "Wil" "The" "The" "Wil" "Wil" "Woo" "War" "Cal"
## [45] "Cal" "Her" "Fra" "Fra" "Fra" "Har" "Har" "Dwi" "Joh" "Lyn" "Lyn"
## [56] "Ric" "Ric" "Ger" "Jim" "Ron" "Geo" "Bil" "Geo" "Bar"
str_sub(string = usPresidents$PRESIDENT, start = 4, end = 8)
##  [1] "rge W" "n Ada" "mas J" "mas J" "es Ma" "es Ma" "es Ma" "es Ma"
##  [9] "es Mo" "n Qui" "rew J" "rew J" "tin V" "liam " "n Tyl" "es K."
## [17] "hary " "lard " "nklin" "nklin" "es Bu" "aham " "aham " "rew J"
## [25] "sses " "sses " "sses " "herfo" "es A." "ster " "ver C" "ver C"
## [33] "jamin" "ver C" "liam " "liam " "liam " "odore" "odore" "liam "
## [41] "liam " "drow " "ren G" "vin C" "vin C" "bert " "nklin" "nklin"
## [49] "nklin" "ry S." "ry S." "ght D" "n F. " "don B" "don B" "hard "
## [57] "hard " "ald R" "my Ca" "ald R" "rge B" "l Cli" "rge W" "ack O"
# Presidents whose starting year ends in 1, i.e. whose fourth character of
# Begin is "1".
usPresidents[str_sub(usPresidents$Begin, start = 4, end = 4) == "1", ]
##         YEAR              PRESIDENT
## 3  1801-1805       Thomas Jefferson
## 14      1841 William Henry Harrison
## 15 1841-1845             John Tyler
## 22 1861-1865        Abraham Lincoln
## 29      1881      James A. Garfield
## 30 1881-1885      Chester A. Arthur
## 37      1901       William McKinley
## 38 1901-1905     Theodore Roosevelt
## 43 1921-1923      Warren G. Harding
## 48 1941-1945  Franklin D. Roosevelt
## 53 1961-1963        John F. Kennedy
## 60 1981-1989          Ronald Reagan
## 63 2001-2009         George W. Bush
##                                                      FIRST LADY
## 3                Martha Wayles Skelton Jefferson\n   (no image)
## 14                                 Anna Tuthill Symmes Harrison
## 15 Letitia Christian Tyler and Julia Gardiner Tyler (no images)
## 22                                            Mary Todd Lincoln
## 29                                    Lucretia Rudolph Garfield
## 30                                   Ellen Lewis Herndon Arthur
## 37                                          Ida Saxton McKinley
## 38                                 Edith Kermit Carow Roosevelt
## 43                                       Florence Kling Harding
## 48                                            Eleanor Roosevelt
## 53                                   Jacqueline Kennedy Onassis
## 60                                                 Nancy Reagan
## 63                                                   Laura Bush
##        VICE PRESIDENT Begin  End
## 3          Aaron Burr  1801 1805
## 14         John Tyler  1841 1841
## 15      office vacant  1841 1845
## 22    Hannibal Hamlin  1861 1865
## 29  Chester A. Arthur  1881 1881
## 30      office vacant  1881 1885
## 37 Theodore Roosevelt  1901 1901
## 38      office vacant  1901 1905
## 43    Calvin Coolidge  1921 1923
## 48   Henry A. Wallace  1941 1945
## 53  Lyndon B. Johnson  1961 1963
## 60        George Bush  1981 1989
## 63     Richard Cheney  2001 2009
# Presidents whose starting year ends in 1, restricted to the year and
# president columns.
usPresidents[
  str_sub(usPresidents$Begin, start = 4, end = 4) == "1",
  c("YEAR", "PRESIDENT", "Begin", "End")
]
##         YEAR              PRESIDENT Begin  End
## 3  1801-1805       Thomas Jefferson  1801 1805
## 14      1841 William Henry Harrison  1841 1841
## 15 1841-1845             John Tyler  1841 1845
## 22 1861-1865        Abraham Lincoln  1861 1865
## 29      1881      James A. Garfield  1881 1881
## 30 1881-1885      Chester A. Arthur  1881 1885
## 37      1901       William McKinley  1901 1901
## 38 1901-1905     Theodore Roosevelt  1901 1905
## 43 1921-1923      Warren G. Harding  1921 1923
## 48 1941-1945  Franklin D. Roosevelt  1941 1945
## 53 1961-1963        John F. Kennedy  1961 1963
## 60 1981-1989          Ronald Reagan  1981 1989
## 63 2001-2009         George W. Bush  2001 2009
# Flag every name containing "John". The match is case-sensitive and may occur
# anywhere in the string, so surnames such as "Johnson" are flagged as well.
str_detect(usPresidents$PRESIDENT, "John")
##  [1] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [12] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE
## [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Case-insensitive variant of the search. regex(..., ignore_case = TRUE) is the
# supported replacement for the deprecated ignore.case() modifier.
str_detect(usPresidents$PRESIDENT, regex("John", ignore_case = TRUE))
##  [1] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [12] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE
## [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Use the case-insensitive match as a row filter. regex(..., ignore_case = TRUE)
# is the supported replacement for the deprecated ignore.case() modifier.
usPresidents[str_detect(usPresidents$PRESIDENT, regex("John", ignore_case = TRUE)), ]
##         YEAR         PRESIDENT
## 2  1797-1801        John Adams
## 10 1825-1829 John Quincy Adams
## 15 1841-1845        John Tyler
## 24 1865-1869    Andrew Johnson
## 53 1961-1963   John F. Kennedy
## 54 1963-1965 Lyndon B. Johnson
## 55 1963-1969 Lyndon B. Johnson
##                                                      FIRST LADY
## 2                                                 Abigail Adams
## 10                                       Louisa Catherine Adams
## 15 Letitia Christian Tyler and Julia Gardiner Tyler (no images)
## 24                                       Eliza McCardle Johnson
## 53                                   Jacqueline Kennedy Onassis
## 54                                            Lady Bird Johnson
## 55                                            Lady Bird Johnson
##        VICE PRESIDENT Begin  End
## 2    Thomas Jefferson  1797 1801
## 10    John C. Calhoun  1825 1829
## 15      office vacant  1841 1845
## 24      office vacant  1865 1869
## 53  Lyndon B. Johnson  1961 1963
## 54      office vacant  1963 1965
## 55 Hubert M. Humphrey  1963 1969
# Open a connection to the remote .rdata file.
con <- url("http://www.jaredlander.com/data/warTimes.rdata")

# Restore the saved objects into the current environment; the code that follows
# uses `warTimes` from it.
# NOTE(review): load() overwrites any existing objects with the same names, and
# the file is fetched over plain http -- confirm the source is trusted.
load(con)

# Release the connection once the data has been read.
close(con)
head(warTimes, 12)
##  [1] "September 1, 1774 ACAEA September 3, 1783"
##  [2] "September 1, 1774 ACAEA March 17, 1776"   
##  [3] "1775ACAEA1783"                            
##  [4] "June 1775 ACAEA October 1776"             
##  [5] "July 1776 ACAEA March 1777"               
##  [6] "June 14, 1777 ACAEA October 17, 1777"     
##  [7] "1777ACAEA1778"                            
##  [8] "1775ACAEA1782"                            
##  [9] "1776ACAEA1794"                            
## [10] "1778ACAEA1782"                            
## [11] "1775ACAEA1782"                            
## [12] "1779ACAEA1782"
warTimes[str_detect(string = warTimes, pattern = "-")]
## [1] "6 June 1944 ACAEA mid-July 1944" "25 August-17 December 1944"
# Split each entry into at most two pieces (n = 2) at the first "ACAEA" or "-"
# separator; both separators occur in the data. This replaces the character
# vector `warTimes` with a list of pieces.
warTimes <- str_split(string = warTimes, pattern = "(ACAEA)|-", n = 2)

head(warTimes)
## [[1]]
## [1] "September 1, 1774 " " September 3, 1783"
## 
## [[2]]
## [1] "September 1, 1774 " " March 17, 1776"   
## 
## [[3]]
## [1] "1775" "1783"
## 
## [[4]]
## [1] "June 1775 "    " October 1776"
## 
## [[5]]
## [1] "July 1776 "  " March 1777"
## 
## [[6]]
## [1] "June 14, 1777 "    " October 17, 1777"
# Take the first piece of every split entry, i.e. the start of each range.
# vapply() guarantees a character vector with one value per entry, whereas
# sapply() silently changes its return type on empty or irregular input.
start <- vapply(X = warTimes, FUN = function(x) x[1], FUN.VALUE = character(1))

head(start)
## [1] "September 1, 1774 " "September 1, 1774 " "1775"              
## [4] "June 1775 "         "July 1776 "         "June 14, 1777 "
# Show the values with leading and trailing whitespace removed. The result is
# only printed, not assigned, so `start` itself keeps its trailing spaces.
str_trim(string = start)
##   [1] "September 1, 1774"  "September 1, 1774"  "1775"              
##   [4] "June 1775"          "July 1776"          "June 14, 1777"     
##   [7] "1777"               "1775"               "1776"              
##  [10] "1778"               "1775"               "1779"              
##  [13] "January"            "1785"               "1798"              
##  [16] "1801"               "August"             "June 18, 1812"     
##  [19] "1812"               "1813"               "1812"              
##  [22] "1812"               "1813"               "1813"              
##  [25] "1813"               "1814"               "1813"              
##  [28] "1814"               "1813"               "1815"              
##  [31] "November 22, 1817"  "1817"               "1819"              
##  [34] "November 5"         "1823"               "1825"              
##  [37] "1827"               "May"                "February 6"        
##  [40] "1838"               "December 23, 1835"  "December 1838"     
##  [43] "October 19"         "March 11, 1845"     "April 25, 1846"    
##  [46] "1846"               "1846"               "1846"              
##  [49] "1847"               "1847"               "1858"              
##  [52] "1847"               "April 28"           "1851"              
##  [55] "July 13, 1854"      "August 4, 1855"     "October 1855"      
##  [58] "1855"               "1855"               "1855"              
##  [61] "1855"               "1856"               "6"                 
##  [64] "1859"               "March 6, 1860"      "1860"              
##  [67] "April 12, 1861"     "1861"               "1861"              
##  [70] "1862"               "1862"               "1862"              
##  [73] "August 17"          "July 31, 1861"      "1863"              
##  [76] "July 20, 1863"      "1865"               "1864"              
##  [79] "1866"               "June 1867"          "1867"              
##  [82] "June 17, 1870"      "June 1, 1871"       "July 6, 1872"      
##  [85] "February 12, 1874"  "June 27, 1874"      "1876"              
##  [88] "1877"               "1878"               "1878"              
##  [91] "1879"               "1879"               "June"              
##  [94] "March 30"           "1887"               "November 1890"     
##  [97] "June 22"            "1891"               "January 21"        
## [100] "1898"               "April 25"           "1898"              
## [103] "1898"               "1898"               "1898"              
## [106] "1898"               "1898"               "June 2, 1899"      
## [109] "1899"               "September 28, 1899" "1912"              
## [112] "April 21, 1914"     "July 28, 1915"      "1916"              
## [115] "1917"               "1917"               "1917"              
## [118] "1917"               "1917"               "1917"              
## [121] "1918"               "1918"               "1918"              
## [124] "December 7, 1941"   "1941"               "1941"              
## [127] ""                   ""                   "June 3, 1942"      
## [130] "August 7, 1942"     "January 1942"       "November, 1943"    
## [133] "June"               "October 20, 1944"   "January"           
## [136] "1 May"              "16 February"        "1941"              
## [139] "8"                  "1942"               "17 November 1942"  
## [142] "9 July"             "1942"               "3 September 1943"  
## [145] "January 22, 1944"   "22 January 1944"    "6 June 1944"       
## [148] "6 June"             "15 August 1944"     "25 August"         
## [151] "25 August 1944"     "16 December 1944"   "February 8, 1945"  
## [154] "6 April 1945"       "1947"               "1950"              
## [157] "1950"               "1953"               "1953"              
## [160] "1970"               "April 28, 1965"     "25 October"        
## [163] "July 15"            "August 24, 1982"    "August 19, 1981"   
## [166] "March 1986"         "April 15, 1986"     "1987"              
## [169] "January 4, 1989"    "20 December 1989"   "August 2, 1990"    
## [172] "1991"               "1992"               "1993"              
## [175] "19 September 1994"  "August 20, 1998"    "March 24"          
## [178] "7 October 2001"     "7 October 2001"     "October 7, 2001"   
## [181] "15 January 2002"    "7 October 2002"     "6 February 2007"   
## [184] "March 20, 2003"     "March 16, 2004"     "January 14, 2010"  
## [187] "2003"               "March 19"
str_extract(string = start, pattern = "January")
##   [1] NA        NA        NA        NA        NA        NA        NA       
##   [8] NA        NA        NA        NA        NA        "January" NA       
##  [15] NA        NA        NA        NA        NA        NA        NA       
##  [22] NA        NA        NA        NA        NA        NA        NA       
##  [29] NA        NA        NA        NA        NA        NA        NA       
##  [36] NA        NA        NA        NA        NA        NA        NA       
##  [43] NA        NA        NA        NA        NA        NA        NA       
##  [50] NA        NA        NA        NA        NA        NA        NA       
##  [57] NA        NA        NA        NA        NA        NA        NA       
##  [64] NA        NA        NA        NA        NA        NA        NA       
##  [71] NA        NA        NA        NA        NA        NA        NA       
##  [78] NA        NA        NA        NA        NA        NA        NA       
##  [85] NA        NA        NA        NA        NA        NA        NA       
##  [92] NA        NA        NA        NA        NA        NA        NA       
##  [99] "January" NA        NA        NA        NA        NA        NA       
## [106] NA        NA        NA        NA        NA        NA        NA       
## [113] NA        NA        NA        NA        NA        NA        NA       
## [120] NA        NA        NA        NA        NA        NA        NA       
## [127] NA        NA        NA        NA        "January" NA        NA       
## [134] NA        "January" NA        NA        NA        NA        NA       
## [141] NA        NA        NA        NA        "January" "January" NA       
## [148] NA        NA        NA        NA        NA        NA        NA       
## [155] NA        NA        NA        NA        NA        NA        NA       
## [162] NA        NA        NA        NA        NA        NA        NA       
## [169] "January" NA        NA        NA        NA        NA        NA       
## [176] NA        NA        NA        NA        NA        "January" NA       
## [183] NA        NA        NA        "January" NA        NA
start[str_detect(string = start, pattern = "January")]
## [1] "January "          "January 21"        "January 1942 "    
## [4] "January "          "January 22, 1944 " "22 January 1944 " 
## [7] "January 4, 1989"   "15 January 2002 "  "January 14, 2010 "
head(str_extract(string = start, pattern = "[0-9][0-9][0-9][0-9]"), n = 20)
##  [1] "1774" "1774" "1775" "1775" "1776" "1777" "1777" "1775" "1776" "1778"
## [11] "1775" "1779" NA     "1785" "1798" "1801" NA     "1812" "1812" "1813"
head(str_extract(string = start, pattern = "[0-9]{4}"), n = 20)
##  [1] "1774" "1774" "1775" "1775" "1776" "1777" "1777" "1775" "1776" "1778"
## [11] "1775" "1779" NA     "1785" "1798" "1801" NA     "1812" "1812" "1813"
head(str_extract(string = start, pattern = "\\d{4}"), n = 20)
##  [1] "1774" "1774" "1775" "1775" "1776" "1777" "1777" "1775" "1776" "1778"
## [11] "1775" "1779" NA     "1785" "1798" "1801" NA     "1812" "1812" "1813"
# \\d{1,3} takes the first run of one to three digits, so a four-digit year is
# cut to its first three digits and a day number can match before the year.
head(str_extract(string = start, pattern = "\\d{1,3}"), n = 20)
##  [1] "1"   "1"   "177" "177" "177" "14"  "177" "177" "177" "177" "177"
## [12] "177" NA    "178" "179" "180" NA    "18"  "181" "181"
head(str_extract(string = start, pattern = "^\\d{4}"), n = 30)
##  [1] NA     NA     "1775" NA     NA     NA     "1777" "1775" "1776" "1778"
## [11] "1775" "1779" NA     "1785" "1798" "1801" NA     NA     "1812" "1813"
## [21] "1812" "1812" "1813" "1813" "1813" "1814" "1813" "1814" "1813" "1815"
# Anchoring with both ^ and $ matches only strings that consist of exactly four
# digits and nothing else.
head(str_extract(string = start, pattern = "^\\d{4}$"), n = 30)
##  [1] NA     NA     "1775" NA     NA     NA     "1777" "1775" "1776" "1778"
## [11] "1775" "1779" NA     "1785" "1798" "1801" NA     NA     "1812" "1813"
## [21] "1812" "1812" "1813" "1813" "1813" "1814" "1813" "1814" "1813" "1815"
# str_replace() changes only the first match in each string, so just the first
# digit becomes "x".
head(str_replace(string = start, pattern = "\\d", replacement = "x"), 30)
##  [1] "September x, 1774 " "September x, 1774 " "x775"              
##  [4] "June x775 "         "July x776 "         "June x4, 1777 "    
##  [7] "x777"               "x775"               "x776"              
## [10] "x778"               "x775"               "x779"              
## [13] "January "           "x785"               "x798"              
## [16] "x801"               "August "            "June x8, 1812 "    
## [19] "x812"               "x813"               "x812"              
## [22] "x812"               "x813"               "x813"              
## [25] "x813"               "x814"               "x813"              
## [28] "x814"               "x813"               "x815"
head(str_replace_all(string = start, pattern = "\\d", replacement = "x"), 30)
##  [1] "September x, xxxx " "September x, xxxx " "xxxx"              
##  [4] "June xxxx "         "July xxxx "         "June xx, xxxx "    
##  [7] "xxxx"               "xxxx"               "xxxx"              
## [10] "xxxx"               "xxxx"               "xxxx"              
## [13] "January "           "xxxx"               "xxxx"              
## [16] "xxxx"               "August "            "June xx, xxxx "    
## [19] "xxxx"               "xxxx"               "xxxx"              
## [22] "xxxx"               "xxxx"               "xxxx"              
## [25] "xxxx"               "xxxx"               "xxxx"              
## [28] "xxxx"               "xxxx"               "xxxx"
# Every run of one to four digits is replaced as a whole, so each number
# collapses to a single "x".
head(str_replace_all(string = start, pattern = "\\d{1,4}", replacement = "x"), 30)
##  [1] "September x, x " "September x, x " "x"              
##  [4] "June x "         "July x "         "June x, x "     
##  [7] "x"               "x"               "x"              
## [10] "x"               "x"               "x"              
## [13] "January "        "x"               "x"              
## [16] "x"               "August "         "June x, x "     
## [19] "x"               "x"               "x"              
## [22] "x"               "x"               "x"              
## [25] "x"               "x"               "x"              
## [28] "x"               "x"               "x"
x <- c("<a href = 'index.html'>The link is here</a>", "<b>This is bold text</b>")

x
## [1] "<a href = 'index.html'>The link is here</a>"
## [2] "<b>This is bold text</b>"
# Strip the surrounding tags: `<.+?>` lazily matches the opening tag, `(.+?)`
# captures the text after it, and `<.+>` matches the closing tag. The whole
# match is replaced by the captured text via the back-reference \\1.
str_replace(string = x, pattern = "<.+?>(.+?)<.+>", replacement = "\\1")
## [1] "The link is here"  "This is bold text"