library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rvest)
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(dplyr)
library(pander)
library(stringr)

library(htmlwidgets)
# grep() 

grep("[a-d]", letters) # chọn các ký tự a, b, c, d có cái nào trong letters ko, letters là 26 chữ cái, # 4 
## [1] 1 2 3 4
# ra kết quả là vị trí so với
txt <- c("arm","foot","lefroo", "bafoobar")
if(length(i <- grep("foo", txt)))
  cat("'foo' appears at least once in\n\t", txt, "\n")
## 'foo' appears at least once in
##   arm foot lefroo bafoobar
txt[i]
## [1] "foot"     "bafoobar"
grep("foo", txt) # có cái nào chứa foo ko trong c("arm","foot","lefroo", "bafoobar") # 2 
## [1] 2 4
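# A related sketch: grepl() is the logical counterpart of grep(), returning TRUE/FALSE
# per element instead of indices (base R, same pattern and data as above).
grepl("foo", txt)       # FALSE TRUE FALSE TRUE
txt[grepl("foo", txt)]  # "foot" "bafoobar" -- the same elements grep() reported at positions 2 and 4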
# gsub()

## Double all 'a' or 'b's;  "\" must be escaped, i.e., 'doubled'
gsub("([ab])", "\\1_\\1_", "abc and ABC") ## "a_a_b_b_c a_a_nd ABC"
## [1] "a_a_b_b_c a_a_nd ABC"
txt <- c("The", "licenses", "for", "most", "software", "are",
  "designed", "to", "take", "away", "your", "freedom",
  "to", "share", "and", "change", "it.",
  "", "By", "contrast,", "the", "GNU", "General", "Public", "License",
  "is", "intended", "to", "guarantee", "your", "freedom", "to",
  "share", "and", "change", "free", "software", "--",
  "to", "make", "sure", "the", "software", "is",
  "free", "for", "all", "its", "users")
(i <- grep("[gu]", txt))  # indices of the elements of txt that contain "g" or "u"
## [1]  7 11 16 24 29 30 35 41 49
stopifnot( txt[i] == grep("[gu]", txt, value = TRUE) ) # stopifnot() just verifies that indexing by i agrees with value = TRUE


#grep("[gu]", txt) # xác định vị trí của txt có chữ g và u
txt[i] == grep("[gu]", txt, value = TRUE) # [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
txt[gsub("g","#", txt) !=
    gsub("g","#", txt, ignore.case = TRUE)] # the "G" words # = FALSE ko có cái nào
## [1] "GNU"     "General"
# regular expressions (regexps) with stringr

x <- c("\"", "\\") 
writeLines(x) #   "   \ 
## "
## \
# "\n" new line 

x <- "aaaaaaaaaaaaaa\nbbbbbbbbbbbbbbbbbbbb"
x
## [1] "aaaaaaaaaaaaaa\nbbbbbbbbbbbbbbbbbbbb"
x <- "In ALGOL, you could do logical AND with /\\."
x # "In ALGOL, you could do logical AND with /\\."
## [1] "In ALGOL, you could do logical AND with /\\."
writeLines(x) # In ALGOL, you could do logical AND with /\.
## In ALGOL, you could do logical AND with /\.
x_new_line <- "long\tlines can be\nbroken with newlines"
x_new_line # "long\tlines can be\nbroken with newlines"
## [1] "long\tlines can be\nbroken with newlines"
writeLines(x_new_line)
## long lines can be
## broken with newlines

y_tab <- "aaaaaaaaaaaaaaaaaaa\tbbbbbbbbbbbbbbbbbbbbb"
y_tab # "aaaaaaaaaaaaaaaaaaa\tbbbbbbbbbbbbbbbbbbbbb"
## [1] "aaaaaaaaaaaaaaaaaaa\tbbbbbbbbbbbbbbbbbbbbb"
writeLines(y_tab) #aaaaaaaaaaaaaaaaaaa  bbbbbbbbbbbbbbbbbbbbb
## aaaaaaaaaaaaaaaaaaa  bbbbbbbbbbbbbbbbbbbbb
z_back_space = "aa\bb" 
writeLines(z_back_space) # ab
## aab
t_alert <- "aa\abb" # \a = dấu vuông
t_alert 
## [1] "aa\abb"
writeLines(t_alert) # prints "aabb" with the alert character in between (often shown as a box)
## aabb
f_form_feed <- "aa\fbb" # \a = dấu vuông
f_form_feed 
## [1] "aa\fbb"
writeLines(f_form_feed) # prints "aabb" with the form feed in between (often shown as a box)
## aabb
# \r carriage return
r_return <- "eeeeaa\rbbbb"
r_return 
## [1] "eeeeaa\rbbbb"
writeLines(r_return)
## eeeeaa
## bbbb
writeLines("\u0126\u0119\u1114\u022d\u2001\u03e2\u0954\u0f3f\u13d3\u147b\u203c")
## He<U+1114><U+022D> <U+03E2><U+0954><U+0F3F><U+13D3><U+147B><U+203C>
r_return <- "ddddd12\rbbbeeee" # bỏ ddddd12 đưa bbbeeee lên đầu => bbbeeee
# stringr continued

str_length(c("a", "R for data science", NA))
## [1]  1 18 NA
#str_length(real_estate_bds_tibble$sqm) # str_length() only works on character vectors

str_c("x", "y") # string combine 
## [1] "xy"
#> [1] "xy"
str_c("x", "y", "z")
## [1] "xyz"
#> [1] "xyz"

x <- c("apple", "banana", "pear", "aaaaap")

str_view(x,".p") # dấu . là lấy toàn bộ id: .p lấy toàn bộ x và trước x
str_view(c("abc", "a.c", "bef"), "a\\.c")
x <- "\\\\"
x
## [1] "\\\\"
writeLines(x)
## \\
x <- "a\\b"
str_view(x, "\\\\") # match dấu \ duy nhất , match a\b là phải a\\\\b
writeLines(x)
## a\b
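# Double-checking the backslash match with str_detect() -- the four-backslash string
# is the regex for one literal backslash:
str_detect(x, "\\\\")  # TRUE: x contains a single literal backslash between a and b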
Price <- lapply(paste0('https://nha.chotot.com/toan-quoc/thue-van-phong-mat-bang-kinh-doanh?page=', 1:2),
                function(url){
                  url %>% read_html() %>% 
                    html_nodes(".adPriceNormal___puYxd") %>% 
                    html_text() %>%
                    gsub('[\r\n\t]', '', .) # strip carriage returns, newlines and tabs from the scraped text
                  
                })
sqm <- lapply(paste0('https://nha.chotot.com/toan-quoc/thue-van-phong-mat-bang-kinh-doanh?page=', 1:2),
                function(url){
                  url %>% read_html() %>% 
                    html_nodes(".adItemCondition___2daw2") %>% 
                    html_text() %>%
                    gsub('[\r\n\t]', '', .) # strip carriage returns, newlines and tabs from the scraped text
                  
                })
real_estate_bds_tibble <- tibble(
  x = sqm,
  y = Price
)

typeof(Price)
## [1] "list"
typeof(real_estate_bds_tibble)
## [1] "list"
view(real_estate_bds_tibble)


sqm_new <- unlist(sqm)  # flatten the list of character vectors into a single character vector
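# A minimal sketch building a flat tibble from the scraped vectors instead of the
# list columns above (this assumes sqm and Price return the same number of ads,
# which the site does not guarantee):
price_new <- unlist(Price)
real_estate_flat <- tibble(sqm = sqm_new, price = price_new)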
homicides <- readLines("https://raw.githubusercontent.com/hadv/PAR/master/homicides.txt") # read the raw file, one line per record

g <- grep("iconHomicideShooting", homicides) # xem bao nhiêu chữ trong homicides
length(g)
## [1] 228
g <- grep("iconHomicideShooting|icon_homicide_shooting", homicides)
length(g)
## [1] 1003
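# Counting matches directly with sum(grepl()) is another common idiom for the same check:
sum(grepl("iconHomicideShooting|icon_homicide_shooting", homicides))  # should equal length(g), i.e. 1003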
g <- grep("Cause: shooting", homicides) # tìm từ shooting
length(g)
## [1] 228
g <- grep("Cause: [Ss]hooting", homicides) # tìm từ shooting chữ s cả hoa lẫn thường 

homicides[945]
## [1] "39.31193300000, -76.60867390000, icon_homicide_shooting, 'p825', '<dl><dt><a href=\"http://essentials.baltimoresun.com/micro_sun/homicides/victim/825/carlos-williams\">Carlos Williams</a></dt><dd class=\"address\">1900 Boone St<br />Baltimore, MD 21218</dd><dd>Race: Black<br />Gender: male<br />Age: 50 years old</dd><dd>Found on March 21, 2010</dd><dd>Victim died at Johns Hopkins Hospital</dd><dd>Cause: Shooting</dd><dd class=\"popup-note\"><p>Williams was shot in an apparent robbery as he was leaving for work, according to police</p></dd></dl>'"
#[1] "39.30677400000, -76.59891100000, icon_homicide_shooting, 'p816', '<dl><dt><a href=\"http://essentials.baltimoresun.com/micro_sun/homicides/victim/816/kenly-wheeler\">Kenly Wheeler</a></dt><dd class=\"address\">1400 N Caroline St<br />Baltimore, MD 21213</dd><dd>Race: Black<br />Gender: male<br />Age: 29 years old</dd><dd>Found on March  3, 2010</dd><dd>Victim died at Scene</dd><dd>Cause: Shooting</dd><dd class=\"popup-note\"><p>Wheeler\\'s body was&nbsp;found on the grounds of Dr. Bernard Harris Sr.&nbsp;Elementary School</p></dd></dl>'"


regexpr("<dd>[F|f]ound(.*)</dd>", homicides[1:10]) # find giữa 2 tag <dd> và <dd> bao gồm chữ found 
##  [1] 177 178 188 189 178 182 178 187 182 183
## attr(,"match.length")
##  [1] 93 86 89 90 89 84 85 84 88 84
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
# the first vector gives the match start positions; attr(,"match.length") gives the match lengths in characters

homicides[1]
## [1] "39.311024, -76.674227, iconHomicideShooting, 'p2', '<dl><dt>Leon Nelson</dt><dd class=\"address\">3400 Clifton Ave.<br />Baltimore, MD 21216</dd><dd>black male, 17 years old</dd><dd>Found on January 1, 2007</dd><dd>Victim died at Shock Trauma</dd><dd>Cause: shooting</dd></dl>'"
#[1] "39.311024, -76.674227, iconHomicideShooting, 'p2', '<dl><dt>Leon Nelson</dt><dd 
#class=\"address\">3400 Clifton Ave.<br />Baltimore, 
#MD 21216</dd><dd>black male, 17 years old</dd><dd>Found on January 1, 2007</dd><dd>
#Victim died at Shock Trauma</dd><dd>Cause: shooting</dd></dl>'"
substr(homicides[1], 177, 177 + 93)  # extract the matched text using the start position and length reported above
## [1] "<dd>Found on January 1, 2007</dd><dd>Victim died at Shock Trauma</dd><dd>Cause: shooting</dd><"
#"<dd>Found on January 1, 2007</dd><dd>Victim died at Shock Trauma</dd><dd>Cause: shooting</dd><" dư 1 chữ 
substr(homicides[1], 177, 177 + 93 - 1)  # subtracting 1 gives exactly the matched text
## [1] "<dd>Found on January 1, 2007</dd><dd>Victim died at Shock Trauma</dd><dd>Cause: shooting</dd>"
#"<dd>Found on January 1, 2007</dd><dd>Victim died at Shock Trauma</dd><dd>Cause: shooting</dd>"

regexpr("<dd>[F|f]ound(.*?)</dd>", homicides[1:10]) #  (.*?) là lazy, dừng lại ngay </dd> đầu tiên
##  [1] 177 178 188 189 178 182 178 187 182 183
## attr(,"match.length")
##  [1] 33 33 33 33 33 33 33 33 33 33
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
r <- regexpr("<dd>[F|f]ound(.*?)</dd>", homicides[1:10])
regmatches(homicides[1:10], r) # regmatches() returns the matched text directly, no substr() needed
##  [1] "<dd>Found on January 1, 2007</dd>" "<dd>Found on January 2, 2007</dd>"
##  [3] "<dd>Found on January 2, 2007</dd>" "<dd>Found on January 3, 2007</dd>"
##  [5] "<dd>Found on January 5, 2007</dd>" "<dd>Found on January 5, 2007</dd>"
##  [7] "<dd>Found on January 5, 2007</dd>" "<dd>Found on January 7, 2007</dd>"
##  [9] "<dd>Found on January 8, 2007</dd>" "<dd>Found on January 8, 2007</dd>"
#[1] "<dd>Found on January 1, 2007</dd>" "<dd>Found on January 2, 2007</dd>"
#[3] "<dd>Found on January 2, 2007</dd>" "<dd>Found on January 3, 2007</dd>"
#[5] "<dd>Found on January 5, 2007</dd>" "<dd>Found on January 5, 2007</dd>"
#[7] "<dd>Found on January 5, 2007</dd>" "<dd>Found on January 7, 2007</dd>"
#[9] "<dd>Found on January 8, 2007</dd>" "<dd>Found on January 8, 2007</dd>"