library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rvest)
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(dplyr)
library(pander)
library(stringr)

library(htmlwidgets)
# grep() 

grep("[a-d]", letters) # chọn các ký tự a, b, c, d có cái nào trong letters ko, letters là 26 chữ cái, # 4 
## [1] 1 2 3 4
# ra kết quả là vị trí so với
txt <- c("arm","foot","lefroo", "bafoobar")
if(length(i <- grep("foo", txt)))
  cat("'foo' appears at least once in\n\t", txt, "\n")
## 'foo' appears at least once in
##   arm foot lefroo bafoobar
txt[i]
## [1] "foot"     "bafoobar"
grep("foo", txt) # có cái nào chứa foo ko trong c("arm","foot","lefroo", "bafoobar") # 2 
## [1] 2 4
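# A related sketch: grepl() is the logical counterpart of grep(), returning TRUE/FALSE
# per element instead of indices (base R, same pattern and data as above).
grepl("foo", txt)       # FALSE TRUE FALSE TRUE
txt[grepl("foo", txt)]  # "foot" "bafoobar" -- the same elements grep() reported at positions 2 and 4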
# gsub()

## Double all 'a' or 'b's;  "\" must be escaped, i.e., 'doubled'
gsub("([ab])", "\\1_\\1_", "abc and ABC") ## "a_a_b_b_c a_a_nd ABC"
## [1] "a_a_b_b_c a_a_nd ABC"
txt <- c("The", "licenses", "for", "most", "software", "are",
  "designed", "to", "take", "away", "your", "freedom",
  "to", "share", "and", "change", "it.",
  "", "By", "contrast,", "the", "GNU", "General", "Public", "License",
  "is", "intended", "to", "guarantee", "your", "freedom", "to",
  "share", "and", "change", "free", "software", "--",
  "to", "make", "sure", "the", "software", "is",
  "free", "for", "all", "its", "users")
(i <- grep("[gu]", txt))  # indices of the elements of txt that contain "g" or "u"
## [1]  7 11 16 24 29 30 35 41 49
stopifnot( txt[i] == grep("[gu]", txt, value = TRUE) ) # stopifnot() just verifies that indexing by i agrees with value = TRUE


#grep("[gu]", txt) # xác định vị trí của txt có chữ g và u
txt[i] == grep("[gu]", txt, value = TRUE) # [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
txt[gsub("g","#", txt) !=
    gsub("g","#", txt, ignore.case = TRUE)] # the "G" words # = FALSE ko có cái nào
## [1] "GNU"     "General"
# regular expressions (regexps) with stringr

x <- c("\"", "\\") 
writeLines(x) #   "   \ 
## "
## \
# "\n" new line 

x <- "aaaaaaaaaaaaaa\nbbbbbbbbbbbbbbbbbbbb"
x
## [1] "aaaaaaaaaaaaaa\nbbbbbbbbbbbbbbbbbbbb"
x <- "In ALGOL, you could do logical AND with /\\."
x # "In ALGOL, you could do logical AND with /\\."
## [1] "In ALGOL, you could do logical AND with /\\."
writeLines(x) # In ALGOL, you could do logical AND with /\.
## In ALGOL, you could do logical AND with /\.
x_new_line <- "long\tlines can be\nbroken with newlines"
x_new_line # "long\tlines can be\nbroken with newlines"
## [1] "long\tlines can be\nbroken with newlines"
writeLines(x_new_line)
## long lines can be
## broken with newlines

y_tab <- "aaaaaaaaaaaaaaaaaaa\tbbbbbbbbbbbbbbbbbbbbb"
y_tab # "aaaaaaaaaaaaaaaaaaa\tbbbbbbbbbbbbbbbbbbbbb"
## [1] "aaaaaaaaaaaaaaaaaaa\tbbbbbbbbbbbbbbbbbbbbb"
writeLines(y_tab) #aaaaaaaaaaaaaaaaaaa  bbbbbbbbbbbbbbbbbbbbb
## aaaaaaaaaaaaaaaaaaa  bbbbbbbbbbbbbbbbbbbbb
z_back_space = "aa\bb" 
writeLines(z_back_space) # ab
## aab
t_alert <- "aa\abb" # \a = dấu vuông
t_alert 
## [1] "aa\abb"
writeLines(t_alert) # prints "aabb" with the alert character in between (often shown as a box)
## aabb
f_form_feed <- "aa\fbb" # \a = dấu vuông
f_form_feed 
## [1] "aa\fbb"
writeLines(f_form_feed) # prints "aabb" with the form feed in between (often shown as a box)
## aabb
# \r carriage return
r_return <- "eeeeaa\rbbbb"
r_return 
## [1] "eeeeaa\rbbbb"
writeLines(r_return)
## eeeeaa
## bbbb
writeLines("\u0126\u0119\u1114\u022d\u2001\u03e2\u0954\u0f3f\u13d3\u147b\u203c")
## He<U+1114><U+022D> <U+03E2><U+0954><U+0F3F><U+13D3><U+147B><U+203C>
r_return <- "ddddd12\rbbbeeee" # bỏ ddddd12 đưa bbbeeee lên đầu => bbbeeee
# stringr continued

str_length(c("a", "R for data science", NA))
## [1]  1 18 NA
#str_length(real_estate_bds_tibble$sqm) # str_length() only works on character vectors

str_c("x", "y") # string combine 
## [1] "xy"
#> [1] "xy"
str_c("x", "y", "z")
## [1] "xyz"
#> [1] "xyz"

x <- c("apple", "banana", "pear", "aaaaap")

str_view(x,".p") # dấu . là lấy toàn bộ id: .p lấy toàn bộ x và trước x
str_view(c("abc", "a.c", "bef"), "a\\.c")
x <- "\\\\"
x
## [1] "\\\\"
writeLines(x)
## \\
x <- "a\\b"
str_view(x, "\\\\") # match dấu \ duy nhất , match a\b là phải a\\\\b
writeLines(x)
## a\b
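# Double-checking the backslash match with str_detect() -- the four-backslash string
# is the regex for one literal backslash:
str_detect(x, "\\\\")  # TRUE: x contains a single literal backslash between a and b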
Price <- lapply(paste0('https://nha.chotot.com/toan-quoc/thue-van-phong-mat-bang-kinh-doanh?page=', 1:2),
                function(url){
                  url %>% read_html() %>% 
                    html_nodes(".adPriceNormal___puYxd") %>% 
                    html_text() %>%
                    gsub('[\r\n\t]', '', .) # strip carriage returns, newlines and tabs from the scraped text
                  
                })
sqm <- lapply(paste0('https://nha.chotot.com/toan-quoc/thue-van-phong-mat-bang-kinh-doanh?page=', 1:2),
                function(url){
                  url %>% read_html() %>% 
                    html_nodes(".adItemCondition___2daw2") %>% 
                    html_text() %>%
                    gsub('[\r\n\t]', '', .) # strip carriage returns, newlines and tabs from the scraped text
                  
                })
real_estate_bds_tibble <- tibble(
  x = sqm,
  y = Price
)

typeof(Price)
## [1] "list"
typeof(real_estate_bds_tibble)
## [1] "list"
view(real_estate_bds_tibble)


sqm_new <- unlist(sqm)  # flatten the list of character vectors into a single character vector
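# A minimal sketch building a flat tibble from the scraped vectors instead of the
# list columns above (this assumes sqm and Price return the same number of ads,
# which the site does not guarantee):
price_new <- unlist(Price)
real_estate_flat <- tibble(sqm = sqm_new, price = price_new)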
homicides <- readLines("https://raw.githubusercontent.com/hadv/PAR/master/homicides.txt") # read the raw file, one line per record

g <- grep("iconHomicideShooting", homicides) # xem bao nhiêu chữ trong homicides
length(g)
## [1] 228
g <- grep("iconHomicideShooting|icon_homicide_shooting", homicides)
length(g)
## [1] 1003
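# Counting matches directly with sum(grepl()) is another common idiom for the same check:
sum(grepl("iconHomicideShooting|icon_homicide_shooting", homicides))  # should equal length(g), i.e. 1003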
g <- grep("Cause: shooting", homicides) # tìm từ shooting
length(g)
## [1] 228
g <- grep("Cause: [Ss]hooting", homicides) # tìm từ shooting chữ s cả hoa lẫn thường 

homicides[945]
## [1] "39.31193300000, -76.60867390000, icon_homicide_shooting, 'p825', '<dl><dt><a href=\"http://essentials.baltimoresun.com/micro_sun/homicides/victim/825/carlos-williams\">Carlos Williams</a></dt><dd class=\"address\">1900 Boone St<br />Baltimore, MD 21218</dd><dd>Race: Black<br />Gender: male<br />Age: 50 years old</dd><dd>Found on March 21, 2010</dd><dd>Victim died at Johns Hopkins Hospital</dd><dd>Cause: Shooting</dd><dd class=\"popup-note\"><p>Williams was shot in an apparent robbery as he was leaving for work, according to police</p></dd></dl>'"
#[1] "39.30677400000, -76.59891100000, icon_homicide_shooting, 'p816', '<dl><dt><a href=\"http://essentials.baltimoresun.com/micro_sun/homicides/victim/816/kenly-wheeler\">Kenly Wheeler</a></dt><dd class=\"address\">1400 N Caroline St<br />Baltimore, MD 21213</dd><dd>Race: Black<br />Gender: male<br />Age: 29 years old</dd><dd>Found on March  3, 2010</dd><dd>Victim died at Scene</dd><dd>Cause: Shooting</dd><dd class=\"popup-note\"><p>Wheeler\\'s body was&nbsp;found on the grounds of Dr. Bernard Harris Sr.&nbsp;Elementary School</p></dd></dl>'"


regexpr("<dd>[F|f]ound(.*)</dd>", homicides[1:10]) # find giữa 2 tag <dd> và <dd> bao gồm chữ found 
##  [1] 177 178 188 189 178 182 178 187 182 183
## attr(,"match.length")
##  [1] 93 86 89 90 89 84 85 84 88 84
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
# the first vector gives the match start positions; attr(,"match.length") gives the match lengths in characters

homicides[1]
## [1] "39.311024, -76.674227, iconHomicideShooting, 'p2', '<dl><dt>Leon Nelson</dt><dd class=\"address\">3400 Clifton Ave.<br />Baltimore, MD 21216</dd><dd>black male, 17 years old</dd><dd>Found on January 1, 2007</dd><dd>Victim died at Shock Trauma</dd><dd>Cause: shooting</dd></dl>'"
#[1] "39.311024, -76.674227, iconHomicideShooting, 'p2', '<dl><dt>Leon Nelson</dt><dd 
#class=\"address\">3400 Clifton Ave.<br />Baltimore, 
#MD 21216</dd><dd>black male, 17 years old</dd><dd>Found on January 1, 2007</dd><dd>
#Victim died at Shock Trauma</dd><dd>Cause: shooting</dd></dl>'"
substr(homicides[1], 177, 177 + 93)  # extract the matched text using the start position and length reported above
## [1] "<dd>Found on January 1, 2007</dd><dd>Victim died at Shock Trauma</dd><dd>Cause: shooting</dd><"
#"<dd>Found on January 1, 2007</dd><dd>Victim died at Shock Trauma</dd><dd>Cause: shooting</dd><" dư 1 chữ 
substr(homicides[1], 177, 177 + 93 - 1)  # subtracting 1 gives exactly the matched text
## [1] "<dd>Found on January 1, 2007</dd><dd>Victim died at Shock Trauma</dd><dd>Cause: shooting</dd>"
#"<dd>Found on January 1, 2007</dd><dd>Victim died at Shock Trauma</dd><dd>Cause: shooting</dd>"

regexpr("<dd>[F|f]ound(.*?)</dd>", homicides[1:10]) #  (.*?) là lazy, dừng lại ngay </dd> đầu tiên
##  [1] 177 178 188 189 178 182 178 187 182 183
## attr(,"match.length")
##  [1] 33 33 33 33 33 33 33 33 33 33
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
r <- regexpr("<dd>[F|f]ound(.*?)</dd>", homicides[1:10])
regmatches(homicides[1:10], r) # regmatches() returns the matched text directly, no substr() needed
##  [1] "<dd>Found on January 1, 2007</dd>" "<dd>Found on January 2, 2007</dd>"
##  [3] "<dd>Found on January 2, 2007</dd>" "<dd>Found on January 3, 2007</dd>"
##  [5] "<dd>Found on January 5, 2007</dd>" "<dd>Found on January 5, 2007</dd>"
##  [7] "<dd>Found on January 5, 2007</dd>" "<dd>Found on January 7, 2007</dd>"
##  [9] "<dd>Found on January 8, 2007</dd>" "<dd>Found on January 8, 2007</dd>"
#[1] "<dd>Found on January 1, 2007</dd>" "<dd>Found on January 2, 2007</dd>"
#[3] "<dd>Found on January 2, 2007</dd>" "<dd>Found on January 3, 2007</dd>"
#[5] "<dd>Found on January 5, 2007</dd>" "<dd>Found on January 5, 2007</dd>"
#[7] "<dd>Found on January 5, 2007</dd>" "<dd>Found on January 7, 2007</dd>"
#[9] "<dd>Found on January 8, 2007</dd>" "<dd>Found on January 8, 2007</dd>"