# Stage 1: Collect Data
# Clear workspace:
rm(list = ls())
# Load some R packages:
library(rvest)
library(tidyverse)
library(stringi)
# URL of the listing page for homes for sale in Hanoi:
url <- "https://alonhadat.com.vn/can-ban-nha-ha-noi-t1.htm"

# Extract links at district level (every link in the selected list except the first):
district_url <- html_attr(
  html_nodes(
    html_nodes(read_html(url), xpath = '//*[@id="right"]/div[4]/div[1]/ul'),
    "a"
  ),
  "href"
)[-1]

# We assume that there are 1000 pages: drop the last 4 characters of each
# district link and request "/trang-1000.htm" under it.
n_pages <- paste0(
  "https://alonhadat.com.vn",
  str_sub(district_url, start = 1, end = str_count(district_url) - 4),
  "/trang-1000.htm"
)
# Function extracts n pages:
#
# Reads a listing page and returns the number shown in the 11th link of its
# pagination block (see the XPath below).
#   link: URL of a listing page.
#   Returns one numeric value; NA when that link is missing or its text is
#   not a number.
get_n_pages <- function(link) {
  page_label <- link %>%
    read_html() %>%
    html_nodes(xpath = '//*[@id="left"]/div[2]/a[11]') %>%
    html_text()

  # A page without that link yields character(0). Return NA instead so every
  # call gives exactly one value and results stay aligned with the input URLs.
  if (length(page_label) == 0) {
    return(NA_real_)
  }

  as.numeric(page_label[1])
}
# Use the function: one value per probed URL, NA when nothing was found.
number_pages <- vapply(
  n_pages,
  function(page_url) {
    found <- get_n_pages(page_url)
    if (length(found) == 0) NA_real_ else as.numeric(found[[1]])
  },
  numeric(1),
  USE.NAMES = FALSE
)

# Keep only districts with a positive page count. The same mask is applied to
# the URLs and to the counts so the two columns of df_pages stay aligned.
has_pages <- !is.na(number_pages) & number_pages > 0
actual_pages <- n_pages[has_pages]
number_pages <- number_pages[has_pages]

# Number of pages (base_url is the probe URL without its trailing "1000.htm"):
df_pages <- data.frame(
  base_url = str_sub(actual_pages, start = 1, end = str_count(actual_pages) - 8),
  n_pages = number_pages
)
# https://alonhadat.com.vn/can-ban-nha-quan-ba-dinh-ha-noi-q407/trang-2.htm

# Function extract all links for a specific district selected:
#
# Expands every row of a page table into the URLs of its result pages:
# <base_url>1.htm, <base_url>2.htm, ..., <base_url><n_pages>.htm.
#   ...:   ignored (kept so existing calls keep working).
#   pages: data frame whose first column holds the base URLs and whose second
#          column holds the page counts; defaults to the global df_pages.
#   Returns a character vector of page URLs; character(0) when there are no
#   rows or no pages.
extract_PageLinks <- function(..., pages = df_pages) {
  base_urls <- as.character(pages[[1]])
  page_counts <- pages[[2]]

  links_per_district <- Map(
    function(base_url, n_page) {
      # seq_len() instead of 1:n so a count of 0 yields no link rather than
      # the pages "1" and "0".
      if (is.na(n_page) || n_page < 1) {
        return(character(0))
      }
      paste0(base_url, seq_len(as.integer(n_page)), ".htm")
    },
    base_urls,
    page_counts
  )

  as.character(unlist(links_per_district, use.names = FALSE))
}
# Use the function; system.time() also reports how long building the links takes:
system.time(all_pages_for_RE <- extract_PageLinks())
# Get all links for homes:
#
# Scrapes one listing page and returns the absolute URL of every distinct link
# found inside the first <div> of the element with id "left".
#   x: URL of a listing page.
#   Returns a character vector of URLs; character(0) when the page has no links.
get_specific_RE_links <- function(x) {
  hrefs <- x %>%
    read_html() %>%
    html_nodes(xpath = '//*[@id="left"]/div[1]') %>%
    html_nodes("a") %>%
    html_attr("href") %>%
    unique()

  # <a> tags without an href come back as NA; pasting them would fabricate
  # "...NA" URLs, so drop them first.
  hrefs <- hrefs[!is.na(hrefs)]

  # paste0() recycles a zero-length vector to "", which would return the bare
  # site root as if it were a listing; return an empty vector instead.
  if (length(hrefs) == 0) {
    return(character(0))
  }

  paste0("https://alonhadat.com.vn", hrefs)
}
# Visit every result page and flatten the per-page links into one vector:
all_links_forRE <- unlist(lapply(all_pages_for_RE, get_specific_RE_links))
# Function collect all information for a specific home for sale:
#
# Downloads one listing page and returns a one-row data frame built from its
# <td> cells plus three text blocks of the page.
#   link: URL of a single listing.
#   Returns a data frame whose columns are the table labels (transliterated
#   to ASCII, spaces replaced by "_") followed by mieu_ta, gia_dien_tich,
#   dia_chi and source_link.
get_allData_RE <- function(link) {
  # Pause after every request, including requests that fail part-way, so the
  # crawl keeps the same pace when pages error out.
  on.exit(Sys.sleep(1), add = TRUE)

  page <- read_html(link)

  # Text of the node(s) at `xpath`, or NA when the page has no such node.
  block_text <- function(xpath) {
    txt <- page %>%
      html_nodes(xpath = xpath) %>%
      html_text(trim = TRUE)
    if (length(txt) == 0) NA else txt
  }

  # The <td> cells are assumed to alternate label, value: filling a two-row
  # matrix column by column puts labels in row 1 and values in row 2.
  cells <- page %>%
    html_nodes("td") %>%
    html_text(trim = TRUE) %>%
    matrix(nrow = 2) %>%
    as.data.frame() %>%
    mutate_all(as.character)

  main_df <- cells %>% slice(2)

  # unlist() turns the one-row label frame into a plain character vector.
  labels <- unlist(slice(cells, 1), use.names = FALSE)
  names(main_df) <- stringi::stri_trans_general(labels, "Latin-ASCII") %>%
    str_replace_all(" ", "_")

  main_df %>%
    mutate(
      mieu_ta = block_text('//*[@id="left"]/div[1]/div[2]'),
      gia_dien_tich = block_text('//*[@id="left"]/div[1]/div[3]'),
      dia_chi = block_text('//*[@id="left"]/div[1]/div[4]'),
      source_link = link
    )
}
# Fault-tolerant wrapper around get_allData_RE(): a listing that cannot be
# scraped is reported with message() and yields NULL instead of stopping the
# whole crawl.
#   link: URL of a single listing.
#   Returns the result of get_allData_RE(link), or NULL when it raised an error.
get_allData_RE_tryCatch <- function(link) {
  tryCatch(
    get_allData_RE(link),
    error = function(e) {
      message("Skipping ", link, ": ", conditionMessage(e))
      NULL
    }
  )
}
# Scrape every listing (pages that fail come back as NULL) and store the raw results:
allData_RE_list <- lapply(all_links_forRE, get_allData_RE_tryCatch)
save(allData_RE_list, file = "allData_RE_list_24_06.RData")

# load("C:\\Users\\ADMIN\\Documents\\allData_RE_list_24_06.RData")
# Stage 2: Data Processing
# Clear workspace:
rm(list = ls())
# Load some R packages:
library(tidyverse)
library(stringi)
# Load data:
# Prefer the file Stage 1 saved in the working directory; fall back to the
# original machine-specific location when it is not there.
data_file <- "allData_RE_list_24_06.RData"
if (!file.exists(data_file)) {
  data_file <- "C:\\Users\\Zbook\\Desktop\\re_project\\allData_RE_list_24_06.RData"
}
load(data_file)

# Convert data to data frame (bind_rows() skips the NULL entries of failed pages):
df_allData <- bind_rows(allData_RE_list)

# Convert to Latin character (every column becomes ASCII text):
df_allData <- df_allData %>%
  mutate_all(function(x) stri_trans_general(x, "Latin-ASCII"))
# Function Extracts area data:
#
# Takes the text after the second ":" of each string and reads the number in
# front of the first "m" as the area.
#   x: character vector of price/area strings.
#   Returns a numeric vector the same length as x; NA where no area can be read.
extract_area <- function(x) {
  fields <- strsplit(as.character(x), ":", fixed = TRUE)

  # Third ":"-separated field. Strings with fewer fields give NA instead of
  # making the whole call fail.
  area_field <- vapply(
    fields,
    function(p) if (length(p) >= 3) p[3] else NA_character_,
    character(1)
  )

  # Keep what precedes the first "m"; strings without an "m" are kept whole.
  unit_pos <- regexpr("m", area_field, fixed = TRUE)
  has_unit <- !is.na(unit_pos) & unit_pos > 0
  area_txt <- area_field
  area_txt[has_unit] <- substr(area_field[has_unit], 1, unit_pos[has_unit] - 1)

  # Normalise whitespace and accept a decimal comma ("37,5") as well as a point.
  area_txt <- gsub("\\s+", " ", trimws(area_txt))
  area_txt <- sub(",", ".", area_txt, fixed = TRUE)

  # Non-numeric leftovers become NA; the coercion warning is expected here.
  suppressWarnings(as.numeric(area_txt))
}
# Extract price data:
#
# Takes the text after the first ":" of each string and reads the number in
# front of "ty" as the price. A comma is treated as the decimal separator.
#   x: character vector of price/area strings.
#   Returns a numeric vector the same length as x; NA where no number can be
#   read in front of "ty".
extract_price <- function(x) {
  squish <- function(s) gsub("\\s+", " ", trimws(s))
  # i-th piece of every split result, "" when a string has fewer pieces.
  piece <- function(parts, i) {
    vapply(parts, function(p) if (length(p) >= i) p[i] else "", character(1))
  }

  # Second ":"-separated field, lower-cased, cut at the first "ty".
  price_field <- tolower(piece(strsplit(as.character(x), ":", fixed = TRUE), 2))
  amount <- piece(strsplit(price_field, "ty", fixed = TRUE), 1)

  # Integer digits and fractional digits sit on either side of the comma.
  halves <- strsplit(amount, ",", fixed = TRUE)
  whole <- squish(piece(halves, 1))
  frac <- squish(piece(halves, 2))

  # The fraction is scaled by its own number of digits: "3,5" -> 3.5,
  # "3,05" -> 3.05, "3,125" -> 3.125 (a fixed division by 100 gives 4.25).
  frac_value <- ifelse(
    grepl("^[0-9]*$", frac),
    suppressWarnings(as.numeric(paste0("0.", frac, "0"))),
    NA_real_
  )

  # Non-numeric text in front of the comma becomes NA.
  suppressWarnings(as.numeric(whole)) + frac_value
}
# Function extracts add (district level):
#
# Returns the lower-cased text that follows the word "quan" in an address:
# everything between the first ", quan" and the next comma.
#   x: character vector of addresses.
#   Returns a character vector the same length as x; "" when the address has
#   no ", quan" part and NA for missing addresses.
extract_add <- function(x) {
  squish <- function(s) gsub("\\s+", " ", trimws(s))
  # i-th piece of every split result, "" when a string has fewer pieces.
  piece <- function(parts, i) {
    vapply(parts, function(p) if (length(p) >= i) p[i] else "", character(1))
  }

  addr <- tolower(as.character(x))

  # "\\b" keeps ", quan" from matching the start of a longer word such as
  # ", quang trung".
  after_quan <- squish(piece(strsplit(addr, ", quan\\b", perl = TRUE), 2))

  # The name ends at the next comma.
  district <- piece(strsplit(after_quan, ",", fixed = TRUE), 1)
  district[is.na(addr)] <- NA_character_
  district
}
#=========================================================================================================
# Function extract agent's phone numbers
# https://stackoverflow.com/questions/17215789/extract-a-substring-in-r-according-to-a-pattern
# https://rstudio-pubs-static.s3.amazonaws.com/74603_76cd14d5983f47408fdf0b323550b846.html
#=========================================================================================================
# Function 1: delete the spaces, then take the first run of 10 digits
# (NA when the text has none).
extract_phoneNumber1 <- function(x) {
  without_spaces <- str_replace_all(x, " ", "")
  str_extract(without_spaces, "[0-9]{10}")
}
# First six listings for which Function 1 finds no phone number:
df_allData %>%
  transmute(source_link, mieu_ta, phone_contact = extract_phoneNumber1(mieu_ta)) %>%
  filter(is.na(phone_contact)) %>%
  head()
# Function 2: delete the spaces, then take the first run of 10 digits or,
# failing that at a given position, of 4 digits (NA when the text has neither).
extract_phoneNumber2 <- function(x) {
  without_spaces <- str_replace_all(x, " ", "")
  str_extract(without_spaces, "[0-9]{10}|[0-9]{4}")
}
# First six listings for which Function 2 finds no phone number:
df_allData %>%
  transmute(source_link, mieu_ta, phone_contact = extract_phoneNumber2(mieu_ta)) %>%
  filter(is.na(phone_contact)) %>%
  head()
# Function 3: delete the spaces and the dots, then take the first run of
# 10 digits (NA when the text has none).
extract_phoneNumber3 <- function(x) {
  compact <- str_replace_all(str_replace_all(x, " ", ""), "\\.", "")
  str_extract(compact, "[0-9]{10}")
}
# First six listings for which Function 3 finds no phone number:
df_allData %>%
  transmute(source_link, mieu_ta, phone_contact = extract_phoneNumber3(mieu_ta)) %>%
  filter(is.na(phone_contact)) %>%
  head()
# Use functions: keep the parsed area, price and district next to the
# property type and legal status columns.
df1 <- df_allData %>%
  transmute(
    dien_tich = extract_area(gia_dien_tich),
    gia = extract_price(gia_dien_tich),
    district = extract_add(dia_chi),
    Loai_BDS,
    Phap_ly
  )
# Rows of type "Nha mat tien" with a non-empty district. District names that
# contain "hoang mai" are merged into one label here, once, so that the four
# summaries below are all computed over the same set of districts.
df_nha_mat_tien <- df1 %>%
  filter(Loai_BDS == "Nha mat tien", str_count(district) != 0) %>%
  mutate(district = case_when(str_detect(district, "hoang mai") ~ "hoang mai",
                              TRUE ~ district))

# Number of listings per district. Sorting first and then fixing the factor
# levels makes the plots show the districts in sorted order.
df_mat_tien <- df_nha_mat_tien %>%
  group_by(district) %>%
  summarise(n_sales = n()) %>%
  ungroup() %>%
  arrange(n_sales) %>%
  mutate(district = factor(district, levels = district))

# Median area per district:
df_dien_tich <- df_nha_mat_tien %>%
  group_by(district) %>%
  summarise(median_dien_tich = median(dien_tich, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(median_dien_tich) %>%
  mutate(district = factor(district, levels = district))

# Median price per district:
df_gia <- df_nha_mat_tien %>%
  group_by(district) %>%
  summarise(median_gia = median(gia, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(median_gia) %>%
  mutate(district = factor(district, levels = district))

# Median of 1000 * price / area per district, rounded to whole numbers:
df_gia_median_m2 <- df_nha_mat_tien %>%
  mutate(gia_m2 = 1000 * gia / dien_tich) %>%
  group_by(district) %>%
  summarise(med_gia_m2 = round(median(gia_m2, na.rm = TRUE), 0)) %>%
  ungroup() %>%
  arrange(med_gia_m2) %>%
  mutate(district = factor(district, levels = district))
# Horizontal bar charts, one per summary table:

# listings per district
ggplot(df_mat_tien, aes(x = district, y = n_sales)) +
  geom_col() +
  coord_flip()

# median price per district
ggplot(df_gia, aes(x = district, y = median_gia)) +
  geom_col() +
  coord_flip()

# median price per m2 per district
ggplot(df_gia_median_m2, aes(x = district, y = med_gia_m2)) +
  geom_col() +
  coord_flip()

# median area per district
ggplot(df_dien_tich, aes(x = district, y = median_dien_tich)) +
  geom_col() +
  coord_flip()
#====================================
# Check E agent (cò)
#====================================

# Phone number found in each description:
df_allData <- df_allData %>%
  mutate(phone_contact = extract_phoneNumber3(mieu_ta))

# Phone numbers that appear on more than one listing, most frequent first:
re_agent_phones <- df_allData %>%
  filter(!is.na(phone_contact)) %>%
  count(phone_contact) %>%
  filter(n > 1) %>%
  arrange(desc(n)) %>%
  pull(phone_contact)

# Flag the listings whose number is one of those repeated numbers:
df_allData <- df_allData %>%
  mutate(re_agent = if_else(phone_contact %in% re_agent_phones, "Yes", "No"))

# df_allData %>%
# select(phone_contact, source_link) %>%
# write.csv("re_phone_contact.csv", row.names = FALSE)
#===================
# Phone Providers
#===================
# Test your solution: "^" anchors a pattern to the start of the string.
x <- c("098xdf", "012hg4")
str_detect(x, "^098")
str_detect(x, "^012")
# NOTE: in "^098|012" the anchor belongs to the first alternative only;
# "012" may match anywhere in the string.
str_detect(x, "^098|012")

# Reference: https://www.gocit.vn/bai-viet/tong-hop-danh-sach-dau-so-mang-di-dong-o-viet-nam/
library(rvest)
link <- "https://www.gocit.vn/bai-viet/tong-hop-danh-sach-dau-so-mang-di-dong-o-viet-nam/"

# Download and parse the page once; both extractions below reuse it.
prefix_page <- read_html(link)

# Extract one specific table:
prefix_page %>%
  html_nodes(xpath = '//*[@id="post-11047"]/div[5]/div[1]/table[1]') %>%
  html_table(fill = TRUE)

# Extract all tables:
pre_phone <- prefix_page %>%
  html_table(fill = TRUE)

# lapply(pre_phone, function(df) {df %>% slice(-1)})
# One table per provider, without its first row
# (presumably a header row — TODO confirm):
viettel_df <- pre_phone[[1]] %>% slice(-1)
mobi_df <- pre_phone[[2]] %>% slice(-1)
vina_df <- pre_phone[[3]] %>% slice(-1)
vietnammobi_df <- pre_phone[[4]] %>% slice(-1)
Gmobi_df <- pre_phone[[5]] %>% slice(-1)

# Build the Viettel pattern by hand. This must come after viettel_df is
# created: the workspace is cleared at the top of this stage, so using
# viettel_df any earlier fails with "object 'viettel_df' not found".
p_old <- str_c("^", viettel_df$X1, sep = "") %>% str_flatten(collapse = "|")
p_new <- str_c("^", viettel_df$X2, sep = "") %>% str_flatten(collapse = "|")
viettel_pre <- str_c(p_old, p_new, sep = "|")
# Function create pre-number for phones:
#
# Builds one regular expression that matches any of the prefixes of a table
# at the start of a phone number.
#   df: data frame with the prefixes in its columns X1 and X2.
#   Returns a single string such as "^0162|^0163|^032|^033"; NA when df holds
#   no usable prefix.
extrac_preNumber <- function(df) {
  prefixes <- trimws(c(as.character(df$X1), as.character(df$X2)))

  # Skip missing and blank cells: an NA would turn the whole pattern into NA,
  # and an empty prefix would add a bare "^" that matches every number.
  prefixes <- prefixes[!is.na(prefixes) & nzchar(prefixes)]
  if (length(prefixes) == 0) {
    return(NA_character_)
  }

  paste0("^", prefixes, collapse = "|")
}
# One prefix pattern per provider:
pre_viettel <- extrac_preNumber(viettel_df)
pre_mobi <- extrac_preNumber(mobi_df)
pre_vina <- extrac_preNumber(vina_df)
pre_vietnam <- extrac_preNumber(vietnammobi_df)
pre_gmobil <- extrac_preNumber(Gmobi_df)

# Label every listing with the first provider whose pattern matches its
# phone number (NA when none matches):
df_allData <- df_allData %>%
  mutate(
    phone_provider = case_when(
      str_detect(phone_contact, pre_viettel) ~ "Viettel",
      str_detect(phone_contact, pre_mobi) ~ "Mobifone",
      str_detect(phone_contact, pre_vina) ~ "Vinaphone",
      str_detect(phone_contact, pre_vietnam) ~ "Vietnammobile",
      str_detect(phone_contact, pre_gmobil) ~ "Gmobile"
    )
  )

# Quick look at the result:
df_allData %>%
  select(phone_contact, phone_provider, source_link) %>%
  head(n = 10)

# Distinct phone numbers per provider, largest first:
df_allData %>%
  distinct(phone_contact, .keep_all = TRUE) %>%
  count(phone_provider) %>%
  na.omit() %>%
  arrange(desc(n))
---
title: "Mini Course: Collect Data for 78704 Homes for Sale in Hanoi"
author: "Nguyen Chi Dung"
subtitle: "Data Scraping Series"
output:
  html_document:
    code_download: yes
    # code_folding: hide
    highlight: zenburn
    theme: flatly
    toc: yes
    toc_float: yes
  word_document:
    toc: yes
---

```{r setup,include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, fig.retina=2)
```

# Stage 1: Collect Data

```{r, eval=FALSE}
# Clear workspace: 
rm(list = ls())

# Load some R packages: 
library(rvest)
library(tidyverse)
library(stringi)

# URL of the listing page for homes for sale in Hanoi:
url <- "https://alonhadat.com.vn/can-ban-nha-ha-noi-t1.htm"

# Extract links at district level (every link in the selected list except the first):
district_url <- html_attr(
  html_nodes(
    html_nodes(read_html(url), xpath = '//*[@id="right"]/div[4]/div[1]/ul'),
    "a"
  ),
  "href"
)[-1]

# We assume that there are 1000 pages: drop the last 4 characters of each
# district link and request "/trang-1000.htm" under it.
n_pages <- paste0(
  "https://alonhadat.com.vn",
  str_sub(district_url, start = 1, end = str_count(district_url) - 4),
  "/trang-1000.htm"
)

# Function extracts n pages:
#
# Reads a listing page and returns the number shown in the 11th link of its
# pagination block (see the XPath below).
#   link: URL of a listing page.
#   Returns one numeric value; NA when that link is missing or its text is
#   not a number.
get_n_pages <- function(link) {
  page_label <- link %>%
    read_html() %>%
    html_nodes(xpath = '//*[@id="left"]/div[2]/a[11]') %>%
    html_text()

  # A page without that link yields character(0). Return NA instead so every
  # call gives exactly one value and results stay aligned with the input URLs.
  if (length(page_label) == 0) {
    return(NA_real_)
  }

  as.numeric(page_label[1])
}


# Use the function: one value per probed URL, NA when nothing was found.
number_pages <- vapply(
  n_pages,
  function(page_url) {
    found <- get_n_pages(page_url)
    if (length(found) == 0) NA_real_ else as.numeric(found[[1]])
  },
  numeric(1),
  USE.NAMES = FALSE
)

# Keep only districts with a positive page count. The same mask is applied to
# the URLs and to the counts so the two columns of df_pages stay aligned.
has_pages <- !is.na(number_pages) & number_pages > 0
actual_pages <- n_pages[has_pages]
number_pages <- number_pages[has_pages]

# Number of pages (base_url is the probe URL without its trailing "1000.htm"):
df_pages <- data.frame(
  base_url = str_sub(actual_pages, start = 1, end = str_count(actual_pages) - 8),
  n_pages = number_pages
)



# https://alonhadat.com.vn/can-ban-nha-quan-ba-dinh-ha-noi-q407/trang-2.htm

# Function extract all links for a specific district selected:
#
# Expands every row of a page table into the URLs of its result pages:
# <base_url>1.htm, <base_url>2.htm, ..., <base_url><n_pages>.htm.
#   ...:   ignored (kept so existing calls keep working).
#   pages: data frame whose first column holds the base URLs and whose second
#          column holds the page counts; defaults to the global df_pages.
#   Returns a character vector of page URLs; character(0) when there are no
#   rows or no pages.
extract_PageLinks <- function(..., pages = df_pages) {
  base_urls <- as.character(pages[[1]])
  page_counts <- pages[[2]]

  links_per_district <- Map(
    function(base_url, n_page) {
      # seq_len() instead of 1:n so a count of 0 yields no link rather than
      # the pages "1" and "0".
      if (is.na(n_page) || n_page < 1) {
        return(character(0))
      }
      paste0(base_url, seq_len(as.integer(n_page)), ".htm")
    },
    base_urls,
    page_counts
  )

  as.character(unlist(links_per_district, use.names = FALSE))
}

# Use the function; system.time() also reports how long building the links takes:

system.time(all_pages_for_RE <- extract_PageLinks())



# Get all links for homes:
#
# Scrapes one listing page and returns the absolute URL of every distinct link
# found inside the first <div> of the element with id "left".
#   x: URL of a listing page.
#   Returns a character vector of URLs; character(0) when the page has no links.
get_specific_RE_links <- function(x) {
  hrefs <- x %>%
    read_html() %>%
    html_nodes(xpath = '//*[@id="left"]/div[1]') %>%
    html_nodes("a") %>%
    html_attr("href") %>%
    unique()

  # <a> tags without an href come back as NA; pasting them would fabricate
  # "...NA" URLs, so drop them first.
  hrefs <- hrefs[!is.na(hrefs)]

  # paste0() recycles a zero-length vector to "", which would return the bare
  # site root as if it were a listing; return an empty vector instead.
  if (length(hrefs) == 0) {
    return(character(0))
  }

  paste0("https://alonhadat.com.vn", hrefs)
}


# Visit every result page and flatten the per-page links into one vector:
all_links_forRE <- unlist(lapply(all_pages_for_RE, get_specific_RE_links))


# Function collect all information for a specific home for sale:
#
# Downloads one listing page and returns a one-row data frame built from its
# <td> cells plus three text blocks of the page.
#   link: URL of a single listing.
#   Returns a data frame whose columns are the table labels (transliterated
#   to ASCII, spaces replaced by "_") followed by mieu_ta, gia_dien_tich,
#   dia_chi and source_link.
get_allData_RE <- function(link) {
  # Pause after every request, including requests that fail part-way, so the
  # crawl keeps the same pace when pages error out.
  on.exit(Sys.sleep(1), add = TRUE)

  page <- read_html(link)

  # Text of the node(s) at `xpath`, or NA when the page has no such node.
  block_text <- function(xpath) {
    txt <- page %>%
      html_nodes(xpath = xpath) %>%
      html_text(trim = TRUE)
    if (length(txt) == 0) NA else txt
  }

  # The <td> cells are assumed to alternate label, value: filling a two-row
  # matrix column by column puts labels in row 1 and values in row 2.
  cells <- page %>%
    html_nodes("td") %>%
    html_text(trim = TRUE) %>%
    matrix(nrow = 2) %>%
    as.data.frame() %>%
    mutate_all(as.character)

  main_df <- cells %>% slice(2)

  # unlist() turns the one-row label frame into a plain character vector.
  labels <- unlist(slice(cells, 1), use.names = FALSE)
  names(main_df) <- stringi::stri_trans_general(labels, "Latin-ASCII") %>%
    str_replace_all(" ", "_")

  main_df %>%
    mutate(
      mieu_ta = block_text('//*[@id="left"]/div[1]/div[2]'),
      gia_dien_tich = block_text('//*[@id="left"]/div[1]/div[3]'),
      dia_chi = block_text('//*[@id="left"]/div[1]/div[4]'),
      source_link = link
    )
}



# Fault-tolerant wrapper around get_allData_RE(): a listing that cannot be
# scraped is reported with message() and yields NULL instead of stopping the
# whole crawl.
#   link: URL of a single listing.
#   Returns the result of get_allData_RE(link), or NULL when it raised an error.
get_allData_RE_tryCatch <- function(link) {
  tryCatch(
    get_allData_RE(link),
    error = function(e) {
      message("Skipping ", link, ": ", conditionMessage(e))
      NULL
    }
  )
}



# Scrape every listing (pages that fail come back as NULL) and store the raw results:
allData_RE_list <- lapply(all_links_forRE, get_allData_RE_tryCatch)

save(allData_RE_list, file = "allData_RE_list_24_06.RData")

# load("C:\\Users\\ADMIN\\Documents\\allData_RE_list_24_06.RData")


```

# Stage 2: Data Processing 

```{r, eval=FALSE}
# Clear workspace: 
rm(list = ls())

# Load some R packages: 
library(tidyverse)
library(stringi)


# Load data:
# Prefer the file Stage 1 saved in the working directory; fall back to the
# original machine-specific location when it is not there.
data_file <- "allData_RE_list_24_06.RData"
if (!file.exists(data_file)) {
  data_file <- "C:\\Users\\Zbook\\Desktop\\re_project\\allData_RE_list_24_06.RData"
}
load(data_file)

# Convert data to data frame (bind_rows() skips the NULL entries of failed pages):
df_allData <- bind_rows(allData_RE_list)

# Convert to Latin character (every column becomes ASCII text):
df_allData <- df_allData %>%
  mutate_all(function(x) stri_trans_general(x, "Latin-ASCII"))


# Function Extracts area data:
#
# Takes the text after the second ":" of each string and reads the number in
# front of the first "m" as the area.
#   x: character vector of price/area strings.
#   Returns a numeric vector the same length as x; NA where no area can be read.
extract_area <- function(x) {
  fields <- strsplit(as.character(x), ":", fixed = TRUE)

  # Third ":"-separated field. Strings with fewer fields give NA instead of
  # making the whole call fail.
  area_field <- vapply(
    fields,
    function(p) if (length(p) >= 3) p[3] else NA_character_,
    character(1)
  )

  # Keep what precedes the first "m"; strings without an "m" are kept whole.
  unit_pos <- regexpr("m", area_field, fixed = TRUE)
  has_unit <- !is.na(unit_pos) & unit_pos > 0
  area_txt <- area_field
  area_txt[has_unit] <- substr(area_field[has_unit], 1, unit_pos[has_unit] - 1)

  # Normalise whitespace and accept a decimal comma ("37,5") as well as a point.
  area_txt <- gsub("\\s+", " ", trimws(area_txt))
  area_txt <- sub(",", ".", area_txt, fixed = TRUE)

  # Non-numeric leftovers become NA; the coercion warning is expected here.
  suppressWarnings(as.numeric(area_txt))
}


# Extract price data:
#
# Takes the text after the first ":" of each string and reads the number in
# front of "ty" as the price. A comma is treated as the decimal separator.
#   x: character vector of price/area strings.
#   Returns a numeric vector the same length as x; NA where no number can be
#   read in front of "ty".
extract_price <- function(x) {
  squish <- function(s) gsub("\\s+", " ", trimws(s))
  # i-th piece of every split result, "" when a string has fewer pieces.
  piece <- function(parts, i) {
    vapply(parts, function(p) if (length(p) >= i) p[i] else "", character(1))
  }

  # Second ":"-separated field, lower-cased, cut at the first "ty".
  price_field <- tolower(piece(strsplit(as.character(x), ":", fixed = TRUE), 2))
  amount <- piece(strsplit(price_field, "ty", fixed = TRUE), 1)

  # Integer digits and fractional digits sit on either side of the comma.
  halves <- strsplit(amount, ",", fixed = TRUE)
  whole <- squish(piece(halves, 1))
  frac <- squish(piece(halves, 2))

  # The fraction is scaled by its own number of digits: "3,5" -> 3.5,
  # "3,05" -> 3.05, "3,125" -> 3.125 (a fixed division by 100 gives 4.25).
  frac_value <- ifelse(
    grepl("^[0-9]*$", frac),
    suppressWarnings(as.numeric(paste0("0.", frac, "0"))),
    NA_real_
  )

  # Non-numeric text in front of the comma becomes NA.
  suppressWarnings(as.numeric(whole)) + frac_value
}

# Function extracts add (district level):
#
# Returns the lower-cased text that follows the word "quan" in an address:
# everything between the first ", quan" and the next comma.
#   x: character vector of addresses.
#   Returns a character vector the same length as x; "" when the address has
#   no ", quan" part and NA for missing addresses.
extract_add <- function(x) {
  squish <- function(s) gsub("\\s+", " ", trimws(s))
  # i-th piece of every split result, "" when a string has fewer pieces.
  piece <- function(parts, i) {
    vapply(parts, function(p) if (length(p) >= i) p[i] else "", character(1))
  }

  addr <- tolower(as.character(x))

  # "\\b" keeps ", quan" from matching the start of a longer word such as
  # ", quang trung".
  after_quan <- squish(piece(strsplit(addr, ", quan\\b", perl = TRUE), 2))

  # The name ends at the next comma.
  district <- piece(strsplit(after_quan, ",", fixed = TRUE), 1)
  district[is.na(addr)] <- NA_character_
  district
}



#=========================================================================================================
#    Function extract agent's phone numbers 
#    https://stackoverflow.com/questions/17215789/extract-a-substring-in-r-according-to-a-pattern
#    https://rstudio-pubs-static.s3.amazonaws.com/74603_76cd14d5983f47408fdf0b323550b846.html 
#=========================================================================================================


# Function 1: delete the spaces, then take the first run of 10 digits
# (NA when the text has none).
extract_phoneNumber1 <- function(x) {
  without_spaces <- str_replace_all(x, " ", "")
  str_extract(without_spaces, "[0-9]{10}")
}



# First six listings for which Function 1 finds no phone number:
df_allData %>%
  transmute(source_link, mieu_ta, phone_contact = extract_phoneNumber1(mieu_ta)) %>%
  filter(is.na(phone_contact)) %>%
  head()


# Function 2: delete the spaces, then take the first run of 10 digits or,
# failing that at a given position, of 4 digits (NA when the text has neither).
extract_phoneNumber2 <- function(x) {
  without_spaces <- str_replace_all(x, " ", "")
  str_extract(without_spaces, "[0-9]{10}|[0-9]{4}")
}


# First six listings for which Function 2 finds no phone number:
df_allData %>%
  transmute(source_link, mieu_ta, phone_contact = extract_phoneNumber2(mieu_ta)) %>%
  filter(is.na(phone_contact)) %>%
  head()

# Function 3: delete the spaces and the dots, then take the first run of
# 10 digits (NA when the text has none).
extract_phoneNumber3 <- function(x) {
  compact <- str_replace_all(str_replace_all(x, " ", ""), "\\.", "")
  str_extract(compact, "[0-9]{10}")
}


# First six listings for which Function 3 finds no phone number:
df_allData %>%
  transmute(source_link, mieu_ta, phone_contact = extract_phoneNumber3(mieu_ta)) %>%
  filter(is.na(phone_contact)) %>%
  head()


# Use functions: keep the parsed area, price and district next to the
# property type and legal status columns.
df1 <- df_allData %>%
  transmute(
    dien_tich = extract_area(gia_dien_tich),
    gia = extract_price(gia_dien_tich),
    district = extract_add(dia_chi),
    Loai_BDS,
    Phap_ly
  )


# Rows of type "Nha mat tien" with a non-empty district. District names that
# contain "hoang mai" are merged into one label here, once, so that the four
# summaries below are all computed over the same set of districts.
df_nha_mat_tien <- df1 %>%
  filter(Loai_BDS == "Nha mat tien", str_count(district) != 0) %>%
  mutate(district = case_when(str_detect(district, "hoang mai") ~ "hoang mai",
                              TRUE ~ district))

# Number of listings per district. Sorting first and then fixing the factor
# levels makes the plots show the districts in sorted order.
df_mat_tien <- df_nha_mat_tien %>%
  group_by(district) %>%
  summarise(n_sales = n()) %>%
  ungroup() %>%
  arrange(n_sales) %>%
  mutate(district = factor(district, levels = district))

# Median area per district:
df_dien_tich <- df_nha_mat_tien %>%
  group_by(district) %>%
  summarise(median_dien_tich = median(dien_tich, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(median_dien_tich) %>%
  mutate(district = factor(district, levels = district))

# Median price per district:
df_gia <- df_nha_mat_tien %>%
  group_by(district) %>%
  summarise(median_gia = median(gia, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(median_gia) %>%
  mutate(district = factor(district, levels = district))

# Median of 1000 * price / area per district, rounded to whole numbers:
df_gia_median_m2 <- df_nha_mat_tien %>%
  mutate(gia_m2 = 1000 * gia / dien_tich) %>%
  group_by(district) %>%
  summarise(med_gia_m2 = round(median(gia_m2, na.rm = TRUE), 0)) %>%
  ungroup() %>%
  arrange(med_gia_m2) %>%
  mutate(district = factor(district, levels = district))


# Horizontal bar charts, one per summary table:

# listings per district
ggplot(df_mat_tien, aes(x = district, y = n_sales)) +
  geom_col() +
  coord_flip()

# median price per district
ggplot(df_gia, aes(x = district, y = median_gia)) +
  geom_col() +
  coord_flip()

# median price per m2 per district
ggplot(df_gia_median_m2, aes(x = district, y = med_gia_m2)) +
  geom_col() +
  coord_flip()

# median area per district
ggplot(df_dien_tich, aes(x = district, y = median_dien_tich)) +
  geom_col() +
  coord_flip()


#====================================
#         Check E agent (cò)
#====================================

# Phone number found in each description:
df_allData <- df_allData %>%
  mutate(phone_contact = extract_phoneNumber3(mieu_ta))

# Phone numbers that appear on more than one listing, most frequent first:
re_agent_phones <- df_allData %>%
  filter(!is.na(phone_contact)) %>%
  count(phone_contact) %>%
  filter(n > 1) %>%
  arrange(desc(n)) %>%
  pull(phone_contact)

# Flag the listings whose number is one of those repeated numbers:
df_allData <- df_allData %>%
  mutate(re_agent = if_else(phone_contact %in% re_agent_phones, "Yes", "No"))

# df_allData %>%
#   select(phone_contact, source_link) %>%
#   write.csv("re_phone_contact.csv", row.names = FALSE)

#===================
#  Phone Providers
#===================


# Test your solution: "^" anchors a pattern to the start of the string.
x <- c("098xdf", "012hg4")
str_detect(x, "^098")
str_detect(x, "^012")
# NOTE: in "^098|012" the anchor belongs to the first alternative only;
# "012" may match anywhere in the string.
str_detect(x, "^098|012")

# Reference: https://www.gocit.vn/bai-viet/tong-hop-danh-sach-dau-so-mang-di-dong-o-viet-nam/
library(rvest)
link <- "https://www.gocit.vn/bai-viet/tong-hop-danh-sach-dau-so-mang-di-dong-o-viet-nam/"

# Download and parse the page once; both extractions below reuse it.
prefix_page <- read_html(link)

# Extract one specific table:
prefix_page %>%
  html_nodes(xpath = '//*[@id="post-11047"]/div[5]/div[1]/table[1]') %>%
  html_table(fill = TRUE)

# Extract all tables:
pre_phone <- prefix_page %>%
  html_table(fill = TRUE)

# lapply(pre_phone, function(df) {df %>% slice(-1)})
# One table per provider, without its first row
# (presumably a header row — TODO confirm):
viettel_df <- pre_phone[[1]] %>% slice(-1)
mobi_df <- pre_phone[[2]] %>% slice(-1)
vina_df <- pre_phone[[3]] %>% slice(-1)
vietnammobi_df <- pre_phone[[4]] %>% slice(-1)
Gmobi_df <- pre_phone[[5]] %>% slice(-1)

# Build the Viettel pattern by hand. This must come after viettel_df is
# created: the workspace is cleared at the top of this stage, so using
# viettel_df any earlier fails with "object 'viettel_df' not found".
p_old <- str_c("^", viettel_df$X1, sep = "") %>% str_flatten(collapse = "|")
p_new <- str_c("^", viettel_df$X2, sep = "") %>% str_flatten(collapse = "|")
viettel_pre <- str_c(p_old, p_new, sep = "|")

# Function create pre-number for phones:
#
# Builds one regular expression that matches any of the prefixes of a table
# at the start of a phone number.
#   df: data frame with the prefixes in its columns X1 and X2.
#   Returns a single string such as "^0162|^0163|^032|^033"; NA when df holds
#   no usable prefix.
extrac_preNumber <- function(df) {
  prefixes <- trimws(c(as.character(df$X1), as.character(df$X2)))

  # Skip missing and blank cells: an NA would turn the whole pattern into NA,
  # and an empty prefix would add a bare "^" that matches every number.
  prefixes <- prefixes[!is.na(prefixes) & nzchar(prefixes)]
  if (length(prefixes) == 0) {
    return(NA_character_)
  }

  paste0("^", prefixes, collapse = "|")
}


# One prefix pattern per provider:
pre_viettel <- extrac_preNumber(viettel_df)
pre_mobi <- extrac_preNumber(mobi_df)
pre_vina <- extrac_preNumber(vina_df)
pre_vietnam <- extrac_preNumber(vietnammobi_df)
pre_gmobil <- extrac_preNumber(Gmobi_df)

# Label every listing with the first provider whose pattern matches its
# phone number (NA when none matches):
df_allData <- df_allData %>%
  mutate(
    phone_provider = case_when(
      str_detect(phone_contact, pre_viettel) ~ "Viettel",
      str_detect(phone_contact, pre_mobi) ~ "Mobifone",
      str_detect(phone_contact, pre_vina) ~ "Vinaphone",
      str_detect(phone_contact, pre_vietnam) ~ "Vietnammobile",
      str_detect(phone_contact, pre_gmobil) ~ "Gmobile"
    )
  )

# Quick look at the result:
df_allData %>%
  select(phone_contact, phone_provider, source_link) %>%
  head(n = 10)

# Distinct phone numbers per provider, largest first:
df_allData %>%
  distinct(phone_contact, .keep_all = TRUE) %>%
  count(phone_provider) %>%
  na.omit() %>%
  arrange(desc(n))


```

