# Stage 1: Collect Data ----
# Clear workspace:
rm(list = ls())
# Load some R packages:
library(rvest)
library(tidyverse)
library(stringi)
# Entry page listing houses for sale in Hanoi:
url <- "https://alonhadat.com.vn/can-ban-nha-ha-noi-t1.htm"
# Collect the hrefs found in the district list; the first href is dropped:
district_url <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="right"]/div[4]/div[1]/ul') %>%
  html_nodes("a") %>%
  html_attr("href") %>%
  tail(-1)
# Strip the last 4 characters of every href and ask for page 1000 of each
# district (we assume no district has more pages than that):
n_pages <- paste0(
  "https://alonhadat.com.vn",
  str_sub(district_url, start = 1, end = str_count(district_url) - 4),
  "/trang-1000.htm"
)
# Function extracts n pages:
# Reads the page at `link` and converts the text of the 11th <a> child of
# the second <div> under #left to a number (presumably the last pager
# entry, i.e. the number of pages -- TODO confirm against the site).
#   link: URL of a listing page (character scalar).
# Returns a single numeric value; NA_real_ when the node is not present, so
# that callers always get exactly one value per link.
get_n_pages <- function(link) {
  pager_text <- link %>%
    read_html() %>%
    html_nodes(xpath = '//*[@id="left"]/div[2]/a[11]') %>%
    html_text()
  # A page without that node used to yield numeric(0), which breaks
  # element-wise comparisons on the list of results.
  if (length(pager_text) == 0) {
    return(NA_real_)
  }
  as.numeric(pager_text[1])
}
# Use the function (one request per district URL):
number_pages <- lapply(n_pages, get_n_pages)
# Reduce every result to exactly one number; an empty result becomes NA so
# the counts stay aligned with n_pages element by element.
number_pages <- vapply(
  number_pages,
  function(count) {
    if (length(count) == 0) NA_real_ else as.numeric(count[[1]])
  },
  numeric(1)
)
# Keep only districts with a known, positive page count. The same mask is
# applied to both vectors so they always have equal length below.
has_pages <- !is.na(number_pages) & number_pages > 0
actual_pages <- n_pages[has_pages]
number_pages <- number_pages[has_pages]
# Number of pages:
# base_url drops the trailing 8 characters ("1000.htm") of each URL.
df_pages <- data.frame(
  base_url = str_sub(actual_pages, start = 1, end = str_count(actual_pages) - 8),
  n_pages = number_pages
)
# https://alonhadat.com.vn/can-ban-nha-quan-ba-dinh-ha-noi-q407/trang-2.htm
# Function extract all links for a specific district selected:
# Builds "<base_url><page>.htm" for every page 1..n_pages of every row of
# the global data frame `df_pages` (column 1 = base URL, column 2 = number
# of pages).
#   ...: ignored; kept so existing calls keep working.
# Returns a character vector of page URLs (NULL when there is nothing to
# build). Rows whose page count is NA or below 1 contribute no URL.
extract_PageLinks <- function(...) {
  page_sets <- Map(
    function(base_url, page_count) {
      # 1:page_count would count down (1, 0) for a count of 0.
      if (is.na(page_count) || page_count < 1) {
        return(character(0))
      }
      paste0(base_url, seq_len(as.integer(page_count)), ".htm")
    },
    as.character(df_pages[[1]]),
    df_pages[[2]]
  )
  unlist(page_sets, use.names = FALSE)
}
# Build the full list of listing-page URLs and report how long it took:
system.time({
  all_pages_for_RE <- extract_PageLinks()
})
# Get all links for homes:
# Reads the listing page `x`, collects the unique hrefs of all <a> elements
# inside the first <div> under #left and prefixes them with the site root.
#   x: URL of a listing page (character scalar).
# Returns a character vector of absolute URLs; character(0) when the page
# contains no usable link.
get_specific_RE_links <- function(x) {
  hrefs <- x %>%
    read_html() %>%
    html_nodes(xpath = '//*[@id="left"]/div[1]') %>%
    html_nodes("a") %>%
    html_attr("href") %>%
    unique()
  # <a> elements without an href yield NA, which paste0() would turn into
  # the string "...vnNA".
  hrefs <- hrefs[!is.na(hrefs)]
  # paste0() recycles a zero-length argument to "", so an empty page would
  # otherwise produce the bare site root as a bogus listing link.
  if (length(hrefs) == 0) {
    return(character(0))
  }
  paste0("https://alonhadat.com.vn", hrefs)
}
# Visit every listing page and flatten the per-page links into one vector:
all_links_forRE <- unlist(lapply(all_pages_for_RE, get_specific_RE_links))
# Function collect all information for a specific home for sale:
# Reads the page at `link` and returns a one-row data frame with
#   * one column per pair of <td> cells: the cells are filled into a
#     two-row matrix, so odd-numbered cells become column names
#     (transliterated to ASCII, spaces replaced by "_") and even-numbered
#     cells become the values;
#   * mieu_ta, gia_dien_tich, dia_chi: trimmed text of div[2], div[3] and
#     div[4] under '//*[@id="left"]/div[1]' (NA when the node is absent);
#   * source_link: the URL that was read.
#   link: URL of a single listing (character scalar).
# Errors raised while reading or assembling the row are not caught here.
get_allData_RE <- function(link) {
  # Wait one second after every request, including failed ones, so that
  # errors do not turn the crawl into an unthrottled burst.
  on.exit(Sys.sleep(1), add = TRUE)
  page <- read_html(link)

  # Trimmed text of the nodes matching `xpath`, or NA when nothing matches.
  node_text <- function(xpath) {
    found <- page %>%
      html_nodes(xpath = xpath) %>%
      html_text(trim = TRUE)
    if (length(found) == 0) NA_character_ else found
  }

  cells <- page %>%
    html_nodes("td") %>%
    html_text(trim = TRUE)
  # Column-wise fill: row 1 holds the labels, row 2 the matching values.
  cell_matrix <- matrix(cells, nrow = 2)
  main_df <- as.data.frame(cell_matrix[2, , drop = FALSE], stringsAsFactors = FALSE)
  # Work on the plain character row rather than a data-frame slice, so
  # stringi receives an atomic vector.
  names(main_df) <- cell_matrix[1, ] %>%
    stringi::stri_trans_general("Latin-ASCII") %>%
    str_replace_all(" ", "_")

  main_df %>%
    mutate(
      mieu_ta = node_text('//*[@id="left"]/div[1]/div[2]'),
      gia_dien_tich = node_text('//*[@id="left"]/div[1]/div[3]'),
      dia_chi = node_text('//*[@id="left"]/div[1]/div[4]'),
      source_link = link
    )
}
# Fault-tolerant wrapper around get_allData_RE().
#   link: URL of a single listing.
# Returns whatever get_allData_RE(link) returns, or NULL when it raises an
# error. The failing link and the error message are reported with message()
# so that skipped listings can be traced instead of vanishing silently.
get_allData_RE_tryCatch <- function(link) {
  tryCatch(
    get_allData_RE(link),
    error = function(e) {
      message("Skipping ", link, ": ", conditionMessage(e))
      NULL
    }
  )
}
# Scrape every listing (failed ones come back as NULL) and store the raw list:
allData_RE_list <- lapply(all_links_forRE, get_allData_RE_tryCatch)
save(allData_RE_list, file = "allData_RE_list_24_06.RData")
# load("C:\\Users\\ADMIN\\Documents\\allData_RE_list_24_06.RData")
# Stage 2: Data Processing ----
# Clear workspace:
rm(list = ls())
# Load some R packages:
library(tidyverse)
library(stringi)
# Load the list of scraped listings saved in Stage 1:
load("C:\\Users\\Zbook\\Desktop\\re_project\\allData_RE_list_24_06.RData")
# Stack the per-listing data frames into a single data frame:
df_allData <- bind_rows(allData_RE_list)
# Transliterate every column to plain Latin/ASCII characters:
df_allData <- mutate_all(
  df_allData,
  function(column) stri_trans_general(column, "Latin-ASCII")
)
# Function Extracts area data:
# Takes the third ":"-separated field of each string, keeps what precedes
# the first "m" in it, squishes whitespace and converts the rest to numeric.
#   x: character vector (presumably the combined price/area text -- confirm).
# Returns a numeric vector with one value per element of x.
extract_area <- function(x) {
  colon_parts <- data.frame(str_split(x, ":", simplify = TRUE))
  area_parts <- data.frame(str_split(colon_parts$X3, "m", simplify = TRUE))
  area_text <- str_squish(as.character(pull(area_parts, X1)))
  as.numeric(area_text)
}
# Extract price data:
# Takes the second ":"-separated field of each string, lower-cases it, keeps
# what precedes the first "ty" and reads that as a number whose decimal
# separator is a comma (e.g. "5,5" -> 5.5, "5,125" -> 5.125, "7" -> 7).
#   x: character vector (presumably text such as "Gia: 5,5 ty Dien tich:
#      40 m2" -- confirm against the scraped data).
# Returns a numeric vector with one value per element of x; NA where no
# plain number precedes "ty".
extract_price <- function(x) {
  colon_parts <- str_split(x, ":", simplify = TRUE)
  # No element contains ":" (or x is empty): nothing to parse.
  if (ncol(colon_parts) < 2) {
    return(rep(NA_real_, length(x)))
  }
  before_ty <- str_split(str_to_lower(colon_parts[, 2]), "ty", simplify = TRUE)
  amount <- str_squish(before_ty[, 1])
  # Turn the decimal comma into a point instead of rebuilding the fraction
  # as "digits / 100", which was wrong for 3+ decimals and failed when no
  # value had a comma at all.
  amount <- str_replace(amount, "\\s*,\\s*", ".")
  # Convert only well-formed numbers, so free text gives NA without warnings.
  is_number <- !is.na(amount) & str_detect(amount, "^[0-9]+(\\.[0-9]*)?$")
  price <- rep(NA_real_, length(amount))
  price[is_number] <- as.numeric(amount[is_number])
  price
}
# Function extracts add (district level):
# Lower-cases each address, takes the text following the first ", quan" and
# keeps the part of it that precedes the next comma.
#   x: character vector of addresses.
# Returns a character vector with one district string per element of x.
extract_add <- function(x) {
  quan_parts <- data.frame(str_split(str_to_lower(x), "\\, quan", simplify = TRUE))
  after_quan <- str_squish(as.character(pull(quan_parts, X2)))
  comma_parts <- data.frame(str_split(after_quan, "\\,", simplify = TRUE))
  as.character(pull(comma_parts, X1))
}
#=========================================================================================================
# Function extract agent's phone numbers
# https://stackoverflow.com/questions/17215789/extract-a-substring-in-r-according-to-a-pattern
# https://rstudio-pubs-static.s3.amazonaws.com/74603_76cd14d5983f47408fdf0b323550b846.html
#=========================================================================================================
# Function 1:
# Removes all spaces from x and returns the first run of 10 digits in each
# element (NA when there is none).
extract_phoneNumber1 <- function(x) {
  without_spaces <- str_replace_all(x, " ", "")
  str_extract(without_spaces, "[0-9]{10}")
}
# Show the first listings for which Function 1 finds no phone number:
df_allData %>%
  mutate(phone_contact = extract_phoneNumber1(mieu_ta)) %>%
  filter(is.na(phone_contact)) %>%
  select(source_link, mieu_ta, phone_contact) %>%
  head(n = 6)
# Function 2:
# Like Function 1, but also accepts a run of 4 digits when no run of 10
# digits starts at the same position.
extract_phoneNumber2 <- function(x) {
  without_spaces <- str_replace_all(x, " ", "")
  str_extract(without_spaces, "[0-9]{10}|[0-9]{4}")
}
# Show the first listings for which Function 2 finds no phone number:
df_allData %>%
  mutate(phone_contact = extract_phoneNumber2(mieu_ta)) %>%
  filter(is.na(phone_contact)) %>%
  select(source_link, mieu_ta, phone_contact) %>%
  head(n = 6)
# Function 3:
# Removes all spaces and all dots from x, then returns the first run of 10
# digits in each element (NA when there is none).
extract_phoneNumber3 <- function(x) {
  without_spaces <- str_replace_all(x, " ", "")
  without_dots <- str_replace_all(without_spaces, "\\.", "")
  str_extract(without_dots, "[0-9]{10}")
}
# Show the first listings for which Function 3 finds no phone number:
df_allData %>%
  mutate(phone_contact = extract_phoneNumber3(mieu_ta)) %>%
  filter(is.na(phone_contact)) %>%
  select(source_link, mieu_ta, phone_contact) %>%
  head(n = 6)
# Apply the extractors: area, price and district, plus two raw columns.
df1 <- transmute(
  df_allData,
  dien_tich = extract_area(gia_dien_tich),
  gia = extract_price(gia_dien_tich),
  district = extract_add(dia_chi),
  Loai_BDS,
  Phap_ly
)
# Listings of type "Nha mat tien" with a non-empty district label.
# Every label containing "hoang mai" is collapsed to "hoang mai" here, once,
# so that all four summaries below use the same district definition
# (previously only the count table did).
df_nha_mat_tien <- df1 %>%
  filter(Loai_BDS == "Nha mat tien") %>%
  mutate(district = case_when(str_detect(district, "hoang mai") ~ "hoang mai", TRUE ~ district)) %>%
  filter(str_count(district) != 0)

# Number of listings per district, ascending; the factor levels follow the
# sorted order so the plots keep it.
df_mat_tien <- df_nha_mat_tien %>%
  group_by(district) %>%
  summarise(n_sales = n()) %>%
  ungroup() %>%
  arrange(n_sales) %>%
  mutate(district = factor(district, levels = district))

# Median area per district:
df_dien_tich <- df_nha_mat_tien %>%
  group_by(district) %>%
  summarise(median_dien_tich = median(dien_tich, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(median_dien_tich) %>%
  mutate(district = factor(district, levels = district))

# Median price per district:
df_gia <- df_nha_mat_tien %>%
  group_by(district) %>%
  summarise(median_gia = median(gia, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(median_gia) %>%
  mutate(district = factor(district, levels = district))

# Median of 1000 * price / area per district, rounded to whole numbers
# (presumably millions per m2 if `gia` is in billions -- confirm units).
df_gia_median_m2 <- df_nha_mat_tien %>%
  mutate(gia_m2 = 1000 * gia / dien_tich) %>%
  group_by(district) %>%
  summarise(med_gia_m2 = median(gia_m2, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(med_gia_m2 = round(med_gia_m2, 0)) %>%
  arrange(med_gia_m2) %>%
  mutate(district = factor(district, levels = district))
# Horizontal bar charts, one per district summary.
# Number of listings:
ggplot(df_mat_tien, aes(district, n_sales)) +
  geom_col() +
  coord_flip()
# Median price:
ggplot(df_gia, aes(district, median_gia)) +
  geom_col() +
  coord_flip()
# Median price per area unit:
ggplot(df_gia_median_m2, aes(district, med_gia_m2)) +
  geom_col() +
  coord_flip()
# Median area:
ggplot(df_dien_tich, aes(district, median_dien_tich)) +
  geom_col() +
  coord_flip()
#====================================
# Check E agent (cò)
#====================================
# Attach the phone number found in each description:
df_allData <- df_allData %>%
  mutate(phone_contact = extract_phoneNumber3(mieu_ta))
# Phone numbers that appear on more than one listing, most frequent first:
re_agent_phones <- df_allData %>%
  count(phone_contact) %>%
  filter(!is.na(phone_contact)) %>%
  arrange(desc(n)) %>%
  filter(n > 1) %>%
  pull(phone_contact)
# Flag listings whose phone number is one of those repeated numbers:
df_allData <- df_allData %>%
  mutate(re_agent = if_else(phone_contact %in% re_agent_phones, "Yes", "No"))
# df_allData %>%
# select(phone_contact, source_link) %>%
# write.csv("re_phone_contact.csv", row.names = FALSE)
#===================
# Phone Providers
#===================
# Test your solution:
x <- c("098xdf", "012hg4")
str_detect(x, "^098")
str_detect(x, "^012")
# Every alternative needs its own anchor: in "^098|012" the "^" applies to
# "098" only, so "012" would match anywhere in the string.
str_detect(x, "^098|^012")
# viettel_df is only created further down (after the provider tables are
# downloaded). Guard the exploratory lines so a top-to-bottom run does not
# stop here with "object 'viettel_df' not found".
if (exists("viettel_df")) {
  p_old <- str_c("^", viettel_df$X1, sep = "") %>% str_flatten(collapse = "|")
  p_new <- str_c("^", viettel_df$X2, sep = "") %>% str_flatten(collapse = "|")
  viettel_pre <- str_c(p_old, p_new, sep = "|")
}
# Reference: https://www.gocit.vn/bai-viet/tong-hop-danh-sach-dau-so-mang-di-dong-o-viet-nam/
library(rvest)
link <- "https://www.gocit.vn/bai-viet/tong-hop-danh-sach-dau-so-mang-di-dong-o-viet-nam/"
# Download and parse the reference page once; both extractions below reuse
# the parsed document instead of requesting the page twice.
prefix_page <- read_html(link)
# Extract one specific table:
prefix_page %>%
  html_nodes(xpath = '//*[@id="post-11047"]/div[5]/div[1]/table[1]') %>%
  html_table(fill = TRUE)
# Extract all tables:
pre_phone <- html_table(prefix_page, fill = TRUE)
# The assignments below index tables 1..5; fail with a clear message if the
# page layout no longer provides them.
if (length(pre_phone) < 5) {
  stop("Expected at least 5 tables at ", link, ", found ", length(pre_phone), call. = FALSE)
}
# lapply(pre_phone, function(df) {df %>% slice(-1)})
# Drop the first row of each table (presumably the header row -- confirm):
viettel_df <- pre_phone[[1]] %>% slice(-1)
mobi_df <- pre_phone[[2]] %>% slice(-1)
vina_df <- pre_phone[[3]] %>% slice(-1)
vietnammobi_df <- pre_phone[[4]] %>% slice(-1)
Gmobi_df <- pre_phone[[5]] %>% slice(-1)
# Function create pre-number for phones:
# Builds one regular expression that matches a string starting with any of
# the prefixes listed in columns X1 and X2 of `df`.
#   df: data frame with columns X1 and X2 holding prefix strings.
# Returns a single string of the form "^p1|^p2|...", with X1 prefixes first,
# or NA_character_ when no usable prefix is found.
# Cells that are NA, blank, or not a single run of digits are skipped: a
# blank cell would otherwise add the alternative "^", which matches every
# phone number, and an NA cell would turn the whole pattern into NA.
extrac_preNumber <- function(df) {
  raw <- c(as.character(df$X1), as.character(df$X2))
  # Strip any non-digit padding around a single run of digits.
  prefixes <- sub("^[^0-9]*([0-9]+)[^0-9]*$", "\\1", raw)
  prefixes <- unique(prefixes[!is.na(prefixes) & grepl("^[0-9]+$", prefixes)])
  if (length(prefixes) == 0) {
    return(NA_character_)
  }
  paste0("^", prefixes, collapse = "|")
}
# One prefix pattern per provider table:
pre_viettel <- extrac_preNumber(viettel_df)
pre_mobi <- extrac_preNumber(mobi_df)
pre_vina <- extrac_preNumber(vina_df)
pre_vietnam <- extrac_preNumber(vietnammobi_df)
pre_gmobil <- extrac_preNumber(Gmobi_df)
# Label each phone number with the first provider whose pattern matches;
# numbers matching no pattern stay NA.
df_allData <- mutate(
  df_allData,
  phone_provider = case_when(
    str_detect(phone_contact, pre_viettel) ~ "Viettel",
    str_detect(phone_contact, pre_mobi) ~ "Mobifone",
    str_detect(phone_contact, pre_vina) ~ "Vinaphone",
    str_detect(phone_contact, pre_vietnam) ~ "Vietnammobile",
    str_detect(phone_contact, pre_gmobil) ~ "Gmobile"
  )
)
# Preview the first 10 phone numbers with their provider:
head(select(df_allData, phone_contact, phone_provider, source_link), n = 10)
# Count distinct phone numbers per provider (first occurrence of each number
# is kept), drop rows with missing values and sort by descending count:
df_allData %>%
  distinct(phone_contact, .keep_all = TRUE) %>%
  count(phone_provider) %>%
  na.omit() %>%
  arrange(desc(n))
