library(dplyr)
library(data.table)
library(xml2)
library(rvest)
library(stringr)
library(qdapRegex)
library(tidyr)
link ที่จะเก็บข้อมูลคือ “https://www.tripadvisor.com/ShowForum-g293915-i3686-Thailand.html” โดยแต่ละหน้าจะแสดงอยู่ประมาณ 20 กระทู้ โดย pattern การแยกหน้าคือ -o เช่น หน้าแรก “https://www.tripadvisor.com/ShowForum-g293915-i3686-Thailand.html” หน้าที่สอง คือ “https://www.tripadvisor.com/ShowForum-g293915-i3686-o20-Thailand.html”
โดยเจ้าของได้เริ่มเก็บข้อมูล ณ วันที่ 12 ส.ค. 62
# Build the page offsets used to scrape every forum listing page.
# The forum has around 8610 listing pages with 20 topics each; listing page k
# (1-based) starts at offset (k - 1) * 20, so the offsets run from 0 to 172180.
n_listing_pages <- 8610L
topics_per_page <- 20L
total <- n_listing_pages * topics_per_page
# One offset per page. seq(0, total, 20) would yield 8611 values, and dropping
# element 8610 from it removes offset 172180 while keeping the out-of-range
# 172200. Integer offsets also keep paste0() from rendering 100000 as "1e+05"
# when the URL is assembled.
sequence <- data.table(
  sequence = (seq_len(n_listing_pages) - 1L) * topics_per_page
)
sequence # last offset: 172180
## sequence
## 1: 0
## 2: 20
## 3: 40
## 4: 60
## 5: 80
## ---
## 8606: 172100
## 8607: 172120
## 8608: 172140
## 8609: 172160
## 8610: 172180
สิ่งที่เราเลือกเก็บคือ 1.link กระทู้หลัก 2.มาจากกระทู้หลักอะไร 3.จำนวนการตอบกลับ 4.ใครตอบกระทู้ล่าสุด
# Scrape one forum listing page.
#
# url: full URL of the listing page.
# Returns a data.table with one row per listed topic and the columns
# link (attributes of the topic anchor), from (text of the forum column),
# replies (text of the reply-count cell) and last_reply (text of the
# last-reply anchor).
scrape_listing_page <- function(url) {
  # Download and parse the page once; every column is read from this document.
  page <- read_html(url)
  link <- page %>% html_nodes("b a") %>% html_attrs() %>% data.table("link" = .)
  from <- page %>% html_nodes("tr+ tr .forumcol") %>% html_text() %>%
    data.table("from" = .)
  replies <- page %>% html_nodes(".reply") %>% html_text() %>%
    data.table("replies" = .)
  last_reply <- page %>% html_nodes(".rowentry > a") %>% html_text() %>%
    data.table("last_reply" = .)
  cbind(link, from, replies, last_reply)
}

# Collect the pages in a preallocated list and bind them once at the end
# instead of growing the table with rbind() on every iteration.
listing_pages <- vector("list", nrow(sequence))
for (i in seq_len(nrow(sequence))) {
  if (i == 1) {
    # The first listing page has no "-o<offset>" segment in its URL.
    url <- "https://www.tripadvisor.com/ShowForum-g293915-i3686-Thailand.html"
  } else {
    # format(..., scientific = FALSE) keeps offsets such as 100000 from being
    # pasted as "1e+05", which would produce an invalid URL.
    offset <- format(sequence$sequence[i], scientific = FALSE, trim = TRUE)
    url <- paste0(
      "https://www.tripadvisor.com/ShowForum-g293915-i3686-o", offset,
      "-Thailand.html"
    )
  }
  # A page that fails to download or parse is reported and skipped so that one
  # bad request does not end the whole run.
  page_rows <- tryCatch(
    scrape_listing_page(url),
    error = function(e) {
      warning(
        "Skipping ", url, ": ", conditionMessage(e),
        call. = FALSE, immediate. = TRUE
      )
      NULL
    }
  )
  if (!is.null(page_rows)) {
    listing_pages[[i]] <- page_rows
  }
}
all_main_link <- rbindlist(listing_pages)
# clean link
# The scraped table is named all_main_link; start from it so that all_link
# exists before it is modified.
all_link <- separate(all_main_link, link, into = c("link", "oneclick"), sep = ",")
# Keep everything from the first double quote onwards, then strip the quotes.
all_link$link <- str_extract(all_link$link, "\".*")
all_link$link <- gsub("\"", "", all_link$link)
all_link$oneclick <- NULL
#saveRDS(all_main_link,file = 'all_mainlink.rds')
# Reload the listing table from disk; this replaces the in-memory all_link.
all_link <- readRDS("main_link.rds")
all_link[, ] <- lapply(all_link[, ], str_squish) # clean \n\t pattern
# Preview rows 1 to 40 in random order.
all_link[sample(40), ] %>% as_tibble()
## # A tibble: 40 x 6
## topic replies forum last_post i link
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Private hire care … 4 Phuket 10:44 am 1 /ShowTopic-g293920-i50…
## 2 Phi Phi island to … 2 Ko Jum 8:55 am by … 1 /ShowTopic-g1152743-i1…
## 3 Currently at JW Ma… 4 Khao … 7:21 am by … 2 /ShowTopic-g297914-i10…
## 4 Ko Samui in Feb wi… 12 Ko Sa… 10:10 am by… 1 /ShowTopic-g293918-i75…
## 5 Lonely Beach sand … 1 Ko Ch… 6:44 am by … 2 /ShowTopic-g580110-i98…
## 6 Bophut Market Frid… 1 Ko Sa… 7:35 am by … 1 /ShowTopic-g293918-i75…
## 7 Thailand in October 5 Thail… 9:05 am by … 1 /ShowTopic-g293915-i36…
## 8 Downtown Area 21 Bangk… 5:09 am by … 2 /ShowTopic-g293916-i36…
## 9 Overnight train 10 Bangk… 4:30 am by … 2 /ShowTopic-g293916-i36…
## 10 Hotels on Chaweng … 4 Chawe… 7:08 am by … 2 /ShowTopic-g676072-i12…
## # … with 30 more rows
ทำให้แน่ใจว่าไม่มีกระทู้ที่ซ้ำกัน
# Keep one row per link; .keep_all = TRUE retains the other columns.
all_link <- distinct(all_link, link, .keep_all = TRUE)
กระทู้ไหนที่มีการตอบเยอะก็จะมีหลายหน้าซึ่งเหมือนกับตัว link หลักคือแบ่งหน้าของในกระทู้นั้นๆด้วย -o[num]- และจะเปลี่ยนเป็น 10 ความคิดเห็นต่อหนึ่งหน้า
https://www.tripadvisor.com/ShowTopic-g293915-i3686-k6601803-Peanut_allergy-Thailand.html #หน้าแรก https://www.tripadvisor.com/ShowTopic-g293915-i3686-k6601803-o150-Peanut_allergy-Thailand.html #หน้าประมาณ 16
ซึ่งการเก็บข้อมูลชุด replies มาเพื่อจะได้ทราบว่า กระทู้นั้นมีหน้าย่อยกี่หน้า
# Drop the first six rows (pinned topics).
all_link <- all_link[-seq_len(6), ]
# Preprocessing for the pages-per-topic calculation: strip the "," thousands
# separators, squish whitespace, then convert the text to numeric.
all_link$replies <- as.numeric(str_squish(gsub(",", "", all_link$replies)))
# Discard rows whose reply count is NA after the conversion.
all_link <- all_link[!is.na(all_link$replies), ]
as_tibble(all_link)
## # A tibble: 170,200 x 6
## topic replies forum last_post i link
## <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 Giant Ibis Bus - … 6 Bangkok 10:05 am by… 1 /ShowTopic-g293916-i…
## 2 Apply for Myanmar… 2 Bangkok 9:48 am by … 1 /ShowTopic-g293916-i…
## 3 Bangkok skywalk 4 Bangkok 9:15 am by … 1 /ShowTopic-g293916-i…
## 4 Langkawi to Koh L… 0 Ko Lipe 9:11 am by … 1 /ShowTopic-g1024140-…
## 5 Thailand in Octob… 5 Thailand 9:05 am by … 1 /ShowTopic-g293915-i…
## 6 Evason Phuket 8 Rawai 9:01 am by … 1 /ShowTopic-g297934-i…
## 7 Phi Phi island to… 2 Ko Jum 8:55 am by … 1 /ShowTopic-g1152743-…
## 8 Kupu kupu or sant… 5 Ko Pha N… 8:26 am by … 1 /ShowTopic-g303907-i…
## 9 Please post your … 1 Khao Yai… 8:20 am by … 1 /ShowTopic-g3218970-…
## 10 Itinerary feedback 11 Bangkok 8:07 am by … 1 /ShowTopic-g293916-i…
## # … with 170,190 more rows
# find max page
max_replies <- max(all_link$replies) # 4560
# Dummy offsets (0, 10, 20, ...) used by the nested for loop.
page_dummy <- seq(0, max_replies, 10)
# Pages per topic: reply count divided by 10, rounded up.
max_page_perblog <- data.table(
  replies = all_link$replies,
  max_page = ceiling(all_link$replies / 10)
)
max_page_perblog # the more replies a topic has, the more sub-pages it has
## replies max_page
## 1: 6 1
## 2: 2 1
## 3: 4 1
## 4: 0 0
## 5: 5 1
## ---
## 170196: 1 1
## 170197: 2 1
## 170198: 7 1
## 170199: 49 5
## 170200: 40 4
แบ่ง link ออกเป็น 2 ส่วนเพื่อสำหรับการเข้าถึงกระทู้ย่อย
# Take the first ten topics as a small demo set and show their links.
demo <- all_link[seq_len(10), ]
demo[["link"]]
## [1] "/ShowTopic-g293916-i3687-k12829145-Giant_Ibis_Bus_Bangkok_to_Siem_Reap-Bangkok.html"
## [2] "/ShowTopic-g293916-i3687-k12857408-Apply_for_Myanmar_visum_in_Bangkok-Bangkok.html"
## [3] "/ShowTopic-g293916-i3687-k12857233-Bangkok_skywalk-Bangkok.html"
## [4] "/ShowTopic-g1024140-i13523-k12857365-Langkawi_to_Koh_Lipe_in_september-Ko_Lipe_Satun_Province.html"
## [5] "/ShowTopic-g293915-i3686-k12856109-Thailand_in_October-Thailand.html"
## [6] "/ShowTopic-g297934-i13565-k12847819-Evason_Phuket-Rawai_Phuket.html"
## [7] "/ShowTopic-g1152743-i17500-k12857018-Phi_Phi_island_to_Koh_Jum_travel_options_in_september_2019-Ko_Jum_Krabi_Province.html"
## [8] "/ShowTopic-g303907-i9842-k12855669-Kupu_kupu_or_santhiya_hotel-Ko_Pha_Ngan_Surat_Thani_Province.html"
## [9] "/ShowTopic-g3218970-i26760-k12857280-Please_post_your_animal_sightings-Khao_Yai_National_Park_Nakhon_Ratchasima_Province.html"
## [10] "/ShowTopic-g293916-i3687-k12853910-Itinerary_feedback-Bangkok.html"
แบ่งเป็น 2 ส่วนคือ ด้านหน้า กับ ด้านหลัง
# Split each link right after the topic id (the first "-k<digits>" segment):
# `front` is everything up to and including the id, `back` is everything after
# the following "-", so a page offset can be inserted between the two.
# The match is lazy and anchored so that a "k<digits>" run inside the topic
# title (e.g. "Week1" or "Bangkok2019") cannot be mistaken for the id; a greedy
# ".*(k[0-9]+)" would extend `front` into the title and yield a wrong URL.
demo_parts <- str_match(demo$link, "^(.*?-k[0-9]+)-(.*)$")
demo$front <- demo_parts[, 2]
demo$back <- demo_parts[, 3]
ทดลอง Scrape
ข้อมูลที่เลือกเก็บคือ 1.Topic : ชื่อ กระทู้ 2.Comment : ความคิดเห็นของนักท่องเที่ยวในกระทู้นั้นๆ 3.User : ชื่อ user ของนักท่องเที่ยว 4.User_link : link profile ของ user นั้น 5.User_loc : location ของ user 6.From_mainlink : เอาไว้บอกว่ามาจาก main_link ไหน 7.Post_date : วันที่โพสต์ความคิดเห็น
# Scrape one topic page and return one row per post.
#
# url: full URL of the topic page.
# from_mainlink: row index of the topic in the link table; stored with every
#   post so it can be traced back to its topic.
# Returns a data.table with the columns topic, comment, user, user_link,
# user_loc, post_date and from_mainlink.
scrape_topic_page <- function(url, from_mainlink) {
  # Download and parse the page once and reuse the document for every field.
  page <- read_html(url)
  topic <- page %>% html_nodes(".postTitle") %>% html_text()
  comment <- page %>% html_nodes(".postBody") %>% html_text()
  user <- page %>% html_nodes(".username span") %>% html_text()
  user_link <- page %>% html_nodes(".username") %>% html_attrs() %>% unlist() %>%
    .[grepl('Profile', .)] %>% as.vector() %>% str_extract('/Profile/.*') %>%
    gsub("');", "", .)
  user_loc <- page %>% html_nodes(".profile .location") %>% html_text()
  post_date <- page %>% html_nodes(".postDate") %>% html_text()
  # Remove Tripadvisor staff notices from every field. The length check is
  # required: x[-integer(0)] would drop every element instead of none.
  del <- grep("-:- Message from Tripadvisor staff", comment)
  if (length(del) > 0) {
    topic <- topic[-del]
    comment <- comment[-del]
    user <- user[-del]
    user_link <- user_link[-del]
    user_loc <- user_loc[-del]
    post_date <- post_date[-del]
  }
  data.table(
    topic, comment, user, user_link, user_loc, post_date,
    from_mainlink = as.numeric(from_mainlink)
  )
}

# Results are kept per topic in a preallocated list and bound once at the end
# instead of growing the table with rbind() on every page.
demo_pages <- vector("list", nrow(demo))
for (i in seq_len(nrow(demo))) {
  # A topic with zero replies still has one page. Iterating 1:0 would visit
  # that page twice and duplicate its rows.
  n_topic_pages <- max(1, max_page_perblog$max_page[i])
  topic_pages <- vector("list", n_topic_pages)
  for (j in seq_len(n_topic_pages)) {
    if (j == 1) {
      # The first page of a topic has no "-o<offset>" segment.
      url <- paste0("https://www.tripadvisor.com", demo$front[i], "-", demo$back[i])
    } else {
      # Page j starts at post offset (j - 1) * 10, the same value the
      # page_dummy table holds at index j.
      url <- paste0(
        "https://www.tripadvisor.com", demo$front[i],
        "-o", (j - 1L) * 10L, "-", demo$back[i]
      )
    }
    # Handle errors per page: report the failure and carry on, rather than
    # silently ending the whole scrape at the first failed request.
    page_rows <- tryCatch(
      scrape_topic_page(url, i),
      error = function(e) {
        warning(
          "Skipping ", url, ": ", conditionMessage(e),
          call. = FALSE, immediate. = TRUE
        )
        NULL
      }
    )
    if (!is.null(page_rows)) {
      topic_pages[[j]] <- page_rows
    }
    Sys.sleep(1) # pause between requests
  }
  demo_pages[[i]] <- rbindlist(topic_pages)
}
demo_sublink <- rbindlist(demo_pages)
demo_sublink %>% as_tibble()
OK, it works
ทำด้วย link ทั้งหมด
# Split each link right after the topic id (the first "-k<digits>" segment):
# `front` is everything up to and including the id, `back` is everything after
# the following "-", so a page offset can be inserted between the two.
# The match is lazy and anchored so that a "k<digits>" run inside the topic
# title (e.g. "Week1" or "Bangkok2019") cannot be mistaken for the id; a greedy
# ".*(k[0-9]+)" would extend `front` into the title and yield a wrong URL.
all_link_parts <- str_match(all_link$link, "^(.*?-k[0-9]+)-(.*)$")
all_link$front <- all_link_parts[, 2]
all_link$back <- all_link_parts[, 3]
ใช้ TryCatch เพื่อป้องกันไม่ให้ Loop หลุด
# Scrape one topic page and return one row per post.
#
# url: full URL of the topic page.
# from_mainlink: row index of the topic in the link table; stored with every
#   post so it can be traced back to its topic.
# Returns a data.table with the columns topic, comment, user, user_link,
# user_loc, post_date and from_mainlink.
scrape_topic_page <- function(url, from_mainlink) {
  # Download and parse the page once and reuse the document for every field.
  page <- read_html(url)
  topic <- page %>% html_nodes(".postTitle") %>% html_text()
  comment <- page %>% html_nodes(".postBody") %>% html_text()
  user <- page %>% html_nodes(".username span") %>% html_text()
  user_link <- page %>% html_nodes(".username") %>% html_attrs() %>% unlist() %>%
    .[grepl('Profile', .)] %>% as.vector() %>% str_extract('/Profile/.*') %>%
    gsub("');", "", .)
  user_loc <- page %>% html_nodes(".profile .location") %>% html_text()
  post_date <- page %>% html_nodes(".postDate") %>% html_text()
  # Remove Tripadvisor staff notices from every field. The length check is
  # required: x[-integer(0)] would drop every element instead of none.
  del <- grep("-:- Message from Tripadvisor staff", comment)
  if (length(del) > 0) {
    topic <- topic[-del]
    comment <- comment[-del]
    user <- user[-del]
    user_link <- user_link[-del]
    user_loc <- user_loc[-del]
    post_date <- post_date[-del]
  }
  data.table(
    topic, comment, user, user_link, user_loc, post_date,
    from_mainlink = as.numeric(from_mainlink)
  )
}

# Results are kept per topic in a preallocated list (which stays available if
# the long run is interrupted) and bound once at the end instead of growing
# the table with rbind() on every page.
trip_pages <- vector("list", nrow(all_link))
for (i in seq_len(nrow(all_link))) {
  # A topic with zero replies still has one page. Iterating 1:0 would visit
  # that page twice and duplicate its rows.
  n_topic_pages <- max(1, max_page_perblog$max_page[i])
  topic_pages <- vector("list", n_topic_pages)
  for (j in seq_len(n_topic_pages)) {
    if (j == 1) {
      # The first page of a topic has no "-o<offset>" segment.
      url <- paste0(
        "https://www.tripadvisor.com", all_link$front[i], "-", all_link$back[i]
      )
    } else {
      # Page j starts at post offset (j - 1) * 10, the same value the
      # page_dummy table holds at index j.
      url <- paste0(
        "https://www.tripadvisor.com", all_link$front[i],
        "-o", (j - 1L) * 10L, "-", all_link$back[i]
      )
    }
    # Handle errors per page: report the failure and carry on, rather than
    # silently ending the whole scrape at the first failed request.
    page_rows <- tryCatch(
      scrape_topic_page(url, i),
      error = function(e) {
        warning(
          "Skipping ", url, ": ", conditionMessage(e),
          call. = FALSE, immediate. = TRUE
        )
        NULL
      }
    )
    if (!is.null(page_rows)) {
      topic_pages[[j]] <- page_rows
    }
    Sys.sleep(1) # pause between requests
  }
  trip_pages[[i]] <- rbindlist(topic_pages)
}
TripData <- rbindlist(trip_pages)
ใช้เวลาทั้งหมดประมาณ 21 วัน
#saveRDS(TripData,'TripData.rds')
# Load the scraped posts from disk and preview them as a tibble.
TripData <- readRDS("TripData.rds")
as_tibble(TripData)
## # A tibble: 1,187,305 x 7
## topic comment user user_link user_loc post_date from_mainlink
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 "Private … "\n\n\n\nCan… 433Za… /Profile/43… London,… Aug 18, … 1
## 2 "\nRe: Pr… "\n\n\n\nHi,… Chatc… /Profile/Br… Phuket Aug 18, … 1
## 3 "\nRe: Pr… "\n\n\n\nPan… paint… /Profile/pa… Biggar,… Aug 18, … 1
## 4 "\nRe: Pr… "\n\n\n\nAno… nairo… /Profile/na… Souther… Aug 18, … 1
## 5 "\nRe: Pr… "\n\n\n\nTha… 433Za… /Profile/43… London,… Aug 23, … 1
## 6 "\nRe: Pr… "\n\n\n\nAno… BomJD /Profile/Bo… Cairns,… Aug 23, … 1
## 7 "\nRe: Pr… "\n\n\n\nI u… Trail… /Profile/Tr… Singapo… Aug 24, … 1
## 8 "which ap… "\n\n\n\nPle… richa… /Profile/H3… Seoul, … 1 year a… 2
## 9 "\nRe: wh… "\n\n\n\nHi\… omega… /Profile/om… Singapo… 1 year a… 2
## 10 "\nRe: wh… "\n\n\n\n555… wtf_c… /Profile/wt… Ambrym,… 1 year a… 2
## # … with 1,187,295 more rows