library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(xml2)
library(rvest)
library(stringr)
library(qdapRegex)
## 
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:dplyr':
## 
##     explain
library(tidyr)
path = '/Users/fewsmc/TEST_git/Pila_test'
setwd(path)

The link we will scrape is “https://www.tripadvisor.com/ShowForum-g293915-i3686-Thailand.html”. Each listing page shows roughly 20 topics, and pagination follows an `-o` offset pattern: the first page is “https://www.tripadvisor.com/ShowForum-g293915-i3686-Thailand.html” and the second page is “https://www.tripadvisor.com/ShowForum-g293915-i3686-o20-Thailand.html”.
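A minimal sketch of that URL scheme (the forum_page_url() helper is illustrative only, assuming the -o offset simply counts topics skipped, 20 per page):

#hypothetical helper: build the listing URL for a 1-indexed page number
forum_page_url <- function(page) {
  offset <- (page - 1) * 20
  if (offset == 0) {
    "https://www.tripadvisor.com/ShowForum-g293915-i3686-Thailand.html"
  } else {
    paste0("https://www.tripadvisor.com/ShowForum-g293915-i3686-o",
           offset, "-Thailand.html")
  }
}
forum_page_url(2) #"...-o20-Thailand.html"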

Data collection began on 12 Aug 2019.

#create dummy page offsets for scraping across many pages
#around 8610 pages
total = 8610*20
sequence = seq(0,total,20)
sequence = sequence[-c(8611)] %>% data.table('sequence' = .) #seq() yields 8611 values; drop the one past the last page
sequence #8610 offsets; the last is 172180
##       sequence
##    1:        0
##    2:       20
##    3:       40
##    4:       60
##    5:       80
##   ---         
## 8606:   172100
## 8607:   172120
## 8608:   172140
## 8609:   172160
## 8610:   172180

The fields we collect from each listing row are: 1. the topic link, 2. which forum it belongs to, 3. the number of replies, 4. who replied last.

all_main_link = data.table()
for (i in (1:nrow(sequence))){
  if (i == 1){
    url = "https://www.tripadvisor.com/ShowForum-g293915-i3686-Thailand.html"
  }else {
    url = paste0("https://www.tripadvisor.com/ShowForum-g293915-i3686-o",sequence$sequence[i],"-Thailand.html")
  }
  page <- read_html(url) #fetch each listing page once and reuse the parsed document
  link <- page %>% html_nodes("b a") %>% html_attrs() %>% data.table('link' = .)
  from <- page %>% html_nodes("tr+ tr .forumcol") %>% html_text() %>% data.table('from' = .)
  replies <- page %>% html_nodes(".reply") %>% html_text() %>% data.table('replies' = .)
  last_reply <- page %>% html_nodes(".rowentry > a") %>% html_text() %>% data.table('last_reply' = .)
  all = cbind(link,from,replies,last_reply)
  all_main_link <- rbind(all_main_link,all)
  rm(link,from,replies,last_reply,all)
}
#clean link: html_attrs() returns both the href and an onclick handler; keep only the href
all_main_link <- separate(all_main_link,link,into = c("link","oneclick"),sep = ",")
all_main_link$link = str_extract(all_main_link$link,"\".*")
all_main_link$link = gsub("\"","",all_main_link$link)
all_main_link$oneclick <- NULL

saveRDS(all_main_link,file = 'all_mainlink.rds')

all_link <- readRDS("main_link.rds") #load a previously saved run of the listing scrape
all_link[,] <- lapply(all_link[,],str_squish) #clean \n\t patterns
all_link[sample(40),] %>% as_tibble() #preview; note sample(40) shuffles rows 1:40, not the whole table
## # A tibble: 40 x 6
##    topic              replies forum    last_post    i     link                  
##    <chr>              <chr>   <chr>    <chr>        <chr> <chr>                 
##  1 Giant Ibis Bus - … 6       Bangkok  10:05 am by… 1     /ShowTopic-g293916-i3…
##  2 Downtown Area      21      Bangkok  5:09 am by … 2     /ShowTopic-g293916-i3…
##  3 which app or site… 13      Thailand 10:40 am by… 1     /ShowTopic-g293915-i3…
##  4 Pak Meng to Trang… 2       Trang P… 5:09 am by … 2     /ShowTopic-g297938-i9…
##  5 Samui daily weath… 339     Ko Samui 7:58 am by … 1     /ShowTopic-g293918-i7…
##  6 Choeng Mon beach … 8       Ko Samui 10:31 am by… 1     /ShowTopic-g293918-i7…
##  7 Apply for Myanmar… 2       Bangkok  9:48 am by … 1     /ShowTopic-g293916-i3…
##  8 Lonely Beach sand… 1       Ko Chang 6:44 am by … 2     /ShowTopic-g580110-i9…
##  9 Transfer from BMK… 5       Bangkok  6:58 am by … 2     /ShowTopic-g293916-i3…
## 10 Kupu kupu or sant… 5       Ko Pha … 8:26 am by … 1     /ShowTopic-g303907-i9…
## # … with 30 more rows

Make sure there are no duplicate topics.

all_link <- all_link %>% distinct(link,.keep_all = TRUE) #.keep_all = TRUE keeps all the other columns

Topics with many replies span several sub-pages. Just like the main forum listing, pagination inside a topic uses the -o[num]- pattern, but with 10 comments per page.

https://www.tripadvisor.com/ShowTopic-g293915-i3686-k6601803-Peanut_allergy-Thailand.html #first page
https://www.tripadvisor.com/ShowTopic-g293915-i3686-k6601803-o150-Peanut_allergy-Thailand.html #around page 16

We collected the replies field so that we know how many sub-pages each topic has.
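One subtlety worth noting (an observation, not something the code below handles): each page holds 10 posts including the original post, so ceiling(replies/10) can undercount by one page when the reply count is an exact multiple of 10. Compare, for a topic with 150 replies like the Peanut_allergy example whose -o150- URL is roughly page 16:

replies <- 150
ceiling(replies/10)       #15, the sub-page count the scraper uses
ceiling((replies + 1)/10) #16 pages once the original post occupies a slot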

all_link <- all_link[-c(1:6),] #drop the pinned topics

##preprocess to find max_page_per_blog
all_link$replies <- gsub(',',"",all_link$replies) %>% str_squish() %>%
  as.numeric(.) #strip thousands separators and convert text to numeric
all_link <- all_link[!(is.na(all_link$replies)),] #drop rows where the conversion produced NA
all_link %>% as_tibble()
## # A tibble: 170,200 x 6
##    topic              replies forum     last_post    i     link                 
##    <chr>                <dbl> <chr>     <chr>        <chr> <chr>                
##  1 Giant Ibis Bus - …       6 Bangkok   10:05 am by… 1     /ShowTopic-g293916-i…
##  2 Apply for Myanmar…       2 Bangkok   9:48 am by … 1     /ShowTopic-g293916-i…
##  3 Bangkok skywalk          4 Bangkok   9:15 am by … 1     /ShowTopic-g293916-i…
##  4 Langkawi to Koh L…       0 Ko Lipe   9:11 am by … 1     /ShowTopic-g1024140-…
##  5 Thailand in Octob…       5 Thailand  9:05 am by … 1     /ShowTopic-g293915-i…
##  6 Evason Phuket            8 Rawai     9:01 am by … 1     /ShowTopic-g297934-i…
##  7 Phi Phi island to…       2 Ko Jum    8:55 am by … 1     /ShowTopic-g1152743-…
##  8 Kupu kupu or sant…       5 Ko Pha N… 8:26 am by … 1     /ShowTopic-g303907-i…
##  9 Please post your …       1 Khao Yai… 8:20 am by … 1     /ShowTopic-g3218970-…
## 10 Itinerary feedback      11 Bangkok   8:07 am by … 1     /ShowTopic-g293916-i…
## # … with 170,190 more rows

#find max page
max_replies = max(all_link$replies) #4560
max_page_perblog <- ceiling(all_link$replies/10)
#dummy offsets for the nested for loop
page_dummy <- seq(0,max_replies,10)
max_page_perblog <- data.table(all_link$replies,max_page_perblog)
colnames(max_page_perblog) <- c('replies','max_page')
max_page_perblog #the more replies a topic has, the more sub-pages it spans
##         replies max_page
##      1:       6        1
##      2:       2        1
##      3:       4        1
##      4:       0        0
##      5:       5        1
##     ---                 
## 170196:       1        1
## 170197:       2        1
## 170198:       7        1
## 170199:      49        5
## 170200:      40        4

Split each link into two parts so that we can reach a topic's sub-pages.

demo <- all_link[1:10,]
demo$link
##  [1] "/ShowTopic-g293916-i3687-k12829145-Giant_Ibis_Bus_Bangkok_to_Siem_Reap-Bangkok.html"                                          
##  [2] "/ShowTopic-g293916-i3687-k12857408-Apply_for_Myanmar_visum_in_Bangkok-Bangkok.html"                                           
##  [3] "/ShowTopic-g293916-i3687-k12857233-Bangkok_skywalk-Bangkok.html"                                                              
##  [4] "/ShowTopic-g1024140-i13523-k12857365-Langkawi_to_Koh_Lipe_in_september-Ko_Lipe_Satun_Province.html"                           
##  [5] "/ShowTopic-g293915-i3686-k12856109-Thailand_in_October-Thailand.html"                                                         
##  [6] "/ShowTopic-g297934-i13565-k12847819-Evason_Phuket-Rawai_Phuket.html"                                                          
##  [7] "/ShowTopic-g1152743-i17500-k12857018-Phi_Phi_island_to_Koh_Jum_travel_options_in_september_2019-Ko_Jum_Krabi_Province.html"   
##  [8] "/ShowTopic-g303907-i9842-k12855669-Kupu_kupu_or_santhiya_hotel-Ko_Pha_Ngan_Surat_Thani_Province.html"                         
##  [9] "/ShowTopic-g3218970-i26760-k12857280-Please_post_your_animal_sightings-Khao_Yai_National_Park_Nakhon_Ratchasima_Province.html"
## [10] "/ShowTopic-g293916-i3687-k12853910-Itinerary_feedback-Bangkok.html"

The split yields two parts: a front part and a back part.

demo$front <- str_extract(demo$link,".*(k[0-9]+)") #everything up to and including the k-number
demo$back <- sub("(k[0-9]+)","",demo$link) %>% str_extract(., "--.*") %>% sub('--',"",.) #the topic slug after the k-number
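To see how the two parts recombine (the page offset slots in between; values here come from the first demo link):

#first page: no offset between front and back
paste0("https://www.tripadvisor.com", demo$front[1], "-", demo$back[1])
#second page: an -o10- offset goes between the two parts
paste0("https://www.tripadvisor.com", demo$front[1], "-o10-", demo$back[1])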

A trial scrape

The fields we collect are: 1. Topic: the thread title, 2. Comment: a traveller's post in that thread, 3. User: the traveller's username, 4. User_link: the link to that user's profile, 5. User_loc: the user's location, 6. From_mainlink: which main link the post came from.

demo_sublink <- data.table()
tryCatch({
  for (i in (1:nrow(demo))) {
    #note: when max_page is 0, 1:0 iterates over c(1, 0); the j == 1 | j == 0
    #check routes both iterations to the first page (no -o offset)
    for (j in (1:max_page_perblog$max_page[i])){
      if(j == 1 | j == 0){
        url <- paste0("https://www.tripadvisor.com",demo$front[i],"-",demo$back[i])
      }else{
        url <- paste0("https://www.tripadvisor.com",demo$front[i],"-o",page_dummy[j],"-",demo$back[i])
      }
      page <- read_html(url) #fetch once, parse every field from the same document
      topic <- page %>% html_nodes(".postTitle") %>% html_text()
      comment <- page %>% html_nodes(".postBody") %>% html_text()
      user <- page %>% html_nodes(".username span") %>% html_text()
      user_link <- page %>% html_nodes(".username") %>% html_attrs() %>% unlist() %>%
        .[grepl('Profile',.)] %>% as.vector() %>% str_extract('/Profile/.*') %>% gsub("');","",.)
      user_loc <- page %>% html_nodes(".profile .location") %>% html_text()
      post_date <- page %>% html_nodes(".postDate") %>% html_text()
      from_mainlink <- as.numeric(i)
      #drop boilerplate posts injected by Tripadvisor staff
      del <- grep("-:- Message from Tripadvisor staff",comment)
      if(length(del) > 0){
        topic <- topic[-del]
        comment <- comment[-del]
        user <- user[-del]
        user_link <- user_link[-del]
        user_loc <- user_loc[-del]
        post_date <- post_date[-del]
      }
      all <- data.table(topic,comment,user,user_link,user_loc,post_date,from_mainlink)
      Sys.sleep(1) #be polite to the server between requests
      demo_sublink <- rbind(demo_sublink,all)
      rm(all,del)
    }
  }
},error = function(e){})
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names
## = check.names, : Item 3 has 7 rows but longest item has 8; recycled with
## remainder.
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names
## = check.names, : Item 4 has 7 rows but longest item has 8; recycled with
## remainder.
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names
## = check.names, : Item 5 has 7 rows but longest item has 8; recycled with
## remainder.
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names
## = check.names, : Item 3 has 3 rows but longest item has 4; recycled with
## remainder.
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names
## = check.names, : Item 4 has 3 rows but longest item has 4; recycled with
## remainder.
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names
## = check.names, : Item 5 has 3 rows but longest item has 4; recycled with
## remainder.
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names
## = check.names, : Item 3 has 7 rows but longest item has 8; recycled with
## remainder.
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names
## = check.names, : Item 4 has 7 rows but longest item has 8; recycled with
## remainder.
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names
## = check.names, : Item 5 has 7 rows but longest item has 8; recycled with
## remainder.
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names
## = check.names, : Item 3 has 3 rows but longest item has 4; recycled with
## remainder.
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names
## = check.names, : Item 4 has 3 rows but longest item has 4; recycled with
## remainder.
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names
## = check.names, : Item 5 has 3 rows but longest item has 4; recycled with
## remainder.
## NULL
demo_sublink %>% as_tibble()
## # A tibble: 54 x 7
##    topic     comment       user   user_link    user_loc  post_date from_mainlink
##    <chr>     <chr>         <chr>  <chr>        <chr>     <chr>             <dbl>
##  1 "\nGiant… "\n\n\n\nHi … Werne… /Profile/we… "niagara… 1 year a…             1
##  2 "\n2.\n\… "\n\n\n\nI k… alist… /Profile/al… "Sheffie… 1 year a…             1
##  3 "\n3.\n\… "\n\n\n\nIf … Werne… /Profile/we… "niagara… 1 year a…             1
##  4 "\n4.\n\… "\n\n\n\nWil… Rolf S /Profile/24… "The Hag… 1 year a…             1
##  5 "\n5.\n\… "\n\n\n\nNo … Wendy… /Profile/82… ""        1 year a…             1
##  6 "\n6.\n\… "\n\n\n\nWe … alist… /Profile/al… "Sheffie… 1 year a…             1
##  7 "\n7.\n\… "\n\n\n\nI h… Steve… /Profile/Br… "High Wy… 1 year a…             1
##  8 "\n8.\n\… "\n\n\n\nWe … Werne… /Profile/we… "niagara… Nov 18, …             1
##  9 "\nApply… "\n\n\n\nIs … netti… /Profile/ne… "Mandala… 1 year a…             2
## 10 "\n1.\n\… "\n\n\n\nYes… Rolf S /Profile/24… "The Hag… 1 year a…             2
## # … with 44 more rows

OK, it works. The recycling warnings above are worth noting, though: on some pages the selectors returned different numbers of nodes, so data.table() recycled the shorter columns and those rows may be misaligned.

Now run it over all the links.

all_link$front <- str_extract(all_link$link,".*(k[0-9]+)")
all_link$back <- sub("(k[0-9]+)","",all_link$link) %>% str_extract(., "--.*") %>% sub('--',"",.)

Wrap the run in tryCatch so that an error does not break out of the loop.
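One caveat: a single tryCatch around the whole loop means one failed request silently ends the entire remaining run. A sketch of a per-page alternative, where scrape_page() is a hypothetical helper standing in for the body of the inner loop below:

for (i in (1:nrow(all_link))) {
  for (j in (1:max_page_perblog$max_page[i])) {
    result <- tryCatch(
      scrape_page(i, j),       #hypothetical: one page's fetch-and-parse
      error = function(e) NULL #on failure, skip just this page
    )
    if (!is.null(result)) TripData <- rbind(TripData, result)
  }
}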

TripData <- data.table()
tryCatch({
  for (i in (1:nrow(all_link))) {
    for (j in (1:max_page_perblog$max_page[i])){
      if(j == 1 | j == 0){ #1:0 yields c(1, 0); both iterations go to the first page
        url <- paste0("https://www.tripadvisor.com",all_link$front[i],"-",all_link$back[i])
      }else{
        url <- paste0("https://www.tripadvisor.com",all_link$front[i],"-o",page_dummy[j],"-",all_link$back[i])
      }
      page <- read_html(url) #fetch once, parse every field from the same document
      topic <- page %>% html_nodes(".postTitle") %>% html_text()
      comment <- page %>% html_nodes(".postBody") %>% html_text()
      user <- page %>% html_nodes(".username span") %>% html_text()
      user_link <- page %>% html_nodes(".username") %>% html_attrs() %>% unlist() %>%
        .[grepl('Profile',.)] %>% as.vector() %>% str_extract('/Profile/.*') %>% gsub("');","",.)
      user_loc <- page %>% html_nodes(".profile .location") %>% html_text()
      post_date <- page %>% html_nodes(".postDate") %>% html_text()
      from_mainlink <- as.numeric(i)
      #drop boilerplate posts injected by Tripadvisor staff
      del <- grep("-:- Message from Tripadvisor staff",comment)
      if(length(del) > 0){
        topic <- topic[-del]
        comment <- comment[-del]
        user <- user[-del]
        user_link <- user_link[-del]
        user_loc <- user_loc[-del]
        post_date <- post_date[-del]
      }
      all <- data.table(topic,comment,user,user_link,user_loc,post_date,from_mainlink)
      Sys.sleep(1) #be polite to the server between requests
      TripData <- rbind(TripData,all)
      rm(all,del)
    }
  }
},error = function(e){})

The full run took about 21 days.
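A back-of-envelope check of that figure (the seconds-per-page value is an assumption covering the 1-second sleep plus fetch and parse time):

n_pages <- sum(pmax(max_page_perblog$max_page, 1)) #every topic is fetched at least once
sec_per_page <- 10 #assumed average per request
n_pages * sec_per_page / (60 * 60 * 24) #approximate wall-clock days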

#saveRDS(TripData,'TripData.rds')
TripData <- readRDS("TripData.rds")
TripData %>% as_tibble()
## # A tibble: 1,187,305 x 7
##    topic      comment       user   user_link    user_loc post_date from_mainlink
##    <chr>      <chr>         <chr>  <chr>        <chr>    <chr>             <dbl>
##  1 "Private … "\n\n\n\nCan… 433Za… /Profile/43… London,… Aug 18, …             1
##  2 "\nRe: Pr… "\n\n\n\nHi,… Chatc… /Profile/Br… Phuket   Aug 18, …             1
##  3 "\nRe: Pr… "\n\n\n\nPan… paint… /Profile/pa… Biggar,… Aug 18, …             1
##  4 "\nRe: Pr… "\n\n\n\nAno… nairo… /Profile/na… Souther… Aug 18, …             1
##  5 "\nRe: Pr… "\n\n\n\nTha… 433Za… /Profile/43… London,… Aug 23, …             1
##  6 "\nRe: Pr… "\n\n\n\nAno… BomJD  /Profile/Bo… Cairns,… Aug 23, …             1
##  7 "\nRe: Pr… "\n\n\n\nI u… Trail… /Profile/Tr… Singapo… Aug 24, …             1
##  8 "which ap… "\n\n\n\nPle… richa… /Profile/H3… Seoul, … 1 year a…             2
##  9 "\nRe: wh… "\n\n\n\nHi\… omega… /Profile/om… Singapo… 1 year a…             2
## 10 "\nRe: wh… "\n\n\n\n555… wtf_c… /Profile/wt… Ambrym,… 1 year a…             2
## # … with 1,187,295 more rows
TripData_part1 <- TripData[1:500000,]
TripData_part2 <- TripData[500001:nrow(TripData),]
saveRDS(TripData_part1,"TripData_part1.rds")
saveRDS(TripData_part2,"TripData_part2.rds")
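The table is split in two presumably to keep each RDS file under typical file-hosting size limits. To rebuild the full table later:

#re-assemble the two halves into the full table
TripData <- rbind(readRDS("TripData_part1.rds"), readRDS("TripData_part2.rds"))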