# Sample result-page URLs for the first three pages of the search. They differ
# only in the trailing `start` value (1, 11, 21). They are bare string
# expressions: evaluated, never assigned.
"https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%EB%82%9C%EB%AF%BC%20%22%EC%95%84%ED%94%84%EA%B0%84%22%20%7C%20%22%EC%95%84%ED%94%84%EA%B0%80%EB%8B%88%EC%8A%A4%ED%83%84%22&sort=2&photo=0&field=0&pd=3&ds=2021.08.15&de=2021.09.15&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:from20210815to20210915,a:all&start=1"
"https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%EB%82%9C%EB%AF%BC%20%22%EC%95%84%ED%94%84%EA%B0%84%22%20%7C%20%22%EC%95%84%ED%94%84%EA%B0%80%EB%8B%88%EC%8A%A4%ED%83%84%22&sort=2&photo=0&field=0&pd=3&ds=2021.08.15&de=2021.09.15&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:from20210815to20210915,a:all&start=11"
"https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%EB%82%9C%EB%AF%BC%20%22%EC%95%84%ED%94%84%EA%B0%84%22%20%7C%20%22%EC%95%84%ED%94%84%EA%B0%80%EB%8B%88%EC%8A%A4%ED%83%84%22&sort=2&photo=0&field=0&pd=3&ds=2021.08.15&de=2021.09.15&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:from20210815to20210915,a:all&start=21"

# Common prefix of every result-page URL: everything up to and including
# "start=". The query string fixes the date window via ds=2021.08.15 and
# de=2021.09.15.
baseurl <- "https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%EB%82%9C%EB%AF%BC%20%22%EC%95%84%ED%94%84%EA%B0%84%22%20%7C%20%22%EC%95%84%ED%94%84%EA%B0%80%EB%8B%88%EC%8A%A4%ED%83%84%22&sort=2&photo=0&field=0&pd=3&ds=2021.08.15&de=2021.09.15&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:from20210815to20210915,a:all&start="

# `start` offsets 1, 11, 21, ..., 3991 (400 values, step 10).
pages <- seq(from = 1, to = 3991, by = 10)

# One full URL per offset: the prefix with the offset appended directly.
urls <- paste0(baseurl, pages)

# Quick inspection: first ten URLs and the total count.
head(urls, n = 10)
length(urls)

# XPath of an anchor element on the result page, recorded twice as a bare
# string expression (evaluated, never assigned).
"/html/body/div[3]/div[2]/div/div[1]/section[2]/div/div[2]/ul/li/div/div/div[1]/div[2]/a[2]"
"/html/body/div[3]/div[2]/div/div[1]/section[2]/div/div[2]/ul/li/div/div/div[1]/div[2]/a[2]"
#' Extract link targets from one search result page
#'
#' Downloads the page at `url`, parses it as HTML and collects the `href`
#' attribute of every node matched by `xpath`.
#'
#' @param url A single character string: the address (or local path) of the
#'   page to read.
#' @param xpath XPath expression selecting the anchor nodes whose `href` is
#'   wanted. The default is the absolute path this script was written against;
#'   absolute paths break whenever the page layout changes, so pass a
#'   different expression if the default stops matching.
#' @return A character vector of `href` values. Always a character vector:
#'   `character(0)` when nothing matches, and anchors without an `href` are
#'   dropped.
url_extractor <- function(
    url,
    xpath = "/html/body/div[3]/div[2]/div/div[1]/section[2]/div/div[2]/ul/li/div/div/a") {
  stopifnot(is.character(url), length(url) == 1L, !is.na(url))

  # warn = FALSE silences the "incomplete final line" warning that readLines
  # raises when the response body does not end with a newline.
  page_lines <- readLines(url, warn = FALSE, encoding = "UTF-8")

  # Collapse to one string and pass it explicitly as text so the parser never
  # tries to interpret the content as a file name or URL.
  doc <- XML::htmlParse(
    paste(page_lines, collapse = "\n"),
    asText = TRUE,
    encoding = "UTF-8"
  )

  hrefs <- XML::xpathSApply(doc, xpath, XML::xmlGetAttr, "href")

  # xpathSApply simplifies like sapply: it yields list() for zero matches and
  # a list when some nodes lack the attribute. Normalise to a character vector.
  as.character(unlist(hrefs))
}
# Smoke test: extract the links from the third result page and show them.
url_extractor(urls[[3]])
length(urls)

# Run the extractor over the first ten result pages; lapply yields one
# element per page.
news_urls <- lapply(urls[seq_len(10)], url_extractor)
class(news_urls)

# Flatten the per-page results into a single vector and display it.
news_urls <- unlist(news_urls)
news_urls
# Web scraping from any Daum News of interest ----