# Example of Web Scraping from NAVER News

#   1. News Headline

#   2. News Company Name

#   3. Upload Time

#   4. News Highlight

#   5. Main Text

library(XML)
library(httr)

# Download the article page; readLines() returns one string per line.
news <- readLines(
  con = "https://news.naver.com/main/read.naver?mode=LSD&mid=shm&sid1=104&oid=001&aid=0012726579"
)

# Turn the downloaded lines into a parsed HTML document that can be queried
# with XPath, then echo it for inspection.
parsed_news <- htmlParse(file = news)
parsed_news

# XPath selecting the element whose id is "articleTitle" (echoed for reference).
headline_xpath <- '//*[@id="articleTitle"]'
headline_xpath

# Pull the text content of every node matched by the XPath.
headline <- xpathSApply(doc = parsed_news, path = headline_xpath, fun = xmlValue)
headline

# install.packages("rvest")
library(rvest)

# Pass the text through rvest's repair_encoding(), declaring UTF-8 as the
# source encoding, and show the result.
headline <- repair_encoding(x = headline, from = "utf-8")
headline

# XPath selecting the <img> inside the first link of the page header
# (echoed for reference).
company_xpath <- '//*[@id="main_content"]/div[1]/div[1]/a/img'
company_xpath

# The value is read from the image's "title" attribute rather than from
# node text, hence xmlGetAttr instead of xmlValue.
company <- xpathSApply(doc = parsed_news, path = company_xpath, fun = xmlGetAttr, "title")
company

# Pass the text through repair_encoding(), declaring UTF-8 as the source.
company <- repair_encoding(x = company, from = "utf-8")
company


# XPath selecting the <span> element(s) in the third header div
# (echoed for reference).
time_xpath <- '//*[@id="main_content"]/div[1]/div[3]/div/span'
time_xpath

# Text content of every matched span.
# NOTE(review): `time` shadows base::time(); kept because later code reads it.
time <- xpathSApply(doc = parsed_news, path = time_xpath, fun = xmlValue)
time

# Pass the text through repair_encoding(), declaring UTF-8 as the source.
time <- repair_encoding(x = time, from = "utf-8")
time

# XPath selecting <strong> children of the article body container
# (echoed for reference).
highlight_xpath <- '//*[@id="articleBodyContents"]/strong'
highlight_xpath

# Text content of every matched <strong> node.
highlight <- xpathSApply(doc = parsed_news, path = highlight_xpath, fun = xmlValue)
highlight

# Pass the text through repair_encoding(), declaring UTF-8 as the source;
# the result is echoed twice, as in the original walkthrough.
highlight <- repair_encoding(x = highlight, from = "utf-8")
highlight
highlight

# XPaths of two individual text nodes of the article body (echoed for
# reference only; they are not used below).
'//*[@id="articleBodyContents"]/text()[1]'
'//*[@id="articleBodyContents"]/text()[5]'

# Without an index the XPath matches every direct text node of the body.
content_xpath <- '//*[@id="articleBodyContents"]/text()'
content_xpath

# One string per matched text node.
content <- xpathSApply(doc = parsed_news, path = content_xpath, fun = xmlValue)
content

# Pass the text through repair_encoding(), declaring UTF-8 as the source.
content <- repair_encoding(x = content, from = "utf-8")
content

# Number of text fragments, followed by the fragments joined with spaces.
length(content)
paste(content, collapse = " ")

library(tidyverse)

# Reduce one scraped field to exactly one string.
#
# The fields above come straight from xpathSApply(), which returns an empty
# list when the XPath matches nothing and several values when it matches
# several nodes. Feeding those into tibble() unchanged would yield a 0-row
# table, a list column, or several rows for what is meant to be one article.
#
# x: character vector or list of strings produced by the extraction steps.
# Returns a length-one character vector: the values joined by single spaces,
# or NA_character_ when nothing was extracted.
collapse_field <- function(x) {
  values <- unlist(x)
  if (length(values) == 0) {
    return(NA_character_)
  }
  paste(values, collapse = " ")
}

# One row describing the scraped article. When every field matched exactly
# one node this is the same table as building it from the raw variables.
first_naver_news <- tibble(
  Headline = collapse_field(headline),
  Company = collapse_field(company),
  Time = collapse_field(time),
  Highlight = collapse_field(highlight),
  Content = collapse_field(content)
)
first_naver_news

# Example of Collecting URLs to News Articles

# Search-result pages share one URL and differ only in the trailing `start=`
# query parameter: the first three pages use start=1, start=11 and start=21,
# i.e. the offset grows by 10 per page.
baseurl <- "https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%EB%82%9C%EB%AF%BC%20%22%EC%95%84%ED%94%84%EA%B0%84%22%20%7C%20%22%EC%95%84%ED%94%84%EA%B0%80%EB%8B%88%EC%8A%A4%ED%83%84%22&sort=2&photo=0&field=0&pd=3&ds=2021.08.15&de=2021.09.15&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:from20210815to20210915,a:all&start="

# Offsets 1, 11, ..., 3991. Integers are used so that an offset can never be
# rendered in scientific notation when pasted into the URL.
pages <- seq(1L, 3991L, by = 10L)

# One URL per result page.
urls <- paste0(baseurl, pages)
urls[1:10]

length(urls)

# NOTE(review): the XPath noted while exploring the page was
#   /html/body/div[3]/div[2]/div/div[1]/section[2]/div/div[2]/ul/li/div/div/div[1]/div[2]/a[2]
# whereas the function below matches the shorter .../ul/li/div/div/a.
# Confirm which set of links is actually intended.

#' Extract link targets from one search-result page
#'
#' Reads the page, parses it as HTML and collects the `href` attribute of
#' every anchor matched by the hard-coded XPath.
#'
#' @param url Location of the result page, passed to `readLines()`.
#' @return A character vector of `href` values; `character(0)` when no
#'   anchor matches.
url_extractor <- function(url) {
  page <- readLines(url)
  html <- htmlParse(page)
  hrefs <- xpathSApply(
    html,
    '/html/body/div[3]/div[2]/div/div[1]/section[2]/div/div[2]/ul/li/div/div/a',
    xmlGetAttr,
    "href"
  )
  # xpathSApply() gives back an empty list when nothing matches, and a list
  # containing NULLs when some anchor has no href. Flattening keeps the
  # return type a plain character vector in every case.
  as.character(unlist(hrefs))
}
# Try the extractor on the third search-result page.
url_extractor(urls[3])

length(urls)

# Run the extractor over the first ten result pages; lapply() keeps the
# results as a list with one element per page.
news_urls <- lapply(X = urls[1:10], FUN = url_extractor)
class(news_urls)

# Flatten the per-page results into a single vector and show it.
news_urls <- unlist(x = news_urls)
news_urls

# Assignment
#
# Scrape any DAUM News article of interest and collect:
#
#   1. Headline
#   2. Press Company Name
#   3. Upload Time
#   4. News Highlights
#   5. Main Text