Web scraping in practice

library(XML)
library(rvest)
library(stringr)
# Read the raw HTML of Daum's daily "popular news" ranking page
page <- readLines("https://news.daum.net/ranking/popular")
# Parse the HTML into a queryable document tree
page_parsed <- htmlParse(page)
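
If the Korean text comes back garbled, the page's UTF-8 content is probably being misread under a non-UTF-8 locale; both readLines() and htmlParse() accept an explicit encoding argument (an optional precaution, not required on UTF-8 systems):

# Optional: force UTF-8 so the Korean headlines are not garbled
page <- readLines("https://news.daum.net/ranking/popular", encoding = "UTF-8")
page_parsed <- htmlParse(page, encoding = "UTF-8")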

The ranking page for a specific day is addressed via the regDate parameter in the URL:

https://news.daum.net/ranking/popular?regDate=20211108

First headline XPath: '//*[@id="mArticle"]/div[2]/ul[3]/li[1]/div[2]/strong/a'
Second headline XPath: '//*[@id="mArticle"]/div[2]/ul[3]/li[2]/div[2]/strong/a'

First company XPath: '//*[@id="mArticle"]/div[2]/ul[3]/li[1]/div[2]/strong/span'
Second company XPath: '//*[@id="mArticle"]/div[2]/ul[3]/li[2]/div[2]/strong/span'

Only the li index changes from one entry to the next, so dropping it selects every entry at once:

headline_xpath <- '//*[@id="mArticle"]/div[2]/ul[3]/li/div[2]/strong/a'
company_xpath <- '//*[@id="mArticle"]/div[2]/ul[3]/li/div[2]/strong/span'
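
As a quick check (a sketch; page_nov8 and parsed_nov8 are illustrative names, and the output varies by day), the dated URL shown above can be fetched and queried in exactly the same way:

# Fetch the ranking page for 2021-11-08 via the regDate parameter
page_nov8 <- readLines("https://news.daum.net/ranking/popular?regDate=20211108")
parsed_nov8 <- htmlParse(page_nov8)
# Peek at the first three headlines of that day
head(xpathSApply(parsed_nov8, headline_xpath, xmlValue), 3)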

headlines <- xpathSApply(page_parsed, headline_xpath, xmlValue)
headlines
##  [1] "\"박철민, 10억 제시하며 '이재명 조폭 연루' 허위 제보 부탁\""
##  [2] "홍준표 \"이회창도 한달만에 지지율 폭락, 더 '다이내믹'해질 것\""
##  [3] "휠체어 탄 방시혁 근황.. 잘나가는 하이브 불매 운동 '왜'"
##  [4] "[단독]대통령 딸도 '아빠 찬스' 논란.. 작년 말부터 자녀와 '靑 거주'"
##  [5] "지리산 천년송 아래, 이토록 아름다운 결혼식이라니"
##  [6] "김종인 \"선대위 자리 비우고 전권 달라\"..윤석열 \"사람 내보낸다는 건 아냐\""
##  [7] "[현장연결] 홍준표 \"두 사람 중 한 사람은 선거 지면 감옥 가야할 것\""
##  [8] "공수처, 4번째 윤석열 수사 착수..이번엔 '판사사찰 문건' 의혹 (종합)"
##  [9] "양재천서 20대男 수상한 움직임..휴대전화서 영상 쏟아졌다"
## [10] "\"미국이 초밥 먹게 된 건 이 한국인 덕\".. NYT, 문선명 조명"
## [11] "최고위 불참하고 홍준표 캠프 해단식 간 배현진"
## [12] "유흥시설·노래방 '방역패스' 계도기간 종료..오늘부터 위반 시 처분"
## [13] "뉴질랜드의 집값 잡기 성공할까..전세계 당국자 관심 집중"
## [14] "與 \"전국민 재난지원금 20만~25만원 추가지원..올해 내도 가능\"(종합)"
## [15] "췌장암보다 사망률 높은 암.. '혈변'은 꽤 진행된 경우"
## [16] "송영길 \"민주당 대선 룰 적용했으면 홍준표 당선\" 윤석열 선출 직격"
## [17] "\"테슬라 주식 팔까?\"에 200만명 \"OK\"..머스크 진짜 팔면 주가는?"
## [18] "베트남 등서 요소 1만t 수입 추진..요소수는 2만→2.7만L"
## [19] "심상정 \"이재명, 윤석열 못 꺾어..이번 대선 나와 尹 대결\""
## [20] "\"허리 아파요\" 타간 약 마약이었다..의사는 \"불쌍해서 처방\" [영상]"
## [21] "홍준표 \"깨끗하게 승복\" 사흘만에 \"尹은 비리혐의자\""
## [22] "'추미애 지지' 김민웅 목사, 윤석열 당선된 후 남긴 글..\"검언개혁 촛불행동\""
## [23] "\"아들이 착해서 아버지 죽여\"..대구 A병원의 황당 항변"
## [24] "조국 \"불법사찰로 존엄성 훼손\" vs 법무부 \"대부분 소멸시효 지나\""
## [25] "홍남기 \"초과세수 예상보다 10조 더 들어올 듯..올해 추경 어려워\""
## [26] "공수처, '판사사찰 의혹' 윤석열 추가 입건 통보 \"경선 고려했다\"(종합)"
## [27] "정부, 요소수 수입에 민항 화물기 투입 준비..항공사와 사전협의"
## [28] "요소수 나비효과..장기화하면 제주 돼지분뇨 대란 온다"
## [29] "이재명 \"尹에 1대1 회동 제안\"..윤석열 \"대장동 몸통과의 전쟁\""
## [30] "김종인 \"자리사냥꾼들..尹, 선대위 구성 냉정하게 판단해야\""
## [31] "尹-이준석, '선대위 구성' 놓고 벌써 갈등 조짐"
## [32] "이혼소송 중 아내 원룸에 침입해 불륜 촬영한 남편 무죄→유죄"
## [33] "특별한 일 없는데 떨어지는 文 대통령 지지율.. 왜?"
## [34] "윤석열 '원팀' 적신호..홍준표 '사실상 불참'-김종인 '전면 재구성'"
## [35] "정부, 내주 베트남 차량용 요소 200t 도입..할당관세 0% 인하 추진"
## [36] "방시혁, 확연히 달라진 모습에 휠체어까지.. 그간 무슨 일이"
## [37] "[뉴시스 앵글] 옥천 '향수호수길' 물안개 장관..방문객 줄이어"
## [38] "공수처, '판사사찰 문건 의혹' 윤석열 입건..벌써 네번째 수사"
## [39] "윤석열, 이재명보다 20대 지지율 2배 이상..민주당은 '비상'"
## [40] "핸드크림, 얼굴에 바르면 ○○○ 생긴다"
## [41] "얀센 접종자 부스터샷 첫날..\"따끔\" 말 끝나기 무섭게 접종 종료"
## [42] "日 코로나 사망 0명..\"델타변이에 돌연변이 추가, 자멸 가능성\""
## [43] "'쇼핑몰·병원 세운다더니 또 아파트 짓나'..권선지구 주민들, 특혜의혹 제기"
## [44] "[뉴스1 PICK] '복주머니' 손에 든 윤석열..\"대장동 게이트 몸통과 싸우는 부패와의 전쟁\" 선포"
## [45] "혈액형마다 취약한 질병이 따로 있다? (연구)"
## [46] "中석탄공급·전력난해소·요소가격 하락, 韓요소수 대란 숨통 트일까"
## [47] "경기도·고양·김포·파주시 \"일산대교㈜는 통행료 무료화 수용하라\""
## [48] "호주 '백신 복권' 1등 당첨자는 중국계 여성..하루아침에 백만장자"
## [49] "'0선' 윤석열, 박의장에 \"의회에 국정 중심 두는 대통령 될 것\""
## [50] "홍준표 해단식 청년 300여명 몰려..洪 \"고맙다 잊지 않겠다\""
companies <- xpathSApply(page_parsed, company_xpath, xmlValue)
companies
##  [1] "세계일보"     "이데일리"     "서울신문"     "문화일보"     "오마이뉴스"
##  [6] "노컷뉴스"     "연합뉴스TV"   "노컷뉴스"     "서울신문"     "조선일보"
## [11] "동아일보"     "뉴스1"        "연합뉴스"     "이데일리"     "코메디닷컴"
## [16] "경향신문"     "머니투데이"   "연합뉴스"     "뉴스1"        "중앙일보"
## [21] "프레시안"     "디지털타임스" "프레시안"     "한겨레"       "뉴시스"
## [26] "머니투데이"   "연합뉴스"     "뉴스1"        "YTN"          "연합뉴스"
## [31] "헤럴드경제"   "연합뉴스"     "세계일보"     "동아일보"     "뉴시스"
## [36] "조선일보"     "뉴시스"       "시사저널"     "경향신문"     "헬스조선"
## [41] "아시아경제"   "뉴스1"        "뉴스1"        "뉴스1"        "하이닥"
## [46] "파이낸셜뉴스" "뉴스1"        "서울신문"     "연합뉴스"     "뉴시스"
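
For comparison, rvest (loaded above but not yet used) offers the same extraction through read_html(), html_elements(), and html_text(); a minimal sketch:

# Equivalent extraction with rvest
page_rv <- read_html("https://news.daum.net/ranking/popular")
headlines_rv <- html_text(html_elements(page_rv, xpath = headline_xpath))
companies_rv <- html_text(html_elements(page_rv, xpath = company_xpath))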

Five steps of web scraping

  1. We identify the paging mechanism in the URL syntax.
  2. We retrieve the links to the pages to be scraped.
  3. We download those pages.
  4. We retrieve the links to the single entries on each page.
  5. We download the single entries.
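
A minimal sketch of how the five steps map onto the Daum example (day_urls and article_links are illustrative names; the XPaths are the ones defined above):

# Step 1: the regDate parameter is the paging mechanism
# Step 2: build the links to the ranking pages
day_urls <- str_c("https://news.daum.net/ranking/popular?regDate=",
                  c("20211106", "20211107"))
# Steps 3-4: download each ranking page and collect the article links
article_links <- unlist(lapply(day_urls, function(u) {
  parsed <- htmlParse(readLines(u))
  xpathSApply(parsed, headline_xpath, xmlGetAttr, "href")
}))
# Step 5: download the single entries
# articles <- lapply(article_links, readLines)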

Manipulating URLs to access multiple pages

baseurl <- "https://news.daum.net/ranking/popular?regDate="
# Last week
dates <- seq(from=20211101, to=20211107, by=1)
# Last two weeks
seq(from=20211025, to=20211107, by=1)
##  [1] 20211025 20211026 20211027 20211028 20211029 20211030 20211031 20211032
##  [9] 20211033 20211034 20211035 20211036 20211037 20211038 20211039 20211040
## [17] 20211041 20211042 20211043 20211044 20211045 20211046 20211047 20211048
## [25] 20211049 20211050 20211051 20211052 20211053 20211054 20211055 20211056
## [33] 20211057 20211058 20211059 20211060 20211061 20211062 20211063 20211064
## [41] 20211065 20211066 20211067 20211068 20211069 20211070 20211071 20211072
## [49] 20211073 20211074 20211075 20211076 20211077 20211078 20211079 20211080
## [57] 20211081 20211082 20211083 20211084 20211085 20211086 20211087 20211088
## [65] 20211089 20211090 20211091 20211092 20211093 20211094 20211095 20211096
## [73] 20211097 20211098 20211099 20211100 20211101 20211102 20211103 20211104
## [81] 20211105 20211106 20211107
A purely numeric sequence produces impossible dates such as 20211032, so we build the sequence from Date objects instead:

seq(from=as.Date("2021-10-25"), to=as.Date("2021-11-07"), by= "day")
##  [1] "2021-10-25" "2021-10-26" "2021-10-27" "2021-10-28" "2021-10-29"
##  [6] "2021-10-30" "2021-10-31" "2021-11-01" "2021-11-02" "2021-11-03"
## [11] "2021-11-04" "2021-11-05" "2021-11-06" "2021-11-07"
dates <- seq(from=as.Date("2021-10-25"), to=as.Date("2021-11-07"), by= "day")
dates
##  [1] "2021-10-25" "2021-10-26" "2021-10-27" "2021-10-28" "2021-10-29"
##  [6] "2021-10-30" "2021-10-31" "2021-11-01" "2021-11-02" "2021-11-03"
## [11] "2021-11-04" "2021-11-05" "2021-11-06" "2021-11-07"
# Convert the dates to yyyymmdd strings by removing the hyphens
# (str_remove_all() coerces the Date objects to character)
str_remove_all(dates, "-")
##  [1] "20211025" "20211026" "20211027" "20211028" "20211029" "20211030"
##  [7] "20211031" "20211101" "20211102" "20211103" "20211104" "20211105"
## [13] "20211106" "20211107"
dates <- str_remove_all(dates, "-")
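
Alternatively, base R's format() produces the yyyymmdd strings directly from the Date vector, with no regular expression needed:

# Same result with base R date formatting
format(seq(from=as.Date("2021-10-25"), to=as.Date("2021-11-07"), by="day"), "%Y%m%d")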
# Attaching the date strings to the base URL
urls <- str_c(baseurl, dates)
urls
##  [1] "https://news.daum.net/ranking/popular?regDate=20211025"
##  [2] "https://news.daum.net/ranking/popular?regDate=20211026"
##  [3] "https://news.daum.net/ranking/popular?regDate=20211027"
##  [4] "https://news.daum.net/ranking/popular?regDate=20211028"
##  [5] "https://news.daum.net/ranking/popular?regDate=20211029"
##  [6] "https://news.daum.net/ranking/popular?regDate=20211030"
##  [7] "https://news.daum.net/ranking/popular?regDate=20211031"
##  [8] "https://news.daum.net/ranking/popular?regDate=20211101"
##  [9] "https://news.daum.net/ranking/popular?regDate=20211102"
## [10] "https://news.daum.net/ranking/popular?regDate=20211103"
## [11] "https://news.daum.net/ranking/popular?regDate=20211104"
## [12] "https://news.daum.net/ranking/popular?regDate=20211105"
## [13] "https://news.daum.net/ranking/popular?regDate=20211106"
## [14] "https://news.daum.net/ranking/popular?regDate=20211107"
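
Here str_c() simply concatenates its arguments element-wise, just like base R's paste0():

# Base R equivalent of str_c()
urls <- paste0(baseurl, dates)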

What is a function? And what do lapply() and sapply() do?

urls
##  [1] "https://news.daum.net/ranking/popular?regDate=20211025"
##  [2] "https://news.daum.net/ranking/popular?regDate=20211026"
##  [3] "https://news.daum.net/ranking/popular?regDate=20211027"
##  [4] "https://news.daum.net/ranking/popular?regDate=20211028"
##  [5] "https://news.daum.net/ranking/popular?regDate=20211029"
##  [6] "https://news.daum.net/ranking/popular?regDate=20211030"
##  [7] "https://news.daum.net/ranking/popular?regDate=20211031"
##  [8] "https://news.daum.net/ranking/popular?regDate=20211101"
##  [9] "https://news.daum.net/ranking/popular?regDate=20211102"
## [10] "https://news.daum.net/ranking/popular?regDate=20211103"
## [11] "https://news.daum.net/ranking/popular?regDate=20211104"
## [12] "https://news.daum.net/ranking/popular?regDate=20211105"
## [13] "https://news.daum.net/ranking/popular?regDate=20211106"
## [14] "https://news.daum.net/ranking/popular?regDate=20211107"
# Helper: remove the first run of digits from a character string
remove_numbers <- function(x){
  y <- str_remove(x, "[[:digit:]]+")
  return(y)
}
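
Called on a single URL, the helper strips the first run of digits, i.e. the date:

remove_numbers(urls[1])
## [1] "https://news.daum.net/ranking/popular?regDate="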

?lapply
lapply(urls, remove_numbers)
## [[1]]
## [1] "https://news.daum.net/ranking/popular?regDate="
## 
## [[2]]
## [1] "https://news.daum.net/ranking/popular?regDate="
## 
## [[3]]
## [1] "https://news.daum.net/ranking/popular?regDate="
## 
## [[4]]
## [1] "https://news.daum.net/ranking/popular?regDate="
## 
## [[5]]
## [1] "https://news.daum.net/ranking/popular?regDate="
## 
## [[6]]
## [1] "https://news.daum.net/ranking/popular?regDate="
## 
## [[7]]
## [1] "https://news.daum.net/ranking/popular?regDate="
## 
## [[8]]
## [1] "https://news.daum.net/ranking/popular?regDate="
## 
## [[9]]
## [1] "https://news.daum.net/ranking/popular?regDate="
## 
## [[10]]
## [1] "https://news.daum.net/ranking/popular?regDate="
## 
## [[11]]
## [1] "https://news.daum.net/ranking/popular?regDate="
## 
## [[12]]
## [1] "https://news.daum.net/ranking/popular?regDate="
## 
## [[13]]
## [1] "https://news.daum.net/ranking/popular?regDate="
## 
## [[14]]
## [1] "https://news.daum.net/ranking/popular?regDate="

Let's apply functions that extract the headlines and companies from each URL

headline_extractor <- function(pageurl) {
  # Download and parse one ranking page, then pull out all 50 headlines
  page <- readLines(pageurl)
  page_parsed <- htmlParse(page)
  headlines <- xpathSApply(page_parsed, headline_xpath, xmlValue)
  return(headlines)
}

headline_list <- lapply(urls, headline_extractor)
class(headline_list)
## [1] "list"
length(headline_list)
## [1] 14
company_extractor <- function(pageurl) {
  # Same as above, but extracts the news company names
  page <- readLines(pageurl)
  page_parsed <- htmlParse(page)
  companies <- xpathSApply(page_parsed, company_xpath, xmlValue)
  return(companies)
}

company_list <- lapply(urls, company_extractor)
length(company_list)
## [1] 14
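
Since the two extractors differ only in their XPath, they can be folded into a single helper; a sketch (node_extractor is a hypothetical name, and the Sys.sleep() pause is an optional courtesy to the server):

# Hypothetical generic extractor, parameterized by XPath
node_extractor <- function(pageurl, xpath) {
  page_parsed <- htmlParse(readLines(pageurl))
  Sys.sleep(1)  # pause briefly between requests
  xpathSApply(page_parsed, xpath, xmlValue)
}

# lapply() forwards the extra xpath argument to node_extractor()
headline_list <- lapply(urls, node_extractor, xpath = headline_xpath)
company_list  <- lapply(urls, node_extractor, xpath = company_xpath)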