Overview

This document looks at gaining further insights into the state of the labour market from open-source data obtained by scraping the web. Web scraping is the programmatic extraction of data from the HTML code of websites.

Loading Libraries

library(XML)
library(RCurl)
library(httr)
## Warning: package 'httr' was built under R version 3.6.3

Getting Data from Webpages

# Download the raw HTML of the RPubs profile page. `readLines()` returns a
# character vector with one element per line of markup.
con <- url("https://rpubs.com/chenlianghe")
# `finally` guarantees the connection is closed (and released) even when the
# download fails part-way, instead of leaving it open until garbage collection.
htmlCode <- tryCatch(readLines(con), finally = close(con))
# Print the markup so the page structure can be inspected.
htmlCode
##   [1] "<!DOCTYPE html>"                                                                                                                                                                                                                                                                                                                                                                                                                                                  
##   [2] "<html lang='en'>"                                                                                                                                                                                                                                                                                                                                                                                                                                                 
##   [3] "<head>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##   [4] "<meta content='IE=edge' http-equiv='X-UA-Compatible'>"                                                                                                                                                                                                                                                                                                                                                                                                            
##   [5] "<title>RPubs</title>"                                                                                                                                                                                                                                                                                                                                                                                                                                             
##   [6] "<meta name=\"csrf-param\" content=\"authenticity_token\" />"                                                                                                                                                                                                                                                                                                                                                                                                      
##   [7] "<meta name=\"csrf-token\" content=\"01P6guApzgGcrVzmuMFNoc5OKcbZqBGnNHIQHxDpI3fasdcw8U23w8/byrZUArV+0Mz154sYOsAgijas/nsSOg==\" />"                                                                                                                                                                                                                                                                                                                                
##   [8] "<link rel=\"stylesheet\" media=\"all\" href=\"/assets/application-3956e416c438f98e8d8b82b039d6ac6cd5417ad8d51825485256a39737302686.css\" />"                                                                                                                                                                                                                                                                                                                      
##   [9] "<script src=\"/assets/application-050918065a747f23455921e989643a0f9050e5da8573c9858fc4266f0ec88af2.js\"></script>"                                                                                                                                                                                                                                                                                                                                                
##  [10] "<meta content='width=device-width, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0, user-scalable=0' name='viewport'>"                                                                                                                                                                                                                                                                                                                                    
##  [11] "<link rel=\"stylesheet\" href=\"https://use.typekit.net/tzi3tjz.css\">"                                                                                                                                                                                                                                                                                                                                                                                           
##  [12] "<script>"                                                                                                                                                                                                                                                                                                                                                                                                                                                         
##  [13] "  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){"                                                                                                                                                                                                                                                                                                                                                                                   
##  [14] "  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),"                                                                                                                                                                                                                                                                                                                                                                                 
##  [15] "  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)"                                                                                                                                                                                                                                                                                                                                                                                
##  [16] "  })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');"                                                                                                                                                                                                                                                                                                                                                                             
##  [17] "  ga('create', 'UA-20375833-3', 'auto', {'allowLinker': true});"                                                                                                                                                                                                                                                                                                                                                                                                  
##  [18] "  ga('require', 'linker');"                                                                                                                                                                                                                                                                                                                                                                                                                                       
##  [19] "  ga('linker:autoLink', ['rstudio.com', 'rstudio.github.io', 'rviews.rstudio.com', 'community.rstudio.com', 'rpubs.rstudio.com', 'environments.rstudio.com', 'rstudio.org', 'dailies.rstudio.com', 'pages.rstudio.com', 'db.rstudio.com', 'solutions.rstudio.com', 'docs.rstudio.com', 'spark.rstudio.com', 'shiny.rstudio.com', 'education.rstudio.com', 'rstudio.cloud', 'shinyapps.io', 'teamadmin.rstudio.com', 'blog.rstudio.com', 'support.rstudio.com'] );"
##  [20] "  ga('send', 'pageview');"                                                                                                                                                                                                                                                                                                                                                                                                                                        
##  [21] "</script>"                                                                                                                                                                                                                                                                                                                                                                                                                                                        
##  [22] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
##  [23] "</head>"                                                                                                                                                                                                                                                                                                                                                                                                                                                          
##  [24] "<body>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [25] "<div class='modal' id='login' style='display: none'>"                                                                                                                                                                                                                                                                                                                                                                                                             
##  [26] "<div class='modal-header'>"                                                                                                                                                                                                                                                                                                                                                                                                                                       
##  [27] "<h1>Sign In</h1>"                                                                                                                                                                                                                                                                                                                                                                                                                                                 
##  [28] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [29] "<div class='modal-body'>"                                                                                                                                                                                                                                                                                                                                                                                                                                         
##  [30] "<div class='alert' id='login_message' style='display: none'></div>"                                                                                                                                                                                                                                                                                                                                                                                               
##  [31] "<form action=\"/auth/login\" accept-charset=\"UTF-8\" method=\"post\"><input name=\"utf8\" type=\"hidden\" value=\"&#x2713;\" /><input type=\"hidden\" name=\"authenticity_token\" value=\"w8cjcQYaUEOtwqcd8cEe9c5gwmY1cC2ohw4BKtVe6rzTP+ud28RTpvT/T5Q8rjhwOzPSDVhv04h3N1GI0XCq3w==\" />"                                                                                                                                                                         
##  [32] "<input name='return_url' type='hidden'>"                                                                                                                                                                                                                                                                                                                                                                                                                          
##  [33] "<div class='fieldset'>"                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [34] "<div class='control-group'>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
##  [35] "<label class='control-label' for='login_username'>Username or Email</label>"                                                                                                                                                                                                                                                                                                                                                                                      
##  [36] "<div class='controls'>"                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [37] "<input class='input-xlarge' id='login_username' name='username' type='text'>"                                                                                                                                                                                                                                                                                                                                                                                     
##  [38] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [39] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [40] "<div class='control-group'>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
##  [41] "<label class='control-label' for='login_password'>Password</label>"                                                                                                                                                                                                                                                                                                                                                                                               
##  [42] "<div class='controls'>"                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [43] "<input class='input-xlarge' id='login_password' name='password' type='password'>"                                                                                                                                                                                                                                                                                                                                                                                 
##  [44] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [45] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [46] "<div class='control-group'>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
##  [47] "<a href='/auth/passwordhelp' target='_blank'>Forgot your password?</a>"                                                                                                                                                                                                                                                                                                                                                                                           
##  [48] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [49] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [50] "</form>"                                                                                                                                                                                                                                                                                                                                                                                                                                                          
##  [51] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
##  [52] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
##  [53] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [54] "<div class='modal-footer'>"                                                                                                                                                                                                                                                                                                                                                                                                                                       
##  [55] "<button class='btn btn-primary' id='login-modal-submit'>Sign In</button>"                                                                                                                                                                                                                                                                                                                                                                                         
##  [56] "<button class='btn' id='login-modal-cancel'>Cancel</button>"                                                                                                                                                                                                                                                                                                                                                                                                      
##  [57] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [58] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [59] "<div id='main'>"                                                                                                                                                                                                                                                                                                                                                                                                                                                  
##  [60] "<div id='pageheader'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
##  [61] "<div id='branding'>"                                                                                                                                                                                                                                                                                                                                                                                                                                              
##  [62] "<h1 id='logo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                                   
##  [63] "<a href='/'><span id='R'>R</span>Pubs"                                                                                                                                                                                                                                                                                                                                                                                                                            
##  [64] "</a>"                                                                                                                                                                                                                                                                                                                                                                                                                                                             
##  [65] "</h1>"                                                                                                                                                                                                                                                                                                                                                                                                                                                            
##  [66] "<span id='tagline'>by RStudio</span>"                                                                                                                                                                                                                                                                                                                                                                                                                             
##  [67] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [68] "<div id='identity'>"                                                                                                                                                                                                                                                                                                                                                                                                                                              
##  [69] "<div class='btn-group'>"                                                                                                                                                                                                                                                                                                                                                                                                                                          
##  [70] "<a class='btn btn-small pull-right' href='#' onclick='rpubs_showLogin(); return false'>"                                                                                                                                                                                                                                                                                                                                                                          
##  [71] "Sign in"                                                                                                                                                                                                                                                                                                                                                                                                                                                          
##  [72] "</a>"                                                                                                                                                                                                                                                                                                                                                                                                                                                             
##  [73] "<a class='btn btn-small pull-right' href='/users/new'>"                                                                                                                                                                                                                                                                                                                                                                                                           
##  [74] "Register"                                                                                                                                                                                                                                                                                                                                                                                                                                                         
##  [75] "</a>"                                                                                                                                                                                                                                                                                                                                                                                                                                                             
##  [76] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [77] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [78] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [79] "<div id='pagebody'>"                                                                                                                                                                                                                                                                                                                                                                                                                                              
##  [80] "<div class='pagebodyhead'>"                                                                                                                                                                                                                                                                                                                                                                                                                                       
##  [81] "<div class='userblock'>"                                                                                                                                                                                                                                                                                                                                                                                                                                          
##  [82] "<a href=\"https://rpubs.com/chenlianghe\" class=\"gravatar-link\"><img src=\"https://secure.gravatar.com/avatar/dd8355468216b0969ca61f3916f31be0?s=64\" class=\"gravatar\" alt=\"gravatar\" style=\"width: 64px; height: 64px\"/></a>"                                                                                                                                                                                                                            
##  [83] "<h1>chenlianghe</h1>"                                                                                                                                                                                                                                                                                                                                                                                                                                             
##  [84] "<h3>Lianghe</h3>"                                                                                                                                                                                                                                                                                                                                                                                                                                                 
##  [85] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [86] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [87] "<h2>Recently Published</h2>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
##  [88] "<div class='pubs'>"                                                                                                                                                                                                                                                                                                                                                                                                                                               
##  [89] "<div class='pubrow2'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
##  [90] "<div class='pubrow3'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
##  [91] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
##  [92] "<a href=\"https://rpubs.com/chenlianghe/607956\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/59e9570dbe43dac4f8ff9c03b6a4d97c/png/?thumbnail_max_width=200&amp;unique=1588416714&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607956_b530c49de6ac4f05ba587bc9f704e964.html&amp;viewport=960x960\" /></a>"                                                                                      
##  [93] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
##  [94] "<h5><a href=\"https://rpubs.com/chenlianghe/607956\">2020-04-21 Test B</a></h5>"                                                                                                                                                                                                                                                                                                                                                                                  
##  [95] "<div class='desc'>Question 1</div>"                                                                                                                                                                                                                                                                                                                                                                                                                               
##  [96] "<time datetime='2020-05-02T10:51:54+00:00'>4 minutes ago</time>"                                                                                                                                                                                                                                                                                                                                                                                                  
##  [97] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [98] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [99] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [100] "<a href=\"https://rpubs.com/chenlianghe/607951\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/68f73ee1954accb5d5f1aa8932bb4564/png/?thumbnail_max_width=200&amp;unique=1588415808&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607951_5f08303483a44fe4afc8300393e14635.html&amp;viewport=960x960\" /></a>"                                                                                      
## [101] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [102] "<h5><a href=\"https://rpubs.com/chenlianghe/607951\">2020-04-21 Test A</a></h5>"                                                                                                                                                                                                                                                                                                                                                                                  
## [103] "<div class='desc'>Question 1</div>"                                                                                                                                                                                                                                                                                                                                                                                                                               
## [104] "<time datetime='2020-05-02T10:36:48+00:00'>19 minutes ago</time>"                                                                                                                                                                                                                                                                                                                                                                                                 
## [105] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [106] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [107] "<div class='pubrow2'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [108] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [109] "<a href=\"https://rpubs.com/chenlianghe/607949\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/0cbd86eb98162fa54426a10a63f0776c/png/?thumbnail_max_width=200&amp;unique=1588415575&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607949_7efc18f05db041fca90841aa1b96a697.html&amp;viewport=960x960\" /></a>"                                                                                      
## [110] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [111] "<h5><a href=\"https://rpubs.com/chenlianghe/607949\">2020-04-17 Data Scientist Assignment</a></h5>"                                                                                                                                                                                                                                                                                                                                                               
## [112] "<div class='desc'>Question 4 [Topic Modelling]</div>"                                                                                                                                                                                                                                                                                                                                                                                                             
## [113] "<time datetime='2020-05-02T10:32:55+00:00'>23 minutes ago</time>"                                                                                                                                                                                                                                                                                                                                                                                                 
## [114] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [115] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [116] "<div class='pubrow3'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [117] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [118] "<a href=\"https://rpubs.com/chenlianghe/607948\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/e1ccb8e6f587e289413404718f2c4f6d/png/?thumbnail_max_width=200&amp;unique=1588415401&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607948_a5671772c11447d0907be2f0e089bee6.html&amp;viewport=960x960\" /></a>"                                                                                      
## [119] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [120] "<h5><a href=\"https://rpubs.com/chenlianghe/607948\">2020-04-17 Data Scientist Assignment</a></h5>"                                                                                                                                                                                                                                                                                                                                                               
## [121] "<div class='desc'>Question 3 [Estimating Treatment Effects]</div>"                                                                                                                                                                                                                                                                                                                                                                                                
## [122] "<time datetime='2020-05-02T10:30:01+00:00'>25 minutes ago</time>"                                                                                                                                                                                                                                                                                                                                                                                                 
## [123] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [124] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [125] "<div class='pubrow2'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [126] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [127] "<a href=\"https://rpubs.com/chenlianghe/607947\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/503752cebe0979c536b2d0f790b0a15e/png/?thumbnail_max_width=200&amp;unique=1588415175&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607947_0ca57ca39ec140ca8827d6469e60b990.html&amp;viewport=960x960\" /></a>"                                                                                      
## [128] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [129] "<h5><a href=\"https://rpubs.com/chenlianghe/607947\">2020-04-17 Data Scientist Assignment</a></h5>"                                                                                                                                                                                                                                                                                                                                                               
## [130] "<div class='desc'>Question 2 [Hedonic Regression Modelling]</div>"                                                                                                                                                                                                                                                                                                                                                                                                
## [131] "<time datetime='2020-05-02T10:26:15+00:00'>29 minutes ago</time>"                                                                                                                                                                                                                                                                                                                                                                                                 
## [132] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [133] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [134] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [135] "<a href=\"https://rpubs.com/chenlianghe/607946\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/782c86c0628e8b3f2153bc8405f3385c/png/?thumbnail_max_width=200&amp;unique=1588414896&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607946_52f049f868714d4a8251de1b56aca70d.html&amp;viewport=960x960\" /></a>"                                                                                      
## [136] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [137] "<h5><a href=\"https://rpubs.com/chenlianghe/607946\">2020-04-17 Data Scientist Assignment</a></h5>"                                                                                                                                                                                                                                                                                                                                                               
## [138] "<div class='desc'>Question 1 [Geospatial Data Visualization]</div>"                                                                                                                                                                                                                                                                                                                                                                                               
## [139] "<time datetime='2020-05-02T10:21:36+00:00'>34 minutes ago</time>"                                                                                                                                                                                                                                                                                                                                                                                                 
## [140] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [141] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [142] "<div class='pubrow2'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [143] "<div class='pubrow3'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [144] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [145] "<a href=\"https://rpubs.com/chenlianghe/607944\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/71d729e70ee99d41828bb51c2b3ee533/png/?thumbnail_max_width=200&amp;unique=1588414339&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607944_e8371d14ee0b416db0e83c701ec32a95.html&amp;viewport=960x960\" /></a>"                                                                                      
## [146] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [147] "<h5><a href=\"https://rpubs.com/chenlianghe/607944\">2020-03-17 Research Assignment</a></h5>"                                                                                                                                                                                                                                                                                                                                                                     
## [148] "<div class='desc'>Question Part (b)</div>"                                                                                                                                                                                                                                                                                                                                                                                                                        
## [149] "<time datetime='2020-05-02T10:12:19+00:00'>43 minutes ago</time>"                                                                                                                                                                                                                                                                                                                                                                                                 
## [150] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [151] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [152] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [153] "<a href=\"https://rpubs.com/chenlianghe/607943\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/def9e94b5de4024dfc72789bf104ea1e/png/?thumbnail_max_width=200&amp;unique=1588414283&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607943_43b5e10d887e4aa58ce88fdc387ce97d.html&amp;viewport=960x960\" /></a>"                                                                                      
## [154] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [155] "<h5><a href=\"https://rpubs.com/chenlianghe/607943\">2020-03-17 Research Assignment</a></h5>"                                                                                                                                                                                                                                                                                                                                                                     
## [156] "<div class='desc'>Question Part (a)</div>"                                                                                                                                                                                                                                                                                                                                                                                                                        
## [157] "<time datetime='2020-05-02T10:11:23+00:00'>44 minutes ago</time>"                                                                                                                                                                                                                                                                                                                                                                                                 
## [158] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [159] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [160] "<div class='pubrow2'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [161] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [162] "<a href=\"https://rpubs.com/chenlianghe/607927\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/fdc1bef9e296fe1edd6d9dce6828665b/png/?thumbnail_max_width=200&amp;unique=1588411461&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607927_51d205d54b854e22b11fc6ed8623a9ba.html&amp;viewport=960x960\" /></a>"                                                                                      
## [163] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [164] "<h5><a href=\"https://rpubs.com/chenlianghe/607927\">2020-03-13 Case Study</a></h5>"                                                                                                                                                                                                                                                                                                                                                                              
## [165] "<div class='desc'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                         
## [166] "<time datetime='2020-05-02T09:24:21+00:00'>about 2 hours ago</time>"                                                                                                                                                                                                                                                                                                                                                                                              
## [167] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [168] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [169] "<div class='pubrow3'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [170] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [171] "<a href=\"https://rpubs.com/chenlianghe/607923\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/27e34e0e5442cd14f29d5c8bace34492/png/?thumbnail_max_width=200&amp;unique=1588411021&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607923_00df51131e4e4c6099d9d64c8f01d977.html&amp;viewport=960x960\" /></a>"                                                                                      
## [172] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [173] "<h5><a href=\"https://rpubs.com/chenlianghe/607923\">2020-03-06 Assessment</a></h5>"                                                                                                                                                                                                                                                                                                                                                                              
## [174] "<div class='desc'>Question 2</div>"                                                                                                                                                                                                                                                                                                                                                                                                                               
## [175] "<time datetime='2020-05-02T09:17:01+00:00'>about 2 hours ago</time>"                                                                                                                                                                                                                                                                                                                                                                                              
## [176] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [177] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [178] "<div class='pubrow2'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [179] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [180] "<a href=\"https://rpubs.com/chenlianghe/607916\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/d313fde96454558521ea7f3aa24e66df/png/?thumbnail_max_width=200&amp;unique=1588410511&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607916_a63c74e75b41457687ffc8afd54af9c2.html&amp;viewport=960x960\" /></a>"                                                                                      
## [181] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [182] "<h5><a href=\"https://rpubs.com/chenlianghe/607916\">2020-03-06 Assessment</a></h5>"                                                                                                                                                                                                                                                                                                                                                                              
## [183] "<div class='desc'>Question 1</div>"                                                                                                                                                                                                                                                                                                                                                                                                                               
## [184] "<time datetime='2020-05-02T09:08:31+00:00'>about 2 hours ago</time>"                                                                                                                                                                                                                                                                                                                                                                                              
## [185] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [186] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [187] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [188] "<a href=\"https://rpubs.com/chenlianghe/607893\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/95e3d0f0528cf7224d6557a827b74b35/png/?thumbnail_max_width=200&amp;unique=1588401433&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607893_041c277ab49b4afbb7ae0558169fe97e.html&amp;viewport=960x960\" /></a>"                                                                                      
## [189] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [190] "<h5><a href=\"https://rpubs.com/chenlianghe/607893\">A Fast and Easy Way to Predict Words</a></h5>"                                                                                                                                                                                                                                                                                                                                                               
## [191] "<div class='desc'>Coursera Data Science Specialization"                                                                                                                                                                                                                                                                                                                                                                                                           
## [192] "Data Science Capstone Final Project Submission</div>"                                                                                                                                                                                                                                                                                                                                                                                                             
## [193] "<time datetime='2020-05-02T06:37:13+00:00'>about 4 hours ago</time>"                                                                                                                                                                                                                                                                                                                                                                                              
## [194] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [195] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [196] "<div class='pubrow2'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [197] "<div class='pubrow3'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [198] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [199] "<a href=\"https://rpubs.com/chenlianghe/607892\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/a4de68f3ed98d690d5def280d654da14/png/?thumbnail_max_width=200&amp;unique=1588401280&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607892_4d3bc5f14cf54ff79a1175944fdb8833.html&amp;viewport=960x960\" /></a>"                                                                                      
## [200] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [201] "<h5><a href=\"https://rpubs.com/chenlianghe/607892\">Data Science Capstone Milestone Report</a></h5>"                                                                                                                                                                                                                                                                                                                                                             
## [202] "<div class='desc'>Coursera Data Science Specialization</div>"                                                                                                                                                                                                                                                                                                                                                                                                     
## [203] "<time datetime='2020-05-02T06:34:40+00:00'>about 4 hours ago</time>"                                                                                                                                                                                                                                                                                                                                                                                              
## [204] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [205] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [206] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [207] "<a href=\"https://rpubs.com/chenlianghe/607889\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/2b4fdebf985d6fc4d7fb5d7aa3c80ef7/png/?thumbnail_max_width=200&amp;unique=1588400709&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607889_6af48a22e2c8483e94a7f52ce1e3fdca.html&amp;viewport=960x960\" /></a>"                                                                                      
## [208] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [209] "<h5><a href=\"https://rpubs.com/chenlianghe/607889\">Prediction Assignment Writeup</a></h5>"                                                                                                                                                                                                                                                                                                                                                                      
## [210] "<div class='desc'>Coursera Data Science Specialization</div>"                                                                                                                                                                                                                                                                                                                                                                                                     
## [211] "<time datetime='2020-05-02T06:25:09+00:00'>about 5 hours ago</time>"                                                                                                                                                                                                                                                                                                                                                                                              
## [212] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [213] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [214] "<div class='pubrow2'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [215] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [216] "<a href=\"https://rpubs.com/chenlianghe/607885\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/dda035ed6b189ce2d34a84bde01aac53/png/?thumbnail_max_width=200&amp;unique=1588399901&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607885_18f3f831056746409facc4beb8c22852.html&amp;viewport=960x960\" /></a>"                                                                                      
## [217] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [218] "<h5><a href=\"https://rpubs.com/chenlianghe/607885\">Prediction of a Car&#39;s Miles per Gallon (MPG)</a></h5>"                                                                                                                                                                                                                                                                                                                                                   
## [219] "<div class='desc'>Coursera Data Science Specialization"                                                                                                                                                                                                                                                                                                                                                                                                           
## [220] "Shiny Application and Reproducible Pitch</div>"                                                                                                                                                                                                                                                                                                                                                                                                                   
## [221] "<time datetime='2020-05-02T06:11:41+00:00'>about 5 hours ago</time>"                                                                                                                                                                                                                                                                                                                                                                                              
## [222] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [223] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [224] "<div class='pubrow3'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [225] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [226] "<a href=\"https://rpubs.com/chenlianghe/607884\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/3911a538ecfc0e71dc42846d146bf78c/png/?thumbnail_max_width=200&amp;unique=1588399771&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607884_a8dcd52c474a43e99f81fbf7bdbe2493.html&amp;viewport=960x960\" /></a>"                                                                                      
## [227] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [228] "<h5><a href=\"https://rpubs.com/chenlianghe/607884\">Regression Models Course Project</a></h5>"                                                                                                                                                                                                                                                                                                                                                                   
## [229] "<div class='desc'>Coursera Data Science Specialization</div>"                                                                                                                                                                                                                                                                                                                                                                                                     
## [230] "<time datetime='2020-05-02T06:09:31+00:00'>about 5 hours ago</time>"                                                                                                                                                                                                                                                                                                                                                                                              
## [231] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [232] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [233] "<div class='pubrow2'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [234] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [235] "<a href=\"https://rpubs.com/chenlianghe/607879\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/df4ce2a9577517d8c5612ae770a0d722/png/?thumbnail_max_width=200&amp;unique=1588398086&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607879_e39cd0ff774b46cbac993be9f35019a0.html&amp;viewport=960x960\" /></a>"                                                                                      
## [236] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [237] "<h5><a href=\"https://rpubs.com/chenlianghe/607879\">R Markdown Presentation &amp; Plotly</a></h5>"                                                                                                                                                                                                                                                                                                                                                               
## [238] "<div class='desc'>Coursera Data Science Specialization</div>"                                                                                                                                                                                                                                                                                                                                                                                                     
## [239] "<time datetime='2020-05-02T05:41:26+00:00'>about 5 hours ago</time>"                                                                                                                                                                                                                                                                                                                                                                                              
## [240] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [241] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [242] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [243] "<a href=\"https://rpubs.com/chenlianghe/607878\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/f4329b5a4a969f735606bc391a748421/png/?thumbnail_max_width=200&amp;unique=1588397968&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607878_3729d89267a94d6ab83a1298f3ffccc6.html&amp;viewport=960x960\" /></a>"                                                                                      
## [244] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [245] "<h5><a href=\"https://rpubs.com/chenlianghe/607878\">R Markdown and Leaflet</a></h5>"                                                                                                                                                                                                                                                                                                                                                                             
## [246] "<div class='desc'>Coursera Data Science Specialization</div>"                                                                                                                                                                                                                                                                                                                                                                                                     
## [247] "<time datetime='2020-05-02T05:39:28+00:00'>about 5 hours ago</time>"                                                                                                                                                                                                                                                                                                                                                                                              
## [248] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [249] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [250] "<div class='pubrow2'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [251] "<div class='pubrow3'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [252] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [253] "<a href=\"https://rpubs.com/chenlianghe/607877\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/8b3156369b3689722af011a3fb54c56d/png/?thumbnail_max_width=200&amp;unique=1588397896&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607877_c107ab0c828c4be79be4a91addaecec9.html&amp;viewport=960x960\" /></a>"                                                                                      
## [254] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [255] "<h5><a href=\"https://rpubs.com/chenlianghe/607877\">Statistical Inference Course Project Part 2: Basic Inferential Data Analysis</a></h5>"                                                                                                                                                                                                                                                                                                                       
## [256] "<div class='desc'>Coursera Data Science Specialization</div>"                                                                                                                                                                                                                                                                                                                                                                                                     
## [257] "<time datetime='2020-05-02T05:38:16+00:00'>about 5 hours ago</time>"                                                                                                                                                                                                                                                                                                                                                                                              
## [258] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [259] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [260] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [261] "<a href=\"https://rpubs.com/chenlianghe/607876\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/0fe57a4649e2538ba7f068e8bf5accff/png/?thumbnail_max_width=200&amp;unique=1588397833&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607876_c77b320a24a44cbfb43efb9f750beb15.html&amp;viewport=960x960\" /></a>"                                                                                      
## [262] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [263] "<h5><a href=\"https://rpubs.com/chenlianghe/607876\">Statistical Inference Course Project Part 1: Simulation Exercise</a></h5>"                                                                                                                                                                                                                                                                                                                                   
## [264] "<div class='desc'>Coursera Data Science Specialization</div>"                                                                                                                                                                                                                                                                                                                                                                                                     
## [265] "<time datetime='2020-05-02T05:37:13+00:00'>about 5 hours ago</time>"                                                                                                                                                                                                                                                                                                                                                                                              
## [266] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [267] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [268] "<div class='pubrow2'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [269] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [270] "<a href=\"https://rpubs.com/chenlianghe/607875\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/8234aecd74d94dbccafd3422302aeea5/png/?thumbnail_max_width=200&amp;unique=1588397610&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607875_4ee0d922c1d34cce8621e4dbe7c47e51.html&amp;viewport=960x960\" /></a>"                                                                                      
## [271] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [272] "<h5><a href=\"https://rpubs.com/chenlianghe/607875\">Reproducible Research: Course Project 2</a></h5>"                                                                                                                                                                                                                                                                                                                                                            
## [273] "<div class='desc'>Coursera Data Science Specialization</div>"                                                                                                                                                                                                                                                                                                                                                                                                     
## [274] "<time datetime='2020-05-02T05:33:30+00:00'>about 5 hours ago</time>"                                                                                                                                                                                                                                                                                                                                                                                              
## [275] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [276] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [277] "<div class='pubrow3'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [278] "<div class='pubtile'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [279] "<a href=\"https://rpubs.com/chenlianghe/607866\"><img class=\"pubthumb\" alt=\"\" src=\"https://d38j8069scdbm5.cloudfront.net/v6/P5441C759E0FFE/72552985d5bf38144abb7194962a092a/png/?thumbnail_max_width=200&amp;unique=1588396111&amp;url=https%3A%2F%2Frstudio-pubs-static.s3.amazonaws.com%2F607866_75ca91eab68646d98eb211be08a8c1ac.html&amp;viewport=960x960\" /></a>"                                                                                      
## [280] "<div class='pubinfo'>"                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [281] "<h5><a href=\"https://rpubs.com/chenlianghe/607866\">Reproducible Research: Peer Assessment 1</a></h5>"                                                                                                                                                                                                                                                                                                                                                           
## [282] "<div class='desc'>Coursera Data Science Specialization</div>"                                                                                                                                                                                                                                                                                                                                                                                                     
## [283] "<time datetime='2020-05-02T05:08:31+00:00'>about 6 hours ago</time>"                                                                                                                                                                                                                                                                                                                                                                                              
## [284] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [285] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [286] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
## [287] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [288] ""                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
## [289] "<div class='clearfix'></div>"                                                                                                                                                                                                                                                                                                                                                                                                                                     
## [290] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [291] "</div>"                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [292] "</body>"                                                                                                                                                                                                                                                                                                                                                                                                                                                          
## [293] "</html>"

Parsing with XML

# Fetch the Google Scholar profile page with RCurl and pull out its title.
# Note: `url` is also the name of base R's connection function used earlier
# in this file; calls such as url("...") still resolve to that function.
url <- "https://scholar.google.com/citations?hl=en&user=EXk94uwAAAAJ"
fileurl <- getURL(url)
# `fileurl` holds the page markup itself (not a path or URL), so state that
# explicitly with asText = TRUE instead of leaving the parser to guess.
html <- htmlTreeParse(fileurl, asText = TRUE, useInternalNodes = TRUE)
# Return the text of every <title> node matched by the XPath expression.
xpathSApply(html, "//title", xmlValue)
## [1] "Lianghe Chen - Google Scholar Citations"

GET Function from httr Package

# Fetch the same page with httr; `url` is the address assigned in the
# previous section.
html2 <- GET(url)
# Fail fast on an HTTP error status instead of parsing an error page and
# reporting its <title> as if it were the requested document.
stop_for_status(html2)
# Decode the response body to a single character string.
content2 <- content(html2, as = "text")
# Parse the markup from the string (asText = TRUE) into an internal document.
parsedHtml <- htmlParse(content2, asText = TRUE)
# Return the text of every <title> node matched by the XPath expression.
xpathSApply(parsedHtml, "//title", xmlValue)
## [1] "Lianghe Chen - Google Scholar Citations"

Accessing Websites with Passwords

# Request a basic-auth protected test endpoint without credentials; the
# response printed below shows the server answering with status 401.
pg1 <- GET("http://httpbin.org/basic-auth/user/passwd")
pg1
## Response [http://httpbin.org/basic-auth/user/passwd]
##   Date: 2020-05-02 10:55
##   Status: 401
##   Content-Type: <unknown>
## <EMPTY BODY>
# List the named components of the response object.
names(pg1)
##  [1] "url"         "status_code" "headers"     "all_headers" "cookies"    
##  [6] "content"     "date"        "times"       "request"     "handle"
# Repeat the request, this time supplying a user name and password via
# authenticate(); the response printed below shows status 200 and a JSON body.
# NOTE(review): the URL is plain http, so these credentials are presumably
# sent unencrypted -- acceptable for this test service, prefer https otherwise.
pg2 <- GET("http://httpbin.org/basic-auth/user/passwd",
           authenticate("user", "passwd"))
pg2
## Response [http://httpbin.org/basic-auth/user/passwd]
##   Date: 2020-05-02 10:55
##   Status: 200
##   Content-Type: application/json
##   Size: 47 B
## {
##   "authenticated": true, 
##   "user": "user"
## }
# The authenticated response exposes the same components as pg1.
names(pg2)
##  [1] "url"         "status_code" "headers"     "all_headers" "cookies"    
##  [6] "content"     "date"        "times"       "request"     "handle"

Using Handles

# Create an httr handle for google.com and issue a GET through it, with
# "search" as the path component of the request.
# NOTE(review): a handle is presumably meant to be reused across several
# requests to the same host; only one request is made with it here.
google <- handle("http://google.com")
pg3 <- GET(handle = google, path = "search")
# The printed response reports http://www.google.com/webhp as its URL rather
# than the requested path -- presumably the result of a server redirect.
pg3
## Response [http://www.google.com/webhp]
##   Date: 2020-05-02 10:55
##   Status: 200
##   Content-Type: text/html; charset=ISO-8859-1
##   Size: 13.6 kB
## <!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="...
## document.documentElement.addEventListener("submit",function(b){var a;if(a=b.t...
## var a=window.location,b=a.href.indexOf("#");if(0<=b){var c=a.href.substring(b...
## </style><style>body,td,a,p,.h{font-family:arial,sans-serif}body{margin:0;over...
## if (!iesg){document.f&&document.f.q.focus();document.gbqf&&document.gbqf.q.fo...
## }
## })();</script><div id="mngb"> <div id=gbar><nobr><b class=gb1>Search</b> <a c...
## else top.location='/doodles/';};})();</script><input value="AINFCbYAAAAAXq1ft...
## setTimeout(function(){var b=document;var a="SCRIPT";"application/xhtml+xml"==...
## function _F_installCss(c){}
# List the named components of the response object.
names(pg3)
##  [1] "url"         "status_code" "headers"     "all_headers" "cookies"    
##  [6] "content"     "date"        "times"       "request"     "handle"