if (!require("RSelenium")) install.packages("RSelenium", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)

require(RSelenium)
# NOTE(review): `working_path` is not defined in this part of the script; it is
# presumably set earlier - confirm before running.
setwd(working_path)

# Set up the Selenium remote driver for Chrome
remDr <- remoteDriver(browserName = "chrome")

# Launch the Chrome browser; the result is assigned to `q` so that it is not
# auto-printed
q <- remDr$open(silent = TRUE)

# Go to the US Senators page
remDr$navigate("https://www.senate.gov/senators/index.htm")

# Use XPath to find every table cell carrying the class 'sorting_1'
# (the cells holding the senators' names)
allsenators <- remDr$findElements("xpath", "//td[@class='sorting_1']")

# Extract the text of every matched element into the character vector
# us_senators. getElementText() hands the text back wrapped in a container
# (RSelenium documents a list), so [[1]] pulls out the string itself.
# vapply() also copes with zero matches by returning character(0); the former
# `for (i in 1:length(allsenators))` loop iterated over c(1, 0) in that case
# and failed on the non-existent first element.
us_senators <- vapply(
  allsenators,
  function(el) el$getElementText()[[1]],
  character(1)
)

# Data cleaning - drop the bracketed part: a space followed by "(" and
# everything up to the last ")"
us_senators <- gsub(" \\(.*\\)", "", us_senators)
#
# Query Google for each senator in turn.
# suppressMessages - silences messages emitted while looking up the answer
# tryCatch         - turns a failed lookup into the " don't know" fallback
#
for (name in us_senators) {
  # Open the Google home page (English UI) and locate the search box
  remDr$navigate("https://www.google.com/?hl=en")
  search_box <- remDr$findElement("xpath", "//input[@name='q']")

  # Type the question; the trailing "\n" is sent as part of the keystrokes
  search_box$sendKeysToElement(
    list(paste0("How old is united states senator ", name, "\n"))
  )

  # Read the text of the element with class 'Z0LcW'; fall back to
  # " don't know" when the element is missing or its text is empty
  age <- suppressMessages(tryCatch(
    {
      answer <- remDr$findElement("xpath", "//div[@class='Z0LcW']")$getElementText()
      if (nchar(answer) == 0) " don't know" else answer
    },
    error = function(e) " don't know"
  ))
  print(paste0("Age of ", name, ":", age))

  # Pause 20 seconds between searches
  Sys.sleep(20)
}
## [1] "Age of Alexander, Lamar:79 years"
## [1] "Age of Baldwin, Tammy:58 years"
## [1] "Age of Barrasso, John:67 years"
## [1] "Age of Bennet, Michael F.:55 years"
## [1] "Age of Blackburn, Marsha:67 years"
## [1] "Age of Blumenthal, Richard:74 years"
## [1] "Age of Blunt, Roy: don't know"
## [1] "Age of Booker, Cory A.:50 years"
## [1] "Age of Boozman, John:69 years"
## [1] "Age of Braun, Mike:66 years"
## [1] "Age of Brown, Sherrod:67 years"
## [1] "Age of Burr, Richard:64 years"
## [1] "Age of Cantwell, Maria:61 years"
## [1] "Age of Capito, Shelley Moore:66 years"
## [1] "Age of Cardin, Benjamin L.:76 years"
## [1] "Age of Carper, Thomas R.:73 years"
## [1] "Age of Casey, Robert P., Jr.:60 years"
## [1] "Age of Cassidy, Bill:62 years"
## [1] "Age of Collins, Susan M.:67 years"
## [1] "Age of Coons, Christopher A.:56 years"
## [1] "Age of Cornyn, John:68 years"
## [1] "Age of Cortez Masto, Catherine:56 years"
## [1] "Age of Cotton, Tom:42 years"
## [1] "Age of Cramer, Kevin:59 years"
## [1] "Age of Crapo, Mike:68 years"
## [1] "Age of Cruz, Ted:49 years"
## [1] "Age of Daines, Steve:57 years"
## [1] "Age of Duckworth, Tammy:52 years"
## [1] "Age of Durbin, Richard J.:75 years"
## [1] "Age of Enzi, Michael B.:76 years"
## [1] "Age of Ernst, Joni:49 years"
## [1] "Age of Feinstein, Dianne:86 years"
## [1] "Age of Fischer, Deb:69 years"
## [1] "Age of Gardner, Cory:45 years"
## [1] "Age of Gillibrand, Kirsten E.:53 years"
## [1] "Age of Graham, Lindsey:64 years"
## [1] "Age of Grassley, Chuck:86 years"
## [1] "Age of Harris, Kamala D.:55 years"
## [1] "Age of Hassan, Margaret Wood:62 years"
## [1] "Age of Hawley, Josh:40 years"
## [1] "Age of Heinrich, Martin:48 years"
## [1] "Age of Hirono, Mazie K.:72 years"
## [1] "Age of Hoeven, John:63 years"
## [1] "Age of Hyde-Smith, Cindy:60 years"
## [1] "Age of Inhofe, James M.:85 years"
## [1] "Age of Johnson, Ron:65 years"
## [1] "Age of Jones, Doug:65 years"
## [1] "Age of Kaine, Tim:62 years"
## [1] "Age of Kennedy, John:68 years"
## [1] "Age of King, Angus S., Jr.:76 years"
## [1] "Age of Klobuchar, Amy:59 years"
## [1] "Age of Lankford, James:52 years"
## [1] "Age of Leahy, Patrick J.:80 years"
## [1] "Age of Lee, Mike:48 years"
## [1] "Age of Loeffler, Kelly:49 years"
## [1] "Age of Manchin, Joe, III:72 years"
## [1] "Age of Markey, Edward J.:73 years"
## [1] "Age of McConnell, Mitch:78 years"
## [1] "Age of McSally, Martha:54 years"
## [1] "Age of Menendez, Robert:66 years"
## [1] "Age of Merkley, Jeff:63 years"
## [1] "Age of Moran, Jerry:65 years"
## [1] "Age of Murkowski, Lisa:62 years"
## [1] "Age of Murphy, Christopher:46 years"
## [1] "Age of Murray, Patty:69 years"
## [1] "Age of Paul, Rand:57 years"
## [1] "Age of Perdue, David:70 years"
## [1] "Age of Peters, Gary C.:61 years"
## [1] "Age of Portman, Rob:64 years"
## [1] "Age of Reed, Jack:70 years"
## [1] "Age of Risch, James E.:76 years"
## [1] "Age of Roberts, Pat:83 years"
## [1] "Age of Romney, Mitt:73 years"
## [1] "Age of Rosen, Jacky:62 years"
## [1] "Age of Rounds, Mike:65 years"
## [1] "Age of Rubio, Marco:48 years"
## [1] "Age of Sanders, Bernard:78 years"
## [1] "Age of Sasse, Ben:48 years"
## [1] "Age of Schatz, Brian:47 years"
## [1] "Age of Schumer, Charles E.:69 years"
## [1] "Age of Scott, Rick:67 years"
## [1] "Age of Scott, Tim:54 years"
## [1] "Age of Shaheen, Jeanne:73 years"
## [1] "Age of Shelby, Richard C.:85 years"
## [1] "Age of Sinema, Kyrsten:43 years"
## [1] "Age of Smith, Tina:62 years"
## [1] "Age of Stabenow, Debbie:69 years"
## [1] "Age of Sullivan, Dan:55 years"
## [1] "Age of Tester, Jon:63 years"
## [1] "Age of Thune, John:59 years"
## [1] "Age of Tillis, Thom:59 years"
## [1] "Age of Toomey, Patrick J.:58 years"
## [1] "Age of Udall, Tom:71 years"
## [1] "Age of Van Hollen, Chris:61 years"
## [1] "Age of Warner, Mark R.:65 years"
## [1] "Age of Warren, Elizabeth:70 years"
## [1] "Age of Whitehouse, Sheldon:64 years"
## [1] "Age of Wicker, Roger F.:68 years"
## [1] "Age of Wyden, Ron:70 years"
## [1] "Age of Young, Todd:47 years"
# Close the driver: quit() is called on the remote driver, and its return value
# is assigned to `q` (the assignment keeps it from being auto-printed)
q <- remDr$quit()
if (!require("pdftools")) install.packages("pdftools", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
if (!require("tesseract")) install.packages("tesseract", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)

require(pdftools)
require(tesseract)
require(stringr)
#
# PDF text extraction
#
# Source: https://intelligence.house.gov/social-media-content/
#
# NOTE(review): paste0() joins `working_path` and the file name with no
# separator, so `working_path` presumably ends with a path separator - confirm.
fn <- paste0(working_path, "ira_handles_june_2018.pdf")

# Extract the text of the PDF
ira_handles <- pdf_text(fn)

# Concatenate the extracted text into one string and split it into lines.
# "\r?\n" accepts both CRLF and LF line endings; splitting on the literal
# "\r\n" alone leaves the text unsplit when the extractor emits LF only.
# full_list is a list of length one holding the character vector of lines.
full_list <- strsplit(paste0(ira_handles, collapse = ""), "\r?\n")

# Render the PDF to a PNG image
fn <- paste0(working_path, "2018_02_27_13_40_38_twitter_suppression.pdf")
pn <- pdf_info(fn)[["pages"]] # Number of pages

# Convert the first page to PNG at 600 dpi
img_file <- pdftools::pdf_convert(
  fn,
  format = "png",
  pages = 1,
  dpi = 600
)
## Converting page 1 to 2018_02_27_13_40_38_twitter_suppression_1.png... done!

# Extract text from the PNG image via OCR, then delete the image file
text <- ocr(img_file)
unlink(img_file)
text
## [1] "During the 2016 election, we removed tweets that were attempting to suppress or otherwise interfere with the\nexercise of voting rights, including by circulating intentionally misleading information. Below are examples of images\nin tweets that were removed. We have not found accounts associated with this activity to have obvious Russian\nOrigin, but some of the accounts appear to have been automated.\nfff\nVota desde casa o trabajo Ahorra tiempo. Vi Uy. [ee \"Hagamos\nEvita las colas. f ii, (jp - =a Vota desde casa o el trabajo. foros\n; tape le ee 7 juntas!” -H\n| b ae \\/ * \\\nse aad Lowa f os es Ee ee ee an : 2 Envia un sms ee yet “Hillary” ’ A ,\nal nUmero 59925 este 8 de Nov. Vota desde casa eos a ? al nUmero 59925 este 8 de Nov. eh 4\n_—— a eee ERAS AIO td its MRC NCRD SOCEM YERE NCL aa malic TIN ' a vA } a / § Y . 1\ni Ee Mn CUP AM eo hyo) Sa - }\n —— Ta COCO ha  —— i a ae\nae FS = 5 CE ote Spar pee oT ae = :\nig DAV VU bal Es IMPORTANT\n) e MOULEANINOWAV OME aby\nYOU WILL NEED\nm3 ONEINE TO BRING ALL | <ab> social Security Card Mm 4\nEvita las OF THE * U.S. or foreign passport\nVota desde casa. | i me BRE. cae: Love. - penne conn\na eS sh a. a 4x * Your IL driving license record\nTACT Tits Pedi: 5;  Gse xX isan pate-\nSpe es ey | pes ) nae MAKEIA DLE 010 ¢418 14 26 5 8 emi eae oe De oa £m\ni ae =—Cg aR :\n1\n"
#
# Societies Registration
# https://www.police.gov.hk/ppp_en/11_useful_info/licences/societies.html
#

# Switch every locale category to "cht" so the Chinese text can be handled.
# NOTE(review): "cht" looks like a Windows-style locale name - confirm it is
# valid on the platform this script runs on.
Sys.setlocale(category = "LC_ALL", locale = "cht")
## [1] "LC_COLLATE=Chinese (Traditional)_Taiwan.950;LC_CTYPE=Chinese (Traditional)_Taiwan.950;LC_MONETARY=Chinese (Traditional)_Taiwan.950;LC_NUMERIC=C;LC_TIME=Chinese (Traditional)_Taiwan.950"
fn <- paste0(working_path, "srr012n.pdf")
txt_msg <- pdftools::pdf_text(fn)

# Join the extracted text into a single string. paste0() has no `sep`
# argument, so the former `sep = ""` was merely pasted on as an extra empty
# string; it is dropped here.
txt_msg <- paste0(txt_msg, collapse = "")

# Split at a line break followed by a fixed run of indentation spaces (two
# alternative widths). "\r?" makes the pattern accept LF as well as CRLF.
txt_msg <- strsplit(txt_msg, "\r?\n                  |\r?\n                 ")

# Keep only the pieces that begin with exactly eight spaces followed by a
# non-space character, then strip that leading indentation
txt_msg <- grep("^[ ]{8}[^ ]", txt_msg[[1]], value = TRUE)
txt_msg <- gsub("^[ ]*", "", txt_msg)

# Cut each piece at the first double space or line-break character and keep
# the leading field. vapply() guarantees a character vector even when no
# piece survived the filter (sapply() would return an empty list there).
txt_msg <- strsplit(txt_msg, "  |\r|\n")
txt_msg <- vapply(txt_msg, function(x) x[1], character(1))

# First 100 names; head() returns fewer when fewer exist instead of padding
# the result with NA the way txt_msg[1:100] does
head(txt_msg, 100)
##   [1] "中國影響力品牌促進會"              "\"羊城國際商貿中心\" 香港業主聯會"
##   [3] "\"輝煌\" 歌舞團"                   "( HAPPY )歡樂太極學會"            
##   [5] "1107媽媽 & BB會"                   "2﹒S 樂韻社"                      
##   [7] "61303"                             "700"                              
##   [9] "ACA羽毛球同樂會"                   "CANDY 舞藝坊"                     
##  [11] "FAN PIECE 運動同好會"              "JENNY'S 瑜伽太極同學會"           
##  [13] "LITTLE PEOPLE 童樂會"              "SAM & MAY 舞蹈坊"                 
##  [15] "SUN 花 SING 聲 CLUB"               "SUNSHINE 藝軒"                    
##  [17] "TC舞蹈室"                          "「2099」助學訪問協會"             
##  [19] "一羽羽毛球會"                      "上水敬老愛幼慈善社團會"           
##  [21] "世界儒學論壇"                      "世界易經科學發展聯合會"           
##  [23] "世界禪和文化藝術交流協會"          "丘華籃球"                         
##  [25] "中國國際扶貧基金會"                "中國廣場舞聯合會"                 
##  [27] "中國書協香港分會"                  "中國書法學會"                     
##  [29] "中國水療促進會"                    "中國香港木蘭拳總會天水圍同學會"   
##  [31] "中國香港高等教育中醫葯協進會"      "中大五十年編輯委員會"             
##  [33] "中港工商業發展促進會"              "中華傑出教育家協會"               
##  [35] "中華周易研究協會"                  "中華鳳山廣澤尊王文化交流協會"     
##  [37] "中西區各界婦女組織聯席會議"        "中西區監察議員大聯盟"             
##  [39] "九龍藝群協會"                      "亂<U+564F>23"                     
##  [41] "互惠人才市場"                      "五行逍遙功研習社"                 
##  [43] "亞太國際文化藝術交流促進會"        "今日藝術協會"                     
##  [45] "低級巨星"                          "佑才舞蹈社"                       
##  [47] "俊宏軒妍心社"                      "信望愛歌舞軒"                     
##  [49] "健樂會"                            "元朗愛心歌詠團"                   
##  [51] "元朗新亮力乒乓學院"                "內家功法養生研究學會"             
##  [53] "全球華人萬眾創新協進會"            "八鄉水流田村麒麟隊青年會"         
##  [55] "創己之舞"                          "創意泡沫關注組"                   
##  [57] "創美愛心慈善協會"                  "動影心弦"                         
##  [59] "動感舞藝會"                        "北勝黃伯光健身學會"               
##  [61] "北角渣華道街市檯商互助委員會"      "匡懷舍"                           
##  [63] "千江映月歌舞團"                    "卓賢籃球"                         
##  [65] "友共情樂坊"                        "友德揮高球會"                     
##  [67] "台山任遠獎學金會"                  "史必烈"                           
##  [69] "同心協力舞蹈坊"                    "同心會"                           
##  [71] "同心英仔舞團"                      "同樂歌舞團"                       
##  [73] "名曲歌藝會"                        "名磯會"                           
##  [75] "品味文學"                          "喜喜曲藝社"                       
##  [77] "國際書畫收藏家藝術交流協會"        "圍威喂曲藝社"                     
##  [79] "團結南區足球會"                    "地平線汽車會"                     
##  [81] "城市微觀文化協會"                  "培捷羽毛球會"                     
##  [83] "培新國術體育總會"                  "夢藝會"                           
##  [85] "大光明曲藝社"                      "大坑西<U+90A8>民興樓居民協會"     
##  [87] "大埔燒腩隊"                        "大埔起動樂康體協會"               
##  [89] "大埔錦石新村婦女會"                "大官殺校家長會"                   
##  [91] "大眾和諧歌唱團"                    "天恩<U+90A8>街坊聯會"             
##  [93] "天水圍康樂及游泳體育會"            "天水圍歌舞新天地"                 
##  [95] "天水圍田園曲藝社"                  "天水圍關社健康快樂舞步團"         
##  [97] "天行排球會"                        "奇妙舞蹈組"                       
##  [99] "奇緣妙舞軒"                        "奼紫嫣紅藝術團"