
# Install RSelenium from the configured CRAN mirror on first use, then attach
# it. library() is used for the attach so a failed install stops the script
# here instead of surfacing later as a missing-function error.
if (!require("RSelenium")) install.packages("RSelenium", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
library(RSelenium)
# NOTE(review): working_path is not defined in this part of the script; it is
# expected to exist already when this line runs.
setwd(working_path)
# Set up the driver client for Chrome and launch the browser.
remDr <- remoteDriver(browserName = "chrome")
q <- remDr$open(silent = TRUE)
# Go to the US Senate "Senators" page.
remDr$navigate('https://www.senate.gov/senators/index.htm')
# Use XPath to find every table cell that carries a senator's name.
allsenators <- remDr$findElements("xpath", "//td[@class='sorting_1']")
# Pull the text of each cell into a character vector. vapply() returns
# character(0) when no cell matched, whereas a 1:length(x) loop would try to
# read element 1 of an empty list and fail with a subscript error.
us_senators <- vapply(
  allsenators,
  function(cell) cell$getElementText()[[1]],
  character(1)
)
if (length(us_senators) == 0) {
  warning("No senator names were found on the page.", call. = FALSE)
}
# Data cleaning - drop the bracketed suffix that follows each name.
us_senators <- gsub(' \\(.*\\)', '', us_senators)
#
# Google each senator's age, one search per senator.
# suppressMessages - silences messages raised while looking up the answer box
# inner tryCatch   - prints "don't know" when the answer box cannot be read
# outer tryCatch   - its `finally` clause closes the browser session even if
#                    a search aborts with an error, so no session is leaked
#
url <- "https://www.google.com"
tryCatch({
  for (name in us_senators) {
    # The query ends with a newline; presumably this acts as pressing Enter
    # in the search box and submits the search.
    query <- paste0("How old is us senator ", name, "\n")
    remDr$navigate(url)
    q <- remDr$findElement("xpath", "//input[@name='q']")
    q$sendKeysToElement(list(query))
    suppressMessages(tryCatch({
      yearold <- remDr$findElement("xpath", "//div[@class='Z0LcW']")
      age <- yearold$getElementText()
      print(paste0("Age of ", name, ":", age))
    }, error = function(e) {
      print(paste0("Age of ", name, ": don't know"))
    }))
    # Pause between searches (presumably to avoid being throttled).
    Sys.sleep(20)
  }
}, finally = {
  # Close the driver
  remDr$quit()
})
# Install the PDF/OCR packages on first use, then attach them. library() is
# used for the attach so a missing package stops the script immediately.
if (!require("pdftools")) install.packages("pdftools", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
if (!require("tesseract")) install.packages("tesseract", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
if (!require("stringr")) install.packages("stringr", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
library(pdftools)
library(tesseract)
library(stringr)
#
# PDF text extraction
#
fn <- paste0(working_path, "ira_handles_june_2018.pdf")
ira_handles <- pdf_text(fn)
# Join the extracted text into one string and split it into lines. The
# pattern accepts both "\r\n" and "\n" so the split does not depend on which
# line ending pdf_text() produced.
full_list <- strsplit(paste(ira_handles, collapse = ""), "\r?\n")
# Render PDF to PNG image
fn <- paste0(working_path, "2018_02_27_13_40_38_twitter_suppression.pdf")
pn <- pdf_info(fn)$pages # Number of pages
img_file <- pdftools::pdf_convert(fn, format = 'png', pages = 1, dpi = 600) # Convert the first page to PNG
## Converting page 1 to 2018_02_27_13_40_38_twitter_suppression_1.png... done!
# Extract text from the PNG image; the rendered file is removed afterwards
# even when OCR fails, so no stray image is left in the working directory.
text <- tryCatch(ocr(img_file), finally = unlink(img_file))
text
## [1] "During the 2016 election, we removed tweets that were attempting to suppress or otherwise interfere with the\nexercise of voting rights, including by circulating intentionally misleading information. Below are examples of images\nin tweets that were removed. We have not found accounts associated with this activity to have obvious Russian\nOrigin, but some of the accounts appear to have been automated.\nff)\nVota desde casa o trabajo Ahorra tiempo. V4 Yj. [ee \"Hagamos\nEvita las colas. f fi, (jp - =a Vota desde casa o el trabajo. foros\n; tape le ee 7 juntas!” -H\n| b ae \\/ * \\\nso Nae ld Lowa f x Ee ee ee aA : 2 V2 fe RT ey ik ea Neate “Hillary” ’ A ,\nal nUmero 59925 este 8 de Nov. Vota desde casa eos a ? al nUmero 59925 este 8 de Nov. eh 4\n_—— a eee) RNR AR AR its MRC NCRD SOCEM YERE NCL aa malic TN ' vA } i / § Y . 1\ni Bet p AM eb hy to) Sa - /}\n —— mi DOCH —— i Ss {|\nae FS = 5 CE ote Spar pee oT ae = :\nig DAV VU Fal Es IMPORTANT\ne MOULEANINOWAV OME aby\n: YOU WILL NEED\ne ONEINE TO BRING ALL | <ab> social Security Card Mm 4\nEvita las OF THE * U.S. or foreign passport\nVota desde casa. | i we RR. cae: Love. nn npr cman\n, eS sh a. a Ui * Your IL driving license record\nTACT ET ts Pedi: 5; Gse xX Risen pate-\nSpe es ey | pee ) nae MAKEIA DAS 010 210 9 496 D8 bio o2ie oo Ree oa £ ES\n1\n"
#
# Societies Registration
# Extracts society names from the text of srr012n.pdf and shows the first 100.
# Source page:
#https://www.police.gov.hk/ppp_en/11_useful_info/licences/societies.html
#
# Switch every locale category to "cht"; the recorded output below shows this
# resolving to Chinese (Traditional)_Taiwan, code page 950.
# NOTE(review): "cht" looks like a Windows locale name - confirm it resolves
# on the platform this script is run on.
Sys.setlocale(category = "LC_ALL", locale = "cht")
## [1] "LC_COLLATE=Chinese (Traditional)_Taiwan.950;LC_CTYPE=Chinese (Traditional)_Taiwan.950;LC_MONETARY=Chinese (Traditional)_Taiwan.950;LC_NUMERIC=C;LC_TIME=Chinese (Traditional)_Taiwan.950"
fn <- paste0(working_path,"srr012n.pdf")
# Read the PDF text, then collapse it into a single string.
txt_msg <- pdftools::pdf_text(fn)
txt_msg <- paste0(txt_msg,collapse="",sep="")
# Split the text wherever a CRLF is followed by a space.
# NOTE(review): both alternatives of this pattern are identical as written;
# whitespace may have been collapsed in this copy - confirm the intended
# pattern before changing it.
txt_msg <- strsplit(txt_msg,"\r\n |\r\n ")
# Keep only the pieces that do NOT start with a space.
txt_msg <- grep('^[ ]',txt_msg[[1]],invert=T,value=T)
# Remove every "CRLF <digits> <digits> REG" occurrence.
txt_msg <- gsub('\r\n[0-9]* [0-9]* REG','',txt_msg)
# Remove everything from the first space to the end of each entry.
# NOTE(review): the recorded output below contains names with embedded single
# spaces, which this pattern as written would cut off; the original pattern
# probably matched a longer run of spaces - verify.
txt_msg <- gsub(' .*$','',txt_msg)
txt_msg[1:100] # First 100 names
## [1] "中國影響力品牌促進會"
## [2] "\"羊城國際商貿中心\" 香港業主聯會"
## [3] "\"蘭花\" 歌舞團"
## [4] "\"輝煌\" 歌舞團"
## [5] "\"鳳之聲\" 歌舞團"
## [6] "( HAPPY )歡樂太極學會"
## [7] "1107 媽媽 & BB 會"
## [8] "2﹒S 樂韻社"
## [9] "61303"
## [10] "700"
## [11] "ACA 羽毛球同樂會"
## [12] "CANDY 舞藝坊"
## [13] "FAN PIECE 運動同好會"
## [14] "JENNY'S 瑜伽太極同學會"
## [15] "LITTLE PEOPLE 童樂會"
## [16] "SAM & MAY 舞蹈坊"
## [17] "SUN 花 SING 聲 CLUB"
## [18] "SUNSHINE 藝軒"
## [19] "TC 舞蹈室"
## [20] "「2099」助學訪問協會"
## [21] "一羽羽毛球會"
## [22] "上水敬老愛幼慈善社團會"
## [23] "世界儒學論壇"
## [24] "世界易經科學發展聯合會"
## [25] "世界禪和文化藝術交流協會"
## [26] "世界華僑華人社團聯合總會"
## [27] "丘華籃球"
## [28] "中國國際扶貧基金會"
## [29] "中國廣場舞聯合會"
## [30] "中國書協香港分會"
## [31] "中國書法學會"
## [32] "中國民族歌舞協進會"
## [33] "中國水療促進會"
## [34] "中國香港木蘭拳總會天水圍同學會"
## [35] "中國香港高等教育中醫葯協進會"
## [36] "中大五十年編輯委員會"
## [37] "中港工商業發展促進會"
## [38] "中華傑出教育家協會"
## [39] "中華傳媒協會"
## [40] "中華周易研究協會"
## [41] "中華鳳山廣澤尊王文化交流協會"
## [42] "中西區各界婦女組織聯席會議"
## [43] "中西區監察議員大聯盟"
## [44] "乒之友乒乓球會"
## [45] "九龍藝群協會"
## [46] "亂 23"
## [47] "互惠人才市場"
## [48] "五到"
## [49] "五行逍遙功研習社"
## [50] "亞太國際文化藝術交流促進會"
## [51] "今日藝術協會"
## [52] "低級巨星"
## [53] "佑才舞蹈社"
## [54] "俊宏軒妍心社"
## [55] "信望愛歌舞軒"
## [56] "健樂會"
## [57] "元朗愛心歌詠團"
## [58] "元朗新亮力乒乓學院"
## [59] "內家功法養生研究學會"
## [60] "全球華人萬眾創新協進會"
## [61] "八鄉水流田村麒麟隊青年會"
## [62] "創己之舞"
## [63] "創意泡沫關注組"
## [64] "創美愛心慈善協會"
## [65] "動影心弦"
## [66] "動感綜藝坊"
## [67] "動感舞藝會"
## [68] "北勝黃伯光健身學會"
## [69] "北角渣華道街市檯商互助委員會"
## [70] "匡懷舍"
## [71] "千唐流空手道同學會"
## [72] "千唐流空手道體育會"
## [73] "千江映月歌舞團"
## [74] "卓思英文學校暨幼稚園家長教師會"
## [75] "卓賢籃球"
## [76] "南溪之友-葵青地區服務協會"
## [77] "友共情樂坊"
## [78] "友德揮高球會"
## [79] "台山任遠獎學金會"
## [80] "史必烈"
## [81] "司麥林康體社"
## [82] "同心協力舞蹈坊"
## [83] "同心會"
## [84] "同心英仔舞團"
## [85] "同樂歌舞團"
## [86] "名曲歌藝會"
## [87] "名磯會"
## [88] "品味文學"
## [89] "喜喜曲藝社"
## [90] "國際書畫收藏家藝術交流協會"
## [91] "圍威喂曲藝社"
## [92] "團結南區足球會"
## [93] "地平線汽車會"
## [94] "城市微觀文化協會"
## [95] "培捷羽毛球會"
## [96] "培新國術體育總會"
## [97] "夢幻組合歌舞團"
## [98] "夢藝會"
## [99] "大光明曲藝社"
## [100] "大坑西民興樓居民協會"