Step 1 - Select the Walmart Wikipedia page

rm(list = ls())  # clears workspace

# A recent Java runtime must be installed on the machine before starting:
# rJava is loaded first and the NLP tooling below is loaded after it.
#
# For each package: install it only when it is missing, then attach it.
# `require(pkg) || install.packages(pkg)` is avoided because
# install.packages() returns NULL, which makes `||` fail, and because the
# freshly installed package would never be attached.
for (pkg in c("rJava", "NLP", "openNLP", "RWeka")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
rm(pkg)  # do not leave the loop variable behind in the workspace


library(rvest)
library(ggmap)
library(rworldmap)

Step 2 - Scraping Wiki Page using rvest

t1 <- Sys.time()   # set timer
page <- read_html("https://en.wikipedia.org/wiki/Walmart")

# Keep the text of every <p> node and drop the empty paragraphs.
text <- html_text(html_nodes(page, "p"))
text <- text[text != ""]

# Remove numeric reference markers such as [7], [42] or [101].
# `[0-9]+` accepts any number of digits, so markers with four or more
# digits are stripped as well.
text <- gsub("\\[[0-9]+\\]", "", text)

# Make one complete document
text <- paste(text, collapse = " ")

# Convert to a String object; later steps index it with annotation spans.
text <- as.String(text)
Sys.time() - t1  # how much time did the above take?
## Time difference of 2.182449 secs

Step 3 - Annotate for Persons, Locations and Organizations

t1 <- Sys.time()   # set timer

# Sentence and word token annotators.
sent_annot <- Maxent_Sent_Token_Annotator()
word_annot <- Maxent_Word_Token_Annotator()

# One entity annotator per kind of entity we want tagged.
loc_annot <- Maxent_Entity_Annotator(kind = "location")
person_annot <- Maxent_Entity_Annotator(kind = "person")
org_annot <- Maxent_Entity_Annotator(kind = "organization")

# Annotators are passed in this order: sentences, words, then entities.
annotator_pipeline <- list(
  sent_annot,
  word_annot,
  loc_annot,
  person_annot,
  org_annot
)

# Namespace-qualified so that NLP's annotate() is the one that gets called.
annot.l1 <- NLP::annotate(text, annotator_pipeline)
Sys.time() - t1  # how much time did the above take?
## Time difference of 12.7605 secs

Extract Locations

# Look up the "kind" feature of every annotation; `k` is reused further
# down to pick out the other entity kinds.
k <- sapply(annot.l1$features, function(feat) feat[["kind"]])

# Keep the annotations tagged as locations and pull their text spans.
location_annotations <- annot.l1[k == "location"]
walmart_locations <- text[location_annotations]

Step 4 - Plot all of the extracted location data on a map

Walmart Locations from Wikipedia

t1 <- Sys.time()   # set timer

# De-duplicate the extracted location strings before geocoding them.
all_places <- unique(walmart_locations) # view contents of this obj

# Look up coordinates for every distinct place; the result is plotted below
# through its `lon` and `lat` columns.
all_places_geocoded <- geocode(all_places) #[1:10]
all_places_geocoded # view contents of this obj
##              lon        lat
## 1    -91.8318334  35.201050
## 2    -95.7128910  37.090240
## 3   -106.3467710  56.130366
## 4    -91.8803310  39.165044
## 5     -3.4359730  55.378051
## 6    138.2529240  36.204824
## 7     78.9628800  20.593684
## 8    -63.6166720 -38.416097
## 9    -51.9252800 -14.235004
## 10   -74.4056612  40.058324
## 11    -1.0872979  53.959965
## 12   -77.1945247  41.203322
## 13   -95.7128910  37.090240
## 14   -95.7128910  37.090240
## 15   104.1953970  35.861660
## 16    10.4515260  51.165691
## 17   127.7669220  35.907757
## 18   -78.8773898  42.876089
## 19   -94.2088172  36.372854
## 20   -91.8318334  37.964253
## 21   -97.0928770  35.007752
## 22   -98.4842465  39.011902
## 23   -91.9623327  30.984298
## 24   -86.5804473  35.517491
## 25   -84.2700179  37.839333
## 26   -99.9018131  31.968599
## 27   -79.8284258  32.832322
## 28   -77.0368707  38.907192
## 29  -119.4179324  36.778261
## 30  -102.5527840  23.634501
## 31   -72.5778415  44.558803
## 32    15.2551187  54.525961
## 33   -96.6988856  33.019843
## 34   -96.6388833  32.912624
## 35   -77.0368707  38.907192
## 36   -66.5901490  18.220833
## 37   -73.7562317  42.652579
## 38   -74.0059413  40.712784
## 39  -103.2310149  44.080543
## 40   -99.9018131  43.969515
## 41  -114.7420408  44.068202
## 42  -110.3625658  46.879682
## 43   -99.9018131  41.492537
## 44  -101.0020119  47.551493
## 45   -81.1637245  33.836081
## 46  -111.0937311  39.320980
## 47   -80.4549026  38.597626
## 48  -107.2902839  43.075968
## 49    -0.0116755  42.001533
## 50  -149.4936733  64.200841
## 51   -75.5276699  38.910832
## 52  -155.5827818  19.896766
## 53   -93.0977020  41.878003
## 54   -69.4454689  45.253783
## 55   -76.6412712  39.045755
## 56   -71.3824374  42.407211
## 57   -85.6023643  44.314844
## 58   -94.6858998  46.729553
## 59   -71.5723953  43.193852
## 60   -82.9071230  40.417287
## 61   -71.4774291  41.580095
## 62   -98.4104996  29.840551
## 63   -95.3698028  29.760427
## 64  -112.0740373  33.448377
## 65  -111.0937311  34.048928
## 66   -79.0192997  35.759573
## 67   -87.6297982  41.878114
## 68   -82.4571776  27.950575
## 69   -81.5157535  27.664827
## 70   -90.5067726  38.596040
## 71  -112.1072224  46.320207
## 72  -104.9902510  39.739236
## 73    -0.0100245  35.004115
## 74   -86.2519898  41.676355
## 75   -71.5429690 -35.675147
## 76   -83.7534280   9.748917
## 77   -88.8965300  13.794185
## 78   -90.2307590  15.783471
## 79   -86.2419050  15.199999
## 80   -85.2072290  12.865416
## 81   -49.2671370 -25.428954
## 82   -88.8965300  13.794185
## 83   -88.8965300  13.794185
## 84  -109.2730555  25.993056
## 85   120.3209373  16.615891
## 86    42.5509603  29.298528
## 87    34.5085230  -8.783195
## 88    22.9375060 -30.559482
## 89    -1.0231940   7.946527
## 90    37.9061930  -0.023559
## 91    28.2336080 -29.609988
## 92    34.3015250 -13.254308
## 93    35.5295620 -18.665695
## 94    18.4904100 -22.957640
## 95     8.6752770   9.081999
## 96    32.2902750   1.373333
## 97   -84.5613355  39.399501
## 98    -0.1277583  51.507351
## 99  -117.6508876  34.063344
## 100  -97.0035982  28.805267
## 101 -127.6476206  53.726668
## 102   28.0473051 -26.204103
## 103  -71.1097335  42.373616
## 104  100.6196553  34.047863
## 105           NA         NA
## 106  -98.8721186  19.685267
## 107           NA         NA
## 108  -48.6478069 -26.993517
## 109 -120.5542012  43.804133
## 110 -122.4194155  37.774929
## 111 -122.6764816  45.523062
## 112  121.4737010  31.230416
## 113 -122.0575434  37.387474
## 114   -0.1277583  51.507351
## 115 -122.4433597  41.956063
## 116  -91.6514459  41.924790
# High-resolution world map used as the base layer.
newmap <- getMap(resolution = "high")
plot(
  newmap,
  # xlim = c(-20, 59), ylim = c(35, 71),   # can select 'boxes' of lat-lon to focus on
  asp = 1
)

# Overlay every geocoded place as a red point on top of the map.
with(
  all_places_geocoded,
  points(lon, lat, col = "red", cex = 1.2, pch = 19)
)

Sys.time() - t1  # how much time did the above take?
## Time difference of 1.903601 mins

Extract the persons

# Text spans tagged as organizations and, de-duplicated, as persons.
walmart_organization <- text[annot.l1[k == "organization"]]
walmart_person <- unique(text[annot.l1[k == "person"]])

# Drop any "person" string that was also tagged as an organization or as a
# location.
excluded_names <- union(walmart_organization, walmart_locations)
walmart_person <- setdiff(walmart_person, excluded_names)

knitr::kable(walmart_person)
Sam Walton
Ben Franklin
Walton
David Glass
Lee Scott
Kenneth Stone
Steven Horwitz
Greg Foran
Wendy
Auntie Annes
Burger King
Tim Horton
Tesoro Corporation
Bud
Mi Changomas
Todo Dia
Maxxi Atacado
Walmart Posto
Walmart Supercenter
Despensa Familiar
Don Juan
Maxi Pali
Mas X Menos
Superbodega Acuenta
Bodega Aurrera
Asda Living
Builders Warehouse
Dion Wired
Game Foodco
Neighborhood Market
Scott Price
Adam Hartung
Clubs
Wholesale Club“
Doug McMillon
Gregory B. Penner
Jim Breyer
Michele Burns
James Cash
Roger Corbett
Douglas Daft
Marissa Mayer
Allen Questrom
Jim Walton
Christopher J. Williams
Linda S. Wolf
Hillary Clinton
Tom Coughlin
Clinton
Don Soderquist
George W. Bush
John Kerry
Alex Grigoryan
Betty Dukes
African-Americans
Wayne Pacelle

Step 5 - Observations