R Markdown

By Sree Kashyap Addanki - 716200072

Individual Assignment - Task 2

Include libraries

library(rvest)
## Loading required package: xml2
library(NLP)
library(openNLP)
library(ggmap)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(rworldmap)
## Warning: package 'rworldmap' was built under R version 3.3.2
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type :   vignette('rworldmap')
# Download the Wikipedia article and keep the text of its <p> nodes.
page <- read_html("https://en.wikipedia.org/wiki/Berkshire_Hathaway")

text <- html_text(html_nodes(page, "p"))
text <- text[nzchar(text)] # drop empty paragraphs
# Remove numeric citation markers such as [7] or [101], whatever their length.
text <- gsub("\\[[0-9]+\\]", "", text)

# Make one complete document
text <- paste(text, collapse = " ")

# Convert to an NLP String so the text can be indexed by annotations below.
text <- as.String(text)

sent_annot <- Maxent_Sent_Token_Annotator()
word_annot <- Maxent_Word_Token_Annotator()
loc_annot <- Maxent_Entity_Annotator(kind = "location") # annotate locations
people_annot <- Maxent_Entity_Annotator(kind = "person") # annotate persons

# Call annotate() through the NLP namespace: ggplot2 masks it once attached.
annot.l1 <- NLP::annotate(
  text,
  list(sent_annot, word_annot, loc_annot, people_annot)
)

# Entity kind of every annotation. Annotations whose feature list has no
# "kind" entry get NA, so the result is always a plain character vector.
k <- vapply(
  annot.l1$features,
  function(feat) {
    kind <- feat[["kind"]]
    if (is.null(kind)) NA_character_ else as.character(kind)
  },
  character(1)
)

# %in% never returns NA, so the NA entries above are simply not selected.
berk_locations <- text[annot.l1[k %in% "location"]]
berk_people <- text[annot.l1[k %in% "person"]]



# The extracted entities lend themselves to further work, for example curating
# the lists with external domain knowledge. Here the locations are geocoded so
# that each one can be placed on a world map.

# Distinct location strings found in the article
all_places <- unique(berk_locations)

# Look up lon/lat for every place (index all_places with [1:10] for a sample)
all_places_geocoded <- geocode(all_places)

# Show the resulting coordinates
all_places_geocoded
##           lon       lat
## 1   -95.99799  41.25236
## 2   -99.90181  41.49254
## 3   -95.71289  37.09024
## 4  -122.16860  37.42270
## 5   -85.88751  39.24389
## 6   -81.97561  35.01251
## 7   -71.47743  41.58009
## 8   -95.71289  37.09024
## 9   -89.77553  44.01970
## 10  -71.38244  42.40721
## 11  -70.93420  41.63622
## 12  -76.18660  42.30424
## 13         NA        NA
## 14  -84.27002  37.83933
## 15  -97.33077  32.75549
## 16  -99.90181  31.96860
## 17  -74.40566  40.05832
## 18 -106.34677  56.13037
## 19  -91.83183  37.96425
## 20  -83.92074  35.96064
## 21  -86.58045  35.51749
## 22  -79.38318  43.65323
## 23  -84.19161  39.75895
## 24  -73.87397  40.77693
## 25  -73.83308  40.76750
## 26  -74.00594  40.71278
## 27  -94.57857  39.09973
## 28  -96.82917  32.96179
## 29  -89.39853  40.63312
## 30 -119.41793  36.77826
## 31  -93.34995  44.88969
## 32  -94.68590  46.72955
## 33  -78.87837  42.88645
## 34  -93.09770  41.87800
## 35  -77.40398  37.64408
## 36  -79.79198  36.07264
## 37  -74.42293  39.36428
## 38  -80.19179  25.76168
## 39  -93.26501  44.97775
## 40  -82.90008  32.16562
## 41  -86.13490  40.26719
## 42   15.25512  54.52596
## 43  -51.92528 -14.23500
## 44  -87.34751  35.90090
# Draw a high-resolution world map as the base layer.
newmap <- getMap(resolution = "high")
plot(newmap, asp = 1)

# Overlay every geocoded place as a filled red dot.
with(
  all_places_geocoded,
  points(lon, lat, col = "red", cex = 1.2, pch = 19)
)

# Distinct strings that the person annotator tagged in the article
unique(berk_people)
##  [1] "Dairy Queen"                                      
##  [2] "Helzberg Diamonds"                                
##  [3] "Warren Buffett"                                   
##  [4] "Charlie Munger"                                   
##  [5] "Oliver Chace"                                     
##  [6] "Samuel Slater"                                    
##  [7] "Seabury Stanton"                                  
##  [8] "Stanton"                                          
##  [9] "Buffett"                                          
## [10] "Magazine"                                         
## [11] "Lloyd Blankfein"                                  
## [12] "David Gottesman"                                  
## [13] "Franklin Otis Booth"                              
## [14] "Bill Gates"                                       
## [15] "Arnold Schwarzenegger"                            
## [16] "Jamie Lee Curtis"                                 
## [17] "Nicollette Sheridan"                              
## [18] "Walter Scott"                                     
## [19] "Thomas S. Murphy"                                 
## [20] "Howard Graham Buffett"                            
## [21] "Warren"                                           
## [22] "Ronald Olson"                                     
## [23] "Steve Burke"                                      
## [24] "Susan Decker"                                     
## [25] "Todd Combs"                                       
## [26] "A++"                                              
## [27] "Russell Corporation"                              
## [28] "Justin Brands"                                    
## [29] "Justin Boots"                                     
## [30] "Justin Original Workboots"                        
## [31] "Nocona Boots"                                     
## [32] "Tony Lama Boots"                                  
## [33] "Benjamin Moore"                                   
## [34] "Moore"                                            
## [35] "Inc. Shaw"                                        
## [36] "Clayton Homes"                                    
## [37] "Inc."                                             
## [38] "Clayton"                                          
## [39] "Ben Bridge Jeweler"                               
## [40] "Graham Holdings Company"                          
## [41] "Scott Fetzer Companies<U+0096>The Scott Fetzer Companies"
## [42] "Wayne Water Systems"                              
## [43] "Campbell Hausfeld"                                
## [44] "Scott Fetzer"                                     
## [45] "AAI"                                              
## [46] "Gymnastics"                                       
## [47] "Lee Enterprises"
# Distinct strings that the location annotator tagged in the article
unique(berk_locations)
##  [1] "Omaha"                   "Nebraska"               
##  [3] "United States"           "Mars"                   
##  [5] "Valley Falls Company"    "Valley Falls"           
##  [7] "Rhode Island"            "America"                
##  [9] "Adams"                   "Massachusetts"          
## [11] "New Bedford"             "Berkshire"              
## [13] "Charlotte Guyman"        "Kentucky"               
## [15] "Fort Worth"              "Texas"                  
## [17] "New Jersey"              "Canada"                 
## [19] "Missouri"                "Knoxville"              
## [21] "Tennessee"               "Toronto"                
## [23] "Dayton"                  "LaGuardia Airport"      
## [25] "Flushing"                "New York"               
## [27] "Kansas City"             "Addison"                
## [29] "Illinois"                "California"             
## [31] "Edina"                   "Minnesota"              
## [33] "Buffalo"                 "Iowa"                   
## [35] "Richmond Times-Dispatch" "Greensboro"             
## [37] "Atlantic City"           "Miami"                  
## [39] "Minneapolis"             "Georgia"                
## [41] "Indiana"                 "Europe"                 
## [43] "Brazil"                  "Wrigley"

Maxent_Entity_Annotator - Generates an annotator which computes entity annotations using the Apache OpenNLP Maxent name finder

It was observed that locations in the US and Europe are recognised better than those in other parts of the world (tried with different Wikipedia links).

There are some inconsistencies in identifying person names.
It also wrongly identifies a few non-person terms as people, such as Inc., AAI, Gymnastics, Lee Enterprises and A++.