R Markdown

data = read.csv("C:/Users/HPi/OneDrive - HP Inc/Pradosh/Personal/Harrisburg/Course 540/data2.csv", header = TRUE)
str(data)
## 'data.frame':    61 obs. of  2 variables:
##  $ X           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ description2: Factor w/ 61 levels "Abadiânia, Brazil - A self-proclaimed Brazilian spiritual healer — who became internationally famous when Oprah"| __truncated__,..: 26 25 12 52 50 30 35 2 20 31 ...
#install.packages("wordcloud")
#install.packages("lda")
library(lda)
## Warning: package 'lda' was built under R version 3.6.1
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.6.1
## Loading required package: RColorBrewer
library(tm)
## Warning: package 'tm' was built under R version 3.6.1
## Loading required package: NLP
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
data.text = data$description2
str(data.text)
##  Factor w/ 61 levels "Abadiânia, Brazil - A self-proclaimed Brazilian spiritual healer — who became internationally famous when Oprah"| __truncated__,..: 26 25 12 52 50 30 35 2 20 31 ...
data.text = tolower(data.text)
data.text = tm::removeWords(data.text, stopwords("english"))
data.text = tm::removePunctuation(data.text)
data.text = tm::removeNumbers(data.text)
data.text = tm::stripWhitespace(data.text)
data.text = tm::stemDocument(data.text)
data.text = tm::Corpus(VectorSource(data.text))
dtm <- tm::DocumentTermMatrix(data.text, control = list(wordLengths = c(3,Inf)))

dtm
## <<DocumentTermMatrix (documents: 61, terms: 2478)>>
## Non-/sparse entries: 6668/144490
## Sparsity           : 96%
## Maximal term length: 23
## Weighting          : term frequency (tf)
inspect(dtm)
## <<DocumentTermMatrix (documents: 61, terms: 2478)>>
## Non-/sparse entries: 6668/144490
## Sparsity           : 96%
## Maximal term length: 23
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs accord alleg arrest assault charg polic rape said victim yearold
##   10      1     5      2       0     6     1    2    9      4       1
##   19      1     0      2       0     2     0    0    2      2       3
##   21      2     4      1       3     3     3    1    4      3       5
##   22      3     2      1       2    10     0    7    2      5       3
##   25      5     0      4       6     2     7    1    2      2       2
##   33      5     2      0       5     8     1    3    4      1       0
##   44      0     2      1       3     1     3    1    6      1       0
##   54      1     0      2       7     0     0    0    7      2       3
##   59      2     6      1       4     3     0    1    1      4       0
##   6       5     0      1       0     7     9    2   11      0       2
freq <- colSums(as.matrix(dtm)) #Collapse Matrix by summing over columns

length(freq) #Length should be total number of terms.
## [1] 2478
ord <- order(freq,decreasing=TRUE)#Create sort order (descending).

head(freq[ord]) #List all terms in decreasing order of freq and write to disk.
##    said   charg   polic assault    rape  accord 
##     154     125     109      85      82      75
write.csv(freq[ord],"word_freq.csv")

#Prepare the data for Model 1
n <- nrow(dtm)

splitter <- sample(1:n, round(n * 0.75))
train_set <- dtm[splitter, ]
valid_set <- dtm[-splitter, ]

#LDA ~ Topic Modeling…1
#Set the required elements.
burnin <- 4000
iter <- 2000
thin <- 500
seed <-list(2003,5,63,100001,765)
nstart <- 5
best <- TRUE
keep = 50
k = 10

#LDA ~ Topic Modeling…2
#Model 1 - 10 Topics.
library(topicmodels)
## Warning: package 'topicmodels' was built under R version 3.6.1
ap_lda1 <- LDA(train_set,k, method="Gibbs", 
               control=list(nstart=nstart, seed = seed, best=best, 
                            burnin = burnin, iter = iter, thin=thin, keep = keep))
ap_lda1
## A LDA_Gibbs topic model with 10 topics.
#Evaluation of training data using Model 1
eval1.train = topicmodels::perplexity(ap_lda1, train_set, control = list(nstart=nstart, seed = 100, best=best, 
                                                                         burnin = burnin, iter = iter, thin=thin), 
                                      use_theta = TRUE,estimate_theta = TRUE)
eval1.train
## [1] 736.551
#Capture Results of Model 1…1
lda_inf <- posterior(ap_lda1, valid_set)
attributes(lda_inf)
## $names
## [1] "terms"  "topics"
nTerms(train_set)
## [1] 2478
#Capture Results of Model 1…2
#Topics are probability distribtions over the entire vocabulary.
beta <- lda_inf$terms   # get beta from results
dim(beta)  
## [1]   10 2478
rowSums(beta)
##  1  2  3  4  5  6  7  8  9 10 
##  1  1  1  1  1  1  1  1  1  1
nDocs(train_set)        # size of collection
## [1] 46
#For every document we have a probaility distribution of its contained topics.
theta <- lda_inf$topics 
dim(theta)               # nDocs(DTM) distributions over K topics
## [1] 15 10
rowSums(theta)[1:5] 
##  2  7  9 10 13 
##  1  1  1  1  1
#The 10 most likely terms within the term probabilities of Model 1
#Beta of the inferred topics.
terms(ap_lda1, 10)
##       Topic 1  Topic 2   Topic 3    Topic 4  Topic 5    Topic 6   
##  [1,] "“"      "assault" "rape"     "polic"  "offic"    "said"    
##  [2,] "polic"  "sexual"  "accord"   "woman"  "charg"    "polic"   
##  [3,] "live"   "case"    "charg"    "say"    "counti"   "girl"    
##  [4,] "murder" "victim"  "—"        "arrest" "child"    "charg"   
##  [5,] "home"   "count"   "hardin"   "call"   "arrest"   "attorney"
##  [6,] "—"      "year"    "court"    "victim" "jail"     "death"   
##  [7,] "news"   "accord"  "thursday" "two"    "sheriff’" "bodi"    
##  [8,] "karun"  "also"    "sexual"   "park"   "releas"   "home"    
##  [9,] "told"   "woman"   "author"   "record" "time"     "investig"
## [10,] "left"   "two"     "offici"   "crime"  "feloni"   "last"    
##       Topic 7     Topic 8    Topic 9   Topic 10    
##  [1,] "report"    "sentenc"  "yearold" "said"      
##  [2,] "alleg"     "children" "man"     "judg"      
##  [3,] "told"      "famili"   "murder"  "cosbi"     
##  [4,] "incid"     "jone"     "told"    "will"      
##  [5,] "sister"    "prison"   "week"    "ask"       
##  [6,] "kill"      "court"    "one"     "tuesday"   
##  [7,] "deputi"    "life"     "indict"  "appear"    
##  [8,] "wednesday" "said"     "brehmer" "last"      
##  [9,] "offer"     "face"     "say"     "prosecutor"
## [10,] "never"     "men"      "also"    "trial"
#Visualize the Topics of Model 1
#First, create pseudo-names for each topic using a concatenation of the 5 most likely words in each category.
top5termsPerTopic <- terms(ap_lda1, 5)
topicNames <- apply(top5termsPerTopic, 2, paste, collapse=" ")

#Visualization of Words and Topics
require(wordcloud)
# visualize topics as word cloud
topicToViz <- 11 # change for your own topic of interest
topicToViz <- grep('murder', topicNames)[1]

#Select40 most probable terms from the topic by sorting the term-topic-probability vector in decreasing order
top40terms <- sort(lda_inf$terms[topicToViz,], decreasing=TRUE)[1:40]
words <- names(top40terms)
#Extract the probabilites of each of the 40 terms.
probabilities <- sort(lda_inf$terms[topicToViz,], decreasing=TRUE)[1:40]

#Visualize the terms as wordcloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, color = mycolors)

Deeper Dive into the Dsitribution…1 Look more closely at the distribution of topics within individual documents. To this end, we visualize the distribution in 3 sample documents.

exampleIds <- c(2, 5, 10)
lapply(data$description2[exampleIds], as.character)
## [[1]]
## [1] "Harrison County, KY - Two men have been charged in connection with the rape of a 12-year-old girl at gunpoint in Harrison County.\n\nOne of the suspects, 18-year-old Jonathan N. Carpenter, has been charged with rape and sodomy after authorities said Carpenter held a gun to the girl and forced her to perform a sex act in a vehicle. Hezekiah Hockensmith, 19,who is charged with facilitation to commit rape and sodomy, was inside the car when the incident took place.\n\nKentucky State Police said the two men then took the girl to Hockensmith’s residence where Carpenter raped the girl at gunpoint while Hockensmith waited outside.\n\nThe two suspects picked up the victim from her grandparents’ home in the early morning hours of May 31. They dropped her off at a gas station after her grandparents reported her missing.\n\nCarpenter, who was additionally charged with trafficking in a controlled substance (cocaine) and engaging in organized crime, is being held in the Bourbon County Detention Center on a $50,000 bond, while Hockensmith is free on a $25,000 bond."
## 
## [[2]]
## [1] "Pittsburgh, PA – Police in Pittsburgh are turning to the public for help in locating a missing woman.\n\nAccording to the Pittsburgh Bureau of Police, 49-year-old Jennifer Antonoplos was last seen leaving UPMC Western Psychiatric Hospital Wednesday.\n\nAntonoplos is described as 5-feet 4-inches tall, wearing a grat t-shirt with a bible verse on the back.\n\nPolice officals say she has lived in Homestead and South Park.\n\nAnyone with information is asked to call missing persons detectives at 412-323-7141."
## 
## [[3]]
## [1] "Los Angeles, CA (AP) —The leader and self-proclaimed apostle of La Luz del Mundo, a Mexico-based church with branches in the U.S. that claims over 1 million followers, has been charged with human trafficking and child rape, California officials announced Tuesday.\n\nJoaquín García and a follower of the church, 24-year-old Susana Medina Oaxaca, were arrested Monday after landing at Los Angeles International Airport, California Attorney General Xavier Becerra's office said.\n\nAround 1,000 worshippers gathered at the headquarters of La Luz del Mundo in Guadalajara, Jalisco, on Tuesday evening to pray for García as he was held in Los Angeles on $25 million bail.\n\nGarcía, 50, faces 26 counts of felony charges that range from human trafficking and production of child pornography to forcible rape of a minor. The charges detail a series of disturbing allegations involving three girls and one woman between 2015 and 2018 in Los Angeles County.\n\nThe fundamentalist Christian church, whose name translates to The Light of the World, was founded in 1926 by García's grandfather. It has been the subject of child sex abuse allegations for years but authorities in Mexico have never filed criminal charges. It has more than 15,000 churches in 58 countries, according to its website. The church's followers must adhere to a strict moral code in exchange for the promise of eternal salvation.\n\nGarcía - who was a minister in Los Angeles and other parts of Southern California before becoming the church's leader - coerced the victims into performing sex acts by telling them that refusing would be going against God, authorities said. He allegedly forced the victims, who were members of the church, to sexually touch themselves and each other. One of his co-defendants also allegedly took nude photographs of the victims and sent the pictures to García.\n\nGarcía told one of the victims and others in 2017, after they had completed a \"flirty\" dance wearing \"as little clothing as possible\" that kings can have mistresses and an apostle of God cannot be judged for his actions, the complaint states.\n\n\"Crimes like those alleged in this complaint have no place in our society. Period,\" Becerra said in a press release. \"We must not turn a blind eye to sexual violence and trafficking in our state.\"\n\nThe attorney general's investigation began in 2018, prompted in part by a tip to the state's Department of Justice through an online clergy abuse complaint form.\n\nDavid Correa, a spokesman from the church's headquarters, said in a phone call that they learned about the charges from the media.\n\n\"We categorically deny those false accusations,\" Correa said. \"We know him personally and he is an honorable and honest man.\"\n\nChurch officials said García has always behaved lawfully and rejected the attorney general's charges.\n\nThe church's statement Tuesday said they hoped the accusations would not lead to discrimination and religious intolerance against its followers.\n\nA third follower, Alondra Ocampo, 36, was arrested in Los Angeles County and is being held at the sheriff's Century Regional Detention Facility in Lynwood ahead of her arraignment Wednesday in Los Angeles. A fourth defendant, Azalea Rangel Melendez, remains at large.\n\nIt wasn't immediately clear whether the defendants had an attorney.\n\nIn May, an opera concert at Palacio de Bellas Artes, the main cultural venue in Mexico, generated controversy because in some places it was presented as a tribute to García. Critics said a secular state such as Mexico shouldn't use a public place for that purpose.\n\nThe work, \"The Guardian of the Mirror,\" was broadcast on social networks and screened outside the Palace, with the church's followers in the audience.\n\nLa Luz del Mundo denied that it was an homage and said the opinions expressed in social networks were not promoted by the institution.\n\nAirport police referred media inquiries to Customs and Border Patrol, which did not immediately have a comment."
library("reshape2")
library("ggplot2")
N <- length(exampleIds)


#Deeper Dive into the Dsitribution…2
#Get topic proportions form example documents.
topicProportionExamples <- theta[exampleIds]
vizDataFrame <- melt(cbind(data.frame(topicProportionExamples), document = factor(1:N)), variable.name = "topic", id.vars = "document")  

ggplot(data = vizDataFrame, aes(topic, value, fill = document), ylab = "proportion") + 
  geom_bar(stat="identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +  
  coord_flip() +
  facet_wrap(~ document, ncol = N)

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.