1.1 When you roll a fair die 3 times, how many possible outcomes are there?

Solution:

The number of possible outcomes is \(6^n\), where \(n\) is the number of rolls. Hence, the answer is \(6^3 = 216\).

1.2 What is the probability of getting a sum of 3 when you roll a die two times?
# Load knitr quietly; it provides kable() for rendering the table below.
suppressMessages(library(knitr))

# Table of two-dice outcomes, read without treating the first row as a header.
dice <- read.csv(
  file = "https://raw.githubusercontent.com/mascotinme/GitHub/master/MSDA%20605/dice.csv",
  header = FALSE,
  sep = ","
)

# Render the first rows of the table.
kable(head(dice))
V1 V2 V3 V4 V5 V6 V7
dice 1 2 3 4 5 6
1 1,1 1,2 1,3 1,4 1,5 1,6
2 2,1 2,2 2,3 2,4 2,5 2,6
3 3,1 3,2 3,3 3,4 3,5 3,6
4 4,1 4,2 4,3 4,4 4,5 4,6
5 5,1 5,2 5,3 5,4 5,5 5,6
  • From the table above, the probability of getting a sum of 3 is:

\(P(\text{sum} = 3) = \dfrac{2}{36} \approx 0.056\), or about 5.556 percent

i.e., the two favourable outcomes are (1,2) and (2,1).

  1. Write a program to take a document in English and print out the estimated probabilities for each of the words that occur in that document.
# Install any of the packages below that you don't already have.

# NOTE(review): a negative `warn` makes R ignore all warnings for the rest of
# the session -- consider removing it once the script runs cleanly.
options(warn = -1)
suppressMessages(library(RKEA))
suppressMessages(library(NLP))
suppressMessages(library(tm))
suppressMessages(library(tidyr))
suppressMessages(library(stringr))
# plyr is attached before dplyr: attaching it afterwards makes plyr's
# same-named verbs (summarise, mutate, arrange, ...) mask the dplyr versions.
suppressMessages(library(plyr))
suppressMessages(library(dplyr))
suppressMessages(library(SnowballC))

suppressMessages(library(RTextTools))

Your program should take in a file containing a large document and write out the probabilities of each of the words that appear in that document.

# Download the document. read.csv2() is kept because the corpus further down
# is built from `mydata`; header = FALSE treats the first line as data.
# NOTE(review): no encoding is specified for the download -- confirm the file's
# encoding if non-ASCII characters (e.g. curly quotes) look garbled in the terms.
mydata <- read.csv2("https://raw.githubusercontent.com/mascotinme/GitHub/master/MSDA%20605/assign6.txt", header = FALSE)

# View() opens an interactive data viewer, so it is only called in an
# interactive session; it is not usable when the script runs in batch mode.
if (interactive()) {
  View(mydata)
}

# Collapse the document into one string. paste() applied to the data frame
# itself deparses the column (yielding the factor codes "c(15, 13, 11, ...)"
# rather than the words), so the text column is converted to character first.
review_text <- paste(as.character(mydata[[1]]), collapse = " ")
review_text
# Wrap the data in a tm source, build the corpus from it and strip the
# punctuation in a single expression.
review_source <- VectorSource(mydata)
corpus <- tm_map(Corpus(review_source), removePunctuation)

# Show the corpus contents.
head(inspect(corpus))
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## $V1
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 8077
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
# Normalise the text before the document-term matrix is built.
corpus <- tm_map(corpus, removeNumbers) # Drop digits
# tolower() is a plain character function, so it is wrapped in
# content_transformer(); passed bare, it replaces each document with a plain
# character vector and the corpus no longer holds text documents.
corpus <- tm_map(corpus, content_transformer(tolower)) # Lower-case the text
corpus <- tm_map(corpus, removeWords, stopwords("english")) # Drop stop words
# Whitespace is stripped last so the gaps left by removed words are collapsed.
corpus <- tm_map(corpus, stripWhitespace)

head(corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
# Stem the terms, then coerce the documents to PlainTextDocument objects.
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, PlainTextDocument)

# Build the document-term matrix and apply the 0.2 sparsity threshold in one
# expression, then print the result.
dtm <- removeSparseTerms(DocumentTermMatrix(corpus), 0.2)
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 505)>>
## Non-/sparse entries: 505/0
## Sparsity           : 0%
## Maximal term length: 18
## Weighting          : term frequency (tf)
## 
##               Terms
## Docs           â<U+0080><U+0098>    â<U+0080>      â<U+0080><U+009C> â<U+0080><U+009C>iâ<U+0080><U+0099>ve â<U+0080><U+009C>itâ<U+0080><U+0099>s â<U+0080><U+009C>right    â<U+0080><U+009C>threestrikesâ<U+0080>     
##   character(0)   1   1  10         1         2        1                  1
##               Terms
## Docs           â<U+0080><U+009C>weâ<U+0080><U+0099>re â<U+0080><U+009C>yes abundance abundant abus abuse abysmal
##   character(0)          1      1         1        1    2     1       1
##               Terms
## Docs           according across act acting administrationâ<U+0080><U+0099>s aging agreed
##   character(0)         1      1   1      1                  1     1      1
##               Terms
## Docs           alabama    alabamaâ<U+0080>      alabamaâ<U+0080><U+0099>s alabaster almost also
##   character(0)       4          1           1         1      2    1
##               Terms
## Docs           although among analyst angel anything appalled appetite
##   character(0)        1     3       1     1        2        1        1
##               Terms
## Docs           approval april arbuthnot argues arise armed asked assaults
##   character(0)        1     1         1      1     1     1     2        1
##               Terms
## Docs           assistant attention attorney autopsy average aware back
##   character(0)         1         1        1       1       1     1    1
##               Terms
## Docs           backward bad banned bas basic basics bathtub beaten began
##   character(0)        1   2      1   1     1      1       1      1     1
##               Terms
## Docs           beginning believe bentley better beyond bigger birth blind
##   character(0)         1       1       2      3      1      1     1     1
##               Terms
## Docs           bodies botched box budget build building built buried
##   character(0)      1       1   1      2     1        1     1      1
##               Terms
## Docs           called calls cam came cameras can candidate capac capacity
##   character(0)      1     1   1    2       1   2         1     1        1
##               Terms
## Docs           capita care case caution chairman challenging change
##   character(0)      1    1    1       1        1           1      2
##               Terms
## Docs              changeâ<U+0080>      changing charlotte child choices citizensâ<U+0080><U+0099>
##   character(0)         1        2         1     2       1           1
##               Terms
## Docs           civil clean clinical colby cologne coming commission
##   character(0)     1     1        1     2       1      1          1
##               Terms
## Docs           commissioner committee commodity conditions congress
##   character(0)            1         1         1          6        1
##               Terms
## Docs           constitutional contact contraband convicted conviction
##   character(0)              1       1          1         1          2
##               Terms
## Docs           corners corrections court courts created crime crimes
##   character(0)       1          10     1      1       1     1      3
##               Terms
## Docs           criminals crisis culture curb currency custodial damning
##   character(0)         1      1       1    1        1         1       1
##               Terms
## Docs           dangerously daughter dealing death december defendants
##   character(0)           1        1       1     1        1          1
##               Terms
## Docs           deliberate department departmentâ<U+0080><U+0099>s deprivation designed
##   character(0)          1          7              1           1        1
##               Terms
## Docs           disparages document donâ<U+0080><U+0099>t double drowned drug drugs
##   character(0)          1        1       2      1       1    2     1
##               Terms
## Docs              dynamiteâ<U+0080>      elderly employees enough environment equal even
##   character(0)           1       1         2      1           1     2    2
##               Terms
## Docs           examiner exchanged eyes faced faces failed family far
##   character(0)        1         1    1     1     1      1      1   1
##               Terms
## Docs              favorsâ<U+0080>      fearful federal female filled finally findings
##   character(0)         1       1       5      2      1       1        1
##               Terms
## Docs           fix food former forward fresh gave general george get
##   character(0)   2    1      1       1     1    1       1      1   4
##               Terms
## Docs           getting give going good gov government governor    governorâ<U+0080>     
##   character(0)       1    1     1    1   1          4        1           1
##               Terms
## Docs           grave great group guard guards guidelines guntoting half
##   character(0)     1     1     1     2      2          1         1    1
##               Terms
## Docs           happened harassed health helped    hereâ<U+0080>      highest highly hire
##   character(0)        1        1      2      1       1       1      1    1
##               Terms
## Docs           hired home iâ<U+0080><U+0099>ve ignoring important improve improved
##   character(0)     1    1      1        1         1       2        1
##               Terms
## Docs           included includes including indifference indigent inhumane
##   character(0)        1        1         1            1        1        1
##               Terms
## Docs           initiative inmate inmates inside instead institute
##   character(0)          2      1       4      2       1         1
##               Terms
## Docs           institutions intervention interview investigate
##   character(0)            1            1         2           1
##               Terms
## Docs           investigating investigation investigations issued    itâ<U+0080>     
##   character(0)             1             3              1      2     1
##               Terms
## Docs           itâ<U+0080><U+0099>s items jail januari january jocelyn judiciary julia
##   character(0)      4     2    1       1       1       1         1     2
##               Terms
## Docs           june just justice kim lack larger larry last law lawyer
##   character(0)    1    6       6   1    1      1     1    2   1      1
##               Terms
## Docs           least legal legislator legislature less levels liberal life
##   character(0)     2     1          1           3    1      1       1    2
##               Terms
## Docs           like likely live living locked long longtime look low
##   character(0)    5      1    2      1      1    1        1    1   1
##               Terms
## Docs           lowlevel make makeup male management many margin marked
##   character(0)        1    1      1    1          1    2      1      1
##               Terms
## Docs           marsha matter may medical mental met middle million minim
##   character(0)      1      1   1       2      2   1      1       4     1
##               Terms
## Docs           misconduct money    moneyâ<U+0080>      monica montgomeri month months
##   character(0)          1     2        1      1          1     1      3
##               Terms
## Docs           morrison mother moved much murder named national near need
##   character(0)        1      1     1    2      1     1        1    2    3
##               Terms
## Docs           needs never new nonviolent now number odds offenders
##   character(0)     2     1   1          1   3      1    1         2
##               Terms
## Docs           offenses offic officer officers    officersâ<U+0080>      officials often
##   character(0)        1     1       1        4           1         1     1
##               Terms
## Docs           one open organization organize original others overhaul
##   character(0)   2    1            2        1        1      2        1
##               Terms
## Docs           overturned page paper parole part past people per percent
##   character(0)          1    1     1      1    1    1      1   1       1
##               Terms
## Docs              periodâ<U+0080>      personally perspective places plan policies
##   character(0)         1          1           1      1    2        2
##               Terms
## Docs           policy political practices premature pressing primary
##   character(0)      3         1         1         1        1       1
##               Terms
## Docs           primitive prison prisonâ<U+0080><U+0099>s prisoners prisons problem
##   character(0)         1     12          2         6       6       2
##               Terms
## Docs           problems procedures programs project prominence promising
##   character(0)        1          1        1       1          1         1
##               Terms
## Docs           prompt property psychologist question quit raise rampant
##   character(0)      1        1            1        1    1     1       2
##               Terms
## Docs           raped rate recent recently recruiting rectify reform
##   character(0)     2    1      1        2          1       1      2
##               Terms
## Docs           relatives released releasing remained remains repeat
##   character(0)         1        2         1        1       3      1
##               Terms
## Docs           replaced report reports represents republican request
##   character(0)        1      4       2          1          2       1
##               Terms
## Docs           rescinding resellable review rights robbery robert rodney
##   character(0)          1          1      1      1       1      1      1
##               Terms
## Docs           routinely row rules running said samuels say says
##   character(0)         1   1     1       2   22       1   2    2
##               Terms
## Docs           scrutinizing secondhighest secure see seen sell senate
##   character(0)            1             1      1   1    1    1      1
##               Terms
## Docs           senator sending senior sent sentence sentencing series
##   character(0)       1       1      1    1        1          2      2
##               Terms
## Docs           serious served servic serving session several sex sexual
##   character(0)       1      2      1       1       1       1   4      3
##               Terms
## Docs           sexualized show showed showering sick since situation six
##   character(0)          1    1      1         1    1     3         2   4
##               Terms
## Docs           soft solution sometimes son spending split spots stacy
##   character(0)    1        1         2   1        1     1     1     1
##               Terms
## Docs           staffing state stateâ<U+0080><U+0099>s step stephen stepped stetson still
##   character(0)        2     4         2    1       1       1       1     6
##               Terms
## Docs           stillborn stockades strip strong stuff    stupidâ<U+0080>      support
##   character(0)         1         1     1      1     1         1       2
##               Terms
## Docs           system    systemâ<U+0080>      take tampon telephone texas thatâ<U+0080><U+0099>s
##   character(0)      2         1    1      1         1     1        2
##               Terms
## Docs           things think third    thisâ<U+0080>      thomas three tied toilet top
##   character(0)      1     1     1       1      3     1    1      1   2
##               Terms
## Docs           toxic track tracked    transparentâ<U+0080>      treatment troubled
##   character(0)     1     1       1              1         2        1
##               Terms
## Docs           trying tutwil tutwiler    tutwilerâ<U+0080>      two unconstitutional
##   character(0)      1      1       13           1   1                1
##               Terms
## Docs           uncovered unfolding uniform use using violations wanted
##   character(0)         1         1       1   2     1          1      1
##               Terms
## Docs           wants ward warden washington watched way weâ<U+0080><U+0099>re week
##   character(0)     1    3      1          2       1   1       1    1
##               Terms
## Docs           weighing well whether whose wideranging will without woman
##   character(0)        1    2       2     1           1    1       1     1
##               Terms
## Docs           women wood work worked working worse year yearâ<U+0080><U+0099>s years
##   character(0)     6    1    1      1       1     1    3        1     6

Obtaining the keyword frequencies

# Total count of every term across the matrix, largest first.
freq <- colSums(as.matrix(dtm))
freq <- sort(freq, decreasing = TRUE)

# How many terms occur once, twice, ... (up to the first 40 distinct counts).
head(table(freq), n = 40)
## freq
##   1   2   3   4   5   6   7  10  12  13  22 
## 386  77  15  11   2   8   1   2   1   1   1

Glimpse of the keywords

# Pair each term with its count in a data frame.
wf <- data.frame(
  word = names(freq),
  freq = freq
)

# First 20 rows, i.e. the most frequent terms.
wf[seq_len(20), ]
##                    word freq
## said               said   22
## tutwiler       tutwiler   13
## prison           prison   12
## â<U+0080><U+009C>                 â<U+0080><U+009C>   10
## corrections corrections   10
## department   department    7
## conditions   conditions    6
## just               just    6
## justice         justice    6
## prisoners     prisoners    6
## prisons         prisons    6
## still             still    6
## women             women    6
## years             years    6
## federal         federal    5
## like               like    5
## alabama         alabama    4
## get                 get    4
## government   government    4
## inmates         inmates    4

Summation of all keywords.

# Total number of keyword occurrences; used as the denominator for the
# estimated probabilities below.
add <- sum(wf$freq) # Summation of all the keywords
add
## [1] 761
firstword <- subset(wf, wf$word == "said") # Row of wf for the keyword "said"
firstword
##      word freq
## said said   22
secondword <- subset(wf, wf$word == "tutwiler")  # Row of wf for the keyword "tutwiler"
secondword 
##              word freq
## tutwiler tutwiler   13
# Note that A is computed from `secondword` and B from `firstword`.
PrA <- (secondword$freq/add) # Probability of A ("tutwiler"): 13 / 761
PrA
## [1] 0.01708279
PrB<- (firstword$freq/add) # Probability of B ("said"): 22 / 761
PrB
## [1] 0.02890933

The joint probability of both words occurring together (treating the two words as independent)

# Joint probability of the two keywords, treating their occurrences as
# independent: P(A and B) = P(A) * P(B). Multiplying PrA by itself would give
# P(A)^2 and ignore PrB altogether.
Pr_AnB <- (PrA * PrB)
Pr_AnB
## [1] 0.0004938519

A bar plot showing the frequency of each of the ten most frequent words.

# Bar chart of the first ten rows of wf (wf is ordered by descending
# frequency). The y-axis label spelling is corrected to "frequencies".
barplot(
  wf$freq[1:10],
  # as.character() keeps the labels as text even if `word` is stored as a factor.
  names.arg = as.character(wf$word[1:10]),
  las = 2, # draw the axis labels perpendicular to the axes
  col = "brown",
  main = "Most Frequent Words",
  ylab = "Word frequencies"
)

A word cloud of the keywords

# Terms whose correlation with "remove" reaches the 0.5 limit.
findAssocs(dtm, "remove", corlimit = 0.5)
## $remove
## numeric(0)
suppressMessages(library(wordcloud))

word <- names(freq)

# Fix the seed so the layout is repeatable, then draw at most 100 words
# coloured from the six-colour Dark2 palette.
set.seed(142)
dark2 <- brewer.pal(6, "Dark2")
wordcloud(
  words = word,
  freq = freq,
  max.words = 100,
  rot.per = 0.2,
  colors = dark2
)