IS 605 FUNDAMENTALS OF COMPUTATIONAL MATHEMATICS - WEEK 6 | Data Analysis
1.1 When you roll a fair die 3 times, how many possible outcomes are there?
Solution:
The number of possible outcomes is \(6^n\), where n is the number of rolls. Hence, the solution is \(6^3 = 216\).
1.2 What is the probability of getting a sum total of 3 when you roll a die two times?
# Attach knitr quietly (kable() is used to print the table), then read the
# table of two-dice outcomes from the course repository. The file has no header
# row, so columns get the default names V1, V2, ...
suppressMessages(library(knitr))
dice <- read.csv(
  file = "https://raw.githubusercontent.com/mascotinme/GitHub/master/MSDA%20605/dice.csv",
  header = FALSE,
  sep = ","
)
kable(head(dice))

| V1 | V2 | V3 | V4 | V5 | V6 | V7 |
|---|---|---|---|---|---|---|
| dice | 1 | 2 | 3 | 4 | 5 | 6 |
| 1 | 1,1 | 1,2 | 1,3 | 1,4 | 1,5 | 1,6 |
| 2 | 2,1 | 2,2 | 2,3 | 2,4 | 2,5 | 2,6 |
| 3 | 3,1 | 3,2 | 3,3 | 3,4 | 3,5 | 3,6 |
| 4 | 4,1 | 4,2 | 4,3 | 4,4 | 4,5 | 4,6 |
| 5 | 5,1 | 5,2 | 5,3 | 5,4 | 5,5 | 5,6 |
\(2 \div 36 \approx 0.056\), or about 5.556 percent, because only two of the 36 equally likely outcomes sum to 3,
i.e., (1,2) and (2,1).
- Write a program to take a document in English and print out the estimated probabilities for each of the words that occur in that document.
# Kindly install any of the packages below that you don't already have.
options(warn = -1)
suppressMessages(library(RKEA))
suppressMessages(library(NLP))
suppressMessages(library(tm))
suppressMessages(library(tidyr))
suppressMessages(library(stringr))
suppressMessages(library(dplyr))
suppressMessages(library(SnowballC))
suppressMessages(library(RTextTools));
suppressMessages(library(plyr))

Your program should take in a file containing a large document and write out the probabilities of each of the words that appear in that document.
# Read the article into a data frame. read.csv2() splits fields on ";"; the
# file has no header row, so columns get the default names V1, V2, ...
# NOTE(review): readLines() would be a more natural reader for free text, but
# changing the type of `mydata` would change what the later
# VectorSource(mydata) call sees, so the reader is left as it is.
mydata <- read.csv2("https://raw.githubusercontent.com/mascotinme/GitHub/master/MSDA%20605/assign6.txt", header = FALSE)

# View() opens a data viewer, which only makes sense in an interactive session.
if (interactive()) {
  View(mydata)
}

# Collapse the whole document into one string. Calling paste() directly on a
# data frame deparses each column (a factor column comes out as
# "c(15, 13, 11, ...)"), so convert every column to character and flatten it
# before pasting.
review_text <- paste(
  unlist(lapply(mydata, as.character), use.names = FALSE),
  collapse = " "
)
review_text # the document text as a single string
# Wrap the data as a tm source, build a corpus from it, and strip punctuation
# from every document in one step.
review_source <- VectorSource(mydata)
corpus <- tm_map(Corpus(review_source), removePunctuation)
head(inspect(corpus))## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## $V1
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 8077
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
# Normalise the text before counting terms.
corpus <- tm_map(corpus, removeNumbers) # drop digits
# tolower() is a plain character function, so it has to be wrapped in
# content_transformer(); passed bare, tm_map() replaces each document with a
# plain character vector instead of a text document.
corpus <- tm_map(corpus, content_transformer(tolower)) # lower-case everything
corpus <- tm_map(corpus, removeWords, stopwords("english")) # drop English stop words
# Collapse whitespace last so the gaps left by the removed words are cleaned up too.
corpus <- tm_map(corpus, stripWhitespace)
head(corpus)## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
# Stem every document and re-wrap it as a PlainTextDocument, then build the
# document-term matrix and apply removeSparseTerms() with a 0.2 threshold.
corpus <- tm_map(tm_map(corpus, stemDocument), PlainTextDocument)
dtm <- removeSparseTerms(DocumentTermMatrix(corpus), 0.2)
inspect(dtm)## <<DocumentTermMatrix (documents: 1, terms: 505)>>
## Non-/sparse entries: 505/0
## Sparsity : 0%
## Maximal term length: 18
## Weighting : term frequency (tf)
##
## Terms
## Docs â<U+0080><U+0098> â<U+0080> â<U+0080><U+009C> â<U+0080><U+009C>iâ<U+0080><U+0099>ve â<U+0080><U+009C>itâ<U+0080><U+0099>s â<U+0080><U+009C>right â<U+0080><U+009C>threestrikesâ<U+0080>
## character(0) 1 1 10 1 2 1 1
## Terms
## Docs â<U+0080><U+009C>weâ<U+0080><U+0099>re â<U+0080><U+009C>yes abundance abundant abus abuse abysmal
## character(0) 1 1 1 1 2 1 1
## Terms
## Docs according across act acting administrationâ<U+0080><U+0099>s aging agreed
## character(0) 1 1 1 1 1 1 1
## Terms
## Docs alabama alabamaâ<U+0080> alabamaâ<U+0080><U+0099>s alabaster almost also
## character(0) 4 1 1 1 2 1
## Terms
## Docs although among analyst angel anything appalled appetite
## character(0) 1 3 1 1 2 1 1
## Terms
## Docs approval april arbuthnot argues arise armed asked assaults
## character(0) 1 1 1 1 1 1 2 1
## Terms
## Docs assistant attention attorney autopsy average aware back
## character(0) 1 1 1 1 1 1 1
## Terms
## Docs backward bad banned bas basic basics bathtub beaten began
## character(0) 1 2 1 1 1 1 1 1 1
## Terms
## Docs beginning believe bentley better beyond bigger birth blind
## character(0) 1 1 2 3 1 1 1 1
## Terms
## Docs bodies botched box budget build building built buried
## character(0) 1 1 1 2 1 1 1 1
## Terms
## Docs called calls cam came cameras can candidate capac capacity
## character(0) 1 1 1 2 1 2 1 1 1
## Terms
## Docs capita care case caution chairman challenging change
## character(0) 1 1 1 1 1 1 2
## Terms
## Docs changeâ<U+0080> changing charlotte child choices citizensâ<U+0080><U+0099>
## character(0) 1 2 1 2 1 1
## Terms
## Docs civil clean clinical colby cologne coming commission
## character(0) 1 1 1 2 1 1 1
## Terms
## Docs commissioner committee commodity conditions congress
## character(0) 1 1 1 6 1
## Terms
## Docs constitutional contact contraband convicted conviction
## character(0) 1 1 1 1 2
## Terms
## Docs corners corrections court courts created crime crimes
## character(0) 1 10 1 1 1 1 3
## Terms
## Docs criminals crisis culture curb currency custodial damning
## character(0) 1 1 1 1 1 1 1
## Terms
## Docs dangerously daughter dealing death december defendants
## character(0) 1 1 1 1 1 1
## Terms
## Docs deliberate department departmentâ<U+0080><U+0099>s deprivation designed
## character(0) 1 7 1 1 1
## Terms
## Docs disparages document donâ<U+0080><U+0099>t double drowned drug drugs
## character(0) 1 1 2 1 1 2 1
## Terms
## Docs dynamiteâ<U+0080> elderly employees enough environment equal even
## character(0) 1 1 2 1 1 2 2
## Terms
## Docs examiner exchanged eyes faced faces failed family far
## character(0) 1 1 1 1 1 1 1 1
## Terms
## Docs favorsâ<U+0080> fearful federal female filled finally findings
## character(0) 1 1 5 2 1 1 1
## Terms
## Docs fix food former forward fresh gave general george get
## character(0) 2 1 1 1 1 1 1 1 4
## Terms
## Docs getting give going good gov government governor governorâ<U+0080>
## character(0) 1 1 1 1 1 4 1 1
## Terms
## Docs grave great group guard guards guidelines guntoting half
## character(0) 1 1 1 2 2 1 1 1
## Terms
## Docs happened harassed health helped hereâ<U+0080> highest highly hire
## character(0) 1 1 2 1 1 1 1 1
## Terms
## Docs hired home iâ<U+0080><U+0099>ve ignoring important improve improved
## character(0) 1 1 1 1 1 2 1
## Terms
## Docs included includes including indifference indigent inhumane
## character(0) 1 1 1 1 1 1
## Terms
## Docs initiative inmate inmates inside instead institute
## character(0) 2 1 4 2 1 1
## Terms
## Docs institutions intervention interview investigate
## character(0) 1 1 2 1
## Terms
## Docs investigating investigation investigations issued itâ<U+0080>
## character(0) 1 3 1 2 1
## Terms
## Docs itâ<U+0080><U+0099>s items jail januari january jocelyn judiciary julia
## character(0) 4 2 1 1 1 1 1 2
## Terms
## Docs june just justice kim lack larger larry last law lawyer
## character(0) 1 6 6 1 1 1 1 2 1 1
## Terms
## Docs least legal legislator legislature less levels liberal life
## character(0) 2 1 1 3 1 1 1 2
## Terms
## Docs like likely live living locked long longtime look low
## character(0) 5 1 2 1 1 1 1 1 1
## Terms
## Docs lowlevel make makeup male management many margin marked
## character(0) 1 1 1 1 1 2 1 1
## Terms
## Docs marsha matter may medical mental met middle million minim
## character(0) 1 1 1 2 2 1 1 4 1
## Terms
## Docs misconduct money moneyâ<U+0080> monica montgomeri month months
## character(0) 1 2 1 1 1 1 3
## Terms
## Docs morrison mother moved much murder named national near need
## character(0) 1 1 1 2 1 1 1 2 3
## Terms
## Docs needs never new nonviolent now number odds offenders
## character(0) 2 1 1 1 3 1 1 2
## Terms
## Docs offenses offic officer officers officersâ<U+0080> officials often
## character(0) 1 1 1 4 1 1 1
## Terms
## Docs one open organization organize original others overhaul
## character(0) 2 1 2 1 1 2 1
## Terms
## Docs overturned page paper parole part past people per percent
## character(0) 1 1 1 1 1 1 1 1 1
## Terms
## Docs periodâ<U+0080> personally perspective places plan policies
## character(0) 1 1 1 1 2 2
## Terms
## Docs policy political practices premature pressing primary
## character(0) 3 1 1 1 1 1
## Terms
## Docs primitive prison prisonâ<U+0080><U+0099>s prisoners prisons problem
## character(0) 1 12 2 6 6 2
## Terms
## Docs problems procedures programs project prominence promising
## character(0) 1 1 1 1 1 1
## Terms
## Docs prompt property psychologist question quit raise rampant
## character(0) 1 1 1 1 1 1 2
## Terms
## Docs raped rate recent recently recruiting rectify reform
## character(0) 2 1 1 2 1 1 2
## Terms
## Docs relatives released releasing remained remains repeat
## character(0) 1 2 1 1 3 1
## Terms
## Docs replaced report reports represents republican request
## character(0) 1 4 2 1 2 1
## Terms
## Docs rescinding resellable review rights robbery robert rodney
## character(0) 1 1 1 1 1 1 1
## Terms
## Docs routinely row rules running said samuels say says
## character(0) 1 1 1 2 22 1 2 2
## Terms
## Docs scrutinizing secondhighest secure see seen sell senate
## character(0) 1 1 1 1 1 1 1
## Terms
## Docs senator sending senior sent sentence sentencing series
## character(0) 1 1 1 1 1 2 2
## Terms
## Docs serious served servic serving session several sex sexual
## character(0) 1 2 1 1 1 1 4 3
## Terms
## Docs sexualized show showed showering sick since situation six
## character(0) 1 1 1 1 1 3 2 4
## Terms
## Docs soft solution sometimes son spending split spots stacy
## character(0) 1 1 2 1 1 1 1 1
## Terms
## Docs staffing state stateâ<U+0080><U+0099>s step stephen stepped stetson still
## character(0) 2 4 2 1 1 1 1 6
## Terms
## Docs stillborn stockades strip strong stuff stupidâ<U+0080> support
## character(0) 1 1 1 1 1 1 2
## Terms
## Docs system systemâ<U+0080> take tampon telephone texas thatâ<U+0080><U+0099>s
## character(0) 2 1 1 1 1 1 2
## Terms
## Docs things think third thisâ<U+0080> thomas three tied toilet top
## character(0) 1 1 1 1 3 1 1 1 2
## Terms
## Docs toxic track tracked transparentâ<U+0080> treatment troubled
## character(0) 1 1 1 1 2 1
## Terms
## Docs trying tutwil tutwiler tutwilerâ<U+0080> two unconstitutional
## character(0) 1 1 13 1 1 1
## Terms
## Docs uncovered unfolding uniform use using violations wanted
## character(0) 1 1 1 2 1 1 1
## Terms
## Docs wants ward warden washington watched way weâ<U+0080><U+0099>re week
## character(0) 1 3 1 2 1 1 1 1
## Terms
## Docs weighing well whether whose wideranging will without woman
## character(0) 1 2 2 1 1 1 1 1
## Terms
## Docs women wood work worked working worse year yearâ<U+0080><U+0099>s years
## character(0) 6 1 1 1 1 1 3 1 6
Obtaining the keyword frequencies
# Total count of each term over the whole matrix, largest first.
freq <- colSums(as.matrix(dtm))
freq <- sort(freq, decreasing = TRUE)
# How many terms occur once, twice, ... (at most the first 40 distinct counts).
head(table(freq), n = 40) ## freq
## 1 2 3 4 5 6 7 10 12 13 22
## 386 77 15 11 2 8 1 2 1 1 1
Glimpse of the keywords
# One row per term: the term itself and how often it occurs.
wf <- data.frame(word = names(freq), freq = freq)
wf[1:20, ]## word freq
## said said 22
## tutwiler tutwiler 13
## prison prison 12
## â<U+0080><U+009C> â<U+0080><U+009C> 10
## corrections corrections 10
## department department 7
## conditions conditions 6
## just just 6
## justice justice 6
## prisoners prisoners 6
## prisons prisons 6
## still still 6
## women women 6
## years years 6
## federal federal 5
## like like 5
## alabama alabama 4
## get get 4
## government government 4
## inmates inmates 4
# Estimated probability of every word: its count divided by the total count of
# all retained words. This is the per-word result the exercise asks for; `wf`
# itself is left untouched so the code that reads wf$word / wf$freq still works.
word_prob <- setNames(wf$freq / sum(wf$freq), as.character(wf$word))
head(word_prob, 20) # probabilities of the 20 most frequent words
Summation of all keywords.
# Total number of keyword occurrences (the denominator for the probabilities).
add <- sum(wf[["freq"]])
add## [1] 761
# Row of `wf` for the keyword "said".
firstword <- wf[which(wf$word == "said"), ]
firstword## word freq
## said said 22
# Row of `wf` for the keyword "tutwiler".
secondword <- wf[which(wf$word == "tutwiler"), ]
secondword ## word freq
## tutwiler tutwiler 13
# PrA is built from `secondword`, i.e. the relative frequency of "tutwiler".
PrA <- (secondword$freq/add) # Probability of A (tutwiler)
PrA## [1] 0.01708279
# PrB is built from `firstword`, i.e. the relative frequency of "said".
PrB<- (firstword$freq/add) # Probability of B (said)
PrB## [1] 0.02890933
The joint probability of both words occurring together, assuming the two words occur independently
# Joint probability of A and B, assuming the two words occur independently:
# P(A and B) = P(A) * P(B). Both factors are needed; PrA * PrA would be P(A)^2.
Pr_AnB <- (PrA * PrB)
Pr_AnB## [1] 0.0004938519
A bar plot showing the frequency of each of the ten most frequent words.
barplot(wf[1:10, ]$freq, las = 2, names.arg = wf[1:10,]$word, col = "brown",main = "Most Frequent Words", ylab ="Word frequecies")

A word cloud of the keywords
# Terms associated with "remove" at a correlation of at least 0.5; the rendered
# result below is empty.
findAssocs(dtm, "remove", corlimit = 0.5)## $remove
## numeric(0)
suppressMessages(library(wordcloud))
# Draw a word cloud of up to 100 terms, sized by frequency, with a fixed seed
# so the layout is reproducible and a six-colour "Dark2" palette.
word <- names(freq)
dark2 <- brewer.pal(6, "Dark2")
set.seed(142)
wordcloud(word, freq, max.words = 100, rot.per = 0.2, colors = dark2)