Problem Set 1
# 1.
6^3
## [1] 216
# 2. Favorable outcomes are 1 + 2 and 2 + 1 out of the 6^2 equally likely ordered outcomes
2/6^2
## [1] 0.05555556
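As a quick cross-check (an addition, assuming problem 2 asks for the probability that two dice sum to 3), the same answer falls out of enumerating all ordered rolls:
# enumerate all 6^2 ordered rolls and count those summing to 3
rolls = expand.grid(die1 = 1:6, die2 = 1:6)
sum(rolls$die1 + rolls$die2 == 3) / nrow(rolls)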
# 3.
same_bday_prob = function(k) {
  # we want 1 - 365/365 * 364/365 * ... * (365-k+1)/365, or
  # 1 - [ 365! / (365 - k)! ] / 365^k
  # since choose(n, k) = n! / (k! (n - k)!), multiplying choose(365, k) by factorial(k) gives 365! / (365 - k)!
  return(1 - choose(365, k) * factorial(k) / 365^k)
}
same_bday_prob(25)
## [1] 0.5686997
same_bday_prob(50)
## [1] 0.9703736
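As an optional sanity check on the closed form (not part of the original solution; the trial count is arbitrary), a short Monte Carlo simulation should land near the same probabilities:
# simulate k random birthdays repeatedly and estimate the collision probability
sim_bday_prob = function(k, trials = 10000) {
  mean(replicate(trials, any(duplicated(sample(365, k, replace = TRUE)))))
}
sim_bday_prob(25) # should come out near 0.569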
Problem Set 2
library(RTextTools)
library(tm)
library(dplyr) # for the pipe, mutate(), arrange(), filter(), and union() used below
texts = paste(readLines(paste0(getwd(), '/assign6.sample.txt')), collapse = ' ')
term_freq = function(texts) {
  # create a document-term matrix: i holds the document index of each entry (1 document in this case),
  # j the index of each unique term, and v the count of that term within that document
  tdm = create_matrix(textColumns = texts, language = 'english', removeNumbers = FALSE, removePunctuation = TRUE,
                      removeStopwords = FALSE, stemWords = FALSE, stripWhitespace = TRUE, toLower = TRUE)
  # create data frame of terms and term counts, then add each term's relative frequency
  data.frame(term = tdm$dimnames$Terms[tdm$j], count = tdm$v, stringsAsFactors = FALSE) %>%
    mutate(freq = round(count / sum(count), 4)) %>%
    arrange(desc(freq))
}
head(term_freq(texts))
##       term count   freq
## 1      the    76 0.0679
## 2      and    38 0.0339
## 3      for    31 0.0277
## 4     said    22 0.0196
## 5     that    19 0.0170
## 6 tutwiler    15 0.0134
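As a rough cross-check of the top counts (added here, not part of the assignment), base R tokenization gives similar numbers, though they may differ slightly from create_matrix's own tokenizer:
# lowercase, strip punctuation, split on whitespace, drop empties, and tabulate
tokens = unlist(strsplit(tolower(gsub("[[:punct:]]", "", texts)), "\\s+"))
tokens = tokens[nzchar(tokens)]
head(sort(table(tokens), decreasing = TRUE))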
library(stringr)
# function to cleanse corpus
cleanse_corpus = function(x) {
  x = tm_map(x, PlainTextDocument) # to make the tm package play nice
  x = tm_map(x, removeNumbers)
  x = tm_map(x, removePunctuation) # punctuation could be kept if relevant to a spam/ham filter, but is removed here
  x = tm_map(x, stripWhitespace)
  # x = tm_map(x, stemDocument)
  x = tm_map(x, content_transformer(tolower)) # content_transformer needed because tolower is not a tm transformation
  # x = tm_map(x, removeWords, stopwords('en'))
  return(x)
}
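A minimal usage sketch for cleanse_corpus on a made-up toy corpus (illustrative only; the PlainTextDocument workaround behaves differently across tm versions):
toy = Corpus(VectorSource(c("The DOG, the dog!", "A cat & 2 dogs.")))
toy = cleanse_corpus(toy)
lapply(toy, as.character) # view the cleansed text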
bigram_freq = function(texts, word1, word2) {
  # convert texts to a corpus and cleanse it
  text_corpus = Corpus(VectorSource(texts))
  text_corpus = cleanse_corpus(text_corpus)
  dtm = DocumentTermMatrix(text_corpus)
  # data frame to hold the individual term frequencies
  df = term_freq(texts) %>% filter(term == word1 | term == word2)
  # count adjacent occurrences of the two words, in either order
  joint_count = str_count(texts, paste0("\\b", word1, "\\b\\s\\b", word2, "\\b")) +
    str_count(texts, paste0("\\b", word2, "\\b\\s\\b", word1, "\\b"))
  # append the bigram row; divide by total word count for the joint frequency
  df %>% union(data.frame(term = 'bigram', count = joint_count, freq = round(joint_count / sum(dtm$v), 4), stringsAsFactors = FALSE))
}
bigram_freq(texts, 'and', 'the')
##     term count   freq
## 1 bigram     1 0.0009
## 2    and    38 0.0339
## 3    the    76 0.0679
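The adjacency count behind the bigram row can be illustrated on a toy string (an added example using the same regex pattern as in bigram_freq); note that punctuation between the two words prevents a match, since the pattern allows only a whitespace character between them:
# matches the two adjacent 'and the' pairs; 'and, the' is not counted
str_count("and the cat and the dog, and, the end", "\\band\\b\\s\\bthe\\b")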