Problem set 1 1) Roll a die 3 times: how many possible outcomes are there?

a die has 6 sides so the total possible outcomes for rolling a die 3 times is 6 x 6 x 6 = 216

  1. Probability of getting the sum of 3 when you roll a die twice?

to get a sum of 3 we need to get (2,1) or (1,2) out of a possible 36 total outcomes, 2/36 = 1/18

  1. If there is one person in the room, the probability of that person being born on some day of the year is 365/365 = 1. If there are two people in the room, the probability that the two share a birthday is 365/365 * 1/365 = 1/365.

For 3 people in the room, the probability that exactly two of them share a birthday is P(Person A and Person B share a birthday and Person C has a different one) + P(Person A and Person C share a birthday and Person B has a different one) + P(Person B and Person C share a birthday and Person A has a different one) = (365/365)(1/365)(364/365) + (365/365)(1/365)(364/365) + (365/365)(1/365)(364/365) = 3 * 364/365^2

For 4 people in the room, the probability that exactly two of them share a birthday is built from P(Person_1) = 365/365, P(Person_2) = 1/365, P(Person_3) = 364/365 and P(Person_4) = 363/365, multiplied by 6 because there are six different ways to choose the pair that shares a birthday: $\frac{365 \cdot 364 \cdot 363 \cdot 1}{365^{4}} \times 6$. From this we can derive a formula for any number of people n in the room.

The probability that exactly two people share a birthday (with everyone else having distinct birthdays) among n people in the same room is $P_n = \frac{364!}{(366-n)!\,365^{\,n-1}} \cdot \frac{n(n-1)}{2}$

for n =25

#P_25 <- factorial(364)/(factorial((366-25))*365^(25-1))

# Probability that, among n people, exactly one pair shares a birthday while
# everybody else has a distinct birthday (365 equally likely days):
#   P(n) = choose(n, 2) * (364 * 363 * ... * (367 - n)) / 365^(n - 1)
#
# n: number of people in the room (a single non-negative whole number).
# Returns the probability as a single number; 0 when n < 2, because no pair
# can be formed.
Birthday_pr_fn <- function(n) {
  # Guard small rooms explicitly. The previous 0:(n - 3) loop counted
  # downwards for n < 3 and multiplied in spurious factors (n = 2 gave 364).
  if (n < 2) {
    return(0)
  }

  # One factor (364 - i) / 365 for each of the n - 2 people outside the pair.
  # Dividing term by term keeps the running product inside double range; the
  # raw product and 365^(n - 1) both overflow to Inf for large n (Inf / Inf
  # is NaN).
  i <- seq_len(n - 2) - 1
  distinct_others <- prod((364 - i) / 365)

  # The remaining 1 / 365 is the chance that the second member of the pair
  # matches the first; choose(n, 2) = n * (n - 1) / 2 counts the possible
  # pairs.
  distinct_others / 365 * choose(n, 2)
}

# Chance that exactly one pair shares a birthday in a room of 100 people
Birthday_pr_fn(100)
## [1] 5.717602e-06

The probability increases with the number of people, reaching 0.3794431 at n = 25, but as we keep adding people it falls again, to 0.1148493 at n = 50. This is because we are restricting the event to exactly two people sharing a birthday. The more people there are in the room, the more likely it is that more than two people share a birthday, so the probability of exactly one shared pair shrinks.

# Exactly-one-pair probability for every group size from 3 to 60 people
x <- 3:60
y <- vapply(x, Birthday_pr_fn, numeric(1))

# Scatter plot of the probability against the group size
plot(x, y)

Let’s compare the Probability of at least two people having the same birthday.

For a group of 3 people: P(at least 2 people having same birthday) = Probability(first person same birthday with second person) + Pr(first person same bday with 3 rd person) + Pr(second person same bday with third person) + Pr(all 3 having same birthday)

= (365/365)(1/365)(364/365) * 3 + (365/365)(1/365)(1/365) = 0.008204

Notice that in this case it is much easier to compute the probability of the complement of "at least 2 people share a birthday", which is Pr(everybody has a different birthday) = 365/365 * 364/365 * 363/365. Taking 1 - Pr(diff bdays) gives 0.008204.

For 4 people, 1 - Pr(diff bdays) = 1 - (365/365)(364/365)(363/365)(362/365). In general, for n people, the probability is $1 - \frac{364!}{(365-n)!\,365^{\,n-1}}$

for n =25

# Probability that at least two of n people share a birthday (365 equally
# likely days), computed through the complement "all birthdays differ":
#   P(n) = 1 - (364 * 363 * ... * (366 - n)) / 365^(n - 1)
#
# n: number of people in the room (a single non-negative whole number).
# Returns the probability as a single number; 0 when n < 2 and 1 once n
# exceeds 365.
Birthday_pr_fn_2 <- function(n) {
  # A shared birthday needs at least two people. The previous 0:(n - 2) loop
  # counted downwards for n = 1 and returned a large negative number.
  if (n < 2) {
    return(0)
  }

  # One factor (364 - i) / 365 per additional person. Multiplying the ratios
  # (instead of dividing two huge numbers at the end) avoids the Inf / Inf =
  # NaN overflow that occurred for large n.
  i <- seq_len(n - 1) - 1
  all_distinct <- prod((364 - i) / 365)

  1 - all_distinct
}

# Chance that at least two of 25 people share a birthday
Birthday_pr_fn_2(25)
## [1] 0.5686997
# Chance that at least two of 50 people share a birthday
Birthday_pr_fn_2(50)
## [1] 0.9703736

We notice that at n = 25 the probability is 0.57 and at n = 50 the probability is 0.97

# At-least-one-shared-birthday probability for group sizes 3 to 300
x <- 3:300
y <- vapply(x, Birthday_pr_fn_2, numeric(1))

# Scatter plot of the probability against the group size
plot(x, y)

We notice that the probability approaches 1 as the number of people increases toward 100

Problem NO 2

Probability of words

This function takes the file path or URL of a document and calculates the probability of each word

# Relative frequency of every distinct word in a document.
#
# docpath: file path or URL of a text document, passed to readr::read_file().
#
# Returns a two-column character matrix with columns "words_unique" and
# "Pr_w": one row per distinct word, in order of first appearance. cbind()
# coerces the probabilities to text, so callers need as.numeric() on "Pr_w".
# A document without any purely alphabetic word yields a matrix with zero
# rows.
#
# Side effects: installs readr / stringr / dplyr when they are missing and
# attaches them to the search path.
Probability_words_fn <- function(docpath) {
  # requireNamespace() is the cheap way to test for a package; scanning
  # installed.packages() on every call is slow on large libraries.
  # dplyr is not used below; it is still attached in case later code in the
  # session relies on this function having loaded it.
  for (pkg in c("readr", "stringr", "dplyr")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    library(pkg, character.only = TRUE)
  }

  doc <- read_file(docpath)

  # Strip punctuation and lower-case the text
  no_punct <- str_replace_all(doc, pattern = "[[:punct:]]", replacement = "")
  lowered <- tolower(no_punct)

  # Split on any run of whitespace. Treating only "\n" and " " as separators
  # left "\r" or tabs glued to words, which were then discarded as
  # non-alphabetic (e.g. the last word of every line in a CRLF file).
  tokens <- unlist(str_split(lowered, pattern = "\\s+"))
  tokens <- tokens[nzchar(tokens)]

  # Keep purely alphabetic tokens (drops numbers and mixed tokens)
  words_only <- tokens[!str_detect(tokens, "[^a-zA-Z]")]

  Word_count <- length(words_only)
  words_unique <- unique(words_only)

  # Probability of a word = occurrences of the word / total word count.
  # match() + tabulate() counts every word in one pass, keeps first-appearance
  # order, and copes with an empty document (a 1:length() loop does not).
  occurrences <- tabulate(
    match(words_only, words_unique),
    nbins = length(words_unique)
  )
  Pr_w <- occurrences / Word_count

  cbind(words_unique, Pr_w)
}

# Test function: compute the word probabilities of the sample document
docpath <- c("https://raw.githubusercontent.com/nobieyi00/CUNY_MSDA_R/master/assign6.sample.txt")
Probability_words_fn(docpath)
## Warning: package 'readr' was built under R version 3.3.3
## Warning: package 'stringr' was built under R version 3.3.2
## Warning: package 'dplyr' was built under R version 3.3.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
##        words_unique       Pr_w                  
##   [1,] "for"              "0.0232558139534884"  
##   [2,] "a"                "0.0337584396099025"  
##   [3,] "female"           "0.00150037509377344" 
##   [4,] "inmate"           "0.000750187546886722"
##   [5,] "there"            "0.00450112528132033" 
##   [6,] "are"              "0.00675168792198049" 
##   [7,] "few"              "0.000750187546886722"
##   [8,] "places"           "0.000750187546886722"
##   [9,] "worse"            "0.000750187546886722"
##  [10,] "than"             "0.00525131282820705" 
##  [11,] "the"              "0.0570142535633908"  
##  [12,] "julia"            "0.00150037509377344" 
##  [13,] "tutwiler"         "0.0112528132033008"  
##  [14,] "prison"           "0.00825206301575394" 
##  [15,] "women"            "0.00450112528132033" 
##  [16,] "corrections"      "0.00750187546886722" 
##  [17,] "officers"         "0.00450112528132033" 
##  [18,] "have"             "0.00675168792198049" 
##  [19,] "raped"            "0.00150037509377344" 
##  [20,] "beaten"           "0.000750187546886722"
##  [21,] "and"              "0.0285071267816954"  
##  [22,] "harassed"         "0.000750187546886722"
##  [23,] "inside"           "0.00150037509377344" 
##  [24,] "aging"            "0.000750187546886722"
##  [25,] "here"             "0.00225056264066016" 
##  [26,] "at"               "0.00900225056264066" 
##  [27,] "least"            "0.00150037509377344" 
##  [28,] "years"            "0.00525131282820705" 
##  [29,] "according"        "0.000750187546886722"
##  [30,] "to"               "0.0210052513128282"  
##  [31,] "an"               "0.00450112528132033" 
##  [32,] "unfolding"        "0.000750187546886722"
##  [33,] "justice"          "0.00450112528132033" 
##  [34,] "department"       "0.00525131282820705" 
##  [35,] "investigation"    "0.00225056264066016" 
##  [36,] "more"             "0.00450112528132033" 
##  [37,] "third"            "0.000750187546886722"
##  [38,] "of"               "0.0210052513128282"  
##  [39,] "employees"        "0.00150037509377344" 
##  [40,] "had"              "0.00450112528132033" 
##  [41,] "sex"              "0.00300075018754689" 
##  [42,] "with"             "0.00600150037509377" 
##  [43,] "prisoners"        "0.00525131282820705" 
##  [44,] "which"            "0.000750187546886722"
##  [45,] "is"               "0.0165041260315079"  
##  [46,] "sometimes"        "0.00150037509377344" 
##  [47,] "only"             "0.00375093773443361" 
##  [48,] "currency"         "0.000750187546886722"
##  [49,] "basics"           "0.00150037509377344" 
##  [50,] "like"             "0.00375093773443361" 
##  [51,] "toilet"           "0.000750187546886722"
##  [52,] "paper"            "0.000750187546886722"
##  [53,] "tampons"          "0.000750187546886722"
##  [54,] "but"              "0.00675168792198049" 
##  [55,] "whose"            "0.000750187546886722"
##  [56,] "conditions"       "0.00450112528132033" 
##  [57,] "so"               "0.00150037509377344" 
##  [58,] "bad"              "0.00150037509377344" 
##  [59,] "that"             "0.0142535633908477"  
##  [60,] "federal"          "0.00375093773443361" 
##  [61,] "government"       "0.00300075018754689" 
##  [62,] "says"             "0.00150037509377344" 
##  [63,] "they"             "0.00450112528132033" 
##  [64,] "most"             "0.00150037509377344" 
##  [65,] "likely"           "0.000750187546886722"
##  [66,] "unconstitutional" "0.000750187546886722"
##  [67,] "one"              "0.00150037509377344" 
##  [68,] "in"               "0.0210052513128282"  
##  [69,] "series"           "0.00150037509377344" 
##  [70,] "troubled"         "0.000750187546886722"
##  [71,] "prisons"          "0.00600150037509377" 
##  [72,] "state"            "0.00300075018754689" 
##  [73,] "system"           "0.00225056264066016" 
##  [74,] "has"              "0.00525131282820705" 
##  [75,] "secondhighest"    "0.000750187546886722"
##  [76,] "number"           "0.000750187546886722"
##  [77,] "inmates"          "0.00300075018754689" 
##  [78,] "per"              "0.000750187546886722"
##  [79,] "capita"           "0.000750187546886722"
##  [80,] "nation"           "0.00150037509377344" 
##  [81,] "now"              "0.00225056264066016" 
##  [82,] "as"               "0.00450112528132033" 
##  [83,] "alabama"          "0.00375093773443361" 
##  [84,] "faces"            "0.000750187546886722"
##  [85,] "intervention"     "0.000750187546886722"
##  [86,] "legislature"      "0.00225056264066016" 
##  [87,] "weighing"         "0.000750187546886722"
##  [88,] "its"              "0.00750187546886722" 
##  [89,] "spending"         "0.000750187546886722"
##  [90,] "choices"          "0.000750187546886722"
##  [91,] "coming"           "0.000750187546886722"
##  [92,] "year"             "0.00225056264066016" 
##  [93,] "it"               "0.00900225056264066" 
##  [94,] "remains"          "0.00225056264066016" 
##  [95,] "open"             "0.000750187546886722"
##  [96,] "question"         "0.000750187546886722"
##  [97,] "whether"          "0.00150037509377344" 
##  [98,] "recent"           "0.000750187546886722"
##  [99,] "reports"          "0.00150037509377344" 
## [100,] "on"               "0.00375093773443361" 
## [101,] "enough"           "0.000750187546886722"
## [102,] "prompt"           "0.000750187546886722"
## [103,] "reform"           "0.00150037509377344" 
## [104,] "yes"              "0.000750187546886722"
## [105,] "we"               "0.00225056264066016" 
## [106,] "need"             "0.00225056264066016" 
## [107,] "rectify"          "0.000750187546886722"
## [108,] "crimes"           "0.00225056264066016" 
## [109,] "happened"         "0.000750187546886722"
## [110,] "going"            "0.000750187546886722"
## [111,] "forward"          "0.000750187546886722"
## [112,] "bigger"           "0.000750187546886722"
## [113,] "problem"          "0.000750187546886722"
## [114,] "just"             "0.00450112528132033" 
## [115,] "said"             "0.0165041260315079"  
## [116,] "senator"          "0.000750187546886722"
## [117,] "cam"              "0.000750187546886722"
## [118,] "ward"             "0.00225056264066016" 
## [119,] "republican"       "0.00150037509377344" 
## [120,] "from"             "0.00225056264066016" 
## [121,] "alabaster"        "0.000750187546886722"
## [122,] "who"              "0.00675168792198049" 
## [123,] "chairman"         "0.000750187546886722"
## [124,] "senate"           "0.000750187546886722"
## [125,] "judiciary"        "0.000750187546886722"
## [126,] "committee"        "0.000750187546886722"
## [127,] "were"             "0.00450112528132033" 
## [128,] "dealing"          "0.000750187546886722"
## [129,] "box"              "0.000750187546886722"
## [130,] "dynamite"         "0.000750187546886722"
## [131,] "solution"         "0.000750187546886722"
## [132,] "mr"               "0.00375093773443361" 
## [133,] "others"           "0.00150037509377344" 
## [134,] "say"              "0.00150037509377344" 
## [135,] "not"              "0.00225056264066016" 
## [136,] "build"            "0.000750187546886722"
## [137,] "change"           "0.00225056264066016" 
## [138,] "sentencing"       "0.00150037509377344" 
## [139,] "guidelines"       "0.000750187546886722"
## [140,] "filled"           "0.000750187546886722"
## [141,] "well"             "0.00150037509377344" 
## [142,] "beyond"           "0.000750187546886722"
## [143,] "capacity"         "0.00150037509377344" 
## [144,] "over"             "0.00150037509377344" 
## [145,] "half"             "0.000750187546886722"
## [146,] "states"           "0.00150037509377344" 
## [147,] "locked"           "0.000750187546886722"
## [148,] "up"               "0.00150037509377344" 
## [149,] "drug"             "0.00150037509377344" 
## [150,] "property"         "0.000750187546886722"
## [151,] "rate"             "0.000750187546886722"
## [152,] "nonviolent"       "0.000750187546886722"
## [153,] "offenses"         "0.000750187546886722"
## [154,] "among"            "0.00225056264066016" 
## [155,] "highest"          "0.000750187546886722"
## [156,] "no"               "0.00150037509377344" 
## [157,] "wants"            "0.000750187546886722"
## [158,] "be"               "0.00300075018754689" 
## [159,] "soft"             "0.000750187546886722"
## [160,] "crime"            "0.000750187546886722"
## [161,] "way"              "0.000750187546886722"
## [162,] "doing"            "0.000750187546886722"
## [163,] "this"             "0.00300075018754689" 
## [164,] "stupid"           "0.000750187546886722"
## [165,] "still"            "0.00450112528132033" 
## [166,] "many"             "0.00150037509377344" 
## [167,] "corners"          "0.000750187546886722"
## [168,] "where"            "0.00150037509377344" 
## [169,] "political"        "0.000750187546886722"
## [170,] "prominence"       "0.000750187546886722"
## [171,] "often"            "0.000750187546886722"
## [172,] "tied"             "0.000750187546886722"
## [173,] "how"              "0.00150037509377344" 
## [174,] "much"             "0.00150037509377344" 
## [175,] "candidate"        "0.000750187546886722"
## [176,] "disparages"       "0.000750187546886722"
## [177,] "criminals"        "0.000750187546886722"
## [178,] "appetite"         "0.000750187546886722"
## [179,] "minimal"          "0.000750187546886722"
## [180,] "middle"           "0.000750187546886722"
## [181,] "budget"           "0.00150037509377344" 
## [182,] "session"          "0.000750187546886722"
## [183,] "working"          "0.000750187546886722"
## [184,] "document"         "0.000750187546886722"
## [185,] "gov"              "0.000750187546886722"
## [186,] "robert"           "0.000750187546886722"
## [187,] "bentley"          "0.00150037509377344" 
## [188,] "includes"         "0.000750187546886722"
## [189,] "million"          "0.00300075018754689" 
## [190,] "about"            "0.00450112528132033" 
## [191,] "less"             "0.000750187546886722"
## [192,] "last"             "0.00150037509377344" 
## [193,] "argues"           "0.000750187546886722"
## [194,] "needs"            "0.00150037509377344" 
## [195,] "running"          "0.00150037509377344" 
## [196,] "almost"           "0.00150037509377344" 
## [197,] "double"           "0.000750187546886722"
## [198,] "staffing"         "0.00150037509377344" 
## [199,] "dangerously"      "0.000750187546886722"
## [200,] "low"              "0.000750187546886722"
## [201,] "kim"              "0.000750187546886722"
## [202,] "t"                "0.000750187546886722"
## [203,] "thomas"           "0.00225056264066016" 
## [204,] "departments"      "0.000750187546886722"
## [205,] "commissioner"     "0.00150037509377344" 
## [206,] "he"               "0.00675168792198049" 
## [207,] "would"            "0.000750187546886722"
## [208,] "use"              "0.00150037509377344" 
## [209,] "his"              "0.000750187546886722"
## [210,] "request"          "0.000750187546886722"
## [211,] "give"             "0.000750187546886722"
## [212,] "percent"          "0.000750187546886722"
## [213,] "raise"            "0.000750187546886722"
## [214,] "hire"             "0.000750187546886722"
## [215,] "odds"             "0.000750187546886722"
## [216,] "approval"         "0.000750187546886722"
## [217,] "new"              "0.000750187546886722"
## [218,] "money"            "0.00225056264066016" 
## [219,] "great"            "0.000750187546886722"
## [220,] "better"           "0.00225056264066016" 
## [221,] "been"             "0.00675168792198049" 
## [222,] "long"             "0.000750187546886722"
## [223,] "while"            "0.000750187546886722"
## [224,] "stephen"          "0.000750187546886722"
## [225,] "stetson"          "0.000750187546886722"
## [226,] "policy"           "0.00225056264066016" 
## [227,] "analyst"          "0.000750187546886722"
## [228,] "arise"            "0.000750187546886722"
## [229,] "citizens"         "0.000750187546886722"
## [230,] "project"          "0.000750187546886722"
## [231,] "liberal"          "0.000750187546886722"
## [232,] "group"            "0.000750187546886722"
## [233,] "even"             "0.00150037509377344" 
## [234,] "average"          "0.000750187546886722"
## [235,] "legislator"       "0.000750187546886722"
## [236,] "these"            "0.000750187546886722"
## [237,] "bodies"           "0.000750187546886722"
## [238,] "dont"             "0.00150037509377344" 
## [239,] "matter"           "0.000750187546886722"
## [240,] "ignoring"         "0.000750187546886722"
## [241,] "crisis"           "0.000750187546886722"
## [242,] "stacy"            "0.000750187546886722"
## [243,] "george"           "0.000750187546886722"
## [244,] "former"           "0.000750187546886722"
## [245,] "officer"          "0.000750187546886722"
## [246,] "challenging"      "0.000750187546886722"
## [247,] "june"             "0.000750187546886722"
## [248,] "primary"          "0.000750187546886722"
## [249,] "by"               "0.00300075018754689" 
## [250,] "promising"        "0.000750187546886722"
## [251,] "guntoting"        "0.000750187546886722"
## [252,] "governor"         "0.00150037509377344" 
## [253,] "past"             "0.000750187546886722"
## [254,] "week"             "0.000750187546886722"
## [255,] "issued"           "0.00150037509377344" 
## [256,] "plan"             "0.00150037509377344" 
## [257,] "calls"            "0.000750187546886722"
## [258,] "changing"         "0.00150037509377344" 
## [259,] "rules"            "0.000750187546886722"
## [260,] "rescinding"       "0.000750187546886722"
## [261,] "threestrikes"     "0.000750187546886722"
## [262,] "law"              "0.000750187546886722"
## [263,] "repeat"           "0.000750187546886722"
## [264,] "offenders"        "0.00150037509377344" 
## [265,] "releasing"        "0.000750187546886722"
## [266,] "sick"             "0.000750187546886722"
## [267,] "elderly"          "0.000750187546886722"
## [268,] "sending"          "0.000750187546886722"
## [269,] "lowlevel"         "0.000750187546886722"
## [270,] "into"             "0.00225056264066016" 
## [271,] "treatment"        "0.00150037509377344" 
## [272,] "programs"         "0.000750187546886722"
## [273,] "instead"          "0.000750187546886722"
## [274,] "stepped"          "0.000750187546886722"
## [275,] "fix"              "0.00150037509377344" 
## [276,] "alabamas"         "0.000750187546886722"
## [277,] "problems"         "0.00150037509377344" 
## [278,] "before"           "0.00150037509377344" 
## [279,] "since"            "0.00225056264066016" 
## [280,] "faced"            "0.000750187546886722"
## [281,] "situation"        "0.00150037509377344" 
## [282,] "serious"          "0.000750187546886722"
## [283,] "uncovered"        "0.000750187546886722"
## [284,] "damning"          "0.000750187546886722"
## [285,] "investigations"   "0.000750187546886722"
## [286,] "think"            "0.000750187546886722"
## [287,] "very"             "0.00150037509377344" 
## [288,] "strong"           "0.000750187546886722"
## [289,] "case"             "0.000750187546886722"
## [290,] "constitutional"   "0.000750187546886722"
## [291,] "violations"       "0.000750187546886722"
## [292,] "jocelyn"          "0.000750187546886722"
## [293,] "samuels"          "0.000750187546886722"
## [294,] "acting"           "0.000750187546886722"
## [295,] "assistant"        "0.000750187546886722"
## [296,] "attorney"         "0.000750187546886722"
## [297,] "general"          "0.000750187546886722"
## [298,] "civil"            "0.000750187546886722"
## [299,] "rights"           "0.000750187546886722"
## [300,] "sent"             "0.000750187546886722"
## [301,] "report"           "0.00300075018754689" 
## [302,] "january"          "0.00150037509377344" 
## [303,] "toxic"            "0.000750187546886722"
## [304,] "highly"           "0.000750187546886722"
## [305,] "sexualized"       "0.000750187546886722"
## [306,] "environment"      "0.000750187546886722"
## [307,] "she"              "0.00675168792198049" 
## [308,] "interview"        "0.00150037509377344" 
## [309,] "met"              "0.000750187546886722"
## [310,] "deliberate"       "0.000750187546886722"
## [311,] "indifference"     "0.000750187546886722"
## [312,] "part"             "0.000750187546886722"
## [313,] "officials"        "0.000750187546886722"
## [314,] "management"       "0.000750187546886722"
## [315,] "aware"            "0.000750187546886722"
## [316,] "failed"           "0.000750187546886722"
## [317,] "curb"             "0.000750187546886722"
## [318,] "was"              "0.00675168792198049" 
## [319,] "built"            "0.000750187546886722"
## [320,] "named"            "0.000750187546886722"
## [321,] "after"            "0.00375093773443361" 
## [322,] "woman"            "0.000750187546886722"
## [323,] "called"           "0.000750187546886722"
## [324,] "angel"            "0.000750187546886722"
## [325,] "stockades"        "0.000750187546886722"
## [326,] "her"              "0.00225056264066016" 
## [327,] "work"             "0.000750187546886722"
## [328,] "trying"           "0.000750187546886722"
## [329,] "improve"          "0.00150037509377344" 
## [330,] "live"             "0.00150037509377344" 
## [331,] "including"        "0.000750187546886722"
## [332,] "some"             "0.00150037509377344" 
## [333,] "death"            "0.000750187546886722"
## [334,] "row"              "0.000750187546886722"
## [335,] "although"         "0.000750187546886722"
## [336,] "original"         "0.000750187546886722"
## [337,] "building"         "0.000750187546886722"
## [338,] "designed"         "0.000750187546886722"
## [339,] "abysmal"          "0.000750187546886722"
## [340,] "levels"           "0.000750187546886722"
## [341,] "abundant"         "0.000750187546886722"
## [342,] "blind"            "0.000750187546886722"
## [343,] "spots"            "0.000750187546886722"
## [344,] "three"            "0.000750187546886722"
## [345,] "cameras"          "0.000750187546886722"
## [346,] "created"          "0.000750187546886722"
## [347,] "guards"           "0.00150037509377344" 
## [348,] "rampant"          "0.00150037509377344" 
## [349,] "male"             "0.000750187546886722"
## [350,] "routinely"        "0.000750187546886722"
## [351,] "watched"          "0.000750187546886722"
## [352,] "showering"        "0.000750187546886722"
## [353,] "once"             "0.000750187546886722"
## [354,] "helped"           "0.000750187546886722"
## [355,] "organize"         "0.000750187546886722"
## [356,] "strip"            "0.000750187546886722"
## [357,] "show"             "0.000750187546886722"
## [358,] "exchanged"        "0.000750187546886722"
## [359,] "both"             "0.000750187546886722"
## [360,] "banned"           "0.000750187546886722"
## [361,] "items"            "0.00150037509377344" 
## [362,] "drugs"            "0.000750187546886722"
## [363,] "basic"            "0.000750187546886722"
## [364,] "clean"            "0.000750187546886722"
## [365,] "uniforms"         "0.000750187546886722"
## [366,] "six"              "0.00300075018754689" 
## [367,] "convicted"        "0.000750187546886722"
## [368,] "sexual"           "0.00225056264066016" 
## [369,] "investigating"    "0.000750187546886722"
## [370,] "scrutinizing"     "0.000750187546886722"
## [371,] "medical"          "0.00150037509377344" 
## [372,] "mental"           "0.00150037509377344" 
## [373,] "health"           "0.00150037509377344" 
## [374,] "care"             "0.000750187546886722"
## [375,] "culture"          "0.000750187546886722"
## [376,] "deprivation"      "0.000750187546886722"
## [377,] "abuse"            "0.00225056264066016" 
## [378,] "institutions"     "0.000750187546886722"
## [379,] "across"           "0.000750187546886722"
## [380,] "charlotte"        "0.000750187546886722"
## [381,] "morrison"         "0.000750187546886722"
## [382,] "senior"           "0.000750187546886722"
## [383,] "lawyer"           "0.000750187546886722"
## [384,] "equal"            "0.00150037509377344" 
## [385,] "initiative"       "0.00150037509377344" 
## [386,] "legal"            "0.000750187546886722"
## [387,] "organization"     "0.00150037509377344" 
## [388,] "represents"       "0.000750187546886722"
## [389,] "indigent"         "0.000750187546886722"
## [390,] "defendants"       "0.000750187546886722"
## [391,] "asked"            "0.00150037509377344" 
## [392,] "step"             "0.000750187546886722"
## [393,] "own"              "0.000750187546886722"
## [394,] "showed"           "0.000750187546886722"
## [395,] "beginning"        "0.000750187546886722"
## [396,] "began"            "0.000750187546886722"
## [397,] "april"            "0.000750187546886722"
## [398,] "months"           "0.00225056264066016" 
## [399,] "came"             "0.00150037509377344" 
## [400,] "out"              "0.000750187546886722"
## [401,] "may"              "0.000750187546886722"
## [402,] "longtime"         "0.000750187546886722"
## [403,] "warden"           "0.000750187546886722"
## [404,] "other"            "0.00150037509377344" 
## [405,] "top"              "0.00150037509377344" 
## [406,] "replaced"         "0.000750187546886722"
## [407,] "also"             "0.000750187546886722"
## [408,] "national"         "0.000750187546886722"
## [409,] "institute"        "0.000750187546886722"
## [410,] "review"           "0.000750187546886722"
## [411,] "practices"        "0.000750187546886722"
## [412,] "policies"         "0.00150037509377344" 
## [413,] "using"            "0.000750187546886722"
## [414,] "those"            "0.000750187546886722"
## [415,] "findings"         "0.000750187546886722"
## [416,] "wideranging"      "0.000750187546886722"
## [417,] "included"         "0.000750187546886722"
## [418,] "recruiting"       "0.000750187546886722"
## [419,] "pressing"         "0.000750187546886722"
## [420,] "several"          "0.000750187546886722"
## [421,] "procedures"       "0.000750187546886722"
## [422,] "them"             "0.000750187546886722"
## [423,] "investigate"      "0.000750187546886722"
## [424,] "track"            "0.000750187546886722"
## [425,] "assaults"         "0.000750187546886722"
## [426,] "because"          "0.000750187546886722"
## [427,] "i"                "0.000750187546886722"
## [428,] "wanted"           "0.000750187546886722"
## [429,] "abundance"        "0.000750187546886722"
## [430,] "caution"          "0.000750187546886722"
## [431,] "transparent"      "0.000750187546886722"
## [432,] "recently"         "0.00150037509377344" 
## [433,] "released"         "0.00150037509377344" 
## [434,] "life"             "0.00150037509377344" 
## [435,] "improved"         "0.000750187546886722"
## [436,] "marginally"       "0.000750187546886722"
## [437,] "monica"           "0.000750187546886722"
## [438,] "washington"       "0.00150037509377344" 
## [439,] "serving"          "0.000750187546886722"
## [440,] "armed"            "0.000750187546886722"
## [441,] "robbery"          "0.000750187546886722"
## [442,] "guard"            "0.00150037509377344" 
## [443,] "gave"             "0.000750187546886722"
## [444,] "birth"            "0.000750187546886722"
## [445,] "daughter"         "0.000750187546886722"
## [446,] "living"           "0.000750187546886722"
## [447,] "relatives"        "0.000750187546886722"
## [448,] "near"             "0.00150037509377344" 
## [449,] "montgomery"       "0.000750187546886722"
## [450,] "rodney"           "0.000750187546886722"
## [451,] "arbuthnot"        "0.000750187546886722"
## [452,] "served"           "0.00150037509377344" 
## [453,] "jail"             "0.000750187546886722"
## [454,] "custodial"        "0.000750187546886722"
## [455,] "misconduct"       "0.000750187546886722"
## [456,] "moved"            "0.000750187546886722"
## [457,] "texas"            "0.000750187546886722"
## [458,] "courts"           "0.000750187546886722"
## [459,] "tracked"          "0.000750187546886722"
## [460,] "him"              "0.00150037509377344" 
## [461,] "down"             "0.000750187546886722"
## [462,] "family"           "0.000750187546886722"
## [463,] "finally"          "0.000750187546886722"
## [464,] "getting"          "0.000750187546886722"
## [465,] "month"            "0.000750187546886722"
## [466,] "child"            "0.00150037509377344" 
## [467,] "support"          "0.00150037509377344" 
## [468,] "telephone"        "0.000750187546886722"
## [469,] "ms"               "0.00150037509377344" 
## [470,] "fearful"          "0.000750187546886722"
## [471,] "remained"         "0.000750187546886722"
## [472,] "right"            "0.000750187546886722"
## [473,] "me"               "0.000750187546886722"
## [474,] "personally"       "0.000750187546886722"
## [475,] "same"             "0.000750187546886722"
## [476,] "far"              "0.000750187546886722"
## [477,] "act"              "0.000750187546886722"
## [478,] "congress"         "0.000750187546886722"
## [479,] "get"              "0.00300075018754689" 
## [480,] "things"           "0.000750187546886722"
## [481,] "you"              "0.000750187546886722"
## [482,] "inhumane"         "0.000750187546886722"
## [483,] "period"           "0.000750187546886722"
## [484,] "marsha"           "0.000750187546886722"
## [485,] "colby"            "0.00150037509377344" 
## [486,] "mother"           "0.000750187546886722"
## [487,] "sentence"         "0.000750187546886722"
## [488,] "without"          "0.000750187546886722"
## [489,] "parole"           "0.000750187546886722"
## [490,] "murder"           "0.000750187546886722"
## [491,] "conviction"       "0.00150037509377344" 
## [492,] "premature"        "0.000750187546886722"
## [493,] "son"              "0.000750187546886722"
## [494,] "stillborn"        "0.000750187546886722"
## [495,] "buried"           "0.000750187546886722"
## [496,] "marked"           "0.000750187546886722"
## [497,] "grave"            "0.000750187546886722"
## [498,] "home"             "0.000750187546886722"
## [499,] "examiner"         "0.000750187546886722"
## [500,] "drowned"          "0.000750187546886722"
## [501,] "bathtub"          "0.000750187546886722"
## [502,] "overturned"       "0.000750187546886722"
## [503,] "court"            "0.000750187546886722"
## [504,] "agreed"           "0.000750187546886722"
## [505,] "autopsy"          "0.000750187546886722"
## [506,] "botched"          "0.000750187546886722"
## [507,] "december"         "0.000750187546886722"
## [508,] "contact"          "0.000750187546886722"
## [509,] "split"            "0.000750187546886722"
## [510,] "attention"        "0.000750187546886722"
## [511,] "good"             "0.000750187546886722"
## [512,] "thing"            "0.000750187546886722"
## [513,] "important"        "0.000750187546886722"
## [514,] "commodity"        "0.000750187546886722"
## [515,] "secure"           "0.000750187546886722"
## [516,] "contraband"       "0.000750187546886722"
## [517,] "can"              "0.00150037509377344" 
## [518,] "then"             "0.000750187546886722"
## [519,] "sell"             "0.000750187546886722"
## [520,] "food"             "0.000750187546886722"
## [521,] "do"               "0.000750187546886722"
## [522,] "favors"           "0.000750187546886722"
## [523,] "makeup"           "0.000750187546886722"
## [524,] "cologne"          "0.000750187546886722"
## [525,] "anything"         "0.00150037509377344" 
## [526,] "thats"            "0.00150037509377344" 
## [527,] "stuff"            "0.000750187546886722"
## [528,] "resellable"       "0.000750187546886722"
## [529,] "make"             "0.000750187546886722"
## [530,] "their"            "0.000750187546886722"
## [531,] "believe"          "0.000750187546886722"
## [532,] "will"             "0.000750187546886722"
## [533,] "take"             "0.000750187546886722"
## [534,] "larger"           "0.000750187546886722"
## [535,] "overhaul"         "0.000750187546886722"
## [536,] "primitive"        "0.000750187546886722"
## [537,] "backward"         "0.000750187546886722"
## [538,] "larry"            "0.000750187546886722"
## [539,] "f"                "0.000750187546886722"
## [540,] "wood"             "0.000750187546886722"
## [541,] "clinical"         "0.000750187546886722"
## [542,] "psychologist"     "0.000750187546886722"
## [543,] "hired"            "0.000750187546886722"
## [544,] "quit"             "0.000750187546886722"
## [545,] "two"              "0.000750187546886722"
## [546,] "appalled"         "0.000750187546886722"
## [547,] "what"             "0.00150037509377344" 
## [548,] "administrations"  "0.000750187546886722"
## [549,] "lack"             "0.000750187546886722"
## [550,] "services"         "0.000750187546886722"
## [551,] "ive"              "0.00150037509377344" 
## [552,] "worked"           "0.000750187546886722"
## [553,] "never"            "0.000750187546886722"
## [554,] "seen"             "0.000750187546886722"
## [555,] "back"             "0.000750187546886722"
## [556,] "look"             "0.000750187546886722"
## [557,] "fresh"            "0.000750187546886722"
## [558,] "eyes"             "0.000750187546886722"
## [559,] "people"           "0.000750187546886722"
## [560,] "perspective"      "0.000750187546886722"
## [561,] "see"              "0.000750187546886722"

Probability of specific words appearing in the document and joint probability of the words appearing adjacent to each other

# Probability of two specific words in a document, and the probability that
# they appear next to each other.
#
# docpath: file path or URL of a text document, passed to readr::read_file().
# word1, word2: the two words to look for. Only the first element of each is
#   used, and it is lower-cased because the document text is lower-cased.
#
# Returns a named numeric vector:
#   Prob_word_1    occurrences of word1 / number of words
#   Prob_word_2    occurrences of word2 / number of words
#   Joint_Prob_adj adjacent pairs made of word1 and word2 (either order) /
#                  number of adjacent pairs (number of words - 1)
# Entries are NA when the document has too few words for the ratio to be
# defined.
#
# Side effects: installs readr / stringr / dplyr when they are missing and
# attaches them to the search path.
Probability_words_joint_fn <- function(docpath, word1, word2) {
  # requireNamespace() is the cheap way to test for a package; scanning
  # installed.packages() on every call is slow on large libraries.
  # dplyr is not used below; it is still attached in case later code in the
  # session relies on this function having loaded it.
  for (pkg in c("readr", "stringr", "dplyr")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    library(pkg, character.only = TRUE)
  }

  doc <- read_file(docpath)

  # Strip punctuation and lower-case the text
  no_punct <- str_replace_all(doc, pattern = "[[:punct:]]", replacement = "")
  lowered <- tolower(no_punct)

  # Split on any run of whitespace so "\r" and tabs also separate words
  tokens <- unlist(str_split(lowered, pattern = "\\s+"))
  tokens <- tokens[nzchar(tokens)]

  # Keep purely alphabetic tokens (drops numbers and mixed tokens)
  words_only <- tokens[!str_detect(tokens, "[^a-zA-Z]")]
  Word_count <- length(words_only)

  var1 <- tolower(word1[1])
  var2 <- tolower(word2[1])

  # No words at all: every ratio is undefined
  if (Word_count == 0) {
    return(c(
      Prob_word_1 = NA_real_,
      Prob_word_2 = NA_real_,
      Joint_Prob_adj = NA_real_
    ))
  }

  Pr_var_1 <- sum(words_only %in% var1) / Word_count
  Pr_var_2 <- sum(words_only %in% var2) / Word_count

  # Compare every word with its right-hand neighbour. A pair counts only when
  # it consists of word1 and word2 (in either order): a repeated word such as
  # "the the" is not a match, and every adjacent pair inside a longer run
  # ("for the for") is counted.
  n_pairs <- Word_count - 1
  if (n_pairs == 0) {
    Prob_adj <- NA_real_
  } else {
    left <- words_only[-Word_count]
    right <- words_only[-1]
    is_pair <- (left %in% var1 & right %in% var2) |
      (left %in% var2 & right %in% var1)
    # N words contain N - 1 adjacent pairs, so that is the denominator
    Prob_adj <- sum(is_pair) / n_pairs
  }

  c(Prob_word_1 = Pr_var_1, Prob_word_2 = Pr_var_2, Joint_Prob_adj = Prob_adj)
}

# Test function: probabilities of "for" and "the" in the sample document,
# plus the probability of finding the two words next to each other
docpath <- c("https://raw.githubusercontent.com/nobieyi00/CUNY_MSDA_R/master/assign6.sample.txt")
word1<- c("for") 
word2 <- c("the")
Probability_words_joint_fn(docpath, word1, word2) 
##    Prob_word_1    Prob_word_2 Joint_Prob_adj 
##     0.02325581     0.05701425     0.00600150