Problem set 1. 1) Roll a die 3 times: how many possible outcomes are there?
a die has 6 sides so the total possible outcomes for rolling a die 3 times is 6 x 6 x 6 = 216
When rolling two dice, to get a sum of 3 we need to get (2,1) or (1,2) out of a possible 36 total outcomes, so the probability is 2/36 = 1/18.
For 3 people in the room, the probability that exactly two people share a birthday is P(Person A and Person B have the same birthday and Person C a different birthday) + P(Person A and Person C have the same birthday while Person B's is different) + P(Person B and Person C have the same birthday while Person A's is different) = (365/365)(1/365)(364/365) + (365/365)(1/365)(364/365) + (365/365)(1/365)(364/365) = 3 * (364/365^2)
For 4 people in the room, the probability that exactly two people share a birthday is built from P(Person_1) = 365/365, P(Person_2) = 1/365, P(Person_3) = 364/365, P(Person_4) = 363/365, multiplied by 6 because there are six different pairs of people who could be the matching pair: $6 \cdot \frac{365 \cdot 1 \cdot 364 \cdot 363}{365^4}$. From this we can come up with a formula for any number of people n in the room.
The probability that exactly two people share a birthday (and everyone else has a distinct birthday) among n people in the same room is $\frac{364!}{(366-n)!\,365^{n-1}} \cdot \frac{n(n-1)}{2}$
for n =25
#P_25 <- factorial(364)/(factorial((366-25))*365^(25-1))
# Probability that exactly one pair among `n` people shares a birthday while
# everyone else has a distinct birthday (365 equally likely days).
#
#   P(n) = choose(n, 2) * (1 / 365) * prod_{i = 0}^{n - 3} (364 - i) / 365
#
# Args:
#   n: number of people in the room; a single non-negative whole number.
# Returns:
#   A single probability in [0, 1]. Fewer than two people cannot form a pair,
#   so the result is 0 for n < 2; it is also 0 once n exceeds 366.
Birthday_pr_fn <- function(n){
  stopifnot(is.numeric(n), length(n) == 1, !is.na(n), n >= 0)
  if (n < 2) {
    return(0)
  }
  # Multiply ratios that are each < 1 instead of dividing one huge product by
  # 365^(n - 1): the latter overflows to Inf / Inf = NaN for large n.
  # seq_len() (unlike 0:(n - 3)) is empty when n == 2, so no factor is applied.
  p_rest_distinct <- prod((365 - seq_len(n - 2)) / 365)
  choose(n, 2) * p_rest_distinct / 365
}
# Evaluate the exactly-one-shared-pair probability for a room of 100 people.
Birthday_pr_fn(100)
## [1] 5.717602e-06
The probability increases as the number of people grows to 25, where it is 0.3794431, but as we keep increasing the number of people the probability falls again, reaching 0.1148493 at n = 50. This is because we are limiting the match to exactly two people: the more people there are in the room, the more likely it is that more than two people (or more than one pair) share a birthday, so the probability of exactly one shared pair goes down.
# Scatter-plot the exactly-one-shared-pair probability for room sizes 3 to 60.
# The names `x` and `y` are kept because plot() uses them as axis labels.
x <- 3:60
y <- vapply(x, Birthday_pr_fn, numeric(1))
plot(x, y)
Let’s compare the Probability of at least two people having the same birthday.
For a group of 3 people: P(at least 2 people having the same birthday) = Pr(only the first and second persons share a birthday) + Pr(only the first and third persons share a birthday) + Pr(only the second and third persons share a birthday) + Pr(all 3 have the same birthday)
= (365/365)(1/365)(364/365) * 3 + (365/365)(1/365)(1/365) = 0.008204
Notice that in this case it is much easier to compute the probability of the complement of "at least 2 people share a birthday", which is Pr(everybody has a different birthday) = 365/365 * 364/365 * 363/365; taking 1 - Pr(diff bdays) gives 0.008204.
For 4 people, 1 - Pr(diff bdays) = 1 - (365/365)(364/365)(363/365)(362/365); in general, for n people, $1 - \frac{364!}{(365-n)!\,365^{n-1}}$
for n =25
# Probability that at least two of `n` people share a birthday
# (365 equally likely days), computed via the complement:
#
#   P(n) = 1 - prod_{i = 0}^{n - 2} (364 - i) / 365
#
# Args:
#   n: number of people in the room; a single non-negative whole number.
# Returns:
#   A single probability in [0, 1]: 0 for n < 2 and exactly 1 for n >= 366.
Birthday_pr_fn_2 <- function(n){
  stopifnot(is.numeric(n), length(n) == 1, !is.na(n), n >= 0)
  if (n < 2) {
    return(0)
  }
  # Multiply per-person ratios rather than dividing a huge product by
  # 365^(n - 1); the raw product and the power both overflow to Inf (giving
  # NaN) for n beyond roughly 125, which the ratio form avoids.
  p_all_distinct <- prod((365 - seq_len(n - 1)) / 365)
  1 - p_all_distinct
}
# Evaluate the at-least-one-shared-birthday probability for rooms of 25 and 50 people.
Birthday_pr_fn_2(25)
## [1] 0.5686997
Birthday_pr_fn_2(50)
## [1] 0.9703736
We notice that at n = 25 the probability is 0.56 and at n = 50 the probability is 0.97.
# Scatter-plot the at-least-one-shared-birthday probability for room sizes 3 to 300.
# The names `x` and `y` are kept because plot() uses them as axis labels.
x <- 3:300
y <- vapply(x, Birthday_pr_fn_2, numeric(1))
plot(x, y)
We notice that the probability approaches 1 as the number of people increases to 100.
Problem NO 2
Probability of words
This function takes the file path or URL of a document and calculates the probability of each word.
# Compute the relative frequency of every word in a plain-text document.
#
# Processing steps: strip punctuation, lower-case, split on whitespace, then
# discard every token that is not made purely of the letters a-z (numbers,
# tokens containing symbols, accented words). Note that stripping punctuation
# joins hyphenated words and contractions ("second-highest" -> "secondhighest",
# "don't" -> "dont").
#
# Uses base R only: nothing is installed or attached as a side effect.
#
# Args:
#   docpath: a single string, the file path or URL of the document.
# Returns:
#   A data frame with one row per distinct word, in order of first appearance:
#     words_unique  the word (character)
#     Pr_w          occurrences of the word / total number of kept words (numeric)
#   A zero-row data frame is returned when the document contains no words.
Probability_words_fn <- function(docpath){
  stopifnot(is.character(docpath), length(docpath) == 1)

  doc_lines <- readLines(docpath, warn = FALSE, encoding = "UTF-8")

  # \p{P} is the Unicode "punctuation" category, so curly quotes and dashes
  # are removed as well as ASCII punctuation.
  no_punct <- gsub("\\p{P}", "", doc_lines, perl = TRUE)

  # Split on any run of whitespace (spaces, tabs, stray carriage returns) so
  # words separated by something other than a single space are not lost.
  tokens <- unlist(strsplit(tolower(no_punct), "\\s+"), use.names = FALSE)

  # Keep purely alphabetic tokens; this also drops the empty strings that
  # strsplit() yields for leading whitespace.
  words_only <- tokens[grepl("^[a-zA-Z]+$", tokens, perl = TRUE)]

  if (length(words_only) == 0) {
    return(data.frame(words_unique = character(0), Pr_w = numeric(0),
                      stringsAsFactors = FALSE))
  }

  words_unique <- unique(words_only)
  # match() + tabulate() counts every word in one pass and keeps the
  # first-appearance order of words_unique.
  word_counts <- tabulate(match(words_only, words_unique),
                          nbins = length(words_unique))

  # A data frame (not cbind) keeps the probabilities numeric instead of
  # coercing them to character strings.
  data.frame(words_unique = words_unique,
             Pr_w = word_counts / length(words_only),
             stringsAsFactors = FALSE)
}
# Example run: compute the per-word probabilities for the sample text file at this URL.
docpath <- c("https://raw.githubusercontent.com/nobieyi00/CUNY_MSDA_R/master/assign6.sample.txt")
Probability_words_fn(docpath)
## Warning: package 'readr' was built under R version 3.3.3
## Warning: package 'stringr' was built under R version 3.3.2
## Warning: package 'dplyr' was built under R version 3.3.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## words_unique Pr_w
## [1,] "for" "0.0232558139534884"
## [2,] "a" "0.0337584396099025"
## [3,] "female" "0.00150037509377344"
## [4,] "inmate" "0.000750187546886722"
## [5,] "there" "0.00450112528132033"
## [6,] "are" "0.00675168792198049"
## [7,] "few" "0.000750187546886722"
## [8,] "places" "0.000750187546886722"
## [9,] "worse" "0.000750187546886722"
## [10,] "than" "0.00525131282820705"
## [11,] "the" "0.0570142535633908"
## [12,] "julia" "0.00150037509377344"
## [13,] "tutwiler" "0.0112528132033008"
## [14,] "prison" "0.00825206301575394"
## [15,] "women" "0.00450112528132033"
## [16,] "corrections" "0.00750187546886722"
## [17,] "officers" "0.00450112528132033"
## [18,] "have" "0.00675168792198049"
## [19,] "raped" "0.00150037509377344"
## [20,] "beaten" "0.000750187546886722"
## [21,] "and" "0.0285071267816954"
## [22,] "harassed" "0.000750187546886722"
## [23,] "inside" "0.00150037509377344"
## [24,] "aging" "0.000750187546886722"
## [25,] "here" "0.00225056264066016"
## [26,] "at" "0.00900225056264066"
## [27,] "least" "0.00150037509377344"
## [28,] "years" "0.00525131282820705"
## [29,] "according" "0.000750187546886722"
## [30,] "to" "0.0210052513128282"
## [31,] "an" "0.00450112528132033"
## [32,] "unfolding" "0.000750187546886722"
## [33,] "justice" "0.00450112528132033"
## [34,] "department" "0.00525131282820705"
## [35,] "investigation" "0.00225056264066016"
## [36,] "more" "0.00450112528132033"
## [37,] "third" "0.000750187546886722"
## [38,] "of" "0.0210052513128282"
## [39,] "employees" "0.00150037509377344"
## [40,] "had" "0.00450112528132033"
## [41,] "sex" "0.00300075018754689"
## [42,] "with" "0.00600150037509377"
## [43,] "prisoners" "0.00525131282820705"
## [44,] "which" "0.000750187546886722"
## [45,] "is" "0.0165041260315079"
## [46,] "sometimes" "0.00150037509377344"
## [47,] "only" "0.00375093773443361"
## [48,] "currency" "0.000750187546886722"
## [49,] "basics" "0.00150037509377344"
## [50,] "like" "0.00375093773443361"
## [51,] "toilet" "0.000750187546886722"
## [52,] "paper" "0.000750187546886722"
## [53,] "tampons" "0.000750187546886722"
## [54,] "but" "0.00675168792198049"
## [55,] "whose" "0.000750187546886722"
## [56,] "conditions" "0.00450112528132033"
## [57,] "so" "0.00150037509377344"
## [58,] "bad" "0.00150037509377344"
## [59,] "that" "0.0142535633908477"
## [60,] "federal" "0.00375093773443361"
## [61,] "government" "0.00300075018754689"
## [62,] "says" "0.00150037509377344"
## [63,] "they" "0.00450112528132033"
## [64,] "most" "0.00150037509377344"
## [65,] "likely" "0.000750187546886722"
## [66,] "unconstitutional" "0.000750187546886722"
## [67,] "one" "0.00150037509377344"
## [68,] "in" "0.0210052513128282"
## [69,] "series" "0.00150037509377344"
## [70,] "troubled" "0.000750187546886722"
## [71,] "prisons" "0.00600150037509377"
## [72,] "state" "0.00300075018754689"
## [73,] "system" "0.00225056264066016"
## [74,] "has" "0.00525131282820705"
## [75,] "secondhighest" "0.000750187546886722"
## [76,] "number" "0.000750187546886722"
## [77,] "inmates" "0.00300075018754689"
## [78,] "per" "0.000750187546886722"
## [79,] "capita" "0.000750187546886722"
## [80,] "nation" "0.00150037509377344"
## [81,] "now" "0.00225056264066016"
## [82,] "as" "0.00450112528132033"
## [83,] "alabama" "0.00375093773443361"
## [84,] "faces" "0.000750187546886722"
## [85,] "intervention" "0.000750187546886722"
## [86,] "legislature" "0.00225056264066016"
## [87,] "weighing" "0.000750187546886722"
## [88,] "its" "0.00750187546886722"
## [89,] "spending" "0.000750187546886722"
## [90,] "choices" "0.000750187546886722"
## [91,] "coming" "0.000750187546886722"
## [92,] "year" "0.00225056264066016"
## [93,] "it" "0.00900225056264066"
## [94,] "remains" "0.00225056264066016"
## [95,] "open" "0.000750187546886722"
## [96,] "question" "0.000750187546886722"
## [97,] "whether" "0.00150037509377344"
## [98,] "recent" "0.000750187546886722"
## [99,] "reports" "0.00150037509377344"
## [100,] "on" "0.00375093773443361"
## [101,] "enough" "0.000750187546886722"
## [102,] "prompt" "0.000750187546886722"
## [103,] "reform" "0.00150037509377344"
## [104,] "yes" "0.000750187546886722"
## [105,] "we" "0.00225056264066016"
## [106,] "need" "0.00225056264066016"
## [107,] "rectify" "0.000750187546886722"
## [108,] "crimes" "0.00225056264066016"
## [109,] "happened" "0.000750187546886722"
## [110,] "going" "0.000750187546886722"
## [111,] "forward" "0.000750187546886722"
## [112,] "bigger" "0.000750187546886722"
## [113,] "problem" "0.000750187546886722"
## [114,] "just" "0.00450112528132033"
## [115,] "said" "0.0165041260315079"
## [116,] "senator" "0.000750187546886722"
## [117,] "cam" "0.000750187546886722"
## [118,] "ward" "0.00225056264066016"
## [119,] "republican" "0.00150037509377344"
## [120,] "from" "0.00225056264066016"
## [121,] "alabaster" "0.000750187546886722"
## [122,] "who" "0.00675168792198049"
## [123,] "chairman" "0.000750187546886722"
## [124,] "senate" "0.000750187546886722"
## [125,] "judiciary" "0.000750187546886722"
## [126,] "committee" "0.000750187546886722"
## [127,] "were" "0.00450112528132033"
## [128,] "dealing" "0.000750187546886722"
## [129,] "box" "0.000750187546886722"
## [130,] "dynamite" "0.000750187546886722"
## [131,] "solution" "0.000750187546886722"
## [132,] "mr" "0.00375093773443361"
## [133,] "others" "0.00150037509377344"
## [134,] "say" "0.00150037509377344"
## [135,] "not" "0.00225056264066016"
## [136,] "build" "0.000750187546886722"
## [137,] "change" "0.00225056264066016"
## [138,] "sentencing" "0.00150037509377344"
## [139,] "guidelines" "0.000750187546886722"
## [140,] "filled" "0.000750187546886722"
## [141,] "well" "0.00150037509377344"
## [142,] "beyond" "0.000750187546886722"
## [143,] "capacity" "0.00150037509377344"
## [144,] "over" "0.00150037509377344"
## [145,] "half" "0.000750187546886722"
## [146,] "states" "0.00150037509377344"
## [147,] "locked" "0.000750187546886722"
## [148,] "up" "0.00150037509377344"
## [149,] "drug" "0.00150037509377344"
## [150,] "property" "0.000750187546886722"
## [151,] "rate" "0.000750187546886722"
## [152,] "nonviolent" "0.000750187546886722"
## [153,] "offenses" "0.000750187546886722"
## [154,] "among" "0.00225056264066016"
## [155,] "highest" "0.000750187546886722"
## [156,] "no" "0.00150037509377344"
## [157,] "wants" "0.000750187546886722"
## [158,] "be" "0.00300075018754689"
## [159,] "soft" "0.000750187546886722"
## [160,] "crime" "0.000750187546886722"
## [161,] "way" "0.000750187546886722"
## [162,] "doing" "0.000750187546886722"
## [163,] "this" "0.00300075018754689"
## [164,] "stupid" "0.000750187546886722"
## [165,] "still" "0.00450112528132033"
## [166,] "many" "0.00150037509377344"
## [167,] "corners" "0.000750187546886722"
## [168,] "where" "0.00150037509377344"
## [169,] "political" "0.000750187546886722"
## [170,] "prominence" "0.000750187546886722"
## [171,] "often" "0.000750187546886722"
## [172,] "tied" "0.000750187546886722"
## [173,] "how" "0.00150037509377344"
## [174,] "much" "0.00150037509377344"
## [175,] "candidate" "0.000750187546886722"
## [176,] "disparages" "0.000750187546886722"
## [177,] "criminals" "0.000750187546886722"
## [178,] "appetite" "0.000750187546886722"
## [179,] "minimal" "0.000750187546886722"
## [180,] "middle" "0.000750187546886722"
## [181,] "budget" "0.00150037509377344"
## [182,] "session" "0.000750187546886722"
## [183,] "working" "0.000750187546886722"
## [184,] "document" "0.000750187546886722"
## [185,] "gov" "0.000750187546886722"
## [186,] "robert" "0.000750187546886722"
## [187,] "bentley" "0.00150037509377344"
## [188,] "includes" "0.000750187546886722"
## [189,] "million" "0.00300075018754689"
## [190,] "about" "0.00450112528132033"
## [191,] "less" "0.000750187546886722"
## [192,] "last" "0.00150037509377344"
## [193,] "argues" "0.000750187546886722"
## [194,] "needs" "0.00150037509377344"
## [195,] "running" "0.00150037509377344"
## [196,] "almost" "0.00150037509377344"
## [197,] "double" "0.000750187546886722"
## [198,] "staffing" "0.00150037509377344"
## [199,] "dangerously" "0.000750187546886722"
## [200,] "low" "0.000750187546886722"
## [201,] "kim" "0.000750187546886722"
## [202,] "t" "0.000750187546886722"
## [203,] "thomas" "0.00225056264066016"
## [204,] "departments" "0.000750187546886722"
## [205,] "commissioner" "0.00150037509377344"
## [206,] "he" "0.00675168792198049"
## [207,] "would" "0.000750187546886722"
## [208,] "use" "0.00150037509377344"
## [209,] "his" "0.000750187546886722"
## [210,] "request" "0.000750187546886722"
## [211,] "give" "0.000750187546886722"
## [212,] "percent" "0.000750187546886722"
## [213,] "raise" "0.000750187546886722"
## [214,] "hire" "0.000750187546886722"
## [215,] "odds" "0.000750187546886722"
## [216,] "approval" "0.000750187546886722"
## [217,] "new" "0.000750187546886722"
## [218,] "money" "0.00225056264066016"
## [219,] "great" "0.000750187546886722"
## [220,] "better" "0.00225056264066016"
## [221,] "been" "0.00675168792198049"
## [222,] "long" "0.000750187546886722"
## [223,] "while" "0.000750187546886722"
## [224,] "stephen" "0.000750187546886722"
## [225,] "stetson" "0.000750187546886722"
## [226,] "policy" "0.00225056264066016"
## [227,] "analyst" "0.000750187546886722"
## [228,] "arise" "0.000750187546886722"
## [229,] "citizens" "0.000750187546886722"
## [230,] "project" "0.000750187546886722"
## [231,] "liberal" "0.000750187546886722"
## [232,] "group" "0.000750187546886722"
## [233,] "even" "0.00150037509377344"
## [234,] "average" "0.000750187546886722"
## [235,] "legislator" "0.000750187546886722"
## [236,] "these" "0.000750187546886722"
## [237,] "bodies" "0.000750187546886722"
## [238,] "dont" "0.00150037509377344"
## [239,] "matter" "0.000750187546886722"
## [240,] "ignoring" "0.000750187546886722"
## [241,] "crisis" "0.000750187546886722"
## [242,] "stacy" "0.000750187546886722"
## [243,] "george" "0.000750187546886722"
## [244,] "former" "0.000750187546886722"
## [245,] "officer" "0.000750187546886722"
## [246,] "challenging" "0.000750187546886722"
## [247,] "june" "0.000750187546886722"
## [248,] "primary" "0.000750187546886722"
## [249,] "by" "0.00300075018754689"
## [250,] "promising" "0.000750187546886722"
## [251,] "guntoting" "0.000750187546886722"
## [252,] "governor" "0.00150037509377344"
## [253,] "past" "0.000750187546886722"
## [254,] "week" "0.000750187546886722"
## [255,] "issued" "0.00150037509377344"
## [256,] "plan" "0.00150037509377344"
## [257,] "calls" "0.000750187546886722"
## [258,] "changing" "0.00150037509377344"
## [259,] "rules" "0.000750187546886722"
## [260,] "rescinding" "0.000750187546886722"
## [261,] "threestrikes" "0.000750187546886722"
## [262,] "law" "0.000750187546886722"
## [263,] "repeat" "0.000750187546886722"
## [264,] "offenders" "0.00150037509377344"
## [265,] "releasing" "0.000750187546886722"
## [266,] "sick" "0.000750187546886722"
## [267,] "elderly" "0.000750187546886722"
## [268,] "sending" "0.000750187546886722"
## [269,] "lowlevel" "0.000750187546886722"
## [270,] "into" "0.00225056264066016"
## [271,] "treatment" "0.00150037509377344"
## [272,] "programs" "0.000750187546886722"
## [273,] "instead" "0.000750187546886722"
## [274,] "stepped" "0.000750187546886722"
## [275,] "fix" "0.00150037509377344"
## [276,] "alabamas" "0.000750187546886722"
## [277,] "problems" "0.00150037509377344"
## [278,] "before" "0.00150037509377344"
## [279,] "since" "0.00225056264066016"
## [280,] "faced" "0.000750187546886722"
## [281,] "situation" "0.00150037509377344"
## [282,] "serious" "0.000750187546886722"
## [283,] "uncovered" "0.000750187546886722"
## [284,] "damning" "0.000750187546886722"
## [285,] "investigations" "0.000750187546886722"
## [286,] "think" "0.000750187546886722"
## [287,] "very" "0.00150037509377344"
## [288,] "strong" "0.000750187546886722"
## [289,] "case" "0.000750187546886722"
## [290,] "constitutional" "0.000750187546886722"
## [291,] "violations" "0.000750187546886722"
## [292,] "jocelyn" "0.000750187546886722"
## [293,] "samuels" "0.000750187546886722"
## [294,] "acting" "0.000750187546886722"
## [295,] "assistant" "0.000750187546886722"
## [296,] "attorney" "0.000750187546886722"
## [297,] "general" "0.000750187546886722"
## [298,] "civil" "0.000750187546886722"
## [299,] "rights" "0.000750187546886722"
## [300,] "sent" "0.000750187546886722"
## [301,] "report" "0.00300075018754689"
## [302,] "january" "0.00150037509377344"
## [303,] "toxic" "0.000750187546886722"
## [304,] "highly" "0.000750187546886722"
## [305,] "sexualized" "0.000750187546886722"
## [306,] "environment" "0.000750187546886722"
## [307,] "she" "0.00675168792198049"
## [308,] "interview" "0.00150037509377344"
## [309,] "met" "0.000750187546886722"
## [310,] "deliberate" "0.000750187546886722"
## [311,] "indifference" "0.000750187546886722"
## [312,] "part" "0.000750187546886722"
## [313,] "officials" "0.000750187546886722"
## [314,] "management" "0.000750187546886722"
## [315,] "aware" "0.000750187546886722"
## [316,] "failed" "0.000750187546886722"
## [317,] "curb" "0.000750187546886722"
## [318,] "was" "0.00675168792198049"
## [319,] "built" "0.000750187546886722"
## [320,] "named" "0.000750187546886722"
## [321,] "after" "0.00375093773443361"
## [322,] "woman" "0.000750187546886722"
## [323,] "called" "0.000750187546886722"
## [324,] "angel" "0.000750187546886722"
## [325,] "stockades" "0.000750187546886722"
## [326,] "her" "0.00225056264066016"
## [327,] "work" "0.000750187546886722"
## [328,] "trying" "0.000750187546886722"
## [329,] "improve" "0.00150037509377344"
## [330,] "live" "0.00150037509377344"
## [331,] "including" "0.000750187546886722"
## [332,] "some" "0.00150037509377344"
## [333,] "death" "0.000750187546886722"
## [334,] "row" "0.000750187546886722"
## [335,] "although" "0.000750187546886722"
## [336,] "original" "0.000750187546886722"
## [337,] "building" "0.000750187546886722"
## [338,] "designed" "0.000750187546886722"
## [339,] "abysmal" "0.000750187546886722"
## [340,] "levels" "0.000750187546886722"
## [341,] "abundant" "0.000750187546886722"
## [342,] "blind" "0.000750187546886722"
## [343,] "spots" "0.000750187546886722"
## [344,] "three" "0.000750187546886722"
## [345,] "cameras" "0.000750187546886722"
## [346,] "created" "0.000750187546886722"
## [347,] "guards" "0.00150037509377344"
## [348,] "rampant" "0.00150037509377344"
## [349,] "male" "0.000750187546886722"
## [350,] "routinely" "0.000750187546886722"
## [351,] "watched" "0.000750187546886722"
## [352,] "showering" "0.000750187546886722"
## [353,] "once" "0.000750187546886722"
## [354,] "helped" "0.000750187546886722"
## [355,] "organize" "0.000750187546886722"
## [356,] "strip" "0.000750187546886722"
## [357,] "show" "0.000750187546886722"
## [358,] "exchanged" "0.000750187546886722"
## [359,] "both" "0.000750187546886722"
## [360,] "banned" "0.000750187546886722"
## [361,] "items" "0.00150037509377344"
## [362,] "drugs" "0.000750187546886722"
## [363,] "basic" "0.000750187546886722"
## [364,] "clean" "0.000750187546886722"
## [365,] "uniforms" "0.000750187546886722"
## [366,] "six" "0.00300075018754689"
## [367,] "convicted" "0.000750187546886722"
## [368,] "sexual" "0.00225056264066016"
## [369,] "investigating" "0.000750187546886722"
## [370,] "scrutinizing" "0.000750187546886722"
## [371,] "medical" "0.00150037509377344"
## [372,] "mental" "0.00150037509377344"
## [373,] "health" "0.00150037509377344"
## [374,] "care" "0.000750187546886722"
## [375,] "culture" "0.000750187546886722"
## [376,] "deprivation" "0.000750187546886722"
## [377,] "abuse" "0.00225056264066016"
## [378,] "institutions" "0.000750187546886722"
## [379,] "across" "0.000750187546886722"
## [380,] "charlotte" "0.000750187546886722"
## [381,] "morrison" "0.000750187546886722"
## [382,] "senior" "0.000750187546886722"
## [383,] "lawyer" "0.000750187546886722"
## [384,] "equal" "0.00150037509377344"
## [385,] "initiative" "0.00150037509377344"
## [386,] "legal" "0.000750187546886722"
## [387,] "organization" "0.00150037509377344"
## [388,] "represents" "0.000750187546886722"
## [389,] "indigent" "0.000750187546886722"
## [390,] "defendants" "0.000750187546886722"
## [391,] "asked" "0.00150037509377344"
## [392,] "step" "0.000750187546886722"
## [393,] "own" "0.000750187546886722"
## [394,] "showed" "0.000750187546886722"
## [395,] "beginning" "0.000750187546886722"
## [396,] "began" "0.000750187546886722"
## [397,] "april" "0.000750187546886722"
## [398,] "months" "0.00225056264066016"
## [399,] "came" "0.00150037509377344"
## [400,] "out" "0.000750187546886722"
## [401,] "may" "0.000750187546886722"
## [402,] "longtime" "0.000750187546886722"
## [403,] "warden" "0.000750187546886722"
## [404,] "other" "0.00150037509377344"
## [405,] "top" "0.00150037509377344"
## [406,] "replaced" "0.000750187546886722"
## [407,] "also" "0.000750187546886722"
## [408,] "national" "0.000750187546886722"
## [409,] "institute" "0.000750187546886722"
## [410,] "review" "0.000750187546886722"
## [411,] "practices" "0.000750187546886722"
## [412,] "policies" "0.00150037509377344"
## [413,] "using" "0.000750187546886722"
## [414,] "those" "0.000750187546886722"
## [415,] "findings" "0.000750187546886722"
## [416,] "wideranging" "0.000750187546886722"
## [417,] "included" "0.000750187546886722"
## [418,] "recruiting" "0.000750187546886722"
## [419,] "pressing" "0.000750187546886722"
## [420,] "several" "0.000750187546886722"
## [421,] "procedures" "0.000750187546886722"
## [422,] "them" "0.000750187546886722"
## [423,] "investigate" "0.000750187546886722"
## [424,] "track" "0.000750187546886722"
## [425,] "assaults" "0.000750187546886722"
## [426,] "because" "0.000750187546886722"
## [427,] "i" "0.000750187546886722"
## [428,] "wanted" "0.000750187546886722"
## [429,] "abundance" "0.000750187546886722"
## [430,] "caution" "0.000750187546886722"
## [431,] "transparent" "0.000750187546886722"
## [432,] "recently" "0.00150037509377344"
## [433,] "released" "0.00150037509377344"
## [434,] "life" "0.00150037509377344"
## [435,] "improved" "0.000750187546886722"
## [436,] "marginally" "0.000750187546886722"
## [437,] "monica" "0.000750187546886722"
## [438,] "washington" "0.00150037509377344"
## [439,] "serving" "0.000750187546886722"
## [440,] "armed" "0.000750187546886722"
## [441,] "robbery" "0.000750187546886722"
## [442,] "guard" "0.00150037509377344"
## [443,] "gave" "0.000750187546886722"
## [444,] "birth" "0.000750187546886722"
## [445,] "daughter" "0.000750187546886722"
## [446,] "living" "0.000750187546886722"
## [447,] "relatives" "0.000750187546886722"
## [448,] "near" "0.00150037509377344"
## [449,] "montgomery" "0.000750187546886722"
## [450,] "rodney" "0.000750187546886722"
## [451,] "arbuthnot" "0.000750187546886722"
## [452,] "served" "0.00150037509377344"
## [453,] "jail" "0.000750187546886722"
## [454,] "custodial" "0.000750187546886722"
## [455,] "misconduct" "0.000750187546886722"
## [456,] "moved" "0.000750187546886722"
## [457,] "texas" "0.000750187546886722"
## [458,] "courts" "0.000750187546886722"
## [459,] "tracked" "0.000750187546886722"
## [460,] "him" "0.00150037509377344"
## [461,] "down" "0.000750187546886722"
## [462,] "family" "0.000750187546886722"
## [463,] "finally" "0.000750187546886722"
## [464,] "getting" "0.000750187546886722"
## [465,] "month" "0.000750187546886722"
## [466,] "child" "0.00150037509377344"
## [467,] "support" "0.00150037509377344"
## [468,] "telephone" "0.000750187546886722"
## [469,] "ms" "0.00150037509377344"
## [470,] "fearful" "0.000750187546886722"
## [471,] "remained" "0.000750187546886722"
## [472,] "right" "0.000750187546886722"
## [473,] "me" "0.000750187546886722"
## [474,] "personally" "0.000750187546886722"
## [475,] "same" "0.000750187546886722"
## [476,] "far" "0.000750187546886722"
## [477,] "act" "0.000750187546886722"
## [478,] "congress" "0.000750187546886722"
## [479,] "get" "0.00300075018754689"
## [480,] "things" "0.000750187546886722"
## [481,] "you" "0.000750187546886722"
## [482,] "inhumane" "0.000750187546886722"
## [483,] "period" "0.000750187546886722"
## [484,] "marsha" "0.000750187546886722"
## [485,] "colby" "0.00150037509377344"
## [486,] "mother" "0.000750187546886722"
## [487,] "sentence" "0.000750187546886722"
## [488,] "without" "0.000750187546886722"
## [489,] "parole" "0.000750187546886722"
## [490,] "murder" "0.000750187546886722"
## [491,] "conviction" "0.00150037509377344"
## [492,] "premature" "0.000750187546886722"
## [493,] "son" "0.000750187546886722"
## [494,] "stillborn" "0.000750187546886722"
## [495,] "buried" "0.000750187546886722"
## [496,] "marked" "0.000750187546886722"
## [497,] "grave" "0.000750187546886722"
## [498,] "home" "0.000750187546886722"
## [499,] "examiner" "0.000750187546886722"
## [500,] "drowned" "0.000750187546886722"
## [501,] "bathtub" "0.000750187546886722"
## [502,] "overturned" "0.000750187546886722"
## [503,] "court" "0.000750187546886722"
## [504,] "agreed" "0.000750187546886722"
## [505,] "autopsy" "0.000750187546886722"
## [506,] "botched" "0.000750187546886722"
## [507,] "december" "0.000750187546886722"
## [508,] "contact" "0.000750187546886722"
## [509,] "split" "0.000750187546886722"
## [510,] "attention" "0.000750187546886722"
## [511,] "good" "0.000750187546886722"
## [512,] "thing" "0.000750187546886722"
## [513,] "important" "0.000750187546886722"
## [514,] "commodity" "0.000750187546886722"
## [515,] "secure" "0.000750187546886722"
## [516,] "contraband" "0.000750187546886722"
## [517,] "can" "0.00150037509377344"
## [518,] "then" "0.000750187546886722"
## [519,] "sell" "0.000750187546886722"
## [520,] "food" "0.000750187546886722"
## [521,] "do" "0.000750187546886722"
## [522,] "favors" "0.000750187546886722"
## [523,] "makeup" "0.000750187546886722"
## [524,] "cologne" "0.000750187546886722"
## [525,] "anything" "0.00150037509377344"
## [526,] "thats" "0.00150037509377344"
## [527,] "stuff" "0.000750187546886722"
## [528,] "resellable" "0.000750187546886722"
## [529,] "make" "0.000750187546886722"
## [530,] "their" "0.000750187546886722"
## [531,] "believe" "0.000750187546886722"
## [532,] "will" "0.000750187546886722"
## [533,] "take" "0.000750187546886722"
## [534,] "larger" "0.000750187546886722"
## [535,] "overhaul" "0.000750187546886722"
## [536,] "primitive" "0.000750187546886722"
## [537,] "backward" "0.000750187546886722"
## [538,] "larry" "0.000750187546886722"
## [539,] "f" "0.000750187546886722"
## [540,] "wood" "0.000750187546886722"
## [541,] "clinical" "0.000750187546886722"
## [542,] "psychologist" "0.000750187546886722"
## [543,] "hired" "0.000750187546886722"
## [544,] "quit" "0.000750187546886722"
## [545,] "two" "0.000750187546886722"
## [546,] "appalled" "0.000750187546886722"
## [547,] "what" "0.00150037509377344"
## [548,] "administrations" "0.000750187546886722"
## [549,] "lack" "0.000750187546886722"
## [550,] "services" "0.000750187546886722"
## [551,] "ive" "0.00150037509377344"
## [552,] "worked" "0.000750187546886722"
## [553,] "never" "0.000750187546886722"
## [554,] "seen" "0.000750187546886722"
## [555,] "back" "0.000750187546886722"
## [556,] "look" "0.000750187546886722"
## [557,] "fresh" "0.000750187546886722"
## [558,] "eyes" "0.000750187546886722"
## [559,] "people" "0.000750187546886722"
## [560,] "perspective" "0.000750187546886722"
## [561,] "see" "0.000750187546886722"
Probability of specific words appearing in the document and joint probability of the words appearing adjacent to each other
# Probability of two specific words in a document, and of the two words
# appearing next to each other.
#
# The document is cleaned as follows: strip punctuation, lower-case, split on
# whitespace, then discard every token that is not made purely of letters.
# Uses base R only: nothing is installed or attached as a side effect.
#
# Args:
#   docpath: a single string, the file path or URL of the document.
#   word1, word2: the two words to look up (first element of each is used;
#     matching is case-insensitive because the document is lower-cased).
# Returns:
#   A named numeric vector:
#     Prob_word_1     occurrences of word1 / total number of words
#     Prob_word_2     occurrences of word2 / total number of words
#     Joint_Prob_adj  2 * (number of adjacent word1/word2 pairs, either
#                     order, non-overlapping) / total number of words, i.e.
#                     the share of words that sit in such a pair
#   All three values are 0 when the document contains no words.
Probability_words_joint_fn <- function(docpath, word1, word2){
  stopifnot(is.character(docpath), length(docpath) == 1,
            is.character(word1), length(word1) >= 1,
            is.character(word2), length(word2) >= 1)

  doc_lines <- readLines(docpath, warn = FALSE, encoding = "UTF-8")
  # \p{P} is the Unicode "punctuation" category (covers curly quotes, dashes).
  no_punct <- gsub("\\p{P}", "", doc_lines, perl = TRUE)
  # Split on any whitespace run so tab- or CR-separated words are not lost.
  tokens <- unlist(strsplit(tolower(no_punct), "\\s+"), use.names = FALSE)
  words_only <- tokens[grepl("^[a-zA-Z]+$", tokens, perl = TRUE)]
  n_words <- length(words_only)

  if (n_words == 0) {
    return(c(Prob_word_1 = 0, Prob_word_2 = 0, Joint_Prob_adj = 0))
  }

  # The corpus is lower-cased, so the queries must be too or they never match.
  w1 <- tolower(word1[1])
  w2 <- tolower(word2[1])

  Pr_var_1 <- sum(words_only == w1) / n_words
  Pr_var_2 <- sum(words_only == w2) / n_words

  # is_pair[i] is TRUE when words i and i + 1 are word1 and word2 in either
  # order. Requiring one of each stops "the the" from being counted as a
  # (for, the) pair.
  left <- words_only[-n_words]
  right <- words_only[-1]
  is_pair <- (left == w1 & right == w2) | (left == w2 & right == w1)

  # Count non-overlapping pairs left to right: after a hit, skip the word that
  # was just consumed. This counts every pair in a run such as
  # "for the for the" while keeping 2 * adj_cnt <= n_words.
  adj_cnt <- 0
  i <- 1
  while (i <= length(is_pair)) {
    if (is_pair[i]) {
      adj_cnt <- adj_cnt + 1
      i <- i + 2
    } else {
      i <- i + 1
    }
  }
  Prob_adj <- adj_cnt * 2 / n_words

  c(Prob_word_1 = Pr_var_1, Prob_word_2 = Pr_var_2, Joint_Prob_adj = Prob_adj)
}
# Example run: probabilities of "for" and "the" in the sample text file at this URL,
# plus the adjacency value the function reports for the two words.
docpath <- c("https://raw.githubusercontent.com/nobieyi00/CUNY_MSDA_R/master/assign6.sample.txt")
word1<- c("for")
word2 <- c("the")
Probability_words_joint_fn(docpath, word1, word2)
## Prob_word_1 Prob_word_2 Joint_Prob_adj
## 0.02325581 0.05701425 0.00600150