R Markdown

# Load the data set from its CSV file into a data frame.
# NOTE(review): absolute, machine-specific path; the document only knits on a
# machine where this exact file exists.
df <- read.csv(
  file = "C:\\Users\\Charls\\Documents\\CunyMSDS\\Data622\\Assignments\\HW1\\Qn1\\data.csv"
)

# Display the whole data frame as a formatted table.
# (kable is presumably knitr::kable, attached elsewhere; confirm.)
kable(x = df)
age.group networth status credit_rating classprospect
youth high employed fair no
youth high employed excellent no
middle high employed fair yes
senior medium employed fair yes
senior low unemployed fair yes
senior low unemployed excellent no
middle low unemployed excellent yes
youth medium employed fair no
youth low unemployed fair yes
senior medium unemployed fair yes
youth medium unemployed excellent yes
middle medium employed excellent yes
middle high unemployed fair yes
senior medium employed excellent no

Calculate the prior probabilities

P(Yes) P(No)

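The prior is calculated without looking at any feature values: it is determined solely by the number of rows carrying each class label divided by the total number of rows in the dataset (9 "yes" and 5 "no" out of 14).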

# Prior class probabilities P(yes) and P(no).
# Derived from the label counts in `df` instead of being typed in by hand, so
# they stay correct if the data changes. For the 14-row table loaded above
# (9 "yes", 5 "no") this gives exactly 9/14 and 5/14.
p_yes <- sum(df[["classprospect"]] == "yes") / nrow(df)
p_no <- sum(df[["classprospect"]] == "no") / nrow(df)

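cond_prob(df, 'col:val', 'label_col:label_val') estimates P(col = val | label_col = label_val) as the number of rows matching both conditions divided by the number of rows matching the label condition.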

#' Estimate the conditional probability P(A | B) from row counts
#'
#' Counts the rows of `df` where both the event `A` and the conditioning event
#' `B` hold, and divides by the number of rows where `B` holds. The count ratio
#' is also printed as "joint/given" (e.g. "2/9").
#'
#' @param df A data frame.
#' @param A Event of interest as a single string "column:value".
#' @param B Conditioning event as a single string "column:value".
#' @return A numeric scalar, the fraction of rows matching `B` that also match
#'   `A`. `NaN` when no row matches `B` (0/0).
cond_prob <- function(df, A, B) {
  # Split "column:value" at the FIRST ':' only, so that a value which itself
  # contains a colon (e.g. "10:30") is kept intact.
  parse_spec <- function(spec) {
    stopifnot(
      is.character(spec),
      length(spec) == 1L,
      grepl(":", spec, fixed = TRUE)
    )
    pos <- as.integer(regexpr(":", spec, fixed = TRUE))
    c(substr(spec, 1L, pos - 1L), substr(spec, pos + 1L, nchar(spec)))
  }

  event <- parse_spec(A)
  given <- parse_spec(B)
  stopifnot(
    is.data.frame(df),
    event[1] %in% names(df),
    given[1] %in% names(df)
  )

  # `%in%` never returns NA, so rows with a missing value simply do not match.
  # With `==`, an NA in the mask would make data-frame subsetting return extra
  # all-NA rows and inflate both counts.
  is_event <- df[[event[1]]] %in% event[2]
  is_given <- df[[given[1]]] %in% given[2]

  n_joint <- sum(is_event & is_given)
  n_given <- sum(is_given)

  print(paste(n_joint, n_given, sep = "/"))
  n_joint / n_given
}
# Conditional probabilities of each age.group value given the class label.
# Each cond_prob() call prints its count ratio (the "## [1]" line that follows
# it); the print() call then reports the same value rounded to 2 decimals.

# age.group given classprospect = yes
P_age_group_youth_prospect_yes <- cond_prob(df, 'age.group:youth', 'classprospect:yes')
## [1] "2/9"
print(paste("P(age-group=youth|prospect=yes) = ", toString(round(P_age_group_youth_prospect_yes, 2))))
## [1] "P(age-group=youth|prospect=yes) =  0.22"
P_age_group_middle_prospect_yes <- cond_prob(df, 'age.group:middle', 'classprospect:yes')
## [1] "4/9"
print(paste("P(age-group=middle|prospect=yes) = ", toString(round(P_age_group_middle_prospect_yes, 2))))
## [1] "P(age-group=middle|prospect=yes) =  0.44"
P_age_group_senior_prospect_yes <- cond_prob(df, 'age.group:senior', 'classprospect:yes')
## [1] "3/9"
print(paste("P(age-group=senior|prospect=yes) = ", toString(round(P_age_group_senior_prospect_yes, 2))))
## [1] "P(age-group=senior|prospect=yes) =  0.33"

# age.group given classprospect = no
P_age_group_youth_prospect_no<- cond_prob(df, 'age.group:youth', 'classprospect:no')
## [1] "3/5"
print(paste("P(age-group=youth|prospect=no) = ", toString(round(P_age_group_youth_prospect_no, 2))))
## [1] "P(age-group=youth|prospect=no) =  0.6"
# NOTE(review): this estimate is exactly 0 (no "middle" rows with label "no").
# If these values are later multiplied together naive-Bayes style, the whole
# product for prospect=no becomes 0 whenever age.group=middle; consider
# smoothing. Confirm against the downstream use.
P_age_group_middle_prospect_no <- cond_prob(df, 'age.group:middle', 'classprospect:no')
## [1] "0/5"
print(paste("P(age-group=middle|prospect=no) = ", toString(round(P_age_group_middle_prospect_no, 2))))
## [1] "P(age-group=middle|prospect=no) =  0"
P_age_group_senior_prospect_no <- cond_prob(df, 'age.group:senior', 'classprospect:no')
## [1] "2/5"
print(paste("P(age-group=senior|prospect=no) = ", toString(round(P_age_group_senior_prospect_no, 2))))
## [1] "P(age-group=senior|prospect=no) =  0.4"
# Conditional probabilities of each networth value given the class label.
# Each cond_prob() call prints its count ratio (the "## [1]" line that follows
# it); the print() call then reports the same value rounded to 2 decimals.

# networth given classprospect = yes
P_networth_high_prospect_yes <- cond_prob(df, 'networth:high', 'classprospect:yes')
## [1] "2/9"
print(paste("P(networth=high|prospect=yes) = ", toString(round(P_networth_high_prospect_yes, 2))))
## [1] "P(networth=high|prospect=yes) =  0.22"
P_networth_low_prospect_yes <-  cond_prob(df, 'networth:low', 'classprospect:yes')
## [1] "3/9"
print(paste("P(networth=low|prospect=yes) = ", toString(round(P_networth_low_prospect_yes, 2))))
## [1] "P(networth=low|prospect=yes) =  0.33"
P_networth_medium_prospect_yes <- cond_prob(df, 'networth:medium', 'classprospect:yes')
## [1] "4/9"
print(paste("P(networth=medium|prospect=yes) = ", toString(round(P_networth_medium_prospect_yes, 2))))
## [1] "P(networth=medium|prospect=yes) =  0.44"

# networth given classprospect = no
P_networth_high_prospect_no <-  cond_prob(df, 'networth:high', 'classprospect:no')
## [1] "2/5"
print(paste("P(networth=high|prospect=no) = ", toString(round(P_networth_high_prospect_no, 2))))
## [1] "P(networth=high|prospect=no) =  0.4"
P_networth_low_prospect_no <-  cond_prob(df, 'networth:low', 'classprospect:no')
## [1] "1/5"
print(paste("P(networth=low|prospect=no) = ", toString(round(P_networth_low_prospect_no, 2))))
## [1] "P(networth=low|prospect=no) =  0.2"
P_networth_medium_prospect_no <- cond_prob(df, 'networth:medium', 'classprospect:no')
## [1] "2/5"
print(paste("P(networth=medium|prospect=no) = ", toString(round(P_networth_medium_prospect_no, 2))))
## [1] "P(networth=medium|prospect=no) =  0.4"
# Conditional probabilities of each employment status given the class label.
# Each cond_prob() call prints its count ratio (the "## [1]" line that follows
# it); the print() call then reports the same value rounded to 2 decimals.

# status = employed, for each class label
P_status_employed_prospect_yes <- cond_prob(df, 'status:employed', 'classprospect:yes')
## [1] "3/9"
print(paste("P(status=employed|prospect=yes) = ", toString(round(P_status_employed_prospect_yes, 2))))
## [1] "P(status=employed|prospect=yes) =  0.33"
P_status_employed_prospect_no <- cond_prob(df, 'status:employed', 'classprospect:no')
## [1] "4/5"
print(paste("P(status=employed|prospect=no) = ", toString(round(P_status_employed_prospect_no, 2))))
## [1] "P(status=employed|prospect=no) =  0.8"

# status = unemployed, for each class label
P_status_unemployed_prospect_yes <- cond_prob(df, 'status:unemployed', 'classprospect:yes')
## [1] "6/9"
print(paste("P(status=unemployed|prospect=yes) = ", toString(round(P_status_unemployed_prospect_yes, 2))))
## [1] "P(status=unemployed|prospect=yes) =  0.67"
P_status_unemployed_prospect_no <- cond_prob(df, 'status:unemployed', 'classprospect:no')
## [1] "1/5"
print(paste("P(status=unemployed|prospect=no) = ", toString(round(P_status_unemployed_prospect_no, 2))))
## [1] "P(status=unemployed|prospect=no) =  0.2"
# Conditional probabilities of each credit_rating value given the class label.
# Each cond_prob() call prints its count ratio (the "## [1]" line that follows
# it); the print() call then reports the same value rounded to 2 decimals.

# credit_rating = fair, for each class label
P_credit_fair_prospect_yes <- cond_prob(df, 'credit_rating:fair', 'classprospect:yes')
## [1] "6/9"
print(paste("P(credit=fair|prospect=yes) = ", toString(round(P_credit_fair_prospect_yes, 2))))
## [1] "P(credit=fair|prospect=yes) =  0.67"
P_credit_fair_prospect_no  <- cond_prob(df, 'credit_rating:fair', 'classprospect:no')
## [1] "2/5"
print(paste("P(credit=fair|prospect=no) = ", toString(round(P_credit_fair_prospect_no, 2))))
## [1] "P(credit=fair|prospect=no) =  0.4"

# credit_rating = excellent, for each class label
P_credit_excellent_prospect_yes  <- cond_prob(df, 'credit_rating:excellent', 'classprospect:yes')
## [1] "3/9"
print(paste("P(credit=excellent|prospect=yes) = ", toString(round(P_credit_excellent_prospect_yes, 2))))
## [1] "P(credit=excellent|prospect=yes) =  0.33"
P_credit_excellent_prospect_no   <- cond_prob(df, 'credit_rating:excellent', 'classprospect:no') 
## [1] "3/5"
print(paste("P(credit=excellent|prospect=no) = ", toString(round(P_credit_excellent_prospect_no, 2))))
## [1] "P(credit=excellent|prospect=no) =  0.6"