Question 1

  • I chose Y1 and X3
rm(list=ls())
library(tidyverse)
library(knitr)
library(xlsx)

## import df; drop the stray first row, then pull out the two chosen columns
df <- read.csv('final.csv', header = TRUE, stringsAsFactors = FALSE)
df <- df[-1, ]
y <- as.numeric(df$X)    # Y1 (read in under the column name 'X')
x <- as.numeric(df$X.6)  # X3 (read in under the column name 'X.6')

new_df <- data.frame(x, y)
kable(new_df)
x y
9.5 20.3
3.7 19.1
11.7 19.3
7.4 20.9
5.3 22.0
7.4 23.5
7.4 13.8
8.6 18.8
9.1 20.9
11.4 18.6
8.4 22.3
7.3 17.6
11.3 20.8
4.4 28.7
9.3 15.2
10.9 20.9
10.9 18.4
7.7 10.3
7.7 26.3
11.5 28.1

A. P(A = X>x | B = Y>y)

  • To find the conditional probability, apply the definition

\[ P(A \mid B) = \frac{P(A \cap B)}{P(B)} \]

  • The intersection of the two events can be found programmatically by filtering for both conditions at once.

  • \[ \frac{P(X>x \cap Y>y)}{P(Y>y)} = \frac{4/20}{15/20} = \frac{4}{15} \approx 0.2667 \]

## cutoffs: 3rd quartile of x, 1st quartile of y
x_cutoff <- as.numeric(summary(x)[5])

y_cutoff <- as.numeric(summary(y)[2])

## total number of observations
total_combinations <- nrow(new_df)
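The numeric indices into summary() are easy to misread, so for reference quantile() states the same cutoffs explicitly (an equivalent sketch; summary() rounds to 4 significant digits, which happens not to matter for this data):

## equivalent, more explicit cutoffs
x_cutoff <- as.numeric(quantile(x, 0.75))  # 3rd quartile of x
y_cutoff <- as.numeric(quantile(y, 0.25))  # 1st quartile of y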


## Get conditional probability

## filter df for Y > y_cutoff
prob_y <- new_df %>% 
  filter(y > y_cutoff)

## P(Y>y) = prob_y1 = .75
prob_y1 <- nrow(prob_y)/total_combinations

## get P(X>x & Y>y)
x_and_y <- new_df %>%
  filter(x > x_cutoff & y > y_cutoff)
prob_x_and_y <- nrow(x_and_y)/total_combinations

## get the conditional probability
x_cond_y <- prob_x_and_y/prob_y1
x_cond_y
## [1] 0.2666667
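The same conditional probability falls out of one line of base R, since the mean of a logical vector is a proportion (a minimal sketch using the vectors defined above):

## P(X>x | Y>y) as a proportion of the Y>y subset
mean(x[y > y_cutoff] > x_cutoff)
## [1] 0.2666667
## equivalently: sum(x > x_cutoff & y > y_cutoff) / sum(y > y_cutoff)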

Extra: Are they independent?

If X and Y were independent, we would have P(X>x) = P(X>x | Y>y); the code below checks whether instead

\(P(X>x) \neq P(X>x \mid Y>y)\)

prob_x <- new_df %>% 
  filter(x > x_cutoff)
prob_x1 <- nrow(prob_x)/total_combinations
## exact comparison of P(X>x) = 0.2 against P(X>x | Y>y) = 0.2667
prob_x1 == x_cond_y
## [1] FALSE

B. P(A = X>x & B = Y>y)

  • The probability of two events both occurring, assuming independence, is the product \[ P(X>x) \cdot P(Y>y) \]

\[ 0.75 \times 0.2 = 0.15 \]

## get P(X>x) and P(Y>y), then multiply
x_prob2 <- new_df %>%
  filter(x > x_cutoff)
prob_y2 <- new_df %>%
  filter(y > y_cutoff)
new_prob <- (nrow(prob_y2)/total_combinations) * (nrow(x_prob2)/total_combinations)

new_prob
## [1] 0.15

C. P(A = X<x & B = Y>y)

  • At first glance this should be the complement of problem A, i.e. 1 - .2667 = .7333.
  • It turns out it isn't, because the complement also includes P(X=x | Y>y) = .06667.
  • The answer is .666667, and the three conditional probabilities sum to 1 in the code below; the arithmetic is worked out next.
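Concretely, conditioning on Y>y leaves 15 of the 20 points: 4 have X>x, 1 has X=x, and 10 have X<x, so

\[ P(X<x \mid Y>y) = 1 - \frac{4}{15} - \frac{1}{15} = \frac{10}{15} \approx 0.6667 \]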
x_and_y <- new_df %>%
  filter(y > y_cutoff & x < x_cutoff)
prob_x_and_y2 <- nrow(x_and_y)/total_combinations
## get the conditional probability P(X<x | Y>y)
prob_x_and_y2 <- prob_x_and_y2/prob_y1

print(prob_x_and_y2)
## [1] 0.6666667
##
x_and_y <- new_df %>%
  filter(y > y_cutoff & x == x_cutoff)
prob_xequal_and_y2 <- nrow(x_and_y)/total_combinations
## get the conditional probability P(X=x | Y>y)
prob_xequal_and_y2 <- prob_xequal_and_y2/prob_y1


## prove the three conditional probabilities sum to 1
1 == round(prob_x_and_y2 + x_cond_y + prob_xequal_and_y2)
## [1] TRUE
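Rounding to the nearest integer is a coarse check; all.equal() compares within a numeric tolerance instead (a minimal sketch using the objects above):

## tolerance-based check that the three conditional probabilities sum to 1
isTRUE(all.equal(prob_x_and_y2 + x_cond_y + prob_xequal_and_y2, 1))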

Table of counts

  • Using the table we can confirm our answer to conditional probability question A.
    • We can't confirm the answer to C, because the table below uses X<=x, which includes X=x, while question C uses the strict X<x.
    • To check question A:
      • the total count with Y>y is 15,
      • of which 4 also have X>x,
      • so P(X>x | Y>y) = 4/15 = .266667, matching the earlier result.
  • Base R can build the same table in one call, as sketched below; the step-by-step construction follows.
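A minimal sketch of the one-call version (the FALSE/TRUE levels stand in for the X<=x / X>x and Y<=y / Y>y labels used below, and addmargins() adds the Sum row and column):

## same 2x2 table of counts, with margins, via base R
addmargins(table(Y_above = y > y_cutoff, X_above = x > x_cutoff))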
## cell counts for the 2x2 table of X vs Y at the cutoffs
first  <- new_df %>% filter(x <= x_cutoff, y <= y_cutoff) %>% nrow()  # X<=x, Y<=y
second <- new_df %>% filter(x <= x_cutoff, y >  y_cutoff) %>% nrow()  # X<=x, Y>y
third  <- new_df %>% filter(x >  x_cutoff, y <= y_cutoff) %>% nrow()  # X>x,  Y<=y
fourth <- new_df %>% filter(x >  x_cutoff, y >  y_cutoff) %>% nrow()  # X>x,  Y>y

## column totals (X<=x, X>x) and row totals, computed rather than hardcoded
total_col1 <- first + second
total_col2 <- third + fourth
row_totals <- c(first + third, second + fourth, total_col1 + total_col2)

new_table <- as.data.frame(cbind(c(first, second, total_col1),
                                 c(third, fourth, total_col2),
                                 row_totals))

rownames(new_table) <- c("Y<=y", "Y>y", "Total")
colnames(new_table) <- c("X<=x", "X>x", "Total")
kable(new_table)

      X<=x X>x Total
Y<=y     5   0     5
Y>y     11   4    15
Total   16   4    20

Does splitting the training data in this fashion make them independent?

  • Test for independence.
  • If two variables are independent, then \[P(A|B)=P(A) \]

  • To find P(A|B) we need \(\frac{P(A\cap B)}{P(B)}\).
    • As computed below, P(A|B) = .666667 while P(A) = .65. Considering how close these probabilities are, it is tough to determine from this alone whether they are independent.
  • Another check is whether P(A&B) = P(A)*P(B).
    • If the probability of the intersection equals the product of the individual probabilities, then the variables are independent of each other.
      • Here it is .5 versus .4875.
    • The probabilities differ, but once again the gap could be due to chance, so let's run a chi-square test. The arithmetic is worked out below, followed by the code.
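Plugging in the counts (13 of the 20 points satisfy A = X above its 1st quartile, 15 satisfy B = Y above its 1st quartile, and 10 satisfy both):

\[ P(A \mid B) = \frac{10/20}{15/20} = \frac{10}{15} \approx 0.6667 \neq P(A) = \frac{13}{20} = 0.65 \]

\[ P(A \cap B) = 0.5 \neq P(A) \cdot P(B) = 0.65 \times 0.75 = 0.4875 \]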
## cutoffs for this part: the 1st quartile of each variable
X1 <- as.numeric(summary(x)[2])
y1 <- as.numeric(summary(y)[2])

## filter df for A: X above its 1st quartile
A <- new_df %>%
  filter(x > X1)
prob_A <- nrow(A)/total_combinations

## B: Y above its 1st quartile (reuse the earlier filter, since y_cutoff equals y1)
B <- prob_y
prob_B <- nrow(B)/total_combinations

## filter df for A and B together, then get P(A&B)
A_and_B <- new_df %>%
  filter(x > X1 & y > y1)
prob_a_and_b <- nrow(A_and_B)/total_combinations

## get P(A|B)
ProbabilityA_given_B <- prob_a_and_b/prob_B

## if these variables are independent, then P(A|B) == P(A)
ProbabilityA_given_B == prob_A
## [1] FALSE
## likewise, independence requires P(A&B) == P(A)*P(B)
prob_a_and_b == prob_A*prob_B
## [1] FALSE
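Since == reports only exact equality, printing the quantities behind the two FALSE results makes the comparison concrete (the values follow from the code above):

## the values behind the two FALSE results
c(P_A = prob_A, P_A_given_B = ProbabilityA_given_B)  # 0.65 vs 0.6667
c(P_AB = prob_a_and_b, product = prob_A * prob_B)    # 0.50 vs 0.4875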

Check with a chi-square test

\[ \chi^2 = \sum \frac{(\text{observed} - \text{expected})^2}{\text{expected}} \]

  • Below I build a table of our observed frequencies; I then construct the expected frequencies from P(x)*P(y), as derived just below.
    • Result: chi-square = 0.267 on df = 3, p-value = 0.9662; therefore we fail to reject the null that our variables are independent.
  • So our Bayes' theorem approach would have us declare the variables dependent, while the chi-square test declares them independent.
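Under the null of independence, each cell's expected count is n times the product of the marginal quartile probabilities, for example:

\[ E_{\text{both above}} = 20 \cdot \tfrac{3}{4} \cdot \tfrac{3}{4} = 11.25, \qquad E_{\text{both below}} = 20 \cdot \tfrac{1}{4} \cdot \tfrac{1}{4} = 1.25 \]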
newer_df <- new_df %>% 
  mutate(new_col = case_when(
    x >= X1 & y >= y1 ~ "Both_above",
    x <  X1 & y <  y1 ~ "both_below",
    x >= X1 & y <  y1 ~ "Y_below_X_above",
    x <  X1 & y >= y1 ~ "X_below_Y_above"
  ))

actual <- table(newer_df$new_col)
## expected counts under independence: 20 * P(x cell) * P(y cell)
both_above      <- 20*(3/4)*(3/4)
Y_below_X_above <- 20*(1/4)*(3/4)
X_below_Y_above <- 20*(1/4)*(3/4)
both_below      <- 20*(1/4)*(1/4)
actual
## 
##      Both_above      both_below X_below_Y_above Y_below_X_above 
##              12               1               3               4
## expected counts, in the same order as table() sorts the labels above
expected <- c(both_above, both_below, X_below_Y_above, Y_below_X_above)

my_x_2 <- sum((actual - expected)^2/expected)
my_x_2
## [1] 0.2666667
## goodness-of-fit test of the observed counts against the expected proportions
chisq.test(actual, p = expected/sum(expected))
## Warning in chisq.test(actual, p = expected/sum(expected)): Chi-squared
## approximation may be incorrect
## 
##  Chi-squared test for given probabilities
## 
## data:  actual
## X-squared = 0.26667, df = 3, p-value = 0.9662
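
As a cross-check, one could also run the standard test of independence on the 2x2 table of counts itself, where the expected counts come from the observed margins rather than the theoretical quartile probabilities (a sketch; the `counts` matrix hand-copies the table of counts built earlier):

## Pearson test of independence on the observed 2x2 counts
counts <- matrix(c(5, 11, 0, 4), nrow = 2,
                 dimnames = list(Y = c("Y<=y", "Y>y"), X = c("X<=x", "X>x")))
chisq.test(counts)  # small expected counts will trigger the same warning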