Question 1

  • I chose Y1 and X3
rm(list=ls())
library(tidyverse)
library(knitr)
library(xlsx)

## import df; drop the stray first row, then pull out the two chosen columns
df <- read.csv('final.csv', header = TRUE, stringsAsFactors = FALSE)
df <- df[-1, ]
y <- as.numeric(df$X)    # Y1 (read in under the column name 'X')
x <- as.numeric(df$X.6)  # X3 (read in under the column name 'X.6')

new_df <- data.frame(x, y)
kable(new_df)
x y
9.5 20.3
3.7 19.1
11.7 19.3
7.4 20.9
5.3 22.0
7.4 23.5
7.4 13.8
8.6 18.8
9.1 20.9
11.4 18.6
8.4 22.3
7.3 17.6
11.3 20.8
4.4 28.7
9.3 15.2
10.9 20.9
10.9 18.4
7.7 10.3
7.7 26.3
11.5 28.1

A. P(A = X>x | B = Y>y)

  • To find the conditional probability, apply the definition

\[ P(A \mid B) = \frac{P(A \cap B)}{P(B)} \]

  • The intersection of the two events can be found programmatically by filtering for both conditions at once.

  • \[ \frac{P(X>x \cap Y>y)}{P(Y>y)} = \frac{4/20}{15/20} = \frac{4}{15} \approx 0.2667 \]

## cutoffs: 3rd quartile of x, 1st quartile of y
x_cutoff <- as.numeric(summary(x)[5])

y_cutoff <- as.numeric(summary(y)[2])

## total number of observations
total_combinations <- nrow(new_df)
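The numeric indices into summary() are easy to misread, so for reference quantile() states the same cutoffs explicitly (an equivalent sketch; summary() rounds to 4 significant digits, which happens not to matter for this data):

## equivalent, more explicit cutoffs
x_cutoff <- as.numeric(quantile(x, 0.75))  # 3rd quartile of x
y_cutoff <- as.numeric(quantile(y, 0.25))  # 1st quartile of y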


## Get conditional probability

## filter df for Y > y_cutoff
prob_y <- new_df %>% 
  filter(y > y_cutoff)

## P(Y>y) = prob_y1 = .75
prob_y1 <- nrow(prob_y)/total_combinations

## get P(X>x & Y>y)
x_and_y <- new_df %>%
  filter(x > x_cutoff & y > y_cutoff)
prob_x_and_y <- nrow(x_and_y)/total_combinations

## get the conditional probability
x_cond_y <- prob_x_and_y/prob_y1
x_cond_y
## [1] 0.2666667
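The same conditional probability falls out of one line of base R, since the mean of a logical vector is a proportion (a minimal sketch using the vectors defined above):

## P(X>x | Y>y) as a proportion of the Y>y subset
mean(x[y > y_cutoff] > x_cutoff)
## [1] 0.2666667
## equivalently: sum(x > x_cutoff & y > y_cutoff) / sum(y > y_cutoff)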

Extra: Are they independent?

If X and Y were independent, we would have P(X>x) = P(X>x | Y>y); the code below checks whether instead

\(P(X>x) \neq P(X>x \mid Y>y)\)

prob_x <- new_df %>% 
  filter(x > x_cutoff)
prob_x1 <- nrow(prob_x)/total_combinations
## exact comparison of P(X>x) = 0.2 against P(X>x | Y>y) = 0.2667
prob_x1 == x_cond_y
## [1] FALSE

B. P(A = X>x & B = Y>y)

  • The probability of two events both occurring, assuming independence, is the product \[ P(X>x) \cdot P(Y>y) \]

\[ 0.75 \times 0.2 = 0.15 \]

## get P(X>x) and P(Y>y), then multiply
x_prob2 <- new_df %>%
  filter(x > x_cutoff)
prob_y2 <- new_df %>%
  filter(y > y_cutoff)
new_prob <- (nrow(prob_y2)/total_combinations) * (nrow(x_prob2)/total_combinations)

new_prob
## [1] 0.15

C. P(A = X<x & B = Y>y)

  • At first glance this should be the complement of problem A, i.e. 1 - .2667 = .7333.
  • It turns out it isn't, because the complement also includes P(X=x | Y>y) = .06667.
  • The answer is .666667, and the three conditional probabilities sum to 1 in the code below; the arithmetic is worked out next.
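Concretely, conditioning on Y>y leaves 15 of the 20 points: 4 have X>x, 1 has X=x, and 10 have X<x, so

\[ P(X<x \mid Y>y) = 1 - \frac{4}{15} - \frac{1}{15} = \frac{10}{15} \approx 0.6667 \]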
x_and_y <- new_df %>%
  filter(y > y_cutoff & x < x_cutoff)
prob_x_and_y2 <- nrow(x_and_y)/total_combinations
## get the conditional probability P(X<x | Y>y)
prob_x_and_y2 <- prob_x_and_y2/prob_y1

print(prob_x_and_y2)
## [1] 0.6666667
##
x_and_y <- new_df %>%
  filter(y > y_cutoff & x == x_cutoff)
prob_xequal_and_y2 <- nrow(x_and_y)/total_combinations
## get the conditional probability P(X=x | Y>y)
prob_xequal_and_y2 <- prob_xequal_and_y2/prob_y1


## prove the three conditional probabilities sum to 1
1 == round(prob_x_and_y2 + x_cond_y + prob_xequal_and_y2)
## [1] TRUE
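Rounding to the nearest integer is a coarse check; all.equal() compares within a numeric tolerance instead (a minimal sketch using the objects above):

## tolerance-based check that the three conditional probabilities sum to 1
isTRUE(all.equal(prob_x_and_y2 + x_cond_y + prob_xequal_and_y2, 1))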

Table of counts

  • Using the table we can confirm our answer to conditional probability question A.
    • We can't confirm the answer to C, because the table below uses X<=x, which includes X=x, while question C uses the strict X<x.
    • To check question A:
      • the total count with Y>y is 15,
      • of which 4 also have X>x,
      • so P(X>x | Y>y) = 4/15 = .266667, matching the earlier result.
  • Base R can build the same table in one call, as sketched below; the step-by-step construction follows.
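A minimal sketch of the one-call version (the FALSE/TRUE levels stand in for the X<=x / X>x and Y<=y / Y>y labels used below, and addmargins() adds the Sum row and column):

## same 2x2 table of counts, with margins, via base R
addmargins(table(Y_above = y > y_cutoff, X_above = x > x_cutoff))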
## cell counts for the 2x2 table of X vs Y at the cutoffs
first  <- new_df %>% filter(x <= x_cutoff, y <= y_cutoff) %>% nrow()  # X<=x, Y<=y
second <- new_df %>% filter(x <= x_cutoff, y >  y_cutoff) %>% nrow()  # X<=x, Y>y
third  <- new_df %>% filter(x >  x_cutoff, y <= y_cutoff) %>% nrow()  # X>x,  Y<=y
fourth <- new_df %>% filter(x >  x_cutoff, y >  y_cutoff) %>% nrow()  # X>x,  Y>y

## column totals (X<=x, X>x) and row totals, computed rather than hardcoded
total_col1 <- first + second
total_col2 <- third + fourth
row_totals <- c(first + third, second + fourth, total_col1 + total_col2)

new_table <- as.data.frame(cbind(c(first, second, total_col1),
                                 c(third, fourth, total_col2),
                                 row_totals))

rownames(new_table) <- c("Y<=y", "Y>y", "Total")
colnames(new_table) <- c("X<=x", "X>x", "Total")
kable(new_table)

      X<=x X>x Total
Y<=y     5   0     5
Y>y     11   4    15
Total   16   4    20

Does splitting the training data in this fashion make them independent?

  • Test for independence.
  • If two variables are independent, then \[P(A|B)=P(A) \]

  • To find P(A|B) we need \(\frac{P(A\cap B)}{P(B)}\).
    • As computed below, P(A|B) = .666667 while P(A) = .65. Considering how close these probabilities are, it is tough to determine from this alone whether they are independent.
  • Another check is whether P(A&B) = P(A)*P(B).
    • If the probability of the intersection equals the product of the individual probabilities, then the variables are independent of each other.
      • Here it is .5 versus .4875.
    • The probabilities differ, but once again the gap could be due to chance, so let's run a chi-square test. The arithmetic is worked out below, followed by the code.
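Plugging in the counts (13 of the 20 points satisfy A = X above its 1st quartile, 15 satisfy B = Y above its 1st quartile, and 10 satisfy both):

\[ P(A \mid B) = \frac{10/20}{15/20} = \frac{10}{15} \approx 0.6667 \neq P(A) = \frac{13}{20} = 0.65 \]

\[ P(A \cap B) = 0.5 \neq P(A) \cdot P(B) = 0.65 \times 0.75 = 0.4875 \]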
## cutoffs for this part: the 1st quartile of each variable
X1 <- as.numeric(summary(x)[2])
y1 <- as.numeric(summary(y)[2])

## filter df for A: X above its 1st quartile
A <- new_df %>%
  filter(x > X1)
prob_A <- nrow(A)/total_combinations

## B: Y above its 1st quartile (reuse the earlier filter, since y_cutoff equals y1)
B <- prob_y
prob_B <- nrow(B)/total_combinations

## filter df for A and B together, then get P(A&B)
A_and_B <- new_df %>%
  filter(x > X1 & y > y1)
prob_a_and_b <- nrow(A_and_B)/total_combinations

## get P(A|B)
ProbabilityA_given_B <- prob_a_and_b/prob_B

## if these variables are independent, then P(A|B) == P(A)
ProbabilityA_given_B == prob_A
## [1] FALSE
## likewise, independence requires P(A&B) == P(A)*P(B)
prob_a_and_b == prob_A*prob_B
## [1] FALSE
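Since == reports only exact equality, printing the quantities behind the two FALSE results makes the comparison concrete (the values follow from the code above):

## the values behind the two FALSE results
c(P_A = prob_A, P_A_given_B = ProbabilityA_given_B)  # 0.65 vs 0.6667
c(P_AB = prob_a_and_b, product = prob_A * prob_B)    # 0.50 vs 0.4875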

Check with a chi-square test

\[ \chi^2 = \sum \frac{(\text{observed} - \text{expected})^2}{\text{expected}} \]

  • Below I build a table of our observed frequencies; I then construct the expected frequencies from P(x)*P(y), as derived just below.
    • Result: chi-square = 0.267 on df = 3, p-value = 0.9662; therefore we fail to reject the null that our variables are independent.
  • So our Bayes' theorem approach would have us declare the variables dependent, while the chi-square test declares them independent.
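Under the null of independence, each cell's expected count is n times the product of the marginal quartile probabilities, for example:

\[ E_{\text{both above}} = 20 \cdot \tfrac{3}{4} \cdot \tfrac{3}{4} = 11.25, \qquad E_{\text{both below}} = 20 \cdot \tfrac{1}{4} \cdot \tfrac{1}{4} = 1.25 \]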
newer_df <- new_df %>% 
  mutate(new_col = case_when(
    x >= X1 & y >= y1 ~ "Both_above",
    x <  X1 & y <  y1 ~ "both_below",
    x >= X1 & y <  y1 ~ "Y_below_X_above",
    x <  X1 & y >= y1 ~ "X_below_Y_above"
  ))

actual <- table(newer_df$new_col)
## expected counts under independence: 20 * P(x cell) * P(y cell)
both_above      <- 20*(3/4)*(3/4)
Y_below_X_above <- 20*(1/4)*(3/4)
X_below_Y_above <- 20*(1/4)*(3/4)
both_below      <- 20*(1/4)*(1/4)
actual
## 
##      Both_above      both_below X_below_Y_above Y_below_X_above 
##              12               1               3               4
## expected counts, in the same order as table() sorts the labels above
expected <- c(both_above, both_below, X_below_Y_above, Y_below_X_above)

my_x_2 <- sum((actual - expected)^2/expected)
my_x_2
## [1] 0.2666667
## goodness-of-fit test of the observed counts against the expected proportions
chisq.test(actual, p = expected/sum(expected))
## Warning in chisq.test(actual, p = expected/sum(expected)): Chi-squared
## approximation may be incorrect
## 
##  Chi-squared test for given probabilities
## 
## data:  actual
## X-squared = 0.26667, df = 3, p-value = 0.9662
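
As a cross-check, one could also run the standard test of independence on the 2x2 table of counts itself, where the expected counts come from the observed margins rather than the theoretical quartile probabilities (a sketch; the `counts` matrix hand-copies the table of counts built earlier):

## Pearson test of independence on the observed 2x2 counts
counts <- matrix(c(5, 11, 0, 4), nrow = 2,
                 dimnames = list(Y = c("Y<=y", "Y>y"), X = c("X<=x", "X>x")))
chisq.test(counts)  # small expected counts will trigger the same warning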