rm(list=ls())
library(tidyverse)
library(knitr)
library(xlsx)
## Read the raw CSV and discard its first data row.
df <- read.csv("final.csv", header = TRUE, stringsAsFactors = FALSE)[-1, ]
df <- as.data.frame(df)
## Column `X.6` supplies x and column `X` supplies y; both are coerced to
## numeric and paired into a two-column data frame.
x <- as.numeric(df$X.6)
y <- as.numeric(df$X)
new_df <- as.data.frame(cbind(x = x, y = y))
kable(new_df)

| x | y |
|---|---|
| 9.5 | 20.3 |
| 3.7 | 19.1 |
| 11.7 | 19.3 |
| 7.4 | 20.9 |
| 5.3 | 22.0 |
| 7.4 | 23.5 |
| 7.4 | 13.8 |
| 8.6 | 18.8 |
| 9.1 | 20.9 |
| 11.4 | 18.6 |
| 8.4 | 22.3 |
| 7.3 | 17.6 |
| 11.3 | 20.8 |
| 4.4 | 28.7 |
| 9.3 | 15.2 |
| 10.9 | 20.9 |
| 10.9 | 18.4 |
| 7.7 | 10.3 |
| 7.7 | 26.3 |
| 11.5 | 28.1 |
\[ P(X \mid Y) = \frac{P(X \cap Y)}{P(Y)} \]
The intersection of X and Y can be found programmatically by filtering the data on both conditions at once.
\[ .2/.75= .2667 \]
## Cutoffs: the 3rd quartile of x and the 1st quartile of y, looked up by name
## in the summary() output.
x_cutoff <- as.numeric(summary(x)["3rd Qu."])
y_cutoff <- as.numeric(summary(y)["1st Qu."])
## Total number of (x, y) observations.
total_combinations <- as.numeric(nrow(new_df))
## Rows with Y > y and the marginal probability P(Y > y) (= .75 here).
prob_y <- filter(new_df, y > y_cutoff)
prob_y1 <- nrow(prob_y) / total_combinations
## Rows with both X > x and Y > y, giving the joint probability.
x_and_y <- filter(new_df, x > x_cutoff, y > y_cutoff)
prob_x_and_y <- nrow(x_and_y) / total_combinations
## Conditional probability P(X > x | Y > y) = joint / marginal.
x_cond_y <- prob_x_and_y / prob_y1
x_cond_y ## [1] 0.2666667
\(P(X>x) \neq P(X>x| Y>y)\)
## Marginal probability P(X > x): share of observations above the x cutoff.
prob_x <- new_df %>%
  filter(x > x_cutoff)
prob_x1 <- nrow(prob_x) / total_combinations
## Compare the marginal with P(X > x | Y > y) using a numeric tolerance;
## exact `==` on doubles can report FALSE for values that are mathematically
## equal.
isTRUE(all.equal(prob_x1, x_cond_y)) ## [1] FALSE
\[ .75 * .2 = .15 \]
## Product of the two marginal probabilities, P(Y > y) * P(X > x).
x_prob2 <- filter(new_df, x > x_cutoff)
prob_y2 <- filter(new_df, y > y_cutoff)
p_y_marginal <- nrow(prob_y2) / total_combinations
p_x_marginal <- nrow(x_prob2) / total_combinations
new_prob <- p_y_marginal * p_x_marginal
new_prob ## [1] 0.15
## P(X < x | Y > y): joint probability of (X < x, Y > y) divided by P(Y > y).
x_and_y <- new_df %>%
  filter(y > y_cutoff & x < x_cutoff)
prob_x_and_y2 <- (nrow(x_and_y) / total_combinations) / prob_y1
print(prob_x_and_y2) ## [1] 0.6666667
## P(X == x | Y > y): observations sitting exactly on the x cutoff.
x_and_y <- new_df %>%
  filter(y > y_cutoff & x == x_cutoff)
prob_xequal_and_y2 <- (nrow(x_and_y) / total_combinations) / prob_y1
## The events X < x, X > x and X == x partition Y > y, so the three
## conditional probabilities must sum to 1. Compare with a tolerance instead
## of round(), which would accept any sum between 0.5 and 1.5.
isTRUE(all.equal(prob_x_and_y2 + x_cond_y + prob_xequal_and_y2, 1)) ## [1] TRUE
## Build the 2x2 table of counts (plus margins) for X relative to x_cutoff
## and Y relative to y_cutoff.
x_3rd_less <- new_df %>%
  filter(x <= x_cutoff)
x_greater3rd <- new_df %>%
  filter(x > x_cutoff)
## Column 1 of the table: observations with X <= x.
first <- as.numeric(nrow(filter(x_3rd_less, y <= y_cutoff)))
second <- as.numeric(nrow(filter(x_3rd_less, y > y_cutoff)))
total_row1 <- first + second
## Column 2 of the table: observations with X > x.
third <- as.numeric(nrow(filter(x_greater3rd, y <= y_cutoff)))
fourth <- as.numeric(nrow(filter(x_greater3rd, y > y_cutoff)))
total_row2 <- third + fourth
## The "Total" column is derived from the cell counts rather than typed in,
## so the margins stay correct if the data or the cutoffs change.
new_table <- as.data.frame(cbind(
  c(first, second, total_row1),
  c(third, fourth, total_row2),
  c(first + third, second + fourth, total_row1 + total_row2)
))
rownames(new_table) <- c("Y<=y", "Y>y", "total")
colnames(new_table) <- c("x<=x", "X>x", "Total")
kable(new_table)

|  | x<=x | X>x | Total |
|---|---|---|---|
| Y<=y | 5 | 0 | 5 |
| Y>y | 11 | 4 | 15 |
| total | 16 | 4 | 20 |
If two variables are independent, then \[ P(A|B) = P(A) \]
## First-quartile cutoffs for x and y.
X1 <- as.numeric(summary(x)["1st Qu."])
y1 <- as.numeric(summary(y)["1st Qu."])
n_obs <- nrow(new_df)
## A: observations with x strictly above its 1st quartile. The same strict
## inequality is used for A, B and their intersection so that the three
## probabilities describe the same events.
A <- new_df %>%
  filter(x > X1)
prob_A <- nrow(A) / n_obs
## B: observations with y strictly above its 1st quartile.
B <- new_df %>%
  filter(y > y1)
prob_B <- nrow(B) / n_obs
## Observations that are in both A and B.
A_given_b <- new_df %>%
  filter(x > X1 & y > y1)
## P(A and B)
prob_a_and_b <- nrow(A_given_b) / n_obs
## P(A | B) = P(A and B) / P(B)
ProbabilityA_given_B <- prob_a_and_b / prob_B
## If A and B are independent then P(A | B) equals P(A). Probabilities are
## doubles, so compare with a tolerance rather than `==`.
isTRUE(all.equal(ProbabilityA_given_B, prob_A)) ## [1] FALSE
## Equivalent check: P(A and B) equals P(A) * P(B).
isTRUE(all.equal(prob_a_and_b, prob_A * prob_B)) ## [1] FALSE
\[ \chi^2 = \sum \frac{(observed - expected)^2}{expected} \]
## Label every observation by whether x and y fall at/above (>=) or below (<)
## their first-quartile cutoffs X1 and y1.
## NOTE(review): ties at a quartile count as "above" here; confirm that this
## matches the intended definition of A and B.
categories <- c(
  "Both_above", "both_below", "X_below_Y_above", "Y_below_X_above"
)
newer_df <- new_df %>%
  mutate(new_col = case_when(
    x >= X1 & y >= y1 ~ "Both_above",
    x < X1 & y < y1 ~ "both_below",
    x >= X1 & y < y1 ~ "Y_below_X_above",
    x < X1 & y >= y1 ~ "X_below_Y_above"
  ))
## Fixing the factor levels keeps the category order explicit and keeps a
## category in the table even when its count is zero.
actual <- table(factor(newer_df$new_col, levels = categories))
## Expected counts under independence, using the nominal quartile
## probabilities (3/4 above the 1st quartile, 1/4 below).
n_obs <- nrow(new_df)
both_above <- n_obs * (3 / 4) * (3 / 4)
Y_below_X_above <- n_obs * (1 / 4) * (3 / 4)
X_below_Y_above <- n_obs * (1 / 4) * (3 / 4)
both_below <- n_obs * (1 / 4) * (1 / 4)
actual ##
## Both_above both_below X_below_Y_above Y_below_X_above
## 12 1 3 4
## Name the expected counts and reorder them to the category order of
## `actual`; pairing them by position in a different order compares the wrong
## cells and inflates the statistic.
expected <- c(
  Both_above = both_above,
  both_below = both_below,
  X_below_Y_above = X_below_Y_above,
  Y_below_X_above = Y_below_X_above
)[categories]
my_x_2 <- sum((actual - expected)^2 / expected)
my_x_2 ## [1] 0.2666667
## Goodness-of-fit test of the observed counts against the expected
## proportions. The expected values must go in `p`: a second positional
## argument is treated as another factor to cross-tabulate with the first.
chisq.test(actual, p = expected / sum(expected))
## Output after re-knitting with the corrected call (a warning that the
## chi-squared approximation may be incorrect is still expected, because two
## expected counts are below 5):
##
## Chi-squared test for given probabilities
##
## data: actual
## X-squared = 0.26667, df = 3, p-value = 0.9662