Problem 1

library(knitr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(grid)
library(ggplot2)

# X: 10,000 draws from a Uniform(1, 6) distribution
X <- runif(10000, min = 1, max = 6)
# Y: 10,000 draws from a Normal distribution with mean and sd both (6 + 1)/2 = 3.5
Y <- rnorm(10000, mean = (6 + 1) / 2, sd = (6 + 1) / 2)
df <- data.frame(X, Y)
hist(X)

hist(Y)
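Since ggplot2 and gridExtra are already loaded, the same two histograms can also be drawn with them; this is an optional alternative sketch, and bins = 30 is an arbitrary choice not taken from the original code.

p1 <- ggplot(df, aes(x = X)) + geom_histogram(bins = 30)  # roughly flat, uniform shape
p2 <- ggplot(df, aes(x = Y)) + geom_histogram(bins = 30)  # roughly bell-shaped
grid.arrange(p1, p2, ncol = 2)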

# Cutoffs: x is the sample median of X, y is the first quartile of Y
(x <- summary(X)[3])
##   Median 
## 3.492486
(y <- summary(Y)[2])
##  1st Qu. 
## 1.082792
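Equivalently (a cross-check, not part of the graded answer), the cutoffs can be taken directly with median() and quantile(); these use the same default quantile method as summary(), so the values match.

median(X)          # same value as summary(X)[3]
quantile(Y, 0.25)  # same value as summary(Y)[2]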

5 points.

  a. P(X>x | X>y)   b. P(X>x, Y>y)   c. P(X<x | X>y)
# a. P(X > x | X > y): probability X exceeds its median, given that X exceeds y
(length(X[X > x & X > y]) / length(X)) / (length(X[X > y]) / length(X))
## [1] 0.5080268
# b. P(X > x, Y > y): joint probability that X exceeds x and Y exceeds y
(length(X[X > x & Y > y]) / length(X))
## [1] 0.3746
# c. P(X < x | X > y): probability X is below its median, given that X exceeds y
(length(X[X < x & X > y]) / length(X)) / (length(X[X > y]) / length(X))
## [1] 0.4919732
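The same empirical probabilities can be written more compactly as means of logical vectors (an equivalent reformulation of the calculation above, not a different method):

mean(X > x & X > y) / mean(X > y)  # a. P(X > x | X > y)
mean(X > x & Y > y)                # b. P(X > x, Y > y)
mean(X < x & X > y) / mean(X > y)  # c. P(X < x | X > y)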

5 points.
Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.

# Count the observations in each quadrant defined by the cutoffs x and y
Table <- matrix(c(nrow(df %>% dplyr::filter(X < x & Y < y)),
                  nrow(df %>% dplyr::filter(X > x & Y < y)),
                  nrow(df %>% dplyr::filter(X < x & Y > y)),
                  nrow(df %>% dplyr::filter(X > x & Y > y))),
                nrow = 2)

colnames(Table) <- c('Y < y','Y > y')
rownames(Table) <- c('X < x','X > x')

(Table <- as.table(Table))
##       Y < y Y > y
## X < x  1246  3754
## X > x  1254  3746
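As a cross-check (not part of the original construction), the same 2x2 table can be built directly from logical indicators with table():

table(ifelse(df$X > x, "X > x", "X < x"),
      ifelse(df$Y > y, "Y > y", "Y < y"))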
Xgreaterx <- margin.table(Table, 1)[2] / margin.table(Table)  # P(X > x)
Ygreatery <- margin.table(Table, 2)[2] / margin.table(Table)  # P(Y > y)
MarginT <- Xgreaterx * Ygreatery               # product of the marginals
JointT <- Table[2, 2] / margin.table(Table)    # joint probability P(X > x, Y > y)
T2 <- data.frame(MarginT, JointT)
T2
##       MarginT JointT
## X > x   0.375 0.3746
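The product of the marginals (0.375) and the joint probability (0.3746) are nearly identical, which is what independence predicts; the gap can be checked directly:

abs(MarginT - JointT)  # difference of a few ten-thousandths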

5 points. Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?

Fisher’s Exact Test assesses the association between two categorical variables and remains valid with small cell sizes (expected counts below 5), where the chi-square approximation is unreliable; it computes an exact p-value conditional on the table margins. The chi-square test relies on a large-sample approximation and is used when the expected cell counts are large. With n = 10,000 and every expected count in the thousands, the chi-square test is the more appropriate (and computationally cheaper) choice here, although both tests give essentially the same answer.
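A quick check of the large-sample condition (a supplementary step, not in the original write-up) is to look at the expected cell counts that chisq.test computes; from the table above they are roughly 1250 and 3750, far above 5.

chisq.test(Table)$expected  # expected counts under independence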

FTest <- fisher.test(Table)
CTest <- chisq.test(Table)
FTest
## 
##  Fisher's Exact Test for Count Data
## 
## data:  Table
## p-value = 0.8716
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.9047465 1.0866202
## sample estimates:
## odds ratio 
##   0.991503
CTest
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Table
## X-squared = 0.026133, df = 1, p-value = 0.8716

The p-value for both tests is well above 0.05, so we fail to reject the null hypothesis of independence; the data are consistent with X and Y being independent, which is expected since they were generated from independent draws.
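As a final sanity check (a supplementary sketch, not part of the original solution), the Pearson statistic can be recomputed by hand from the observed and expected counts; note that chisq.test applies Yates' continuity correction by default on 2x2 tables, so correct = FALSE is needed for the manual value to match.

Expected <- outer(rowSums(Table), colSums(Table)) / sum(Table)  # expected counts
sum((Table - Expected)^2 / Expected)                            # uncorrected X-squared
chisq.test(Table, correct = FALSE)$statistic                    # should agree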