Data 605 - Final Problem 1

Problem 1

Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with mean and standard deviation \(\mu = \sigma = (N+1)/2\).

creating the Training set:

# Fix the RNG seed so the simulated sample is reproducible.
set.seed(4242)

# N is the upper bound of the uniform draw; n is the sample size.
N <- 7
n <- 10000

# The normal variable uses the same value for its mean and its standard
# deviation: (N + 1) / 2.
sigma <- (N + 1) / 2
mu <- sigma

# X ~ Uniform(1, N) is drawn first, then Y ~ Normal(mu, sigma).
tset <- data.frame(
  X = runif(n, min = 1, max = N),
  Y = rnorm(n, mean = mu, sd = sigma)
)

Set 1 - Probability.

Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

# Cut-offs used in every probability below:
#   x is the sample median of X,
#   y is the first quartile (25th percentile) of Y.
x <- median(tset[["X"]])
y <- quantile(tset[["Y"]], probs = 0.25)

y

##      25% 
## 1.244764

Set 2 - P Functions

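a. P(X>x | X>y): conditional probability of A given B, P(A|B) = P(A and B) / P(B) — the probability that X is greater than its median given that X is greater than the first quartile of Y.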

# Numerator: share of the n rows where X exceeds both cut-offs,
# i.e. the joint probability P(X > x and X > y).
Pa1 <- with(tset, sum(X > x & X > y)) / n

# Denominator: share of the n rows where X exceeds y, i.e. P(X > y).
Pa2 <- with(tset, sum(X > y)) / n

# Conditional probability P(X > x | X > y).
Pa1 / Pa2

## [1] 0.5213764

b. P(X>x, Y>y)
Joint probability P(A and B): the probability that X is greater than x and, at the same time, Y is greater than y.

# Joint probability: share of the n rows where X is above x and Y is above y.
Pb2 <- with(tset, sum(X > x & Y > y)) / n
Pb2

## [1] 0.377

c. P(X<x | X>y)
Conditional probability of A given B, P(A|B) = P(A and B) / P(B): the probability that X is less than its own median given that X is greater than the first quartile of Y.

# Conditional probability P(X < x | X > y) = P(X < x and X > y) / P(X > y).
# The joint share alone is not the conditional probability: it has to be
# divided by the share of rows satisfying the conditioning event X > y.
# Both shares have the same denominator n, so the ratio of the two counts
# gives the conditional probability directly.
Pb3 <- with(tset, sum(X < x & X > y) / sum(X > y))

Pb3

## [1] 0.4786236

Set 3

Investigate whether P(X>x and Y>y)=P(X>x)P(Y>y) by building a table and evaluating the marginal and joint probabilities.

# Joint probability
# Label every row by which side of each cut-off it falls on, then count the
# rows in each (P1, P2) cell and divide by the sample size n.
matrix <- tset %>%
  group_by(
    P1 = ifelse(X > x, "X>X", "X<X"),
    P2 = ifelse(Y > y, "Y>y", "Y<y")
  ) %>%
  summarise(count = n(), probability = count / n)
matrix

## # A tibble: 4 x 4
## # Groups:   P1 [2]
##   P1    P2    count probability
##   <chr> <chr> <int>       <dbl>
## 1 X<X   Y<y    1270       0.127
## 2 X<X   Y>y    3730       0.373
## 3 X>X   Y<y    1230       0.123
## 4 X>X   Y>y    3770       0.377

# Building marginal probability
# Collapse the joint table over P2 to get the X marginals, tag those rows
# with P2 = "Total", and stack them on top of the joint cells.
matrix2 <- bind_rows(
  matrix %>%
    group_by(P1) %>%
    summarise(count = sum(count), probability = sum(probability)) %>%
    mutate(P2 = "Total"),
  matrix
)
matrix2

## # A tibble: 6 x 4
##   P1    count probability P2   
##   <chr> <int>       <dbl> <chr>
## 1 X<X    5000       0.5   Total
## 2 X>X    5000       0.5   Total
## 3 X<X    1270       0.127 Y<y  
## 4 X<X    3730       0.373 Y>y  
## 5 X>X    1230       0.123 Y<y  
## 6 X>X    3770       0.377 Y>y

# Collapse over P1 to get the Y marginals (and the grand total from the
# existing "Total" rows), tag them with P1 = "Total", and stack them on top
# of the rows already in matrix2.
matrix3 <- bind_rows(
  matrix2 %>%
    group_by(P2) %>%
    summarise(count = sum(count), probability = sum(probability)) %>%
    mutate(P1 = "Total"),
  matrix2
)
matrix3

## # A tibble: 9 x 4
##   P2    count probability P1   
##   <chr> <int>       <dbl> <chr>
## 1 Total 10000       1     Total
## 2 Y<y    2500       0.25  Total
## 3 Y>y    7500       0.75  Total
## 4 Total  5000       0.5   X<X  
## 5 Total  5000       0.5   X>X  
## 6 Y<y    1270       0.127 X<X  
## 7 Y>y    3730       0.373 X<X  
## 8 Y<y    1230       0.123 X>X  
## 9 Y>y    3770       0.377 X>X

# Table
# Wide layout of the probabilities: one row per P2 level (shown under the
# heading `Compare`) and one column per P1 level.
matrix3 %>%
  select(Compare = P2, P1, probability) %>%
  spread(P1, probability)

## # A tibble: 3 x 4
##   Compare Total `X<X` `X>X`
##   <chr>   <dbl> <dbl> <dbl>
## 1 Total    1    0.5   0.5  
## 2 Y<y      0.25 0.127 0.123
## 3 Y>y      0.75 0.373 0.377

Set 4

Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?

# The tests below take a matrix of counts, so reshape the long table:
# keep only the joint cells (no "Total" rows), drop the probabilities, and
# spread the P1 levels into columns.
pre.q4 <- matrix3 %>%
  filter(P1 != "Total" & P2 != "Total") %>%
  select(P2, P1, count) %>%
  spread(P1, count) %>%
  as.data.frame()

# Use the P2 labels as row names.
rownames(pre.q4) <- pre.q4$P2

# Drop the label column and convert what is left to a matrix.
q4 <- as.matrix(select(pre.q4, -P2))

Running the fisher test

# Fisher's exact test of independence on the 2 x 2 table of counts.
# simulate.p.value is only used for tables larger than 2 x 2, so it has no
# effect on q4 and was dropped to avoid implying a Monte Carlo p-value.
fisher.test(q4)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  q4
## p-value = 0.3678
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.9522537 1.1436981
## sample estimates:
## odds ratio 
##   1.043608

Running the Chi Square test

# Pearson's chi-squared test of independence on the same table of counts.
# correct = TRUE is the default; it is spelled out because the continuity
# correction is what the reported statistic uses.
chisq.test(q4, correct = TRUE)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  q4
## X-squared = 0.8112, df = 1, p-value = 0.3678