1 Problem 1

Pick one of the quantitative independent variables (Xi) from the data set below, and define that variable as X. Also, pick one of the dependent variables (Yi) below, and define that as Y.

library(tidyverse)
library(reshape)
library(knitr)

# Download the final-exam data set from GitHub and render it as a table.
data605 <- read_csv(
  file = "https://raw.githubusercontent.com/niteen11/CUNY_DATA_605/master/data_finalexam/605final.csv"
)
kable(x = data605)
Y1 Y2 Y3 Y4 X1 X2 X3 X4
20.3 20.8 28.4 20.2 9.3 7.4 9.5 9.3
19.1 14.6 21.5 18.6 4.1 6.4 3.7 12.4
19.3 18.0 20.8 22.6 22.4 8.5 11.7 19.9
20.9 7.3 22.2 11.4 9.1 9.5 7.4 6.9
22.0 19.4 21.6 23.6 15.8 11.8 5.3 -1.0
23.5 13.5 21.8 24.0 7.1 8.8 7.4 10.6
13.8 14.7 25.2 26.0 15.9 8.4 7.4 6.4
18.8 15.3 22.5 26.8 6.9 5.1 8.6 10.6
20.9 12.6 21.1 19.7 16.0 11.4 9.1 1.2
18.6 13.0 21.7 22.7 6.7 15.1 11.4 7.7
22.3 13.1 21.4 16.8 8.2 12.6 8.4 15.5
17.6 10.3 20.8 20.2 16.0 8.0 7.3 6.9
20.8 14.9 23.0 21.7 6.4 10.3 11.3 13.7
28.7 14.8 17.4 20.9 11.8 10.4 4.4 3.7
15.2 16.2 21.3 26.9 3.5 9.5 9.3 4.4
20.9 15.7 15.1 16.3 21.7 9.5 10.9 11.5
18.4 16.3 17.8 19.9 12.2 15.1 10.9 4.2
10.3 11.5 26.4 15.5 9.3 6.6 7.7 13.9
26.3 12.2 21.6 26.5 8.0 15.4 7.7 12.9
28.1 11.8 22.5 21.7 6.2 8.2 11.5 1.2

Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 3d quartile of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

# Per-column summary statistics (min, quartiles, median, mean, max); the
# quartiles printed here are the candidate cut-offs used for x and y below.
summary(data605)
##        Y1              Y2              Y3              Y4       
##  Min.   :10.30   Min.   : 7.30   Min.   :15.10   Min.   :11.40  
##  1st Qu.:18.55   1st Qu.:12.50   1st Qu.:21.02   1st Qu.:19.43  
##  Median :20.55   Median :14.65   Median :21.60   Median :21.30  
##  Mean   :20.29   Mean   :14.30   Mean   :21.70   Mean   :21.10  
##  3rd Qu.:22.07   3rd Qu.:15.82   3rd Qu.:22.50   3rd Qu.:23.70  
##  Max.   :28.70   Max.   :20.80   Max.   :28.40   Max.   :26.90  
##        X1              X2              X3               X4        
##  Min.   : 3.50   Min.   : 5.10   Min.   : 3.700   Min.   :-1.000  
##  1st Qu.: 6.85   1st Qu.: 8.15   1st Qu.: 7.400   1st Qu.: 4.350  
##  Median : 9.20   Median : 9.50   Median : 8.500   Median : 8.500  
##  Mean   :10.83   Mean   : 9.90   Mean   : 8.545   Mean   : 8.595  
##  3rd Qu.:15.82   3rd Qu.:11.50   3rd Qu.:10.900   3rd Qu.:12.525  
##  Max.   :22.40   Max.   :15.40   Max.   :11.700   Max.   :19.900
# Use X4 as the independent variable and Y4 as the dependent variable.
X <- data605[["X4"]]
Y <- data605[["Y4"]]
XY <- cbind(X, Y)

# Number of paired observations.
rec <- nrow(XY)

# Cut-offs required by the exercise:
#   x = 3rd quartile of X, y = 1st quartile of Y.
x <- quantile(X, probs = 0.75)
y <- quantile(Y, probs = 0.25)

print(x)
##    75% 
## 12.525
print(y)
##    25% 
## 19.425
# Sample size: the denominator for every marginal probability below.
total <- nrow(data605)

# P(Y > y): keep the rows whose Y4 exceeds the 1st-quartile cut-off, then take
# their share of the whole sample.
Yy <- data605[data605[["Y4"]] > y, ]
pY <- round(nrow(Yy) / total, digits = 4)

# P(X > x): same idea for X4 and the 3rd-quartile cut-off.
Xx <- data605[data605[["X4"]] > x, ]
pX <- round(nrow(Xx) / total, digits = 4)

pX
## [1] 0.25
pY
## [1] 0.75

Evaluate \(P(X>x | Y>y)\)

# P(X>x | Y>y): conditional probability, so the denominator is the number of
# rows that satisfy the condition Y > y (nrow(Yy)), not the full sample size.
# Dividing by `total` would yield the joint probability P(X>x, Y>y) instead.
p1 <- round(nrow(Yy[Yy$X4 > x, ]) / nrow(Yy), 4)
print(paste0("P(X>x | Y>y) = ", p1))
## [1] "P(X>x | Y>y) = 0.2"

Evaluate \(P(X>x \space , \space Y>y)\)

# P(X>x, Y>y): joint probability, counted directly as the share of rows that
# satisfy both conditions. The product pX * pY equals the joint probability
# only when X and Y are independent, which is exactly what is tested later,
# so it must not be assumed here.
p2 <- round(nrow(data605[data605$X4 > x & data605$Y4 > y, ]) / total, 4)
print(paste0("P(X>x, Y>y) = ", p2))
## [1] "P(X>x, Y>y) = 0.15"

Evaluate \(P(X<x \space | \space Y>y)\)

# P(X<x | Y>y): among the rows with Y above y (nrow(Yy)), the share whose X is
# at or below x.
p3 <- round(
  nrow(data605[X <= x & Y > y, ]) / nrow(Yy),
  digits = 4
)
print(paste0("P(X<x | Y>y) = ", p3))
## [1] "P(X<x | Y>y) = 0.8"

Quartile Table

# 2 x 2 table of counts with margins.
#   Rows    : Y split at its 1st quartile (y).
#   Columns : X split at its 3rd quartile (x).
c1 <- nrow(data605[X <= x & Y <= y, ])  # X <= x, Y <= y
c2 <- nrow(data605[X <= x & Y > y, ])   # X <= x, Y >  y
c3 <- c1 + c2                           # all rows with X <= x
c4 <- nrow(data605[X > x & Y <= y, ])   # X >  x, Y <= y
c5 <- nrow(data605[X > x & Y > y, ])    # X >  x, Y >  y
c6 <- c4 + c5                           # all rows with X > x
c7 <- c1 + c4                           # all rows with Y <= y
c8 <- c2 + c5                           # all rows with Y > y
c9 <- c3 + c6                           # grand total

# (c1, c2, c3) all describe X <= x and (c4, c5, c6) all describe X > x, so each
# triple has to fill a COLUMN of the table. Filling row-wise put the X groups
# under the Y row labels, i.e. the table was transposed relative to its labels.
# The cells are integer counts, so no rounding is needed.
quartile.table <- matrix(c(c1, c2, c3,
                           c4, c5, c6,
                           c7, c8, c9), nrow = 3, ncol = 3, byrow = FALSE)

colnames(quartile.table) <- c("<=3d quartile", ">3d quartile", "Total")
rownames(quartile.table) <- c('<=1st quartile', '>1st quartile', 'Total')
quartile.table <- as.table(quartile.table)

kable(quartile.table)
<=3d quartile >3d quartile Total
<=1st quartile 3 2 5
>1st quartile 12 3 15
Total 15 5 20

Does splitting the training data in this fashion make them independent? Let A be the new variable counting those observations above the 1st quartile for X, and let B be the new variable counting those observations above the 1st quartile for Y. Does \(P(AB)=P(A)P(B)\)?

# 1st-quartile cut-offs for both variables.
x.q1 <- quantile(X, 0.25) # x.q1 = 4.35
y.q1 <- quantile(Y, 0.25) # y.q1 = 19.425

# A: rows with X4 above its 1st quartile; B: rows with Y4 above its 1st quartile.
A <- subset(data605, X4 > x.q1)
B <- subset(data605, Y4 > y.q1)

A
## # A tibble: 15 x 8
##       Y1    Y2    Y3    Y4    X1    X2    X3    X4
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  20.3 20.8   28.4  20.2  9.30  7.40  9.50  9.30
##  2  19.1 14.6   21.5  18.6  4.10  6.40  3.70 12.4 
##  3  19.3 18.0   20.8  22.6 22.4   8.50 11.7  19.9 
##  4  20.9  7.30  22.2  11.4  9.10  9.50  7.40  6.90
##  5  23.5 13.5   21.8  24.0  7.10  8.80  7.40 10.6 
##  6  13.8 14.7   25.2  26.0 15.9   8.40  7.40  6.40
##  7  18.8 15.3   22.5  26.8  6.90  5.10  8.60 10.6 
##  8  18.6 13.0   21.7  22.7  6.70 15.1  11.4   7.70
##  9  22.3 13.1   21.4  16.8  8.20 12.6   8.40 15.5 
## 10  17.6 10.3   20.8  20.2 16.0   8.00  7.30  6.90
## 11  20.8 14.9   23.0  21.7  6.40 10.3  11.3  13.7 
## 12  15.2 16.2   21.3  26.9  3.50  9.50  9.30  4.40
## 13  20.9 15.7   15.1  16.3 21.7   9.50 10.9  11.5 
## 14  10.3 11.5   26.4  15.5  9.30  6.60  7.70 13.9 
## 15  26.3 12.2   21.6  26.5  8.00 15.4   7.70 12.9
# Print B, the rows whose Y4 exceeds the 1st quartile of Y.
B
## # A tibble: 15 x 8
##       Y1    Y2    Y3    Y4    X1    X2    X3     X4
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
##  1  20.3  20.8  28.4  20.2  9.30  7.40  9.50   9.30
##  2  19.3  18.0  20.8  22.6 22.4   8.50 11.7   19.9 
##  3  22.0  19.4  21.6  23.6 15.8  11.8   5.30  -1.00
##  4  23.5  13.5  21.8  24.0  7.10  8.80  7.40  10.6 
##  5  13.8  14.7  25.2  26.0 15.9   8.40  7.40   6.40
##  6  18.8  15.3  22.5  26.8  6.90  5.10  8.60  10.6 
##  7  20.9  12.6  21.1  19.7 16.0  11.4   9.10   1.20
##  8  18.6  13.0  21.7  22.7  6.70 15.1  11.4    7.70
##  9  17.6  10.3  20.8  20.2 16.0   8.00  7.30   6.90
## 10  20.8  14.9  23.0  21.7  6.40 10.3  11.3   13.7 
## 11  28.7  14.8  17.4  20.9 11.8  10.4   4.40   3.70
## 12  15.2  16.2  21.3  26.9  3.50  9.50  9.30   4.40
## 13  18.4  16.3  17.8  19.9 12.2  15.1  10.9    4.20
## 14  26.3  12.2  21.6  26.5  8.00 15.4   7.70  12.9 
## 15  28.1  11.8  22.5  21.7  6.20  8.20 11.5    1.20
# P(AB): share of rows that are above the 1st quartile on both X4 and Y4.
p.ab <- nrow(subset(data605, data605$X4 > x.q1 & data605$Y4 > y.q1)) / total

# P(A)P(B): product of the two marginal probabilities.
pa <- nrow(A) / total
pb <- nrow(B) / total
pa.pb <- pa * pb

# Independence would require P(AB) to equal P(A)P(B). Both sides are
# floating-point quotients, so compare with a numeric tolerance rather than
# exact `==`, which can report FALSE for values that differ only by rounding.
isTRUE(all.equal(p.ab, pa.pb))
## [1] FALSE

Check mathematically, and then evaluate by running a Chi Square test for association.

# chisq.test() ignores its second argument when the first is a matrix or data
# frame, so chisq.test(A, B) treated the 15 x 8 subset A itself as a
# contingency table (hence "data: A" and df = 98). The association between the
# two events is tested on the 2 x 2 table of their indicators instead.
# Some expected counts are below 5, so R warns that the chi-squared
# approximation may be incorrect; fisher.test(ab.table) is an exact alternative.
ab.table <- table(A = data605$X4 > x.q1, B = data605$Y4 > y.q1)
chisq.test(ab.table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  ab.table
## X-squared = 0.8, df = 1, p-value = 0.3711

The p-value is greater than 0.05, so we fail to reject the null hypothesis that A and B are independent.