2)

a)

\[ \text{Note: The top row displays all data. The bottom row displays a random selection of 'No' data that is equal in size to} \\ \text{the 'Yes' data. This is for display purposes only. We use the complete data set to construct every model.} \]

b)

\[ \begin{aligned} \operatorname{E}[\text{balance | default = Yes}] & \approx \text{\$1747.88} \\ \operatorname{E}[\text{balance | default = No}] & \approx \text{\$803.94} \\ \operatorname{\hat{\sigma}^{2}} &= \frac{1}{n-K} \sum_{k=1}^K(n_k-1)\operatorname{\hat\sigma^2_k}\\ &=\frac{n_1-1}{n-K} \operatorname{Var}[\text{balance | default = Yes}] + \frac{n_2-1}{n-K} \operatorname{Var}[\text{balance | default = No}] \\ &=\frac{332}{9998} 116,113.3 + \frac{9666}{9998} 208,349\\ &\approx \text{\$205,286.16}\\ \operatorname{\hat{\pi}_{Y}} &= \frac{333}{10,000} = \text{3.33%} \\ \operatorname{\hat{\pi}_{N}} &= \frac{9,667}{10,000} = \text{96.67%} \end{aligned} \]

c)

d)

\[ \text{The decision boundary shifted right from \$1,276.7 to \$2,007.3 (approximate intercepts) when we incorporated priors. That is,}\\ \text{we require a higher balance of credit card debt before the probability of defaulting exceeds the probability of not defaulting} \\ \text{when we adjust for priors. We prefer the adjusted model because it does a better job accounting for the entire population} \\ \text{as you can see from the top and bottom data rows.} \]

### e)

f)

b)

\[ \begin{aligned} \operatorname{E}[\text{balance | default = Yes}] & \approx \text{\$1747.88} \\ \operatorname{E}[\text{balance | default = No}] & \approx \text{\$803.94} \\ \operatorname{\hat{\sigma}^{2}_Y} &= \operatorname{Var}[\text{balance | default = Yes}] \approx \text{\$116,113}\\ \operatorname{\hat{\sigma}^{2}_N} &= \operatorname{Var}[\text{balance | default = No}] \approx \text{\$208,349}\\ \operatorname{\hat{\pi}_{Y}} &= \frac{333}{10,000} = \text{3.33%} \\ \operatorname{\hat{\pi}_{N}} &= \frac{9,667}{10,000} = \text{96.67%} \end{aligned} \]

c)

d)

\[ \text{The decision boundary shifted right from \$1,297.9 to \$1,978.1 (approximate intercepts) when we incorporated priors. Our assessment}\\ \text{is effectively the same as when we used pooled variance, albeit the QDA method produces a marginally smaller interval between} \\ \text{the shifts. This suggests that both sets of data have the same population variance.} \]

3)

# circle = read.csv("C:/Users/Will/OneDrive/Documents/School/SDS 323/HW 2/circle.txt", header = FALSE)
# Load the headerless circle data set; read.csv names the columns V1, V2, V3.
circle <- read.csv("C:/Users/wtc464/Downloads/circle.txt", header = FALSE)

# Remove a stale `X` from the current environment so it cannot mask the
# attached column of the same name. The exists() guard avoids the
# "object 'X' not found" warning when no such object is present.
if (exists("X", inherits = FALSE)) {
  rm(X)
}

# Rename the two coordinate columns; the third column keeps its default name.
colnames(circle)[1:2] <- c("X", "Y")

# NOTE(review): attach() is retained so the bare names X and Y resolve in the
# plot call below and in any later code that relies on bare column names.
attach(circle)

# Scatter plot of all observations on a fixed [-1, 1] x [-1, 1] window.
plot(X, Y, xlim = c(-1, 1), ylim = c(-1, 1))

# Split the two coordinate columns (positions 1 and 2) by the class label V3.
Xgiven0 <- circle[[1L]][circle$V3 == 0]
Ygiven0 <- circle[[2L]][circle$V3 == 0]
Xgiven1 <- circle[[1L]][circle$V3 == 1]
Ygiven1 <- circle[[2L]][circle$V3 == 1]

# Observations with V3 == 0, drawn on the same fixed window as the full plot.
plot(Xgiven0, Ygiven0, xlim = c(-1, 1), ylim = c(-1, 1))

# Cross-class pairings, left disabled:
# plot(Xgiven0,Ygiven1)
# plot(Xgiven1,Ygiven0)

# Observations with V3 == 1, same window for direct visual comparison.
plot(Xgiven1, Ygiven1, xlim = c(-1, 1), ylim = c(-1, 1))

\[ \text{The data appear to be randomly distributed within donut-shaped bounds when we plot X against Y. When we control for Variable 3,} \\ \text{we see that V3 = 0 corresponds to X-Y coordinates that are closer to the center of the donut than the V3 = 1 X-Y coordinates. It} \\ \text{appears that V3 = 1 and V3 = 0 do not represent data drawn from the same population that was divided along arbitrary bounds.} \\ \text{We made this assessment because some of the V3 = 1 data fall within the bounds of the V3 = 0 data.} \]

b)

# Logistic regression of the class label V3 on every other column of `circle`
# (the coefficient table below shows these are X and Y).
glm_fit <- glm(V3 ~ ., data = circle, family = binomial)

# Fitted probabilities for the data the model was fit on, then thresholded at
# 0.5 so glm_pred ends up as a logical vector (TRUE = predicted class 1).
glm_pred <- predict(glm_fit, type = "response")
glm_pred <- glm_pred > 0.5

# Confusion matrix of true label vs. prediction. The label is read from the
# data frame explicitly rather than through an attach()'d bare `V3`, which
# could be masked by a global object of the same name.
table(true = circle$V3, pred = glm_pred)
##     pred
## true FALSE TRUE
##    0    35   25
##    1    28   30
# Coefficient estimates with standard errors, z statistics and p-values.
summary(glm_fit)$coefficients
##                Estimate Std. Error     z value  Pr(>|z|)
## (Intercept) -0.01418412  0.1980570 -0.07161635 0.9429072
## X           -0.30352113  0.3780411 -0.80287875 0.4220448
## Y           -0.01813178  0.3599832 -0.05036842 0.9598288