\[ \text{Let} \\ Y = \beta_0+\beta_1X+\varepsilon \\ X = [x_1, ..., x_n] \\ Y = [y_1, ..., y_n] \\ \varepsilon = [\varepsilon_1, ..., \varepsilon_n] \sim N(0,\sigma^2)\\ \hat{\beta} = [\hat{\beta}_0,\hat{\beta}_1] \ \text{denotes estimators of} \ \beta = [\beta_0, \beta_1] \\ \]
\[ \text{We want to show that} \\ \hat{\beta} = \text{argmin}_{\beta \in \mathbb{R}^2} \Bigg\{ \sum_{i=1}^n(y_i-\beta_0 - \beta_1x_i)^2 \Bigg\} = \text{argmax}_{\beta \in \mathbb{R}^2} \Bigg\{ \prod_{i=1}^np(y_i|\beta,\sigma^2) \Bigg\} \\ \text{Note that the conditional density function for} \ y_i\text{,} \ p(y_i|\beta,\sigma^2), \\ \text{is that of a normal distribution:} \ y_i\sim N(\beta_0+\beta_1x_i,\sigma^2) \]
\[ \text{1) This means that the probability density function of some point} \ y_i \ \text{is} \\ f(y_i|\beta, \sigma^2) = (2\pi\sigma^2)^{-\frac{1}{2}}e^{-\frac{(y_i-\hat\beta_0-\hat\beta_1x_i)^2}{2\sigma^2}} \]
\[ \text{2) We apply the likelihood function}\\ L[Y|\beta, \sigma^2] = \prod_{i=1}^n\bigg( (2\pi\sigma^2)^{-\frac{1}{2}}e^{-\frac{(y_i-\hat\beta_0-\hat\beta_1x_i)^2}{2\sigma^2}} \bigg) \\ \\ \] \[ \text{3) Factor the constants} \\ L[Y|\beta, \sigma^2] = (2\pi\sigma^2)^{-\frac{n}{2}} \prod_{i=1}^n\bigg( e^{-\frac{(y_i-\hat\beta_0-\hat\beta_1x_i)^2}{2\sigma^2}} \bigg) \\ \\ \] \[ \text{4) Combine the products, rewrite the constants} \\ L[Y|\beta, \sigma^2] = (2\pi)^{-\frac{n}{2}}\sigma^{-n} \bigg( e^{-\frac{\sum_{i=1}^n(y_i-\hat\beta_0-\hat\beta_1x_i)^2}{2\sigma^2}} \bigg) \\ \\ \] \[ \text{5) Calculate the log-likelihood} \\ \ln(L[Y|\beta, \sigma^2]) = -\frac{n}{2}\ln(2\pi) -n\ln(\sigma)-\frac{1}{2\sigma^2}\sum_{i=1}^n ( y_i-\hat\beta_0-\hat\beta_1x_i )^2 \\ \text{Pause, and let us look at the residual sum of squares.} \] \[ \text{6) Note the similarities between RSS and the log-likelihood function} \\ RSS = \sum_{i=1}^n(y_i-\hat\beta_0 - \hat\beta_1x_i)^2 \\ \]
\[ \text{7) The first step to solving for} \ \hat\beta \ \text{is to differentiate either function with respect to} \ \hat\beta_0. \\ \frac{\partial}{\partial\hat\beta_0}\ln(L[Y|\beta, \sigma^2])= -\frac{1}{2\sigma^2}\frac{\partial}{\partial\hat\beta_0}\sum_{i=1}^n( y_i-\hat\beta_0-\hat\beta_1x_i )^2 \\ \text{and} \\ \frac{\partial}{\partial\hat\beta_0}RSS= \frac{\partial}{\partial\hat\beta_0}\sum_{i=1}^n(y_i-\hat\beta_0 - \hat\beta_1x_i)^2 \\ \] \[ \text{8) Set both derivatives equal to 0. Multiply the first equation by} \ -2\sigma^2 \ \text{to get rid of the constant.}\\ \text{This gives us} \\ 0=\sum_{i=1}^n(y_i-\hat\beta_0 - \hat\beta_1x_i) \\ \text{for both functions. This demonstrates equivalence between MLE and least squares linear regression.} \\ \]
\[ \text{9) We can use a similar process to establish equivalence when we differentiate with respect to } \ \hat\beta_1 \text{, giving us the equivalence} \\ 0=\sum_{i=1}^nx_i(y_i-\hat\beta_0 - \hat\beta_1x_i) \\ \]
\[ \text{10) From here it is simply a matter of solving for} \ \hat\beta_1\text{ and then} \ \hat\beta_0 \\ \text{Let's begin by distributing the sum in 8) and solving for} \ \hat\beta_0 \\ \sum_{i=1}^n\hat\beta_0 = \sum_{i=1}^ny_i-\sum_{i=1}^n\hat\beta_1x_i \]
\[ \text{11) Multiply both sides by} \ \frac{1}{n} \\ E[\hat\beta_0]=E[Y] - E[\hat\beta_1X] \\ \hat\beta_0 = E[Y] - \hat\beta_1E[X] \\ \text{Now we solve for} \ \hat\beta_1 \]
\[ \text{12) Let us return to step 9). Substitute} \ \hat\beta_0 \\ 0=\sum_{i=1}^nx_i(y_i-\hat\beta_0 - \hat\beta_1x_i) = \sum_{i=1}^nx_i(y_i-E[Y] + \hat\beta_1E[X] - \hat\beta_1x_i) \] \[ \text{13) Distribute} \ x_i \ \text{and the summation.} \\ 0 = \sum_{i=1}^nx_iy_i-\sum_{i=1}^nx_iE[Y] + \hat\beta_1\sum_{i=1}^nx_iE[X] - \hat\beta_1\sum_{i=1}^nx_i^2 \]
\[ \text{14) Solve for} \ \hat\beta_1 \\ \hat\beta_1 = \frac{\sum_{i=1}^nx_iy_i-\sum_{i=1}^nx_iE[Y]}{\sum_{i=1}^nx_i^2-\sum_{i=1}^nx_iE[X]} \]
\[ \text{15) Multiply numerator and denominator by} \ \frac{1}{n} \\ \hat\beta_1 = \frac{\frac{1}{n}}{\frac{1}{n}} \frac{\sum_{i=1}^nx_iy_i-\sum_{i=1}^nx_iE[Y]}{\sum_{i=1}^nx_i^2-\sum_{i=1}^nx_iE[X]} \\ \text{ } \\ \hat\beta_1 = \frac{E[XY]-E[X]E[Y]}{E[X^2]-E[X]^2} \\ \text{ } \\ \hat\beta_1 = \frac{Cov[X,Y]}{Var[X]} \]
\[ \text{Let} \\ \hat{\beta} = \text{argmin}_{\beta \in \mathbb{R}^2} \Bigg\{ \sum_{i=1}^nw_i(y_i-\beta_0 - \beta_1x_i)^2 \Bigg\}\\ \text{we want to show that} \ w_i \ \text{are weights that correspond to the maximum-likelihood solution under heteroskedastic Gaussian error}\\ \hat\beta=\text{argmax}_{\beta \in \mathbb{R}^2} \Bigg\{ \prod_{i=1}^np(y_i|\beta,\sigma_i^2) \Bigg\} \\ \text{where} \\ p(y_i|\beta,\sigma_i^2) = N(\beta_0+\beta_1x_i, \sigma_i^2)\\ \]
\[ \text{Recall from part a) that we can transform the likelihood function into the log-likelihood} \\ L[Y|\beta, \sigma_i^2] = \prod_{i=1}^n\bigg( (2\pi\sigma_i^2)^{-\frac{1}{2}}e^{-\frac{1}{2} \bigg( \frac{y_i-\hat\beta_0-\hat\beta_1x_i}{\sigma_i}\bigg)^2} \bigg) \\ L[Y|\beta, \sigma_i^2] = \prod_{i=1}^n\bigg( (2\pi\sigma_i^2)^{-\frac{1}{2}}\bigg) e^{-\frac{1}{2} \sum_{i=1}^n \bigg( \frac{y_i-\hat\beta_0-\hat\beta_1x_i}{\sigma_i}\bigg)^2} \\ \ln(L[Y|\beta, \sigma_i^2] ) =- \frac{1}{2} \ln \bigg(\prod_{i=1}^n\bigg( (2\pi\sigma_i^2)\bigg)\bigg) -\frac{1}{2}\sum_{i=1}^n \bigg( \frac{y_i-\hat\beta_0-\hat\beta_1x_i}{\sigma_i}\bigg)^2 \\ \]
\[ \text{We can dismiss most of this function because it will disappear when we differentiate. Let us simply examine} \\ \sum_{i=1}^n \bigg( \frac{y_i-\hat\beta_0-\hat\beta_1x_i}{\sigma_i}\bigg)^2 \]
\[ \text{Compare this to the weighted least squares RSS} \\ \sum_{i=1}^nw_i(y_i-\hat\beta_0 - \hat\beta_1x_i)^2 \] \[ \text{Rearrange the modified log-likelihood and the relationship becomes apparent} \\ \sum_{i=1}^n \bigg( \frac{y_i-\hat\beta_0-\hat\beta_1x_i}{\sigma_i}\bigg)^2 =\sum_{i=1}^n \sigma^{-2}_i(y_i-\hat\beta_0 - \hat\beta_1x_i)^2 = \sum_{i=1}^nw_i(y_i-\hat\beta_0 - \hat\beta_1x_i)^2 \implies w_i = \sigma_i^{-2} \]
\[ \text{Let}\\ Y = \beta_0 + \beta_1X_1 + \beta_2X_2 + \beta_3X_3 + \beta_4X_4 + \beta_5X_5 +\varepsilon \\ \text{where} \\ X_1= \text{GPA}\\ X_2= \text{IQ}\\ X_3= \text{Gender, 1 if female 0 if male}\\ X_4= \text{Interaction Between GPA and IQ}\\ X_5= \text{Interaction between GPA and gender}\\ \text{We use least squares to fit the model, so} \\ \hat\beta_0 = 50, \ \hat\beta_1 = 20, \ \hat\beta_2 = 0.07, \ \hat\beta_3 = 35, \ \hat\beta_4 = 0.01, \ \hat\beta_5 = -10\\ \text{This gives us} \\ \hat{Y} = \hat\beta_0 + \hat\beta_1X_1 + \hat\beta_2X_2 + \hat\beta_3X_3 + \hat\beta_4X_1X_2 + \hat\beta_5X_1X_3 \]
\[ \text{Let's express} \ \hat{Y} \ \text{as male and female functions.}\\ \hat{Y}_F = \hat\beta_0 + \hat\beta_1X_1 + \hat\beta_2X_2 + \hat\beta_3 + \hat\beta_4X_1X_2 + \hat\beta_5X_1\\ \hat{Y}_M = \hat\beta_0 + \hat\beta_1X_1 + \hat\beta_2X_2 + \hat\beta_4X_1X_2\\ \text{Holding GPA and IQ constant as the values} \ c_1 \ \text{and} \ c_2 \ \text{respectively, we have}\\ \hat{Y}_F = \hat\beta_0 + \hat\beta_1c_1 + \hat\beta_2c_2 + \hat\beta_3 + \hat\beta_4c_1c_2 + \hat\beta_5c_1\\ \hat{Y}_M = \hat\beta_0 + \hat\beta_1c_1 + \hat\beta_2c_2 + \hat\beta_4c_1c_2\\ \] \[ \text{Subtract} \ \hat{Y}_M \ \text{from} \ \hat{Y}_F \ \text{and set equal to 0} \\ \hat{Y}_F - \hat{Y}_M = \hat\beta_0 + \hat\beta_1c_1 + \hat\beta_2c_2 + \hat\beta_3 + \hat\beta_4c_1c_2 + \hat\beta_5c_1 - (\hat\beta_0 + \hat\beta_1c_1 + \hat\beta_2c_2 + \hat\beta_4c_1c_2) = 0 \\ \hat{Y}_F - \hat{Y}_M =\hat\beta_3 + \hat\beta_5c_1 = 0\implies c_1 = -\frac{\hat\beta_3}{\hat\beta_5} = -\frac{35}{-10} = 3.5 \\ \text{Since} \ \hat{Y}_F - \hat{Y}_M = 35 - 10c_1 \ \text{is negative when} \ c_1 > 3.5 \text{, for a fixed value of IQ and GPA,} \\ \text{males earn more on average than females provided that the GPA is high enough (above 3.5).} \]
\[ \hat{Y}_F = 50+ (20)(4.0) + (0.07)(110) + 35 + (0.01)(4.0)(110) + (-10)(4.0)\\ \hat{Y}_F = 137.1 \]
\[ \text{We expect that this female will earn \$137,100} \]
\[ \text{False, we must perform a significance test.} \]
# Simulate 100 observations from the model Y = -1 + 0.5 X + eps.
# Fix the RNG seed so the draws below are reproducible.
set.seed(1)
# Predictor: 100 draws from a normal with mean 0 and standard deviation 1.
x <- rnorm(n = 100, mean = 0, sd = 1)
# Noise: 100 draws from a normal with mean 0 and standard deviation 0.5.
e <- rnorm(n = 100, mean = 0, sd = 0.5)
# Response: intercept -1, slope 0.5, plus the noise term.
y <- (0.5 * x - 1) + e
\[ Y = -1 + 0.5 X + \varepsilon \\ Y = [y_1, ..., y_{100}] \ \text{is length 100} \\ \beta_0 = -1 \\ \beta_1 = 0.5 \]
\[ \text{Scatterplot indicates high linear correlation between X and Y} \]
\[ \hat\beta_0 \approx -1.0188\\ \hat\beta_1 \approx 0.4995\\ \text{The model does a good job predicting y given the true values} \ \beta_0 \ \text{and} \ \beta_1 \]
# Fit a simple linear regression of y on x by least squares.
linearMod <- lm(y ~ x)
# Scatterplot of y against x.
plot(y~x)
# Overlay the fitted regression line in blue.
abline(linearMod, col="blue")
# Add a legend labelling the blue line; the leading 0 is the legend's position argument.
legend(0, legend="ls line", fill="blue")
\[ \hat\beta_0 \approx -0.97164\\ \hat\beta_1 \approx 0.50858\\ \hat\beta_2 \approx -0.05946\\ \]
\[ \text{There is little evidence that the quadratic term improves the model's fit.} \\ \text{The p-value 0.164 for the quadratic term indicates that there is not enough evidence to warrant a quadratic model.} \\ \text{Furthermore, the minimal gain in} \ r^2 \ \text{(0.4779 instead of 0.4674), and the nearly 0 value of} \ \hat\beta_2 \ \text{strengthen the case against} \\ \text{using a quadratic model.} \]
\[ \text{Let} \ \varepsilon \sim N(0, \sigma^2=0.09) \\ \text{repeating steps a) - f) gives us} \\ \hat\beta_0 \approx -0.9855 \\ \hat\beta_1 \approx 0.5319 \\ \text{Interestingly, this model does a poorer job of predicting y than the normal variance model.} \\ \text{However, the} \ r^2 \ \text{value is much higher: 0.7759 against 0.4674 of the previous model with higher variance.} \]
\[ \text{Let} \ \varepsilon \sim N(0, \sigma^2=1) \\ \text{repeating steps a) - f) gives us} \\ \hat\beta_0 \approx -1.04755 \\ \hat\beta_1 \approx 0.4251\\ \text{This model does the worst job of predicting y. The} \ r^2 \ \text{value is 0.2107.} \]
\[ \text{95\% CI can be calculated by adding and subtracting 2 standard errors from the regression coefficients.}\\ \sigma^2 = 0.25 \implies \beta_0 \in (-1.1158, -0.9218), \ \beta_1 \in (0.3917, 0.6073); \ 1- \alpha =.95\\ \sigma^2 = 0.09 \implies \beta_0 \in (-1.0449, -0.9261), \ \beta_1 \in (0.4741, 0.5897); \ 1- \alpha =.95\\ \sigma^2 = 1 \implies \beta_0 \in (-1.2411, -0.8539), \ \beta_1 \in (0.2589, 0.5913); \ 1- \alpha =.95\\ \]
\[ \text{Proxy variables in the following equation are binary categorical variables.}\\ \text{Only one of the proxy variables can be set to 1, and all the others are 0.}\\ \]
\[ \text{deltaT} = \hat\beta_0 \text{Proxy1} +\hat\beta_1 \text{Proxy2} +\hat\beta_2 \text{Proxy3} +\hat\beta_3 \text{Proxy4} +\hat\beta_4 \text{Proxy5} +\hat\beta_5 \text{Proxy6} +\hat\beta_6 \text{Proxy7} + \hat\beta_{7} \text{(Proxy1)(latitude)} + \\ \hat\beta_8 \text{(Proxy2)(latitude)} +\hat\beta_9 \text{(Proxy3)(latitude)} +\hat\beta_{10} \text{(Proxy4)(latitude)} +\hat\beta_{11} \text{(Proxy5)(latitude)} + \hat\beta_{12} \text{(Proxy6)(latitude)} + \\ \hat\beta_{13} \text{(Proxy7)(latitude)} \]
\[ \text{The R code creates a matrix that allows us to use the proxy variable categorically.}\\ \text{If the proxy is set to 2, then the factor(proxy)2 column is nonzero and the factor(proxy)2:latitude column contains data.} \]
\[ L[\hat\theta] = E[(X-\hat{\theta})^2] \\ = E[X^2-2\hat{\theta}X+\hat{\theta}^2] \\ =E[X^2]-E[2\hat{\theta}X]+E[\hat{\theta}^2]\\ \frac{\partial L[\hat{\theta}]}{\partial \hat{\theta}} = -2E[X]+2\hat{\theta}\\ 0 = -2E[X]+2{\hat{\theta}} \\ E[X]=\hat\theta\\ L[\hat\theta]=E[X^2]-E[2E[X]X]+E[E[X]^2]\\ L[\hat\theta]=E[X^2]-2E[X]^2+E[X]^2\\ L[\hat\theta]=E[X^2]-E[X]^2=Var[X] \]
\[ \text{recall }y_0=f(x_0) +\varepsilon \text{ and }E[\varepsilon] = 0\\ \operatorname {Var} [X]=\operatorname {E} [X^{2}]-\operatorname {E} [X]^{2} \\ \operatorname {E} [X^{2}]=\operatorname {Var} [X]+\operatorname {E} [X]^{2} \\ \text{since } f \text{ is deterministic, then }\\ E[y] = E[f+\varepsilon] = E[f] = f \\ \operatorname {Var} [y] = \operatorname {E} [(y-\operatorname {E} [y])^{2}] \\ =\operatorname {E} [(y-f)^{2}]\\ =\operatorname {E} [(f+\varepsilon -f)^{2}]\\ =\operatorname {E} [\varepsilon ^{2}]\\ \text{rearranging the first equation gives us: }\\ E[\varepsilon^2]=\operatorname {Var} [\varepsilon ]+\operatorname {E} [\varepsilon ]^{2}=\sigma ^{2} \]
\[ {\displaystyle {\begin{aligned}\operatorname {E} {\big [}(y-{\hat {f}})^{2}{\big ]}& =\operatorname {E} {\big [}(f+\varepsilon -{\hat {f}})^{2}{\big ]}\\[5pt]& =\operatorname {E} {\big [}(f+\varepsilon -{\hat {f}}+\operatorname {E} [{\hat {f}}]-\operatorname {E} [{\hat {f}}])^{2}{\big ]}\\[5pt]& =\operatorname {E} {\big [}(f-\operatorname {E} [{\hat {f}}])^{2}{\big ]}+\operatorname {E} [\varepsilon ^{2}]+\operatorname {E} {\big [}(\operatorname {E} [{\hat {f}}]-{\hat {f}})^{2}{\big ]}+2\operatorname {E} {\big [}(f-\operatorname {E} [{\hat {f}}])\varepsilon {\big ]}+2\operatorname {E} {\big [}\varepsilon (\operatorname {E} [{\hat {f}}]-{\hat {f}}){\big ]}+2\operatorname {E} {\big [}(\operatorname {E} [{\hat {f}}]-{\hat {f}})(f-\operatorname {E} [{\hat {f}}]){\big ]}\\[5pt]& =(f-\operatorname {E} [{\hat {f}}])^{2}+\operatorname {E} [\varepsilon ^{2}]+\operatorname {E} {\big [}(\operatorname {E} [{\hat {f}}]-{\hat {f}})^{2}{\big ]}+2(f-\operatorname {E} [{\hat {f}}])\operatorname {E} [\varepsilon ]+2\operatorname {E} [\varepsilon ]\operatorname {E} {\big [}\operatorname {E} [{\hat {f}}]-{\hat {f}}{\big ]}+2\operatorname {E} {\big [}\operatorname {E} [{\hat {f}}]-{\hat {f}}{\big ]}(f-\operatorname {E} [{\hat {f}}])\\[5pt]& =(f-\operatorname {E} [{\hat {f}}])^{2}+\operatorname {E} [\varepsilon ^{2}]+\operatorname {E} {\big [}(\operatorname {E} [{\hat {f}}]-{\hat {f}})^{2}{\big ]}\\[5pt]& =(f-\operatorname {E} [{\hat {f}}])^{2}+\operatorname {Var} [y]+\operatorname {Var} {\big [}{\hat {f}}{\big ]}\\[5pt]& =\operatorname {Bias} [{\hat {f}}]^{2}+\operatorname {Var} [y]+\operatorname {Var} {\big [}{\hat {f}}{\big ]}\\[5pt]& =\operatorname {Bias} [{\hat {f}}]^{2}+\sigma ^{2}+\operatorname {Var} {\big [}{\hat {f}}{\big ]}\end{aligned}}} \]
\[ \text{Interpretation: MSE can be disaggregated into irreducible variance of the noise} \ \sigma^2 \text{,} \\ \text{and two reducible errors} \ \operatorname {Bias} [{\hat {f}}]^{2} \ \text{and} \ \operatorname {Var} {\big [}{\hat {f}}{\big ]} \ \text{where}\\ \operatorname {Bias} [{\hat {f}}] = f-\operatorname {E} [{\hat {f}}] \ \text{and} \ \operatorname {Var} {\big [}{\hat {f}}{\big ]} = \operatorname{ E}{\big [}(\operatorname {E} [{\hat {f}}]-\hat{f})^2{\big ]} \]