● Instructions: Submit the R Notebook with your name as the author. Refer to Sample Notebook.
Part I - Statistical Estimation
Q1: Concepts Definitions (20 points)
Answer:
Answer:
Answer:
Answer:
Q2: Maximum Likelihood Estimation (30 points)
For this question, you will use the stroke data from the ISwR package. Consider that hospital ABC provided this report, which contains a sample of the patients who suffered a stroke. The hospital wants to estimate the true mean and standard deviation of the age of the whole population of patients who had a stroke at that hospital. For this question, assume that the variable “age” follows a normal distribution.
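Before fitting anything, it is worth a quick look at whether the normality assumption for age is plausible. A minimal sketch (assuming stroke$age has no missing values) using a histogram and a normal Q-Q plot:
library(ISwR)
# visual check of the normality assumption for age
summary(stroke$age)
hist(stroke$age, breaks = 20, xlab = "Age", main = "Distribution of age at stroke")
qqnorm(stroke$age)
qqline(stroke$age, col = "red")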
library(ISwR)
#View(stroke)
# log-likelihood of the age data as a function of the mean M,
# with the standard deviation held fixed at 13
LLmean2a = function(M)
{
  LLsum2a = sum(dnorm(stroke$age, mean = M, sd = 13, log = TRUE))
  return(LLsum2a)
}
# grid of candidate values for M
Mvalues = seq(0, 90, by = 0.1)
# sapply evaluates the log-likelihood at each candidate value of M
LLres2a = sapply(Mvalues, LLmean2a)
plot(Mvalues,
     LLres2a,
     type = "l",
     col = "blue",
     xlab = "M Values",
     ylab = "Log Likelihood",
     main = "Log-likelihood of M")
# index of the maximum log-likelihood
y <- which.max(LLres2a)
# the corresponding M value (the grid-search estimate of the mean)
theMax <- Mvalues[y]
# mark the maximum on the plot
abline(v = theMax, col = "red", lty = 2)
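As a sanity check (not required by the assignment): with the standard deviation held fixed, the M that maximizes the normal log-likelihood is the sample mean, so the grid maximum should agree with mean(stroke$age) up to the 0.1 grid spacing.
# grid-search estimate vs. the sample mean
theMax
mean(stroke$age)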
# load stats4 and bbmle for maximum likelihood estimation
library(stats4)
library(bbmle)
# negative log-likelihood as a function of the mean M and standard deviation sigma
# (mle2 expects a function to minimize, so return -log L)
LLmeansd2b = function(M, sigma)
{
  LLsum2b = sum(dnorm(stroke$age, mean = M, sd = sigma, log = TRUE))
  return(-1 * LLsum2b)
}
# using the function LLmeansd2b defined in the previous chunk
mle2c = mle2(minuslogl = LLmeansd2b,
             start = list(M = 30, sigma = 1),
             lower = c(sigma = 0),
             method = "L-BFGS-B")
summary(mle2c)
## Maximum likelihood estimation
##
## Call:
## mle2(minuslogl = LLmeansd2b, start = list(M = 30, sigma = 1),
## method = "L-BFGS-B", lower = c(sigma = 0))
##
## Coefficients:
## Estimate Std. Error z value Pr(z)
## M 69.88541 0.47948 145.752 < 2.2e-16 ***
## sigma 13.80536 0.33904 40.719 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## -2 log L: 6704.948
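For the normal model the MLEs also have closed forms: the sample mean for M, and the standard deviation computed with an n (rather than n - 1) denominator for sigma. A short check against the mle2 output (a sketch, assuming no missing values in stroke$age):
# closed-form MLEs for a normal model
mean(stroke$age)                              # should match the M estimate (about 69.89)
sqrt(mean((stroke$age - mean(stroke$age))^2)) # should match the sigma estimate (about 13.81)
# note: sd() divides by n - 1, so it is slightly larger than the MLE of sigma
sd(stroke$age)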
Part II - Linear Regression
Q3: Concepts Definitions (20 points)
Answer:
Answer:
Answer:
Answer:
Q4: Linear Regression (30 points)
For this question, you will use the mtcars dataset in R.
# linear regression for mpg~hp
fit4a <- lm(mpg~hp, data=mtcars)
summary(fit4a)
##
## Call:
## lm(formula = mpg ~ hp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7121 -2.1122 -0.8854 1.5819 8.2360
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.09886 1.63392 18.421 < 2e-16 ***
## hp -0.06823 0.01012 -6.742 1.79e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.863 on 30 degrees of freedom
## Multiple R-squared: 0.6024, Adjusted R-squared: 0.5892
## F-statistic: 45.46 on 1 and 30 DF, p-value: 1.788e-07
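Beyond the coefficient table, interval estimates for the hp effect can be reported directly; a minimal sketch using confint():
# 95% confidence intervals for the intercept and the hp slope
confint(fit4a, level = 0.95)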
# linear regression for mpg~wt
fit4b <- lm(mpg~wt, data=mtcars)
summary(fit4b)
##
## Call:
## lm(formula = mpg ~ wt, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5432 -2.3647 -0.1252 1.4096 6.8727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.2851 1.8776 19.858 < 2e-16 ***
## wt -5.3445 0.5591 -9.559 1.29e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared: 0.7528, Adjusted R-squared: 0.7446
## F-statistic: 91.38 on 1 and 30 DF, p-value: 1.294e-10
Answer:
Adjusted R-squared for model 1 (mpg ~ hp): 0.5892
Adjusted R-squared for model 2 (mpg ~ wt): 0.7446
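These values can also be pulled from the fitted objects instead of copied from the printed summaries; a minimal sketch:
# extract adjusted R-squared directly from each model summary
summary(fit4a)$adj.r.squared
summary(fit4b)$adj.r.squared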
# model comparison
# model 1
mpg_analysis4a <- cbind(mtcars$mpg, fit4a$fitted.values)
mpg_analysis4a <- as.data.frame(mpg_analysis4a)
names(mpg_analysis4a) <- c("Actual","Predicted")
# View(mpg_analysis4a)
# look at error metrics
# create an error column in an intuitive direction (predicted minus actual)
mpg_analysis4a$Error <- mpg_analysis4a$Predicted - mpg_analysis4a$Actual
# squared error
mpg_analysis4a$SqError <- mpg_analysis4a$Error^2
# absolute error
mpg_analysis4a$AbsError <- abs(mpg_analysis4a$Error)
# Mean absolute error
mae4a <- mean(mpg_analysis4a$AbsError)
mae_sd4a <- sd(mpg_analysis4a$AbsError)
# Mean Squared Error
mse4a <- mean(mpg_analysis4a$SqError)
mse_sd4a <- sd(mpg_analysis4a$SqError)
# if there are no zeros in the target variable, percentage errors are useful
mpg_analysis4a$AbsPercError <- mpg_analysis4a$AbsError/mpg_analysis4a$Actual
# Mean Absolute Percentage Error
mape4a <- mean(mpg_analysis4a$AbsPercError)
mape_sd4a <- sd(mpg_analysis4a$AbsPercError)
# model 2
mpg_analysis4b <- cbind(mtcars$mpg, fit4b$fitted.values)
mpg_analysis4b <- as.data.frame(mpg_analysis4b)
names(mpg_analysis4b) <- c("Actual","Predicted")
# View(mpg_analysis4b)
# look at error metrics
# create an error column in an intuitive direction (predicted minus actual)
mpg_analysis4b$Error <- mpg_analysis4b$Predicted - mpg_analysis4b$Actual
# squared error
mpg_analysis4b$SqError <- mpg_analysis4b$Error^2
# absolute error
mpg_analysis4b$AbsError <- abs(mpg_analysis4b$Error)
# Mean absolute error
mae4b <- mean(mpg_analysis4b$AbsError)
mae_sd4b <- sd(mpg_analysis4b$AbsError)
# Mean Squared Error
mse4b <- mean(mpg_analysis4b$SqError)
mse_sd4b <- sd(mpg_analysis4b$SqError)
# if there are no zeros in the target variable, percentage errors are useful
mpg_analysis4b$AbsPercError <- mpg_analysis4b$AbsError/mpg_analysis4b$Actual
# Mean Absolute Percentage Error
mape4b <- mean(mpg_analysis4b$AbsPercError)
mape_sd4b <- sd(mpg_analysis4b$AbsPercError)
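The two metric blocks above repeat the same computations; as a reference, a compact helper (the name error_metrics is hypothetical) gives the same numbers for any fitted lm model:
# hypothetical helper: MAE, MSE and MAPE for a fitted lm model in one call
error_metrics <- function(fit, actual) {
  err <- fitted(fit) - actual
  c(MAE  = mean(abs(err)),
    MSE  = mean(err^2),
    MAPE = mean(abs(err) / actual))
}
error_metrics(fit4a, mtcars$mpg)
error_metrics(fit4b, mtcars$mpg)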
# AIC = Akaike Information Criterion
aic4a <- AIC(fit4a)
aic4b <- AIC(fit4b)
cat("R-squared for model 1 is 0.6024",
"\nR-squared for model 2 is 0.7528",
"\nMean Absolute Error for model 1 = ",mae4a,
"\nMean Absolute Error for model 2 = ",mae4b,
"\nMean Squared Error for model 1 = ",mse4a,
"\nMean Squared Error for model 2 = ",mse4b,
"\nMean Absolute Percentage Error for model 1 = ",mape4a,
"\nMean Absolute Percentage Error for model 2 = ",mape4b,
"\nAIC for model 1 = ",aic4a,
"\nAIC for model 1 = = ",aic4b)
## R-squared for model 1 is 0.6024
## R-squared for model 2 is 0.7528
## Mean Absolute Error for model 1 = 2.907452
## Mean Absolute Error for model 2 = 2.340642
## Mean Squared Error for model 1 = 13.98982
## Mean Squared Error for model 2 = 8.697561
## Mean Absolute Percentage Error for model 1 = 0.1566944
## Mean Absolute Percentage Error for model 2 = 0.1260733
## AIC for model 1 = 181.2386
## AIC for model 2 =  166.0294
cat("After comparing the R-squared and other error metrics for model comparison, it can be concluded that:\n1)Model 2 has a bigger R-squared value than model 1\n2)Model 2 has lesser mean absolute error, mean squared error and mean absolute percentage error\n3)Model 2 has a lesser AIC than model 1\n\nAs higher the R-squared better the model and lower the AIC, better the model, model 2 is a better model than model 1.")
## After comparing the R-squared and other error metrics for model comparison, it can be concluded that:
## 1)Model 2 has a bigger R-squared value than model 1
## 2)Model 2 has lesser mean absolute error, mean squared error and mean absolute percentage error
## 3)Model 2 has a lesser AIC than model 1
##
## As higher the R-squared better the model and lower the AIC, better the model, model 2 is a better model than model 1.
# plot the variable spread for model 1
plot(y=mtcars$mpg,
x=mtcars$hp,
xlab="hp",
ylab="mpg",
main="hp vs. mpg")
abline(fit4a,
col="red")
# plot the variable spread for model 2
plot(y=mtcars$mpg,
x=mtcars$wt,
xlab="wt",
ylab="mpg",
main="wt vs. mpg")
abline(fit4b,
col="red")