Part 1 A substance used in biological and medical research is shipped by airfreight to users in cartons of 1,000 ampules. The data below, involving 10 shipments, were collected on the number of times the carton was transferred from one aircraft to another over the shipment route (X) and the number of ampules found to be broken upon arrival (Y). Assume that first-order regression model (1.1) is appropriate.
transfers <- c(1,0,2,0,3,1,0,1,2,0)
breakages <- c(16,9,17,12,22,13,8,15,19,11)
airfreight.model <- lm(breakages~transfers)
summary(airfreight.model)
##
## Call:
## lm(formula = breakages ~ transfers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2 -1.2 0.3 0.8 1.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.2000 0.6633 15.377 3.18e-07 ***
## transfers 4.0000 0.4690 8.528 2.75e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.483 on 8 degrees of freedom
## Multiple R-squared: 0.9009, Adjusted R-squared: 0.8885
## F-statistic: 72.73 on 1 and 8 DF, p-value: 2.749e-05
# Estimated regression function is (Breakages) = 4*(Transfers)+10.2
plot(breakages~transfers)
abline(airfreight.model)
# Adjusted R-squared is 0.8885, therefore, this linear regression appears to be a good fit for the data.
airfreight.model$coefficients
## (Intercept) transfers
## 10.2 4.0
point1<-4*1+10.2
point1
## [1] 14.2
point2<-4*2+10.2
point2 - point1
## [1] 4
plot(breakages~transfers)
points(mean(transfers),mean(breakages), pch = 19, col = "blue")
abline(airfreight.model)
Part 2 The director of admission of a small college selected 120 students at random from the new freshman class in a study to determine whether a student’s grade point average (GPA) at the end of the freshman year can be predicted from the ACT test score (X).
GPA.data <- read.delim("HW1_exercise2.txt", header = FALSE, sep = " ")
GPA <- GPA.data[,2]
ACT <- GPA.data[,6]
X.ACT <- 0
GPA.model <- lm(GPA~ACT)
summary(GPA.model)
##
## Call:
## lm(formula = GPA ~ ACT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.74004 -0.33827 0.04062 0.44064 1.22737
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.11405 0.32089 6.588 1.3e-09 ***
## ACT 0.03883 0.01277 3.040 0.00292 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6231 on 118 degrees of freedom
## Multiple R-squared: 0.07262, Adjusted R-squared: 0.06476
## F-statistic: 9.24 on 1 and 118 DF, p-value: 0.002917
Y.GPA <- 0.03883*(X.ACT) + 2.11405
plot(GPA~ACT)
abline(GPA.model)
R-squared below 0.1, therefore, bad fit.
X.ACT <- 30
Y.GPA <- 0.03883*(X.ACT) + 2.11405
Y.GPA
## [1] 3.27895
Difference between ACT score of 31 and 30
((0.03883*31)+2.11405) - ((0.03883*30)+2.11405)
## [1] 0.03883
The difference is the slope, 0.03883 GPA per ACT
Resid1 <- resid(GPA.model)
sum(Resid1)
## [1] -2.942091e-15
Yes, they sum to zero
sigma2 <- sum(Resid1^2)/(length(ACT)-2)
sigma <- sqrt(sigma2)
sigma2
## [1] 0.3882848
sigma
## [1] 0.623125
Sigma is measured in GPA units for this problem.
Part 3 A person’s muscle mass is expected to decrease with age. To explore this relationship in women, a nutritionist randomly selected 15 women from 10-year age group, beginning with age 40 and ending with age 79. The results follow; Xis age, and Y is a measure of muscle mass. Assume that first-order regressino model is appropriate.
MuscMass.dat <- read.table("HW1_exercise3.txt", header = FALSE, sep = " ")
MuscMass.model <- lm(MuscMass.dat[,1]~MuscMass.dat[,2])
plot(MuscMass.dat[,2],MuscMass.dat[,1])
abline(MuscMass.model)
summary(MuscMass.model)
##
## Call:
## lm(formula = MuscMass.dat[, 1] ~ MuscMass.dat[, 2])
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.1368 -6.1968 -0.5969 6.7607 23.4731
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 156.3466 5.5123 28.36 <2e-16 ***
## MuscMass.dat[, 2] -1.1900 0.0902 -13.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.173 on 58 degrees of freedom
## Multiple R-squared: 0.7501, Adjusted R-squared: 0.7458
## F-statistic: 174.1 on 1 and 58 DF, p-value: < 2.2e-16
Y = -1.19*x +156.3466 R-squared is about 0.75, therefore the model describes an acceptable amount of the variance. Therefore, the fit is okay.
age.50 <- -1.19*(50)+156.3466
age.51 <- -1.19*(51)+156.3466
age.51-age.50
## [1] -1.19
change of muscle mass = -1.19
-1.19*(60)+156.3466
## [1] 84.9466
The muscle mass of a 60 year old woman is 84.9466 according to the linear model.
MuscMass.model$residuals[8]
## 8
## 4.443252
Resids<-resid(MuscMass.model)
sum(Resids^2)/(length(MuscMass.dat[,1])-2)
## [1] 66.80082
sigma2 = 66.80082
Part 4 Shown below are the number of galleys for a manuscript (X) and the dollar cost of correcting typographical errors(Y) in a random sample of recent orders handled by a firm specializing in technical manuscripts. Assume that the regression model \({Y_i = B_1(X_i) +E_i}\) is appropriate, with normally distributed independent error terms whose variance is sigma squared = 16.
galleys <- c(7,12,4,14,25,30)
cost <- c(128,213,75,250,446,540)
sigma.sq <- 16
samp.size <- 6
b0 <- 0
Likelihood Function = \({(1/((2*pi*16)^(6/2))) x exp(-1/(2*16)*sum((Y-bo-(b1)*X)^2))}\)
Likelihood function (using b1 = 17)
b1 <- 17
like.func.17 <- 1/((2*pi*16)^3)*exp(sum(((cost[1]-b0-(b1*galleys[1]))^2)+(cost[2]-b0-(b1*galleys[2]))^2+(cost[3]-b0-(b1*galleys[3]))^2+(cost[4]-b0-(b1*galleys[4]))^2+(cost[5]-b0-(b1*galleys[5]))^2+(cost[6]-b0-(b1*galleys[6]))^2)/32)
b1 is 18
b1=18
like.func.18 <- 1/((2*pi*16)^3)*exp(sum(((cost[1]-b0-(b1*galleys[1]))^2)+(cost[2]-b0-(b1*galleys[2]))^2+(cost[3]-b0-(b1*galleys[3]))^2+(cost[4]-b0-(b1*galleys[4]))^2+(cost[5]-b0-(b1*galleys[5]))^2+(cost[6]-b0-(b1*galleys[6]))^2)/32)
b1 is 19
b1=19
like.func.19 <- 1/((2*pi*16)^3)*exp(sum(((cost[1]-b0-(b1*galleys[1]))^2)+(cost[2]-b0-(b1*galleys[2]))^2+(cost[3]-b0-(b1*galleys[3]))^2+(cost[4]-b0-(b1*galleys[4]))^2+(cost[5]-b0-(b1*galleys[5]))^2+(cost[6]-b0-(b1*galleys[6]))^2)/32)
likelihood function for all three B1s
like.func.17
## [1] 1.024963e+17
like.func.18
## [1] 3.65689e-06
like.func.19
## [1] 3.17898e+24
The likelihood function is largest with \({B_1=18}\).
c – The max likelihood estimator is \({Sum(X_iY_i)/Sum(X_i^2)}\). Find the maximum likelihood estimate. Are your results in part (b) consistent with this estimate?
max.like.est<- sum(cost*galleys)/sum((galleys)^2)
max.like.est
## [1] 17.9285
The maximum likelihood estimate is 17.92. This matches the estimate in part (b) closely.
B1 <- seq(16,20, 0.001)
B0 = 0
i=0
part4d_like.func <- as.data.frame(B1)
colnames(part4d_like.func) <- c("B1")
for(b1 in part4d_like.func$B1){
i=i+1
part4d_like.func$L[i] <- (1/((2*pi*sigma.sq)^(samp.size/2)))*exp(-1/(2*sigma.sq)*sum((cost-B0-(b1)*galleys)^2))
}
plot(part4d_like.func$B1, part4d_like.func$L, type = "l")
part4d_like.func$B1[which(part4d_like.func$L == max(part4d_like.func$L))]
## [1] 17.928
The maximum likelihood estimator of this is 17.928. This closely matches what is found in part (c)
Part 5 Assume that the first-order regression model is appropriate, with normally distributed independent error terms whose variance sigma squared = 16.
Likelihood Function = \({(1/((2*pi*16)^(6/2))) x exp(-1/(2*16)*sum((Y-bo-(b1)*X)^2))}\)
typo.error.lm <- lm(cost~galleys)
typo.error.lm[1]
## $coefficients
## (Intercept) galleys
## 1.596919 17.852375
\({B_0 = 1.596919}\) \({B_1 = 17.852375}\)
B0 <- seq(-10,10,.1)
B1 <-seq(16,20,0.01)
threedim.like.func.mat <- as.data.frame(matrix(NA,length(B1),length(B0)))
colnames(threedim.like.func.mat) <- B0
rownames(threedim.like.func.mat) <- B1
i <- 0
for (b1 in B1){
i <- i+1
j <- 0
for(b0 in B0){
j <- j+1
threedim.like.func.mat[i,j] <- (1/((2*pi*sigma.sq)^(samp.size/2)))*exp(-1/(2*sigma.sq)*sum((cost-b0-(b1)*galleys)^2))
}
}
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
threedim.like.mat <- as.matrix(threedim.like.func.mat)
plot_ly(x = B0, y = B1, z=threedim.like.mat)%>% add_surface()
The likelihood function appears to be maximized at about a B0 of 1.5 and a B1 of 17.86, this matches the estimate gained through the linear model.