#Problem 1.13
Data <- "Place Minority Crime Poverty Language school Housing City Conventional Undercount
Alabama 26.1 49 18.9 0.2 43.5 7.6 state 0 -0.04
Alaska 5.7 62 10.7 1.7 17.5 23.6 state 100 3.35
Arizona 18.9 81 13.2 3.2 27.6 8.1 state 18 2.48
Arkansas 16.9 38 19.0 0.2 44.5 7.0 state 0 -0.74
California 24.3 73 10.4 5.0 26.0 11.8 state 4 3.60
Colorado 15.2 73 10.1 1.2 21.4 9.2 state 19 1.34
Connecticut 10.8 58 8.0 2.4 29.7 21.0 state 0 -0.26
Delaware 17.5 68 11.8 0.7 31.4 8.9 state 0 -0.16
Florida 22.3 81 13.4 3.6 33.3 10.1 state 0 2.20
Georgia 27.6 55 16.6 0.3 43.6 10.2 state 0 0.37
Hawaii 9.1 75 9.9 5.7 26.2 17.0 state 29 1.46
Idaho 4.2 48 12.6 1.0 26.3 9.1 state 56 1.53
Illinois 8.1 48 7.7 1.0 29.8 13.5 state 0 1.69
Indiana 7.1 48 9.4 0.5 33.6 9.9 state 0 -0.68
Iowa 2.3 47 10.1 0.3 28.5 10.4 state 0 -0.59
Kansas 7.9 54 10.1 0.5 26.7 8.5 state 14 0.94
Kentucky 7.7 34 17.6 0.2 46.9 10.6 state 0 -1.41
Louisiana 31.4 54 18.6 1.1 42.3 9.7 state 0 2.46
Maine 0.7 44 13.0 1.0 31.3 19.5 state 40 2.06
Maryland 16.7 58 6.8 0.8 28.2 10.5 state 0 2.03
Massachusetts 3.8 53 8.5 2.1 27.4 26.9 state 4 -0.57
Michigan 7.0 61 8.7 0.7 29.9 9.4 state 8 0.89
Minnesota 2.1 48 9.5 0.5 26.9 10.7 state 11 1.57
Mississippi 35.8 34 23.9 0.2 45.2 7.2 state 0 1.52
Missouri 7.8 45 11.2 0.3 34.9 9.1 state 0 0.81
Montana 1.5 50 12.3 0.4 25.6 12.8 state 75 1.81
Nebraska 4.8 43 10.7 0.5 26.6 9.7 state 33 0.36
Nevada 13.0 88 8.7 1.6 24.5 11.7 state 10 5.08
New Hampshire 1.0 47 8.5 0.8 27.7 20.3 state 0 -1.49
New Jersey 19.0 64 9.5 3.6 32.6 23.7 state 0 1.44
New Mexico 38.4 59 17.6 4.6 31.1 10.7 state 58 2.69
New York 8.0 48 8.9 1.3 29.3 21.6 state 0 -1.48
North Carolina 23.1 46 14.8 0.2 45.2 8.2 state 0 1.36
North Dakota 1.0 30 12.6 0.5 33.6 15.1 state 70 0.35
Ohio 8.9 52 9.6 0.5 32.1 11.3 state 0 0.97
Oklahoma 8.6 50 13.4 0.5 34.0 8.0 state 0 -0.12
Oregon 3.9 60 10.7 0.8 24.4 7.9 state 13 0.93
Pennsylvania 4.8 33 8.8 0.6 33.6 13.3 state 0 -0.78
Rhode Island 4.9 59 10.3 3.2 38.9 29.6 state 0 0.74
South Carolina 31.0 53 16.6 0.2 46.3 7.9 state 0 6.19
South Dakota 0.9 32 16.9 0.4 32.1 12.0 state 84 0.42
Tennessee 16.4 44 16.4 0.2 43.8 9.4 state 0 -2.31
Texas 30.6 55 15.0 4.7 38.7 7.7 state 1 0.27
Utah 4.7 58 10.3 0.9 20.0 11.3 state 14 1.14
Vermont 0.9 50 12.1 0.5 29.0 20.8 state 0 -1.12
Virginia 20.0 46 11.8 0.5 37.6 10.3 state 0 1.11
Washington 5.4 69 9.8 1.0 22.4 9.4 state 4 1.48
West Virginia 3.9 25 15.0 0.2 44.0 9.0 state 0 -0.69
Wisconsin 1.7 45 7.9 0.4 29.5 12.8 state 9 1.45
Wyoming 5.9 49 7.9 0.7 22.1 13.2 state 100 4.01
Baltimore 55.5 100 22.9 0.7 51.6 23.3 city 0 6.15
Boston 28.4 135 20.2 4.4 31.6 52.1 city 0 2.27
Chicago 53.7 66 20.3 6.7 43.8 51.4 city 0 5.42
Cleveland 46.7 101 22.1 1.6 49.1 36.4 city 0 5.01
Dallas 41.6 118 14.2 3.1 31.5 12.9 city 0 8.18
Detroit 65.4 106 21.9 1.6 45.8 18.6 city 0 4.33
Houston 45.1 80 12.7 5.1 31.6 8.9 city 0 5.79
Indianapolis 22.5 53 11.5 0.3 33.3 13.6 city 0 0.31
Los Angeles 44.4 100 16.4 12.7 31.4 15.0 city 0 7.52
Milwaukee 27.2 65 13.8 1.6 46.4 27.2 city 0 3.17
New York City 44.0 101 20.0 8.9 39.8 32.2 city 0 7.39
Philadelphia 41.3 60 20.6 2.2 45.7 21.7 city 0 6.41
Saint Louis 46.7 143 21.8 0.5 51.8 40.9 city 0 3.60
San Diego 23.6 81 12.4 4.2 21.1 11.2 city 0 0.47
San Francisco 24.8 107 13.7 9.2 26.0 20.3 city 0 5.18
Washington DC 72.6 102 18.6 1.1 32.9 21.0 city 0 5.93"
#cleaning
lines <- strsplit(Data, "\n")[[1]]
header <- strsplit(lines[1], "\\s+")[[1]]
rows <- lapply(lines[-1], function(x) {
parts <- strsplit(x, "\\s+")[[1]]
place <- paste(parts[1:(length(parts) - 9)], collapse = " ")
rest <- parts[(length(parts) - 8):length(parts)]
c(place, rest)
})
df <- as.data.frame(do.call(rbind, rows), stringsAsFactors = FALSE)
names(df) <- header
num_vars <- c("Minority", "Crime", "Poverty", "Language",
"school", "Housing", "Conventional", "Undercount")
df[num_vars] <- lapply(df[num_vars], as.numeric)
df$City <- factor(df$City)
# checking
str(df)
## 'data.frame': 66 obs. of 10 variables:
## $ Place : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ Minority : num 26.1 5.7 18.9 16.9 24.3 15.2 10.8 17.5 22.3 27.6 ...
## $ Crime : num 49 62 81 38 73 73 58 68 81 55 ...
## $ Poverty : num 18.9 10.7 13.2 19 10.4 10.1 8 11.8 13.4 16.6 ...
## $ Language : num 0.2 1.7 3.2 0.2 5 1.2 2.4 0.7 3.6 0.3 ...
## $ school : num 43.5 17.5 27.6 44.5 26 21.4 29.7 31.4 33.3 43.6 ...
## $ Housing : num 7.6 23.6 8.1 7 11.8 9.2 21 8.9 10.1 10.2 ...
## $ City : Factor w/ 2 levels "city","state": 2 2 2 2 2 2 2 2 2 2 ...
## $ Conventional: num 0 100 18 0 4 19 0 0 0 0 ...
## $ Undercount : num -0.04 3.35 2.48 -0.74 3.6 1.34 -0.26 -0.16 2.2 0.37 ...
nrow(df)
## [1] 66
head(df)
## Place Minority Crime Poverty Language school Housing City Conventional
## 1 Alabama 26.1 49 18.9 0.2 43.5 7.6 state 0
## 2 Alaska 5.7 62 10.7 1.7 17.5 23.6 state 100
## 3 Arizona 18.9 81 13.2 3.2 27.6 8.1 state 18
## 4 Arkansas 16.9 38 19.0 0.2 44.5 7.0 state 0
## 5 California 24.3 73 10.4 5.0 26.0 11.8 state 4
## 6 Colorado 15.2 73 10.1 1.2 21.4 9.2 state 19
## Undercount
## 1 -0.04
## 2 3.35
## 3 2.48
## 4 -0.74
## 5 3.60
## 6 1.34
##(a)
#The regression model
fit1 <- lm(Undercount ~ Minority + Crime + Poverty + Language +
school + Housing + Conventional, data = df)
summary(fit1)
##
## Call:
## lm(formula = Undercount ~ Minority + Crime + Poverty + Language +
## school + Housing + Conventional, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8742 -0.8182 0.0064 0.6476 3.9922
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.217678 1.364658 -1.625 0.109568
## Minority 0.093366 0.020970 4.452 3.92e-05 ***
## Crime 0.034687 0.012775 2.715 0.008712 **
## Poverty -0.173520 0.085775 -2.023 0.047696 *
## Language 0.230780 0.092615 2.492 0.015589 *
## school 0.058204 0.045213 1.287 0.203100
## Housing -0.022518 0.023454 -0.960 0.340995
## Conventional 0.036120 0.009335 3.869 0.000279 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.442 on 58 degrees of freedom
## Multiple R-squared: 0.6961, Adjusted R-squared: 0.6594
## F-statistic: 18.98 on 7 and 58 DF, p-value: 6.62e-13
df$fitted <- fitted(fit1)
df$residuals <- resid(fit1)
# residuals for the first few observations
residuals_df <- data.frame(
Place = df$Place,
Undercount = df$Undercount,
Fitted = round(df$fitted,3),
Residual = round(df$residuals,3)
)
head(residuals_df,10)
## Place Undercount Fitted Residual
## 1 Alabama -0.04 1.046 -1.086
## 2 Alaska 3.35 3.100 0.250
## 3 Arizona 2.48 2.879 -0.399
## 4 Arkansas -0.74 -0.140 -0.600
## 5 California 3.60 3.325 0.275
## 6 Colorado 1.34 1.983 -0.643
## 7 Connecticut -0.26 1.224 -1.484
## 8 Delaware -0.16 1.516 -1.676
## 9 Florida 2.20 2.890 -0.690
## 10 Georgia 0.37 1.764 -1.394
residuals_df[order(-abs(residuals_df$Residual)), ][1:10, ]
## Place Undercount Fitted Residual
## 40 South Carolina 6.19 2.198 3.992
## 62 Philadelphia 6.41 2.824 3.586
## 43 Texas 0.27 3.144 -2.874
## 42 Tennessee -2.31 0.378 -2.688
## 28 Nevada 5.08 2.432 2.648
## 55 Dallas 8.18 5.554 2.626
## 31 New Mexico 2.69 5.086 -2.396
## 64 San Diego 0.47 2.589 -2.119
## 56 Detroit 4.33 6.381 -2.051
## 63 Saint Louis 3.60 5.529 -1.929
# Ranking predictors by importance based on p-values
fit1_summary <- summary(fit1)
coef_table <- fit1_summary$coefficients
importance <- data.frame(
Predictor = rownames(coef_table)[-1],
Estimate = coef_table[-1,1],
Std_Error = coef_table[-1,2],
t_value = coef_table[-1,3],
p_value = coef_table[-1,4]
)
importance <- importance[order(importance$p_value), ]
importance
## Predictor Estimate Std_Error t_value p_value
## Minority Minority 0.09336637 0.020970245 4.4523261 3.921428e-05
## Conventional Conventional 0.03611970 0.009335328 3.8691404 2.791736e-04
## Crime Crime 0.03468717 0.012775289 2.7151766 8.711916e-03
## Language Language 0.23078011 0.092614627 2.4918322 1.558920e-02
## Poverty Poverty -0.17352000 0.085775464 -2.0229562 4.769556e-02
## school school 0.05820351 0.045213270 1.2873103 2.030996e-01
## Housing Housing -0.02251768 0.023453724 -0.9600899 3.409954e-01
# The regression model explains a substantial portion of the variation
# in census undercount. The R-squared is 0.696 and the adjusted R-squared
# is 0.659, indicating that roughly 66–70% of the variation in undercount
# is explained by the predictors included in the model.
# Based on the p-values of the coefficients:
# Minority (p < 0.001) is the most statistically significant predictor.
# Areas with a higher percentage of minority population tend to have
# higher census undercount.
# Conventional (p < 0.001) is also highly significant and positively
# associated with undercount
# Crime (p = 0.009) and Language (p = 0.016) are statistically significant
# predictors as well.
# Poverty (p = 0.048) is marginally significant.
# school (p = 0.203) and Housing (p = 0.341) are not statistically
# significant, suggesting that they do not add much explanatory power
# once the other variables are included in the model.
# Residual plot against fitted values
plot(df$fitted, df$residuals,
xlab = "Fitted values",
ylab = "Residuals",
main = "Residuals vs Fitted Values")
abline(h=0, col="red", lty=2)
# The residual plot shows residuals scattered roughly around zero without
# a clear pattern, suggesting that the linear regression model is
# reasonably appropriate.
# Q-Q plot for residual normality
qqnorm(df$residuals, main="Q-Q Plot of Residuals")
qqline(df$residuals, col="red")
# The Q-Q plot indicates that the residuals follow the reference line
# fairly closely, suggesting that the normality assumption for the
# regression errors is approximately satisfied
# (b)
# structure of City
str(df$City)
## Factor w/ 2 levels "city","state": 2 2 2 2 2 2 2 2 2 2 ...
# structure of other predictors
str(df[,c("Minority","Crime","Poverty","Language",
"school","Housing","Conventional")])
## 'data.frame': 66 obs. of 7 variables:
## $ Minority : num 26.1 5.7 18.9 16.9 24.3 15.2 10.8 17.5 22.3 27.6 ...
## $ Crime : num 49 62 81 38 73 73 58 68 81 55 ...
## $ Poverty : num 18.9 10.7 13.2 19 10.4 10.1 8 11.8 13.4 16.6 ...
## $ Language : num 0.2 1.7 3.2 0.2 5 1.2 2.4 0.7 3.6 0.3 ...
## $ school : num 43.5 17.5 27.6 44.5 26 21.4 29.7 31.4 33.3 43.6 ...
## $ Housing : num 7.6 23.6 8.1 7 11.8 9.2 21 8.9 10.1 10.2 ...
## $ Conventional: num 0 100 18 0 4 19 0 0 0 0 ...
# The variable City is categorical (a factor) with two levels:
# "city" and "state". It indicates whether the observation
# corresponds to a major city or a state remainder.
#
# In contrast, all other predictors are quantitative variables
# measuring percentages or rates. In regression analysis, City
# would enter the model as a dummy variable representing the
# difference between these two groups.
# (c) Best subset regression
library(leaps)
best <- regsubsets(Undercount ~ Minority + Crime + Poverty + Language +
school + Housing + Conventional,
data = df,
nvmax = 7)
best_sum <- summary(best)
best_sum$which
## (Intercept) Minority Crime Poverty Language school Housing Conventional
## 1 TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 2 TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
## 3 TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE
## 4 TRUE TRUE TRUE FALSE TRUE FALSE FALSE TRUE
## 5 TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE
## 6 TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## 7 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
best_sum$adjr2
## [1] 0.4855990 0.5558969 0.6200099 0.6474507 0.6593270 0.6598359 0.6593843
best_sum$cp
## [1] 34.653395 22.140936 11.167048 7.137160 6.010091 6.921773 8.000000
best_sum$bic
## [1] -36.51760 -43.06581 -50.22234 -52.05288 -51.21581 -48.23409 -45.08510
# best models according to criteria
best_adj_r2 <- which.max(best_sum$adjr2)
best_cp <- which.min(best_sum$cp)
best_bic <- which.min(best_sum$bic)
best_sum$which[best_adj_r2, ]
## (Intercept) Minority Crime Poverty Language school
## TRUE TRUE TRUE TRUE TRUE TRUE
## Housing Conventional
## FALSE TRUE
best_sum$which[best_cp, ]
## (Intercept) Minority Crime Poverty Language school
## TRUE TRUE TRUE TRUE TRUE FALSE
## Housing Conventional
## FALSE TRUE
best_sum$which[best_bic, ]
## (Intercept) Minority Crime Poverty Language school
## TRUE TRUE TRUE FALSE TRUE FALSE
## Housing Conventional
## FALSE TRUE
# Stepwise regression using AIC
library(MASS)
step_model <- stepAIC(fit1, direction="both")
## Start: AIC=55.78
## Undercount ~ Minority + Crime + Poverty + Language + school +
## Housing + Conventional
##
## Df Sum of Sq RSS AIC
## - Housing 1 1.917 122.51 54.824
## - school 1 3.446 124.04 55.643
## <none> 120.59 55.784
## - Poverty 1 8.509 129.10 58.284
## - Language 1 12.910 133.50 60.496
## - Crime 1 15.328 135.92 61.681
## - Conventional 1 31.127 151.72 68.938
## - Minority 1 41.217 161.81 73.187
##
## Step: AIC=54.82
## Undercount ~ Minority + Crime + Poverty + Language + school +
## Conventional
##
## Df Sum of Sq RSS AIC
## - school 1 2.263 124.77 54.032
## <none> 122.51 54.824
## + Housing 1 1.917 120.59 55.784
## - Poverty 1 8.325 130.84 57.163
## - Language 1 11.347 133.86 58.671
## - Crime 1 13.782 136.29 59.860
## - Conventional 1 29.360 151.87 67.003
## - Minority 1 47.669 170.18 74.515
##
## Step: AIC=54.03
## Undercount ~ Minority + Crime + Poverty + Language + Conventional
##
## Df Sum of Sq RSS AIC
## <none> 124.77 54.032
## + school 1 2.263 122.51 54.824
## - Poverty 1 6.502 131.28 55.385
## + Housing 1 0.734 124.04 55.643
## - Language 1 9.400 134.17 56.826
## - Crime 1 11.535 136.31 57.868
## - Conventional 1 29.971 154.75 66.240
## - Minority 1 51.832 176.61 74.962
summary(step_model)
##
## Call:
## lm(formula = Undercount ~ Minority + Crime + Poverty + Language +
## Conventional, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6089 -0.9797 0.0373 0.7054 4.3521
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.792689 0.860341 -0.921 0.360550
## Minority 0.100967 0.020224 4.992 5.44e-06 ***
## Crime 0.024346 0.010337 2.355 0.021804 *
## Poverty -0.110030 0.062227 -1.768 0.082111 .
## Language 0.183850 0.086475 2.126 0.037623 *
## Conventional 0.029327 0.007725 3.796 0.000345 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.442 on 60 degrees of freedom
## Multiple R-squared: 0.6855, Adjusted R-squared: 0.6593
## F-statistic: 26.16 on 5 and 60 DF, p-value: 6.449e-14
#Model comparison
#best subset model variables based on BIC
best_bic_vars <- names(best_sum$which[best_bic, ])[best_sum$which[best_bic, ]][-1]
best_bic_formula <- as.formula(paste("Undercount ~",
paste(best_bic_vars, collapse=" + ")))
best_bic_model <- lm(best_bic_formula, data=df)
comparison <- data.frame(
Model = c("Full Model",
"Best Subset (BIC)",
"Stepwise AIC Model"),
R_squared = c(summary(fit1)$r.squared,
summary(best_bic_model)$r.squared,
summary(step_model)$r.squared),
Adj_R_squared = c(summary(fit1)$adj.r.squared,
summary(best_bic_model)$adj.r.squared,
summary(step_model)$adj.r.squared),
AIC = c(AIC(fit1),
AIC(best_bic_model),
AIC(step_model)),
BIC = c(BIC(fit1),
BIC(best_bic_model),
BIC(step_model))
)
comparison
## Model R_squared Adj_R_squared AIC BIC
## 1 Full Model 0.6960660 0.6593843 245.0836 264.7905
## 2 Best Subset (BIC) 0.6691460 0.6474507 244.6848 257.8227
## 3 Stepwise AIC Model 0.6855326 0.6593270 243.3322 258.6598
# The model with the highest Adjusted R^2 includes six predictors:
# Minority, Crime, Poverty, Language, school, and Conventional.
# The Cp criterion suggests a model with five predictors:
# Minority, Crime, Poverty, Language, and Conventional.
# The BIC criterion selects a more parsimonious four-variable model:
# Minority, Crime, Language, and Conventional.
#
# Stepwise regression using AIC selects the model:
# Undercount ~ Minority + Crime + Poverty + Language + Conventional
#
# This model excludes school and Housing, indicating that these
# variables contribute little additional explanatory power once
# the main predictors are included.
#
# Comparing the models shows that both best subset and stepwise
# regression identify a similar set of important predictors.
# In particular, Minority, Crime, Language, and Conventional
# consistently appear as key determinants of census undercount.
#Problem 2.11
# (b) F-test p-value
# Given from ANOVA table
MST <- 21.47
MSE <- 2.39
df1 <- 3
df2 <- 26
# F statistic
F_stat <- MST / MSE
F_stat
## [1] 8.983264
# p-value
1 - pf(F_stat, df1, df2)
## [1] 0.0002976389
# (c) Tukey multiple comparisons
# Treatment means and sample sizes
yA <- 66.10; nA <- 7
yB <- 65.75; nB <- 8
yC <- 62.63; nC <- 9
yD <- 63.85; nD <- 6
MSE <- 2.39
alpha <- 0.01
k <- 4
df <- 26
# Tukey critical value
q_crit <- qtukey(1 - alpha, k, df)
q_crit
## [1] 4.865002
# Pairwise mean differences
abs(yA - yB)
## [1] 0.35
abs(yA - yC)
## [1] 3.47
abs(yA - yD)
## [1] 2.25
abs(yB - yC)
## [1] 3.12
abs(yB - yD)
## [1] 1.9
abs(yC - yD)
## [1] 1.22
# standard errors for unequal sample sizes
SE_AB <- sqrt(MSE/2 * (1/nA + 1/nB))
SE_AC <- sqrt(MSE/2 * (1/nA + 1/nC))
SE_AD <- sqrt(MSE/2 * (1/nA + 1/nD))
SE_BC <- sqrt(MSE/2 * (1/nB + 1/nC))
SE_BD <- sqrt(MSE/2 * (1/nB + 1/nD))
SE_CD <- sqrt(MSE/2 * (1/nC + 1/nD))
# Tukey test statistics
q_AB <- abs(yA - yB) / SE_AB
q_AC <- abs(yA - yC) / SE_AC
q_AD <- abs(yA - yD) / SE_AD
q_BC <- abs(yB - yC) / SE_BC
q_BD <- abs(yB - yD) / SE_BD
q_CD <- abs(yC - yD) / SE_CD
q_AB; q_AC; q_AD; q_BC; q_BD; q_CD
## [1] 0.6186321
## [1] 6.298771
## [1] 3.699572
## [1] 5.873709
## [1] 3.218298
## [1] 2.117518
#(d)Contrast: brand-name vs generic drugs
L <- 0.5*(yA + yB) - 0.5*(yC + yD)
L
## [1] 2.685
# Standard error of the contrast
SE_L <- sqrt(MSE * (0.25/nA + 0.25/nB + 0.25/nC + 0.25/nD))
SE_L
## [1] 0.5709789
# t statistic
t_stat <- L / SE_L
t_stat
## [1] 4.702451
# two-sided p-value
2 * (1 - pt(abs(t_stat), df = 26))
## [1] 7.372113e-05
#Problem 2.14
Data <- "
Area Velocity
0.016 294.9
0.016 294.1
0.016 301.7
0.016 307.9
0.016 285.5
0.016 298.6
0.016 303.1
0.016 305.3
0.016 264.9
0.016 262.9
0.016 256
0.016 255.3
0.016 256.3
0.016 258.2
0.016 243.6
0.016 250.1
0.03 295
0.03 301.1
0.03 293.1
0.03 300.6
0.03 285
0.03 289.1
0.03 277.8
0.03 266.4
0.03 248.1
0.03 255.7
0.03 245.7
0.03 251
0.03 254.9
0.03 254.5
0.03 246.3
0.03 246.9
0.044 270.5
0.044 263.2
0.044 278.6
0.044 267.9
0.044 269.6
0.044 269.1
0.044 262.2
0.044 263.2
0.044 224.2
0.044 227.9
0.044 217.7
0.044 219.6
0.044 228.5
0.044 230.9
0.044 227.6
0.044 228.6
0.058 258.6
0.058 255.9
0.058 257.1
0.058 263.6
0.058 262.6
0.058 260.3
0.058 305.3
0.058 304.9
0.058 216
0.058 216
0.058 210.6
0.058 207.4
0.058 214.6
0.058 214.3
0.058 222.1
0.058 222.2
"
df <- read.table(text = Data, header = TRUE)
head(df)
## Area Velocity
## 1 0.016 294.9
## 2 0.016 294.1
## 3 0.016 301.7
## 4 0.016 307.9
## 5 0.016 285.5
## 6 0.016 298.6
str(df)
## 'data.frame': 64 obs. of 2 variables:
## $ Area : num 0.016 0.016 0.016 0.016 0.016 0.016 0.016 0.016 0.016 0.016 ...
## $ Velocity: num 295 294 302 308 286 ...
# (a)
df$AreaF <- factor(df$Area)
# Fitting the one-way ANOVA model
fit_aov <- aov(Velocity ~ AreaF, data = df)
# ANOVA table
summary(fit_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## AreaF 3 13515 4505 7.067 0.000381 ***
## Residuals 60 38250 638
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The ANOVA table gives F = 7.067 with p-value = 0.000381.
# Since the p-value is much smaller than 0.01, we reject the
# null hypothesis that all four discharge hole areas produce
# the same mean muzzle velocity.
#
# This provides strong statistical evidence that the discharge
# hole area has a significant effect on muzzle velocity.
# Group means
tapply(df$Velocity, df$AreaF, mean)
## 0.016 0.03 0.044 0.058
## 277.4000 269.4500 246.8313 243.2188
# The means clearly decrease as the discharge hole area increases.
# This supports the expected inverse relationship: smaller hole
# areas produce higher pressure pulses, resulting in higher
# muzzle velocity.
# Boxplot for visual comparison
boxplot(Velocity ~ AreaF, data = df,
main = "Muzzle Velocity by Discharge Hole Area",
xlab = "Discharge Hole Area",
ylab = "Velocity (ft/s)",
col = "lightgray")
# Multiple comparisons using Tukey's method
TukeyHSD(fit_aov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Velocity ~ AreaF, data = df)
##
## $AreaF
## diff lwr upr p adj
## 0.03-0.016 -7.95000 -31.53930 15.6392992 0.8097473
## 0.044-0.016 -30.56875 -54.15805 -6.9794508 0.0059909
## 0.058-0.016 -34.18125 -57.77055 -10.5919508 0.0017201
## 0.044-0.03 -22.61875 -46.20805 0.9705492 0.0648075
## 0.058-0.03 -26.23125 -49.82055 -2.6419508 0.0235506
## 0.058-0.044 -3.61250 -27.20180 19.9767992 0.9773936
# The Tukey test identifies which pairs of area levels differ
# significantly in mean velocity while controlling the
# family-wise error rate.
# Significant differences are observed between:
# Area 0.016 and Area 0.044
# Area 0.016 and Area 0.058
# These results indicate that the smallest discharge hole area
# produces significantly higher muzzle velocity than the larger
# areas 0.044 and 0.058.
# Other pairwise differences are not statistically significant
# at the 5% level
# Residual diagnostics for one-way ANOVA
par(mfrow = c(2, 2))
plot(fit_aov)
par(mfrow = c(1, 1))
# (b) orthogonal polynomials to model linear and quadratic effects
poly_fit <- lm(Velocity ~ poly(Area, 2), data = df)
summary(poly_fit)
##
## Call:
## lm(formula = Velocity ~ poly(Area, 2), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.484 -23.133 -3.349 18.953 63.765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 259.225 3.167 81.851 < 2e-16 ***
## poly(Area, 2)1 -111.949 25.336 -4.419 4.15e-05 ***
## poly(Area, 2)2 8.675 25.336 0.342 0.733
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25.34 on 61 degrees of freedom
## Multiple R-squared: 0.2436, Adjusted R-squared: 0.2188
## F-statistic: 9.82 on 2 and 61 DF, p-value: 0.0002008
# orthogonal polynomial analysis using factor contrasts
df$AreaOrd <- factor(df$Area, ordered = TRUE)
contrasts(df$AreaOrd) <- contr.poly(4)
#ANOVA model with polynomial decomposition
fit_poly_aov <- aov(Velocity ~ AreaOrd, data = df)
summary(fit_poly_aov, split = list(AreaOrd = list(
Linear = 1,
Quadratic = 2,
Cubic = 3
)))
## Df Sum Sq Mean Sq F value Pr(>F)
## AreaOrd 3 13515 4505 7.067 0.000381 ***
## AreaOrd: Linear 1 12533 12533 19.659 4.01e-05 ***
## AreaOrd: Quadratic 1 75 75 0.118 0.732363
## AreaOrd: Cubic 1 907 907 1.423 0.237597
## Residuals 60 38250 638
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The orthogonal polynomial decomposition separates the treatment
# effect into linear, quadratic, and cubic components.
# Linear component:
# F = 19.659, p-value = 4.01e-05
# This indicates a strong and statistically significant linear
# trend between discharge hole area and muzzle velocity.
# Quadratic component:
# F = 0.118, p-value = 0.732
# This term is not significant, indicating no evidence of
# curvature in the relationship.
# Cubic component:
# F = 1.423, p-value = 0.238
# This higher-order term is also not significant.
# Means plotted against quantitative area levels
area_means <- aggregate(Velocity ~ Area, data = df, mean)
area_means
## Area Velocity
## 1 0.016 277.4000
## 2 0.030 269.4500
## 3 0.044 246.8313
## 4 0.058 243.2188
plot(area_means$Area, area_means$Velocity, type = "b", pch = 19,
xlab = "Discharge Hole Area (square inches)",
ylab = "Mean Velocity (ft/s)",
main = "Mean Velocity vs Discharge Hole Area")
# Both analyses lead to the same conclusion. The one-way ANOVA
# shows that muzzle velocity differs across the four discharge
# hole areas, while the polynomial analysis shows that this
# difference is primarily explained by a strong linear trend.
# Specifically, muzzle velocity decreases as discharge hole
# area increases. This result is consistent with the expected
# physical relationship: smaller discharge holes increase the
# pressure of the propellant gases and therefore produce
# higher muzzle velocity.
#Problem 2.17
Data <- "
Day Dev1 Dev2 Dev3 Doc1 Doc2 Doc3
1 133.34 133.36 133.45 126.54 127.36 131.88
2 110.94 110.85 110.92 124.69 128.86 132.39
3 118.54 118.56 118.67 125.46 129.43 134.43
4 137.94 137.80 137.77 125.95 130.72 134.28
5 139.52 139.62 139.59 125.90 130.13 134.44
6 139.23 139.11 139.36 127.85 132.03 137.37
7 117.96 117.81 117.85 125.55 132.05 132.17
8 119.59 119.42 119.48 125.80 129.87 134.97
9 116.12 116.00 115.93 125.11 128.09 133.97
10 128.38 128.48 128.41 125.75 131.94 132.68
11 125.17 125.25 125.34 128.77 130.05 134.75
12 134.62 134.41 134.55 125.26 131.13 134.29
13 136.14 136.07 136.22 126.26 130.91 133.38
14 131.21 131.03 130.96 125.68 128.83 135.67
15 132.51 132.86 132.65 124.47 129.46 134.39
"
df <- read.table(text = Data, header = TRUE)
head(df)
## Day Dev1 Dev2 Dev3 Doc1 Doc2 Doc3
## 1 1 133.34 133.36 133.45 126.54 127.36 131.88
## 2 2 110.94 110.85 110.92 124.69 128.86 132.39
## 3 3 118.54 118.56 118.67 125.46 129.43 134.43
## 4 4 137.94 137.80 137.77 125.95 130.72 134.28
## 5 5 139.52 139.62 139.59 125.90 130.13 134.44
## 6 6 139.23 139.11 139.36 127.85 132.03 137.37
str(df)
## 'data.frame': 15 obs. of 7 variables:
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Dev1: num 133 111 119 138 140 ...
## $ Dev2: num 133 111 119 138 140 ...
## $ Dev3: num 133 111 119 138 140 ...
## $ Doc1: num 127 125 125 126 126 ...
## $ Doc2: num 127 129 129 131 130 ...
## $ Doc3: num 132 132 134 134 134 ...
# Devices
device_data <- data.frame(
Day = rep(df$Day, 3),
Source = factor(rep(c("Dev1", "Dev2", "Dev3"), each = nrow(df))),
BP = c(df$Dev1, df$Dev2, df$Dev3)
)
# Doctors
doctor_data <- data.frame(
Day = rep(df$Day, 3),
Source = factor(rep(c("Doc1", "Doc2", "Doc3"), each = nrow(df))),
BP = c(df$Doc1, df$Doc2, df$Doc3)
)
# (a) One-way random effects model: Devices
fit_dev <- aov(BP ~ Source, data = device_data)
summary(fit_dev)
## Df Sum Sq Mean Sq F value Pr(>F)
## Source 2 0 0.01 0 1
## Residuals 42 3699 88.08
# ANOVA components
anova_dev <- summary(fit_dev)[[1]]
MS_dev_source <- anova_dev["Source", "Mean Sq"]
MS_dev_error <- anova_dev["Residuals", "Mean Sq"]
df_dev_source <- anova_dev["Source", "Df"]
df_dev_error <- anova_dev["Residuals", "Df"]
F_dev <- anova_dev["Source", "F value"]
p_dev <- anova_dev["Source", "Pr(>F)"]
# Number of replicates per source
n_dev <- 15
# Variance component estimates for random effects model
sigma2_dev <- MS_dev_error
sigma_alpha2_dev <- (MS_dev_source - MS_dev_error) / n_dev
sigma_alpha2_dev_trunc <- max(sigma_alpha2_dev, 0)
MS_dev_source
## [1] 0.006782222
MS_dev_error
## [1] 88.08183
F_dev
## [1] 7.69991e-05
p_dev
## [1] 0.999923
sigma2_dev
## [1] 88.08183
sigma_alpha2_dev
## [1] -5.87167
sigma_alpha2_dev_trunc
## [1] 0
# (a) Random effects analysis for devices
# The ANOVA for devices gives:
# F = 0.000077 and p-value = 0.9999.
# Since the p-value is extremely large, we fail to reject the
# null hypothesis that the three devices have the same mean
# blood pressure measurement.
# This indicates that there is essentially no variation among
# the three randomly selected devices beyond normal day-to-day
# measurement variability.
# The estimated between-device variance component is negative
# (-5.87), which is not meaningful in practice. Therefore it is
# truncated to zero. This implies that almost all variability in
# the device measurements is residual variability rather than
# device-to-device differences.
# Thus, the new electronic devices appear to be highly consistent
# and provide very precise measurements
# (a) One-way random effects model: Doctors
fit_doc <- aov(BP ~ Source, data = doctor_data)
summary(fit_doc)
## Df Sum Sq Mean Sq F value Pr(>F)
## Source 2 496.3 248.16 139.1 <2e-16 ***
## Residuals 42 74.9 1.78
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#ANOVA components
anova_doc <- summary(fit_doc)[[1]]
MS_doc_source <- anova_doc["Source", "Mean Sq"]
MS_doc_error <- anova_doc["Residuals", "Mean Sq"]
df_doc_source <- anova_doc["Source", "Df"]
df_doc_error <- anova_doc["Residuals", "Df"]
F_doc <- anova_doc["Source", "F value"]
p_doc <- anova_doc["Source", "Pr(>F)"]
# Number of replicates per source
n_doc <- 15
# Variance component estimates
sigma2_doc <- MS_doc_error
sigma_alpha2_doc <- (MS_doc_source - MS_doc_error) / n_doc
sigma_alpha2_doc_trunc <- max(sigma_alpha2_doc, 0)
MS_doc_source
## [1] 248.1626
MS_doc_error
## [1] 1.784094
F_doc
## [1] 139.0973
p_doc
## [1] 2.982244e-19
sigma2_doc
## [1] 1.784094
sigma_alpha2_doc
## [1] 16.42523
sigma_alpha2_doc_trunc
## [1] 16.42523
# The ANOVA for doctors gives:
# F = 139.10 and p-value 2.98 × 10^-19.
#
# Since the p-value is extremely small, we reject the null
# hypothesis that the three doctors have the same mean blood
# pressure measurements.
#
# This indicates that there are significant differences among
# the doctors' readings.
#
# The estimated variance components are:
# Residual variance (sigma^2) 1.78
# Between-doctor variance (sigma_alpha^2) 16.43
#
# The large between-doctor variance component shows that a
# substantial portion of the variability in measurements is
# due to differences among doctors rather than random variation
# (b) Group means
tapply(device_data$BP, device_data$Source, mean)
## Dev1 Dev2 Dev3
## 128.0807 128.0420 128.0767
mean(device_data$BP)
## [1] 128.0664
tapply(doctor_data$BP, doctor_data$Source, mean)
## Doc1 Doc2 Doc3
## 125.9360 130.0573 134.0707
mean(doctor_data$BP)
## [1] 130.0213
# The device measurements show essentially no variability among
# the three devices, indicating very high precision and strong
# agreement between the devices.
#
# In contrast, the doctor measurements show large variability
# among the three doctors. This suggests that readings taken by
# doctors using their existing devices are much less consistent.
#
# Therefore, the new electronic device appears to provide much
# more precise and reproducible blood pressure measurements
# than the measurements obtained by doctors.
# (c) 95% confidence interval for mean blood pressure:
# Devices
a_dev <- 3
grand_mean_dev <- mean(device_data$BP)
SE_mean_dev <- sqrt(sigma2_dev / (a_dev * n_dev))
CI_dev <- grand_mean_dev + c(-1, 1) * qt(0.975, df_dev_error) * SE_mean_dev
grand_mean_dev
## [1] 128.0664
SE_mean_dev
## [1] 1.399062
CI_dev
## [1] 125.2430 130.8899
# (c) 95% confidence interval for mean blood pressure:
# Doctors
a_doc <- 3
grand_mean_doc <- mean(doctor_data$BP)
SE_mean_doc <- sqrt(sigma_alpha2_doc_trunc / a_doc + sigma2_doc / (a_doc * n_doc))
CI_doc <- grand_mean_doc + c(-1, 1) * qt(0.975, a_doc - 1) * SE_mean_doc
grand_mean_doc
## [1] 130.0213
SE_mean_doc
## [1] 2.348345
CI_doc
## [1] 119.9172 140.1254
# Mean blood pressure measured by devices:
# Grand mean = 128.07
# 95% confidence interval = (125.24 , 130.89)
# Mean blood pressure measured by doctors:
# Grand mean = 130.02
# 95% confidence interval = (119.92 , 140.13)
# The confidence interval for the devices is relatively narrow,
# reflecting the high precision and low variability among device
# measurements.
# The confidence interval for doctors is much wider because it
# incorporates substantial between-doctor variability.
#
# This further supports the conclusion that the electronic devices
# provide more precise measurements than the doctors' readings