# Problem 1.13
Data <- "Place Minority Crime Poverty Language school Housing City Conventional Undercount
Alabama 26.1 49 18.9 0.2 43.5 7.6 state 0 -0.04
Alaska 5.7 62 10.7 1.7 17.5 23.6 state 100 3.35
Arizona 18.9 81 13.2 3.2 27.6 8.1 state 18 2.48
Arkansas 16.9 38 19.0 0.2 44.5 7.0 state 0 -0.74
California 24.3 73 10.4 5.0 26.0 11.8 state 4 3.60
Colorado 15.2 73 10.1 1.2 21.4 9.2 state 19 1.34
Connecticut 10.8 58 8.0 2.4 29.7 21.0 state 0 -0.26
Delaware 17.5 68 11.8 0.7 31.4 8.9 state 0 -0.16
Florida 22.3 81 13.4 3.6 33.3 10.1 state 0 2.20
Georgia 27.6 55 16.6 0.3 43.6 10.2 state 0 0.37
Hawaii 9.1 75 9.9 5.7 26.2 17.0 state 29 1.46
Idaho 4.2 48 12.6 1.0 26.3 9.1 state 56 1.53
Illinois 8.1 48 7.7 1.0 29.8 13.5 state 0 1.69
Indiana 7.1 48 9.4 0.5 33.6 9.9 state 0 -0.68
Iowa 2.3 47 10.1 0.3 28.5 10.4 state 0 -0.59
Kansas 7.9 54 10.1 0.5 26.7 8.5 state 14 0.94
Kentucky 7.7 34 17.6 0.2 46.9 10.6 state 0 -1.41
Louisiana 31.4 54 18.6 1.1 42.3 9.7 state 0 2.46
Maine 0.7 44 13.0 1.0 31.3 19.5 state 40 2.06
Maryland 16.7 58 6.8 0.8 28.2 10.5 state 0 2.03
Massachusetts 3.8 53 8.5 2.1 27.4 26.9 state 4 -0.57
Michigan 7.0 61 8.7 0.7 29.9 9.4 state 8 0.89
Minnesota 2.1 48 9.5 0.5 26.9 10.7 state 11 1.57
Mississippi 35.8 34 23.9 0.2 45.2 7.2 state 0 1.52
Missouri 7.8 45 11.2 0.3 34.9 9.1 state 0 0.81
Montana 1.5 50 12.3 0.4 25.6 12.8 state 75 1.81
Nebraska 4.8 43 10.7 0.5 26.6 9.7 state 33 0.36
Nevada 13.0 88 8.7 1.6 24.5 11.7 state 10 5.08
New Hampshire 1.0 47 8.5 0.8 27.7 20.3 state 0 -1.49
New Jersey 19.0 64 9.5 3.6 32.6 23.7 state 0 1.44
New Mexico 38.4 59 17.6 4.6 31.1 10.7 state 58 2.69
New York 8.0 48 8.9 1.3 29.3 21.6 state 0 -1.48
North Carolina 23.1 46 14.8 0.2 45.2 8.2 state 0 1.36
North Dakota 1.0 30 12.6 0.5 33.6 15.1 state 70 0.35
Ohio 8.9 52 9.6 0.5 32.1 11.3 state 0 0.97
Oklahoma 8.6 50 13.4 0.5 34.0 8.0 state 0 -0.12
Oregon 3.9 60 10.7 0.8 24.4 7.9 state 13 0.93
Pennsylvania 4.8 33 8.8 0.6 33.6 13.3 state 0 -0.78
Rhode Island 4.9 59 10.3 3.2 38.9 29.6 state 0 0.74
South Carolina 31.0 53 16.6 0.2 46.3 7.9 state 0 6.19
South Dakota 0.9 32 16.9 0.4 32.1 12.0 state 84 0.42
Tennessee 16.4 44 16.4 0.2 43.8 9.4 state 0 -2.31
Texas 30.6 55 15.0 4.7 38.7 7.7 state 1 0.27
Utah 4.7 58 10.3 0.9 20.0 11.3 state 14 1.14
Vermont 0.9 50 12.1 0.5 29.0 20.8 state 0 -1.12
Virginia 20.0 46 11.8 0.5 37.6 10.3 state 0 1.11
Washington 5.4 69 9.8 1.0 22.4 9.4 state 4 1.48
West Virginia 3.9 25 15.0 0.2 44.0 9.0 state 0 -0.69
Wisconsin 1.7 45 7.9 0.4 29.5 12.8 state 9 1.45
Wyoming 5.9 49 7.9 0.7 22.1 13.2 state 100 4.01
Baltimore 55.5 100 22.9 0.7 51.6 23.3 city 0 6.15
Boston 28.4 135 20.2 4.4 31.6 52.1 city 0 2.27
Chicago 53.7 66 20.3 6.7 43.8 51.4 city 0 5.42
Cleveland 46.7 101 22.1 1.6 49.1 36.4 city 0 5.01
Dallas 41.6 118 14.2 3.1 31.5 12.9 city 0 8.18
Detroit 65.4 106 21.9 1.6 45.8 18.6 city 0 4.33
Houston 45.1 80 12.7 5.1 31.6 8.9 city 0 5.79
Indianapolis 22.5 53 11.5 0.3 33.3 13.6 city 0 0.31
Los Angeles 44.4 100 16.4 12.7 31.4 15.0 city 0 7.52
Milwaukee 27.2 65 13.8 1.6 46.4 27.2 city 0 3.17
New York City 44.0 101 20.0 8.9 39.8 32.2 city 0 7.39
Philadelphia 41.3 60 20.6 2.2 45.7 21.7 city 0 6.41
Saint Louis 46.7 143 21.8 0.5 51.8 40.9 city 0 3.60
San Diego 23.6 81 12.4 4.2 21.1 11.2 city 0 0.47
San Francisco 24.8 107 13.7 9.2 26.0 20.3 city 0 5.18
Washington DC 72.6 102 18.6 1.1 32.9 21.0 city 0 5.93"

# Cleaning: place names may contain spaces ("New Hampshire", "New York
# City"), so each row is split on whitespace, the last 9 tokens are kept
# as the data fields, and the remaining leading tokens are rejoined as
# the place name.
lines <- strsplit(Data, "\n")[[1]]
header <- strsplit(lines[1], "\\s+")[[1]]
rows <- lapply(lines[-1], function(x) {
  parts <- strsplit(x, "\\s+")[[1]]
  place <- paste(parts[1:(length(parts) - 9)], collapse = " ")
  rest <- parts[(length(parts) - 8):length(parts)]
  c(place, rest)
})

df <- as.data.frame(do.call(rbind, rows), stringsAsFactors = FALSE)
names(df) <- header

num_vars <- c("Minority", "Crime", "Poverty", "Language",
              "school", "Housing", "Conventional", "Undercount")

df[num_vars] <- lapply(df[num_vars], as.numeric)

df$City <- factor(df$City)

# checking
str(df)
## 'data.frame':    66 obs. of  10 variables:
##  $ Place       : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ Minority    : num  26.1 5.7 18.9 16.9 24.3 15.2 10.8 17.5 22.3 27.6 ...
##  $ Crime       : num  49 62 81 38 73 73 58 68 81 55 ...
##  $ Poverty     : num  18.9 10.7 13.2 19 10.4 10.1 8 11.8 13.4 16.6 ...
##  $ Language    : num  0.2 1.7 3.2 0.2 5 1.2 2.4 0.7 3.6 0.3 ...
##  $ school      : num  43.5 17.5 27.6 44.5 26 21.4 29.7 31.4 33.3 43.6 ...
##  $ Housing     : num  7.6 23.6 8.1 7 11.8 9.2 21 8.9 10.1 10.2 ...
##  $ City        : Factor w/ 2 levels "city","state": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Conventional: num  0 100 18 0 4 19 0 0 0 0 ...
##  $ Undercount  : num  -0.04 3.35 2.48 -0.74 3.6 1.34 -0.26 -0.16 2.2 0.37 ...
nrow(df)
## [1] 66
head(df)
##        Place Minority Crime Poverty Language school Housing  City Conventional
## 1    Alabama     26.1    49    18.9      0.2   43.5     7.6 state            0
## 2     Alaska      5.7    62    10.7      1.7   17.5    23.6 state          100
## 3    Arizona     18.9    81    13.2      3.2   27.6     8.1 state           18
## 4   Arkansas     16.9    38    19.0      0.2   44.5     7.0 state            0
## 5 California     24.3    73    10.4      5.0   26.0    11.8 state            4
## 6   Colorado     15.2    73    10.1      1.2   21.4     9.2 state           19
##   Undercount
## 1      -0.04
## 2       3.35
## 3       2.48
## 4      -0.74
## 5       3.60
## 6       1.34
# (a)

# The regression model
fit1 <- lm(Undercount ~ Minority + Crime + Poverty + Language +
             school + Housing + Conventional, data = df)

summary(fit1)
## 
## Call:
## lm(formula = Undercount ~ Minority + Crime + Poverty + Language + 
##     school + Housing + Conventional, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8742 -0.8182  0.0064  0.6476  3.9922 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -2.217678   1.364658  -1.625 0.109568    
## Minority      0.093366   0.020970   4.452 3.92e-05 ***
## Crime         0.034687   0.012775   2.715 0.008712 ** 
## Poverty      -0.173520   0.085775  -2.023 0.047696 *  
## Language      0.230780   0.092615   2.492 0.015589 *  
## school        0.058204   0.045213   1.287 0.203100    
## Housing      -0.022518   0.023454  -0.960 0.340995    
## Conventional  0.036120   0.009335   3.869 0.000279 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.442 on 58 degrees of freedom
## Multiple R-squared:  0.6961, Adjusted R-squared:  0.6594 
## F-statistic: 18.98 on 7 and 58 DF,  p-value: 6.62e-13
df$fitted <- fitted(fit1)
df$residuals <- resid(fit1)

# residuals for the first few observations
residuals_df <- data.frame(
  Place = df$Place,
  Undercount = df$Undercount,
  Fitted = round(df$fitted,3),
  Residual = round(df$residuals,3)
)

head(residuals_df,10)
##          Place Undercount Fitted Residual
## 1      Alabama      -0.04  1.046   -1.086
## 2       Alaska       3.35  3.100    0.250
## 3      Arizona       2.48  2.879   -0.399
## 4     Arkansas      -0.74 -0.140   -0.600
## 5   California       3.60  3.325    0.275
## 6     Colorado       1.34  1.983   -0.643
## 7  Connecticut      -0.26  1.224   -1.484
## 8     Delaware      -0.16  1.516   -1.676
## 9      Florida       2.20  2.890   -0.690
## 10     Georgia       0.37  1.764   -1.394
residuals_df[order(-abs(residuals_df$Residual)), ][1:10, ]
##             Place Undercount Fitted Residual
## 40 South Carolina       6.19  2.198    3.992
## 62   Philadelphia       6.41  2.824    3.586
## 43          Texas       0.27  3.144   -2.874
## 42      Tennessee      -2.31  0.378   -2.688
## 28         Nevada       5.08  2.432    2.648
## 55         Dallas       8.18  5.554    2.626
## 31     New Mexico       2.69  5.086   -2.396
## 64      San Diego       0.47  2.589   -2.119
## 56        Detroit       4.33  6.381   -2.051
## 63    Saint Louis       3.60  5.529   -1.929
# Ranking predictors by importance based on p-values
fit1_summary <- summary(fit1)
coef_table <- fit1_summary$coefficients

importance <- data.frame(
  Predictor = rownames(coef_table)[-1],
  Estimate = coef_table[-1,1],
  Std_Error = coef_table[-1,2],
  t_value = coef_table[-1,3],
  p_value = coef_table[-1,4]
)

importance <- importance[order(importance$p_value), ]
importance
##                 Predictor    Estimate   Std_Error    t_value      p_value
## Minority         Minority  0.09336637 0.020970245  4.4523261 3.921428e-05
## Conventional Conventional  0.03611970 0.009335328  3.8691404 2.791736e-04
## Crime               Crime  0.03468717 0.012775289  2.7151766 8.711916e-03
## Language         Language  0.23078011 0.092614627  2.4918322 1.558920e-02
## Poverty           Poverty -0.17352000 0.085775464 -2.0229562 4.769556e-02
## school             school  0.05820351 0.045213270  1.2873103 2.030996e-01
## Housing           Housing -0.02251768 0.023453724 -0.9600899 3.409954e-01
# The regression model explains a substantial portion of the variation
# in census undercount: R-squared = 0.696 and adjusted R-squared = 0.659,
# so the seven predictors account for roughly 70% of the variation in
# undercount (about 66% after adjusting for model size).

# Based on the p-values of the coefficients:
# - Minority (p < 0.001) is the most statistically significant predictor.
#   Areas with a higher percentage of minority population tend to have
#   a higher census undercount.
# - Conventional (p < 0.001) is also highly significant and positively
#   associated with undercount.
# - Crime (p = 0.009) and Language (p = 0.016) are statistically
#   significant predictors as well.
# - Poverty (p = 0.048) is only marginally significant; note that its
#   coefficient is negative, so it should be interpreted with caution.
# - school (p = 0.203) and Housing (p = 0.341) are not statistically
#   significant, suggesting that they add little explanatory power once
#   the other variables are included in the model.


# Residual plot against fitted values


plot(df$fitted, df$residuals,
     xlab = "Fitted values",
     ylab = "Residuals",
     main = "Residuals vs Fitted Values")

abline(h=0, col="red", lty=2)

# Apart from a few large residuals (notably South Carolina and
# Philadelphia, identified above), the residuals scatter roughly around
# zero without a clear pattern, suggesting that the linear regression
# model is reasonably appropriate.


# Q-Q plot for residual normality
qqnorm(df$residuals, main="Q-Q Plot of Residuals")
qqline(df$residuals, col="red")

# The Q-Q plot indicates that the residuals follow the reference line
# fairly closely, suggesting that the normality assumption for the
# regression errors is approximately satisfied



# (b)
# structure of City
str(df$City)
##  Factor w/ 2 levels "city","state": 2 2 2 2 2 2 2 2 2 2 ...
# structure of other predictors
str(df[,c("Minority","Crime","Poverty","Language",
          "school","Housing","Conventional")])
## 'data.frame':    66 obs. of  7 variables:
##  $ Minority    : num  26.1 5.7 18.9 16.9 24.3 15.2 10.8 17.5 22.3 27.6 ...
##  $ Crime       : num  49 62 81 38 73 73 58 68 81 55 ...
##  $ Poverty     : num  18.9 10.7 13.2 19 10.4 10.1 8 11.8 13.4 16.6 ...
##  $ Language    : num  0.2 1.7 3.2 0.2 5 1.2 2.4 0.7 3.6 0.3 ...
##  $ school      : num  43.5 17.5 27.6 44.5 26 21.4 29.7 31.4 33.3 43.6 ...
##  $ Housing     : num  7.6 23.6 8.1 7 11.8 9.2 21 8.9 10.1 10.2 ...
##  $ Conventional: num  0 100 18 0 4 19 0 0 0 0 ...
# The variable City is categorical (a factor) with two levels:
# "city" and "state". It indicates whether the observation
# corresponds to a major city or a state remainder.
#
# In contrast, all other predictors are quantitative variables
# measuring percentages or rates. In regression analysis, City
# would enter the model as a dummy variable representing the
# difference between these two groups.
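
# As an illustration (a sketch, not required for part (b)): lm() expands
# a factor into dummy variables automatically, so City can be added to
# the model directly. With levels ("city", "state") and "city" as the
# baseline, the coefficient labeled "Citystate" estimates the shift in
# undercount for states relative to cities, holding the other predictors
# fixed.
fit_city <- lm(Undercount ~ Minority + Crime + Poverty + Language +
                 school + Housing + Conventional + City, data = df)
coef(summary(fit_city))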

# (c) Best subset regression

library(leaps)

best <- regsubsets(Undercount ~ Minority + Crime + Poverty + Language +
                     school + Housing + Conventional,
                   data = df,
                   nvmax = 7)

best_sum <- summary(best)

best_sum$which
##   (Intercept) Minority Crime Poverty Language school Housing Conventional
## 1        TRUE     TRUE FALSE   FALSE    FALSE  FALSE   FALSE        FALSE
## 2        TRUE     TRUE FALSE   FALSE    FALSE   TRUE   FALSE        FALSE
## 3        TRUE     TRUE  TRUE   FALSE    FALSE  FALSE   FALSE         TRUE
## 4        TRUE     TRUE  TRUE   FALSE     TRUE  FALSE   FALSE         TRUE
## 5        TRUE     TRUE  TRUE    TRUE     TRUE  FALSE   FALSE         TRUE
## 6        TRUE     TRUE  TRUE    TRUE     TRUE   TRUE   FALSE         TRUE
## 7        TRUE     TRUE  TRUE    TRUE     TRUE   TRUE    TRUE         TRUE
best_sum$adjr2
## [1] 0.4855990 0.5558969 0.6200099 0.6474507 0.6593270 0.6598359 0.6593843
best_sum$cp
## [1] 34.653395 22.140936 11.167048  7.137160  6.010091  6.921773  8.000000
best_sum$bic
## [1] -36.51760 -43.06581 -50.22234 -52.05288 -51.21581 -48.23409 -45.08510
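# A quick visual comparison of the three criteria (an optional sketch):
# plotting each criterion against model size makes the selected sizes
# easy to spot (adjusted R^2 is maximized; Cp and BIC are minimized).
par(mfrow = c(1, 3))
plot(best_sum$adjr2, type = "b", xlab = "Number of predictors",
     ylab = "Adjusted R-squared")
plot(best_sum$cp, type = "b", xlab = "Number of predictors",
     ylab = "Mallows' Cp")
plot(best_sum$bic, type = "b", xlab = "Number of predictors",
     ylab = "BIC")
par(mfrow = c(1, 1))
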
# best models according to criteria
best_adj_r2 <- which.max(best_sum$adjr2)
best_cp <- which.min(best_sum$cp)
best_bic <- which.min(best_sum$bic)

best_sum$which[best_adj_r2, ]
##  (Intercept)     Minority        Crime      Poverty     Language       school 
##         TRUE         TRUE         TRUE         TRUE         TRUE         TRUE 
##      Housing Conventional 
##        FALSE         TRUE
best_sum$which[best_cp, ]
##  (Intercept)     Minority        Crime      Poverty     Language       school 
##         TRUE         TRUE         TRUE         TRUE         TRUE        FALSE 
##      Housing Conventional 
##        FALSE         TRUE
best_sum$which[best_bic, ]
##  (Intercept)     Minority        Crime      Poverty     Language       school 
##         TRUE         TRUE         TRUE        FALSE         TRUE        FALSE 
##      Housing Conventional 
##        FALSE         TRUE
# Stepwise regression using AIC

library(MASS)

step_model <- stepAIC(fit1, direction="both")
## Start:  AIC=55.78
## Undercount ~ Minority + Crime + Poverty + Language + school + 
##     Housing + Conventional
## 
##                Df Sum of Sq    RSS    AIC
## - Housing       1     1.917 122.51 54.824
## - school        1     3.446 124.04 55.643
## <none>                      120.59 55.784
## - Poverty       1     8.509 129.10 58.284
## - Language      1    12.910 133.50 60.496
## - Crime         1    15.328 135.92 61.681
## - Conventional  1    31.127 151.72 68.938
## - Minority      1    41.217 161.81 73.187
## 
## Step:  AIC=54.82
## Undercount ~ Minority + Crime + Poverty + Language + school + 
##     Conventional
## 
##                Df Sum of Sq    RSS    AIC
## - school        1     2.263 124.77 54.032
## <none>                      122.51 54.824
## + Housing       1     1.917 120.59 55.784
## - Poverty       1     8.325 130.84 57.163
## - Language      1    11.347 133.86 58.671
## - Crime         1    13.782 136.29 59.860
## - Conventional  1    29.360 151.87 67.003
## - Minority      1    47.669 170.18 74.515
## 
## Step:  AIC=54.03
## Undercount ~ Minority + Crime + Poverty + Language + Conventional
## 
##                Df Sum of Sq    RSS    AIC
## <none>                      124.77 54.032
## + school        1     2.263 122.51 54.824
## - Poverty       1     6.502 131.28 55.385
## + Housing       1     0.734 124.04 55.643
## - Language      1     9.400 134.17 56.826
## - Crime         1    11.535 136.31 57.868
## - Conventional  1    29.971 154.75 66.240
## - Minority      1    51.832 176.61 74.962
summary(step_model)
## 
## Call:
## lm(formula = Undercount ~ Minority + Crime + Poverty + Language + 
##     Conventional, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6089 -0.9797  0.0373  0.7054  4.3521 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.792689   0.860341  -0.921 0.360550    
## Minority      0.100967   0.020224   4.992 5.44e-06 ***
## Crime         0.024346   0.010337   2.355 0.021804 *  
## Poverty      -0.110030   0.062227  -1.768 0.082111 .  
## Language      0.183850   0.086475   2.126 0.037623 *  
## Conventional  0.029327   0.007725   3.796 0.000345 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.442 on 60 degrees of freedom
## Multiple R-squared:  0.6855, Adjusted R-squared:  0.6593 
## F-statistic: 26.16 on 5 and 60 DF,  p-value: 6.449e-14
# Model comparison


# best subset model variables based on BIC
best_bic_vars <- names(best_sum$which[best_bic, ])[best_sum$which[best_bic, ]][-1]
best_bic_formula <- as.formula(paste("Undercount ~",
                                     paste(best_bic_vars, collapse=" + ")))

best_bic_model <- lm(best_bic_formula, data=df)

comparison <- data.frame(
  Model = c("Full Model",
            "Best Subset (BIC)",
            "Stepwise AIC Model"),
  R_squared = c(summary(fit1)$r.squared,
                summary(best_bic_model)$r.squared,
                summary(step_model)$r.squared),
  Adj_R_squared = c(summary(fit1)$adj.r.squared,
                    summary(best_bic_model)$adj.r.squared,
                    summary(step_model)$adj.r.squared),
  AIC = c(AIC(fit1),
          AIC(best_bic_model),
          AIC(step_model)),
  BIC = c(BIC(fit1),
          BIC(best_bic_model),
          BIC(step_model))
)

comparison
##                Model R_squared Adj_R_squared      AIC      BIC
## 1         Full Model 0.6960660     0.6593843 245.0836 264.7905
## 2  Best Subset (BIC) 0.6691460     0.6474507 244.6848 257.8227
## 3 Stepwise AIC Model 0.6855326     0.6593270 243.3322 258.6598
# The model with the highest adjusted R^2 includes six predictors:
#   Minority, Crime, Poverty, Language, school, and Conventional.
# The Cp criterion suggests a model with five predictors:
#   Minority, Crime, Poverty, Language, and Conventional.
# The BIC criterion selects a more parsimonious four-variable model:
#   Minority, Crime, Language, and Conventional.
#
# Stepwise regression using AIC selects the model:
# Undercount ~ Minority + Crime + Poverty + Language + Conventional
#
# This model excludes school and Housing, indicating that these
# variables contribute little additional explanatory power once
# the main predictors are included.
#
# Comparing the models shows that both best subset and stepwise
# regression identify a similar set of important predictors.
# In particular, Minority, Crime, Language, and Conventional
# consistently appear as key determinants of census undercount.

# Problem 2.11

# (b) F-test p-value

# Given from ANOVA table
MST <- 21.47
MSE <- 2.39
df1 <- 3
df2 <- 26

# F statistic
F_stat <- MST / MSE
F_stat
## [1] 8.983264
# p-value
1 - pf(F_stat, df1, df2)
## [1] 0.0002976389
# The p-value (about 0.0003) is very small, so we reject the null
# hypothesis of equal treatment means.
# (c) Tukey multiple comparisons

# Treatment means and sample sizes
yA <- 66.10; nA <- 7
yB <- 65.75; nB <- 8
yC <- 62.63; nC <- 9
yD <- 63.85; nD <- 6

MSE <- 2.39
alpha <- 0.01
k <- 4
df_error <- 26   # named df_error so the data frame df is not overwritten

# Tukey critical value
q_crit <- qtukey(1 - alpha, k, df_error)
q_crit
## [1] 4.865002
# Pairwise mean differences
abs(yA - yB)
## [1] 0.35
abs(yA - yC)
## [1] 3.47
abs(yA - yD)
## [1] 2.25
abs(yB - yC)
## [1] 3.12
abs(yB - yD)
## [1] 1.9
abs(yC - yD)
## [1] 1.22
# standard errors for unequal sample sizes
SE_AB <- sqrt(MSE/2 * (1/nA + 1/nB))
SE_AC <- sqrt(MSE/2 * (1/nA + 1/nC))
SE_AD <- sqrt(MSE/2 * (1/nA + 1/nD))
SE_BC <- sqrt(MSE/2 * (1/nB + 1/nC))
SE_BD <- sqrt(MSE/2 * (1/nB + 1/nD))
SE_CD <- sqrt(MSE/2 * (1/nC + 1/nD))

# Tukey test statistics
q_AB <- abs(yA - yB) / SE_AB
q_AC <- abs(yA - yC) / SE_AC
q_AD <- abs(yA - yD) / SE_AD
q_BC <- abs(yB - yC) / SE_BC
q_BD <- abs(yB - yD) / SE_BD
q_CD <- abs(yC - yD) / SE_CD

q_AB; q_AC; q_AD; q_BC; q_BD; q_CD
## [1] 0.6186321
## [1] 6.298771
## [1] 3.699572
## [1] 5.873709
## [1] 3.218298
## [1] 2.117518
# Comparing each statistic with q_crit = 4.865, only the A-C (6.30) and
# B-C (5.87) comparisons exceed the critical value. At the 1% level,
# brands A and B each differ significantly from brand C; no other pair
# differs significantly.
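
# The six pairwise statistics above can also be computed in one pass
# (a compact sketch using combn(), reusing the means, sample sizes,
# MSE, and q_crit defined above):
means <- c(A = yA, B = yB, C = yC, D = yD)
ns    <- c(A = nA, B = nB, C = nC, D = nD)
pairs <- combn(names(means), 2)
q_all <- apply(pairs, 2, function(p) {
  unname(abs(means[p[1]] - means[p[2]]) /
           sqrt(MSE/2 * (1/ns[p[1]] + 1/ns[p[2]])))
})
names(q_all) <- apply(pairs, 2, paste, collapse = "-")
q_all > q_crit   # TRUE marks pairs significant at the 1% level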
# (d) Contrast: brand-name vs generic drugs

L <- 0.5*(yA + yB) - 0.5*(yC + yD)
L
## [1] 2.685
# Standard error of the contrast
SE_L <- sqrt(MSE * (0.25/nA + 0.25/nB + 0.25/nC + 0.25/nD))
SE_L
## [1] 0.5709789
# t statistic
t_stat <- L / SE_L
t_stat
## [1] 4.702451
# two-sided p-value
2 * (1 - pt(abs(t_stat), df = 26))
## [1] 7.372113e-05
# The p-value is far below 0.01, so the contrast differs significantly
# from zero: the average response for the brand-name drugs (A and B)
# exceeds that of the generic drugs (C and D) by an estimated 2.69.
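
# The same computation as a reusable sketch for arbitrary contrast
# weights (a hypothetical helper, reusing the means and sample sizes
# defined above):
contrast_test <- function(w, means, ns, MSE, df) {
  L  <- sum(w * means)              # estimated contrast
  SE <- sqrt(MSE * sum(w^2 / ns))   # its standard error
  t  <- L / SE
  c(L = L, SE = SE, t = t, p = 2 * pt(-abs(t), df))
}
contrast_test(c(0.5, 0.5, -0.5, -0.5), c(yA, yB, yC, yD),
              c(nA, nB, nC, nD), MSE, df = 26)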

# Problem 2.14

Data <- "
Area Velocity
0.016 294.9
0.016 294.1
0.016 301.7
0.016 307.9
0.016 285.5
0.016 298.6
0.016 303.1
0.016 305.3
0.016 264.9
0.016 262.9
0.016 256
0.016 255.3
0.016 256.3
0.016 258.2
0.016 243.6
0.016 250.1
0.03 295
0.03 301.1
0.03 293.1
0.03 300.6
0.03 285
0.03 289.1
0.03 277.8
0.03 266.4
0.03 248.1
0.03 255.7
0.03 245.7
0.03 251
0.03 254.9
0.03 254.5
0.03 246.3
0.03 246.9
0.044 270.5
0.044 263.2
0.044 278.6
0.044 267.9
0.044 269.6
0.044 269.1
0.044 262.2
0.044 263.2
0.044 224.2
0.044 227.9
0.044 217.7
0.044 219.6
0.044 228.5
0.044 230.9
0.044 227.6
0.044 228.6
0.058 258.6
0.058 255.9
0.058 257.1
0.058 263.6
0.058 262.6
0.058 260.3
0.058 305.3
0.058 304.9
0.058 216
0.058 216
0.058 210.6
0.058 207.4
0.058 214.6
0.058 214.3
0.058 222.1
0.058 222.2
"

df <- read.table(text = Data, header = TRUE)

head(df)
##    Area Velocity
## 1 0.016    294.9
## 2 0.016    294.1
## 3 0.016    301.7
## 4 0.016    307.9
## 5 0.016    285.5
## 6 0.016    298.6
str(df)
## 'data.frame':    64 obs. of  2 variables:
##  $ Area    : num  0.016 0.016 0.016 0.016 0.016 0.016 0.016 0.016 0.016 0.016 ...
##  $ Velocity: num  295 294 302 308 286 ...
# (a)

df$AreaF <- factor(df$Area)

# Fitting the one-way ANOVA model
fit_aov <- aov(Velocity ~ AreaF, data = df)

# ANOVA table
summary(fit_aov)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## AreaF        3  13515    4505   7.067 0.000381 ***
## Residuals   60  38250     638                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The ANOVA table gives F = 7.067 with p-value = 0.000381.
# Since the p-value is much smaller than 0.01, we reject the
# null hypothesis that all four discharge hole areas produce
# the same mean muzzle velocity.
#
# This provides strong statistical evidence that the discharge
# hole area has a significant effect on muzzle velocity.


# Group means

tapply(df$Velocity, df$AreaF, mean)
##    0.016     0.03    0.044    0.058 
## 277.4000 269.4500 246.8313 243.2188
# The means clearly decrease as the discharge hole area increases.
# This supports the expected inverse relationship: smaller hole
# areas produce higher pressure pulses, resulting in higher
# muzzle velocity.
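
# A quick check on the equal-variance assumption behind the ANOVA
# (a small addition): group standard deviations by hole area.
tapply(df$Velocity, df$AreaF, sd)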

# Boxplot for visual comparison

boxplot(Velocity ~ AreaF, data = df,
        main = "Muzzle Velocity by Discharge Hole Area",
        xlab = "Discharge Hole Area",
        ylab = "Velocity (ft/s)",
        col = "lightgray")

# Multiple comparisons using Tukey's method


TukeyHSD(fit_aov)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Velocity ~ AreaF, data = df)
## 
## $AreaF
##                  diff       lwr         upr     p adj
## 0.03-0.016   -7.95000 -31.53930  15.6392992 0.8097473
## 0.044-0.016 -30.56875 -54.15805  -6.9794508 0.0059909
## 0.058-0.016 -34.18125 -57.77055 -10.5919508 0.0017201
## 0.044-0.03  -22.61875 -46.20805   0.9705492 0.0648075
## 0.058-0.03  -26.23125 -49.82055  -2.6419508 0.0235506
## 0.058-0.044  -3.61250 -27.20180  19.9767992 0.9773936
# The Tukey test identifies which pairs of area levels differ
# significantly in mean velocity while controlling the
# family-wise error rate.
# At the 5% level, significant differences are observed between:
# Area 0.016 and Area 0.044 (p = 0.006)
# Area 0.016 and Area 0.058 (p = 0.002)
# Area 0.030 and Area 0.058 (p = 0.024)
# These results indicate that the two smallest discharge hole areas
# produce significantly higher muzzle velocity than the larger areas.
# The remaining comparisons (0.016 vs 0.030, 0.044 vs 0.058) are not
# significant, and the 0.030 vs 0.044 comparison is borderline
# (p = 0.065).

# Residual diagnostics for the one-way ANOVA: plot() on an aov object
# produces residuals-vs-fitted, normal Q-Q, scale-location, and
# residuals-vs-leverage panels.
par(mfrow = c(2, 2))
plot(fit_aov)

par(mfrow = c(1, 1))




# (b) orthogonal polynomials to model linear and quadratic effects


poly_fit <- lm(Velocity ~ poly(Area, 2), data = df)

summary(poly_fit)
## 
## Call:
## lm(formula = Velocity ~ poly(Area, 2), data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.484 -23.133  -3.349  18.953  63.765 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     259.225      3.167  81.851  < 2e-16 ***
## poly(Area, 2)1 -111.949     25.336  -4.419 4.15e-05 ***
## poly(Area, 2)2    8.675     25.336   0.342    0.733    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 25.34 on 61 degrees of freedom
## Multiple R-squared:  0.2436, Adjusted R-squared:  0.2188 
## F-statistic:  9.82 on 2 and 61 DF,  p-value: 0.0002008
# The quadratic regression on the raw Area values already tells the
# story: a strong linear term (p < 0.001) and a non-significant
# quadratic term (p = 0.733).

# orthogonal polynomial analysis using factor contrasts

df$AreaOrd <- factor(df$Area, ordered = TRUE)

# Ordered factors use polynomial contrasts by default; setting them
# explicitly makes the intent clear:
contrasts(df$AreaOrd) <- contr.poly(4)

# ANOVA model with polynomial decomposition
fit_poly_aov <- aov(Velocity ~ AreaOrd, data = df)

summary(fit_poly_aov, split = list(AreaOrd = list(
  Linear = 1,
  Quadratic = 2,
  Cubic = 3
)))
##                      Df Sum Sq Mean Sq F value   Pr(>F)    
## AreaOrd               3  13515    4505   7.067 0.000381 ***
##   AreaOrd: Linear     1  12533   12533  19.659 4.01e-05 ***
##   AreaOrd: Quadratic  1     75      75   0.118 0.732363    
##   AreaOrd: Cubic      1    907     907   1.423 0.237597    
## Residuals            60  38250     638                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The orthogonal polynomial decomposition separates the treatment
# effect into linear, quadratic, and cubic components.
# Linear component:
#   F = 19.659, p-value = 4.01e-05
#   This indicates a strong and statistically significant linear
#   trend between discharge hole area and muzzle velocity.
# Quadratic component:
#   F = 0.118, p-value = 0.732
#   This term is not significant, indicating no evidence of
#   curvature in the relationship.
# Cubic component:
#   F = 1.423, p-value = 0.238
#   This higher-order term is also not significant.

# Means plotted against quantitative area levels


area_means <- aggregate(Velocity ~ Area, data = df, mean)
area_means
##    Area Velocity
## 1 0.016 277.4000
## 2 0.030 269.4500
## 3 0.044 246.8313
## 4 0.058 243.2188
plot(area_means$Area, area_means$Velocity, type = "b", pch = 19,
     xlab = "Discharge Hole Area (square inches)",
     ylab = "Mean Velocity (ft/s)",
     main = "Mean Velocity vs Discharge Hole Area")

# Both analyses lead to the same conclusion. The one-way ANOVA
# shows that muzzle velocity differs across the four discharge
# hole areas, while the polynomial analysis shows that this
# difference is primarily explained by a strong linear trend.
# Specifically, muzzle velocity decreases as discharge hole
# area increases. This result is consistent with the expected
# physical relationship: smaller discharge holes increase the
# pressure of the propellant gases and therefore produce
# higher muzzle velocity.

# Problem 2.17

Data <- "
Day Dev1 Dev2 Dev3 Doc1 Doc2 Doc3
1 133.34 133.36 133.45 126.54 127.36 131.88
2 110.94 110.85 110.92 124.69 128.86 132.39
3 118.54 118.56 118.67 125.46 129.43 134.43
4 137.94 137.80 137.77 125.95 130.72 134.28
5 139.52 139.62 139.59 125.90 130.13 134.44
6 139.23 139.11 139.36 127.85 132.03 137.37
7 117.96 117.81 117.85 125.55 132.05 132.17
8 119.59 119.42 119.48 125.80 129.87 134.97
9 116.12 116.00 115.93 125.11 128.09 133.97
10 128.38 128.48 128.41 125.75 131.94 132.68
11 125.17 125.25 125.34 128.77 130.05 134.75
12 134.62 134.41 134.55 125.26 131.13 134.29
13 136.14 136.07 136.22 126.26 130.91 133.38
14 131.21 131.03 130.96 125.68 128.83 135.67
15 132.51 132.86 132.65 124.47 129.46 134.39
"

df <- read.table(text = Data, header = TRUE)


head(df)
##   Day   Dev1   Dev2   Dev3   Doc1   Doc2   Doc3
## 1   1 133.34 133.36 133.45 126.54 127.36 131.88
## 2   2 110.94 110.85 110.92 124.69 128.86 132.39
## 3   3 118.54 118.56 118.67 125.46 129.43 134.43
## 4   4 137.94 137.80 137.77 125.95 130.72 134.28
## 5   5 139.52 139.62 139.59 125.90 130.13 134.44
## 6   6 139.23 139.11 139.36 127.85 132.03 137.37
str(df)
## 'data.frame':    15 obs. of  7 variables:
##  $ Day : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Dev1: num  133 111 119 138 140 ...
##  $ Dev2: num  133 111 119 138 140 ...
##  $ Dev3: num  133 111 119 138 140 ...
##  $ Doc1: num  127 125 125 126 126 ...
##  $ Doc2: num  127 129 129 131 130 ...
##  $ Doc3: num  132 132 134 134 134 ...
# Devices
device_data <- data.frame(
  Day = rep(df$Day, 3),
  Source = factor(rep(c("Dev1", "Dev2", "Dev3"), each = nrow(df))),
  BP = c(df$Dev1, df$Dev2, df$Dev3)
)

# Doctors
doctor_data <- data.frame(
  Day = rep(df$Day, 3),
  Source = factor(rep(c("Doc1", "Doc2", "Doc3"), each = nrow(df))),
  BP = c(df$Doc1, df$Doc2, df$Doc3)
)


# (a) One-way random effects model: Devices


fit_dev <- aov(BP ~ Source, data = device_data)
summary(fit_dev)
##             Df Sum Sq Mean Sq F value Pr(>F)
## Source       2      0    0.01       0      1
## Residuals   42   3699   88.08
# ANOVA components
anova_dev <- summary(fit_dev)[[1]]
MS_dev_source <- anova_dev["Source", "Mean Sq"]
MS_dev_error  <- anova_dev["Residuals", "Mean Sq"]
df_dev_source <- anova_dev["Source", "Df"]
df_dev_error  <- anova_dev["Residuals", "Df"]
F_dev <- anova_dev["Source", "F value"]
p_dev <- anova_dev["Source", "Pr(>F)"]

# Number of replicates per source
n_dev <- 15

# Variance component estimates for random effects model
sigma2_dev <- MS_dev_error
sigma_alpha2_dev <- (MS_dev_source - MS_dev_error) / n_dev


sigma_alpha2_dev_trunc <- max(sigma_alpha2_dev, 0)

MS_dev_source
## [1] 0.006782222
MS_dev_error
## [1] 88.08183
F_dev
## [1] 7.69991e-05
p_dev
## [1] 0.999923
sigma2_dev
## [1] 88.08183
sigma_alpha2_dev
## [1] -5.87167
sigma_alpha2_dev_trunc
## [1] 0
# (a) Random effects analysis for devices

# The ANOVA for devices gives:
# F = 0.000077 and p-value = 0.9999.

# Since the p-value is extremely large, we fail to reject the
# null hypothesis that the three devices have the same mean
# blood pressure measurement.

# This indicates that there is essentially no variation among
# the three randomly selected devices beyond normal day-to-day
# measurement variability.

# The estimated between-device variance component is negative
# (-5.87), which is not meaningful in practice. Therefore it is
# truncated to zero. This implies that almost all variability in
# the device measurements is residual variability rather than
# device-to-device differences.

# Thus, the new electronic devices appear to be highly consistent
# and provide very precise measurements

# (a) One-way random effects model: Doctors

fit_doc <- aov(BP ~ Source, data = doctor_data)
summary(fit_doc)
##             Df Sum Sq Mean Sq F value Pr(>F)    
## Source       2  496.3  248.16   139.1 <2e-16 ***
## Residuals   42   74.9    1.78                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA components
anova_doc <- summary(fit_doc)[[1]]
MS_doc_source <- anova_doc["Source", "Mean Sq"]
MS_doc_error  <- anova_doc["Residuals", "Mean Sq"]
df_doc_source <- anova_doc["Source", "Df"]
df_doc_error  <- anova_doc["Residuals", "Df"]
F_doc <- anova_doc["Source", "F value"]
p_doc <- anova_doc["Source", "Pr(>F)"]

# Number of replicates per source
n_doc <- 15

# Variance component estimates
sigma2_doc <- MS_doc_error
sigma_alpha2_doc <- (MS_doc_source - MS_doc_error) / n_doc
sigma_alpha2_doc_trunc <- max(sigma_alpha2_doc, 0)

MS_doc_source
## [1] 248.1626
MS_doc_error
## [1] 1.784094
F_doc
## [1] 139.0973
p_doc
## [1] 2.982244e-19
sigma2_doc
## [1] 1.784094
sigma_alpha2_doc
## [1] 16.42523
sigma_alpha2_doc_trunc
## [1] 16.42523
# The ANOVA for doctors gives:
# F = 139.10 and p-value = 2.98 × 10^-19.
#
# Since the p-value is extremely small, we reject the null
# hypothesis that the three doctors have the same mean blood
# pressure measurements.
#
# This indicates that there are significant differences among
# the doctors' readings.
#
# The estimated variance components are:
# Residual variance (sigma^2) = 1.78
# Between-doctor variance (sigma_alpha^2) = 16.43
#
# The large between-doctor variance component shows that a
# substantial portion of the variability in measurements is
# due to differences among doctors rather than random variation


# (b) Group means


tapply(device_data$BP, device_data$Source, mean)
##     Dev1     Dev2     Dev3 
## 128.0807 128.0420 128.0767
mean(device_data$BP)
## [1] 128.0664
tapply(doctor_data$BP, doctor_data$Source, mean)
##     Doc1     Doc2     Doc3 
## 125.9360 130.0573 134.0707
mean(doctor_data$BP)
## [1] 130.0213
# The device measurements show essentially no variability among
# the three devices, indicating very high precision and strong
# agreement between the devices.
#
# In contrast, the doctor measurements show large variability
# among the three doctors. This suggests that readings taken by
# doctors using their existing devices are much less consistent.
#
# Therefore, the new electronic device appears to provide much
# more precise and reproducible blood pressure measurements
# than the measurements obtained by doctors.

# (c) 95% confidence interval for mean blood pressure:
# Devices


a_dev <- 3   # number of devices
grand_mean_dev <- mean(device_data$BP)

# Since the between-device variance component is estimated as zero,
# the standard error of the grand mean is based on the residual mean
# square alone, using the residual degrees of freedom:
SE_mean_dev <- sqrt(sigma2_dev / (a_dev * n_dev))

CI_dev <- grand_mean_dev + c(-1, 1) * qt(0.975, df_dev_error) * SE_mean_dev
grand_mean_dev
## [1] 128.0664
SE_mean_dev
## [1] 1.399062
CI_dev
## [1] 125.2430 130.8899
# (c) 95% confidence interval for mean blood pressure:
# Doctors


a_doc <- 3   # number of doctors
grand_mean_doc <- mean(doctor_data$BP)

# Here the between-doctor component is large, so the variance of the
# grand mean includes both components. This equals MS_Source / (a * n),
# which carries only a - 1 degrees of freedom for the t quantile:
SE_mean_doc <- sqrt(sigma_alpha2_doc_trunc / a_doc + sigma2_doc / (a_doc * n_doc))

CI_doc <- grand_mean_doc + c(-1, 1) * qt(0.975, a_doc - 1) * SE_mean_doc
grand_mean_doc
## [1] 130.0213
SE_mean_doc
## [1] 2.348345
CI_doc
## [1] 119.9172 140.1254
# Mean blood pressure measured by devices:
# Grand mean = 128.07
# 95% confidence interval = (125.24 , 130.89)

# Mean blood pressure measured by doctors:
# Grand mean = 130.02
# 95% confidence interval = (119.92 , 140.13)

# The confidence interval for the devices is relatively narrow,
# reflecting the high precision and low variability among device
# measurements.

# The confidence interval for doctors is much wider because it
# incorporates substantial between-doctor variability.
#
# This further supports the conclusion that the electronic devices
# provide more precise measurements than the doctors' readings