Chapter 6

6.1

period=c(1:6)
refined=c(32.0,31.2,27.0,21.0,14.9,8.8)
manufactured=c(16.3,23.1,23.6,27.7,34.6,33.9)

(a)

plot(period, refined, type="o", col="blue")

fit.ref = glm(refined ~ period, family=gaussian)
coef(fit.ref)
## (Intercept)      period 
##   39.573333   -4.882857
confint(fit.ref, level = 0.95)
## Waiting for profiling to be done...
##                 2.5 %    97.5 %
## (Intercept) 35.850894 43.295773
## period      -5.838691 -3.927023
fit.ref.lm = lm(refined ~ period)
summary(fit.ref.lm)
## 
## Call:
## lm(formula = refined ~ period)
## 
## Residuals:
##       1       2       3       4       5       6 
## -2.6905  1.3924  2.0752  0.9581 -0.2590 -1.4762 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  39.5733     1.8992   20.84 3.13e-05 ***
## period       -4.8829     0.4877  -10.01 0.000559 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.04 on 4 degrees of freedom
## Multiple R-squared:  0.9616, Adjusted R-squared:  0.952 
## F-statistic: 100.2 on 1 and 4 DF,  p-value: 0.0005593
plot(period,manufactured, type="o", col="green")

fit.man = glm(manufactured ~ period, family=gaussian)
coef(fit.man)
## (Intercept)      period 
##   13.873333    3.617143
confint(fit.man, level = 0.95)
## Waiting for profiling to be done...
##                 2.5 %    97.5 %
## (Intercept) 10.128601 17.618066
## period       2.655584  4.578701

(b)

sugar.total = refined + manufactured
plot(period,sugar.total, type="o", col="green")

fit.tot = glm(sugar.total ~ period, family=gaussian)
coef(fit.tot)
## (Intercept)      period 
##   53.446667   -1.265714
confint(fit.tot, level = 0.95)
## Waiting for profiling to be done...
##                 2.5 %     97.5 %
## (Intercept) 47.454998 59.4383351
## period      -2.804233  0.2728042
fit.tot.lm = lm(sugar.total ~ period)
summary(fit.tot.lm)
## 
## Call:
## lm(formula = sugar.total ~ period)
## 
## Residuals:
##       1       2       3       4       5       6 
## -3.8810  3.3848  0.9505  0.3162  2.3819 -3.1524 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   53.447      3.057  17.483 6.28e-05 ***
## period        -1.266      0.785  -1.612    0.182    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.284 on 4 degrees of freedom
## Multiple R-squared:  0.3939, Adjusted R-squared:  0.2424 
## F-statistic:   2.6 on 1 and 4 DF,  p-value: 0.1822

Here we have a complete summary of both models with their estimations and confidence intervals.

The question asks about testing the hypotesis that total sugar consumption did not change over time, i.e, if the model including time is better than the simplest model. We test that with the F test, so in this case, because the p-value is large, we cannot reject the hypotesis that total sugar consumptino did not change over time.

A complete interpretaion of the output above can be found in this link http://blog.yhat.com/posts/r-lm-summary.html.

6.5

Always first, set up the data

hyper=c(2.3,4.1,4.2,4.0,4.6,4.6,3.8,5.2,3.1,3.7,3.8)
nonh=c(3.0,4.1,3.9,3.1,3.3,2.9,3.3,3.9)
cont=c(3.0,2.6,3.1,2.2,2.1,2.4,2.8,3.4,2.9,2.6,3.1,3.2)
obese =c(hyper, nonh, cont)
groups=c(rep("H",11), rep("NH",8), rep("C",12))
df = data.frame(levels = obese, group = groups)

then we can vizualize it

ggplot(df, aes(x = group, y = levels)) +
  geom_boxplot(fill = "grey80", colour = "blue") +
  scale_x_discrete() + xlab("Treatment Group") +
  ylab("Plasma inorganic phosphate levels")

(a)

obese.fit = lm(obese ~ groups) #, family=gaussian)
summary(obese.fit)
## 
## Call:
## lm(formula = obese ~ groups)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.64545 -0.29148  0.01667  0.36667  1.25455 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.7833     0.1671  16.656 4.63e-16 ***
## groupsH       1.1621     0.2416   4.809 4.67e-05 ***
## groupsNH      0.6542     0.2642   2.476   0.0196 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5789 on 28 degrees of freedom
## Multiple R-squared:  0.4542, Adjusted R-squared:  0.4152 
## F-statistic: 11.65 on 2 and 28 DF,  p-value: 0.0002082
anova(obese.fit)  #means are different
## Analysis of Variance Table
## 
## Response: obese
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## groups     2 7.8083  3.9041  11.651 0.0002082 ***
## Residuals 28 9.3827  0.3351                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qf(.95, 2,28)
## [1] 3.340386

Since \(F=11.651\) is significant at the \(5\%\) level when compared with \(F(2,28)\) (see also p-value), we conclude that the group means differ.

(b)

pooled.sd = sqrt( (var(hyper)*(length(hyper)-1) + var(nonh)*(length(nonh)-1) + var(cont)*(length(cont)-1))/
                (length(hyper) + length(nonh) + length(cont) - 3 )  )
df2 = length(hyper) + length(nonh) + length(cont) - 3

lower.lim = mean(hyper) - mean(nonh) - qt(0.975, df2)*(pooled.sd *sqrt((1/11)+(1/8)))
lower.lim
## [1] -0.04302617
upper.lim = mean(hyper) - mean(nonh) + qt(0.975, df2)*(pooled.sd *sqrt((1/11)+(1/8)))
upper.lim
## [1] 1.058935

(c)

A first nice view of the residuals could be:

obs.mod = data.frame(Fitted = fitted(obese.fit),
  Residuals = resid(obese.fit), Treatment = df$group)

ggplot(obs.mod, aes(Fitted, Residuals, colour = Treatment)) + geom_point()

then we would like to check normality on the residuals just as we did last week. You could use qq-plot and test of normality for that.

6.6

First we set up the data

#gl(n, k, length = n*k, labels = seq_len(n), ordered = FALSE)
days = gl(2, 20, length=40, labels = c("day1", "day2"))
days
##  [1] day1 day1 day1 day1 day1 day1 day1 day1 day1 day1 day1 day1 day1 day1
## [15] day1 day1 day1 day1 day1 day1 day2 day2 day2 day2 day2 day2 day2 day2
## [29] day2 day2 day2 day2 day2 day2 day2 day2 day2 day2 day2 day2
## Levels: day1 day2
workers = gl(4,5, length=40,labels = c("1", "2","3", "4"))
workers
##  [1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
## [36] 4 4 4 4 4
## Levels: 1 2 3 4
wgts=c(35.7,37.1,36.7,37.7,35.3,38.4,37.2,38.1,36.9,37.2,
         34.9,34.3,34.5,33.7,36.2,37.1,35.5,36.5,36.0,33.8,
         34.7,35.2,34.6,36.4,35.2,36.9,38.5,36.4,37.8,36.1,
         32.0,35.2,33.5,32.9,33.3,35.8,32.9,35.7,38.0,36.1)
machine = data.frame(days,workers,wgts)
machine
##    days workers wgts
## 1  day1       1 35.7
## 2  day1       1 37.1
## 3  day1       1 36.7
## 4  day1       1 37.7
## 5  day1       1 35.3
## 6  day1       2 38.4
## 7  day1       2 37.2
## 8  day1       2 38.1
## 9  day1       2 36.9
## 10 day1       2 37.2
## 11 day1       3 34.9
## 12 day1       3 34.3
## 13 day1       3 34.5
## 14 day1       3 33.7
## 15 day1       3 36.2
## 16 day1       4 37.1
## 17 day1       4 35.5
## 18 day1       4 36.5
## 19 day1       4 36.0
## 20 day1       4 33.8
## 21 day2       1 34.7
## 22 day2       1 35.2
## 23 day2       1 34.6
## 24 day2       1 36.4
## 25 day2       1 35.2
## 26 day2       2 36.9
## 27 day2       2 38.5
## 28 day2       2 36.4
## 29 day2       2 37.8
## 30 day2       2 36.1
## 31 day2       3 32.0
## 32 day2       3 35.2
## 33 day2       3 33.5
## 34 day2       3 32.9
## 35 day2       3 33.3
## 36 day2       4 35.8
## 37 day2       4 32.9
## 38 day2       4 35.7
## 39 day2       4 38.0
## 40 day2       4 36.1

We now can plot it

ggplot(machine, aes(x = workers, y = wgts, fill = days)) +
  geom_boxplot(colour = "blue") +
  scale_x_discrete() 

machine.fit=lm(wgts ~ days*workers) #, family=gaussian)
summary(machine.fit)
## 
## Call:
## lm(formula = wgts ~ days * workers)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.800 -0.545 -0.020  0.615  2.300 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        36.5000     0.5012  72.822   <2e-16 ***
## daysday2           -1.2800     0.7088  -1.806   0.0804 .  
## workers2            1.0600     0.7088   1.495   0.1446    
## workers3           -1.7800     0.7088  -2.511   0.0173 *  
## workers4           -0.7200     0.7088  -1.016   0.3174    
## daysday2:workers2   0.8600     1.0025   0.858   0.3973    
## daysday2:workers3  -0.0600     1.0025  -0.060   0.9526    
## daysday2:workers4   1.2000     1.0025   1.197   0.2401    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.121 on 32 degrees of freedom
## Multiple R-squared:  0.613,  Adjusted R-squared:  0.5283 
## F-statistic:  7.24 on 7 and 32 DF,  p-value: 3.263e-05
anova(machine.fit)
## Analysis of Variance Table
## 
## Response: wgts
##              Df Sum Sq Mean Sq F value    Pr(>F)    
## days          1  6.084  6.0840  4.8435   0.03508 *  
## workers       3 54.622 18.2073 14.4948 3.895e-06 ***
## days:workers  3  2.958  0.9860  0.7850   0.51117    
## Residuals    32 40.196  1.2561                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can find significant difference between days and between workers but no significant interaction effect.

Now let´s explore more models, using the book software suggestions

m1 = glm(wgts ~ days*workers, family = gaussian, data = machine)
summary(m1)
## 
## Call:
## glm(formula = wgts ~ days * workers, family = gaussian, data = machine)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.800  -0.545  -0.020   0.615   2.300  
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        36.5000     0.5012  72.822   <2e-16 ***
## daysday2           -1.2800     0.7088  -1.806   0.0804 .  
## workers2            1.0600     0.7088   1.495   0.1446    
## workers3           -1.7800     0.7088  -2.511   0.0173 *  
## workers4           -0.7200     0.7088  -1.016   0.3174    
## daysday2:workers2   0.8600     1.0025   0.858   0.3973    
## daysday2:workers3  -0.0600     1.0025  -0.060   0.9526    
## daysday2:workers4   1.2000     1.0025   1.197   0.2401    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.256125)
## 
##     Null deviance: 103.860  on 39  degrees of freedom
## Residual deviance:  40.196  on 32  degrees of freedom
## AIC: 131.71
## 
## Number of Fisher Scoring iterations: 2
m2 = glm(wgts ~ days+workers, family = gaussian, data = machine)
summary(m2)
## 
## Call:
## glm(formula = wgts ~ days + workers, family = gaussian, data = machine)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4500  -0.6575  -0.1350   0.6825   2.6500  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.2500     0.3926  92.337  < 2e-16 ***
## daysday2     -0.7800     0.3511  -2.221  0.03289 *  
## workers2      1.4900     0.4966   3.001  0.00494 ** 
## workers3     -1.8100     0.4966  -3.645  0.00086 ***
## workers4     -0.1200     0.4966  -0.242  0.81046    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.232971)
## 
##     Null deviance: 103.860  on 39  degrees of freedom
## Residual deviance:  43.154  on 35  degrees of freedom
## AIC: 128.55
## 
## Number of Fisher Scoring iterations: 2
m3 = glm(wgts ~ workers, family = gaussian, data = machine)
summary(m3)
## 
## Call:
## glm(formula = wgts ~ workers, family = gaussian, data = machine)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.840  -0.660  -0.095   0.780   2.260  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  35.8600     0.3698  96.964  < 2e-16 ***
## workers2      1.4900     0.5230   2.849  0.00721 ** 
## workers3     -1.8100     0.5230  -3.461  0.00140 ** 
## workers4     -0.1200     0.5230  -0.229  0.81983    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.367722)
## 
##     Null deviance: 103.860  on 39  degrees of freedom
## Residual deviance:  49.238  on 36  degrees of freedom
## AIC: 131.83
## 
## Number of Fisher Scoring iterations: 2
m4 = glm(wgts ~ days, family = gaussian, data = machine)
summary(m4)
## 
## Call:
## glm(formula = wgts ~ days, family = gaussian, data = machine)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
##  -3.36   -0.94    0.20    1.04    3.14  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.1400     0.3587 100.758   <2e-16 ***
## daysday2     -0.7800     0.5073  -1.538    0.132    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 2.573053)
## 
##     Null deviance: 103.860  on 39  degrees of freedom
## Residual deviance:  97.776  on 38  degrees of freedom
## AIC: 155.27
## 
## Number of Fisher Scoring iterations: 2
m5 = glm(wgts ~ 1, family = gaussian, data = machine)
summary(m5)
## 
## Call:
## glm(formula = wgts ~ 1, family = gaussian, data = machine)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.750  -1.075   0.150   1.200   2.750  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   35.750      0.258   138.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 2.663077)
## 
##     Null deviance: 103.86  on 39  degrees of freedom
## Residual deviance: 103.86  on 39  degrees of freedom
## AIC: 155.68
## 
## Number of Fisher Scoring iterations: 2

With all this models we can compare, using for instance AIC, which model fits best and which variables should be taken in consideration whithin the model.

Chapter 7

7.1

First we sut up data

y <- Leukemia <- c(13, 5, 5, 3, 4, 18)
n <- total <- c(391, 205, 156, 50, 35, 51)
# Part (a) : first proposed model is binomial model with link function "logit".
n_y <- n - y
con.mat <- cbind(y, n_y)
x <- Radiation.dose <- c(0, 1, 10, 50, 100, 200)

res.glm1 <- glm(con.mat ~ x, family = binomial(link = "logit"))
res.glm1
## 
## Call:  glm(formula = con.mat ~ x, family = binomial(link = "logit"))
## 
## Coefficients:
## (Intercept)            x  
##    -3.48897      0.01441  
## 
## Degrees of Freedom: 5 Total (i.e. Null);  4 Residual
## Null Deviance:       54.35 
## Residual Deviance: 0.4321    AIC: 26.1
summary(glm(con.mat ~ x, family = binomial(link = "logit")))
## 
## Call:
## glm(formula = con.mat ~ x, family = binomial(link = "logit"))
## 
## Deviance Residuals: 
##        1         2         3         4         5         6  
##  0.41428  -0.48994  -0.13991   0.02835   0.00048   0.00269  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.488973   0.204062 -17.098  < 2e-16 ***
## x            0.014410   0.001817   7.932 2.15e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 54.35089  on 5  degrees of freedom
## Residual deviance:  0.43206  on 4  degrees of freedom
## AIC: 26.097
## 
## Number of Fisher Scoring iterations: 4
#----------------------------
beta1 <- - 3.48897 
beta2 <- 0.01441 

pihat <- 1 / (1 + exp( - (beta1 + beta2 * x)))
pihat
## [1] 0.02962770 0.03004481 0.03406355 0.05905197 0.11425766 0.35275069
fit_p <- c(fitted.values(res.glm1))
fit_p
##          1          2          3          4          5          6 
## 0.02962762 0.03004473 0.03406353 0.05905247 0.11425978 0.35276092
yhat <- total * pihat
round(yhat, 1)
## [1] 11.6  6.2  5.3  3.0  4.0 18.0
y
## [1] 13  5  5  3  4 18
# according to the page 136 we have
X2 <- sum(((y - yhat) ^ 2) / (yhat * (1 - pihat)))
X2
## [1] 0.4231947
N <- 6
p <- 2
df <- 6 - 2
shape <- df / 2
scale <- 1 / 2
pvalue <- pgamma(X2, shape, scale)
pvalue
## [1] 0.01946578
# This suggests that the model does not fit particulary well.
#--------------------------------------------------------------
# for the Hosmer-Lemeshow test according to page 142 line 3 we put
# the data in approximately equal total numbers of observations
# now, categories is as below
n <- total <- c(391, 205 + 156, 50 + 35 + 51)
y <- Leukemia <- c(13, 5 + 5, 3 + 4 + 18)
n_y <- n - y
con.mat <- cbind(y, n_y)
x <- Radiation.dose <- c(0, 1, 50)
glm(con.mat ~ x, family = binomial(link = "logit"))
## 
## Call:  glm(formula = con.mat ~ x, family = binomial(link = "logit"))
## 
## Coefficients:
## (Intercept)            x  
##    -3.47425      0.03965  
## 
## Degrees of Freedom: 2 Total (i.e. Null);  1 Residual
## Null Deviance:       38.17 
## Residual Deviance: 0.2869    AIC: 17.66
# summary(glm(con.mat ~ x, family = binomial(link = "logit")))
#----------------------------
beta1 <- - 3.47425
beta2 <- 0.03965 
pihat <- 1 / (1 + exp( - (beta1 + beta2 * x)))
pihat
## [1] 0.03005384 0.03123145 0.18365921
pi <- y / n
pi
## [1] 0.03324808 0.02770083 0.18382353
yhat <- total * pihat
round(yhat, 1)
## [1] 11.8 11.3 25.0
y
## [1] 13 10 25
# according to the page 136 we have
X2 <- sum(((y - yhat) ^ 2) / (yhat * (1 - pihat)))
X2
## [1] 0.2856101
N <- 3
p <- 2
df <- N - p
shape <- df / 2
scale <- 1 / 2
pvalue <- pgamma(X2, shape, scale)
pvalue
## [1] 0.4069525

This indicate a good fit.