period=c(1:6)
refined=c(32.0,31.2,27.0,21.0,14.9,8.8)
manufactured=c(16.3,23.1,23.6,27.7,34.6,33.9)
plot(period, refined, type="o", col="blue")
fit.ref = glm(refined ~ period, family=gaussian)
coef(fit.ref)
## (Intercept) period
## 39.573333 -4.882857
confint(fit.ref, level = 0.95)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 35.850894 43.295773
## period -5.838691 -3.927023
fit.ref.lm = lm(refined ~ period)
summary(fit.ref.lm)
##
## Call:
## lm(formula = refined ~ period)
##
## Residuals:
## 1 2 3 4 5 6
## -2.6905 1.3924 2.0752 0.9581 -0.2590 -1.4762
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.5733 1.8992 20.84 3.13e-05 ***
## period -4.8829 0.4877 -10.01 0.000559 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.04 on 4 degrees of freedom
## Multiple R-squared: 0.9616, Adjusted R-squared: 0.952
## F-statistic: 100.2 on 1 and 4 DF, p-value: 0.0005593
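To see the fit, we can overlay the estimated regression line on the scatter plot; a minimal sketch (the red colour is an arbitrary choice):
plot(period, refined, type = "o", col = "blue")
abline(fit.ref.lm, col = "red")  # fitted line: refined = 39.57 - 4.88 * period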
plot(period,manufactured, type="o", col="green")
fit.man = glm(manufactured ~ period, family=gaussian)
coef(fit.man)
## (Intercept) period
## 13.873333 3.617143
confint(fit.man, level = 0.95)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 10.128601 17.618066
## period 2.655584 4.578701
sugar.total = refined + manufactured
plot(period,sugar.total, type="o", col="green")
fit.tot = glm(sugar.total ~ period, family=gaussian)
coef(fit.tot)
## (Intercept) period
## 53.446667 -1.265714
confint(fit.tot, level = 0.95)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 47.454998 59.4383351
## period -2.804233 0.2728042
fit.tot.lm = lm(sugar.total ~ period)
summary(fit.tot.lm)
##
## Call:
## lm(formula = sugar.total ~ period)
##
## Residuals:
## 1 2 3 4 5 6
## -3.8810 3.3848 0.9505 0.3162 2.3819 -3.1524
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.447 3.057 17.483 6.28e-05 ***
## period -1.266 0.785 -1.612 0.182
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.284 on 4 degrees of freedom
## Multiple R-squared: 0.3939, Adjusted R-squared: 0.2424
## F-statistic: 2.6 on 1 and 4 DF, p-value: 0.1822
Here we have a complete summary of both models, with their estimates and confidence intervals.
The question asks us to test the hypothesis that total sugar consumption did not change over time, i.e., whether the model including time is better than the simplest (intercept-only) model. We test this with the F test: since the p-value (0.182) is large, we cannot reject the hypothesis that total sugar consumption did not change over time. Consistently, the 95% confidence interval for the period coefficient, (-2.80, 0.27), contains zero.
A complete interpretation of the output above can be found at http://blog.yhat.com/posts/r-lm-summary.html.
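To make that F test explicit, we can fit the intercept-only model and compare it with the model including time; a minimal sketch (fit.tot.null is a name introduced here), which should reproduce the F statistic and p-value reported in the summary above:
fit.tot.null = lm(sugar.total ~ 1)  # no time trend: constant mean consumption
anova(fit.tot.null, fit.tot.lm)     # F = 2.6 on 1 and 4 df, p = 0.182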
As always, first set up the data
hyper=c(2.3,4.1,4.2,4.0,4.6,4.6,3.8,5.2,3.1,3.7,3.8)
nonh=c(3.0,4.1,3.9,3.1,3.3,2.9,3.3,3.9)
cont=c(3.0,2.6,3.1,2.2,2.1,2.4,2.8,3.4,2.9,2.6,3.1,3.2)
obese =c(hyper, nonh, cont)
groups=c(rep("H",11), rep("NH",8), rep("C",12))
df = data.frame(levels = obese, group = groups)
Then we can visualize it
library(ggplot2)  # ggplot() is used below
ggplot(df, aes(x = group, y = levels)) +
  geom_boxplot(fill = "grey80", colour = "blue") +
  scale_x_discrete() + xlab("Treatment Group") +
  ylab("Plasma inorganic phosphate levels")
obese.fit = lm(obese ~ groups)  # equivalent to glm(obese ~ groups, family = gaussian)
summary(obese.fit)
##
## Call:
## lm(formula = obese ~ groups)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.64545 -0.29148 0.01667 0.36667 1.25455
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.7833 0.1671 16.656 4.63e-16 ***
## groupsH 1.1621 0.2416 4.809 4.67e-05 ***
## groupsNH 0.6542 0.2642 2.476 0.0196 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5789 on 28 degrees of freedom
## Multiple R-squared: 0.4542, Adjusted R-squared: 0.4152
## F-statistic: 11.65 on 2 and 28 DF, p-value: 0.0002082
anova(obese.fit)  # ANOVA table; the group means differ
## Analysis of Variance Table
##
## Response: obese
## Df Sum Sq Mean Sq F value Pr(>F)
## groups 2 7.8083 3.9041 11.651 0.0002082 ***
## Residuals 28 9.3827 0.3351
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qf(.95, 2,28)
## [1] 3.340386
Since \(F=11.651\) exceeds the \(5\%\) critical value \(3.34\) of the \(F(2,28)\) distribution (see also the p-value), we conclude that the group means differ.
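The p-value itself is the upper-tail probability of the \(F(2,28)\) distribution at the observed statistic; a one-line check:
pf(11.651, 2, 28, lower.tail = FALSE)  # ~ 0.000208, matching Pr(>F) in the ANOVA table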
pooled.sd = sqrt( (var(hyper)*(length(hyper)-1) + var(nonh)*(length(nonh)-1) + var(cont)*(length(cont)-1))/
(length(hyper) + length(nonh) + length(cont) - 3 ) )
df2 = length(hyper) + length(nonh) + length(cont) - 3
lower.lim = mean(hyper) - mean(nonh) - qt(0.975, df2)*(pooled.sd *sqrt((1/11)+(1/8)))
lower.lim
## [1] -0.04302617
upper.lim = mean(hyper) - mean(nonh) + qt(0.975, df2)*(pooled.sd *sqrt((1/11)+(1/8)))
upper.lim
## [1] 1.058935
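Note that pooled.sd is simply the residual standard error (0.5789) from the model summary, i.e. the square root of the residual mean square in the ANOVA table. As an alternative to building the interval by hand, Tukey's HSD gives all pairwise intervals at once, adjusted for multiple comparisons (so they are somewhat wider than the single unadjusted interval above); a sketch, with obese.aov a name introduced here:
obese.aov = aov(levels ~ group, data = df)
TukeyHSD(obese.aov, conf.level = 0.95)  # simultaneous 95% CIs for all pairwise differences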
A first useful view of the residuals plots them against the fitted values, coloured by treatment group:
obs.mod = data.frame(Fitted = fitted(obese.fit),
Residuals = resid(obese.fit), Treatment = df$group)
ggplot(obs.mod, aes(Fitted, Residuals, colour = Treatment)) + geom_point()
Then we would like to check normality of the residuals, just as we did last week; a QQ-plot and a formal normality test both work, as sketched below.
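A minimal sketch of both checks on the residuals:
qqnorm(resid(obese.fit)); qqline(resid(obese.fit))  # points close to the line support normality
shapiro.test(resid(obese.fit))                      # Shapiro-Wilk test of normality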
First we set up the data
#gl(n, k, length = n*k, labels = seq_len(n), ordered = FALSE)
days = gl(2, 20, length=40, labels = c("day1", "day2"))
days
## [1] day1 day1 day1 day1 day1 day1 day1 day1 day1 day1 day1 day1 day1 day1
## [15] day1 day1 day1 day1 day1 day1 day2 day2 day2 day2 day2 day2 day2 day2
## [29] day2 day2 day2 day2 day2 day2 day2 day2 day2 day2 day2 day2
## Levels: day1 day2
workers = gl(4,5, length=40,labels = c("1", "2","3", "4"))
workers
## [1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
## [36] 4 4 4 4 4
## Levels: 1 2 3 4
wgts=c(35.7,37.1,36.7,37.7,35.3,38.4,37.2,38.1,36.9,37.2,
34.9,34.3,34.5,33.7,36.2,37.1,35.5,36.5,36.0,33.8,
34.7,35.2,34.6,36.4,35.2,36.9,38.5,36.4,37.8,36.1,
32.0,35.2,33.5,32.9,33.3,35.8,32.9,35.7,38.0,36.1)
machine = data.frame(days,workers,wgts)
machine
## days workers wgts
## 1 day1 1 35.7
## 2 day1 1 37.1
## 3 day1 1 36.7
## 4 day1 1 37.7
## 5 day1 1 35.3
## 6 day1 2 38.4
## 7 day1 2 37.2
## 8 day1 2 38.1
## 9 day1 2 36.9
## 10 day1 2 37.2
## 11 day1 3 34.9
## 12 day1 3 34.3
## 13 day1 3 34.5
## 14 day1 3 33.7
## 15 day1 3 36.2
## 16 day1 4 37.1
## 17 day1 4 35.5
## 18 day1 4 36.5
## 19 day1 4 36.0
## 20 day1 4 33.8
## 21 day2 1 34.7
## 22 day2 1 35.2
## 23 day2 1 34.6
## 24 day2 1 36.4
## 25 day2 1 35.2
## 26 day2 2 36.9
## 27 day2 2 38.5
## 28 day2 2 36.4
## 29 day2 2 37.8
## 30 day2 2 36.1
## 31 day2 3 32.0
## 32 day2 3 35.2
## 33 day2 3 33.5
## 34 day2 3 32.9
## 35 day2 3 33.3
## 36 day2 4 35.8
## 37 day2 4 32.9
## 38 day2 4 35.7
## 39 day2 4 38.0
## 40 day2 4 36.1
We can now plot it
ggplot(machine, aes(x = workers, y = wgts, fill = days)) +
geom_boxplot(colour = "blue") +
scale_x_discrete()
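Another standard view for a two-way layout is an interaction plot of the group means per worker and day; roughly parallel lines suggest little interaction. A sketch using base R:
with(machine, interaction.plot(workers, days, wgts, xlab = "worker", ylab = "mean weight"))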
machine.fit = lm(wgts ~ days*workers)  # equivalent to glm(wgts ~ days*workers, family = gaussian)
summary(machine.fit)
##
## Call:
## lm(formula = wgts ~ days * workers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.800 -0.545 -0.020 0.615 2.300
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.5000 0.5012 72.822 <2e-16 ***
## daysday2 -1.2800 0.7088 -1.806 0.0804 .
## workers2 1.0600 0.7088 1.495 0.1446
## workers3 -1.7800 0.7088 -2.511 0.0173 *
## workers4 -0.7200 0.7088 -1.016 0.3174
## daysday2:workers2 0.8600 1.0025 0.858 0.3973
## daysday2:workers3 -0.0600 1.0025 -0.060 0.9526
## daysday2:workers4 1.2000 1.0025 1.197 0.2401
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.121 on 32 degrees of freedom
## Multiple R-squared: 0.613, Adjusted R-squared: 0.5283
## F-statistic: 7.24 on 7 and 32 DF, p-value: 3.263e-05
anova(machine.fit)
## Analysis of Variance Table
##
## Response: wgts
## Df Sum Sq Mean Sq F value Pr(>F)
## days 1 6.084 6.0840 4.8435 0.03508 *
## workers 3 54.622 18.2073 14.4948 3.895e-06 ***
## days:workers 3 2.958 0.9860 0.7850 0.51117
## Residuals 32 40.196 1.2561
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We find significant differences between days and between workers, but no significant interaction effect.
Now let's explore more models, following the software suggestions in the book
m1 = glm(wgts ~ days*workers, family = gaussian, data = machine)
summary(m1)
##
## Call:
## glm(formula = wgts ~ days * workers, family = gaussian, data = machine)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.800 -0.545 -0.020 0.615 2.300
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.5000 0.5012 72.822 <2e-16 ***
## daysday2 -1.2800 0.7088 -1.806 0.0804 .
## workers2 1.0600 0.7088 1.495 0.1446
## workers3 -1.7800 0.7088 -2.511 0.0173 *
## workers4 -0.7200 0.7088 -1.016 0.3174
## daysday2:workers2 0.8600 1.0025 0.858 0.3973
## daysday2:workers3 -0.0600 1.0025 -0.060 0.9526
## daysday2:workers4 1.2000 1.0025 1.197 0.2401
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.256125)
##
## Null deviance: 103.860 on 39 degrees of freedom
## Residual deviance: 40.196 on 32 degrees of freedom
## AIC: 131.71
##
## Number of Fisher Scoring iterations: 2
m2 = glm(wgts ~ days+workers, family = gaussian, data = machine)
summary(m2)
##
## Call:
## glm(formula = wgts ~ days + workers, family = gaussian, data = machine)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4500 -0.6575 -0.1350 0.6825 2.6500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.2500 0.3926 92.337 < 2e-16 ***
## daysday2 -0.7800 0.3511 -2.221 0.03289 *
## workers2 1.4900 0.4966 3.001 0.00494 **
## workers3 -1.8100 0.4966 -3.645 0.00086 ***
## workers4 -0.1200 0.4966 -0.242 0.81046
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.232971)
##
## Null deviance: 103.860 on 39 degrees of freedom
## Residual deviance: 43.154 on 35 degrees of freedom
## AIC: 128.55
##
## Number of Fisher Scoring iterations: 2
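Since m2 is nested in m1 (it drops the interaction), the interaction can also be tested by comparing the two deviances; a sketch, which should reproduce the days:workers F test from the ANOVA table above:
anova(m2, m1, test = "F")  # F = (2.958/3)/1.2561 ~ 0.785 on 3 and 32 df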
m3 = glm(wgts ~ workers, family = gaussian, data = machine)
summary(m3)
##
## Call:
## glm(formula = wgts ~ workers, family = gaussian, data = machine)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.840 -0.660 -0.095 0.780 2.260
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.8600 0.3698 96.964 < 2e-16 ***
## workers2 1.4900 0.5230 2.849 0.00721 **
## workers3 -1.8100 0.5230 -3.461 0.00140 **
## workers4 -0.1200 0.5230 -0.229 0.81983
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.367722)
##
## Null deviance: 103.860 on 39 degrees of freedom
## Residual deviance: 49.238 on 36 degrees of freedom
## AIC: 131.83
##
## Number of Fisher Scoring iterations: 2
m4 = glm(wgts ~ days, family = gaussian, data = machine)
summary(m4)
##
## Call:
## glm(formula = wgts ~ days, family = gaussian, data = machine)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.36 -0.94 0.20 1.04 3.14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.1400 0.3587 100.758 <2e-16 ***
## daysday2 -0.7800 0.5073 -1.538 0.132
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 2.573053)
##
## Null deviance: 103.860 on 39 degrees of freedom
## Residual deviance: 97.776 on 38 degrees of freedom
## AIC: 155.27
##
## Number of Fisher Scoring iterations: 2
m5 = glm(wgts ~ 1, family = gaussian, data = machine)
summary(m5)
##
## Call:
## glm(formula = wgts ~ 1, family = gaussian, data = machine)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.750 -1.075 0.150 1.200 2.750
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.750 0.258 138.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 2.663077)
##
## Null deviance: 103.86 on 39 degrees of freedom
## Residual deviance: 103.86 on 39 degrees of freedom
## AIC: 155.68
##
## Number of Fisher Scoring iterations: 2
With all these models in hand we can compare, using for instance AIC, which model fits best and hence which variables should be included in the model.
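A minimal sketch of that comparison, collecting the AICs already reported in the summaries above; m2 (both main effects, no interaction) comes out smallest:
AIC(m1, m2, m3, m4, m5)  # m2: 128.55 vs 131.71 (m1), 131.83 (m3), 155.27 (m4), 155.68 (m5)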
First we set up the data
y <- Leukemia <- c(13, 5, 5, 3, 4, 18)
n <- total <- c(391, 205, 156, 50, 35, 51)
# Part (a): the first proposed model is a binomial model with the "logit" link function.
n_y <- n - y
con.mat <- cbind(y, n_y)
x <- Radiation.dose <- c(0, 1, 10, 50, 100, 200)
res.glm1 <- glm(con.mat ~ x, family = binomial(link = "logit"))
res.glm1
##
## Call: glm(formula = con.mat ~ x, family = binomial(link = "logit"))
##
## Coefficients:
## (Intercept) x
## -3.48897 0.01441
##
## Degrees of Freedom: 5 Total (i.e. Null); 4 Residual
## Null Deviance: 54.35
## Residual Deviance: 0.4321 AIC: 26.1
summary(glm(con.mat ~ x, family = binomial(link = "logit")))
##
## Call:
## glm(formula = con.mat ~ x, family = binomial(link = "logit"))
##
## Deviance Residuals:
## 1 2 3 4 5 6
## 0.41428 -0.48994 -0.13991 0.02835 0.00048 0.00269
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.488973 0.204062 -17.098 < 2e-16 ***
## x 0.014410 0.001817 7.932 2.15e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 54.35089 on 5 degrees of freedom
## Residual deviance: 0.43206 on 4 degrees of freedom
## AIC: 26.097
##
## Number of Fisher Scoring iterations: 4
#----------------------------
beta1 <- - 3.48897
beta2 <- 0.01441
pihat <- 1 / (1 + exp( - (beta1 + beta2 * x)))
pihat
## [1] 0.02962770 0.03004481 0.03406355 0.05905197 0.11425766 0.35275069
fit_p <- c(fitted.values(res.glm1))
fit_p
## 1 2 3 4 5 6
## 0.02962762 0.03004473 0.03406353 0.05905247 0.11425978 0.35276092
yhat <- total * pihat
round(yhat, 1)
## [1] 11.6 6.2 5.3 3.0 4.0 18.0
y
## [1] 13 5 5 3 4 18
# following page 136, we have
X2 <- sum(((y - yhat) ^ 2) / (yhat * (1 - pihat)))
X2
## [1] 0.4231947
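The same statistic is available from R's Pearson residuals: for grouped binomial data the summand \((y-\hat{y})^2/(\hat{y}(1-\hat{\pi}))\) is exactly a squared Pearson residual, so the sum below should agree with X2 up to the rounding of beta1 and beta2:
sum(residuals(res.glm1, type = "pearson")^2)  # Pearson chi-squared statistic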
N <- 6
p <- 2
df <- N - p
shape <- df / 2
rate <- 1 / 2  # third argument of pgamma() is the rate, so chi-squared(df) = Gamma(df/2, rate = 1/2)
# the goodness-of-fit p-value is the upper-tail probability P(chi-squared(df) >= X2)
pvalue <- pgamma(X2, shape, rate, lower.tail = FALSE)
pvalue
## [1] 0.9805342
# The large p-value gives no evidence of lack of fit (in line with the tiny
# residual deviance, 0.432 on 4 df), so the model fits well.
#--------------------------------------------------------------
# for the Hosmer-Lemeshow test, following page 142, we group the data
# into categories with approximately equal total numbers of observations;
# the regrouped data are set up below
n <- total <- c(391, 205 + 156, 50 + 35 + 51)
y <- Leukemia <- c(13, 5 + 5, 3 + 4 + 18)
n_y <- n - y
con.mat <- cbind(y, n_y)
x <- Radiation.dose <- c(0, 1, 50)
glm(con.mat ~ x, family = binomial(link = "logit"))
##
## Call: glm(formula = con.mat ~ x, family = binomial(link = "logit"))
##
## Coefficients:
## (Intercept) x
## -3.47425 0.03965
##
## Degrees of Freedom: 2 Total (i.e. Null); 1 Residual
## Null Deviance: 38.17
## Residual Deviance: 0.2869 AIC: 17.66
# summary(glm(con.mat ~ x, family = binomial(link = "logit")))
#----------------------------
beta1 <- - 3.47425
beta2 <- 0.03965
pihat <- 1 / (1 + exp( - (beta1 + beta2 * x)))
pihat
## [1] 0.03005384 0.03123145 0.18365921
pi.obs <- y / n  # observed proportions (renamed so we do not mask the built-in constant pi)
pi.obs
## [1] 0.03324808 0.02770083 0.18382353
yhat <- total * pihat
round(yhat, 1)
## [1] 11.8 11.3 25.0
y
## [1] 13 10 25
# following page 136, we have
X2 <- sum(((y - yhat) ^ 2) / (yhat * (1 - pihat)))
X2
## [1] 0.2856101
N <- 3
p <- 2
df <- N - p
shape <- df / 2
rate <- 1 / 2
pvalue <- pgamma(X2, shape, rate, lower.tail = FALSE)  # upper tail of chi-squared(1)
pvalue
## [1] 0.5930475
This large p-value indicates a good fit.