# Data manipulation
smokedata<- read.table("smoking_data.csv",sep=",",header = T)
kable(smokedata)
| age | smoke | pop | dead |
|---|---|---|---|
| 40-44 | cigarPipeOnly | 145 | 2 |
| 60-64 | cigarPipeOnly | 846 | 113 |
| 45-59 | cigarrettePlus | 3030 | 169 |
| 50-54 | cigarretteOnly | 1851 | 187 |
| 70-74 | no | 668 | 179 |
| 40-44 | cigarretteOnly | 3410 | 124 |
| 45-59 | cigarretteOnly | 2239 | 140 |
| 70-74 | cigarretteOnly | 1195 | 432 |
| 80+ | cigarPipeOnly | 537 | 253 |
| 45-59 | no | 359 | 22 |
| 65-69 | no | 897 | 170 |
| 60-64 | no | 1067 | 117 |
| 75-79 | cigarPipeOnly | 667 | 243 |
| 80+ | no | 274 | 120 |
| 65-69 | cigarrettePlus | 3880 | 901 |
| 45-59 | cigarPipeOnly | 104 | 4 |
| 65-69 | cigarPipeOnly | 949 | 173 |
| 40-44 | cigarrettePlus | 4531 | 149 |
| 60-64 | cigarretteOnly | 3791 | 778 |
| 75-79 | cigarretteOnly | 436 | 214 |
| 65-69 | cigarretteOnly | 2421 | 689 |
| 55-59 | no | 632 | 55 |
| 80+ | cigarretteOnly | 113 | 63 |
| 80+ | cigarrettePlus | 345 | 189 |
| 70-74 | cigarPipeOnly | 824 | 212 |
| 70-74 | cigarrettePlus | 2033 | 613 |
| 40-44 | no | 656 | 18 |
| 50-54 | cigarPipeOnly | 98 | 3 |
| 55-59 | cigarretteOnly | 3270 | 514 |
| 50-54 | no | 249 | 19 |
| 75-79 | no | 361 | 120 |
| 75-79 | cigarrettePlus | 871 | 337 |
| 55-59 | cigarPipeOnly | 372 | 38 |
| 55-59 | cigarrettePlus | 4682 | 576 |
| 60-64 | cigarrettePlus | 6052 | 1001 |
| 50-54 | cigarrettePlus | 2267 | 193 |
levels(smokedata$age) <- gsub("45-59","45-49", levels(smokedata$age)) #### change the length of the levels
factor(levels(smokedata$age))
## [1] 80+ 80+ 40-44 45-49 50-54 55-59 60-64 65-69
## [9] 70-74 75-79 40-44 45-49 50-54 55-59 60-64 65-69
## [17] 70-74 75-79
## 18 Levels: 80+ 80+ 40-44 45-49 50-54 55-59 ... 75-79
levels(smokedata$age) <- gsub(" ","", levels(smokedata$age)) ###get rid off the space for age factor
length(levels(smokedata$age))
## [1] 9
levels(smokedata$smoke) <- gsub(" ","", levels(smokedata$smoke)) ###get rid off the space for smokedata factor
smokedata$age = factor(smokedata$age,levels(smokedata$age)[c(2:9,1)])###reorder the age levels
###colnames(smokedata)<- c("Age","Smoking status","Population","n of death")
str(smokedata)
## 'data.frame': 36 obs. of 4 variables:
## $ age : Factor w/ 9 levels "40-44","45-49",..: 1 5 2 3 7 1 2 7 9 2 ...
## $ smoke: Factor w/ 4 levels "no","cigarPipeOnly",..: 2 2 4 3 1 3 3 3 2 1 ...
## $ pop : int 145 846 3030 1851 668 3410 2239 1195 537 359 ...
## $ dead : int 2 113 169 187 179 124 140 432 253 22 ...
###################starting exploratory analysis
deadwithage<-ggplot(smokedata,aes(x=pop,y=dead,col=age))+geom_point(size=2)+
scale_color_brewer(palette="YlOrRd")+
theme_dark()+labs(colour="Age")+
xlab("Population")+
ylab("Number of Death")+
ggtitle("Population and death with age as the colour",
subtitle = "Plot 1")+
theme(plot.title = element_text(hjust = 0.5))+
theme(legend.text =
element_text(colour="burlywood2", size = 10, face = "bold"))+
theme(legend.title = element_text(colour="darkseagreen1",
size=12, face="bold"))+
theme(legend.background = element_rect(fill="grey40", size=1,
linetype="dotted"))+
theme(plot.subtitle = element_text(size =15, colour="red3"))+
theme(plot.title = element_text(hjust = 0.5,size = 15))
deadwithsmoke<-ggplot(smokedata,aes(x=pop,y=dead,col=smoke))+geom_point(size=2)+
##scale_color_gradientn(colours = rainbow(5))##change the colour by the scale
scale_color_brewer(palette="Dark2")+
labs(colour="Smoking status",caption="The describtion of
the factors are shown above")+
xlab("Population")+
ylab("Number of Death")+
ggtitle("Population and death with smoking status as the colour",
subtitle = "Plot 2")+
theme(plot.title = element_text(hjust = 0.5,size = 15))+
theme(plot.title = element_text(hjust = 0.5,size = 15))+
theme(legend.text =
element_text(colour="salmon", size = 10, face = "bold"))+
theme(legend.title = element_text(colour="maroon",
size=12, face="bold"))+
theme(legend.background =
element_rect(fill="cornsilk1", size=1, linetype="dotted"),
legend.position = "top")+
theme(plot.subtitle = element_text(size =15, colour="red3"))+
theme(plot.caption = element_text(colour="navyblue",size = 12.5))+
theme(
panel.background = element_rect(fill = "pink",
colour = "lightblue",
size = 0.5, linetype = "solid"),
panel.grid.major = element_line(size = 0.5, linetype = 'solid',
colour = "gray"),
panel.grid.minor = element_line(size = 0.25, linetype = 'solid',
colour = "gray")
)
grid.arrange(deadwithage,deadwithsmoke,nrow=2,ncol=1)
################################################################
ggplotly(deadwithsmoke)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
###
###
###
########
The colour of the points are showing some patterns
############## box plot for age
boxplotage<- ggboxplot(smokedata, x = "age", y = "dead",
color = "age", palette = "lancet",fill = "age",notch = T)+
# Add pairwise comparisons p-value
stat_compare_means(label.y = c(1000, 1000,100000))+
theme_light()+
stat_compare_means(method = "anova", label.y = 900)+
ggtitle("Boxplot for age against The number of death",
subtitle = "Plot 3")+
theme(plot.subtitle = element_text(size =15, colour="red3"))+
theme(plot.title = element_text(hjust = 0.5,size = 15))+
theme(
panel.background = element_rect(fill = "lightblue",
colour = "lightblue",
size = 0.5, linetype = "solid"),
panel.grid.major = element_line(size = 0.5, linetype = 'solid',
colour = "white"),
panel.grid.minor = element_line(size = 0.25, linetype = 'solid',
colour = "white")
)
############## box plot for smoke with individual comparsions
#####
mycomparisons <- list( c("no", "cigarPipeOnly"), c("no","cigarretteOnly"),
c("no", "cigarrettePlus"),
c("cigarPipeOnly","cigarretteOnly"),
c("cigarPipeOnly","cigarrettePlus"),
c("cigarretteOnly","cigarrettePlus"))
boxplotsmoke<-ggboxplot(smokedata, x = "smoke", y = "dead",
color = "smoke", palette = "lancet",fill = "smoke",notch = F)+
stat_compare_means(label.y = 650)+
# Add pairwise comparisons p-value
stat_compare_means(comparisons=mycomparisons,label.y = c(950, 1050,1150,900))+
theme_light()+
stat_compare_means(method = "anova", label.y = 750)+
ggtitle("Boxplot for smoke status against the number of death",
subtitle = "Plot 4")+
theme(plot.subtitle = element_text(size =15, colour="red3"))+
stat_summary(fun.y=mean, geom="line", aes(group=1))+
stat_boxplot(geom ='errorbar',col="deeppink4")+
theme(plot.title = element_text(hjust = 0.5,size = 15))+
ylim(0,1200)
####################
#######################################################
########################################
grid.arrange(boxplotage,boxplotsmoke)
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
For P values< 0.5 indicates the means are statistically different.
par(mfrow=c(2,2))
model<- glm(data=smokedata,dead~age+smoke,family = "poisson",offset = log(pop))
summary(model)
##
## Call:
## glm(formula = dead ~ age + smoke, family = "poisson", data = smokedata,
## offset = log(pop))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.06055 -0.54773 0.06431 0.29963 1.48348
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.68002 0.06824 -53.929 < 2e-16 ***
## age45-49 0.55388 0.07999 6.924 4.38e-12 ***
## age50-54 0.98039 0.07682 12.762 < 2e-16 ***
## age55-59 1.37946 0.06526 21.138 < 2e-16 ***
## age60-64 1.65423 0.06257 26.439 < 2e-16 ***
## age65-69 1.99817 0.06279 31.824 < 2e-16 ***
## age70-74 2.27141 0.06435 35.296 < 2e-16 ***
## age75-79 2.55858 0.06778 37.746 < 2e-16 ***
## age80+ 2.84692 0.07242 39.310 < 2e-16 ***
## smokecigarPipeOnly 0.04781 0.04699 1.017 0.309
## smokecigarretteOnly 0.41696 0.03991 10.447 < 2e-16 ***
## smokecigarrettePlus 0.21796 0.03869 5.633 1.77e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 4055.984 on 35 degrees of freedom
## Residual deviance: 21.487 on 24 degrees of freedom
## AIC: 285.51
##
## Number of Fisher Scoring iterations: 4
anova(model)
## Analysis of Deviance Table
##
## Model: poisson, link: log
##
## Response: dead
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 35 4056.0
## age 8 3864.3 27 191.7
## smoke 3 170.2 24 21.5
plot(model)
For a GLM poisson model, the assumptions need to be meet,although the assumptions of normality errors and constant variance are met. we also need to check for dispersion by fitting a Quasipossion model. There are many different ways of testing for dispersion.
model1<- glm(data=smokedata,dead~age+smoke,offset=log(pop),family = "quasipoisson")
summary(model1)############slightly underdispersion, suggest poisson is ok to use
##
## Call:
## glm(formula = dead ~ age + smoke, family = "quasipoisson", data = smokedata,
## offset = log(pop))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.06055 -0.54773 0.06431 0.29963 1.48348
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.68002 0.06325 -58.182 < 2e-16 ***
## age45-49 0.55388 0.07414 7.471 1.04e-07 ***
## age50-54 0.98039 0.07120 13.769 6.90e-13 ***
## age55-59 1.37946 0.06049 22.805 < 2e-16 ***
## age60-64 1.65423 0.05799 28.524 < 2e-16 ***
## age65-69 1.99817 0.05820 34.334 < 2e-16 ***
## age70-74 2.27141 0.05965 38.079 < 2e-16 ***
## age75-79 2.55858 0.06283 40.723 < 2e-16 ***
## age80+ 2.84692 0.06713 42.410 < 2e-16 ***
## smokecigarPipeOnly 0.04781 0.04356 1.098 0.283
## smokecigarretteOnly 0.41696 0.03699 11.271 4.52e-11 ***
## smokecigarrettePlus 0.21796 0.03587 6.077 2.82e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 0.8591398)
##
## Null deviance: 4055.984 on 35 degrees of freedom
## Residual deviance: 21.487 on 24 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 4
par(mfrow=c(2,2))
plot(model1)
Slightly underdispersion, suggesting poisson is ok to use
Testing the fitness of the poisson model Greater than 0.05 indicating the model fits well
pchisq(model$deviance, df=model$df.residual)
## [1] 0.390128
checking the difference between predicted and actual by all equal function
all.equal(smokedata$dead,exp(predict(model)))
## [1] "names for current but not for target"
## [2] "Mean relative difference: 0.0276166"
very small margine of difference between the predicted actual
Lets see the difference between the predicted and the observed
exp(predict(model, newdata = data.frame(smoke = "no", age = "70-74", pop = 668)))
## 1
## 163.3132
newdata<-smokedata[c("age","smoke","pop")]
newdata$pred.dead<- exp(predict(model))
newdata$dead<- smokedata$dead
####preding the whole data
ggplot(newdata,aes(y=pred.dead,x=seq_along(newdata$pred.dead),col="black"))+
geom_line()+
geom_line(data=newdata,aes(y=dead,x=seq_along(dead),colour="green"))+
xlab("Row number")+
ylab("Number of dead")+
#scale_colour_manual(values=c("black","green"))+
#labs(colour="Line colours")+
scale_color_discrete(name = "Line colours",
labels = c("Predicted", "Actual"))+
ggtitle("Predicted and Actual line plot",
subtitle = "Plot 5")+
theme(plot.subtitle = element_text(size =15, colour="red3"))+
theme(plot.title = element_text(hjust = 0.5,size = 15))
###the red is predicted and blue is actual, they are very similar to each other
Note that if predicted and actual are identical may cause overfitting, which suggests the model may not predict accuratly with another set of data.