Data manipulation


# Data manipulation
smokedata<- read.table("smoking_data.csv",sep=",",header = T)
kable(smokedata)
age smoke pop dead
40-44 cigarPipeOnly 145 2
60-64 cigarPipeOnly 846 113
45-59 cigarrettePlus 3030 169
50-54 cigarretteOnly 1851 187
70-74 no 668 179
40-44 cigarretteOnly 3410 124
45-59 cigarretteOnly 2239 140
70-74 cigarretteOnly 1195 432
80+ cigarPipeOnly 537 253
45-59 no 359 22
65-69 no 897 170
60-64 no 1067 117
75-79 cigarPipeOnly 667 243
80+ no 274 120
65-69 cigarrettePlus 3880 901
45-59 cigarPipeOnly 104 4
65-69 cigarPipeOnly 949 173
40-44 cigarrettePlus 4531 149
60-64 cigarretteOnly 3791 778
75-79 cigarretteOnly 436 214
65-69 cigarretteOnly 2421 689
55-59 no 632 55
80+ cigarretteOnly 113 63
80+ cigarrettePlus 345 189
70-74 cigarPipeOnly 824 212
70-74 cigarrettePlus 2033 613
40-44 no 656 18
50-54 cigarPipeOnly 98 3
55-59 cigarretteOnly 3270 514
50-54 no 249 19
75-79 no 361 120
75-79 cigarrettePlus 871 337
55-59 cigarPipeOnly 372 38
55-59 cigarrettePlus 4682 576
60-64 cigarrettePlus 6052 1001
50-54 cigarrettePlus 2267 193
levels(smokedata$age) <- gsub("45-59","45-49", levels(smokedata$age)) #### change the length of the levels 
factor(levels(smokedata$age))
##  [1]     80+    80+    40-44   45-49   50-54   55-59   60-64   65-69
##  [9]   70-74   75-79  40-44   45-49   50-54   55-59   60-64   65-69 
## [17]  70-74   75-79 
## 18 Levels:     80+    80+   40-44   45-49   50-54   55-59 ...  75-79
levels(smokedata$age) <- gsub(" ","", levels(smokedata$age)) ###get rid off the space for age factor
length(levels(smokedata$age))
## [1] 9
levels(smokedata$smoke) <- gsub(" ","", levels(smokedata$smoke)) ###get rid off the space for smokedata factor
smokedata$age = factor(smokedata$age,levels(smokedata$age)[c(2:9,1)])###reorder the age levels
###colnames(smokedata)<- c("Age","Smoking status","Population","n of death")
str(smokedata)
## 'data.frame':    36 obs. of  4 variables:
##  $ age  : Factor w/ 9 levels "40-44","45-49",..: 1 5 2 3 7 1 2 7 9 2 ...
##  $ smoke: Factor w/ 4 levels "no","cigarPipeOnly",..: 2 2 4 3 1 3 3 3 2 1 ...
##  $ pop  : int  145 846 3030 1851 668 3410 2239 1195 537 359 ...
##  $ dead : int  2 113 169 187 179 124 140 432 253 22 ...

Explanatory analysis


###################starting exploratory analysis
deadwithage<-ggplot(smokedata,aes(x=pop,y=dead,col=age))+geom_point(size=2)+
  scale_color_brewer(palette="YlOrRd")+
  theme_dark()+labs(colour="Age")+
  xlab("Population")+
  ylab("Number of Death")+
  ggtitle("Population and death with age as the colour",
          subtitle = "Plot 1")+
  theme(plot.title = element_text(hjust = 0.5))+
  theme(legend.text = 
          element_text(colour="burlywood2", size = 10, face = "bold"))+
  theme(legend.title = element_text(colour="darkseagreen1",
                                    size=12, face="bold"))+
  theme(legend.background = element_rect(fill="grey40", size=1,
                                         linetype="dotted"))+
  theme(plot.subtitle = element_text(size =15, colour="red3"))+
  theme(plot.title = element_text(hjust = 0.5,size = 15))
  


deadwithsmoke<-ggplot(smokedata,aes(x=pop,y=dead,col=smoke))+geom_point(size=2)+
  ##scale_color_gradientn(colours = rainbow(5))##change the colour by the scale
  scale_color_brewer(palette="Dark2")+
  labs(colour="Smoking status",caption="The describtion of 
       the factors are shown above")+
  xlab("Population")+
  ylab("Number of Death")+
  ggtitle("Population and death with smoking status as the colour",
          subtitle = "Plot 2")+
  theme(plot.title = element_text(hjust = 0.5,size = 15))+
  theme(plot.title = element_text(hjust = 0.5,size = 15))+
  theme(legend.text = 
               element_text(colour="salmon", size = 10, face = "bold"))+
  theme(legend.title = element_text(colour="maroon",
                                    size=12, face="bold"))+
  theme(legend.background = 
          element_rect(fill="cornsilk1", size=1, linetype="dotted"),
        legend.position = "top")+
  theme(plot.subtitle = element_text(size =15, colour="red3"))+
  theme(plot.caption  = element_text(colour="navyblue",size = 12.5))+
  theme(
  panel.background = element_rect(fill = "pink",
                                  colour = "lightblue",
                                  size = 0.5, linetype = "solid"),
  panel.grid.major = element_line(size = 0.5, linetype = 'solid',
                                  colour = "gray"), 
  panel.grid.minor = element_line(size = 0.25, linetype = 'solid',
                                  colour = "gray")
)
grid.arrange(deadwithage,deadwithsmoke,nrow=2,ncol=1)

################################################################

ggplotly(deadwithsmoke)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
###
###
###
########

The colour of the points are showing some patterns

############## box plot for age 
boxplotage<- ggboxplot(smokedata, x = "age", y = "dead",
          color = "age", palette = "lancet",fill = "age",notch = T)+
  # Add pairwise comparisons p-value
  stat_compare_means(label.y = c(1000, 1000,100000))+
  theme_light()+
  stat_compare_means(method = "anova", label.y = 900)+
  ggtitle("Boxplot for age against The number of death",
          subtitle = "Plot 3")+
  theme(plot.subtitle = element_text(size =15, colour="red3"))+
  theme(plot.title = element_text(hjust = 0.5,size = 15))+
  theme(
  panel.background = element_rect(fill = "lightblue",
                                  colour = "lightblue",
                                  size = 0.5, linetype = "solid"),
  panel.grid.major = element_line(size = 0.5, linetype = 'solid',
                                  colour = "white"), 
  panel.grid.minor = element_line(size = 0.25, linetype = 'solid',
                                  colour = "white")
)

############## box plot for smoke with individual comparsions 
#####
mycomparisons <- list( c("no", "cigarPipeOnly"), c("no","cigarretteOnly"),
                      c("no", "cigarrettePlus"),
                      c("cigarPipeOnly","cigarretteOnly"),
                      c("cigarPipeOnly","cigarrettePlus"),
                      c("cigarretteOnly","cigarrettePlus"))

boxplotsmoke<-ggboxplot(smokedata, x = "smoke", y = "dead",
          color = "smoke", palette = "lancet",fill = "smoke",notch = F)+
  stat_compare_means(label.y = 650)+
  # Add pairwise comparisons p-value
  stat_compare_means(comparisons=mycomparisons,label.y = c(950, 1050,1150,900))+
  theme_light()+
  stat_compare_means(method = "anova", label.y = 750)+
  ggtitle("Boxplot for smoke status against the number of death",
          subtitle = "Plot 4")+
  theme(plot.subtitle = element_text(size =15, colour="red3"))+
  stat_summary(fun.y=mean, geom="line", aes(group=1))+
  stat_boxplot(geom ='errorbar',col="deeppink4")+
  theme(plot.title = element_text(hjust = 0.5,size = 15))+
  ylim(0,1200)
####################

#######################################################

########################################
grid.arrange(boxplotage,boxplotsmoke) 
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

For P values< 0.5 indicates the means are statistically different.

modelling


The GLM poisson model

par(mfrow=c(2,2))
model<- glm(data=smokedata,dead~age+smoke,family = "poisson",offset = log(pop))
summary(model)
## 
## Call:
## glm(formula = dead ~ age + smoke, family = "poisson", data = smokedata, 
##     offset = log(pop))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.06055  -0.54773   0.06431   0.29963   1.48348  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -3.68002    0.06824 -53.929  < 2e-16 ***
## age45-49             0.55388    0.07999   6.924 4.38e-12 ***
## age50-54             0.98039    0.07682  12.762  < 2e-16 ***
## age55-59             1.37946    0.06526  21.138  < 2e-16 ***
## age60-64             1.65423    0.06257  26.439  < 2e-16 ***
## age65-69             1.99817    0.06279  31.824  < 2e-16 ***
## age70-74             2.27141    0.06435  35.296  < 2e-16 ***
## age75-79             2.55858    0.06778  37.746  < 2e-16 ***
## age80+               2.84692    0.07242  39.310  < 2e-16 ***
## smokecigarPipeOnly   0.04781    0.04699   1.017    0.309    
## smokecigarretteOnly  0.41696    0.03991  10.447  < 2e-16 ***
## smokecigarrettePlus  0.21796    0.03869   5.633 1.77e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 4055.984  on 35  degrees of freedom
## Residual deviance:   21.487  on 24  degrees of freedom
## AIC: 285.51
## 
## Number of Fisher Scoring iterations: 4
anova(model)
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: dead
## 
## Terms added sequentially (first to last)
## 
## 
##       Df Deviance Resid. Df Resid. Dev
## NULL                     35     4056.0
## age    8   3864.3        27      191.7
## smoke  3    170.2        24       21.5
plot(model)

For a GLM poisson model, the assumptions need to be meet,although the assumptions of normality errors and constant variance are met. we also need to check for dispersion by fitting a Quasipossion model. There are many different ways of testing for dispersion.

Quasipoisson model

model1<- glm(data=smokedata,dead~age+smoke,offset=log(pop),family = "quasipoisson")
summary(model1)############slightly underdispersion, suggest poisson is ok to use 
## 
## Call:
## glm(formula = dead ~ age + smoke, family = "quasipoisson", data = smokedata, 
##     offset = log(pop))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.06055  -0.54773   0.06431   0.29963   1.48348  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -3.68002    0.06325 -58.182  < 2e-16 ***
## age45-49             0.55388    0.07414   7.471 1.04e-07 ***
## age50-54             0.98039    0.07120  13.769 6.90e-13 ***
## age55-59             1.37946    0.06049  22.805  < 2e-16 ***
## age60-64             1.65423    0.05799  28.524  < 2e-16 ***
## age65-69             1.99817    0.05820  34.334  < 2e-16 ***
## age70-74             2.27141    0.05965  38.079  < 2e-16 ***
## age75-79             2.55858    0.06283  40.723  < 2e-16 ***
## age80+               2.84692    0.06713  42.410  < 2e-16 ***
## smokecigarPipeOnly   0.04781    0.04356   1.098    0.283    
## smokecigarretteOnly  0.41696    0.03699  11.271 4.52e-11 ***
## smokecigarrettePlus  0.21796    0.03587   6.077 2.82e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 0.8591398)
## 
##     Null deviance: 4055.984  on 35  degrees of freedom
## Residual deviance:   21.487  on 24  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4
par(mfrow=c(2,2))
plot(model1)

Slightly underdispersion, suggesting poisson is ok to use

Testing the fitness of the poisson model Greater than 0.05 indicating the model fits well

pchisq(model$deviance, df=model$df.residual)
## [1] 0.390128

checking the difference between predicted and actual by all equal function

all.equal(smokedata$dead,exp(predict(model)))
## [1] "names for current but not for target"
## [2] "Mean relative difference: 0.0276166"

very small margine of difference between the predicted actual

Lets see the difference between the predicted and the observed

exp(predict(model, newdata = data.frame(smoke = "no", age = "70-74", pop = 668)))
##        1 
## 163.3132
newdata<-smokedata[c("age","smoke","pop")]
newdata$pred.dead<- exp(predict(model))
newdata$dead<- smokedata$dead
####preding the whole data
ggplot(newdata,aes(y=pred.dead,x=seq_along(newdata$pred.dead),col="black"))+
  geom_line()+
  geom_line(data=newdata,aes(y=dead,x=seq_along(dead),colour="green"))+
  xlab("Row number")+
  ylab("Number of dead")+
  #scale_colour_manual(values=c("black","green"))+
  #labs(colour="Line colours")+
  scale_color_discrete(name = "Line colours",
                       labels = c("Predicted", "Actual"))+
  ggtitle("Predicted and Actual line plot",
          subtitle = "Plot 5")+
  theme(plot.subtitle = element_text(size =15, colour="red3"))+
  theme(plot.title = element_text(hjust = 0.5,size = 15))

###the red is predicted and blue is actual, they are very similar to each other

Note that if predicted and actual are identical may cause overfitting, which suggests the model may not predict accuratly with another set of data.