Data manipulation

# Data manipulation
smokedata<- read.table("smoking_data.csv",sep=",",header = T)
kable(smokedata)

age	smoke	pop	dead
40-44	cigarPipeOnly	145	2
60-64	cigarPipeOnly	846	113
45-59	cigarrettePlus	3030	169
50-54	cigarretteOnly	1851	187
70-74	no	668	179
40-44	cigarretteOnly	3410	124
45-59	cigarretteOnly	2239	140
70-74	cigarretteOnly	1195	432
80+	cigarPipeOnly	537	253
45-59	no	359	22
65-69	no	897	170
60-64	no	1067	117
75-79	cigarPipeOnly	667	243
80+	no	274	120
65-69	cigarrettePlus	3880	901
45-59	cigarPipeOnly	104	4
65-69	cigarPipeOnly	949	173
40-44	cigarrettePlus	4531	149
60-64	cigarretteOnly	3791	778
75-79	cigarretteOnly	436	214
65-69	cigarretteOnly	2421	689
55-59	no	632	55
80+	cigarretteOnly	113	63
80+	cigarrettePlus	345	189
70-74	cigarPipeOnly	824	212
70-74	cigarrettePlus	2033	613
40-44	no	656	18
50-54	cigarPipeOnly	98	3
55-59	cigarretteOnly	3270	514
50-54	no	249	19
75-79	no	361	120
75-79	cigarrettePlus	871	337
55-59	cigarPipeOnly	372	38
55-59	cigarrettePlus	4682	576
60-64	cigarrettePlus	6052	1001
50-54	cigarrettePlus	2267	193

levels(smokedata$age) <- gsub("45-59","45-49", levels(smokedata$age)) #### change the length of the levels 
factor(levels(smokedata$age))

##  [1]     80+    80+    40-44   45-49   50-54   55-59   60-64   65-69
##  [9]   70-74   75-79  40-44   45-49   50-54   55-59   60-64   65-69 
## [17]  70-74   75-79 
## 18 Levels:     80+    80+   40-44   45-49   50-54   55-59 ...  75-79

levels(smokedata$age) <- gsub(" ","", levels(smokedata$age)) ###get rid off the space for age factor
length(levels(smokedata$age))

## [1] 9

levels(smokedata$smoke) <- gsub(" ","", levels(smokedata$smoke)) ###get rid off the space for smokedata factor
smokedata$age = factor(smokedata$age,levels(smokedata$age)[c(2:9,1)])###reorder the age levels
###colnames(smokedata)<- c("Age","Smoking status","Population","n of death")
str(smokedata)

## 'data.frame':    36 obs. of  4 variables:
##  $ age  : Factor w/ 9 levels "40-44","45-49",..: 1 5 2 3 7 1 2 7 9 2 ...
##  $ smoke: Factor w/ 4 levels "no","cigarPipeOnly",..: 2 2 4 3 1 3 3 3 2 1 ...
##  $ pop  : int  145 846 3030 1851 668 3410 2239 1195 537 359 ...
##  $ dead : int  2 113 169 187 179 124 140 432 253 22 ...

Explanatory analysis

###################starting exploratory analysis
deadwithage<-ggplot(smokedata,aes(x=pop,y=dead,col=age))+geom_point(size=2)+
  scale_color_brewer(palette="YlOrRd")+
  theme_dark()+labs(colour="Age")+
  xlab("Population")+
  ylab("Number of Death")+
  ggtitle("Population and death with age as the colour",
          subtitle = "Plot 1")+
  theme(plot.title = element_text(hjust = 0.5))+
  theme(legend.text = 
          element_text(colour="burlywood2", size = 10, face = "bold"))+
  theme(legend.title = element_text(colour="darkseagreen1",
                                    size=12, face="bold"))+
  theme(legend.background = element_rect(fill="grey40", size=1,
                                         linetype="dotted"))+
  theme(plot.subtitle = element_text(size =15, colour="red3"))+
  theme(plot.title = element_text(hjust = 0.5,size = 15))
  


deadwithsmoke<-ggplot(smokedata,aes(x=pop,y=dead,col=smoke))+geom_point(size=2)+
  ##scale_color_gradientn(colours = rainbow(5))##change the colour by the scale
  scale_color_brewer(palette="Dark2")+
  labs(colour="Smoking status",caption="The describtion of 
       the factors are shown above")+
  xlab("Population")+
  ylab("Number of Death")+
  ggtitle("Population and death with smoking status as the colour",
          subtitle = "Plot 2")+
  theme(plot.title = element_text(hjust = 0.5,size = 15))+
  theme(plot.title = element_text(hjust = 0.5,size = 15))+
  theme(legend.text = 
               element_text(colour="salmon", size = 10, face = "bold"))+
  theme(legend.title = element_text(colour="maroon",
                                    size=12, face="bold"))+
  theme(legend.background = 
          element_rect(fill="cornsilk1", size=1, linetype="dotted"),
        legend.position = "top")+
  theme(plot.subtitle = element_text(size =15, colour="red3"))+
  theme(plot.caption  = element_text(colour="navyblue",size = 12.5))+
  theme(
  panel.background = element_rect(fill = "pink",
                                  colour = "lightblue",
                                  size = 0.5, linetype = "solid"),
  panel.grid.major = element_line(size = 0.5, linetype = 'solid',
                                  colour = "gray"), 
  panel.grid.minor = element_line(size = 0.25, linetype = 'solid',
                                  colour = "gray")
)
grid.arrange(deadwithage,deadwithsmoke,nrow=2,ncol=1)

################################################################

ggplotly(deadwithsmoke)

## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

###
###
###
########

The colour of the points are showing some patterns

############## box plot for age 
boxplotage<- ggboxplot(smokedata, x = "age", y = "dead",
          color = "age", palette = "lancet",fill = "age",notch = T)+
  # Add pairwise comparisons p-value
  stat_compare_means(label.y = c(1000, 1000,100000))+
  theme_light()+
  stat_compare_means(method = "anova", label.y = 900)+
  ggtitle("Boxplot for age against The number of death",
          subtitle = "Plot 3")+
  theme(plot.subtitle = element_text(size =15, colour="red3"))+
  theme(plot.title = element_text(hjust = 0.5,size = 15))+
  theme(
  panel.background = element_rect(fill = "lightblue",
                                  colour = "lightblue",
                                  size = 0.5, linetype = "solid"),
  panel.grid.major = element_line(size = 0.5, linetype = 'solid',
                                  colour = "white"), 
  panel.grid.minor = element_line(size = 0.25, linetype = 'solid',
                                  colour = "white")
)

############## box plot for smoke with individual comparsions 
#####
mycomparisons <- list( c("no", "cigarPipeOnly"), c("no","cigarretteOnly"),
                      c("no", "cigarrettePlus"),
                      c("cigarPipeOnly","cigarretteOnly"),
                      c("cigarPipeOnly","cigarrettePlus"),
                      c("cigarretteOnly","cigarrettePlus"))

boxplotsmoke<-ggboxplot(smokedata, x = "smoke", y = "dead",
          color = "smoke", palette = "lancet",fill = "smoke",notch = F)+
  stat_compare_means(label.y = 650)+
  # Add pairwise comparisons p-value
  stat_compare_means(comparisons=mycomparisons,label.y = c(950, 1050,1150,900))+
  theme_light()+
  stat_compare_means(method = "anova", label.y = 750)+
  ggtitle("Boxplot for smoke status against the number of death",
          subtitle = "Plot 4")+
  theme(plot.subtitle = element_text(size =15, colour="red3"))+
  stat_summary(fun.y=mean, geom="line", aes(group=1))+
  stat_boxplot(geom ='errorbar',col="deeppink4")+
  theme(plot.title = element_text(hjust = 0.5,size = 15))+
  ylim(0,1200)
####################

#######################################################

########################################
grid.arrange(boxplotage,boxplotsmoke)

## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

For P values< 0.5 indicates the means are statistically different.

modelling

The GLM poisson model

par(mfrow=c(2,2))
model<- glm(data=smokedata,dead~age+smoke,family = "poisson",offset = log(pop))
summary(model)

## 
## Call:
## glm(formula = dead ~ age + smoke, family = "poisson", data = smokedata, 
##     offset = log(pop))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.06055  -0.54773   0.06431   0.29963   1.48348  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -3.68002    0.06824 -53.929  < 2e-16 ***
## age45-49             0.55388    0.07999   6.924 4.38e-12 ***
## age50-54             0.98039    0.07682  12.762  < 2e-16 ***
## age55-59             1.37946    0.06526  21.138  < 2e-16 ***
## age60-64             1.65423    0.06257  26.439  < 2e-16 ***
## age65-69             1.99817    0.06279  31.824  < 2e-16 ***
## age70-74             2.27141    0.06435  35.296  < 2e-16 ***
## age75-79             2.55858    0.06778  37.746  < 2e-16 ***
## age80+               2.84692    0.07242  39.310  < 2e-16 ***
## smokecigarPipeOnly   0.04781    0.04699   1.017    0.309    
## smokecigarretteOnly  0.41696    0.03991  10.447  < 2e-16 ***
## smokecigarrettePlus  0.21796    0.03869   5.633 1.77e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 4055.984  on 35  degrees of freedom
## Residual deviance:   21.487  on 24  degrees of freedom
## AIC: 285.51
## 
## Number of Fisher Scoring iterations: 4

anova(model)

## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: dead
## 
## Terms added sequentially (first to last)
## 
## 
##       Df Deviance Resid. Df Resid. Dev
## NULL                     35     4056.0
## age    8   3864.3        27      191.7
## smoke  3    170.2        24       21.5

plot(model)

For a GLM poisson model, the assumptions need to be meet,although the assumptions of normality errors and constant variance are met. we also need to check for dispersion by fitting a Quasipossion model. There are many different ways of testing for dispersion.

Quasipoisson model

model1<- glm(data=smokedata,dead~age+smoke,offset=log(pop),family = "quasipoisson")
summary(model1)############slightly underdispersion, suggest poisson is ok to use

## 
## Call:
## glm(formula = dead ~ age + smoke, family = "quasipoisson", data = smokedata, 
##     offset = log(pop))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.06055  -0.54773   0.06431   0.29963   1.48348  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -3.68002    0.06325 -58.182  < 2e-16 ***
## age45-49             0.55388    0.07414   7.471 1.04e-07 ***
## age50-54             0.98039    0.07120  13.769 6.90e-13 ***
## age55-59             1.37946    0.06049  22.805  < 2e-16 ***
## age60-64             1.65423    0.05799  28.524  < 2e-16 ***
## age65-69             1.99817    0.05820  34.334  < 2e-16 ***
## age70-74             2.27141    0.05965  38.079  < 2e-16 ***
## age75-79             2.55858    0.06283  40.723  < 2e-16 ***
## age80+               2.84692    0.06713  42.410  < 2e-16 ***
## smokecigarPipeOnly   0.04781    0.04356   1.098    0.283    
## smokecigarretteOnly  0.41696    0.03699  11.271 4.52e-11 ***
## smokecigarrettePlus  0.21796    0.03587   6.077 2.82e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 0.8591398)
## 
##     Null deviance: 4055.984  on 35  degrees of freedom
## Residual deviance:   21.487  on 24  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4

par(mfrow=c(2,2))
plot(model1)

Slightly underdispersion, suggesting poisson is ok to use

Testing the fitness of the poisson model Greater than 0.05 indicating the model fits well

pchisq(model$deviance, df=model$df.residual)

## [1] 0.390128

checking the difference between predicted and actual by all equal function

all.equal(smokedata$dead,exp(predict(model)))

## [1] "names for current but not for target"
## [2] "Mean relative difference: 0.0276166"

very small margine of difference between the predicted actual

Lets see the difference between the predicted and the observed

exp(predict(model, newdata = data.frame(smoke = "no", age = "70-74", pop = 668)))

##        1 
## 163.3132

newdata<-smokedata[c("age","smoke","pop")]
newdata$pred.dead<- exp(predict(model))
newdata$dead<- smokedata$dead
####preding the whole data
ggplot(newdata,aes(y=pred.dead,x=seq_along(newdata$pred.dead),col="black"))+
  geom_line()+
  geom_line(data=newdata,aes(y=dead,x=seq_along(dead),colour="green"))+
  xlab("Row number")+
  ylab("Number of dead")+
  #scale_colour_manual(values=c("black","green"))+
  #labs(colour="Line colours")+
  scale_color_discrete(name = "Line colours",
                       labels = c("Predicted", "Actual"))+
  ggtitle("Predicted and Actual line plot",
          subtitle = "Plot 5")+
  theme(plot.subtitle = element_text(size =15, colour="red3"))+
  theme(plot.title = element_text(hjust = 0.5,size = 15))

###the red is predicted and blue is actual, they are very similar to each other

Note that if predicted and actual are identical may cause overfitting, which suggests the model may not predict accuratly with another set of data.

Regression analysis of the number of death by lung diease with Age&Smoke

Data manipulation

Explanatory analysis

modelling

The GLM poisson model

Quasipoisson model