Modelling merged dataset

Experimenting with ordinal linear regression modelling using the Immunization with Everything inc tax

#import data from ED's final merge which includes immunisation coverage, postcode, PHN, Electorate, SEIFA deciles and ranks and mean taxable income per postcode
everything <- read.csv("../cleaned_data/immunization_with_everything_taxation_update.csv")
str(everything)
## 'data.frame':    191163 obs. of  51 variables:
##  $ X.2                                        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ X.1                                        : int  1 7 9 10 14 17 25 45 48 49 ...
##  $ postcode                                   : int  800 800 800 800 800 800 800 800 800 800 ...
##  $ X                                          : int  163657 163658 163662 163663 163661 163665 163656 163660 163659 163654 ...
##  $ state.x                                    : Factor w/ 12 levels "ACT","NSW","NSW/ACT",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ year                                       : int  2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
##  $ age                                        : int  5 2 1 1 2 1 5 2 2 5 ...
##  $ pc_immun                                   : Factor w/ 9 levels "<70.0","70.0-74.9",..: 6 3 6 6 3 6 6 3 3 6 ...
##  $ caution                                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pc_immun_class                             : int  6 3 6 6 3 6 6 3 3 6 ...
##  $ PHN_code                                   : Factor w/ 31 levels "PHN101","PHN102",..: 30 30 30 30 30 30 30 30 30 30 ...
##  $ PHN_number                                 : int  701 701 701 701 701 701 701 701 701 701 ...
##  $ Index.type                                 : Factor w/ 4 levels "Index of Economic Resources",..: 4 3 3 2 4 4 1 1 2 3 ...
##  $ Time                                       : int  2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
##  $ Maximum.score.for.SA1s.in.area             : int  NA 1167 1167 1163 NA NA 1019 1019 1163 1167 ...
##  $ Minimum.score.for.SA1s.in.area             : int  842 914 914 1023 842 842 751 751 1023 914 ...
##  $ Rank.within.Australia                      : int  2243 2398 2398 2287 2243 2243 412 412 2287 2398 ...
##  $ Rank.within.Australia...Decile             : int  NA 10 10 9 NA NA 2 2 9 10 ...
##  $ Rank.within.Australia...Percentile         : int  86 92 92 87 86 86 16 16 87 92 ...
##  $ Rank.within.State.or.Territory             : int  28 33 33 33 28 28 18 18 33 33 ...
##  $ Rank.within.State.or.Territory...Decile    : int  8 10 10 10 8 8 6 6 10 10 ...
##  $ Rank.within.State.or.Territory...Percentile: int  NA 92 92 92 NA NA 51 51 92 92 ...
##  $ Score                                      : int  1066 1096 1096 1089 1066 1066 946 946 1089 1096 ...
##  $ Usual.resident.population                  : int  6464 6464 6464 6464 6464 6464 6464 6464 6464 6464 ...
##  $ Electoral.division                         : Factor w/ 150 levels "Adelaide","Aston",..: 136 136 136 136 136 136 136 136 136 136 ...
##  $ Per.cent.postcode.in.electorate            : num  100 100 100 100 100 100 100 100 100 100 ...
##  $ DivisionID2016                             : int  307 307 307 307 307 307 307 307 307 307 ...
##  $ StateAB2016                                : Factor w/ 8 levels "ACT","NSW","NT",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ CandidateID2016                            : int  28737 28737 28737 28737 28737 28737 28737 28737 28737 28737 ...
##  $ GivenNm2016                                : Factor w/ 97 levels "Adam","Alan",..: 56 56 56 56 56 56 56 56 56 56 ...
##  $ Surname2016                                : Factor w/ 135 levels "ABBOTT","ALBANESE",..: 51 51 51 51 51 51 51 51 51 51 ...
##  $ PartyNm2016                                : Factor w/ 8 levels "Australian Labor Party",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ PartyAb2016                                : Factor w/ 8 levels "ALP","GRN","IND",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ DivisionID2010                             : int  307 307 307 307 307 307 307 307 307 307 ...
##  $ StateAb2010                                : Factor w/ 8 levels "ACT","NSW","NT",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ CandidateID2010                            : int  21477 21477 21477 21477 21477 21477 21477 21477 21477 21477 ...
##  $ GivenNm2010                                : Factor w/ 108 levels "Adam","Alan",..: 74 74 74 74 74 74 74 74 74 74 ...
##  $ Surname2010                                : Factor w/ 137 levels "ABBOTT","ADAMS",..: 52 52 52 52 52 52 52 52 52 52 ...
##  $ PartyNm2010                                : Factor w/ 7 levels "Australian Labor Party",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ PartyAb2010                                : Factor w/ 7 levels "ALP","CLP","GRN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ DivisionID2013                             : int  307 307 307 307 307 307 307 307 307 307 ...
##  $ StateAb2013                                : Factor w/ 8 levels "ACT","NSW","NT",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ CandidateID2013                            : int  23502 23502 23502 23502 23502 23502 23502 23502 23502 23502 ...
##  $ GivenNm2013                                : Factor w/ 106 levels "Adam","Alan",..: 76 76 76 76 76 76 76 76 76 76 ...
##  $ Surname2013                                : Factor w/ 140 levels "ABBOTT","ALBANESE",..: 48 48 48 48 48 48 48 48 48 48 ...
##  $ PartyNm2013                                : Factor w/ 9 levels "Australian Labor Party",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ PartyAb2013                                : Factor w/ 9 levels "ALP","CLP","GRN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ state.y                                    : Factor w/ 8 levels "ACT","NSW","NT",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ num_individuals                            : int  5464 5464 5464 5464 5464 5464 5464 5464 5464 5464 ...
##  $ total_tax                                  : num  3.89e+08 3.89e+08 3.89e+08 3.89e+08 3.89e+08 ...
##  $ mean_tax                                   : int  71263 71263 71263 71263 71263 71263 71263 71263 71263 71263 ...
View(everything)
# * mergeddata - the source data frame we want to clean

# Returns - a data frame

mergeddata <- everything %>%
    select('state.x', 'postcode', 'year', 'age', 'pc_immun_class', 'PHN_number','Usual.resident.population','Rank.within.Australia...Decile', 'Electoral.division', 'mean_tax')

#selected these variables to mix postcode, age, immunisation classes, PHN, usual population, SEIFA rank within Australia in deciles, SEIFA score, Electorate and party details and mean tax of postcode.


#turn categorical variables into factors
mergeddata$postcode <- as.factor(mergeddata$postcode)
mergeddata$year <- as.factor(mergeddata$year)
mergeddata$age <- as.factor(mergeddata$age)
mergeddata$pc_immun_class <- as.factor(mergeddata$pc_immun_class)
str(mergeddata)
## 'data.frame':    191163 obs. of  10 variables:
##  $ state.x                       : Factor w/ 12 levels "ACT","NSW","NSW/ACT",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ postcode                      : Factor w/ 2533 levels "800","810","812",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year                          : Factor w/ 6 levels "2011","2012",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ age                           : Factor w/ 3 levels "1","2","5": 3 2 1 1 2 1 3 2 2 3 ...
##  $ pc_immun_class                : Factor w/ 9 levels "0","1","2","3",..: 7 4 7 7 4 7 7 4 4 7 ...
##  $ PHN_number                    : int  701 701 701 701 701 701 701 701 701 701 ...
##  $ Usual.resident.population     : int  6464 6464 6464 6464 6464 6464 6464 6464 6464 6464 ...
##  $ Rank.within.Australia...Decile: int  NA 10 10 9 NA NA 2 2 9 10 ...
##  $ Electoral.division            : Factor w/ 150 levels "Adelaide","Aston",..: 136 136 136 136 136 136 136 136 136 136 ...
##  $ mean_tax                      : int  71263 71263 71263 71263 71263 71263 71263 71263 71263 71263 ...
#cleaning the data
#Filter on age
#remove pc_immun(duplicates pc_immun_class)
#remove score (duplicates Rank.withn.Australia ... Decile)
#remove PartyNms but keep electorate
#discard caution for now

#NPs are useless to us so let's remove them, this means all class 0 won't be in this model

mergeddata <- mergeddata %>%
  filter(pc_immun_class != 0)

Let’s look at the age groups

one_year <- filter(mergeddata, age==1)
two_year <- filter(mergeddata, age==2)
five_year <- filter(mergeddata, age==5)

#THe ordinal package expects the dependent variable to be an ordered factor
one_year$pc_immun_class <- ordered(one_year$pc_immun_class)

#Divide mean_tax by a thousand to get lower numbers (polr doesn't seem to like the big numbers)
one_year <- transform(one_year, mean_tax_thousands = mean_tax/1000)

str(one_year)
## 'data.frame':    43696 obs. of  11 variables:
##  $ state.x                       : Factor w/ 12 levels "ACT","NSW","NSW/ACT",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ postcode                      : Factor w/ 2533 levels "800","810","812",..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ year                          : Factor w/ 6 levels "2011","2012",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ age                           : Factor w/ 3 levels "1","2","5": 1 1 1 1 1 1 1 1 1 1 ...
##  $ pc_immun_class                : Ord.factor w/ 8 levels "1"<"2"<"3"<"4"<..: 6 6 6 6 7 7 7 7 6 6 ...
##  $ PHN_number                    : int  701 701 701 701 701 701 701 701 701 701 ...
##  $ Usual.resident.population     : int  6464 6464 6464 6464 33302 33302 33302 33302 18873 NA ...
##  $ Rank.within.Australia...Decile: int  10 9 NA 2 9 8 6 NA 7 6 ...
##  $ Electoral.division            : Factor w/ 150 levels "Adelaide","Aston",..: 136 136 136 136 136 136 136 136 136 136 ...
##  $ mean_tax                      : int  71263 71263 71263 71263 64719 64719 64719 64719 63273 63273 ...
##  $ mean_tax_thousands            : num  71.3 71.3 71.3 71.3 64.7 ...
set.seed(42)
 
#split test and train 60:40
inTrain <- createDataPartition(y = one_year$pc_immun_class, p = .60, list = FALSE)
training <- one_year[inTrain,]
testing <- one_year[-inTrain,]
#use 3000 rows of data as training to stop computer timing out while we build a model
experiment <- head(training, 3000)

dim(experiment)
## [1] 3000   11
dim(testing)
## [1] 17475    11

Commentary around goals of prediction

I’m trying to predict PC_immun_class in 2017-18 or 2018-19 (with the last year of data in our set from 2016-17) The variables that could help build the predictive model could be: - Mean tax (changes each year) - Age group (changes each year) - Electoral division - SEIFA rank by decile - SEIFA score - Postcode - Usual resident population - PHN number - State

It’s likely this requires ordinal regression, as our target variable has an order. pc_immun_class is an ordinal predictor with 8 different ordered classes. 0 = NP, which we will remove in the first instance. We may have to impute them later to balance the model.

#see pc_immun_class counts by filtered age groups.
#for one year olds
ggplot(one_year)+geom_bar(aes(x=pc_immun_class))

# 6 and 7 are the dominant class for one year olds, with 8 close in count to 5
#for two year olds
ggplot(two_year)+geom_bar(aes(x=pc_immun_class))

# there are higher counts in class 5, but 6 & 7 are still the dominant class, with less in 8

Age could be a predictor for future years

#for five year olds
ggplot(five_year)+geom_bar(aes(x=pc_immun_class))

#5 year old has more even distribution across 6, 7 and 8.
#Ordinal regression model - because pc_immun_class has multiple categories and they have a specific order

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
model1 <- polr(pc_immun_class ~ Rank.within.Australia...Decile + mean_tax_thousands, data = training, Hess = TRUE)

#let's try the ordinal library
#library(ordinal)
#model <- clm(pc_immun_class ~ Rank.within.Australia...Decile + PHN_number, data = training)

summary(model1)
## Call:
## polr(formula = pc_immun_class ~ Rank.within.Australia...Decile + 
##     mean_tax_thousands, data = training, Hess = TRUE)
## 
## Coefficients:
##                                    Value Std. Error t value
## Rank.within.Australia...Decile  0.062797  0.0046361  13.545
## mean_tax_thousands             -0.005419  0.0008431  -6.428
## 
## Intercepts:
##     Value    Std. Error t value 
## 1|2  -6.2148   0.1492   -41.6435
## 2|3  -5.2387   0.0966   -54.2493
## 3|4  -4.1194   0.0637   -64.7054
## 4|5  -2.8095   0.0468   -59.9995
## 5|6  -1.0453   0.0405   -25.8087
## 6|7   0.1238   0.0400     3.0998
## 7|8   1.4826   0.0411    36.0755
## 
## Residual Deviance: 79196.05 
## AIC: 79214.05 
## (624 observations deleted due to missingness)

Now let’s try to assess model 1

coefs <- coef(model1)

coefs
## Rank.within.Australia...Decile             mean_tax_thousands 
##                    0.062796625                   -0.005419412
# Find the p-value for model 1's t-value of 13.545
pt(13.545, 400-3, lower.tail=FALSE)*2
## [1] 1.259296e-34

Try a second model, adding PHN_number

model2 <- polr(pc_immun_class ~ Rank.within.Australia...Decile + mean_tax_thousands + PHN_number, data = training, Hess = TRUE)

#let's try the ordinal library
#library(ordinal)
#model <- clm(pc_immun_class ~ Rank.within.Australia...Decile + PHN_number, data = training)

summary(model2)
## Call:
## polr(formula = pc_immun_class ~ Rank.within.Australia...Decile + 
##     mean_tax_thousands + PHN_number, data = training, Hess = TRUE)
## 
## Coefficients:
##                                     Value Std. Error t value
## Rank.within.Australia...Decile  0.0631193  4.639e-03  13.606
## mean_tax_thousands             -0.0053319  8.446e-04  -6.313
## PHN_number                     -0.0001657  7.063e-05  -2.345
## 
## Intercepts:
##     Value    Std. Error t value 
## 1|2  -6.1883   0.1447   -42.7702
## 2|3  -5.2488   0.0962   -54.5467
## 3|4  -4.1792   0.0661   -63.1963
## 4|5  -2.8529   0.0499   -57.1176
## 5|6  -1.0820   0.0440   -24.6055
## 6|7   0.0863   0.0434     1.9886
## 7|8   1.4483   0.0444    32.5903
## 
## Residual Deviance: 79191.03 
## AIC: 79211.03 
## (624 observations deleted due to missingness)
coefs <- coef(model2)

coefs
## Rank.within.Australia...Decile             mean_tax_thousands 
##                    0.063119255                   -0.005331933 
##                     PHN_number 
##                   -0.000165654

Try a third model, adding the continuous variable of usual resident population

model3 <- polr(pc_immun_class ~ Rank.within.Australia...Decile + mean_tax_thousands + PHN_number + Usual.resident.population, data = training, Hess = TRUE)

#let's try the ordinal library
#library(ordinal)
#model <- clm(pc_immun_class ~ Rank.within.Australia...Decile + PHN_number, data = training)

#summary(model3)
#when I introduce large numbers like "usual.resident.population, I get "Error in svd(x) : infinite or missing values in 'x'

Did not proceed with this modelling

It would take more time to try more features and deal with the errors. We chose to stick with binomial linear regression to make our group predictions