#import data from ED's final merge which includes immunisation coverage, postcode, PHN, Electorate, SEIFA deciles and ranks and mean taxable income per postcode
everything <- read.csv("../cleaned_data/immunization_with_everything_taxation_update.csv")
str(everything)
## 'data.frame': 191163 obs. of 51 variables:
## $ X.2 : int 1 2 3 4 5 6 7 8 9 10 ...
## $ X.1 : int 1 7 9 10 14 17 25 45 48 49 ...
## $ postcode : int 800 800 800 800 800 800 800 800 800 800 ...
## $ X : int 163657 163658 163662 163663 163661 163665 163656 163660 163659 163654 ...
## $ state.x : Factor w/ 12 levels "ACT","NSW","NSW/ACT",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ year : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
## $ age : int 5 2 1 1 2 1 5 2 2 5 ...
## $ pc_immun : Factor w/ 9 levels "<70.0","70.0-74.9",..: 6 3 6 6 3 6 6 3 3 6 ...
## $ caution : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pc_immun_class : int 6 3 6 6 3 6 6 3 3 6 ...
## $ PHN_code : Factor w/ 31 levels "PHN101","PHN102",..: 30 30 30 30 30 30 30 30 30 30 ...
## $ PHN_number : int 701 701 701 701 701 701 701 701 701 701 ...
## $ Index.type : Factor w/ 4 levels "Index of Economic Resources",..: 4 3 3 2 4 4 1 1 2 3 ...
## $ Time : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
## $ Maximum.score.for.SA1s.in.area : int NA 1167 1167 1163 NA NA 1019 1019 1163 1167 ...
## $ Minimum.score.for.SA1s.in.area : int 842 914 914 1023 842 842 751 751 1023 914 ...
## $ Rank.within.Australia : int 2243 2398 2398 2287 2243 2243 412 412 2287 2398 ...
## $ Rank.within.Australia...Decile : int NA 10 10 9 NA NA 2 2 9 10 ...
## $ Rank.within.Australia...Percentile : int 86 92 92 87 86 86 16 16 87 92 ...
## $ Rank.within.State.or.Territory : int 28 33 33 33 28 28 18 18 33 33 ...
## $ Rank.within.State.or.Territory...Decile : int 8 10 10 10 8 8 6 6 10 10 ...
## $ Rank.within.State.or.Territory...Percentile: int NA 92 92 92 NA NA 51 51 92 92 ...
## $ Score : int 1066 1096 1096 1089 1066 1066 946 946 1089 1096 ...
## $ Usual.resident.population : int 6464 6464 6464 6464 6464 6464 6464 6464 6464 6464 ...
## $ Electoral.division : Factor w/ 150 levels "Adelaide","Aston",..: 136 136 136 136 136 136 136 136 136 136 ...
## $ Per.cent.postcode.in.electorate : num 100 100 100 100 100 100 100 100 100 100 ...
## $ DivisionID2016 : int 307 307 307 307 307 307 307 307 307 307 ...
## $ StateAB2016 : Factor w/ 8 levels "ACT","NSW","NT",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ CandidateID2016 : int 28737 28737 28737 28737 28737 28737 28737 28737 28737 28737 ...
## $ GivenNm2016 : Factor w/ 97 levels "Adam","Alan",..: 56 56 56 56 56 56 56 56 56 56 ...
## $ Surname2016 : Factor w/ 135 levels "ABBOTT","ALBANESE",..: 51 51 51 51 51 51 51 51 51 51 ...
## $ PartyNm2016 : Factor w/ 8 levels "Australian Labor Party",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ PartyAb2016 : Factor w/ 8 levels "ALP","GRN","IND",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ DivisionID2010 : int 307 307 307 307 307 307 307 307 307 307 ...
## $ StateAb2010 : Factor w/ 8 levels "ACT","NSW","NT",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ CandidateID2010 : int 21477 21477 21477 21477 21477 21477 21477 21477 21477 21477 ...
## $ GivenNm2010 : Factor w/ 108 levels "Adam","Alan",..: 74 74 74 74 74 74 74 74 74 74 ...
## $ Surname2010 : Factor w/ 137 levels "ABBOTT","ADAMS",..: 52 52 52 52 52 52 52 52 52 52 ...
## $ PartyNm2010 : Factor w/ 7 levels "Australian Labor Party",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ PartyAb2010 : Factor w/ 7 levels "ALP","CLP","GRN",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ DivisionID2013 : int 307 307 307 307 307 307 307 307 307 307 ...
## $ StateAb2013 : Factor w/ 8 levels "ACT","NSW","NT",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ CandidateID2013 : int 23502 23502 23502 23502 23502 23502 23502 23502 23502 23502 ...
## $ GivenNm2013 : Factor w/ 106 levels "Adam","Alan",..: 76 76 76 76 76 76 76 76 76 76 ...
## $ Surname2013 : Factor w/ 140 levels "ABBOTT","ALBANESE",..: 48 48 48 48 48 48 48 48 48 48 ...
## $ PartyNm2013 : Factor w/ 9 levels "Australian Labor Party",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ PartyAb2013 : Factor w/ 9 levels "ALP","CLP","GRN",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ state.y : Factor w/ 8 levels "ACT","NSW","NT",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ num_individuals : int 5464 5464 5464 5464 5464 5464 5464 5464 5464 5464 ...
## $ total_tax : num 3.89e+08 3.89e+08 3.89e+08 3.89e+08 3.89e+08 ...
## $ mean_tax : int 71263 71263 71263 71263 71263 71263 71263 71263 71263 71263 ...
View(everything)
# * mergeddata - the source data frame we want to clean
# Returns - a data frame
mergeddata <- everything %>%
select('state.x', 'postcode', 'year', 'age', 'pc_immun_class', 'PHN_number','Usual.resident.population','Rank.within.Australia...Decile', 'Electoral.division', 'mean_tax')
#selected these variables to mix postcode, age, immunisation classes, PHN, usual population, SEIFA rank within Australia in deciles, SEIFA score, Electorate and party details and mean tax of postcode.
#turn categorical variables into factors
mergeddata$postcode <- as.factor(mergeddata$postcode)
mergeddata$year <- as.factor(mergeddata$year)
mergeddata$age <- as.factor(mergeddata$age)
mergeddata$pc_immun_class <- as.factor(mergeddata$pc_immun_class)
str(mergeddata)
## 'data.frame': 191163 obs. of 10 variables:
## $ state.x : Factor w/ 12 levels "ACT","NSW","NSW/ACT",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ postcode : Factor w/ 2533 levels "800","810","812",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : Factor w/ 6 levels "2011","2012",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ age : Factor w/ 3 levels "1","2","5": 3 2 1 1 2 1 3 2 2 3 ...
## $ pc_immun_class : Factor w/ 9 levels "0","1","2","3",..: 7 4 7 7 4 7 7 4 4 7 ...
## $ PHN_number : int 701 701 701 701 701 701 701 701 701 701 ...
## $ Usual.resident.population : int 6464 6464 6464 6464 6464 6464 6464 6464 6464 6464 ...
## $ Rank.within.Australia...Decile: int NA 10 10 9 NA NA 2 2 9 10 ...
## $ Electoral.division : Factor w/ 150 levels "Adelaide","Aston",..: 136 136 136 136 136 136 136 136 136 136 ...
## $ mean_tax : int 71263 71263 71263 71263 71263 71263 71263 71263 71263 71263 ...
#cleaning the data
#Filter on age
#remove pc_immun(duplicates pc_immun_class)
#remove score (duplicates Rank.withn.Australia ... Decile)
#remove PartyNms but keep electorate
#discard caution for now
#NPs are useless to us so let's remove them, this means all class 0 won't be in this model
mergeddata <- mergeddata %>%
filter(pc_immun_class != 0)
one_year <- filter(mergeddata, age==1)
two_year <- filter(mergeddata, age==2)
five_year <- filter(mergeddata, age==5)
#THe ordinal package expects the dependent variable to be an ordered factor
one_year$pc_immun_class <- ordered(one_year$pc_immun_class)
#Divide mean_tax by a thousand to get lower numbers (polr doesn't seem to like the big numbers)
one_year <- transform(one_year, mean_tax_thousands = mean_tax/1000)
str(one_year)
## 'data.frame': 43696 obs. of 11 variables:
## $ state.x : Factor w/ 12 levels "ACT","NSW","NSW/ACT",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ postcode : Factor w/ 2533 levels "800","810","812",..: 1 1 1 1 2 2 2 2 3 3 ...
## $ year : Factor w/ 6 levels "2011","2012",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ age : Factor w/ 3 levels "1","2","5": 1 1 1 1 1 1 1 1 1 1 ...
## $ pc_immun_class : Ord.factor w/ 8 levels "1"<"2"<"3"<"4"<..: 6 6 6 6 7 7 7 7 6 6 ...
## $ PHN_number : int 701 701 701 701 701 701 701 701 701 701 ...
## $ Usual.resident.population : int 6464 6464 6464 6464 33302 33302 33302 33302 18873 NA ...
## $ Rank.within.Australia...Decile: int 10 9 NA 2 9 8 6 NA 7 6 ...
## $ Electoral.division : Factor w/ 150 levels "Adelaide","Aston",..: 136 136 136 136 136 136 136 136 136 136 ...
## $ mean_tax : int 71263 71263 71263 71263 64719 64719 64719 64719 63273 63273 ...
## $ mean_tax_thousands : num 71.3 71.3 71.3 71.3 64.7 ...
set.seed(42)
#split test and train 60:40
inTrain <- createDataPartition(y = one_year$pc_immun_class, p = .60, list = FALSE)
training <- one_year[inTrain,]
testing <- one_year[-inTrain,]
#use 3000 rows of data as training to stop computer timing out while we build a model
experiment <- head(training, 3000)
dim(experiment)
## [1] 3000 11
dim(testing)
## [1] 17475 11
I’m trying to predict PC_immun_class in 2017-18 or 2018-19 (with the last year of data in our set from 2016-17) The variables that could help build the predictive model could be: - Mean tax (changes each year) - Age group (changes each year) - Electoral division - SEIFA rank by decile - SEIFA score - Postcode - Usual resident population - PHN number - State
It’s likely this requires ordinal regression, as our target variable has an order. pc_immun_class is an ordinal predictor with 8 different ordered classes. 0 = NP, which we will remove in the first instance. We may have to impute them later to balance the model.
#see pc_immun_class counts by filtered age groups.
#for one year olds
ggplot(one_year)+geom_bar(aes(x=pc_immun_class))
# 6 and 7 are the dominant class for one year olds, with 8 close in count to 5
#for two year olds
ggplot(two_year)+geom_bar(aes(x=pc_immun_class))
# there are higher counts in class 5, but 6 & 7 are still the dominant class, with less in 8
#for five year olds
ggplot(five_year)+geom_bar(aes(x=pc_immun_class))
#5 year old has more even distribution across 6, 7 and 8.
#Ordinal regression model - because pc_immun_class has multiple categories and they have a specific order
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
model1 <- polr(pc_immun_class ~ Rank.within.Australia...Decile + mean_tax_thousands, data = training, Hess = TRUE)
#let's try the ordinal library
#library(ordinal)
#model <- clm(pc_immun_class ~ Rank.within.Australia...Decile + PHN_number, data = training)
summary(model1)
## Call:
## polr(formula = pc_immun_class ~ Rank.within.Australia...Decile +
## mean_tax_thousands, data = training, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## Rank.within.Australia...Decile 0.062797 0.0046361 13.545
## mean_tax_thousands -0.005419 0.0008431 -6.428
##
## Intercepts:
## Value Std. Error t value
## 1|2 -6.2148 0.1492 -41.6435
## 2|3 -5.2387 0.0966 -54.2493
## 3|4 -4.1194 0.0637 -64.7054
## 4|5 -2.8095 0.0468 -59.9995
## 5|6 -1.0453 0.0405 -25.8087
## 6|7 0.1238 0.0400 3.0998
## 7|8 1.4826 0.0411 36.0755
##
## Residual Deviance: 79196.05
## AIC: 79214.05
## (624 observations deleted due to missingness)
Now let’s try to assess model 1
coefs <- coef(model1)
coefs
## Rank.within.Australia...Decile mean_tax_thousands
## 0.062796625 -0.005419412
# Find the p-value for model 1's t-value of 13.545
pt(13.545, 400-3, lower.tail=FALSE)*2
## [1] 1.259296e-34
model2 <- polr(pc_immun_class ~ Rank.within.Australia...Decile + mean_tax_thousands + PHN_number, data = training, Hess = TRUE)
#let's try the ordinal library
#library(ordinal)
#model <- clm(pc_immun_class ~ Rank.within.Australia...Decile + PHN_number, data = training)
summary(model2)
## Call:
## polr(formula = pc_immun_class ~ Rank.within.Australia...Decile +
## mean_tax_thousands + PHN_number, data = training, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## Rank.within.Australia...Decile 0.0631193 4.639e-03 13.606
## mean_tax_thousands -0.0053319 8.446e-04 -6.313
## PHN_number -0.0001657 7.063e-05 -2.345
##
## Intercepts:
## Value Std. Error t value
## 1|2 -6.1883 0.1447 -42.7702
## 2|3 -5.2488 0.0962 -54.5467
## 3|4 -4.1792 0.0661 -63.1963
## 4|5 -2.8529 0.0499 -57.1176
## 5|6 -1.0820 0.0440 -24.6055
## 6|7 0.0863 0.0434 1.9886
## 7|8 1.4483 0.0444 32.5903
##
## Residual Deviance: 79191.03
## AIC: 79211.03
## (624 observations deleted due to missingness)
coefs <- coef(model2)
coefs
## Rank.within.Australia...Decile mean_tax_thousands
## 0.063119255 -0.005331933
## PHN_number
## -0.000165654
model3 <- polr(pc_immun_class ~ Rank.within.Australia...Decile + mean_tax_thousands + PHN_number + Usual.resident.population, data = training, Hess = TRUE)
#let's try the ordinal library
#library(ordinal)
#model <- clm(pc_immun_class ~ Rank.within.Australia...Decile + PHN_number, data = training)
#summary(model3)
#when I introduce large numbers like "usual.resident.population, I get "Error in svd(x) : infinite or missing values in 'x'
It would take more time to try more features and deal with the errors. We chose to stick with binomial linear regression to make our group predictions