library(readr)
library(xlsx)
library(dplyr)
library(openxlsx)
library(readxl)
library(tidyr)
library(stringr)
library(outliers)
library(MVN)
library(forecast)
library(moments)
The data description includes:
Year: Year of data collected, 2017-2018.
Local Government Area (LGA): Local government area is the naming convention for the area of which the alcohol was sold. A classification for each LGA is through either one of the symbols (C), (S), (RC) or (B) indicating City, Shrine, Rural city or Boroughs respectively. For the purpose of this investigation, this symbol is unimportant and will be removed.
Liquor Type: The type of liquor sold according to the quantity in liters and content of alcohol.
Liquor Group: Group of liquor sold consisting of beer, wine, spirits and cider.
Liquor Volume (L): Volume in liters of liquor sold.
Ratio of alcohol: The ratio of alcohol in the liquor. 1 representing 100% alcohol content and 0 representing 0% alcohol.
Alcohol volume (L): The volume of alcohol sold in liters, calculated the multiplication of Liquor Volume (L) and Ratio of alcohol.
The Crime Statistics Agency (CSA) collects and produces crime data from the Victorian Police. CSA then conducts a series of quality checks and processes the data to be released to the public. In this investigation, an offence is defined as any criminal act of omission by a person or organisation for which a penalty could by imposed by the Victorian legal system. The dataset named ‘offences’ contains 6 variables and 870 observation from 2011 to 2020. The data collected can be broken down into:
The dataset used in this task was generated from: https://www.crimestatistics.vic.gov.au/crime-statistics/latest-victorian-crime-data/download-data
The data description includes:
alcohol <- read_excel("2017-18-wholesale-liquor-sales-data-by-LGA.XLSX", sheet = "Victorian Wholesale Liquor Data", skip = 13)
offences <- read_excel("Data_Tables_LGA_Recorded_Offences_Year_Ending_June_2020.xlsx", sheet = "Table 01")
head(alcohol)
## # A tibble: 6 x 7
## Year `Local Governme~ `Liquor Type` `Liquor Group` `Liquor Volume ~
## <chr> <chr> <chr> <chr> <dbl>
## 1 2017~ Alpine (S) Beer Low <=4~ Beer 32796.
## 2 2017~ Alpine (S) Beer Low >48~ Beer 2722.
## 3 2017~ Alpine (S) Beer Medium ~ Beer 276899.
## 4 2017~ Alpine (S) Beer Medium ~ Beer 24226.
## 5 2017~ Alpine (S) Beer Heavy <~ Beer 1000682.
## 6 2017~ Alpine (S) Beer Heavy >~ Beer 198786.
## # ... with 2 more variables: `Ratio of alcohol` <dbl>, `Alcohol Volume
## # (L)` <dbl>
head(offences)
## # A tibble: 6 x 6
## Year `Year ending` `Police Region` `Local Governme~ `Offence Count`
## <dbl> <chr> <chr> <chr> <dbl>
## 1 2020 June 1 North West M~ Banyule 9700
## 2 2020 June 1 North West M~ Brimbank 20242
## 3 2020 June 1 North West M~ Darebin 15161
## 4 2020 June 1 North West M~ Hobsons Bay 6100
## 5 2020 June 1 North West M~ Hume 21529
## 6 2020 June 1 North West M~ Maribyrnong 9388
## # ... with 1 more variable: `Rate per 100,000 population` <dbl>
str(alcohol)
## tibble [1,360 x 7] (S3: tbl_df/tbl/data.frame)
## $ Year : chr [1:1360] "2017-18" "2017-18" "2017-18" "2017-18" ...
## $ Local Government Area (LGA): chr [1:1360] "Alpine (S)" "Alpine (S)" "Alpine (S)" "Alpine (S)" ...
## $ Liquor Type : chr [1:1360] "Beer Low <=48 Ltrs" "Beer Low >48 Ltrs" "Beer Medium <=48 Ltrs" "Beer Medium >48 Ltrs" ...
## $ Liquor Group : chr [1:1360] "Beer" "Beer" "Beer" "Beer" ...
## $ Liquor Volume (L) : num [1:1360] 32796 2722 276899 24226 1000682 ...
## $ Ratio of alcohol : num [1:1360] 0.0269 0.0269 0.0348 0.0348 0.0476 0.0476 0.123 0.123 0.123 0.123 ...
## $ Alcohol Volume (L) : num [1:1360] 882.2 73.2 9636.1 843 47632.5 ...
str(offences)
## tibble [870 x 6] (S3: tbl_df/tbl/data.frame)
## $ Year : num [1:870] 2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
## $ Year ending : chr [1:870] "June" "June" "June" "June" ...
## $ Police Region : chr [1:870] "1 North West Metro" "1 North West Metro" "1 North West Metro" "1 North West Metro" ...
## $ Local Government Area : chr [1:870] "Banyule" "Brimbank" "Darebin" "Hobsons Bay" ...
## $ Offence Count : num [1:870] 9700 20242 15161 6100 21529 ...
## $ Rate per 100,000 population: num [1:870] 7328 9630 9144 6187 8918 ...
alcohol$`Local Government Area (LGA)`<- gsub("\\s*\\([^\\)]+\\)","", alcohol$`Local Government Area (LGA)`)
alcohol <- alcohol %>% rename( `Local Government Area`=`Local Government Area (LGA)`)
alcohol <- alcohol %>% group_by(`Local Government Area`) %>%
mutate(`Total alcohol sold (L)` = sum(`Alcohol Volume (L)`))
head(alcohol)
## # A tibble: 6 x 8
## # Groups: Local Government Area [1]
## Year `Local Governme~ `Liquor Type` `Liquor Group` `Liquor Volume ~
## <chr> <chr> <chr> <chr> <dbl>
## 1 2017~ Alpine Beer Low <=4~ Beer 32796.
## 2 2017~ Alpine Beer Low >48~ Beer 2722.
## 3 2017~ Alpine Beer Medium ~ Beer 276899.
## 4 2017~ Alpine Beer Medium ~ Beer 24226.
## 5 2017~ Alpine Beer Heavy <~ Beer 1000682.
## 6 2017~ Alpine Beer Heavy >~ Beer 198786.
## # ... with 3 more variables: `Ratio of alcohol` <dbl>, `Alcohol Volume
## # (L)` <dbl>, `Total alcohol sold (L)` <dbl>
alcohol_a <- alcohol %>% group_by(`Local Government Area`) %>%
summarise(`Total alcohol sold (L)` = sum(`Alcohol Volume (L)`))
## `summarise()` ungrouping output (override with `.groups` argument)
head(alcohol_a)
## # A tibble: 6 x 2
## `Local Government Area` `Total alcohol sold (L)`
## <chr> <dbl>
## 1 Alpine 163637.
## 2 Ararat 76142.
## 3 Ballarat 768553.
## 4 Banyule 373338.
## 5 Bass Coast 405641.
## 6 Baw Baw 267551.
offences2018 <- offences %>% filter(Year == 2018)
head(offences2018)
## # A tibble: 6 x 6
## Year `Year ending` `Police Region` `Local Governme~ `Offence Count`
## <dbl> <chr> <chr> <chr> <dbl>
## 1 2018 June 1 North West M~ Banyule 9688
## 2 2018 June 1 North West M~ Brimbank 18425
## 3 2018 June 1 North West M~ Darebin 14677
## 4 2018 June 1 North West M~ Hobsons Bay 6028
## 5 2018 June 1 North West M~ Hume 20217
## 6 2018 June 1 North West M~ Maribyrnong 8297
## # ... with 1 more variable: `Rate per 100,000 population` <dbl>
alcohol_offence <- left_join(offences2018, alcohol_a, by="Local Government Area")
alcohol_offence$`Total alcohol sold (L)` <- round(alcohol_offence$`Total alcohol sold (L)`, 0)
head(alcohol_offence)
## # A tibble: 6 x 7
## Year `Year ending` `Police Region` `Local Governme~ `Offence Count`
## <dbl> <chr> <chr> <chr> <dbl>
## 1 2018 June 1 North West M~ Banyule 9688
## 2 2018 June 1 North West M~ Brimbank 18425
## 3 2018 June 1 North West M~ Darebin 14677
## 4 2018 June 1 North West M~ Hobsons Bay 6028
## 5 2018 June 1 North West M~ Hume 20217
## 6 2018 June 1 North West M~ Maribyrnong 8297
## # ... with 2 more variables: `Rate per 100,000 population` <dbl>, `Total
## # alcohol sold (L)` <dbl>
str(alcohol_offence)
## tibble [87 x 7] (S3: tbl_df/tbl/data.frame)
## $ Year : num [1:87] 2018 2018 2018 2018 2018 ...
## $ Year ending : chr [1:87] "June" "June" "June" "June" ...
## $ Police Region : chr [1:87] "1 North West Metro" "1 North West Metro" "1 North West Metro" "1 North West Metro" ...
## $ Local Government Area : chr [1:87] "Banyule" "Brimbank" "Darebin" "Hobsons Bay" ...
## $ Offence Count : num [1:87] 9688 18425 14677 6028 20217 ...
## $ Rate per 100,000 population: num [1:87] 7438 8827 9079 6248 9008 ...
## $ Total alcohol sold (L) : num [1:87] 373338 652607 781075 895441 1036497 ...
alcohol_offence <- alcohol_offence %>% rename(`Offence per 100,000 population` = `Rate per 100,000 population`)
alcohol_offence <- alcohol_offence %>% mutate(`Total Population` = (`Offence Count`/`Offence per 100,000 population`)*100000)
alcohol_offence <- alcohol_offence %>% mutate(`Average sold per person in Liters` = `Total alcohol sold (L)`/`Total Population`)
alcohol_offence$`Average sold per person in Liters` <- round(alcohol_offence$`Average sold per person in Liters`, 2)
head(alcohol_offence$`Offence per 100,000 population`)
## [1] 7438.004 8826.601 9079.324 6247.862 9008.435 9076.390
head(alcohol_offence$`Total Population`)
## [1] 130250 208744 161653 96481 224423 91413
head(alcohol_offence$`Average sold per person in Liters`)
## [1] 2.87 3.13 4.83 9.28 4.62 5.60
alcohol_offence$`Police Region` <- alcohol_offence$`Police Region` %>% factor(levels=c("1 North West Metro", "2 Eastern", "3 Southern Metro", "4 Western"),
labels=c("North West Metro", "Eastern", "Southern Metro", "Western"))
str(alcohol_offence)
## tibble [87 x 9] (S3: tbl_df/tbl/data.frame)
## $ Year : num [1:87] 2018 2018 2018 2018 2018 ...
## $ Year ending : chr [1:87] "June" "June" "June" "June" ...
## $ Police Region : Factor w/ 4 levels "North West Metro",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Local Government Area : chr [1:87] "Banyule" "Brimbank" "Darebin" "Hobsons Bay" ...
## $ Offence Count : num [1:87] 9688 18425 14677 6028 20217 ...
## $ Offence per 100,000 population : num [1:87] 7438 8827 9079 6248 9008 ...
## $ Total alcohol sold (L) : num [1:87] 373338 652607 781075 895441 1036497 ...
## $ Total Population : num [1:87] 130250 208744 161653 96481 224423 ...
## $ Average sold per person in Liters: num [1:87] 2.87 3.13 4.83 9.28 4.62 ...
head(alcohol_offence)
## # A tibble: 6 x 9
## Year `Year ending` `Police Region` `Local Governme~ `Offence Count`
## <dbl> <chr> <fct> <chr> <dbl>
## 1 2018 June North West Met~ Banyule 9688
## 2 2018 June North West Met~ Brimbank 18425
## 3 2018 June North West Met~ Darebin 14677
## 4 2018 June North West Met~ Hobsons Bay 6028
## 5 2018 June North West Met~ Hume 20217
## 6 2018 June North West Met~ Maribyrnong 8297
## # ... with 4 more variables: `Offence per 100,000 population` <dbl>, `Total
## # alcohol sold (L)` <dbl>, `Total Population` <dbl>, `Average sold per person
## # in Liters` <dbl>
sapply(alcohol_offence, function (x) which(is.na(x)))
## $Year
## integer(0)
##
## $`Year ending`
## integer(0)
##
## $`Police Region`
## [1] 84 85 86 87
##
## $`Local Government Area`
## integer(0)
##
## $`Offence Count`
## integer(0)
##
## $`Offence per 100,000 population`
## [1] 84 85 86 87
##
## $`Total alcohol sold (L)`
## [1] 15 41 52 83 84 85 87
##
## $`Total Population`
## [1] 84 85 86 87
##
## $`Average sold per person in Liters`
## [1] 15 41 52 83 84 85 86 87
colSums(is.na(alcohol_offence))
## Year Year ending
## 0 0
## Police Region Local Government Area
## 4 0
## Offence Count Offence per 100,000 population
## 0 4
## Total alcohol sold (L) Total Population
## 7 4
## Average sold per person in Liters
## 8
head(sapply(alcohol_offence, function (x) is.infinite(x)))
## Year Year ending Police Region Local Government Area Offence Count
## [1,] FALSE FALSE FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE FALSE FALSE
## [3,] FALSE FALSE FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE FALSE FALSE
## [5,] FALSE FALSE FALSE FALSE FALSE
## [6,] FALSE FALSE FALSE FALSE FALSE
## Offence per 100,000 population Total alcohol sold (L) Total Population
## [1,] FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE
## [3,] FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE
## [5,] FALSE FALSE FALSE
## [6,] FALSE FALSE FALSE
## Average sold per person in Liters
## [1,] FALSE
## [2,] FALSE
## [3,] FALSE
## [4,] FALSE
## [5,] FALSE
## [6,] FALSE
head(sapply(alcohol_offence, function (x) is.nan(x)))
## Year Year ending Police Region Local Government Area Offence Count
## [1,] FALSE FALSE FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE FALSE FALSE
## [3,] FALSE FALSE FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE FALSE FALSE
## [5,] FALSE FALSE FALSE FALSE FALSE
## [6,] FALSE FALSE FALSE FALSE FALSE
## Offence per 100,000 population Total alcohol sold (L) Total Population
## [1,] FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE
## [3,] FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE
## [5,] FALSE FALSE FALSE
## [6,] FALSE FALSE FALSE
## Average sold per person in Liters
## [1,] FALSE
## [2,] FALSE
## [3,] FALSE
## [4,] FALSE
## [5,] FALSE
## [6,] FALSE
alcohol_offence <- alcohol_offence[complete.cases(alcohol_offence),]
cap <- function(x){
quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
x
}
alcohol_offence_df <- alcohol_offence %>% select(`Offence Count`, `Offence per 100,000 population`, `Total alcohol sold (L)`, `Total Population`, `Average sold per person in Liters` )
alcohol_sub <- as.data.frame(sapply(alcohol_offence_df, FUN =cap))
summary(alcohol_sub)
## Offence Count Offence per 100,000 population Total alcohol sold (L)
## Min. : 159 Min. : 2493 Min. : 20970
## 1st Qu.: 1076 1st Qu.: 5293 1st Qu.: 114540
## Median : 3698 Median : 6898 Median : 360043
## Mean : 6130 Mean : 7338 Mean : 504264
## 3rd Qu.:10539 3rd Qu.: 9203 3rd Qu.: 843549
## Max. :21602 Max. :13988 Max. :1916536
## Total Population Average sold per person in Liters
## Min. : 2982 Min. : 2.350
## 1st Qu.: 16314 1st Qu.: 4.940
## Median : 46816 Median : 6.500
## Mean : 80306 Mean : 7.188
## 3rd Qu.:136049 3rd Qu.: 8.925
## Max. :255367 Max. :14.130
alcohol_offence$`Offence Count` %>% boxplot(main="Box plot of Total Offences",ylab="Total Offences")
z.scores <- alcohol_offence$`Offence Count` %>% scores(type="z")
z.scores %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.9128 -0.7781 -0.3927 0.0000 0.6128 4.5748
which(abs(z.scores)>3)
## [1] 7
length(which(abs(z.scores)>3))
## [1] 1
alcohol_offence_clean1 <- alcohol_offence$`Offence Count` %>% cap()
alcohol_offence_clean1 %>% boxplot(main="Box plot of Total Offences without outliers", ylab="Total Offences")
alcohol_offence$`Offence per 100,000 population` %>% boxplot(main="Box plot of Offences/100,000 population", ylab="Offences per 100,000 population")
z.scores1 <- alcohol_offence$`Offence per 100,000 population` %>% scores(type="z")
z.scores1 %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.4870 -0.6610 -0.1876 0.0000 0.4925 4.2719
which(abs(z.scores1)>3)
## [1] 7 24
length(which(abs(z.scores1)>3))
## [1] 2
alcohol_offence_clean2 <- alcohol_offence$`Offence per 100,000 population` %>% cap()
alcohol_offence_clean2 %>% boxplot(main="Box plot of Offences/100,000 population without outliers", ylab="Offences per 100,000 population")
alcohol_offence$`Total alcohol sold (L)` %>% boxplot(main="Box plot of Alcohol sold",
ylab="Alcohol sold in Liters")
z.scores2 <- alcohol_offence$`Total alcohol sold (L)` %>% scores(type="z")
z.scores2 %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.9744 -0.7935 -0.3190 0.0000 0.6155 4.8634
which(abs(z.scores2)>3)
## [1] 7
length(which(abs(z.scores2)>3))
## [1] 1
alcohol_offence_clean3 <- alcohol_offence$`Total alcohol sold (L)` %>% cap()
alcohol_offence_clean3 %>% boxplot(main="Box plot of Alcohol sold without outliers",
ylab="Alcohol sold in Liters")
alcohol_offence$`Total Population` %>% boxplot(main="Box plot of Total population",
ylab="Total population")
z.scores3 <- alcohol_offence$`Total Population` %>% scores(type="z")
z.scores3 %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.0229 -0.8498 -0.4539 0.0000 0.7043 3.3574
which(abs(z.scores3)>3)
## [1] 42
length(which(abs(z.scores3)>3))
## [1] 1
alcohol_offence_clean4 <- alcohol_offence$`Total Population` %>% cap()
alcohol_offence_clean4 %>% boxplot(main="Box plot of Total population without outliers",
ylab="Total population")
alcohol_offence$`Average sold per person in Liters` %>% boxplot(main="Box plot of alcohol sold per person", ylab="Alcohol sold (Liters)")
z.scores4 <- alcohol_offence$`Average sold per person in Liters` %>% scores(type="z")
z.scores4 %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.6347 -0.7710 -0.2507 0.0000 0.5580 3.5377
which(abs(z.scores4)>3)
## [1] 7
length(which(abs(z.scores4)>3))
## [1] 1
alcohol_offence_clean5 <- alcohol_offence$`Average sold per person in Liters` %>% cap()
alcohol_offence_clean5 %>% boxplot(main="Box plot of alcohol sold per person without outliers", ylab="Alcohol sold (Liters)")
# multivariate method
boxplot(alcohol_offence$`Offence per 100,000 population` ~ alcohol_offence$`Police Region`,
ylab="Offence per 100000 population", xlab= "Police Region",
main="Box Plot of Offences")
boxplot(alcohol_offence$`Average sold per person in Liters` ~ alcohol_offence$`Police Region`, ylab="Average Alcohol sold per person (Liters)", xlab="Police Region",
main="Box Plot of average alcohol sold")
alcohol_offence %>% plot(`Offence per 100,000 population` ~ `Average sold per person in Liters`,data=.)
offence_alcohol <- alcohol_offence %>% select(`Average sold per person in Liters`, `Offence per 100,000 population`)
results <- mvn(data= offence_alcohol,
multivariateOutlierMethod = "quan",
showOutliers = TRUE)
results$multivariateOutliers
## Observation Mahalanobis Distance Outlier
## 1 1 30.856 TRUE
## 2 2 16.927 TRUE
## 3 3 15.197 TRUE
## 4 4 13.147 TRUE
## 5 5 11.398 TRUE
## 6 6 8.476 TRUE
## 7 7 7.834 TRUE
## 8 8 7.800 TRUE
## 9 9 7.431 TRUE
alcohol_clean3 <- offence_alcohol[-(1:9),]
alcohol_clean3 %>% plot(`Offence per 100,000 population`~`Average sold per person in Liters`,data=.)
dim(alcohol_clean3)
## [1] 70 2
hist(alcohol_offence$`Offence per 100,000 population`, xlab="Offences per 100000 population", main="Offences")
skewness(alcohol_offence$`Offence per 100,000 population`)
## [1] 1.451044
log_offences <- log10(alcohol_offence$`Offence per 100,000 population`)
hist(log_offences)
skewness(log_offences)
## [1] 0.01334287
ln_offences <- log(alcohol_offence$`Offence per 100,000 population`)
hist(ln_offences)
skewness(ln_offences)
## [1] 0.01334287
sqrt_offences <- sqrt(alcohol_offence$`Offence per 100,000 population`)
hist(sqrt_offences)
skewness(sqrt_offences)
## [1] 0.6562218
rec_offences <- 1/alcohol_offence$`Offence per 100,000 population`
hist(rec_offences)
skewness(rec_offences)
## [1] 1.041724
box_offences <- BoxCox(alcohol_offence$`Offence per 100,000 population`, lambda="auto")
hist(box_offences)
skewness(box_offences)
## [1] -0.229151
Crime Statistics Agency (2020). Crime Statistics Agency: Recorded Offences. https://www.crimestatistics.vic.gov.au/crime-statistics/latest-victorian-crime-data/recorded-offences
Changyong Feng, Hongyue Wang, Naiji Lu, Tian Chen, Hua He, Ying Lu, and Xin M. Tu. Log-transformation and its implications for data analysis. Shanghai Arch Psychiatry. 2014 Apr; 26(2): 105-109. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4120293/#:~:text=Using%20the%20log%20transformation%20to%20make%20data%20conform%20to%20normality&text=If%20the%20original%20data%20follows,does%20remove%20or%20reduce%20skewness.