Analyzing large amounts of data in search of anomalies can be a frustrating task. You need techniques that allow you to quickly evaluate data in a way that highlights potential anomalies and prevents you from conducting analysis that are meaningless.
Here are the steps on how to perform a Benford’s Law analysis using the 2012 to 2022 Connecticut’s daily accidental drug related deaths data:
# load libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gtrendsR)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(dplyr)
library(benford.analysis)
# read data from data.gov
accidental_death_data = read.csv("https://data.ct.gov/api/views/rybz-nyjw/rows.csv", header = T)
str(accidental_death_data)
## 'data.frame': 10654 obs. of 48 variables:
## $ Date : chr "05/29/2012" "06/27/2012" "03/24/2014" "12/31/2014" ...
## $ Date.Type : chr "Date of death" "Date of death" "Date of death" "Date of death" ...
## $ Age : int 37 37 28 26 41 57 26 64 33 23 ...
## $ Sex : chr "Male" "Male" "Male" "Female" ...
## $ Race : chr "Black" "White" "White" "White" ...
## $ Ethnicity : chr "" "" "" "" ...
## $ Residence.City : chr "STAMFORD" "NORWICH" "HEBRON" "BALTIC" ...
## $ Residence.County : chr "FAIRFIELD" "NEW LONDON" "" "" ...
## $ Residence.State : chr "" "" "" "" ...
## $ Injury.City : chr "STAMFORD" "NORWICH" "HEBRON" "" ...
## $ Injury.County : chr "" "" "" "" ...
## $ Injury.State : chr "CT" "CT" "CT" "" ...
## $ Injury.Place : chr "Residence" "Residence" "Residence" "" ...
## $ Description.of.Injury : chr "Used Cocaine" "Drug Use" "Drug Use" "" ...
## $ Death.City : chr "" "NORWICH" "MARLBOROUGH" "BALTIC" ...
## $ Death.County : chr "" "NEW LONDON" "" "NEW LONDON" ...
## $ Death.State : chr "" "" "" "" ...
## $ Location : chr "Residence" "Hospital" "Hospital" "Residence" ...
## $ Location.if.Other : chr "" "" "" "" ...
## $ Cause.of.Death : chr "Cocaine Toxicity" "Heroin Toxicity" "Heroin Intoxication" "Acute Heroin Intoxication" ...
## $ Manner.of.Death : chr "Accident" "Accident" "Accident" "Accident" ...
## $ Other.Significant.Conditions : chr "" "" "" "" ...
## $ Heroin : chr "" "Y" "Y" "Y" ...
## $ Heroin.death.certificate..DC.: chr "" "" "" "" ...
## $ Cocaine : chr "Y" "" "" "" ...
## $ Fentanyl : chr "" "" "" "" ...
## $ Fentanyl.Analogue : chr "" "" "" "" ...
## $ Oxycodone : chr "" "" "" "" ...
## $ Oxymorphone : chr "" "" "" "" ...
## $ Ethanol : chr "" "" "" "" ...
## $ Hydrocodone : chr "" "" "" "" ...
## $ Benzodiazepine : chr "" "" "" "" ...
## $ Methadone : chr "" "" "" "" ...
## $ Meth.Amphetamine : chr "" "" "" "" ...
## $ Amphet : chr "" "" "" "" ...
## $ Tramad : chr "" "" "" "" ...
## $ Hydromorphone : chr "" "" "" "" ...
## $ Morphine..Not.Heroin. : chr "" "" "" "" ...
## $ Xylazine : chr "" "" "" "" ...
## $ Gabapentin : chr "" "" "" "" ...
## $ Opiate.NOS : chr "" "" "" "" ...
## $ Heroin.Morph.Codeine : chr "" "" "" "" ...
## $ Other.Opioid : chr "" "" "" "" ...
## $ Any.Opioid : chr "" "" "" "" ...
## $ Other : chr "" "" "" "" ...
## $ ResidenceCityGeo : chr "STAMFORD, CT\n(41.051924, -73.539475)" "NORWICH, CT\n(41.524304, -72.075821)" "HEBRON, CT\n(41.658069, -72.366324)" "BALTIC, CT\n(41.617221, -72.085031)" ...
## $ InjuryCityGeo : chr "STAMFORD, CT\n(41.051924, -73.539475)" "NORWICH, CT\n(41.524304, -72.075821)" "HEBRON, CT\n(41.658069, -72.366324)" "CT\n(41.575155, -72.738288)" ...
## $ DeathCityGeo : chr "CT\n(41.575155, -72.738288)" "Norwich, CT\n(41.524304, -72.075821)" "Marlborough, CT\n(41.632043, -72.461309)" "Baltic, CT\n(41.617221, -72.085031)" ...
ls(accidental_death_data)
## [1] "Age" "Amphet"
## [3] "Any.Opioid" "Benzodiazepine"
## [5] "Cause.of.Death" "Cocaine"
## [7] "Date" "Date.Type"
## [9] "Death.City" "Death.County"
## [11] "Death.State" "DeathCityGeo"
## [13] "Description.of.Injury" "Ethanol"
## [15] "Ethnicity" "Fentanyl"
## [17] "Fentanyl.Analogue" "Gabapentin"
## [19] "Heroin" "Heroin.death.certificate..DC."
## [21] "Heroin.Morph.Codeine" "Hydrocodone"
## [23] "Hydromorphone" "Injury.City"
## [25] "Injury.County" "Injury.Place"
## [27] "Injury.State" "InjuryCityGeo"
## [29] "Location" "Location.if.Other"
## [31] "Manner.of.Death" "Meth.Amphetamine"
## [33] "Methadone" "Morphine..Not.Heroin."
## [35] "Opiate.NOS" "Other"
## [37] "Other.Opioid" "Other.Significant.Conditions"
## [39] "Oxycodone" "Oxymorphone"
## [41] "Race" "Residence.City"
## [43] "Residence.County" "Residence.State"
## [45] "ResidenceCityGeo" "Sex"
## [47] "Tramad" "Xylazine"
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
accidental_death_data<- accidental_death_data %>%
clean_names()
ls(accidental_death_data)
## [1] "age" "amphet"
## [3] "any_opioid" "benzodiazepine"
## [5] "cause_of_death" "cocaine"
## [7] "date" "date_type"
## [9] "death_city" "death_city_geo"
## [11] "death_county" "death_state"
## [13] "description_of_injury" "ethanol"
## [15] "ethnicity" "fentanyl"
## [17] "fentanyl_analogue" "gabapentin"
## [19] "heroin" "heroin_death_certificate_dc"
## [21] "heroin_morph_codeine" "hydrocodone"
## [23] "hydromorphone" "injury_city"
## [25] "injury_city_geo" "injury_county"
## [27] "injury_place" "injury_state"
## [29] "location" "location_if_other"
## [31] "manner_of_death" "meth_amphetamine"
## [33] "methadone" "morphine_not_heroin"
## [35] "opiate_nos" "other"
## [37] "other_opioid" "other_significant_conditions"
## [39] "oxycodone" "oxymorphone"
## [41] "race" "residence_city"
## [43] "residence_city_geo" "residence_county"
## [45] "residence_state" "sex"
## [47] "tramad" "xylazine"
class(accidental_death_data$date)
## [1] "character"
library(lubridate)
#convert character to date format
accidental_death_data$date <- mdy(accidental_death_data$date)
#view class of date column
class(accidental_death_data$date)
## [1] "Date"
# Create final dataset for analysis
ADRD_12_22<- accidental_death_data %>%
group_by(date) %>%
summarise(deaths=n())
str(ADRD_12_22)
## tibble [3,520 × 2] (S3: tbl_df/tbl/data.frame)
## $ date : Date[1:3520], format: "2012-01-01" "2012-01-03" ...
## $ deaths: int [1:3520] 1 1 1 1 1 3 1 2 1 2 ...
print(ADRD_12_22)
## # A tibble: 3,520 × 2
## date deaths
## <date> <int>
## 1 2012-01-01 1
## 2 2012-01-03 1
## 3 2012-01-04 1
## 4 2012-01-05 1
## 5 2012-01-07 1
## 6 2012-01-08 3
## 7 2012-01-11 1
## 8 2012-01-12 2
## 9 2012-01-13 1
## 10 2012-01-14 2
## # ℹ 3,510 more rows
# Basic line plot
p_ADRD<- ADRD_12_22 %>%
ggplot(aes(x = date, y = deaths)) +
geom_line(color = "#00AFBB", linewidth = 1) +
ggtitle("Daily Accidental Drug Related Deaths, Connecticut, 2012-2022") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_x_date(date_labels="%b %Y")
p_ADRD
# Add trend smoothed line
# Note. 'loess' method is used for less than 1,000 observations
p_ADRD + stat_smooth(color = "#FC4E07", fill = "#FC4E07", method = "gam")
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
# Interactive plot
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
ggplotly(p_ADRD)
In order to see if data fits Benford’s law curve, Mantissa statistics results need to be close to these numbers:
Statistic Value
Mean = 0.4
Variance = 0.083
Ex.Kurtosis = -1.2
Skewness = 0
If the data follow Benford’s law distribution, it should be close to these values.
Run Benford’s Law analysis.
ADRD_trends = benford(ADRD_12_22$deaths, number.of.digits = 1, discrete = T, sign = "positive")
ADRD_trends
##
## Benford object:
##
## Data: ADRD_12_22$deaths
## Number of observations used = 3520
## Number of obs. for second order = 11
## First digits analysed = 1
##
## Mantissa:
##
## Statistic Value
## Mean 0.396
## Var 0.074
## Ex.Kurtosis -1.053
## Skewness -0.128
##
##
## The 5 largest deviations:
##
## digits absolute.diff
## 1 1 227.63
## 2 3 224.22
## 3 2 221.16
## 4 4 162.88
## 5 9 145.07
##
## Stats:
##
## Pearson's Chi-squared test
##
## data: ADRD_12_22$deaths
## X-squared = 605.76, df = 8, p-value < 2.2e-16
##
##
## Mantissa Arc Test
##
## data: ADRD_12_22$deaths
## L2 = 0.015985, df = 2, p-value < 2.2e-16
##
## Mean Absolute Deviation (MAD): 0.04081649
## MAD Conformity - Nigrini (2012): Nonconformity
## Distortion Factor: -73.66243
##
## Remember: Real data will never conform perfectly to Benford's Law. You should not focus on p-values!
# plot Benford's Law distribution
plot(ADRD_trends)
Here is a simple visualization of a theoretical Benford distribution
with the given probabilities.
Compare this distribution to your ‘Digits Distribution’ to see if there
are distribution similarities.
# Crete dataset of Benford’s Law distribution
theory_probs = c(30.1, 17.6, 12.5, 9.7, 7.9, 6.7, 5.8, 5.1, 4.6)
digits = c(1:9)
theory_df <- data.frame(digits, theory_probs)
theory_df %>%
ggplot(aes(x=digits, y=theory_probs)) +
geom_col(mapping=aes(x=digits)) +
geom_text(aes(label = theory_probs), vjust = -0.2) +
xlab('Digits (1-9)') + ylab('Theoretical Probabilities (%)') +
scale_x_discrete(labels=(c(1,2,3,4,5,6,7,8,9))) +
ggtitle("Theoretical Benford's Law Distribution")
A.M.D.G.