library(tidyverse)  ## dplyr, tidyr, purrr, stringr, forcats, readr, ggplot2
library(lubridate)  ## mdy()
library(scales)     ## dollar_format()
library(tidymodels) ## recipes, workflows, parsnip, broom

gdpr <- read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv')
gdpr_tidy <- gdpr %>%
  transmute(id,
            country = name,
            price,
            date = mdy(date),
            article_violated, ## keep the raw reference for extraction
            articles = str_extract_all(article_violated,
                                       "Art. [:digit:]+|Art.[:digit:]+")) %>%
  mutate(total_articles = map_int(articles, length)) %>% ## count articles violated per fine
  unnest(articles) %>% ## one row per (violation, article)
  select(-article_violated)
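As a quick sanity check, the regex can be tried on a made-up string in the style of article_violated (the example string below is hypothetical, not a real record):
str_extract_all("Art. 5 (1) b) GDPR, Art.13 GDPR",
                "Art. [:digit:]+|Art.[:digit:]+")
## should return "Art. 5" "Art.13", catching both spacing variants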
Use summary() to check the articles column.
summary(as_factor(gdpr_tidy$articles))
## Art. 28 Art. 12 Art. 13 Art. 5 Art. 6 Art. 31 Art. 32 Art. 33 Art. 25 Art. 21
## 3 9 19 114 93 2 61 9 8 9
## Art. 15 Art. 17 Art. 37 Art. 9 Art. 35 Art. 36 Art. 14 Art. 7 Art. 34 Art. 4
## 17 7 3 7 1 1 4 5 3 1
## Art. 18 Art.14 Art. 83 Art. 58 Art 6 Art 58 Art 83
## 1 3 1 6 1 1 1
The article formats are inconsistent ("Art. 6", "Art.14", "Art 6"), so we extract just the article number and convert it to a factor.
gdpr_tidy_2 <- gdpr_tidy %>%
  mutate(articles_no = str_extract_all(articles, "[:digit:]+"), ## keep only the article number
         country = as_factor(country)) %>%
  select(-articles) %>%
  unnest(articles_no) %>%
  mutate(articles_no = as_factor(articles_no))
summary(gdpr_tidy_2)
## id country price date
## Min. : 1.00 Spain : 90 Min. : 0 Min. :1970-01-01
## 1st Qu.: 53.25 Romania : 47 1st Qu.: 3000 1st Qu.:2019-04-04
## Median :120.00 Germany : 38 Median : 12000 Median :2019-10-04
## Mean :123.09 Bulgaria : 29 Mean : 1116532 Mean :2017-03-28
## 3rd Qu.:192.00 Hungary : 29 3rd Qu.: 60000 3rd Qu.:2019-12-16
## Max. :250.00 Czech Republic: 17 Max. :50000000 Max. :2020-03-25
## (Other) :140
## total_articles articles_no
## Min. :1.000 5 :114
## 1st Qu.:1.000 6 : 94
## Median :2.000 32 : 61
## Mean :2.087 13 : 19
## 3rd Qu.:3.000 15 : 17
## Max. :5.000 12 : 9
## (Other): 76
Some violations have a missing date, encoded as 1970-01-01 (the Unix epoch). That cannot be correct, since the GDPR only came into force in 2018, so we will filter out those placeholder dates.
By country, Spain has by far the most violations, followed by Romania at a distant second. Most fines (price) fall in the €3,000 to €60,000 interquartile range, some cases involve no fine at all, and the maximum issued is €50 million.
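Before dropping them, a quick count shows how many rows carry the placeholder (a minimal check, assuming the placeholder is exactly 1970-01-01):
gdpr_tidy_2 %>%
  filter(date == as.Date("1970-01-01")) %>%
  summarise(placeholder_rows = n(),      ## article-level rows affected
            violations = n_distinct(id)) ## distinct fines affected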
ggplot(gdpr_tidy_2, aes(x = price + 1)) + ## add 1 so zero fines survive the log scale
  geom_histogram(color = "darkblue", fill = "lightblue") +
  scale_x_log10(labels = dollar_format(prefix = "€")) +
  labs(x = "Fines Amount",
       y = "Frequency",
       title = "Distribution of GDPR Fines (€)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
First, filter out the rows with the 1970 placeholder dates.
gdpr_tidy_3 <- gdpr_tidy_2 %>%
  filter(date > as.Date("1971-01-01")) ## drop the 1970-01-01 placeholder dates
gdpr_tidy_3 %>%
  ggplot(aes(x = date, y = price + 1)) +
  geom_point(alpha = 0.5, colour = "tomato1") +
  geom_smooth(se = FALSE) +
  scale_y_log10(labels = dollar_format(prefix = "€"), n.breaks = 7) +
  labs(x = "",
       y = "Fines (log scale)",
       title = "Fines Amount by Date")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Since the GDPR became law, 2019 saw an increasing number of violations; the initial gap is probably due to the time investigations take. Although the average fine amount does not increase over time, we do see authorities handing out higher fines for serious violations.
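To back up the yearly trend reading, we can tabulate violations and the typical fine per year (a quick summary sketch; it counts each fine once via its id rather than once per article row):
gdpr_tidy_3 %>%
  distinct(id, .keep_all = TRUE) %>% ## one row per fine, not per article
  group_by(year = year(date)) %>%
  summarise(violations = n(),
            median_fine = median(price))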
Let’s have a look at how the fines are distributed across articles.
gdpr_tidy_3 %>%
  mutate(articles_no = fct_reorder(articles_no, price)) %>%
  ggplot(aes(x = articles_no, y = price + 1)) +
  geom_jitter(alpha = 0.5, colour = "tomato1") +
  geom_boxplot(alpha = 0.2,
               fill = "darkblue",
               outlier.colour = NA) + ## hide boxplot outliers; the jittered points already show them
  scale_y_log10(labels = dollar_format(prefix = "€"), n.breaks = 7) +
  labs(x = "Article No.",
       y = "Fines (log scale)",
       title = "Fines Amount by Article No.")
As noted before, the median fine for most violations is below €100k, and articles No. 5 and No. 6 are the most commonly fined. Article No. 14 has a low count, but it shows higher fines than the other articles.
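The per-article medians behind this reading can be computed directly (a quick check on the same data used for the boxplots):
gdpr_tidy_3 %>%
  group_by(articles_no) %>%
  summarise(n = n(),
            median_fine = median(price)) %>%
  arrange(desc(median_fine)) ## highest typical fines first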
We can do the same for country.
gdpr_tidy_3 %>%
  mutate(country = fct_reorder(country, price)) %>%
  ggplot(aes(x = country, y = price + 1)) +
  geom_jitter(alpha = 0.5, colour = "tomato1") +
  geom_boxplot(alpha = 0.2,
               fill = "darkblue",
               outlier.colour = NA) +
  scale_y_log10(labels = dollar_format(prefix = "€"), n.breaks = 7) +
  labs(x = "",
       y = "Fines (log scale)",
       title = "Fines Amount by Country") +
  theme(axis.text.x = element_text(angle = 90))
Different countries do hand out different fine amounts: Italy and France have issued the highest fines, while the high-count countries such as Spain and Romania stay within a narrower range.
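The largest individual fines make the Italy and France point concrete (a minimal look at the top rows; slice_max() needs dplyr 1.0 or later):
gdpr_tidy_3 %>%
  distinct(id, .keep_all = TRUE) %>% ## one row per fine
  slice_max(price, n = 5) %>%
  select(country, date, price, total_articles)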
Next we prepare the data for modeling and set up a tidymodels workflow. The recipe below log-transforms the fines, pools uncommon factor levels into an "other" category, and converts the factors into dummy variables.
gdpr_rec <- recipe(price ~ ., data = gdpr_tidy_3) %>%
  update_role(id, new_role = "id") %>% ## keep id for bookkeeping, exclude it from the model
  step_log(price,
           base = 10,
           offset = 1,
           skip = TRUE) %>% ## log10 transform; skipped at predict time, when new data has no price
  step_other(country, articles_no) %>% ## pool uncommon levels into "other" (default 5% threshold)
  step_dummy(all_nominal()) %>%
  step_nzv(all_predictors()) ## drop near-zero-variance predictors
gdpr_prep <- prep(gdpr_rec)
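To check what the recipe hands to the model, we can peek at the processed training set (a quick inspection; juice() applies the log step to the training data, since skip = TRUE only affects baking new data):
gdpr_prep %>%
  juice() %>%
  glimpse() ## log10 price plus dummy columns for country and article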
gdpr_wf <- workflow() %>%
  add_recipe(gdpr_rec) %>%
  add_model(linear_reg() %>%
              set_engine("lm"))
gdpr_fit <- gdpr_wf %>%
  fit(data = gdpr_tidy_3)
gdpr_fit %>%
  pull_workflow_fit() %>%
  tidy() %>%
  arrange(p.value)
## # A tibble: 12 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 total_articles 0.267 0.0533 5.02 0.000000814
## 2 country_other 0.952 0.190 5.00 0.000000898
## 3 country_Spain 0.742 0.214 3.48 0.000571
## 4 country_Germany 0.803 0.257 3.13 0.00191
## 5 articles_no_X32 0.747 0.300 2.49 0.0132
## 6 date 0.000940 0.000448 2.10 0.0364
## 7 (Intercept) -14.5 8.15 -1.77 0.0770
## 8 articles_no_X5 0.397 0.273 1.45 0.147
## 9 articles_no_X6 0.365 0.280 1.30 0.193
## 10 articles_no_other 0.354 0.278 1.27 0.205
## 11 country_Hungary 0.205 0.277 0.740 0.460
## 12 country_Bulgaria 0.0706 0.273 0.258 0.796
The linear regression model indicates that total_articles (multiple articles violated at the same time) increases the fine amount, which is expected, since fines for multiple violations are likely added together. Countries with few recorded fines (pooled into country_other) tend to receive higher fines, and Spain and Germany handed out higher fines than the other countries. Among the articles, only No. 32 is associated with higher fines on average.
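For a sense of overall fit, we can pull out the underlying lm object and glance at it (a quick check; pluck("fit") extracts the engine fit so broom's glance() method for lm applies):
gdpr_fit %>%
  pull_workflow_fit() %>%
  pluck("fit") %>% ## the underlying lm object
  glance() ## r.squared, AIC, and other fit statistics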
Prediction on some hypothetical data
new_data <- crossing(
  country = c("Germany", "Spain", "United Kingdom"),
  articles_no = c(32, 6) ## crossing() uses the unique values, so no repetition needed
) %>%
  mutate(
    articles_no = as_factor(articles_no),
    total_articles = 1,
    date = as.Date("2021-06-01"),
    id = row_number()
  )
Get the predicted fines and their confidence intervals
prediction <- predict(gdpr_fit, new_data = new_data)
prediction_ci <- predict(gdpr_fit, new_data = new_data, type = "conf_int")
Plot the results
new_data %>%
  bind_cols(prediction, prediction_ci) %>%
  ggplot(aes(x = articles_no,
             y = 10 ^ .pred, ## back-transform from log10
             colour = country)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = 10 ^ .pred_lower,
                    ymax = 10 ^ .pred_upper)) +
  facet_wrap(~ country) +
  scale_y_log10(labels = dollar_format(prefix = "€"), n.breaks = 5) +
  labs(x = "Article No.",
       y = "Fines (log scale)",
       title = "Predicted Fine Amount") +
  theme(legend.position = "none")