Load and clean

library(tidyverse)  ## readr, dplyr, tidyr, purrr, stringr, forcats, ggplot2
library(lubridate)  ## mdy()
library(scales)     ## dollar_format()
library(tidymodels) ## recipes, workflows, parsnip, broom

gdpr <- read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv')
gdpr_tidy <- gdpr %>%
  transmute(id,
            country = name,
            price, 
            date = mdy(date),
            article_violated, ## keep the raw text for reference
            articles = str_extract_all(article_violated, "Art. [:digit:]+|Art.[:digit:]+")) %>% ## match both "Art. 5" and "Art.5" style labels
  mutate(total_articles = map_int(articles, length)) %>% ## number of articles cited per violation
  unnest(articles) %>% 
  select(-article_violated)

Use summary() to check the extracted articles column. The labels are inconsistent ("Art. 5", "Art.14", "Art 6"), so next we pull out just the article number.

summary(as_factor(gdpr_tidy$articles))
## Art. 28 Art. 12 Art. 13  Art. 5  Art. 6 Art. 31 Art. 32 Art. 33 Art. 25 Art. 21 
##       3       9      19     114      93       2      61       9       8       9 
## Art. 15 Art. 17 Art. 37  Art. 9 Art. 35 Art. 36 Art. 14  Art. 7 Art. 34  Art. 4 
##      17       7       3       7       1       1       4       5       3       1 
## Art. 18  Art.14 Art. 83 Art. 58   Art 6  Art 58  Art 83 
##       1       3       1       6       1       1       1
gdpr_tidy_2 <- gdpr_tidy %>% 
  mutate(articles_no = str_extract_all(articles, "[:digit:]+"), ## get article number and convert to factor
         country = as_factor(country)) %>% 
  select(-articles) %>% 
  unnest(articles_no) %>% 
  mutate(articles_no = as_factor(articles_no)) 

summary(gdpr_tidy_2)
##        id                   country        price               date           
##  Min.   :  1.00   Spain         : 90   Min.   :       0   Min.   :1970-01-01  
##  1st Qu.: 53.25   Romania       : 47   1st Qu.:    3000   1st Qu.:2019-04-04  
##  Median :120.00   Germany       : 38   Median :   12000   Median :2019-10-04  
##  Mean   :123.09   Bulgaria      : 29   Mean   : 1116532   Mean   :2017-03-28  
##  3rd Qu.:192.00   Hungary       : 29   3rd Qu.:   60000   3rd Qu.:2019-12-16  
##  Max.   :250.00   Czech Republic: 17   Max.   :50000000   Max.   :2020-03-25  
##                   (Other)       :140                                          
##  total_articles   articles_no 
##  Min.   :1.000   5      :114  
##  1st Qu.:1.000   6      : 94  
##  Median :2.000   32     : 61  
##  Mean   :2.087   13     : 19  
##  3rd Qu.:3.000   15     : 17  
##  Max.   :5.000   12     :  9  
##                  (Other): 76

Some violations have a missing date, which shows up as 1970-01-01 (the Unix epoch). That cannot be correct since the GDPR only came into force in 2018, so we will filter out those early dates.
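
A quick count shows how many rows carry that placeholder date:

gdpr_tidy_2 %>% 
  count(missing_date = date == as.Date("1970-01-01"))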

Visual plots

Looking at countries, Spain has by far the most violations, followed by Romania at a distant second. Most fines (price) fall between €3,000 and €60,000, with a median of €12,000; some cases involve no fine at all, while the largest fine issued is €50 million.
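
These figures come from the summary above; the country counts can also be checked directly:

gdpr_tidy_2 %>% 
  count(country, sort = TRUE) %>% 
  head()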

ggplot(gdpr_tidy_2, aes(x = price + 1)) + ## add 1 so zero fines survive the log scale
  geom_histogram(color = "darkblue", fill = "lightblue") +
  scale_x_log10(labels = dollar_format(prefix = "€")) +
  labs(x = "Fines Amount",
       y = "Frequency",
       title = "Distribution of GDPR Fines (€)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

First we filter out the placeholder epoch dates, then plot the fines over time.

gdpr_tidy_3 <- gdpr_tidy_2 %>%
  filter(date > as.Date("1970-01-01")) ## drop the placeholder epoch dates
gdpr_tidy_3 %>% 
  ggplot(aes(x = date, y = price + 1)) +
  geom_point(alpha = 0.5, colour = "tomato1") +
  geom_smooth(se = FALSE) +
    scale_y_log10(labels = dollar_format(prefix = "€"), n.breaks = 7) +
  labs(x = "",
       y = "Fines (log scale)",
       title = "Fines Amount by Date")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Since the GDPR came into force, 2019 saw an increasing number of violations; the initial gap is probably due to the time investigations take. Although the average fine amount does not rise over time, authorities are handing out higher fines for the serious violations.
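
A count of violations per year backs this up (distinct() collapses the unnested article rows back to one row per case):

gdpr_tidy_3 %>% 
  distinct(id, date) %>% 
  count(year = year(date))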

Let’s have a look at the distribution of fines across the articles.

gdpr_tidy_3 %>%
  mutate(articles_no = fct_reorder(articles_no, price)) %>%
  ggplot(aes(x = articles_no, y = price + 1)) +
  geom_jitter(alpha = 0.5, colour = "tomato1") +
  geom_boxplot(alpha = 0.2,
               fill = "darkblue",
               outlier.colour = "NA") +
  scale_y_log10(labels = dollar_format(prefix = "€"), n.breaks = 7) +
  labs(x = "Article No.",
       y = "Fines (log scale)",
       title = "Fines Amount by Article No.")

As noted before, the median fine for most articles is below €100k, and articles 5 and 6 are the most common grounds for fines. Article 14 appears only a few times, but its fines run higher than those of the other articles.
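
The per-article counts and median fines make this concrete:

gdpr_tidy_3 %>% 
  group_by(articles_no) %>% 
  summarise(n = n(), median_fine = median(price)) %>% 
  arrange(desc(median_fine))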

We can draw the same plot by country.

gdpr_tidy_3 %>%
  mutate(country = fct_reorder(country, price)) %>%
  ggplot(aes(x = country, y = price + 1)) +
  geom_jitter(alpha = 0.5, colour = "tomato1") +
  geom_boxplot(alpha = 0.2,
               fill = "darkblue",
               outlier.colour = "NA") +
  scale_y_log10(labels = dollar_format(prefix = "€"), n.breaks = 7) +
  labs(x = "",
       y = "Fines (log scale)",
       title = "Fines Amount by Country") +
  theme(axis.text.x = element_text(angle = 90))

Different countries clearly issue different fine amounts: Italy and France have handed out the highest fines, while the high-count countries such as Spain and Romania stay within a narrower range.
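
A per-country summary tells the same story:

gdpr_tidy_3 %>% 
  group_by(country) %>% 
  summarise(n = n(), median_fine = median(price), max_fine = max(price)) %>% 
  arrange(desc(max_fine)) %>% 
  head()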

Modelling

We will log-transform the fines and expand the factors into dummy variables for modelling.

Set up the tidymodels workflow

gdpr_rec <- recipe(price ~ ., data = gdpr_tidy_3) %>%
  update_role(id, new_role = "id") %>% ## so not used in modelling
  step_log(price,
           base = 10,
           offset = 1,
           skip = TRUE) %>% ## log transform
  step_other(country, articles_no) %>% ## group uncommon level into "other" at 5% threshold
  step_dummy(all_nominal()) %>%
  step_nzv(all_predictors())

gdpr_prep <- prep(gdpr_rec)
gdpr_wf <- workflow() %>%
  add_recipe(gdpr_rec) %>%
  add_model(linear_reg() %>%
              set_engine("lm"))
gdpr_fit <- gdpr_wf %>%
  fit(data = gdpr_tidy_3)
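
Before interpreting the coefficients, it can help to peek at what the recipe actually feeds the model; juice() on the prepped recipe returns the processed training set (log price, pooled factor levels, dummy columns):

juice(gdpr_prep) %>% 
  glimpse()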

Results

gdpr_fit %>% 
  pull_workflow_fit() %>%
  tidy() %>% 
  arrange(p.value)
## # A tibble: 12 x 5
##    term                estimate std.error statistic     p.value
##    <chr>                  <dbl>     <dbl>     <dbl>       <dbl>
##  1 total_articles      0.267     0.0533       5.02  0.000000814
##  2 country_other       0.952     0.190        5.00  0.000000898
##  3 country_Spain       0.742     0.214        3.48  0.000571   
##  4 country_Germany     0.803     0.257        3.13  0.00191    
##  5 articles_no_X32     0.747     0.300        2.49  0.0132     
##  6 date                0.000940  0.000448     2.10  0.0364     
##  7 (Intercept)       -14.5       8.15        -1.77  0.0770     
##  8 articles_no_X5      0.397     0.273        1.45  0.147      
##  9 articles_no_X6      0.365     0.280        1.30  0.193      
## 10 articles_no_other   0.354     0.278        1.27  0.205      
## 11 country_Hungary     0.205     0.277        0.740 0.460      
## 12 country_Bulgaria    0.0706    0.273        0.258 0.796

The linear regression model indicates that total_articles (multiple articles violated at the same time) increases the fine amount, which is expected since fines for each article are likely added together. Countries with few recorded fines (pooled into country_other) tend to issue higher fines, and Spain and Germany issue higher fines than the baseline country. Among the articles, only No. 32 is associated with significantly higher fines on average.
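
As a quick check on overall fit, glance() from broom summarises the underlying lm object (pluck() is from purrr):

gdpr_fit %>% 
  pull_workflow_fit() %>% 
  pluck("fit") %>% ## the underlying lm object
  glance()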

Prediction on some assumed data

new_data <- crossing(
  country = c("Germany", "Spain", "United Kingdom"),
  articles_no = c(32, 6) ## crossing() keeps unique values and forms all combinations
) %>%
  mutate(
    articles_no = as_factor(articles_no),
    total_articles = 1,
    date = as.Date("2021-06-01"),
    id = row_number()
  )

Get the predicted fines and their confidence intervals

prediction <- predict(gdpr_fit,
                      new_data = new_data)
prediction_ci <- predict(gdpr_fit,
                         new_data = new_data,
                         type = "conf_int")

Plot the results

new_data %>%
  bind_cols(prediction, prediction_ci) %>%
  ggplot(aes(
    x = articles_no,
    y = 10 ^ .pred, ## back-transform from the log10 scale
    colour = country
  )) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = 10 ^ .pred_lower,
                    ymax = 10 ^ .pred_upper)) +
  facet_wrap( ~ country) +
  scale_y_log10(labels = dollar_format(prefix = "€"), n.breaks = 5) +
  labs(x = "Article No.",
       y = "Fines (log scale)",
       title = "Predicted Fine Amount") +
  theme(legend.position = "none")