# This is pre-processing you can ignore
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)
library(rsample)
data <- read_csv("https://github.com/diagram-chasing/censor-board-cuts/raw/refs/heads/master/data/data.csv",
col_types = cols(.default = "c")) %>%
mutate(
cert_date = as.Date(cert_date),
total_modified_time_secs = as.numeric(total_modified_time_secs),
deleted_secs = as.numeric(deleted_secs),
replaced_secs = as.numeric(replaced_secs),
inserted_secs = as.numeric(inserted_secs)
) %>%
mutate(
office = str_split(certifier, ",") %>%
map_chr(last) %>%
str_trim()
) %>%
separate_rows(ai_content_types, sep = "\\|") %>%
mutate(ai_content_types = str_trim(ai_content_types))%>%
mutate(
language = case_when(
language == "Oriya" ~ "Odia",
language == "Gujrati" ~ "Gujarati",
language == "Chhatisgarhi" ~ "Chhattisgarhi",
language == "Hariyanvi" ~ "Haryanvi",
language == "Hindi Dub" ~ "Hindi Dubbed",
TRUE ~ language
)
) %>%
filter(!is.na(language))
top_languages <- data %>%
filter(!is.na(language), language != "") %>%
distinct(certificate_id, language) %>%
count(language) %>%
slice_max(order_by = n, n = 15) %>%
pull(language)
Percentage of action
types
data %>%
filter(!is.na(ai_action)) %>%
count(ai_action, name = "count", sort = TRUE) %>%
mutate(`Percentage (%)` = count / sum(count) * 100) %>%
rename(`Action Type` = ai_action, Count = count) %>%
kable(digits = 1)
deletion |
45443 |
43.7 |
audio_modification |
22247 |
21.4 |
insertion |
15071 |
14.5 |
visual_modification |
8326 |
8.0 |
replacement |
6609 |
6.4 |
text_modification |
5740 |
5.5 |
content_overlay |
439 |
0.4 |
Percentage of content
altered by type
data %>%
filter(!is.na(ai_media_element)) %>%
count(ai_media_element, name = "count", sort = TRUE) %>%
mutate(`Percentage (%)` = count / sum(count) * 100) %>%
rename(`Media Type` = ai_media_element, Count = count) %>%
kable(digits = 1)
visual_scene |
47869 |
46.1 |
text_dialogue |
35207 |
33.9 |
metadata |
16034 |
15.4 |
music |
4211 |
4.1 |
other |
554 |
0.5 |
Share of ratings by
language
ratings_by_language <- data %>%
mutate(rating_group = case_when(
rating %in% c("UA", "UA 7+", "UA 13+", "UA 16+") ~ "General Audience (U/UA)",
rating == "A" ~ "Adults Only (A)",
rating == "U" ~ "Unrestricted (U)",
rating == "S" ~ "S",
TRUE ~ "NA / Other"
)) %>%
filter(language %in% top_languages) %>%
distinct(certificate_id, language, rating_group) %>%
count(language, rating_group) %>%
group_by(language) %>%
mutate(percentage = n / sum(n)) %>%
ungroup() %>%
select(-n) %>%
pivot_wider(names_from = rating_group, values_from = percentage, values_fill = 0) %>%
mutate(across(where(is.numeric), ~ .x * 100))
ratings_by_language%>%
kable(digits = 1)
Bengali |
7.2 |
74.7 |
18.1 |
0.0 |
0 |
Bhojpuri |
9.0 |
85.1 |
5.9 |
0.0 |
0 |
Chhattisgarhi |
3.7 |
72.0 |
24.4 |
0.0 |
0 |
English |
16.1 |
72.3 |
11.7 |
0.0 |
0 |
Gujarati |
2.2 |
71.9 |
26.0 |
0.0 |
0 |
Hindi |
9.5 |
78.1 |
12.2 |
0.3 |
0 |
Hindustani |
0.9 |
93.0 |
6.0 |
0.0 |
0 |
Kannada |
11.2 |
71.3 |
17.5 |
0.0 |
0 |
Malayalam |
4.8 |
60.1 |
34.8 |
0.3 |
0 |
Marathi |
6.9 |
73.8 |
19.3 |
0.0 |
0 |
Odia |
1.1 |
56.8 |
42.0 |
0.0 |
0 |
Punjabi |
5.6 |
69.4 |
25.0 |
0.0 |
0 |
Tamil |
6.6 |
67.7 |
25.4 |
0.4 |
0 |
Telugu |
12.4 |
74.7 |
12.9 |
0.0 |
0 |
Urdu |
1.4 |
91.3 |
7.2 |
0.0 |
0 |
Duration modified by
language
Total time modified by language:
In your SQL code, you grouped it by rating which I have not done; I
have just calculated by grouping movies by language. But the values are
comparable.
movie_durations <- data %>%
filter(!is.na(language), language != "") %>%
distinct(certificate_id, language, duration_secs) %>%
mutate(duration_secs = as.numeric(duration_secs)) %>%
group_by(language) %>%
summarise(total_movie_secs = sum(duration_secs, na.rm = TRUE))
modified_durations <- data %>%
mutate(total_modified_time_secs = as.numeric(total_modified_time_secs)) %>%
filter(!is.na(language), language != "") %>%
group_by(language) %>%
summarise(total_modified_secs = sum(total_modified_time_secs, na.rm = TRUE))
language_summary <- movie_durations %>%
full_join(modified_durations, by = "language") %>%
mutate(
`Total Movie Duration (Hours)` = total_movie_secs / 3600,
`Total Modified Duration (Hours)` = total_modified_secs / 3600,
`Percent Modified (%)` = (total_modified_secs / total_movie_secs) * 100
) %>%
select(
Language = language,
`Total Movie Duration (Hours)`,
`Total Modified Duration (Hours)`,
`Percent Modified (%)`
) %>%
filter(`Language` %in% top_languages) %>%
arrange(desc(`Total Movie Duration (Hours)`))
language_summary %>%
kable(digits = 1)
Hindi |
8351.4 |
192.6 |
2.3 |
Tamil |
5534.6 |
63.3 |
1.1 |
Telugu |
5405.3 |
55.8 |
1.0 |
Kannada |
4223.3 |
52.9 |
1.3 |
English |
2876.2 |
63.9 |
2.2 |
Malayalam |
2852.5 |
53.2 |
1.9 |
Bhojpuri |
2358.2 |
84.1 |
3.6 |
Marathi |
1520.9 |
26.3 |
1.7 |
Bengali |
1033.8 |
27.9 |
2.7 |
Gujarati |
775.1 |
4.9 |
0.6 |
Odia |
557.9 |
11.9 |
2.1 |
Punjabi |
548.4 |
5.3 |
1.0 |
Hindustani |
471.8 |
64.5 |
13.7 |
Chhattisgarhi |
191.1 |
0.8 |
0.4 |
Urdu |
143.4 |
17.6 |
12.3 |
Average time modified by language:
movie_modifications <- data %>%
filter(!is.na(language), language != "", total_modified_time_secs > 0) %>%
group_by(certificate_id, language) %>%
summarize(
total_duration_modified = sum(total_modified_time_secs, na.rm = TRUE),
.groups = "drop"
)
top_languages <- movie_modifications %>%
count(language) %>%
slice_max(order_by = n, n = 10) %>%
pull(language)
movie_modifications %>%
filter(language %in% top_languages) %>%
bootstraps(times = 2000) %>%
mutate(movie_data = map(splits, analysis)) %>%
unnest(movie_data) %>%
group_by(language, id) %>%
summarize(avg_duration = mean(total_duration_modified), .groups = "drop") %>%
group_by(language) %>%
summarize(
mean = mean(avg_duration),
conf.low = quantile(avg_duration, 0.025),
conf.high = quantile(avg_duration, 0.975)
) %>%
mutate(across(c(mean, conf.low, conf.high), ~ .x / 60)) %>%
arrange(desc(mean))%>%
kable(digits = 1)
Hindustani |
18.0 |
16.9 |
19.1 |
Bhojpuri |
6.5 |
5.9 |
7.1 |
Bengali |
5.4 |
4.5 |
6.2 |
Malayalam |
4.0 |
3.5 |
4.6 |
Marathi |
3.8 |
3.1 |
4.4 |
Hindi |
3.7 |
3.5 |
3.9 |
Kannada |
3.4 |
1.8 |
6.2 |
English |
2.6 |
2.5 |
2.8 |
Telugu |
2.6 |
2.4 |
2.9 |
Tamil |
2.3 |
2.1 |
2.5 |
Avg modifications per
film (by language)
I generally avoid saying cuts because these could just be visual
disclaimers being added and so on, not necessarily a
deletion/replacement. Also mapped with confidence intervals based on
this example: https://github.com/dgrtwo/data-screencasts/blob/master/bird-collisions.Rmd
avg_cuts_by_lang <- data %>%
filter(!is.na(language), language != "") %>%
group_by(language) %>%
summarise(
total_movies = n_distinct(certificate_id),
total_cuts = n(),
.groups = "drop"
) %>%
filter(total_movies > 50) %>%
mutate(avg_cuts = total_cuts / total_movies) %>%
arrange(desc(avg_cuts))
avg_cuts_by_lang %>%
kable(digits = 1)
English |
1687 |
16446 |
9.7 |
Tamil |
2671 |
16309 |
6.1 |
Bhojpuri |
1031 |
6240 |
6.1 |
Telugu |
2558 |
14745 |
5.8 |
Bengali |
498 |
2714 |
5.4 |
Hindustani |
215 |
1078 |
5.0 |
Hindi |
4178 |
20915 |
5.0 |
Kannada |
1975 |
9886 |
5.0 |
Marathi |
737 |
3584 |
4.9 |
Urdu |
69 |
325 |
4.7 |
Malayalam |
1338 |
5683 |
4.2 |
Punjabi |
268 |
1084 |
4.0 |
Gujarati |
366 |
1413 |
3.9 |
Chhattisgarhi |
82 |
315 |
3.8 |
Odia |
264 |
972 |
3.7 |
Maybe graphing these with confidence intervals? Which ones have the
highest spread?
library(tidyverse)
library(rsample)
set.seed(2025)
movie_cuts <- data %>%
filter(!is.na(language), language != "") %>%
count(certificate_id, language, name = "n_cuts")
top_languages <- movie_cuts %>%
count(language) %>%
slice_max(order_by = n, n = 15) %>%
pull(language)
movie_cuts %>%
filter(language %in% top_languages) %>%
bootstraps(times = 2000) %>%
mutate(movie_data = map(splits, analysis)) %>%
unnest(movie_data) %>%
group_by(language, id) %>%
summarize(avg_cuts = mean(n_cuts), .groups = "drop") %>%
group_by(language) %>%
summarize(
mean = mean(avg_cuts),
conf.low = quantile(avg_cuts, 0.025),
conf.high = quantile(avg_cuts, 0.975)
) %>%
mutate(language = fct_reorder(language, mean)) %>%
ggplot(aes(x = mean, y = language)) +
geom_vline(
xintercept = mean(movie_cuts$n_cuts),
linetype = "dashed",
color = "gray50"
) +
geom_errorbarh(
aes(xmin = conf.low, xmax = conf.high),
height = 0.2,
color = "gray70"
) +
geom_point(
aes(color = mean > mean(movie_cuts$n_cuts)),
size = 3
) +
scale_color_manual(guide = "none", values = c("FALSE" = "#0072B2", "TRUE" = "#D55E00")) +
labs(
title = "Which Languages Receive More Censor Cuts?",
subtitle = "95% confidence intervals for the average number of cuts per film.",
x = "Average Cuts per Movie",
y = "Language"
) +
theme_minimal(base_family = "sans") +
theme(
panel.grid.major.y = element_blank()
)

Languages certified by
each office
Just curious about which offices modify which languages
valid_offices <- c("Mumbai", "Kolkata", "Hyderabad", "Guwahati", "Delhi",
"Cuttack", "Chennai", "Bangalore", "Thiruvananthpuram")
top_n_languages <- 15
data %>%
mutate(
office = str_trim(office),
office = str_to_title(office)
) %>%
distinct(id, language, office) %>%
filter(office %in% valid_offices, !is.na(language), language != "") %>%
count(office, language, name = "count") %>%
mutate(language = fct_lump_n(language, n = top_n_languages, w = count)) %>%
filter(language != "Other") %>%
mutate(
office = fct_reorder(office, count, .fun = sum),
language = fct_reorder(language, count, .fun = sum, .desc = TRUE)
) %>%
ggplot(aes(x = language, y = office, fill = count)) +
geom_tile(color = "gray20", linewidth = 0.4) +
geom_text(
aes(label = count, color = count > 1200),
size = 2.75,
fontface = "bold"
) +
scale_fill_viridis_c(
option = "magma",
name = "Movie Count",
direction = -1
) +
scale_color_manual(guide = "none", values = c("FALSE" = "black", "TRUE" = "white")) +
labs(
title = paste("Frequency of Film Language by Certifying Office"),
x = "",
y = ""
) +
theme_minimal(base_family = "sans") +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
plot.subtitle = element_text(hjust = 0.5, size = 12, color = "gray40"),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
panel.grid = element_blank(),
legend.position = "none"
)

---
title: "CBFC Watch Summaries"
output:
  html_document:
    theme: readable
    highlight: tango
    toc: true
    toc_depth: 3
    toc_float:
      collapsed: false
      smooth_scroll: true
    code_folding: show
    code_download: true
    df_print: paged
    number_sections: true
    fig_width: 10
    fig_height: 6
    fig_caption: true
    self_contained: true
    keep_md: false
editor_options:
  chunk_output_type: console
---

```{r}

# This is pre-processing you can ignore
library(tidyverse)
library(knitr)
library(rsample)
data <- read_csv("https://github.com/diagram-chasing/censor-board-cuts/raw/refs/heads/master/data/data.csv", 
                 col_types = cols(.default = "c")) %>%
  mutate(
    cert_date = as.Date(cert_date),
    total_modified_time_secs = as.numeric(total_modified_time_secs),
    deleted_secs = as.numeric(deleted_secs),
    replaced_secs = as.numeric(replaced_secs),
    inserted_secs = as.numeric(inserted_secs)
  ) %>%
  mutate(
    office = str_split(certifier, ",") %>%
      map_chr(last) %>%
      str_trim()
  ) %>%
  separate_rows(ai_content_types, sep = "\\|") %>%
  mutate(ai_content_types = str_trim(ai_content_types))%>%
  mutate(
    language = case_when(
      language == "Oriya" ~ "Odia",
      language == "Gujrati" ~ "Gujarati",
      language == "Chhatisgarhi" ~ "Chhattisgarhi",
      language == "Hariyanvi" ~ "Haryanvi",
      language == "Hindi Dub" ~ "Hindi Dubbed",
      TRUE ~ language
    )
  ) %>%
  filter(!is.na(language))

top_languages <- data %>%
  filter(!is.na(language), language != "") %>%
  distinct(certificate_id, language) %>%
  count(language) %>%
  slice_max(order_by = n, n = 15) %>%
  pull(language)
```

## Percentage of action types

```{r}
data %>%
  filter(!is.na(ai_action)) %>%
  count(ai_action, name = "count", sort = TRUE) %>%
  mutate(`Percentage (%)` = count / sum(count) * 100) %>%
  rename(`Action Type` = ai_action, Count = count) %>%
  kable(digits = 1)
```

## Percentage of content altered by type
```{r}
data %>%
  filter(!is.na(ai_media_element)) %>%
  count(ai_media_element, name = "count", sort = TRUE) %>%
  mutate(`Percentage (%)` = count / sum(count) * 100) %>%
  rename(`Media Type` = ai_media_element, Count = count) %>%
  kable(digits = 1)
```


## Share of ratings by language

```{r}
ratings_by_language <- data %>%
  mutate(rating_group = case_when(
    rating %in% c("UA", "UA 7+", "UA 13+", "UA 16+") ~ "General Audience (U/UA)",
    rating == "A"                                         ~ "Adults Only (A)",
    rating == "U"                                         ~ "Unrestricted (U)",
    rating == "S"                                         ~ "S",
    TRUE                                                  ~ "NA / Other"
  )) %>% 
 filter(language %in% top_languages) %>%
  distinct(certificate_id, language, rating_group) %>%
  count(language, rating_group) %>%
  group_by(language) %>%
  mutate(percentage = n / sum(n)) %>%
  ungroup() %>%
  select(-n) %>%
  pivot_wider(names_from = rating_group, values_from = percentage, values_fill = 0) %>% 
  mutate(across(where(is.numeric), ~ .x * 100))

ratings_by_language%>%
  kable(digits = 1)
```


## Duration modified by language

Total time modified by language:

In your SQL code, you grouped it by rating which I have not done; I have just calculated by grouping movies by language. But the values are comparable. 

```{r}
movie_durations <- data %>%
  filter(!is.na(language), language != "") %>%
  distinct(certificate_id, language, duration_secs) %>%
  mutate(duration_secs = as.numeric(duration_secs)) %>%
  group_by(language) %>%
  summarise(total_movie_secs = sum(duration_secs, na.rm = TRUE))

modified_durations <- data %>%
  mutate(total_modified_time_secs = as.numeric(total_modified_time_secs)) %>%
  filter(!is.na(language), language != "") %>%
  group_by(language) %>%
  summarise(total_modified_secs = sum(total_modified_time_secs, na.rm = TRUE))

language_summary <- movie_durations %>%
  full_join(modified_durations, by = "language") %>%
  mutate(
    `Total Movie Duration (Hours)` = total_movie_secs / 3600,
    `Total Modified Duration (Hours)` = total_modified_secs / 3600,
    `Percent Modified (%)` = (total_modified_secs / total_movie_secs) * 100
  ) %>%
  select(
    Language = language,
    `Total Movie Duration (Hours)`,
    `Total Modified Duration (Hours)`,
    `Percent Modified (%)`
  ) %>%
  filter(`Language` %in% top_languages) %>% 
  arrange(desc(`Total Movie Duration (Hours)`))

language_summary %>%
  kable(digits = 1)
```


Average time modified by language:
```{r}
movie_modifications <- data %>%
    filter(!is.na(language), language != "", total_modified_time_secs > 0) %>%
    group_by(certificate_id, language) %>%
    summarize(
        total_duration_modified = sum(total_modified_time_secs, na.rm = TRUE),
        .groups = "drop"
    )
top_languages <- movie_modifications %>%
    count(language) %>%
    slice_max(order_by = n, n = 10) %>%
    pull(language)


movie_modifications %>%
    filter(language %in% top_languages) %>%
    bootstraps(times = 2000) %>%
    mutate(movie_data = map(splits, analysis)) %>%
    unnest(movie_data) %>%
    group_by(language, id) %>%
    summarize(avg_duration = mean(total_duration_modified), .groups = "drop") %>%
    group_by(language) %>%
    summarize(
        mean = mean(avg_duration),
        conf.low = quantile(avg_duration, 0.025),
        conf.high = quantile(avg_duration, 0.975)
    ) %>%
    mutate(across(c(mean, conf.low, conf.high), ~ .x / 60)) %>%
    arrange(desc(mean))%>%
  kable(digits = 1)
```


## Avg modifications per film (by language)

I generally avoid saying cuts because these could just be visual disclaimers being added and so on, not necessarily a deletion/replacement. 
Also mapped with confidence intervals based on this example: https://github.com/dgrtwo/data-screencasts/blob/master/bird-collisions.Rmd

```{r}
avg_cuts_by_lang <- data %>%
  filter(!is.na(language), language != "") %>%
  group_by(language) %>%
  summarise(
    total_movies = n_distinct(certificate_id),
    total_cuts = n(),
    .groups = "drop"
  ) %>%
  filter(total_movies > 50) %>%
  mutate(avg_cuts = total_cuts / total_movies) %>%
  arrange(desc(avg_cuts))

avg_cuts_by_lang %>%
  kable(digits = 1)


```

Maybe graphing these with confidence intervals? Which ones have the highest spread? 

```{r}
library(tidyverse)
library(rsample)

set.seed(2025)

movie_cuts <- data %>%
  filter(!is.na(language), language != "") %>%
  count(certificate_id, language, name = "n_cuts")

top_languages <- movie_cuts %>%
  count(language) %>%
  slice_max(order_by = n, n = 15) %>%
  pull(language)

movie_cuts %>%
  filter(language %in% top_languages) %>%
  bootstraps(times = 2000) %>%
  mutate(movie_data = map(splits, analysis)) %>%
  unnest(movie_data) %>%
  group_by(language, id) %>%
  summarize(avg_cuts = mean(n_cuts), .groups = "drop") %>%
  group_by(language) %>%
  summarize(
    mean = mean(avg_cuts),
    conf.low = quantile(avg_cuts, 0.025),
    conf.high = quantile(avg_cuts, 0.975)
  ) %>%
  mutate(language = fct_reorder(language, mean)) %>%
  ggplot(aes(x = mean, y = language)) +
  geom_vline(
    xintercept = mean(movie_cuts$n_cuts),
    linetype = "dashed",
    color = "gray50"
  ) +
  geom_errorbarh(
    aes(xmin = conf.low, xmax = conf.high),
    height = 0.2,
    color = "gray70"
  ) +
  geom_point(
    aes(color = mean > mean(movie_cuts$n_cuts)),
    size = 3
  ) +
  scale_color_manual(guide = "none", values = c("FALSE" = "#0072B2", "TRUE" = "#D55E00")) +
  labs(
    title = "Which Languages Receive More Censor Cuts?",
    subtitle = "95% confidence intervals for the average number of cuts per film.",
    x = "Average Cuts per Movie",
    y = "Language"
  ) +
  theme_minimal(base_family = "sans") +
  theme(
    panel.grid.major.y = element_blank()
  )
```


## Languages certified by each office

Just curious about which offices modify which languages

```{r}
valid_offices <- c("Mumbai", "Kolkata", "Hyderabad", "Guwahati", "Delhi",
                   "Cuttack", "Chennai", "Bangalore", "Thiruvananthpuram")

top_n_languages <- 15 

data %>%
  mutate(
    office = str_trim(office),
    office = str_to_title(office)
  ) %>%
    distinct(id, language, office) %>%
    filter(office %in% valid_offices, !is.na(language), language != "") %>%
    count(office, language, name = "count") %>%
    mutate(language = fct_lump_n(language, n = top_n_languages, w = count)) %>%
    filter(language != "Other") %>% 
    mutate(
    office = fct_reorder(office, count, .fun = sum),
    language = fct_reorder(language, count, .fun = sum, .desc = TRUE)
  ) %>%
  
  ggplot(aes(x = language, y = office, fill = count)) +
  geom_tile(color = "gray20", linewidth = 0.4) +
  
  geom_text(
    aes(label = count, color = count > 1200), 
    size = 2.75,
    fontface = "bold"
  ) +
  scale_fill_viridis_c(
    option = "magma",
    name = "Movie Count",
    direction = -1
  ) +
  scale_color_manual(guide = "none", values = c("FALSE" = "black", "TRUE" = "white")) +
  labs(
    title = paste("Frequency of Film Language by Certifying Office"),
    x = "",
    y = ""
  ) +
  theme_minimal(base_family = "sans") +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
    plot.subtitle = element_text(hjust = 0.5, size = 12, color = "gray40"),
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    panel.grid = element_blank(),
    legend.position = "none"
  )
```

