# install.packages("readr") #For reading my CSV
# install.packages("dplyr") #For Data Cleaning and piping ( %>% )
# install.packages("ggplot2") #To do some graphs
# install.packages("XML") #To import XML file for web scrapping
# install.packages("xml2") #To manage XML data
# install.packages("tibble")
# install.packages("tidyverse")
# install.packages("rvest")
# install.packages("leaflet") #To create interactive map
# install.packages("rgdal") #To manage geojson data
# install.packages("ggalt") #To create more ggplot like geom_dumbbell
# install.packages("kableExtra") #To create some table with a HTML format
# install.packages("formattable") #To format my columns with color for example
# install.packages("wordcloud") #To create word clouds
# install.packages("tm") #for text mining
# install.packages("SnowballC") #for text mining
# install.packages("maps") #To load some data about the countries/cities
# install.packages("geosphere")
# install.packages("rnaturalearth") #To load geo data
# install.packages("ggridges") #To create Basic ridgeline plot
# install.packages("patchwork") #to create network Diagram
# install.packages("hrbrthemes")
# install.packages("circlize") #to create network Diagram
# install.packages("networkD3") #to create network Diagram
# install.packages("influential") #to create Sankey Diagram
# install.packages("igraph") #to create Sankey Diagram
# install.packages("oce") #to create Sankey Diagram
# install.packages("ggraph") #to create Sankey Diagram
# install.packages("devtools") #to add some external libraries
# install.packages("addTextLabels") #to add label on my maps
# install.packages("devtools")
# install_github("JosephCrispell/addTextLabels")
# devtools::install_github("wch/webshot")

library(readr) #For reading my CSV
library(dplyr) #For Data Cleaning and piping ( %>% )
library(ggplot2) #To do some graphs
library(XML) #To import XML file for web scrapping
library(xml2) #To manage XML data
library(tibble)
library(tidyverse)
library(rvest)
library(leaflet) #To create interactive map
library(rgdal) #To manage geojson data
library(ggalt) #To create more ggplot like geom_dumbbell
library(kableExtra) #To create some table with a HTML format
library(formattable) #To format my columns with color for example
library(wordcloud) #To create worldcloud
library(tm) #for text mining
library(SnowballC) #for text mining
library(maps) #To load some data about the countries/cities
library(geosphere)
library(rnaturalearth)
library(ggridges) #To create Basic ridgeline plot
library(patchwork)
library(hrbrthemes)
library(circlize)
library(networkD3)
library(influential) #to create Sankey Diagram
library(igraph) #to create Sankey Diagram
library(oce) #to create Sankey Diagram
library(ggraph) #to create Sankey Diagram
library(devtools) #to add some external libraries
library(addTextLabels)


#Graphic charter: the hex colours shared by every chart and table of this report
purple <- "#2C2C54"
pink <- "#A40E4C"
blue <- "#2E86AB"
yellow <- "#FF9C00"
lila <- "#E3DFFF"
brown <- "#C3979F"
grey <- "#BFBFBF"
white <- "#FFFFFF"
#Ordered palette built from the six accent colours (grey and white are not part of it)
allcols <- c(purple, pink, blue, yellow, lila, brown)
#Second palette holding the same six colours in the same order
allcols2 <- allcols

#Return the first `nbcol` colours of the report palette `allcols`.
#  nbcol: number of colours wanted (a single non-negative number).
#  Returns a character vector of hex codes of length `nbcol`; asking for more
#  colours than the palette holds pads the result with NA.
mypal <- function(nbcol){
  #seq_len() gives an empty index for nbcol = 0, whereas 1:nbcol would be
  #c(1, 0) and wrongly return the first colour
  allcols[seq_len(nbcol)]
}

#Load the two data sources used throughout the report
#Filming locations: a semicolon-delimited CSV file
Filming_loc <- read_delim(file = "lieux-de-tournage-a-paris.csv", delim = ";")
#Outlines of Paris' arrondissements (GeoJSON), displayed as polygons on a map
Arrondissements <- rgdal::readOGR(dsn = "arrondissements.geojson")


#Here you can find my css code to customize font size, color etc.

Introduction

Cinema, and especially French cinema, is an integral part of the French cultural heritage. Great French names have participated in the development of the cinema we know today, such as Léon Bouly who gave the name of “cinématographe” to the camera for which he filed a patent in 1892, or the Lumière brothers who played a major role in the history of cinema and photography.

For its part, Paris enjoys an international aura:

To better understand how Paris is the scene of many productions, I use a database from the opendata.paris.fr website to list the various filming locations.

Source of my database

Overview of my Database

#First and last filming year of the database, reused later in the chart subtitles
YearMin <- min(Filming_loc$`Année du tournage`)
YearMax <- max(Filming_loc$`Année du tournage`)

#Total number of entries, and number of entries that are feature films ("Long métrage")
Nrows <- nrow(Filming_loc)
Nrows_project <- Filming_loc %>% filter(`Type de tournage`=="Long métrage") %>% nrow()

#Occurrences of each type of project
Table <- table(Filming_loc$`Type de tournage`) %>%
  as.data.frame() %>%
  arrange(desc(Freq)) %>%  #most frequent type first
  `colnames<-`(c("Type", "Freq")) #rename the columns produced by as.data.frame()

#Bar chart of the number of entries per type of project
ggplot(Table, aes(x = Type, y = Freq))+ #x and y axis defined in the aesthetics
  geom_bar(stat = "summary", fun = "mean", fill=purple, width = 0.2)+ #one row per type, so the summary is Freq itself
  scale_x_discrete(limits=Table$Type)+ #keep the bars in the sorted order of the table
  ylim(0,max(6000, 1.1*max(Table$Freq)))+ #at least 6000, raised when a count is larger so that no bar is dropped
  geom_text(aes(label = Freq), #display Freq just above each bar
            size=3,
            hjust=0.5,
            vjust=-1,
            check_overlap = TRUE)+ #avoid overlapping labels
  theme(text=element_text(size=12), #font size 12
        panel.grid.major = element_blank(), #no grid, for a minimalist theme
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.position = "none", #no legend needed in this graph
        axis.line = element_line(colour = purple))+ 
  labs(title="Type of projects filmed in Paris", #title of the graph and of the axes
       subtitle=paste0("from ",YearMin," to ",YearMax),
       y="Frequency", x="Type")

My database lists shoots from 2016 to 2020 with over 8919 entries recorded.

Here are some columns that will be useful for my analysis:

  • Year of filming: from 2016 to 2020, here we can make an analysis over time.
  • Type of filming: 4 types of projects are represented here. The accompanying graph shows the volume of projects by type.
  • Title: The title of the film makes it possible to scrape additional data about the film, giving a better understanding of this environment
  • Director: will identify the most prolific
  • Production company: the production is in charge of all the logistics related to the film and in particular of booking the spaces proposed by the city of Paris. Even for a foreign film, to shoot in France, part of the production must be French.
  • Street and Postal Code: Not very useful in its current state unless accompanied by coordinates but may be useful for future labels on a map
  • Start and end date of shooting: will allow me to calculate the duration of a shoot and the seasonality.
  • Coordinates: Essential for the creation of a map

Incremental data for reinforcement

#Keep only the feature films ("Long métrage"): the heart of the analysis
Filming_loc <- Filming_loc %>% filter(`Type de tournage`=="Long métrage")
#Keep only the postal codes that START with "750". An unanchored match on "750"
#would also keep codes such as 77500, and which() drops missing postal codes
#instead of turning them into all-NA rows.
Filming_loc <- Filming_loc[which(startsWith(as.character(Filming_loc$`Code postal`), "750")),]

# #/!\ PLEASE RUN THIS CODE ONLY IF YOU WANT TO UPDATE THE FILM DATABASE (20 minutes) /!\
# 
# #I generate a second database to scrape information on films that have been shot in Paris
# Movies_DB <- Filming_loc %>% select(Titre,Réalisateur)
# Movies_DB <- unique(Movies_DB) %>% as.data.frame() #removal of duplicates
# 
# #I generate the Google search links that will allow me to obtain the Allociné links
# Movies_DB$Link <- paste0("https://qc.search.yahoo.com/search;_ylt=AwrJ7F8UlJtheRIAUQvTGAx.;_ylc=X1MDMjExNDcyMjAwMwRfcgMyBGZyA3lmcC10BGZyMgNzYi10b3AEZ3ByaWQDVy5tY3JFUEpTdENlYVguaFpqaUdxQQRuX3JzbHQDMARuX3N1Z2cDMARvcmlnaW4DcWMuc2VhcmNoLnlhaG9vLmNvbQRwb3MDMARwcXN0cgMEcHFzdHJsAzAEcXN0cmwDMjkEcXVlcnkDVm9sdCUyMHN0YXIlMjBtYWxnciVDMyVBOSUyMGx1aSUyMGFsbG9jaW4lQzMlQTkEdF9zdG1wAzE2Mzc1ODU5NTM-?p=",URLencode(paste(Movies_DB$Titre,"de",Movies_DB$Réalisateur)),"+allociné")
# #I encode the name of the film, the director and Allociné in UTF8 to match a URL format that I paste at the end of a standard Yahoo search: Yahoo being more permissive than Google, it is easier to scrape data.
# 
# #I create a loop allowing me to retrieve the Allociné link for each line in my database
# for (l in 1:length(Movies_DB$Link)){
#   Link2 <- read_html(Movies_DB$Link[l]) %>% html_nodes(".pb-4") %>% html_text()
#   #This function reads a web page and then I ask it to select the html element ".pb-4" and then I convert my string into characters
#   Link2 <- Link2[str_detect(Link2,"allo")] #I filter links that do not contain "allo
#   Movies_DB$Link2[l] <- Link2[1]
# }
# 
# #I clean up my database by removing invalid links and converting my links so that they are usable
# Movies_DB$Link2 <- paste0("https://",str_replace_all(Movies_DB$Link2," › ", "/"),".html")
# Movies_DB <- Movies_DB[Movies_DB$Link2!="NA.html",]
# Movies_DB <- select(Movies_DB,-Link)
# Movies_DB <- Movies_DB[str_detect(Movies_DB$Link,"https://www.allocine.fr/film/fichefilm_gen_cfilm="),]
# Movies_DB$Title2 <- NA
# colnames(Movies_DB) <- c("Title","Director","Link","Title2")
# 
# #I get more data and operate the same functions as before
# for (l in 1:length(Movies_DB$Link)){
#   website <- read_html(Movies_DB$Link[l])
#   Movies_DB$Title2[l] <- website %>% html_nodes(".titlebar-title-lg") %>% html_text()
#   EnglishTitle <- website %>% html_nodes(".meta-body-item") %>% html_text()
#   EnglishTitle <- EnglishTitle[str_detect(EnglishTitle,"Titre")]
#   EnglishTitle <- str_split(EnglishTitle,"\n")
#   EnglishTitle <- unlist(EnglishTitle)[3]
#   Movies_DB$`English Title`[l] <- if(is.null(EnglishTitle)) {Movies_DB$Title2[l]} else EnglishTitle
#   Director <- website %>% html_nodes(".meta-body-direction") %>% html_text()
#   Director <- str_split(Director,"\n")
#   Director <- unlist(Director)[3]
#   Movies_DB$Director[l] <- Director
#   Date <- website %>% html_nodes(".date") %>% html_text()
#   Date <- str_split(Date,"\n") %>% unlist()
#   Movies_DB$Date[l] <- if(is.null(Date[2])) {NA} else Date[2]
# }
# 
# months.english <- month.name
# month.french <- c("janvier","février","mars","avril","mai","juin","juillet","août","septembre","octobre","novembre","décembre")
# 
# #I get more data and operate the same functions as before
# for (l in 1:length(Movies_DB$Link)){
#   Movies_DB$Date[l] <- if(is.na(Movies_DB$Date[l])){NA}else{
#     format(as.Date(ISOdate(
#       as.numeric(unlist(str_split(Movies_DB$Date," ")[l])[3]),
#       match(unlist(str_split(Movies_DB$Date," ")[l])[2], month.french),
#       as.numeric(unlist(str_split(Movies_DB$Date," ")[l])[1]),tz="UTC")),"01/%m/%Y")}
# }
# 
# for (l in 1:length(Movies_DB$Link)){
#   website <- read_html(Movies_DB$Link[l])
#   Duration <- website %>% html_nodes(".meta-body-info") %>% html_text() %>% str_split("\n") %>% unlist()
#   Duration <- Duration[str_detect(Duration,"h ")] %>% str_split("h ") %>% unlist()
#   Duration[2] <- str_replace(Duration[2],"min","")
#   Movies_DB$Duration[l] <- as.numeric(Duration)[1]*60+as.numeric(Duration)[2]
# }
# 
# #Data cleaning
# Movies_DB <- Movies_DB[!is.na(Movies_DB$Duration),]
# 
# for (l in 1:length(Movies_DB$Link)){
#   website <- read_html(Movies_DB$Link[l])
#   Budget <- website %>% html_nodes(".item:nth-child(12)") %>% html_text() %>% str_split("\n") %>% unlist()
#   Movies_DB$Budget[l] <- if(sum(str_detect(Budget,"0"))==0){NA} else {Budget[str_detect(Budget,"0")]}
#   Summary <- website %>% html_nodes(".content-txt") %>% html_text() %>%str_remove_all("\n")
#   Movies_DB$Summary[l] <- Summary[1]
#   Genre <- website %>% html_nodes(".meta-body-info") %>% html_text() %>% str_split("\n") %>% unlist()
#   Genre <- Genre[Genre!=""]
#   Genre <- Genre[(length(Genre)-1):length(Genre)] %>% str_remove_all(",")
#   Movies_DB$Genre[l] <- paste0(Genre[1]," / ",Genre[2])
#   Actor <- website %>% html_nodes(".meta-body-actor") %>% html_text() %>% str_split("\n") %>% unlist()
#   Actor <- Actor[Actor!=""] %>% str_remove_all(",")
#   Actor <- Actor[Actor!="Avec"]
#   Movies_DB$Actor1[l] <- Actor[1]
#   Movies_DB$Actor2[l] <- Actor[2]
#   Movies_DB$Actor3[l] <- Actor[3]
#   Notes <- website %>% html_nodes(".rating-item") %>% html_text() %>% unlist()
#   Presse <- Notes[2] %>% str_split("\n") %>% unlist()
#   Presse <- Presse[4] %>% str_split(" ") %>% unlist()
#   Movies_DB$Grade[l] <- as.numeric(str_replace(Presse[1],",","."))
#   # Suggest <- website %>% html_nodes(".entity-card-simple") %>% html_text() %>% str_split("\n") %>% unlist()
#   # Suggest <- Suggest[Suggest!=""]
#   # Movies_DB$Suggest[l] <- paste(Suggest[1],"/",Suggest[2],"/",Suggest[3],"/",Suggest[4])
#   Movies_DB$Country[l] <- website %>% html_nodes(".nationality") %>% html_text()
# }
# 
# Movies_DB$Budget <- as.character(Movies_DB$Budget)
# Movies_DB$Budget <- substr(Movies_DB$Budget,1,nchar(Movies_DB)-2)
# Movies_DB$Budget <- as.numeric(str_remove_all(Movies_DB$Budget," "))
# Movies_DB$Genre <- Movies_DB$Genre %>% str_remove_all("/ / ")
# save(Movies_DB, file = "Movies_DB.RData")

#Load the film database written by the scraping step (save(Movies_DB, ...))
load(file = "Movies_DB.RData")

#Flag the filming entries whose title is present in the scraped film database
Filming_loc$Infos <- Filming_loc$Titre %in% Movies_DB$Title
#Length of each entry in days, counting both the first and the last day
Filming_loc$Days <- Filming_loc$`Date de fin`-Filming_loc$`Date de début`+1

#Titre x Days contingency table; its row sums give the number of entries per
#film, and the second table() counts how many films share each total
Table_days <- Filming_loc %>%
  select(Titre,Days) %>%
  table() %>%
  rowSums() %>%
  table() %>%
  as.data.frame() %>% 
  `colnames<-`(c("Days", "Freq"))

#as.data.frame() stores the totals as a factor: go through as.character() so
#that as.integer() returns the real values and not the factor level codes
Table_days$Days <- as.integer(as.character(Table_days$Days))

#Bar chart of the number of films for each value of Table_days$Days, with a trend line
ggplot(Table_days, aes(x = Days, y = Freq))+
  geom_bar(stat = "summary", fun = "mean", fill=pink)+
  ylim(0,100)+ #NOTE(review): ylim() drops any bar above 100 (also from the trend fit) - confirm this is intended
  geom_smooth(col = purple, #trend line fitted with a gaussian GLM using a log link
              se=FALSE,
              method = "glm",
              formula = y~x,
              method.args = list(family = gaussian(link = 'log')))+
  geom_text(aes(label = Freq),
            size=3,
            hjust=0.5,
            vjust=-1,
            check_overlap = TRUE)+
  theme(text=element_text(size=12),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.position = "none",
        axis.line = element_line(colour = purple))+
  labs(title="Number of films per number of filming days",
       subtitle=paste0("from ",YearMin," to ",YearMax),
       y="Frequency", x="Days of filming")

For this analysis, only the feature films are of interest. So we get a database with 5013 rows. Each line corresponds to a shooting project: several lines therefore correspond to a single film.

The objective here is to complete the database to retrieve information on each film. By reading web pages, we can retrieve relevant information for our analysis.

Once the database has been processed, we obtain 605 films for which we try to retrieve as much information as possible. Despite extensive research, the scraped database only contains 449 films. Of the entries in the original database, 76.3% had additional information, which also allows us to analyse the types of films shot in Paris.

The graph below represents the number of days of shooting on location in Paris for each film, proving that the shoots are very short even if this number corresponds to the total number of shooting days. This very short duration allows us to assume that there are many foreign productions that only have a few scenes in Paris, such as Inception, which only shot a few scenes in Paris but the rest of the production was done in England and the USA. This graph proves that we need to add to the data to understand behaviour.

Research questions

Why does Paris attract and continue to attract? Which districts are the most popular? What is its international resonance? What is the make-up of the Parisian film ecosystem? Which actors have set foot in the French capital? Is Paris the Proust’s dream for certain directors? Can we identify trends specific to this city?

Filming analysis

Over time

#Month number (1-12) of the start date of each entry
Filming_loc$Month <- format(Filming_loc$`Date de début`,"%m") %>% as.integer()

#Number of entries per starting month. The explicit levels 1:12 keep every
#month in the table (with a count of 0 if needed), so that the twelve
#month.name labels used below always line up with the right bars
Table_Month <- factor(Filming_loc$Month, levels = 1:12) %>%
  table() %>%
  as.data.frame() %>% 
  `colnames<-`(c("Month", "Freq"))

#Bar chart of the number of entries per month
ggplot(Table_Month, aes(x = Month, y = Freq))+
  geom_bar(stat = "summary", fun = "mean", fill=pink)+
  ylim(0,max(Table_Month$Freq)+100)+ #headroom for the labels above the bars
  scale_x_discrete(guide = guide_axis(n.dodge=2),
                   labels = month.name)+ #Month is a factor, hence a discrete x scale labelled with the month names
  geom_text(aes(label = Freq),
            size=3,
            hjust=0.5,
            vjust=-1,
            check_overlap = TRUE)+
  theme(text=element_text(size=12),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.position = "none",
        axis.line = element_line(colour = purple))+
  labs(title="Number of projects filmed per month",
       subtitle=paste0("from ",YearMin," to ",YearMax),
       y="Frequency", x="Months of filming")

Thanks to the graph below, we can see a certain seasonality in the number of projects filmed per month (between 2016 and 2020). Indeed, the months of October and November are the months with the most projects filmed. Then August and March. The months with the fewest projects are January, followed by May, April and June.

As all the scenes are shot in the streets of Paris, this seasonality can be justified by several factors:

  • The weather: January is the coldest month of the year, so filming can be more complicated. The opposite is not true, however, as August is one of the busiest months.

  • Activity: In August, the capital is emptied of its inhabitants and the overall activity of the city is slowed down, which allows more opportunities. The months of October and November are also good for filming as the streets are only active during rush hour.

  • Tourism: The filming seasons are slowed down in correlation with tourism as from April to June the tourist attraction takes over.

  • Release dates: a section will be dedicated to this subject but filming depends on release dates and production.

#Format every start date as yyyy-mm-01: the day is not needed in this analysis,
#so each date is moved to the first day of its month
#("%01" is not a valid conversion; the day is written as the literal "01")
Date_format <- "%Y-%m-01"
#Number of entries per starting month of each year
Table_Days <- Filming_loc$`Date de début` %>%
  format(Date_format) %>% 
  table() %>%
  as.data.frame() %>% 
  `colnames<-`(c("Days", "Freq"))

#Back to Date, plus the year as a separate column used for the fill colour
Table_Days$Days <- Table_Days$Days %>% as.Date()
Table_Days$Year <- format(Table_Days$Days,"%Y")

#Function generating X colours interpolated between three colours of the charter
colfunc <- colorRampPalette(c(purple,pink,blue))

#Graph by year and by month
ggplot(Table_Days, aes(x = Days, y=Freq)) +
  geom_col(aes(fill = Year))+
  scale_fill_manual(values=colfunc(length(unique(Table_Days$Year))))+ #one colour per year present in the data
  geom_smooth(col = yellow,
              se=FALSE,
              method = "glm",
              formula = y~x,
              method.args = list(family = gaussian(link = 'log')))+
  scale_x_date(date_breaks = "4 month",date_labels = "%b %y")+ #one tick every 4 months, labelled with abbreviated month and year
  theme(text=element_text(size=12),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.position = "bottom",
        legend.title = element_blank(),
        axis.text.x=element_text(angle = -45, hjust = 0), #rotate the x axis labels
        axis.line = element_line(colour = purple))+
  labs(title="Number of projects filmed per year and per month",
       subtitle=paste0("from ",YearMin," to ",YearMax),
       y="Frequency", x="Years and Months")

This second graph shows the number of projects filmed by month and by year, thanks to the addition of the notion of year, we can see that the seasonality remains similar from one year to the next (except 2020) and that the number of total projects filmed decreases over the years from an average of 145 projects per month in January 2016 to 50 projects filmed at the end of 2020. August 2016 was the month with the most projects filmed in Paris in any year (over 200). This decrease can be explained by several reasons:

  • The impact of Covid which has significantly slowed down the production of many films during 2020.

  • A potential decline in the attractiveness of Paris to the benefit of other regions

  • Increasing inflation, especially in the Paris region

  • An overall slowdown in French productions.

Through Paris



Paris is a city with a dense and diverse historical and architectural heritage. Composed of 20 arrondissements, each part of the city has its own specificities, buildings, history, etc. and is an unlimited source of inspiration. These three maps represent the filming ecosystem that animated Paris:

  • The map on the top left represents the number of shoots by district. As Paris is built in a spiral, we can see that the majority of filming is done on the outer ring, notably in the 18th arrondissement and the 19th which are not renowned for their heritage but for being the least expensive areas of Paris as they are more recent. Conversely, the center with arrondissements 1 to 4 where most of the iconic buildings of Paris such as the Louvre or Notre-Dame seem to be the least popular arrondissements for filming.

  • The map on the right illustrates the duration of filming in Paris. Even if most shoots last less than 2 days as explained before, we see here that only the 14th arrondissement stands out for its duration. We can assume that shooting a scene in the centre of Paris over several days can be complicated and expensive. The optimization of the schedule is also part of the production’s concerns, which is why the shoots are condensed into a few days to limit the loss of time and agents.

  • The map below shows the different locations of the filming in Paris, allowing us to see the plurality of the locations. Although my analysis focuses on Paris intra-muros, we can see that some scenes are shot outside of Paris, notably around the Bois de Vincennes, to capture moments of nature. Even if the duration and the number do not allow us to say that the hyper center of Paris is a popular place for filming, this map proves to us a certain attraction since each street has already been exploited for filming by different films (interactive map allowing to zoom in on a place and obtain the associated project)



A declining attraction

#Number of entries per postal code (rows) and per starting year (columns).
#table() does the counting in one pass, adapts to however many years the data
#holds, and ignores missing postal codes or dates.
Table_Postal_Year <- table(
  Filming_loc$`Code postal`,
  format(Filming_loc$`Date de début`,"%Y")) %>%
  as.data.frame.matrix()

#Postal code as a dedicated column (it is otherwise only held in the row names)
Table_Postal_Year$PC <- rownames(Table_Postal_Year)

#Comparison of 2019 against 2016: `2019` becomes the percentage change
#relative to 2016, and `2016` the zero baseline
Table_Postal_Year2 <- Table_Postal_Year %>% select(PC,`2019`,`2016`)
Table_Postal_Year2$`2019` <- Table_Postal_Year2$`2019`*100/Table_Postal_Year2$`2016`-100
#A district without any entry in 2016 has no defined change (division by zero)
Table_Postal_Year2$`2019`[!is.finite(Table_Postal_Year2$`2019`)] <- NA
Table_Postal_Year2$`2016` <- 0

#Largest increase first (districts without a defined change come last)
Table_Postal_Year2 <- Table_Postal_Year2[order(-Table_Postal_Year2$`2019`),]

#Mean change over all districts, drawn as a vertical line in the graph
v <- round(mean(Table_Postal_Year2$`2019`,na.rm = TRUE),1)

Table_Postal_Year_Graph <-
  ggplot(Table_Postal_Year2, aes(
    x = `2019`,
    y = factor(PC, levels = PC), #keep the districts in the sorted order of the table
    group = PC)) +
  geom_vline(xintercept = v, col = pink)+ #vertical line showing the mean
  geom_col(fill=purple)+
  geom_text(
    aes(label = round(`2019`, 0)),
    nudge_x = 3,
    nudge_y = 0,
    size = 3,
    color = white,
    check_overlap = TRUE) +
  theme(text=element_text(size=12),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.position = "right",
        legend.title = element_blank(),
        axis.line = element_line(colour = purple))+
  labs(title="Evolution of the volume of scenes shot according to the district",
       subtitle=paste0("between 2016 and 2019"),
       y=NULL, x="Percentage") #no title for the y axis
Table_Postal_Year_Graph

As observed in the graph illustrating the years, the attraction for the French capital is declining. On the left, we have materialized this decline by arrondissements and see a clear downward trend. This comparison is between 2016 and 2019 and proves a real disinterest in the Parisian scene. Only the 12th arrondissement enjoys a growing attraction with an increase of 50%. Conversely, all the other arrondissements have seen a 50.9% drop in their attractiveness on average.

However, as this drop is global, we cannot conclude that only certain arrondissements have lost popularity, even if the 16th has suffered a 77% drop. The same is true for geography, since the unpopular arrondissements are spread out over the whole city and do not allow groups to be formed according to cardinal points.

However, it is important to contextualise again. The absence of filming in 2020 necessarily has a funnel effect which will require a solid organisation of the year 2021 and 2022 for the postponement of all filming. We can therefore assume that a revival is highly likely even if the figures analysed are before Covid.

Directors

This table shows the different directors with the most projects and scenes shot in Paris. We note that one director stands out in particular: Aditya Chopra with nearly 130 projects, i.e. between two and three times more than the other directors, and almost 150 days of shooting in Paris, compared to less than one hundred for each of the other directors. It is also possible to notice that the number of days per shoot is always between 1 and 2 days, rarely more. Each of the directors seems to have a favourite arrondissement, but some stand out more often, such as the 13th or the 8th arrondissement.


#Directors ranked by number of entries, with their total number of filming
#days, their most frequent postal code and their average days per entry

#Upper-cased director of every entry, computed once and reused below
director_upper <- toupper(Filming_loc$Réalisateur)

Table_Director <- director_upper %>%
  table() %>% 
  as.data.frame() %>%
  arrange(desc(Freq)) %>% 
  `colnames<-`(c("Director", "Projects"))

director_names <- as.character(Table_Director$Director)

#Total number of filming days of each director (missing durations are ignored)
Table_Director$Days <- vapply(director_names, function(d) {
  sum(as.numeric(Filming_loc$Days[which(director_upper == d)]), na.rm = TRUE)
}, numeric(1), USE.NAMES = FALSE)

#Most frequent postal code of each director; on a tie the first code in sorted
#order wins, and NA is returned when no postal code is known
Table_Director$`Favorite District` <- vapply(director_names, function(d) {
  districts <- table(Filming_loc$`Code postal`[which(director_upper == d)])
  if (length(districts) == 0) {
    return(NA_real_)
  }
  as.numeric(names(districts)[which.max(districts)])
}, numeric(1), USE.NAMES = FALSE)

Table_Director$`Days per shot` <- round(Table_Director$Days/Table_Director$Projects,1)

#HTML table built with kable
Table_Director %>%
  head(10) %>% #Top 10
  mutate(Projects= color_tile(pink, purple)(Projects)) %>%  #gradient from pink to purple for the Projects column
  kable(escape = F, align = c("l", "c", "c", "c", "c")) %>% #first column aligned left, the others centred
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = T) %>% #style of the table
  column_spec(1, bold = T, width = "40%") %>% #width of the first column
  column_spec(2, bold = T, color = white)
Director Projects Days Favorite District Days per shot
ADITYA CHOPRA 129 142 75004 1.1
SYLVIE VERHEYDE 74 77 75013 1.0
NOVEMIE LVOVSKY 62 95 75013 1.5
VALERIE LEMERCIER 52 60 75016 1.2
STEPHEN WOOLFENDEN 48 48 75008 1.0
NOEMIE SAGLIO 47 65 75012 1.4
JALIL LESPERT 42 54 75005 1.3
GUILLAUME CANET 40 44 75008 1.1
TAREK BOUDALI 40 57 75004 1.4
AUDREY DANA 39 50 75020 1.3

Production Company

In the same way as the directors’ table, this table shows the different production companies according to the number of projects filmed and the number of shooting days. Three of them stand out: Firstep, Curiosa Films and Rectangle Productions, each of which has more than 130 projects filmed and more than 180 days of shooting. Each of the companies has an average number of shooting days between 1 and 2 and has a district where the most projects were filmed. It can be seen that certain districts such as the 4th, 9th, 12th and 13th arrondissements seem to be popular with these companies.


#Production companies ranked by number of entries, with their total number of
#filming days, their most frequent postal code and their average days per entry

#Upper-cased production company of every entry, computed once and reused below
producer_upper <- toupper(Filming_loc$Producteur)

Table_Production <- producer_upper %>%
  table() %>% 
  as.data.frame() %>%
  arrange(desc(Freq)) %>% 
  `colnames<-`(c("Production", "Projects"))

production_names <- as.character(Table_Production$Production)

#Total number of filming days of each company (missing durations are ignored)
Table_Production$Days <- vapply(production_names, function(p) {
  sum(as.numeric(Filming_loc$Days[which(producer_upper == p)]), na.rm = TRUE)
}, numeric(1), USE.NAMES = FALSE)

#Most frequent postal code of each company; on a tie the first code in sorted
#order wins, and NA is returned when no postal code is known
Table_Production$`Favorite District` <- vapply(production_names, function(p) {
  districts <- table(Filming_loc$`Code postal`[which(producer_upper == p)])
  if (length(districts) == 0) {
    return(NA_real_)
  }
  as.numeric(names(districts)[which.max(districts)])
}, numeric(1), USE.NAMES = FALSE)

Table_Production$`Days per shot` <- round(Table_Production$Days/Table_Production$Projects,1)

#HTML table built with kable, styled like the directors' table
Table_Production %>%
  head(10) %>% 
  mutate(Projects= color_tile(pink, purple)(Projects)) %>% 
  kable(escape = F, align = c("l", "c", "c", "c", "c")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = T) %>%
  column_spec(1, bold = T, width = "40%") %>%
  column_spec(2, bold = T, color = white)
Production Projects Days Favorite District Days per shot
FIRSTEP 178 195 75004 1.1
CURIOSA FILMS 139 203 75018 1.5
RECTANGLE PRODUCTIONS 135 184 75012 1.4
BLUE MONDAY PRODUCTIONS 88 110 75009 1.2
CHAPTER 2 87 120 75009 1.4
WHY NOT PRODUCTIONS 81 148 75012 1.8
MY FAMILY 79 113 75003 1.4
F COMME FILM 74 115 75013 1.6
LES FILMS DU 24 72 96 75004 1.3
ATELIER DE PRODUCTION 68 68 75013 1.0

This Sankey diagram represents the number of directors passed from one production company to another. Often a director will tend to work with the same producers on their various projects as a relationship of trust is established. As each project is different, only the director and the production can be the common denominators. It was therefore relevant to take a look at the top Parisian productions and see the exchanges made between 2016 and 2019 and we can see that 100% of the exchanges are satisfactory since for every director lost, a new one is found by the opposite production.

For example, RECTANGLE PRODUCTIONS has exchanged some of its directors with PAN EUROPENNE PRODUCTION and SBS FILMS. There is therefore no real loss of market to one player. Note that the final loss of contract is not represented in this diagram since only the exchanges were relevant in my opinion.

# Evolution of productions: one row per distinct (title, director, producer,
# year), restricted to the titles also present in Movies_DB and to the years
# 2016 and 2019 so that both can be compared
Prod_evol <- Filming_loc %>%
  select(Titre, Réalisateur, Producteur, `Date de début`) %>%
  `colnames<-`(c("Title", "Director", "Producer", "Year")) %>%
  merge(Movies_DB, by = "Title") %>%
  # Title2 and Director.y are the columns contributed by Movies_DB
  select(Title2, Director.y, Producer, Year) %>%
  `colnames<-`(c("Title", "Director", "Producer", "Year")) %>%
  mutate(Year = format(Year, "%Y")) %>% # keep only the year
  filter(Year %in% c("2016", "2019")) %>%
  unique() %>%
  mutate(Producer = toupper(Producer))

# Square producer-by-producer table. Cell [row, column] holds the number of
# distinct directors credited under BOTH producers.
#
# The previous version compared the two director vectors element by element
# with `==`. Those vectors usually have different lengths, so the comparison
# relied on recycling and the count depended on row order rather than on the
# directors actually shared. A set intersection removes that dependency.
producer_names <- unique(Prod_evol$Producer)

# Distinct, non-missing directors credited under each producer.
# which() drops NA comparisons instead of turning them into NA rows.
directors_of <- lapply(producer_names, function(producer) {
  credited <- Prod_evol$Director[which(Prod_evol$Producer == producer)]
  unique(credited[!is.na(credited)])
})

Prod_df <- matrix(
  0,
  nrow = length(producer_names),
  ncol = length(producer_names)
) %>%
  as.data.frame() %>%
  `colnames<-`(producer_names) %>%
  `rownames<-`(producer_names)

for (col_idx in seq_along(producer_names)) {
  for (row_idx in seq_along(producer_names)) {
    # base:: prefix: several attached packages mask intersect()
    Prod_df[row_idx, col_idx] <- length(
      base::intersect(directors_of[[col_idx]], directors_of[[row_idx]])
    )
  }
}

# Expose the row names as a "from" column and move it to the first position
Prod_df$from <- as.character(rownames(Prod_df))
Prod_df <- Prod_df[, c(ncol(Prod_df), 1:(ncol(Prod_df) - 1))]

# Reshape to three columns (source, target, value), one row per pair,
# keeping only the pairs with a strictly positive value
Prod_df <- Prod_df %>%
  as.data.frame() %>%
  gather(key = "to", value = "value", -1) %>%
  mutate(to = gsub("\\.", " ", to)) %>% # dots in the "to" labels become spaces
  na.omit() %>%
  filter(value > 0) %>%
  `colnames<-`(c("source", "target", "value"))

# Prod_df <- Prod_df[Prod_df$target %in% ( Prod_df %>%
#   group_by(target) %>%
#   summarise(sum = sum(value)) %>%
#   arrange(-sum) %>%
#   head(10) %>%
#   select(target) %>%
#   unlist() %>%
#   as.character()),]

# Focus on the top productions: keep every flow whose source is itself a
# target of one of the first ten productions of Table_Production
top_productions <- head(Table_Production$Production, 10)
targets_of_top <- Prod_df$target[Prod_df$source %in% top_productions]
links <- Prod_df[Prod_df$source %in% targets_of_top, ]

# Two trailing spaces keep target labels distinct from source labels
links$target <- paste0(links$target, "  ")

# Node table: every distinct label appearing in the flows, sources first
nodes <- data.frame(
  name = unique(c(as.character(links$source), as.character(links$target)))
)

# networkD3 refers to nodes by zero-based position, not by label
links$IDsource <- match(links$source, nodes$name) - 1
links$IDtarget <- match(links$target, nodes$name) - 1


# Palette generator across the report colours (e.g. colfunc(10))
colfunc <- colorRampPalette(c(purple, pink, yellow))

# Fixed 10-colour ordinal scale, passed to D3 as a JavaScript expression
my_color <- 'd3.scaleOrdinal() .range(["#2C2C54","#462552","#611E50","#7C184E","#96114C","#AE1D43","#C23D32","#D65C21","#EA7C10","#FF9C00"])'

# Build the Sankey diagram from the zero-based node ids stored in `links`.
# `units` is the label appended to each flow value in the tooltip; the values
# are director counts, so the previous 'TWh' (an energy unit) was misleading.
p <- sankeyNetwork(
  Links = links,
  Nodes = nodes,
  Source = "IDsource",
  Target = "IDtarget",
  Value = "value",
  NodeID = "name",
  units = "directors",
  fontSize = 12,
  nodeWidth = 30,
  colourScale = my_color,
  sinksRight = FALSE
)
p

Movies analysis

After studying Paris as a filming location, it is important to understand what is produced by and for our capital. To do this, I have strengthened our database by collecting data on the projects shot to better understand what attracts so much.

We will start with an analysis of the films produced, followed by a strengthening of the directors’ section, a section on actors, a text and genre analysis and finally an international opening.

# Load the Movies and Languages databases from their .RData files
load(file = "Movies_DB.RData")
load(file = "Languages.RData")

# Country: trim whitespace, then translate through the Languages lookup
# (match on column `.`, take the value of column `Eng`)
Movies_DB$Country <- Languages$Eng[
  match(trimws(Movies_DB$Country), Languages$.)
]

# Genre: trim whitespace
Movies_DB$Genre <- trimws(Movies_DB$Genre)

# Sort by decreasing grade, then cut the resulting ranking into five
# equal-sized tiers labelled A (best) to E ("tier list")
Movies_DB <- arrange(Movies_DB, -Grade)
movie_position <- 1:nrow(Movies_DB)
Movies_DB$Rank <- LETTERS[
  trunc(movie_position / (max(movie_position) / 5) - 0.001) + 1
]

# Ten best-graded movies dated 2016 or later (displayed in the conclusion)
Top_movies <- Movies_DB %>%
  select(Title = Title2, Director, Date, Grade) %>%
  mutate(Date = format(as.Date(Date, "%d/%m/%Y"), "%Y")) %>%
  filter(Date >= 2016) %>%
  head(10)

Movies

# Bin durations to the nearest multiple of 5 and count the movies per bin
Duration <- table(round(Movies_DB$Duration / 5) * 5) %>%
  as.data.frame() %>%
  `colnames<-`(c("Duration", "Freq"))

# Bar chart of the number of movies per duration bin
ggplot(Duration, aes(x = Duration, y = Freq)) +
  geom_bar(stat = "summary", fun = "mean", fill = pink) +
  ylim(0, 100) + # trims the empty space above the bars
  geom_text(
    aes(label = Freq),
    size = 3,
    hjust = 0.5,
    vjust = -1,
    check_overlap = TRUE
  ) +
  theme(
    text = element_text(size = 12),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    legend.position = "none",
    axis.line = element_line(colour = purple)
  ) +
  labs(
    title = "Number of films per Duration",
    subtitle = "in minutes",
    y = "Frequency",
    x = "Duration"
  )

The first graph represents the number of films by length, and we can see that the vast majority of films shot in Paris last between 85 minutes and 110 minutes. This is the average length of the vast majority of films today.

Shorter films will logically have fewer scenes shot in Paris and longer ones are less frequent so few of them are shot in Paris. It is also possible to correlate this with the budgets of films which are often relatively higher for these durations, allowing for more expensive scenes to be shot.

For information, the shortest films are often linked to a comedy or a documentary whereas a thriller will require a longer set-up which may correspond to the type of film shot in Paris and which will be confirmed in the section dedicated to genres.

# Parse the release date, then drop the movies released before the earliest
# "Date de fin" recorded in Filming_loc
Movies_DB$Date <- as.Date(Movies_DB$Date, "%d/%m/%Y")
earliest_end <- min(Filming_loc$`Date de fin`, na.rm = TRUE)
Date <- Movies_DB[!Movies_DB$Date < earliest_end, ]

# Earliest "Date de début" of each movie, looked up by title in Filming_loc
Date$Beginning <- vapply(
  Date$Title,
  function(title) {
    starts <- Filming_loc$`Date de début`[title == Filming_loc$Titre]
    as.character(min(starts, na.rm = TRUE))
  },
  character(1),
  USE.NAMES = FALSE
) %>%
  as.Date()

# Number of movies per calendar month (dates snapped to the 1st), tagged
# with the kind of date that was counted
count_by_month <- function(dates, source_label) {
  table(format(dates, "%Y-%m-01")) %>%
    as.data.frame() %>%
    mutate(Source = source_label)
}

# Stack release counts and filming counts in one table
Date_table <- rbind(
  count_by_month(Date$Date, "Release date"),
  count_by_month(Date$Beginning, "Filming date")
)
Date_table$Var1 <- as.Date(Date_table$Var1)

# Monthly points plus one smoothed spline trend per source
ggplot(Date_table, aes(x = Var1, y = Freq)) +
  geom_point(aes(size = Freq, colour = Source)) +
  geom_smooth(
    aes(group = Source, col = Source),
    method = "lm",
    formula = y ~ splines::bs(x, 4),
    se = FALSE
  ) +
  scale_color_manual(values = mypal(2)) +
  # y scale limited to 0-20 to remove outliers
  scale_y_continuous(limits = c(0, 20), breaks = seq(0, 20, 2)) +
  scale_x_date(date_breaks = "4 month", date_labels = "%b %y") +
  labs(x = "Publication Date") + # replaced by the x label set further down
  theme(
    text = element_text(size = 12),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    legend.position = "right",
    axis.line = element_line(colour = purple)
  ) +
  labs(
    title = "Number of films filmed or released",
    subtitle = "Between 2016 and 2022",
    y = "Frequency",
    x = "Date"
  )

This second graph looks at the seasonality of filming by comparing it to the actual release dates of the films. As indicated, the data collected are those of the films initially present in the database listing filming in the city of Paris: only films filmed from 2016 onward are present and by definition, the release dates are later and extend until 2022.

Here we see two trend curves. The one concerning shooting dates shows a drastic drop in 2020 (Covid) and then a return to normal in 2021 (end of the database). The curve representing film releases starts at 0, which is normal since a film cannot be released at the same time as it is shot. This curve mirrors the shootings since this gap corresponds to post-production. The dip of 2020 is felt in the releases (containment) and spreads further as a shooting bottleneck appears. We can say that without Covid, the two curves would be identical with a gap of about one year.

Directors

# Directors with more than two movies, most prolific first
Director <- table(Movies_DB$Director) %>%
  as.data.frame() %>%
  `colnames<-`(c("Director", "Freq")) %>%
  arrange(-Freq) %>%
  filter(Director != "", Freq > 2)

# Mean grade of each director's movies, rounded to two decimals
Director$Grade <- vapply(
  seq_len(nrow(Director)),
  function(i) {
    grades <- Movies_DB$Grade[Director$Director[i] == Movies_DB$Director]
    as.numeric(round(mean(grades, na.rm = TRUE), 2))
  },
  numeric(1)
)

# Scatter plot of volume against mean grade; points and labels are coloured
# by whether the director is above the mean grade (vertical line)
Director_graph <- ggplot(Director, aes(x = Grade, y = Freq)) +
  xlim(2, 5) +
  geom_vline(
    xintercept = mean(Director$Grade, na.rm = TRUE),
    col = grey
  ) +
  geom_point(aes(colour = Grade > mean(Grade, na.rm = TRUE))) +
  geom_smooth(
    col = yellow,
    method = "lm",
    formula = y ~ splines::bs(x, 5),
    se = FALSE
  ) +
  geom_text(
    aes(
      label = paste0(Director, ": ", Grade),
      colour = Grade > mean(Grade, na.rm = TRUE)
    ),
    hjust = -0.1,
    vjust = -0.5,
    check_overlap = TRUE
  ) +
  scale_color_manual(values = mypal(2)) +
  theme(
    text = element_text(size = 12),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    legend.position = "none",
    axis.line = element_line(colour = purple)
  ) +
  labs(
    title = "Directors by volume and score",
    subtitle = paste("Based on", nrow(Movies_DB), "movies"),
    y = "Frequency",
    x = "Grade"
  )
Director_graph

The data retrieval allowed us to obtain information on 449 films from the initial list, i.e. 76.3%. One variable is interesting to process: The ratings. Allociné collects the ratings given by the press and spectators, and is considered as one of the most reliable sites in France. For example, the best film on our list is Hors Normes directed by Eric Toledano, with an average of 4.3/5.

From there, we can study which director in our database is the best rated according to volume. The more films a director makes, the closer he or she will tend to be to the overall average or, on the contrary, would he or she tend to stand out?

The trend line shows that the more films a director makes, the closer he/she will be to the average and this line has a slight rebound towards the end proving that some directors keep a very good average despite a large number of films. The majority, however, are below the average score.

The directors present in the table of the first part confirm this idea since the most prolific directors obtain an average close to 3 which corresponds to the general average.

Actors

# Stack the three actor columns into a single (Actor, Grade) table
Actor <- do.call(
  rbind,
  lapply(c("Actor1", "Actor2", "Actor3"), function(role) {
    Movies_DB[, c(role, "Grade")] %>%
      `colnames<-`(c("Actor", "Grade"))
  })
)

# Number of appearances per actor, most frequent first
Actor_table <- table(Actor$Actor) %>%
  as.data.frame() %>%
  `colnames<-`(c("Actor", "Freq")) %>%
  arrange(-Freq)

# Mean grade of the movies each actor appears in, rounded to two decimals
Actor_table$Grade <- vapply(
  seq_len(nrow(Actor_table)),
  function(i) {
    grades <- Actor$Grade[Actor$Actor == Actor_table$Actor[i]]
    round(mean(grades, na.rm = TRUE), 2)
  },
  numeric(1)
)

# Keep the actors with more than three appearances
Actor_table <- Actor_table[Actor_table$Freq > 3, ]

# Mean grade of the remaining actors (threshold used for the colours)
v <- round(mean(Actor_table$Grade, na.rm = TRUE), 1)

Actor_graph <- ggplot(Actor_table, aes(x = Grade, y = Freq)) +
  xlim(2, 4.2) +
  geom_vline(
    xintercept = mean(Actor_table$Grade, na.rm = TRUE),
    col = grey
  ) +
  # "Mean: v" label placed just above the highest frequency
  geom_text(
    colour = grey,
    aes(
      v,
      1 + max(Actor_table$Freq, na.rm = TRUE),
      label = paste("Mean:", v),
      hjust = -0.1
    )
  ) +
  geom_point(aes(colour = Grade > v)) +
  # Spline trend line (3 degrees of freedom)
  geom_smooth(
    col = yellow,
    method = "lm",
    formula = y ~ splines::bs(x, 3),
    se = FALSE
  ) +
  geom_text(
    aes(label = paste0(Actor, ": ", round(Grade, 1)), colour = Grade > v),
    hjust = -0.1,
    vjust = -0.5,
    check_overlap = TRUE
  ) +
  scale_color_manual(values = mypal(2)) + # two colours from the palette
  theme(
    text = element_text(size = 12),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    legend.position = "none",
    axis.line = element_line(colour = purple)
  ) +
  labs(
    title = "Actors by volume and score",
    subtitle = paste("Based on", nrow(Movies_DB), "movies"),
    y = "Frequency",
    x = "Grade"
  )
Actor_graph

The second element where the scores are important and which may echo the directors’ analysis is the actors.

On the right, we can assume the same hypothesis as before, the more an actor is present, the more his scores will tend towards the global average of the actors. However, the trend line proves that **the more an actor plays, the more his average increases**, which may be linked to his popularity. Indeed, an actor who stands out in a role will have more opportunities to play in well-produced films.

Here, Vincent Lacoste stands out as the most prolific actor with average scores above the general average. On the right we see the most popular actors and it is with great pride that we can find several female names like Alice Isaaz or Alexandra Lamy. Our database does not contain the gender of the actors and does not allow us to base our analysis on this, although it would be interesting to understand the place of women in French cinema.

# Square matrix over the first 50 actors of Actor_table
Actor_list <- as.character(head(Actor_table$Actor, 50))
Actor_matrix <- matrix(
  nrow = length(Actor_list),
  ncol = length(Actor_list),
  dimnames = list(Actor_list, Actor_list)
)

# Cell [row, column]: number of movies where the column actor is Actor1 and
# the row actor is listed as Actor2 or Actor3
for (col_idx in seq_along(Actor_list)) {
  lead_rows <- Actor_list[col_idx] == Movies_DB$Actor1
  second_roles <- Movies_DB$Actor2[lead_rows]
  third_roles <- Movies_DB$Actor3[lead_rows]
  for (row_idx in seq_along(Actor_list)) {
    partner <- Actor_list[row_idx]
    Actor_matrix[row_idx, col_idx] <-
      sum(second_roles == partner, na.rm = TRUE) +
      sum(third_roles == partner, na.rm = TRUE)
  }
}

# Data frame with the row names exposed as a leading "from" column
Actor_matrix <- as.data.frame(Actor_matrix)
Actor_matrix$from <- as.character(rownames(Actor_matrix))
Actor_matrix <- Actor_matrix[, c(ncol(Actor_matrix), 1:(ncol(Actor_matrix) - 1))]

# Long format (from, to, value), keeping the pairs with at least one movie
connect <- Actor_matrix %>%
  as.data.frame() %>%
  gather(key = "to", value = "value", -1) %>%
  mutate(to = gsub("\\.", " ", to)) %>%
  na.omit() %>%
  filter(value > 0)

# Number of connections per person: how many edges of `connect` each name
# takes part in, on either side.
# tibble() replaces the deprecated as.tibble() call on a bare vector.
coauth <- tibble(
  name = c(as.character(connect$from), as.character(connect$to))
) %>%
  group_by(name) %>%
  summarize(n = n())

# Undirected graph; vertices follow the row order of `coauth`
mygraph <- graph_from_data_frame(connect, vertices = coauth, directed = FALSE)

# Community detection (cluster_walktrap() is the current name of the
# deprecated walktrap.community())
com <- cluster_walktrap(mygraph)

# Attach the community id to each person and order people by community;
# the factor levels freeze that order for the plot
coauth <- coauth %>%
  mutate(grp = com$membership) %>%
  arrange(grp) %>%
  mutate(name = factor(name, name))

# Keep only the first 7 communities
coauth <- coauth %>%
  filter(grp <= 7)

# One palette colour per remaining community
scale_col <- mypal(max(coauth$grp))

# Keep only the edges whose two ends are still in `coauth`
connect <- connect %>%
  filter(from %in% coauth$name) %>%
  filter(to %in% coauth$name)

# Rebuild the graph on the reduced vertex and edge sets
mygraph <- graph_from_data_frame(connect, vertices = coauth, directed = FALSE)

# Arc diagram: people on a line, one arc per connection
ggraph(mygraph, layout = "linear") +
  geom_edge_arc(edge_colour = grey, fold = TRUE) +
  geom_node_point(aes(size = n, color = as.factor(grp), fill = grp)) +
  scale_color_manual(values = scale_col) +
  scale_size_continuous(range = c(0.5, 4)) +
  geom_node_text(
    aes(label = name),
    angle = 65,
    hjust = 1,
    nudge_y = -0.5,
    size = 3
  ) +
  expand_limits(x = c(-1.2, 1.2), y = c(-5, 0)) +
  theme(
    text = element_text(size = 12),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    legend.position = "none"
  )

Like any ecosystem, the Parisian cinema has its groups, which have been materialised here in 7 groups of actors who collaborate on common projects. Each branch represents at least one film.

For example, Vincent Lacoste has worked on 8 films with at least 8 different actors (on lead roles, only the top 3 actors are counted for a film). Groups are formed when actors tend to work together regularly. Examples include Romain Duris and Pierre Deladonchamps, who worked together on Eiffel, and François Damiens and Dany Boon, who are used to working together in comedies. These groups are formed by actor but suggest a common film genre. If the base were larger, the groups would form along genre lines as comedy actors would tend to cross paths more regularly.


# Frequency of every individual genre label (movies list them as "A / B"),
# most frequent first
genre_labels <- trimws(unlist(strsplit(Movies_DB$Genre, " / ")))
Genre <- as.data.frame(table(Genre = genre_labels))
Genre <- Genre[order(-Genre$Freq), ]

# Split the genre string of each movie into its first two genres
Movies_DB <- separate(
  Movies_DB,
  col = Genre,
  into = c("Genre 1", "Genre 2"),
  sep = " / "
)

# One (Genre, Grade) row per movie and per genre slot
Genre1 <- Movies_DB %>% select(Genre = `Genre 1`, Grade)
Genre2 <- Movies_DB %>% select(Genre = `Genre 2`, Grade)
Genre_merge <- rbind(Genre1, Genre2)

# Drop a leftover "/" and whitespace, then the rows missing a grade or genre
Genre_merge$Genre <- trimws(str_remove(Genre_merge$Genre, "/"))
Genre_merge <- Genre_merge[
  !is.na(Genre_merge$Grade) & !is.na(Genre_merge$Genre),
]

# Mean grade per genre, rounded to one decimal
Genre$Grade <- vapply(
  as.character(Genre$Genre),
  function(label) {
    round(mean(Genre_merge$Grade[Genre_merge$Genre == label], na.rm = TRUE), 1)
  },
  numeric(1),
  USE.NAMES = FALSE
)

# Cap the frequencies at 50
Genre$Freq[Genre$Freq > 50] <- 50
# v <- round(mean(Genre$Grade,na.rm = T),1)
# 
# Genre_graph <- ggplot(Genre, aes(x = Grade, y = Freq))+
#   ylim(0,55)+
#   xlim(2,4.5)+
#   geom_vline(xintercept = v,
#             col = grey)+
#   geom_point(aes(colour = Grade > v))+
#   geom_smooth(col = yellow, method = lm, formula = y ~ splines::bs(x, 3), se = FALSE)+
#   geom_text(aes(label = paste(Genre,Grade),colour = Grade > v),
#             hjust=-0.1,
#             vjust=-0.5,
#             check_overlap = T) +
#   scale_color_manual(values=mypal(2)) +
#   geom_text(colour =grey,aes(v, 55, label = paste("Mean:",v), hjust = -0.1))+
#   theme(text=element_text(size=12),
#         panel.grid.major = element_blank(),
#         panel.grid.minor = element_blank(),
#         panel.background = element_blank(),
#         legend.position = "none",
#         axis.line = element_line(colour = purple))+
#   labs(title="Type of films with the best average according to their frequency",
#        y="Frequency", x="Genre")
# Genre_graph

Genre & Description

# Share of each tier (A to E) among the movies of the `Top` first genres
Top <- 10
tier_letters <- LETTERS[1:5]

Genre_Rank <- matrix(nrow = Top, ncol = 6) %>%
  `colnames<-`(c("Genre", tier_letters)) %>%
  as.data.frame()
Genre_Rank$Genre <- as.character(Genre$Genre[1:Top])

# Count, per genre and tier, the movies carrying the genre in either slot
for (tier in tier_letters) {
  for (row_idx in seq_len(Top)) {
    genre <- Genre_Rank$Genre[row_idx]
    in_first_slot <- Movies_DB$Rank[genre == Movies_DB$`Genre 1`] == tier
    in_second_slot <- Movies_DB$Rank[genre == Movies_DB$`Genre 2`] == tier
    Genre_Rank[row_idx, tier] <-
      sum(in_first_slot, na.rm = TRUE) + sum(in_second_slot, na.rm = TRUE)
  }
}

# Convert the counts to row percentages (one decimal)
Genre_Rank[2:6] <- round(Genre_Rank[2:6] / rowSums(Genre_Rank[2:6]) * 100, 1)

Genre_Rank <- Genre_Rank %>% arrange(-A, E)

# Long version: one (Genre, Perc, Rank) row per genre and tier
Genre_Rank2 <- do.call(
  rbind,
  lapply(tier_letters, function(tier) {
    data.frame(
      Genre = Genre_Rank$Genre,
      Perc = Genre_Rank[[tier]],
      Rank = tier,
      stringsAsFactors = FALSE
    )
  })
)

# Mean grade of the genre, repeated on every row of that genre
Genre_merge$Mean <- ave(Genre_merge$Grade, Genre_merge$Genre, FUN = mean)

# Keep the ranked genres only, best mean first
Genre_merge <- Genre_merge[Genre_merge$Genre %in% Genre_Rank2$Genre, ]
Genre_merge <- Genre_merge %>% arrange(-Mean)

# Ridgeline plot of the grade distribution per genre with the median line;
# fill tells whether the genre mean is above the overall mean grade
ggplot(Genre_merge, aes(x = Grade, y = Genre)) +
  geom_density_ridges(
    scale = 1.5,
    quantile_lines = TRUE,
    quantiles = 2,
    aes(fill = Mean > mean(Grade, na.rm = TRUE))
  ) +
  scale_y_discrete(limits = unique(Genre_merge$Genre)) +
  scale_fill_manual(values = mypal(2)) +
  theme(
    text = element_text(size = 12),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    legend.position = "none",
    axis.line = element_line(colour = purple)
  ) +
  labs(
    title = "Genre according to grades",
    subtitle = "Top 10",
    x = "Grade",
    y = element_blank()
  )

Following the analysis of the actors, it is interesting to understand what genres of film are shot in Paris.

The two main genres are Drama and Comedy which correspond to the two main film genres in the world. In third position, we find Romance which corresponds perfectly to what the collective imagination has of Paris, capital of love. The graph opposite shows the distribution of scores by genre. The most appreciated genres are biographical and historical films, which seems to argue in favour of Paris as a historical scene with a rich heritage.

On the other hand, action and crime films shot in Paris seem to be less appreciated as the French cinema codes seem to be not very flexible on this kind of film and do not allow to reveal the full potential of a playground like Paris for this kind of film. Finally, romantic films are in the middle of the top with a normal coherent distribution.

Of note, the historical films follow a two-part distribution that may show the opportunism of this cinema with a median in the lower part and a second part that expands with the best scores listed.

# Number of summaries mentioning "Paris" or "France"
Paris <- sum(
  str_detect(Movies_DB$Summary, "Paris") |
    str_detect(Movies_DB$Summary, "France"),
  na.rm = TRUE
) %>%
  as.numeric()

# Percentage of summaries of non-French movies that mention "Paris"
# NOTE(review): rows with a missing Country are counted in the denominator
# (they show up as NA entries) — confirm this is intended.
foreign_summaries <- Movies_DB$Summary[Movies_DB$Country != "France"]
Foreign_perc <- sum(str_detect(foreign_summaries, "Paris"), na.rm = TRUE) /
  length(foreign_summaries) * 100

# Word frequencies of the summaries.
# Full stops are replaced by a space. The previous str_remove(x, ".") used "."
# as a regular expression (any character) and therefore deleted the first
# character of every summary, corrupting its first word. fixed(".") matches a
# literal full stop, and the space keeps the words around it apart.
Movies_Description <- str_replace_all(Movies_DB$Summary, fixed("."), " ")
#Movies_Description <- str_remove_all(Movies_Description,"’")
Movies_Description <- Corpus(VectorSource(Movies_Description))

# Normalise the text: lower case, no digits, no English/French stop words,
# no custom filler words, no punctuation
Movies_Description <- Movies_Description %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removeWords, stopwords("french")) %>%
  tm_map(removeWords, c("plus", "ça", "où", "jusqu’", "…")) %>%
  tm_map(removePunctuation)

# Term totals over all summaries, most frequent first
Movies_Description <- as.matrix(TermDocumentMatrix(Movies_Description))
Movies_Description <- sort(rowSums(Movies_Description), decreasing = TRUE)
Movies_Description <- data.frame(
  word = names(Movies_Description),
  freq = Movies_Description
)

# Drop the terms still containing a typographic apostrophe
Movies_Description <- Movies_Description[
  !str_detect(Movies_Description$word, "’"),
]

# Maximum number of words drawn in the cloud
Top <- 120

wordcloud(
  words = Movies_Description$word,
  freq = Movies_Description$freq,
  min.freq = 3,
  max.words = Top,
  random.order = FALSE,
  rot.per = 0,
  colors = rev(mypal(4))
)

Given the genres represented in Paris, it was secondly essential to analyze the descriptions of the films.

The descriptions are in French as my analysis focuses on cinema in Paris and the data is retrieved from French sites.

123 films contain the word Paris or France in their descriptions and the action will therefore take place in Paris where the city will play a central role in the plot. The word cloud on the right highlights the most represented words. First we see the lexical field of time with “ans”, “vie” and “jeune” which obviously refer to the historical and biographical genre. Then we see the lexical field of the family with father, mother, daughter, children… Then all my words revolve around the genre they represent such as love, dream, meeting… These words gravitate around the image that Paris sends back and can feed the cliché but also the story that Paris tells us.


International

To finish this analysis, I wanted to study the influence of Paris internationally. Here are some of the countries that shot scenes in Paris between 2016 and 2020. We note:

# Capital cities from the world.cities data set (capital flag equal to 1)
Capital <- world.cities[world.cities$capital == 1, ]

# Coordinates of the capital of each movie's country.
# match() returns the first matching capital row and NA when there is none.
# The previous row-by-row lookup stopped with an error when a country was
# missing (NA) or matched more than one capital row.
capital_row <- match(Movies_DB$Country, Capital$country.etc)
Movies_DB$lat <- Capital$lat[capital_row]
Movies_DB$long <- Capital$long[capital_row]

# One row per country that could be located
Map <- Movies_DB %>%
  select(Country, lat, long) %>%
  group_by(Country) %>%
  summarise(long = mean(long), lat = mean(lat)) %>%
  filter(!is.na(long))

# Number of movies per located country
Map$Freq <- vapply(
  Map$Country,
  function(country) sum(country == Movies_DB$Country, na.rm = TRUE),
  integer(1),
  USE.NAMES = FALSE
)

# No margin
par(mar = c(0, 0, 0, 0))

# Coordinates (long, lat) of the French capital: origin of every connection
France <- Map[Map$Country == "France", ] %>%
  select(long, lat) %>%
  as.numeric()

# Light grey world map as background
map(
  "world",
  col = "#F2F2F2", fill = TRUE, bg = "white", lwd = 0.05,
  mar = rep(0, 4), border = 0, ylim = c(-80, 80)
)

# Great-circle line between the French capital and every other capital
for (row_idx in seq_len(nrow(Map))) {
  destination <- Map[row_idx, ] %>%
    select(long, lat) %>%
    as.numeric()
  inter <- gcIntermediate(
    France,
    destination,
    n = 50,
    addStartEnd = TRUE,
    breakAtDateLine = FALSE
  )
  lines(inter, col = pink, lwd = 1)
}

# Country labels, then one dot per capital
addTextLabels(
  x = Map$long,
  y = Map$lat,
  label = Map$Country,
  col.label = purple,
  col.line = pink
)
points(x = Map$long, y = Map$lat, col = purple, cex = 1.2, pch = 20)

  • Asian influence: Asians have always been attracted to Paris and have a strongly developed cinema. Among the countries producing films with international resonance, we can cite India with Bollywood, Korea in full cinematographic expansion and Japan, a historical player. Also, Qatar, which is still closely linked to France by major economic and heritage agreements, has taken advantage of Paris as a film location.

  • A European influence: French cinema is an integral part of the European landscape and also allows for European co-productions thanks to shared financing. It is therefore normal to see a significant concentration of exchanges in Europe. Morocco, with its large French-speaking population, allows for good exchanges of filming locations, as it is not unusual for many French films to be shot there and vice versa. As an example of European co-production we can cite the latest Asterix films, a real nest of diversity.

  • North America is also obviously represented, even if the map does not point to the United States, which is obviously one of the main players in this market, as shown in this second graph.

# Total of the Filming_loc `Days` values recorded for each movie title
Movies_DB$Filming_Count <- vapply(
  Movies_DB$Title,
  function(title) {
    as.numeric(sum(Filming_loc$Days[title == Filming_loc$Titre], na.rm = TRUE))
  },
  numeric(1),
  USE.NAMES = FALSE
)

# Same total per country, France excluded, largest first
Map2 <- Movies_DB %>%
  group_by(Country) %>%
  summarise(value = sum(Filming_Count)) %>%
  rename(individual = Country) %>%
  as.data.frame() %>%
  filter(individual != "France") %>%
  arrange(-value)

# Position of each bar around the circle
Map2$id <- 1:nrow(Map2)

# Shorter label for the United States
Map2$individual[Map2$individual == "United States of America"] <- "USA"

# Label placement: each label is rotated to follow its bar; labels on the
# left half are flipped by 180 degrees and right-aligned to stay readable
label_data <- Map2
number_of_bar <- nrow(label_data)
angle <- 90 - 360 * (label_data$id - 0.5) / number_of_bar
label_data$hjust <- ifelse(angle < -90, 1, 0)
label_data$angle <- ifelse(angle < -90, angle + 180, angle)

# Circular bar chart
p <- ggplot(Map2, aes(x = as.factor(id), y = value)) +
  geom_bar(stat = "identity", fill = pink) +
  ylim(-100, 120) +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank(),
    plot.margin = unit(rep(-1, 4), "cm") # margins adjusted so labels are not truncated
  ) +
  coord_polar(start = 0) +
  geom_text(
    data = label_data,
    aes(
      x = id,
      y = value + 10,
      label = paste0(individual, ": ", round(value, 1)),
      hjust = hjust
    ),
    color = purple,
    size = 3,
    angle = label_data$angle,
    inherit.aes = FALSE
  )

p

The supremacy of American cinema is evident on this graph with about a hundred scenes made in France, i.e. about 2%. Paris is above all reserved for French cinema, which is obvious, but it also attracts many countries. Among them, we find our neighbours the Belgians with François Damiens or Benoît Poelvoorde as headliners. We then see the presence of Qatar confirmed by this graph, followed by the United Kingdom and Canada. Asian and European countries then follow. Covid has obviously prevented any foreign country from coming to film in France, which is why it is not relevant to note a drop in the attraction of foreign countries to Paris, since this drop is obvious and linked to an uncontrollable external event. Foreign cinema still represents about 10% of the scenes shot in Paris. It should be noted that the percentage of films whose description contains the word Paris or France increases significantly to reach 31.25% for foreign films, proving that the word Paris is a selling point to attract a specific audience.


Conclusion

# Top movies as a right-floated HTML table: Grade is rendered as a colour
# tile (pink -> purple), titles and grades are printed in bold
Top_movies %>%
  mutate(Grade = color_tile(pink, purple)(Grade)) %>%
  kable(escape = FALSE, align = c("l", "l", "c", "c")) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE,
    position = "float_right"
  ) %>%
  column_spec(column = 1, bold = TRUE) %>%
  column_spec(column = 4, bold = TRUE, color = white)
Title Director Date Grade
Hors Normes Eric Toledano, 2019 4.3
Au Revoir Là-haut Albert Dupontel 2017 4.3
Illusions Perdues Xavier Giannoli 2021 4.3
120 battements Par Minute Robin Campillo 2017 4.2
Grâce à Dieu François Ozon 2019 4.2
Mauvaises herbes Kheiron 2018 4.2
Boîte noire Yann Gozlan 2021 4.2
Sauver ou périr Frédéric Tellier 2018 4.1
Jusqu’à la Garde Xavier Legrand 2018 4.1
Mon Inconnue Hugo Gélin 2019 4.1

In conclusion, Paris is indeed a key stage for French and international cinema.

A dynamic scene occupied in a very short period of time with a seasonality timed to optimise financial and human costs. Despite this, this attraction is inexorably falling. A fall accelerated by Covid which has slowed down all productions. However, Paris is offering more and more locations throughout its historically rich city and even if the fall is global, a revival is to be expected in 2022. Directors and producers still seem to be invested in witnessing the beauty of Paris. A resumption of filming is already underway with an acceleration in recent months to coincide with delayed film releases. Actors are an integral part of this ecosystem and consolidate it by forming natural groups that allow for sustainability in the cinematographic landscape. The film genres tend to reinforce the image of Paris by feeding its history and clichés. This heritage attracts productions from all over the world, making it an even more important place for cinema. On the right you will find the top 10 films shot in Paris between 2016 and 2019, enjoy!


# Embed the author's picture in the rendered document (size and float are
# set outside this call)
knitr::include_graphics(path = "PhotoThéotime.svg")

Analysis conducted by Théotime Bourgeois

In the framework of the Master of Science - Data Science & Organizational Behavior by Burgundy School of Business

Supervised by Frank Lentz

Instagram LinkedIn