Introduction

library(tidyverse) 
library(tidytext)
library(rvest)
library(xml2)
library(urltools)
library(plotly)
library(ggplot2)
library(ggrepel)
library(igraph)
library(ggraph)
library(networkD3)
library(wordcloud)
library(wordcloud2)
library(SnowballC)
library(patchwork)
library(png)
library(ggimage)

red <- c("#E91E22")
blue <- c("#4395C0")
purple <- c("#1D2F6F")
yellow <- c("#F1BF45")
green <- c("#4C9F70")
cassis <- c("#832749")

font <- "Lato"

colfunc <- colorRampPalette(c(green,red,cassis,purple,blue,yellow))
colfunc2 <- colorRampPalette(c(red,cassis,purple))

List_names <- list.files("ALL_Seasons", pattern=NULL, all.files=FALSE,
    full.names=FALSE)

List_names <- paste0("ALL_Seasons/",List_names) %>% as.data.frame()

#I set up my dataframe
Friends <- data.frame(File=NA,ID=NA,Text=NA)

df <- read_html(List_names$.[199]) %>%
  html_elements("body") %>%
  html_text2() %>%
  str_split("\n") %>%
  as.data.frame() %>%
  `colnames<-`(c("Text"))
df$File <- List_names$.[199]
df <- df[df$Text!="",]
df$ID <- 1:nrow(df)
Friends <- rbind(Friends,df)

#To know the validity of my HTML files
for (i in 1:nrow(List_names)){
  List_names$Keep[i] <- (read_html(List_names$.[i], encoding = "UTF-8")%>%
    html_elements(css = "p") %>%
    html_text() %>%
    length())>0
}

List_names <- List_names[List_names$Keep,]

for (i in 1:nrow(List_names)){
  Page <- read_html(List_names$.[i], encoding = "UTF-8")
  df <- Page %>%
    html_elements(css = "p") %>% 
    html_text() %>% 
    as.data.frame()
  colnames(df) <- "Text"
  df$ID <- 1:nrow(df)
  df$File <- List_names$.[i]
  Friends <- rbind(Friends,df)
}

rm(Page,df,List_names,i)

#Cleaning my database
Friends <- Friends[Friends$Text!="",]
Friends <- Friends[!is.na(Friends$Text),]

Friends$File <- Friends$File %>%
  str_remove("ALL_Seasons/") %>%
  str_remove(".html")

#For double episode
Friends <- separate(Friends,col=File , into = c("TV", "Autre"), sep = "-")

Friends$Season <- substr(Friends$TV, start = 1, stop = 2) %>% as.numeric()
Friends$Episode <- substr(Friends$TV, start = 3, stop = 4) %>% as.numeric()
#Friends$ID <- paste0("S",Friends$Season,"-E",Friends$Episode,"-",Friends$ID)
Friends <- Friends %>% select(-TV,-Autre)


#Exclude Producer, Writer, Director...
Prod <- Friends[Friends$Text %>% str_detect(" by: "),]
Friends <- Friends[!Friends$Text %>% str_detect(" by: "),]
Friends <- Friends[Friends$ID!=1,]

#Exclude Description
Friends$Text <- iconv(Friends$Text, from="UTF-8", to="UTF-8", sub="NA")
Friends$Desc <- substr(Friends$Text,1,1)=="("|substr(Friends$Text,1,1)=="["
Friends$Scene <- cumsum(Friends$Desc)
Desc <- Friends[Friends$Desc,]
Friends <- Friends[!Friends$Desc,]

Friends <- separate(Friends,col=Text , into = c("Character", "Text"), sep = ":",fill = "left")
Friends <- Friends[Friends$Text!="",]
Friends$Character <- Friends$Character %>% toupper()

Friends <- Friends[!str_detect(Friends$Character," BY"),]


#Prod database
Prod <- separate(Prod,col=Text, into=c(paste0("Info",1:5)), sep = "\n",fill = "left")

Prod <- rbind(
Prod %>% select(Episode, Season, Info1) %>% `colnames<-`(c("Episode", "Season", "Info")),
Prod %>% select(Episode, Season, Info2) %>% `colnames<-`(c("Episode", "Season", "Info")),
Prod %>% select(Episode, Season, Info3) %>% `colnames<-`(c("Episode", "Season", "Info")),
Prod %>% select(Episode, Season, Info4) %>% `colnames<-`(c("Episode", "Season", "Info")),
Prod %>% select(Episode, Season, Info5) %>% `colnames<-`(c("Episode", "Season", "Info")))

Prod <- Prod[!Prod$Info %>% is.na(),]

Prod <- separate(Prod,col=Info, into=c("Job","Name"), sep = ": ",fill = "left")
Prod <- separate(Prod,col=Name, into=c("Job2","Name"), sep = "by",fill = "left")
Prod$Job[is.na(Prod$Job)] <- Prod$Job2[is.na(Prod$Job)]
Prod <- Prod %>% select(-Job2)

Prod <- Prod %>% mutate(
  Job = case_when(
    Job %>% str_detect("rit") ~ "Writter",
    Job %>% str_detect("eleplay") ~ "Teleplay",
    Job %>% str_detect("irect") ~ "Director",
    Job %>% str_detect("scri") ~ "Transcript",
    Job %>% str_detect("tory") ~ "Story",
    Job %>% str_detect("rodu") ~ "Producer"
  )
)

Prod <- Prod[!is.na(Prod$Job),]

Friends <- Friends[!is.na(Friends$ID),]

# Friends$Episode[is.na(Friends$Episode)] <- 0

Episode <- Friends %>%
  select(Season, Episode) %>% 
  arrange(Season, Episode) %>% 
  unique() %>% 
  filter(!is.na(Episode))

Episode$ID <- 1:nrow(Episode)

Episode <- Episode %>% mutate(Key = 100*Season+Episode)
Friends <- Friends %>% mutate(Key = 100*Season+Episode)

for(i in 1:nrow(Friends)){
  Friends$ID_Episode[i] <- Episode$ID[Friends$Key[i]==Episode$Key] 
}

rm(Episode,i)

Friends$Words <- str_count(Friends$Text, '\\w+')

Friends <- Friends[!is.na(Friends$Text),]

IMDB <- data.frame(Season=NA,.=NA)

URL_List <- paste0("https://m.imdb.com/title/tt0108778/episodes/?season=",1:10)

for (url in URL_List){
  Page <- read_html(url)
  Web <- Page %>% html_elements(css = ".btn-full") %>% html_text()
  Web <- Web %>% as.data.frame() %>% mutate(Season = url)
  IMDB <- rbind(IMDB,Web)
}

rm(Web, Page, url, URL_List, i)

IMDB <- IMDB[!is.na(IMDB$Season),]
IMDB <- IMDB[IMDB$.!="\nFriends \n(1994–2004)\n\n",]

IMDB <- separate(IMDB,col=Season, into=c("Delete","Season"), sep = "=",fill = "left")

IMDB <- separate(IMDB,col=., into=c("Delete","Episode","Title","Delete2","Grade","Date_brut"), sep = "\n",fill = "left")

IMDB <- IMDB %>% select(Season, Episode, Title, Grade, Date_brut)
IMDB$Episode <- IMDB$Episode %>% as.numeric()

IMDB$Date <- IMDB$Date_brut %>% as.Date(format("%d %b. %Y"))
IMDB$Date[is.na(IMDB$Date)] <- IMDB$Date_brut[is.na(IMDB$Date)] %>% as.Date(format("%d %b %Y"))

IMDB <- IMDB %>% select(-Date_brut)

IMDB$ID_Episode <- 1:nrow(IMDB)
IMDB$Grade <- IMDB$Grade %>% str_trim() %>% as.numeric()
IMDB$Season <- IMDB$Season %>% as.numeric() %>% as.factor()
IMDB$Episode <- IMDB$Episode %>% as.numeric()

Infos <- read_csv(file = "friends_info.csv")

Infos <- Infos %>% 
  separate(col = Episode, into = c("Part1", "Part2"), sep = "\n")

Infos_temp <- Infos[!is.na(Infos$Part2),] %>% 
  select(-Part1)

Infos <- Infos %>% select(-3)
colnames(Infos)[2] <- "Episode"
colnames(Infos_temp)[2] <- "Episode"

Infos <- rbind(Infos,Infos_temp) %>%
  separate(col = Episode, into = c("Season", "Episode"), sep = "-") %>% 
  mutate(Season=as.numeric(Season),
         Episode=as.numeric(Episode),
         Key = Season*100+Episode)

IMDB <- IMDB %>%
  mutate(Season=as.numeric(Season),
         Key = Season*100+Episode)

for (i in 1:nrow(Infos)){
  Infos$Grade[i] <- ifelse(is_empty(IMDB$Grade[Infos$Key[i]==IMDB$Key]),NA,IMDB$Grade[Infos$Key[i]==IMDB$Key])
}

IMDB <- Infos[!is.na(Infos$Episode),]
IMDB$Season_label <- paste("Season" , IMDB$Season)
IMDB$Episode_label <- paste("Episode" , IMDB$Episode)

IMDB <- IMDB %>%
  group_by(Season_label,Episode_label) %>% 
  mutate(Episode_Title = paste(Episode_label, Title, sep=" : "),
         Summary = paste(strwrap(Summary,90), collapse="\n"),
         Summary_tree = paste(Episode_Title, Summary, sep="\n")) %>% 
  ungroup() %>%
  mutate(viewers = str_remove(`U.S. viewers`," million") %>% as.numeric()) %>% select(-`U.S. viewers`)

rm(Infos)

write_csv(IMDB, file = "IMDB.csv")
write_csv(Friends, file = "Friends.csv")
write_csv(Prod, file = "Prod.csv")
write_csv(Desc, file = "Desc.csv")

Team <- Friends$Character %>%
  table() %>%
  as.data.frame() %>% 
  arrange(desc(Freq)) %>%
  head(6) %>% 
  `colnames<-`(c("Character", "Freq"))

Name_Team <- Team$Character %>% droplevels() %>% as.character()

Friends_Historical <- Friends[Friends$Character %in% Name_Team,] %>% 
  group_by(Character, Season) %>% 
  summarise(Words = sum(Words)) %>%
  mutate(Words_Cum = cumsum(Words)) %>%
  arrange(Season, desc(Words_Cum)) %>% 
  group_by(Season) %>%
  mutate(Rank = 1, Rank = cumsum(Rank)) %>% 
  ungroup() %>% 
  arrange(Character, Season) %>%
  group_by(Character) %>% 
  mutate(Rank_diff = Rank - lag(Rank, default = Rank[1])) %>% 
  ungroup() %>% 
  arrange(Season, desc(Rank_diff), desc(Words_Cum)) %>% 
  group_by(Season) %>% 
  mutate(Rank_diff_TF = Rank_diff==max(Rank_diff),
         Rank_diff_TF = ifelse(Rank_diff_TF==T,cumsum(Rank_diff_TF),F),
         Rank_diff_TF = Rank_diff_TF==1) %>% 
  mutate(Image = ifelse(Rank_diff_TF==T,paste0("Photos/",Character,".png"),NA))

ggplot(Friends_Historical,aes(x = Season, y = Words_Cum))+
  geom_smooth(aes(group=Character, col = Character),
              method = lm, se = FALSE)+
  #geom_line(aes(col = Character))+
  scale_color_manual(values=colfunc(6))+
  scale_x_discrete(limits=c(1:10))+
  geom_image(aes(image=Image,),
             asp = 2,
             size=.07)+
  labs(title="Number of words spoken by the protagonists",
       subtitle="From the beginning to the end of friends",
       y="Count of Words", x="Season")+
  theme(text= element_text(family = font),
        legend.background = element_blank(),
        legend.position = "right",
        panel.background = element_blank(),
        legend.text = element_text(size = 8),
        axis.line = element_line(),
        axis.text.y = element_text(angle = 0),
        plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
        plot.caption = element_text(family = font, size = 10, color = purple))

IMDB_Season <- IMDB %>%
  filter(Season %in% c(1:10)) %>%
  arrange(Season_label, Episode) %>% 
  mutate(ID_Episode = 1, ID_Episode = cumsum(ID_Episode)) %>%
  select(ID_Episode, Grade, Season_label, Episode, viewers) %>% 
  group_by(Season_label) %>% 
  mutate(max = max(Episode),
         max = max==Episode)


Season_grade_graph <- ggplot(IMDB_Season,aes(x = ID_Episode, y = Grade))+
  geom_point(aes(col=Season_label))+
  scale_color_manual(values=colfunc(IMDB_Season$Season_label %>% unique() %>% length()))+
  geom_smooth(aes(group=Season_label, col = Season_label),
              method = lm, formula = y ~ splines::bs(x, 4), se = FALSE)+
  geom_smooth(col = red, method = lm, formula = y ~ splines::bs(x, 4), se = FALSE)+
  labs(title="Friends: a series that keeps its viewers on their toes",
       subtitle=paste0("With ",
                       ((IMDB_Season$Grade[IMDB_Season$max] %>% mean(na.rm = T))-(IMDB_Season$Grade %>% mean(na.rm = T))) %>% round(1) %>% abs(),
                       " points more, the last episodes of selected seasons are generally more appreciated"),
       y="Grade", x="Episode", col = "Season",
       caption = "Source : IMDB")+
  theme(text= element_text(family = font),
        legend.background = element_blank(),
        legend.position = "right",
        panel.background = element_blank(),
        legend.text = element_text(size = 8),
        axis.line = element_line(),
        axis.text.y = element_text(angle = 0),
        plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
        plot.caption = element_text(family = font, size = 10, color = purple))

Season_grade_graph

ggplot(IMDB_Season, aes(x = viewers, y = Grade))+
  #geom_point(col = "grey", alpha = .5)+
  scale_color_manual(values=colfunc(IMDB_Season$Season_label %>% unique() %>% length()))+
  #geom_text_repel(aes(label = paste(Season_label,"\n",Episode_label)))+
  #geom_smooth(col = red, method = lm, formula = y ~ splines::bs(x, 3), se = FALSE)+
  geom_smooth(aes(group=Season_label, col = Season_label),
              method = lm, se = FALSE, linetype = "dotted", size = .5)+
  geom_smooth(col = purple, method = lm, formula = y ~ splines::bs(x, 4), se = FALSE)+
  labs(title="Evolution of grades per views",
       y="Grade", x="Views (M)", col = "Season",
       caption = "Source : IMDB")+
  theme(text= element_text(family = font),
        legend.background = element_blank(),
        legend.position = "right",
        panel.background = element_blank(),
        legend.text = element_text(size = 8),
        axis.line = element_line(),
        axis.text.y = element_text(angle = 0),
        plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
        plot.caption = element_text(family = font, size = 10, color = purple))

Evolution_Season <- IMDB %>%
  filter(!is.na(Grade)) %>% 
  group_by(Season) %>%
  mutate(LastEp = Episode==max(Episode),
         LastEp = ifelse(LastEp==T,Grade,NA)) %>%
  summarise(Mean = mean(Grade,na.rm = T),
            Max = max(Grade,na.rm = T),
            Min = min(Grade,na.rm = T),
            LastEp = max(LastEp,na.rm = T)) %>%
  arrange(desc(Season)) %>% 
  mutate(Season = paste("Season",Season),
         #Max = ifelse(LastEp>Max, LastEp,Max),
         Season = factor(Season, levels=unique(Season)))

colors <- c("Last Episode" = red, "Average Grade" = purple, "Range Episode"= blue)

sum(Evolution_Season$Max==Evolution_Season$LastEp)*100/10

## [1] 30

ggplot(Evolution_Season) +
  geom_segment( aes(x=Season, xend=Season, y=Max, yend=Min, color="Range Episode"), size=0.5) +
  geom_segment( aes(x=Season, xend=Season, y=LastEp, yend=Mean), color="grey", size=0.5) +
  geom_point( aes(x=Season, y=Min, color="Range Episode"), size=1.5 ) +
  geom_point( aes(x=Season, y=Max, color="Range Episode"), size=1.5 ) +
  geom_point( aes(x=Season, y=LastEp, color="Last Episode"), size=3 ) +
  geom_point( aes(x=Season, y=Mean, color="Average Grade"), size=3 ) +
  scale_color_manual(values = colors)+
  coord_flip()+
  labs(title="Grade per episode and per season",
       subtitle="From the beginning to the end of Friends",
       x = "",
       y = "Grade",
       color = "Legend")+
  theme(text= element_text(family = font),
        legend.background = element_blank(),
        legend.position = "right",
        panel.background = element_blank(),
        legend.text = element_text(size = 8),
        axis.line = element_line(),
        axis.text.y = element_text(angle = 0),
        plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
        plot.caption = element_text(family = font, size = 10, color = purple))

for (i in Name_Team){
  Team$Words[Team$Character==i] <- Friends$Words[Friends$Character==i] %>% sum(na.rm = T)
}

Team <- Team %>%
  mutate(Image = paste0("Photos/",Character,".png"))


ggplot(Team,aes(x = Words, y = Freq))+
  # geom_point(aes(col=Character))+
  # geom_text_repel(aes(label = Character))+
  geom_image(aes(image=Image),
             asp = 2,
             size=.15)+
  ylim(c(6500,9000))+
  scale_color_manual(values=colfunc(6))+
  theme(text=element_text(size=12),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.position = "right",
        axis.line = element_line(colour = purple))+
  labs(title="Number of words spoken by the protagonists",
       subtitle="From the beginning to the end of friends",
       y="Number of lines", x="Count of Words")+
  theme(text= element_text(family = font),
        legend.background = element_blank(),
        legend.position = "right",
        panel.background = element_blank(),
        legend.text = element_text(size = 8),
        axis.line = element_line(),
        axis.text.y = element_text(angle = 0),
        plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
        plot.caption = element_text(family = font, size = 10, color = purple))

Friends_Network <- Friends %>% select(Character, Season, Scene)

Friends_Network$Character[Friends_Network$Character %in% Name_Team] <- NA
Friends_Network$Character[Friends_Network$Character == "ALL"] <- NA
Friends_Network$Character[Friends_Network$Character == "WOMAN"] <- NA
Friends_Network$Character[Friends_Network$Character == "MAN"] <- NA
Friends_Network$Character[Friends_Network$Character == "RACH"] <- NA
Friends_Network$Character[Friends_Network$Character == "MNCA"] <- NA
Friends_Network$Character[Friends_Network$Character == "THE DIRECTOR"] <- NA
#Friends_Network$Character[Friends_Network$Character == "MONA"] <- NA

for (name in Name_Team){
  Friends_Network$Character[str_detect(Friends_Network$Character,name)] <- NA
}

Friends_Network <- Friends_Network[!is.na(Friends_Network$Character),]

Top <- 40

Other_char_raw <- table(Friends_Network$Character) %>%
  as.data.frame() %>%
  arrange(desc(Freq)) %>%
  head(Top)

Other_char <- Other_char_raw$Var1 %>% as.character()

Friends_Network <- Friends_Network[Friends_Network$Character %in% Other_char,]

Friends_matrix <- matrix(data = 0, ncol = Top, nrow = Top)

colnames(Friends_matrix) <- Friends_Network$Character %>% unique()
rownames(Friends_matrix) <- Friends_Network$Character %>% unique()

Friends_Network$ID <- Friends_Network$Season*100+Friends_Network$Scene

Friends_Network <- Friends_Network %>% 
  group_by(ID, Character) %>% 
  summarise(ID = mean(ID))

for (col in 1:ncol(Friends_matrix)){
  for (row in 1:nrow(Friends_matrix)){
    Friends_matrix[row,col] <-
      Friends_Network$ID[Friends_Network$Character ==colnames(Friends_matrix)[row]] %in%
      Friends_Network$ID[Friends_Network$Character == colnames(Friends_matrix)[col]] %>%
      sum()
  }
}

Friends_matrix <- Friends_matrix %>% as.data.frame()

Friends_Network <- matrix(ncol=3) %>% `colnames<-`(c("size", "from","to"))

for (i in 1:ncol(Friends_matrix)){
  Temp <- Friends_matrix %>%
    select(i) %>%
    mutate(from = colnames(Friends_matrix %>% select(i)),
           to = rownames(Friends_matrix)) %>%
    `colnames<-`(c("size", "from","to"))
  Friends_Network <- rbind(Friends_Network,Temp)
}

Friends_Network <- Friends_Network %>% filter(size>0) %>% as.data.frame()

Friends_Network <- Friends_Network[Friends_Network$from != Friends_Network$to,]

Friends_Network[Friends_Network$from %in% ((Friends_Network$from %>% table() %>% as.data.frame() %>% filter(Freq>1))$. %>% as.character()),]

##               size        from          to
## GAVIN            1         GUY       GAVIN
## STEVE            1         GUY       STEVE
## SUSAN4          25       CAROL       SUSAN
## NURSE4           3       CAROL       NURSE
## BEN4             4       CAROL         BEN
## EMILY4           1       CAROL       EMILY
## MR. GELLER5     31 MRS. GELLER  MR. GELLER
## GUNTHER5         1 MRS. GELLER     GUNTHER
## EMILY5           2 MRS. GELLER       EMILY
## MRS. GELLER6    31  MR. GELLER MRS. GELLER
## GUNTHER6         1  MR. GELLER     GUNTHER
## EMILY6           3  MR. GELLER       EMILY
## CAROL7          25       SUSAN       CAROL
## NURSE7           1       SUSAN       NURSE
## EMILY7           3       SUSAN       EMILY
## NURSE8           1      JANICE       NURSE
## GUNTHER8         2      JANICE     GUNTHER
## CAROL10          3       NURSE       CAROL
## SUSAN10          1       NURSE       SUSAN
## JANICE10         1       NURSE      JANICE
## ERICA10          1       NURSE       ERICA
## MIKE11           4       DAVID        MIKE
## BEN11            1       DAVID         BEN
## ERIC13           3      URSULA        ERIC
## MONA13           1      URSULA        MONA
## URSULA19         3        ERIC      URSULA
## MONA19           1        ERIC        MONA
## MRS. GELLER20    1     GUNTHER MRS. GELLER
## MR. GELLER20     1     GUNTHER  MR. GELLER
## JANICE20         2     GUNTHER      JANICE
## JILL20           1     GUNTHER        JILL
## MARK20           1     GUNTHER        MARK
## KATHY20          1     GUNTHER       KATHY
## JOSHUA20         1     GUNTHER      JOSHUA
## CHARLIE20        1     GUNTHER     CHARLIE
## WAITER21         1   DR. GREEN      WAITER
## MONA21           3   DR. GREEN        MONA
## CAROL22          4         BEN       CAROL
## DAVID22          1         BEN       DAVID
## GUNTHER29        1      JOSHUA     GUNTHER
## EMILY29          3      JOSHUA       EMILY
## CAROL30          1       EMILY       CAROL
## MRS. GELLER30    2       EMILY MRS. GELLER
## MR. GELLER30     3       EMILY  MR. GELLER
## SUSAN30          3       EMILY       SUSAN
## JOSHUA30         3       EMILY      JOSHUA
## URSULA36         1        MONA      URSULA
## ERIC36           1        MONA        ERIC
## DR. GREEN36      3        MONA   DR. GREEN
## GUNTHER38        1     CHARLIE     GUNTHER
## AMY38            1     CHARLIE         AMY

p <- simpleNetwork(Friends_Network %>% select(from,to), height="100px", width="100px",        
        Source = 1,             # column number of source
        Target = 2,             # column number of target
        linkDistance = 10,      # distance between node. Increase this value to have more space between nodes
        charge = -900,          # numeric value indicating either the strength of the node repulsion (negative value) or attraction (positive value)
        fontSize = 14,          # size of the node names
        fontFamily = "lato",   # font og node names
        linkColour = "#666",    # colour of edges, MUST be a common colour for the whole graph
        nodeColour = cassis,     # colour of nodes, MUST be a common colour for the whole graph
        opacity = 0.9,        # opacity of nodes. 0=transparent. 1=no transparency
        zoom = T              # Can you zoom on the figure?
        )

p

connect <- Friends_Network %>% select(from, to, size)
rownames(connect) <- 1:nrow(connect)
colnames(connect)[3] <- "value"

coauth <- c( as.character(connect$from), as.character(connect$to)) %>%
  as.tibble() %>%
  group_by(value) %>%
  summarize(n=n())
colnames(coauth) <- c("name", "n")

# Create a graph object with igraph
mygraph <- graph_from_data_frame( connect, vertices = coauth, directed = FALSE )

# Find community
com <- walktrap.community(mygraph)
#max(com$membership)

#Reorder dataset and make the graph
coauth <- coauth %>% 
  mutate( grp = com$membership) %>%
  arrange(grp) %>%
  mutate(name=factor(name, name))

# keep only 7 first communities
coauth <- coauth %>% 
  filter(grp<=7)

scale_col <- colfunc(max(coauth$grp))

# keep only this people in edges
connect <- connect %>%
  filter(from %in% coauth$name) %>%
  filter(to %in% coauth$name)

# Create a graph object with igraph
mygraph <- graph_from_data_frame( connect, vertices = coauth, directed = FALSE )

# Make the graph
ggraph(mygraph, layout="linear") + 
  geom_edge_arc(edge_colour="grey", fold=TRUE) +
  geom_node_point(aes(size=n, color=as.factor(grp), fill=grp)) +
  scale_color_manual(values=scale_col)+
  geom_node_text(aes(label=name), angle=65, hjust=1, nudge_y = -0.5, size=3) +
  expand_limits(x = c(0,0), y = c(-2,0))+
  theme(text=element_text(size=12),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        legend.position = "none")

Other_char <- Other_char_raw %>%
  select(1) %>%
  `colnames<-`(c("Character")) %>% 
  mutate(Number_episode = NA, Average_grade = NA)

#Other_char <- Other_char[Other_char$Character!="GUNTHER",]

Friends <- Friends[!is.na(Friends$Character),]

IMDB$ID_Episode <- 1:nrow(IMDB)

for (i in 1:nrow(Other_char)){
  Other_char$Number_episode[i] <- Friends$ID_Episode[Friends$Character == Other_char$Character[i]] %>%
    unique() %>% length()
  Other_char$Average_grade[i] <- (IMDB$Grade[IMDB$ID_Episode %in%
             (Friends$ID_Episode[Friends$Character == Other_char$Character[i]] %>%
                unique())]) %>% mean()
}

Other_char <- Other_char %>% as.data.frame()
Other_char$Average_grade <- Other_char$Average_grade %>% round(2)

Other_char <- Other_char[Other_char$Number_episode>=5,]


Other_graph <- ggplot(Other_char, aes(x = Average_grade, y = Number_episode))+
  geom_vline(xintercept = mean(Other_char$Average_grade,na.rm = T), col = "grey")+
  geom_text(aes(x=mean(Average_grade,na.rm = T)-0.01,label=round(mean(Average_grade,na.rm = T),2), y=max(Number_episode)+1), colour="grey", angle=0, vjust = 0, size = 9/.pt)+
  geom_smooth(col = yellow, method = lm, formula = y ~ splines::bs(x, 3), se = FALSE)+
  geom_point(aes(size = Number_episode, colour = Average_grade > mean(Average_grade,na.rm = T)))+
  xlim(min(Other_char$Average_grade),8.7)+
  geom_text(aes(label = paste(Character,Average_grade),colour = Average_grade > mean(Average_grade,na.rm = T)),
            hjust=-0.1,
            vjust=-0.5,
            size = 9/.pt,
            check_overlap = T) +
  scale_color_manual(values=c(red,blue)) +
  labs(title="Character with the best average according to their frequency",
       subtitle="",
       y="Frequency", x="Grade")+
  theme(text= element_text(family = font),
        legend.background = element_blank(),
        legend.position = "none",
        panel.background = element_blank(),
        legend.text = element_text(size = 8),
        axis.line = element_line(),
        axis.text.y = element_text(angle = 0),
        plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
        plot.caption = element_text(family = font, size = 10, color = purple))
Other_graph

Friends$Text <- Friends$Text %>% str_replace_all("\n"," ")
unnested_text <- Friends %>%
  tidytext::unnest_tokens(output = "word", input = Text) %>%
  select(Character, ID_Episode, Words, word)

stop_words <- stop_words

clean_text_raw <- anti_join(x = unnested_text, y = stop_words, by = "word")

clean_text <- clean_text_raw[!(clean_text_raw$word %in% tolower(Name_Team)),]
clean_text <- clean_text[!(clean_text$word %in% tolower(Other_char$Character)),]

to_remove <- c("yeah","hey","uh","don","ya","didn","ll","ve","y'know","um","gonna","umm","ah","doesn","ooh","ohh","huh")

clean_text <- clean_text[!(clean_text$word %in% tolower(to_remove)),]

clean_text <- clean_text %>%
  mutate(lemme = wordStem(words = word, language = "en"))

Top <- 120
text_count <- clean_text %>%
  count(lemme) %>%
  arrange(desc(n)) %>%
  head(Top)

for (i in 1:Top){
  text_count$word[i] <- (clean_text$word[clean_text$lemme==text_count$lemme[i]])[1]
}

wordcloud(words = text_count$word, freq = text_count$n, min.freq = 3,
          max.words=Top, random.order=FALSE, rot.per=0, 
          colors=rev(c(yellow, red, cassis, purple)))

Main_top_word <- clean_text[clean_text$lemme %in% text_count$lemme,]
Main_top_word <- Main_top_word %>% select(Character, word, lemme)
Main_top_word <- Main_top_word[Main_top_word$Character %in% Name_Team,]

Main_top_word <- Main_top_word %>%
  select(Character, lemme) %>%
  table() %>%
  as.data.frame() %>%
  group_by(Character) %>%
  top_n(10, Freq) %>%
  ungroup() %>%
  arrange(Character, -Freq)

Main_top_word %>%
  mutate(lemme = reorder(lemme, Freq)) %>%
  ggplot(aes(y = lemme, x = Freq, fill = factor(Character)))+
  geom_col(show.legend = FALSE)+
  scale_fill_manual(values=colfunc(6))+
  facet_wrap(~ Character, scales = "free")+
  theme(text=element_text(size=12),
        legend.position = "none",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = blue))

clean_text_interact <- clean_text_raw[clean_text_raw$word %in% tolower(Name_Team),]
clean_text_interact <- clean_text_interact[clean_text_interact$Character %in% Name_Team,]

Interaction_Main <- clean_text_interact %>%
  select(Character, word) %>% 
  table() %>% 
  as.data.frame() %>%
  mutate(Interaction=toupper(word)) %>% 
  select(-word)

Interaction_Main %>%
  ggplot(aes(y = Interaction, x = Freq, fill = Character))+
  geom_col()+
  scale_x_continuous(limits = c(0, 800))+
  scale_fill_manual(values=colfunc(6))+
  facet_wrap(~ Character, scales = "free")+
  theme(text=element_text(size=12),
        legend.position = "none",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = blue))

Third_person <- Interaction_Main[Interaction_Main$Character==Interaction_Main$Interaction,]

ggplot(Third_person,aes(x = reorder(Character,-Freq), y = Freq))+
  geom_col(fill = purple)+
  theme(text=element_text(size=12),
        legend.position = "none",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = blue))+
  labs(title="Character talking most about himself in the third person",
       subtitle="",
       y="Frequency", x="Character")

Writter <- merge(Prod, IMDB) %>%
  filter(Job=="Writter") %>%
  select(Name, Grade) %>% 
  group_by(Name) %>% 
  summarise(Grade=round(mean(Grade),1), Freq = n()) %>%
  filter(Freq>2) %>% 
  arrange(-Grade)

Writter_graph <- ggplot(Writter, aes(x = Grade, y = Freq))+
  geom_vline(xintercept = mean(Writter$Grade,na.rm = T), col = "grey")+
  geom_text(aes(x=mean(Grade,na.rm = T)-0.05,label=round(mean(Grade,na.rm = T),2), y=max(Freq)+1), colour="grey", angle=0, vjust = 0, size = 9/.pt)+
  #geom_smooth(col = yellow, method = lm, formula = y ~ splines::bs(x, 3), se = FALSE)+
  geom_point(aes(size = Freq, colour = Grade > mean(Grade,na.rm = T)))+
  xlim(min(Writter$Grade),9.3)+
  geom_text(aes(label = paste(Name,Grade),colour = Grade > mean(Grade,na.rm = T)),
            hjust=-0.1,
            vjust=-0.5,
            size = 9/.pt,
            check_overlap = T) +
  scale_color_manual(values=c(red,blue)) +
  theme(text=element_text(size=12),
        legend.position = "none",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = blue)) +
  labs(title="Writter with the best average according to their frequency",
       subtitle="",
       y="Frequency", x="Grade")
Writter_graph

library(plotly)

p <- ggplot(IMDB, aes(Episode, Season, fill= Grade)) + 
  geom_tile(color = "white") +
  geom_text(aes(label = Grade), color = "white", size = 2.5)+
  scale_fill_gradient(low=blue, high=purple)+
  theme(text=element_text(size=12),
        legend.position = "right",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = blue)) +
  labs(title="Writter with the best average according to their frequency",
       subtitle="")

ggplotly(p)

library(plotly)

Episodes <- c("Friends", unique(IMDB$Season_label), IMDB$Summary_tree)

Seasons <- c("", rep("Friends",times=10), IMDB$Season_label)

fig <- plot_ly(
  type="treemap",
  labels=Episodes,
  parents=Seasons,
  maxdepth=4,
  textposition= "middle center",
  text=list(size=10),
  marker = list(colorscale = "Reds"))

fig

library(reshape2)

friends_tokens <- Friends %>%
  mutate(text=as.character(Friends$Text)) %>%
  unnest_tokens(word, text)

#wordcloud for positive and negative sentiments
wordcloud_pos_neg<- friends_tokens %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort=TRUE) %>%
  acast(word ~ sentiment, value.var="n", fill=0) %>%
  comparison.cloud(colors=c(red, yellow), max.words=200)

wordcloud_pos_neg

## NULL

library(tidytext)
library(textdata)

nrc <- get_sentiments(lexicon ="nrc")

fsentiments <- friends_tokens %>%
  inner_join(nrc, "word") %>%
  count(word, sentiment, sort=TRUE)

# Frequency of each sentiment
ggplot(data=fsentiments, aes(x=reorder(sentiment, -n, sum), y=n)) +
  geom_bar(stat="identity", aes(fill=sentiment, color=sentiment), show.legend=FALSE) +
  scale_fill_manual(values=colfunc(10))+
  scale_color_manual(values=colfunc(10))+
  labs(x="Sentiment", y="Frequency")+
  theme(axis.text.x = element_text(angle=45, hjust=1))

fsentiments %>%
  group_by(sentiment) %>%
  arrange(desc(n)) %>%
  slice(1:10) %>%
  ggplot(aes(x=reorder(word, n), y=n)) +
    geom_col(aes(fill=sentiment), show.legend=FALSE) +
    facet_wrap(~sentiment, scales="free_y") +
    theme(legend.position = "none",
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.background = element_blank())+
    labs(y="Frequency", x="Terms") +
    coord_flip()

friends_tokens %>%
  filter(Character %in% c("MONICA","RACHEL","PHOEBE","ROSS","CHANDLER","JOEY")) %>%
  inner_join(nrc, "word") %>%
  count(Character, sentiment, sort=TRUE) %>%
  ggplot(aes(x=sentiment, y=n)) +
    geom_col(aes(fill=sentiment), show.legend=FALSE) +
    facet_wrap(~Character, scales="free_x") +
    theme(text=element_text(size=12),
          legend.position = "none",
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.background = element_blank(),
          axis.line = element_line(colour = blue))+
    labs(x="Sentiment", y="Frequency") +
    coord_flip()

TEST<- friends_tokens %>%
  filter(Character %in% c("MONICA","RACHEL","PHOEBE","ROSS","CHANDLER","JOEY")) %>%
  inner_join(nrc, "word") %>% 
  select(Season, Episode, sentiment, Words) %>% 
  group_by(Season, Episode, sentiment) %>% 
  summarise(Words = sum(Words)) %>% 
  ungroup() %>% 
  filter(sentiment != "positive",sentiment != "negative")

TEST <- TEST %>%
  group_by(Season, Episode) %>% 
  filter(Words == max(Words))

ggplot(TEST, aes(x = Episode, y = sentiment, color = Season, group = Season))+
  geom_point()+
  geom_line()

TEST<- friends_tokens %>%
  filter(Character %in% c("MONICA","RACHEL","PHOEBE","ROSS","CHANDLER","JOEY")) %>%
  inner_join(nrc, "word")

TEST %>%
group_by(Episode) %>%
count(sentiment) %>%
arrange(desc(n)) %>%
ggplot( aes(x=Episode, y=n, group=sentiment, color=sentiment)) +
geom_line() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("sentiment over episode by season")

Scenes <- Desc[str_detect(Desc$Text, pattern = "Scene"),] %>% 
  separate(col=Text, into = c("Scene","Description"), sep = "Scene") %>% 
  select(-Scene, - ID, -Desc) %>% 
  mutate(Description = Description %>% str_remove(": ")) %>% 
  separate(col=Description, into = c("Scene","Description"), sep = ",|]|:|;|\\.") %>% 
  select(-Description) %>% 
  filter(Scene != "") %>% 
  mutate(Scene = Scene %>% str_remove("inside|outside|Inside|Outside"),
         Scene = str_trim(Scene),
         Scene = toupper(Scene))

# Scenes$Scene %>% table() %>% as.data.frame() %>% arrange(desc(Freq))
# 
# str_trim(Scenes$Scene)

Scenes$Scene[(Scenes$Scene %>% str_detect("ERK"))] %>% table()

## .
##                                         13CENTRAL PERK 
##                                                      1 
##                                            CENTAL PERK 
##                                                      1 
##                                           CENTRAL PERK 
##                                                    465 
##                                CENTRAL PERK - CHANDLER 
##                                                      1 
##                                        OF CENTRAL PERK 
##                                                      1 
## PHOEBE AND RACHEL SITTING ON THE COUCH IN CENTRAL PERK 
##                                                      1 
##                              PHOEBE IS AT CENTRAL PERK 
##                                                      1 
##                                ROSS IS IN CENTRAL PERK 
##                                                      1 
##                               THE STREET  CENTRAL PERK 
##                                                      2 
##            THE STREET DOWN THE BLOCK FROM CENTRAL PERK 
##                                                      1 
##                    THE STREET IN FRONT OF CENTRAL PERK 
##                                                      4 
##              THE STREET RIGHT IN FRONT OF CENTRAL PERK 
##                                                      1

# 
# Scenes <- Scenes %>%
#   separate(col=Text, into = c("Scene","Description"), sep = ":") %>%
#   separate(col=Description, into=c('Location', 'Description'), sep=c(',',';',']'))

text_count_locations <- Scenes %>%
  count(Scene) %>%
  arrange(desc(n)) %>%
  mutate(Percentage=n/sum(n)*100) %>%
  head(10) %>%
  mutate(Scene = fct_reorder(Scene,Percentage))

#Focus sur les meileurs et pires épisodes pour comprendre ce qui constitue un bon scenario

IMDB_Category <- IMDB %>% 
  select(Season_label, Episode_label, Season, Episode, viewers, Grade) %>% 
  filter(!is.na(Grade)) %>% 
  mutate(Key = Season*100+Episode,
    Category = ifelse(
  Grade>mean(Grade,na.rm = T) & viewers>mean(viewers,na.rm = T),
  Category <- "Stars",
  ifelse(
    Grade<mean(Grade,na.rm = T) & viewers>mean(viewers,na.rm = T),
    Category <- "High Rentability but Low Quality",
    ifelse(
      Grade>mean(Grade,na.rm = T) & viewers<mean(viewers,na.rm = T),
      Category <- "Low Rentability but High Quality",
      Category <- "Bad Episodes"))))

ggplot(IMDB_Category, aes(x = Grade, y = viewers))+
  geom_point(aes(col = Category))+
  scale_color_manual(values=colfunc(4))+
  geom_vline(xintercept = mean(IMDB_Category$Grade,na.rm = T))+
  geom_hline(yintercept = mean(IMDB_Category$viewers,na.rm = T))+
  geom_label(label = "Bad Episodes", hjust = 1, alpha = 5,
            aes(y = viewers %>% quantile(.25) %>% as.numeric(),
                x = Grade %>% quantile(.25) %>% as.numeric()))+
  geom_label(label = "High Rentability\nbut Low Quality", hjust = 1, vjust = -.01,
            aes(y = viewers %>% quantile(.75) %>% as.numeric(),
                x = Grade %>% quantile(.25) %>% as.numeric()))+
  geom_label(label = "Low Rentability\nbut High Quality", hjust = 0, vjust = .6,
            aes(y = viewers %>% quantile(.25) %>% as.numeric(),
                x = Grade %>% quantile(.75) %>% as.numeric()))+
  geom_label(label = "Stars", hjust = 0,
            aes(y = viewers %>% quantile(.75) %>% as.numeric(),
                x = Grade %>% quantile(.75) %>% as.numeric()))+
  labs(title="Profitability, quality and views: Segmentation of Friends episodes",
     subtitle = "The aim is that the most appreciated episodes are the most seen",
     y="Viewers (million)",x="Grade", color="Category",
     caption="Source : IMDB")+
  theme_minimal()+
  theme(legend.position = "none",
        plot.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
        plot.caption = element_text(family = font, size = 10, color = purple),
        axis.line = element_blank(),
        axis.ticks = element_blank())

Category_filter <- (IMDB_Category$Category %>% unique())[c(1,2,4)]

BW_Analysis <- IMDB_Category %>%
  filter(Category %in% Category_filter) %>% 
  merge(IMDB) %>% 
  select(Key, Grade, viewers, Category) %>% 
  merge(Friends) %>% 
  group_by(Key) %>% 
  mutate(Category = Category) %>% 
  summarise(Nb_Character = n_distinct(Character),
            Nb_Scene = n_distinct(Scene),
            Grade = mean(Grade),
            viewers = mean(viewers),
            Category = Category) %>%
  unique() %>% 
  filter(Nb_Scene > 5,Nb_Scene < 90)

ggplot(BW_Analysis, aes(x = Nb_Scene, y = Nb_Character, col = Category))+
  geom_point()+
  scale_color_manual(values=colfunc(BW_Analysis$Category %>% unique() %>% length()))+
  geom_smooth(aes(group=Category), method = lm, se = FALSE)+
  labs(title="Profitability, quality and views: Segmentation of Friends episodes",
     subtitle = "The aim is that the most appreciated episodes are the most seen",
     y="Number of Character",x="Number of Scene", color="Category",
     caption="Source : IMDB")+
  theme(text= element_text(family = font),
        legend.background = element_blank(),
        legend.position = "right",
        panel.background = element_blank(),
        legend.text = element_text(size = 8),
        axis.line = element_line(),
        axis.text.y = element_text(angle = 0),
        plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
        plot.caption = element_text(family = font, size = 10, color = purple))

IMDB_Category %>%
  select(Season,Category) %>%
  table() %>%
  as.data.frame.matrix()

##    Bad Episodes High Rentability but Low Quality
## 1            11                                6
## 2             1                               12
## 3             8                                9
## 4             8                                2
## 5             9                                0
## 6            10                                0
## 7            12                                2
## 8             8                                5
## 9            10                                5
## 10            4                                1
##    Low Rentability but High Quality Stars
## 1                                 2     5
## 2                                 0    11
## 3                                 2     6
## 4                                 7     7
## 5                                 5    10
## 6                                12     3
## 7                                 8     2
## 8                                 1    10
## 9                                 3     6
## 10                                9     3

ggplot(IMDB_Category %>% filter(Category == "Stars"|Category == "Bad Episodes"), aes(x = Season))+
  geom_bar(aes(fill = Category), position = "dodge")+
  coord_polar(theta = "y")+
  scale_fill_manual(values=c(blue, purple))+
  ylim(c(0,20))+
  xlim(c(-2,11))+
  geom_text(hjust = 1.1, size = 3, family = font, col = purple,
            aes(y = 0, label = paste("Season",Season)))+
  labs(title="Evaluation of the seasons according to the best and worst episodes",
     subtitle = "The last seasons have some excellent episodes unlike the first ones which are mainly bad",
     y=NULL,x=NULL, fill="Category",
     caption="Source : IMDB")+
  theme_minimal() +
  theme(legend.position = "right",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
        plot.caption = element_text(family = font, size = 10, color = purple),
        axis.line = element_blank(),
        axis.text.y = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks = element_blank())

# library(odbc)
# library(DBI)
# conn = dbConnect(drv = odbc::odbc(),
#                  Driver = "ODBC Driver 13 for SQL Server",
#                  server = "tcp:ibrahimmeddiouiserver.database.windows.net",
#                  database = "ibrahimmeddiouidatabase",
#                  uid = "ibrahimmeddioui",
#                  pwd = "181SPPum")
# 
# Friends <- read.csv(file = "Friends.csv")
# IMDB <- read.csv(file = "IMDB.csv")
# Desc <- read.csv(file = "Desc.csv")
# Prod <- read.csv(file = "Prod.csv")
# 
# dbWriteTable(conn = conn, name = "Friends", value = Friends, overwrite = T)
# dbWriteTable(conn = conn, name = "IMDB", value = IMDB, overwrite = T)
# dbWriteTable(conn = conn, name = "Desc", value = Desc, overwrite = T)
# dbWriteTable(conn = conn, name = "Prod", value = Prod, overwrite = T)
# 
# # This is for test only
# Friends_sql <- dbReadTable(conn = conn, name = 'Friends')

vline <- function(x = 0, color = "black") {
list(
type = "line",
y0 = 0,
y1 = 1,
yref = "paper",
x0 = x,
x1 = x,
line = list(color = color, dash="dot")
)
}



IMDB_writers <- IMDB[!grepl('Story by', IMDB$`Written by`),]
IMDB_writers %>% select(`Written by`, Episode, Season_label) %>%
group_by(`Written by`)%>%
arrange(`Written by`)

## # A tibble: 186 x 3
## # Groups:   Written by [55]
##    `Written by`                 Episode Season_label
##    <chr>                          <dbl> <chr>       
##  1 Adam Chase                        17 Season 2    
##  2 Adam Chase                         8 Season 3    
##  3 Adam Chase                        19 Season 3    
##  4 Adam Chase                         7 Season 4    
##  5 Adam Chase                        15 Season 5    
##  6 Adam Chase                         1 Season 6    
##  7 Adam Chase & Ira Ungerleider       6 Season 1    
##  8 Adam Chase & Ira Ungerleider      10 Season 1    
##  9 Adam Chase & Ira Ungerleider      21 Season 1    
## 10 Adam Chase & Ira Ungerleider       2 Season 2    
## # ... with 176 more rows

writers_grade <- IMDB_writers %>% select(`Written by`, Grade) %>%
group_by(`Written by`) %>%
summarize(Count=n(), Grade=round(mean(Grade, na.rm=T), 2)) %>%
filter(Grade>=5) %>%
arrange(desc(Count))




writers_grade %>% plot_ly() %>%
add_markers(x=~Grade, y=~Count ,size=~Count, hovertemplate= paste( writers_grade$`Written by`,"%{y} Episodes:%{x} <extra></extra>"), color = (writers_grade$Grade<median(writers_grade$Grade)), colors=colfunc2(2) ) %>%
layout(title= "Grades of the episodes per writer", xaxis = list(title = "Grade", showline= T, linewidth=2, linecolor='black'),
yaxis = list(title = "Frequency", showline= T, linewidth=2, linecolor='black'), shapes= list(vline(median(writers_grade$Grade))), showlegend=F)

Friends project

Pamela Memic, Imané Namolsi, Ibrahim Meddioui & Théotime Bourgeois

25 avril 2022

Introduction