library(tidyverse)
library(tidytext)
library(rvest)
library(xml2)
library(urltools)
library(plotly)
library(ggplot2)
library(ggrepel)
library(igraph)
library(ggraph)
library(networkD3)
library(wordcloud)
library(wordcloud2)
library(SnowballC)
library(patchwork)
library(png)
library(ggimage)
red <- c("#E91E22")
blue <- c("#4395C0")
purple <- c("#1D2F6F")
yellow <- c("#F1BF45")
green <- c("#4C9F70")
cassis <- c("#832749")
font <- "Lato"
colfunc <- colorRampPalette(c(green,red,cassis,purple,blue,yellow))
colfunc2 <- colorRampPalette(c(red,cassis,purple))
List_names <- list.files("ALL_Seasons", pattern=NULL, all.files=FALSE,
full.names=FALSE)
List_names <- paste0("ALL_Seasons/",List_names) %>% as.data.frame()
#I set up my dataframe
Friends <- data.frame(File=NA,ID=NA,Text=NA)
df <- read_html(List_names$.[199]) %>%
html_elements("body") %>%
html_text2() %>%
str_split("\n") %>%
as.data.frame() %>%
`colnames<-`(c("Text"))
df$File <- List_names$.[199]
df <- df[df$Text!="",]
df$ID <- 1:nrow(df)
Friends <- rbind(Friends,df)
#To know the validity of my HTML files
for (i in 1:nrow(List_names)){
List_names$Keep[i] <- (read_html(List_names$.[i], encoding = "UTF-8")%>%
html_elements(css = "p") %>%
html_text() %>%
length())>0
}
List_names <- List_names[List_names$Keep,]
for (i in 1:nrow(List_names)){
Page <- read_html(List_names$.[i], encoding = "UTF-8")
df <- Page %>%
html_elements(css = "p") %>%
html_text() %>%
as.data.frame()
colnames(df) <- "Text"
df$ID <- 1:nrow(df)
df$File <- List_names$.[i]
Friends <- rbind(Friends,df)
}
rm(Page,df,List_names,i)
#Cleaning my database
Friends <- Friends[Friends$Text!="",]
Friends <- Friends[!is.na(Friends$Text),]
Friends$File <- Friends$File %>%
str_remove("ALL_Seasons/") %>%
str_remove(".html")
#For double episode
Friends <- separate(Friends,col=File , into = c("TV", "Autre"), sep = "-")
Friends$Season <- substr(Friends$TV, start = 1, stop = 2) %>% as.numeric()
Friends$Episode <- substr(Friends$TV, start = 3, stop = 4) %>% as.numeric()
#Friends$ID <- paste0("S",Friends$Season,"-E",Friends$Episode,"-",Friends$ID)
Friends <- Friends %>% select(-TV,-Autre)
#Exclude Producer, Writer, Director...
Prod <- Friends[Friends$Text %>% str_detect(" by: "),]
Friends <- Friends[!Friends$Text %>% str_detect(" by: "),]
Friends <- Friends[Friends$ID!=1,]
#Exclude Description
Friends$Text <- iconv(Friends$Text, from="UTF-8", to="UTF-8", sub="NA")
Friends$Desc <- substr(Friends$Text,1,1)=="("|substr(Friends$Text,1,1)=="["
Friends$Scene <- cumsum(Friends$Desc)
Desc <- Friends[Friends$Desc,]
Friends <- Friends[!Friends$Desc,]
Friends <- separate(Friends,col=Text , into = c("Character", "Text"), sep = ":",fill = "left")
Friends <- Friends[Friends$Text!="",]
Friends$Character <- Friends$Character %>% toupper()
Friends <- Friends[!str_detect(Friends$Character," BY"),]
#Prod database
Prod <- separate(Prod,col=Text, into=c(paste0("Info",1:5)), sep = "\n",fill = "left")
Prod <- rbind(
Prod %>% select(Episode, Season, Info1) %>% `colnames<-`(c("Episode", "Season", "Info")),
Prod %>% select(Episode, Season, Info2) %>% `colnames<-`(c("Episode", "Season", "Info")),
Prod %>% select(Episode, Season, Info3) %>% `colnames<-`(c("Episode", "Season", "Info")),
Prod %>% select(Episode, Season, Info4) %>% `colnames<-`(c("Episode", "Season", "Info")),
Prod %>% select(Episode, Season, Info5) %>% `colnames<-`(c("Episode", "Season", "Info")))
Prod <- Prod[!Prod$Info %>% is.na(),]
Prod <- separate(Prod,col=Info, into=c("Job","Name"), sep = ": ",fill = "left")
Prod <- separate(Prod,col=Name, into=c("Job2","Name"), sep = "by",fill = "left")
Prod$Job[is.na(Prod$Job)] <- Prod$Job2[is.na(Prod$Job)]
Prod <- Prod %>% select(-Job2)
Prod <- Prod %>% mutate(
Job = case_when(
Job %>% str_detect("rit") ~ "Writter",
Job %>% str_detect("eleplay") ~ "Teleplay",
Job %>% str_detect("irect") ~ "Director",
Job %>% str_detect("scri") ~ "Transcript",
Job %>% str_detect("tory") ~ "Story",
Job %>% str_detect("rodu") ~ "Producer"
)
)
Prod <- Prod[!is.na(Prod$Job),]
Friends <- Friends[!is.na(Friends$ID),]
# Friends$Episode[is.na(Friends$Episode)] <- 0
Episode <- Friends %>%
select(Season, Episode) %>%
arrange(Season, Episode) %>%
unique() %>%
filter(!is.na(Episode))
Episode$ID <- 1:nrow(Episode)
Episode <- Episode %>% mutate(Key = 100*Season+Episode)
Friends <- Friends %>% mutate(Key = 100*Season+Episode)
for(i in 1:nrow(Friends)){
Friends$ID_Episode[i] <- Episode$ID[Friends$Key[i]==Episode$Key]
}
rm(Episode,i)
Friends$Words <- str_count(Friends$Text, '\\w+')
Friends <- Friends[!is.na(Friends$Text),]
IMDB <- data.frame(Season=NA,.=NA)
URL_List <- paste0("https://m.imdb.com/title/tt0108778/episodes/?season=",1:10)
for (url in URL_List){
Page <- read_html(url)
Web <- Page %>% html_elements(css = ".btn-full") %>% html_text()
Web <- Web %>% as.data.frame() %>% mutate(Season = url)
IMDB <- rbind(IMDB,Web)
}
rm(Web, Page, url, URL_List, i)
IMDB <- IMDB[!is.na(IMDB$Season),]
IMDB <- IMDB[IMDB$.!="\nFriends \n(1994–2004)\n\n",]
IMDB <- separate(IMDB,col=Season, into=c("Delete","Season"), sep = "=",fill = "left")
IMDB <- separate(IMDB,col=., into=c("Delete","Episode","Title","Delete2","Grade","Date_brut"), sep = "\n",fill = "left")
IMDB <- IMDB %>% select(Season, Episode, Title, Grade, Date_brut)
IMDB$Episode <- IMDB$Episode %>% as.numeric()
IMDB$Date <- IMDB$Date_brut %>% as.Date(format("%d %b. %Y"))
IMDB$Date[is.na(IMDB$Date)] <- IMDB$Date_brut[is.na(IMDB$Date)] %>% as.Date(format("%d %b %Y"))
IMDB <- IMDB %>% select(-Date_brut)
IMDB$ID_Episode <- 1:nrow(IMDB)
IMDB$Grade <- IMDB$Grade %>% str_trim() %>% as.numeric()
IMDB$Season <- IMDB$Season %>% as.numeric() %>% as.factor()
IMDB$Episode <- IMDB$Episode %>% as.numeric()
Infos <- read_csv(file = "friends_info.csv")
Infos <- Infos %>%
separate(col = Episode, into = c("Part1", "Part2"), sep = "\n")
Infos_temp <- Infos[!is.na(Infos$Part2),] %>%
select(-Part1)
Infos <- Infos %>% select(-3)
colnames(Infos)[2] <- "Episode"
colnames(Infos_temp)[2] <- "Episode"
Infos <- rbind(Infos,Infos_temp) %>%
separate(col = Episode, into = c("Season", "Episode"), sep = "-") %>%
mutate(Season=as.numeric(Season),
Episode=as.numeric(Episode),
Key = Season*100+Episode)
IMDB <- IMDB %>%
mutate(Season=as.numeric(Season),
Key = Season*100+Episode)
for (i in 1:nrow(Infos)){
Infos$Grade[i] <- ifelse(is_empty(IMDB$Grade[Infos$Key[i]==IMDB$Key]),NA,IMDB$Grade[Infos$Key[i]==IMDB$Key])
}
IMDB <- Infos[!is.na(Infos$Episode),]
IMDB$Season_label <- paste("Season" , IMDB$Season)
IMDB$Episode_label <- paste("Episode" , IMDB$Episode)
IMDB <- IMDB %>%
group_by(Season_label,Episode_label) %>%
mutate(Episode_Title = paste(Episode_label, Title, sep=" : "),
Summary = paste(strwrap(Summary,90), collapse="\n"),
Summary_tree = paste(Episode_Title, Summary, sep="\n")) %>%
ungroup() %>%
mutate(viewers = str_remove(`U.S. viewers`," million") %>% as.numeric()) %>% select(-`U.S. viewers`)
rm(Infos)
write_csv(IMDB, file = "IMDB.csv")
write_csv(Friends, file = "Friends.csv")
write_csv(Prod, file = "Prod.csv")
write_csv(Desc, file = "Desc.csv")
Team <- Friends$Character %>%
table() %>%
as.data.frame() %>%
arrange(desc(Freq)) %>%
head(6) %>%
`colnames<-`(c("Character", "Freq"))
Name_Team <- Team$Character %>% droplevels() %>% as.character()
Friends_Historical <- Friends[Friends$Character %in% Name_Team,] %>%
group_by(Character, Season) %>%
summarise(Words = sum(Words)) %>%
mutate(Words_Cum = cumsum(Words)) %>%
arrange(Season, desc(Words_Cum)) %>%
group_by(Season) %>%
mutate(Rank = 1, Rank = cumsum(Rank)) %>%
ungroup() %>%
arrange(Character, Season) %>%
group_by(Character) %>%
mutate(Rank_diff = Rank - lag(Rank, default = Rank[1])) %>%
ungroup() %>%
arrange(Season, desc(Rank_diff), desc(Words_Cum)) %>%
group_by(Season) %>%
mutate(Rank_diff_TF = Rank_diff==max(Rank_diff),
Rank_diff_TF = ifelse(Rank_diff_TF==T,cumsum(Rank_diff_TF),F),
Rank_diff_TF = Rank_diff_TF==1) %>%
mutate(Image = ifelse(Rank_diff_TF==T,paste0("Photos/",Character,".png"),NA))
ggplot(Friends_Historical,aes(x = Season, y = Words_Cum))+
geom_smooth(aes(group=Character, col = Character),
method = lm, se = FALSE)+
#geom_line(aes(col = Character))+
scale_color_manual(values=colfunc(6))+
scale_x_discrete(limits=c(1:10))+
geom_image(aes(image=Image,),
asp = 2,
size=.07)+
labs(title="Number of words spoken by the protagonists",
subtitle="From the beginning to the end of friends",
y="Count of Words", x="Season")+
theme(text= element_text(family = font),
legend.background = element_blank(),
legend.position = "right",
panel.background = element_blank(),
legend.text = element_text(size = 8),
axis.line = element_line(),
axis.text.y = element_text(angle = 0),
plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
plot.caption = element_text(family = font, size = 10, color = purple))
IMDB_Season <- IMDB %>%
filter(Season %in% c(1:10)) %>%
arrange(Season_label, Episode) %>%
mutate(ID_Episode = 1, ID_Episode = cumsum(ID_Episode)) %>%
select(ID_Episode, Grade, Season_label, Episode, viewers) %>%
group_by(Season_label) %>%
mutate(max = max(Episode),
max = max==Episode)
Season_grade_graph <- ggplot(IMDB_Season,aes(x = ID_Episode, y = Grade))+
geom_point(aes(col=Season_label))+
scale_color_manual(values=colfunc(IMDB_Season$Season_label %>% unique() %>% length()))+
geom_smooth(aes(group=Season_label, col = Season_label),
method = lm, formula = y ~ splines::bs(x, 4), se = FALSE)+
geom_smooth(col = red, method = lm, formula = y ~ splines::bs(x, 4), se = FALSE)+
labs(title="Friends: a series that keeps its viewers on their toes",
subtitle=paste0("With ",
((IMDB_Season$Grade[IMDB_Season$max] %>% mean(na.rm = T))-(IMDB_Season$Grade %>% mean(na.rm = T))) %>% round(1) %>% abs(),
" points more, the last episodes of selected seasons are generally more appreciated"),
y="Grade", x="Episode", col = "Season",
caption = "Source : IMDB")+
theme(text= element_text(family = font),
legend.background = element_blank(),
legend.position = "right",
panel.background = element_blank(),
legend.text = element_text(size = 8),
axis.line = element_line(),
axis.text.y = element_text(angle = 0),
plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
plot.caption = element_text(family = font, size = 10, color = purple))
Season_grade_graph
ggplot(IMDB_Season, aes(x = viewers, y = Grade))+
#geom_point(col = "grey", alpha = .5)+
scale_color_manual(values=colfunc(IMDB_Season$Season_label %>% unique() %>% length()))+
#geom_text_repel(aes(label = paste(Season_label,"\n",Episode_label)))+
#geom_smooth(col = red, method = lm, formula = y ~ splines::bs(x, 3), se = FALSE)+
geom_smooth(aes(group=Season_label, col = Season_label),
method = lm, se = FALSE, linetype = "dotted", size = .5)+
geom_smooth(col = purple, method = lm, formula = y ~ splines::bs(x, 4), se = FALSE)+
labs(title="Evolution of grades per views",
y="Grade", x="Views (M)", col = "Season",
caption = "Source : IMDB")+
theme(text= element_text(family = font),
legend.background = element_blank(),
legend.position = "right",
panel.background = element_blank(),
legend.text = element_text(size = 8),
axis.line = element_line(),
axis.text.y = element_text(angle = 0),
plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
plot.caption = element_text(family = font, size = 10, color = purple))
Evolution_Season <- IMDB %>%
filter(!is.na(Grade)) %>%
group_by(Season) %>%
mutate(LastEp = Episode==max(Episode),
LastEp = ifelse(LastEp==T,Grade,NA)) %>%
summarise(Mean = mean(Grade,na.rm = T),
Max = max(Grade,na.rm = T),
Min = min(Grade,na.rm = T),
LastEp = max(LastEp,na.rm = T)) %>%
arrange(desc(Season)) %>%
mutate(Season = paste("Season",Season),
#Max = ifelse(LastEp>Max, LastEp,Max),
Season = factor(Season, levels=unique(Season)))
colors <- c("Last Episode" = red, "Average Grade" = purple, "Range Episode"= blue)
sum(Evolution_Season$Max==Evolution_Season$LastEp)*100/10
## [1] 30
ggplot(Evolution_Season) +
geom_segment( aes(x=Season, xend=Season, y=Max, yend=Min, color="Range Episode"), size=0.5) +
geom_segment( aes(x=Season, xend=Season, y=LastEp, yend=Mean), color="grey", size=0.5) +
geom_point( aes(x=Season, y=Min, color="Range Episode"), size=1.5 ) +
geom_point( aes(x=Season, y=Max, color="Range Episode"), size=1.5 ) +
geom_point( aes(x=Season, y=LastEp, color="Last Episode"), size=3 ) +
geom_point( aes(x=Season, y=Mean, color="Average Grade"), size=3 ) +
scale_color_manual(values = colors)+
coord_flip()+
labs(title="Grade per episode and per season",
subtitle="From the beginning to the end of Friends",
x = "",
y = "Grade",
color = "Legend")+
theme(text= element_text(family = font),
legend.background = element_blank(),
legend.position = "right",
panel.background = element_blank(),
legend.text = element_text(size = 8),
axis.line = element_line(),
axis.text.y = element_text(angle = 0),
plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
plot.caption = element_text(family = font, size = 10, color = purple))
for (i in Name_Team){
Team$Words[Team$Character==i] <- Friends$Words[Friends$Character==i] %>% sum(na.rm = T)
}
Team <- Team %>%
mutate(Image = paste0("Photos/",Character,".png"))
ggplot(Team,aes(x = Words, y = Freq))+
# geom_point(aes(col=Character))+
# geom_text_repel(aes(label = Character))+
geom_image(aes(image=Image),
asp = 2,
size=.15)+
ylim(c(6500,9000))+
scale_color_manual(values=colfunc(6))+
theme(text=element_text(size=12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
legend.position = "right",
axis.line = element_line(colour = purple))+
labs(title="Number of words spoken by the protagonists",
subtitle="From the beginning to the end of friends",
y="Number of lines", x="Count of Words")+
theme(text= element_text(family = font),
legend.background = element_blank(),
legend.position = "right",
panel.background = element_blank(),
legend.text = element_text(size = 8),
axis.line = element_line(),
axis.text.y = element_text(angle = 0),
plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
plot.caption = element_text(family = font, size = 10, color = purple))
Friends_Network <- Friends %>% select(Character, Season, Scene)
Friends_Network$Character[Friends_Network$Character %in% Name_Team] <- NA
Friends_Network$Character[Friends_Network$Character == "ALL"] <- NA
Friends_Network$Character[Friends_Network$Character == "WOMAN"] <- NA
Friends_Network$Character[Friends_Network$Character == "MAN"] <- NA
Friends_Network$Character[Friends_Network$Character == "RACH"] <- NA
Friends_Network$Character[Friends_Network$Character == "MNCA"] <- NA
Friends_Network$Character[Friends_Network$Character == "THE DIRECTOR"] <- NA
#Friends_Network$Character[Friends_Network$Character == "MONA"] <- NA
for (name in Name_Team){
Friends_Network$Character[str_detect(Friends_Network$Character,name)] <- NA
}
Friends_Network <- Friends_Network[!is.na(Friends_Network$Character),]
Top <- 40
Other_char_raw <- table(Friends_Network$Character) %>%
as.data.frame() %>%
arrange(desc(Freq)) %>%
head(Top)
Other_char <- Other_char_raw$Var1 %>% as.character()
Friends_Network <- Friends_Network[Friends_Network$Character %in% Other_char,]
Friends_matrix <- matrix(data = 0, ncol = Top, nrow = Top)
colnames(Friends_matrix) <- Friends_Network$Character %>% unique()
rownames(Friends_matrix) <- Friends_Network$Character %>% unique()
Friends_Network$ID <- Friends_Network$Season*100+Friends_Network$Scene
Friends_Network <- Friends_Network %>%
group_by(ID, Character) %>%
summarise(ID = mean(ID))
for (col in 1:ncol(Friends_matrix)){
for (row in 1:nrow(Friends_matrix)){
Friends_matrix[row,col] <-
Friends_Network$ID[Friends_Network$Character ==colnames(Friends_matrix)[row]] %in%
Friends_Network$ID[Friends_Network$Character == colnames(Friends_matrix)[col]] %>%
sum()
}
}
Friends_matrix <- Friends_matrix %>% as.data.frame()
Friends_Network <- matrix(ncol=3) %>% `colnames<-`(c("size", "from","to"))
for (i in 1:ncol(Friends_matrix)){
Temp <- Friends_matrix %>%
select(i) %>%
mutate(from = colnames(Friends_matrix %>% select(i)),
to = rownames(Friends_matrix)) %>%
`colnames<-`(c("size", "from","to"))
Friends_Network <- rbind(Friends_Network,Temp)
}
Friends_Network <- Friends_Network %>% filter(size>0) %>% as.data.frame()
Friends_Network <- Friends_Network[Friends_Network$from != Friends_Network$to,]
Friends_Network[Friends_Network$from %in% ((Friends_Network$from %>% table() %>% as.data.frame() %>% filter(Freq>1))$. %>% as.character()),]
## size from to
## GAVIN 1 GUY GAVIN
## STEVE 1 GUY STEVE
## SUSAN4 25 CAROL SUSAN
## NURSE4 3 CAROL NURSE
## BEN4 4 CAROL BEN
## EMILY4 1 CAROL EMILY
## MR. GELLER5 31 MRS. GELLER MR. GELLER
## GUNTHER5 1 MRS. GELLER GUNTHER
## EMILY5 2 MRS. GELLER EMILY
## MRS. GELLER6 31 MR. GELLER MRS. GELLER
## GUNTHER6 1 MR. GELLER GUNTHER
## EMILY6 3 MR. GELLER EMILY
## CAROL7 25 SUSAN CAROL
## NURSE7 1 SUSAN NURSE
## EMILY7 3 SUSAN EMILY
## NURSE8 1 JANICE NURSE
## GUNTHER8 2 JANICE GUNTHER
## CAROL10 3 NURSE CAROL
## SUSAN10 1 NURSE SUSAN
## JANICE10 1 NURSE JANICE
## ERICA10 1 NURSE ERICA
## MIKE11 4 DAVID MIKE
## BEN11 1 DAVID BEN
## ERIC13 3 URSULA ERIC
## MONA13 1 URSULA MONA
## URSULA19 3 ERIC URSULA
## MONA19 1 ERIC MONA
## MRS. GELLER20 1 GUNTHER MRS. GELLER
## MR. GELLER20 1 GUNTHER MR. GELLER
## JANICE20 2 GUNTHER JANICE
## JILL20 1 GUNTHER JILL
## MARK20 1 GUNTHER MARK
## KATHY20 1 GUNTHER KATHY
## JOSHUA20 1 GUNTHER JOSHUA
## CHARLIE20 1 GUNTHER CHARLIE
## WAITER21 1 DR. GREEN WAITER
## MONA21 3 DR. GREEN MONA
## CAROL22 4 BEN CAROL
## DAVID22 1 BEN DAVID
## GUNTHER29 1 JOSHUA GUNTHER
## EMILY29 3 JOSHUA EMILY
## CAROL30 1 EMILY CAROL
## MRS. GELLER30 2 EMILY MRS. GELLER
## MR. GELLER30 3 EMILY MR. GELLER
## SUSAN30 3 EMILY SUSAN
## JOSHUA30 3 EMILY JOSHUA
## URSULA36 1 MONA URSULA
## ERIC36 1 MONA ERIC
## DR. GREEN36 3 MONA DR. GREEN
## GUNTHER38 1 CHARLIE GUNTHER
## AMY38 1 CHARLIE AMY
p <- simpleNetwork(Friends_Network %>% select(from,to), height="100px", width="100px",
Source = 1, # column number of source
Target = 2, # column number of target
linkDistance = 10, # distance between node. Increase this value to have more space between nodes
charge = -900, # numeric value indicating either the strength of the node repulsion (negative value) or attraction (positive value)
fontSize = 14, # size of the node names
fontFamily = "lato", # font og node names
linkColour = "#666", # colour of edges, MUST be a common colour for the whole graph
nodeColour = cassis, # colour of nodes, MUST be a common colour for the whole graph
opacity = 0.9, # opacity of nodes. 0=transparent. 1=no transparency
zoom = T # Can you zoom on the figure?
)
p
connect <- Friends_Network %>% select(from, to, size)
rownames(connect) <- 1:nrow(connect)
colnames(connect)[3] <- "value"
coauth <- c( as.character(connect$from), as.character(connect$to)) %>%
as.tibble() %>%
group_by(value) %>%
summarize(n=n())
colnames(coauth) <- c("name", "n")
# Create a graph object with igraph
mygraph <- graph_from_data_frame( connect, vertices = coauth, directed = FALSE )
# Find community
com <- walktrap.community(mygraph)
#max(com$membership)
#Reorder dataset and make the graph
coauth <- coauth %>%
mutate( grp = com$membership) %>%
arrange(grp) %>%
mutate(name=factor(name, name))
# keep only 7 first communities
coauth <- coauth %>%
filter(grp<=7)
scale_col <- colfunc(max(coauth$grp))
# keep only this people in edges
connect <- connect %>%
filter(from %in% coauth$name) %>%
filter(to %in% coauth$name)
# Create a graph object with igraph
mygraph <- graph_from_data_frame( connect, vertices = coauth, directed = FALSE )
# Make the graph
ggraph(mygraph, layout="linear") +
geom_edge_arc(edge_colour="grey", fold=TRUE) +
geom_node_point(aes(size=n, color=as.factor(grp), fill=grp)) +
scale_color_manual(values=scale_col)+
geom_node_text(aes(label=name), angle=65, hjust=1, nudge_y = -0.5, size=3) +
expand_limits(x = c(0,0), y = c(-2,0))+
theme(text=element_text(size=12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
legend.position = "none")
Other_char <- Other_char_raw %>%
select(1) %>%
`colnames<-`(c("Character")) %>%
mutate(Number_episode = NA, Average_grade = NA)
#Other_char <- Other_char[Other_char$Character!="GUNTHER",]
Friends <- Friends[!is.na(Friends$Character),]
IMDB$ID_Episode <- 1:nrow(IMDB)
for (i in 1:nrow(Other_char)){
Other_char$Number_episode[i] <- Friends$ID_Episode[Friends$Character == Other_char$Character[i]] %>%
unique() %>% length()
Other_char$Average_grade[i] <- (IMDB$Grade[IMDB$ID_Episode %in%
(Friends$ID_Episode[Friends$Character == Other_char$Character[i]] %>%
unique())]) %>% mean()
}
Other_char <- Other_char %>% as.data.frame()
Other_char$Average_grade <- Other_char$Average_grade %>% round(2)
Other_char <- Other_char[Other_char$Number_episode>=5,]
Other_graph <- ggplot(Other_char, aes(x = Average_grade, y = Number_episode))+
geom_vline(xintercept = mean(Other_char$Average_grade,na.rm = T), col = "grey")+
geom_text(aes(x=mean(Average_grade,na.rm = T)-0.01,label=round(mean(Average_grade,na.rm = T),2), y=max(Number_episode)+1), colour="grey", angle=0, vjust = 0, size = 9/.pt)+
geom_smooth(col = yellow, method = lm, formula = y ~ splines::bs(x, 3), se = FALSE)+
geom_point(aes(size = Number_episode, colour = Average_grade > mean(Average_grade,na.rm = T)))+
xlim(min(Other_char$Average_grade),8.7)+
geom_text(aes(label = paste(Character,Average_grade),colour = Average_grade > mean(Average_grade,na.rm = T)),
hjust=-0.1,
vjust=-0.5,
size = 9/.pt,
check_overlap = T) +
scale_color_manual(values=c(red,blue)) +
labs(title="Character with the best average according to their frequency",
subtitle="",
y="Frequency", x="Grade")+
theme(text= element_text(family = font),
legend.background = element_blank(),
legend.position = "none",
panel.background = element_blank(),
legend.text = element_text(size = 8),
axis.line = element_line(),
axis.text.y = element_text(angle = 0),
plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
plot.caption = element_text(family = font, size = 10, color = purple))
Other_graph
Friends$Text <- Friends$Text %>% str_replace_all("\n"," ")
unnested_text <- Friends %>%
tidytext::unnest_tokens(output = "word", input = Text) %>%
select(Character, ID_Episode, Words, word)
stop_words <- stop_words
clean_text_raw <- anti_join(x = unnested_text, y = stop_words, by = "word")
clean_text <- clean_text_raw[!(clean_text_raw$word %in% tolower(Name_Team)),]
clean_text <- clean_text[!(clean_text$word %in% tolower(Other_char$Character)),]
to_remove <- c("yeah","hey","uh","don","ya","didn","ll","ve","y'know","um","gonna","umm","ah","doesn","ooh","ohh","huh")
clean_text <- clean_text[!(clean_text$word %in% tolower(to_remove)),]
clean_text <- clean_text %>%
mutate(lemme = wordStem(words = word, language = "en"))
Top <- 120
text_count <- clean_text %>%
count(lemme) %>%
arrange(desc(n)) %>%
head(Top)
for (i in 1:Top){
text_count$word[i] <- (clean_text$word[clean_text$lemme==text_count$lemme[i]])[1]
}
wordcloud(words = text_count$word, freq = text_count$n, min.freq = 3,
max.words=Top, random.order=FALSE, rot.per=0,
colors=rev(c(yellow, red, cassis, purple)))
Main_top_word <- clean_text[clean_text$lemme %in% text_count$lemme,]
Main_top_word <- Main_top_word %>% select(Character, word, lemme)
Main_top_word <- Main_top_word[Main_top_word$Character %in% Name_Team,]
Main_top_word <- Main_top_word %>%
select(Character, lemme) %>%
table() %>%
as.data.frame() %>%
group_by(Character) %>%
top_n(10, Freq) %>%
ungroup() %>%
arrange(Character, -Freq)
Main_top_word %>%
mutate(lemme = reorder(lemme, Freq)) %>%
ggplot(aes(y = lemme, x = Freq, fill = factor(Character)))+
geom_col(show.legend = FALSE)+
scale_fill_manual(values=colfunc(6))+
facet_wrap(~ Character, scales = "free")+
theme(text=element_text(size=12),
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = blue))
clean_text_interact <- clean_text_raw[clean_text_raw$word %in% tolower(Name_Team),]
clean_text_interact <- clean_text_interact[clean_text_interact$Character %in% Name_Team,]
Interaction_Main <- clean_text_interact %>%
select(Character, word) %>%
table() %>%
as.data.frame() %>%
mutate(Interaction=toupper(word)) %>%
select(-word)
Interaction_Main %>%
ggplot(aes(y = Interaction, x = Freq, fill = Character))+
geom_col()+
scale_x_continuous(limits = c(0, 800))+
scale_fill_manual(values=colfunc(6))+
facet_wrap(~ Character, scales = "free")+
theme(text=element_text(size=12),
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = blue))
Third_person <- Interaction_Main[Interaction_Main$Character==Interaction_Main$Interaction,]
ggplot(Third_person,aes(x = reorder(Character,-Freq), y = Freq))+
geom_col(fill = purple)+
theme(text=element_text(size=12),
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = blue))+
labs(title="Character talking most about himself in the third person",
subtitle="",
y="Frequency", x="Character")
Writter <- merge(Prod, IMDB) %>%
filter(Job=="Writter") %>%
select(Name, Grade) %>%
group_by(Name) %>%
summarise(Grade=round(mean(Grade),1), Freq = n()) %>%
filter(Freq>2) %>%
arrange(-Grade)
Writter_graph <- ggplot(Writter, aes(x = Grade, y = Freq))+
geom_vline(xintercept = mean(Writter$Grade,na.rm = T), col = "grey")+
geom_text(aes(x=mean(Grade,na.rm = T)-0.05,label=round(mean(Grade,na.rm = T),2), y=max(Freq)+1), colour="grey", angle=0, vjust = 0, size = 9/.pt)+
#geom_smooth(col = yellow, method = lm, formula = y ~ splines::bs(x, 3), se = FALSE)+
geom_point(aes(size = Freq, colour = Grade > mean(Grade,na.rm = T)))+
xlim(min(Writter$Grade),9.3)+
geom_text(aes(label = paste(Name,Grade),colour = Grade > mean(Grade,na.rm = T)),
hjust=-0.1,
vjust=-0.5,
size = 9/.pt,
check_overlap = T) +
scale_color_manual(values=c(red,blue)) +
theme(text=element_text(size=12),
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = blue)) +
labs(title="Writter with the best average according to their frequency",
subtitle="",
y="Frequency", x="Grade")
Writter_graph
library(plotly)
p <- ggplot(IMDB, aes(Episode, Season, fill= Grade)) +
geom_tile(color = "white") +
geom_text(aes(label = Grade), color = "white", size = 2.5)+
scale_fill_gradient(low=blue, high=purple)+
theme(text=element_text(size=12),
legend.position = "right",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = blue)) +
labs(title="Writter with the best average according to their frequency",
subtitle="")
ggplotly(p)
library(plotly)
Episodes <- c("Friends", unique(IMDB$Season_label), IMDB$Summary_tree)
Seasons <- c("", rep("Friends",times=10), IMDB$Season_label)
fig <- plot_ly(
type="treemap",
labels=Episodes,
parents=Seasons,
maxdepth=4,
textposition= "middle center",
text=list(size=10),
marker = list(colorscale = "Reds"))
fig
library(reshape2)
friends_tokens <- Friends %>%
mutate(text=as.character(Friends$Text)) %>%
unnest_tokens(word, text)
#wordcloud for positive and negative sentiments
wordcloud_pos_neg<- friends_tokens %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort=TRUE) %>%
acast(word ~ sentiment, value.var="n", fill=0) %>%
comparison.cloud(colors=c(red, yellow), max.words=200)
wordcloud_pos_neg
## NULL
library(tidytext)
library(textdata)
nrc <- get_sentiments(lexicon ="nrc")
fsentiments <- friends_tokens %>%
inner_join(nrc, "word") %>%
count(word, sentiment, sort=TRUE)
# Frequency of each sentiment
ggplot(data=fsentiments, aes(x=reorder(sentiment, -n, sum), y=n)) +
geom_bar(stat="identity", aes(fill=sentiment, color=sentiment), show.legend=FALSE) +
scale_fill_manual(values=colfunc(10))+
scale_color_manual(values=colfunc(10))+
labs(x="Sentiment", y="Frequency")+
theme(axis.text.x = element_text(angle=45, hjust=1))
fsentiments %>%
group_by(sentiment) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ggplot(aes(x=reorder(word, n), y=n)) +
geom_col(aes(fill=sentiment), show.legend=FALSE) +
facet_wrap(~sentiment, scales="free_y") +
theme(legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank())+
labs(y="Frequency", x="Terms") +
coord_flip()
friends_tokens %>%
filter(Character %in% c("MONICA","RACHEL","PHOEBE","ROSS","CHANDLER","JOEY")) %>%
inner_join(nrc, "word") %>%
count(Character, sentiment, sort=TRUE) %>%
ggplot(aes(x=sentiment, y=n)) +
geom_col(aes(fill=sentiment), show.legend=FALSE) +
facet_wrap(~Character, scales="free_x") +
theme(text=element_text(size=12),
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = blue))+
labs(x="Sentiment", y="Frequency") +
coord_flip()
TEST<- friends_tokens %>%
filter(Character %in% c("MONICA","RACHEL","PHOEBE","ROSS","CHANDLER","JOEY")) %>%
inner_join(nrc, "word") %>%
select(Season, Episode, sentiment, Words) %>%
group_by(Season, Episode, sentiment) %>%
summarise(Words = sum(Words)) %>%
ungroup() %>%
filter(sentiment != "positive",sentiment != "negative")
TEST <- TEST %>%
group_by(Season, Episode) %>%
filter(Words == max(Words))
ggplot(TEST, aes(x = Episode, y = sentiment, color = Season, group = Season))+
geom_point()+
geom_line()
TEST<- friends_tokens %>%
filter(Character %in% c("MONICA","RACHEL","PHOEBE","ROSS","CHANDLER","JOEY")) %>%
inner_join(nrc, "word")
TEST %>%
group_by(Episode) %>%
count(sentiment) %>%
arrange(desc(n)) %>%
ggplot( aes(x=Episode, y=n, group=sentiment, color=sentiment)) +
geom_line() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("sentiment over episode by season")
Scenes <- Desc[str_detect(Desc$Text, pattern = "Scene"),] %>%
separate(col=Text, into = c("Scene","Description"), sep = "Scene") %>%
select(-Scene, - ID, -Desc) %>%
mutate(Description = Description %>% str_remove(": ")) %>%
separate(col=Description, into = c("Scene","Description"), sep = ",|]|:|;|\\.") %>%
select(-Description) %>%
filter(Scene != "") %>%
mutate(Scene = Scene %>% str_remove("inside|outside|Inside|Outside"),
Scene = str_trim(Scene),
Scene = toupper(Scene))
# Scenes$Scene %>% table() %>% as.data.frame() %>% arrange(desc(Freq))
#
# str_trim(Scenes$Scene)
Scenes$Scene[(Scenes$Scene %>% str_detect("ERK"))] %>% table()
## .
## 13CENTRAL PERK
## 1
## CENTAL PERK
## 1
## CENTRAL PERK
## 465
## CENTRAL PERK - CHANDLER
## 1
## OF CENTRAL PERK
## 1
## PHOEBE AND RACHEL SITTING ON THE COUCH IN CENTRAL PERK
## 1
## PHOEBE IS AT CENTRAL PERK
## 1
## ROSS IS IN CENTRAL PERK
## 1
## THE STREET CENTRAL PERK
## 2
## THE STREET DOWN THE BLOCK FROM CENTRAL PERK
## 1
## THE STREET IN FRONT OF CENTRAL PERK
## 4
## THE STREET RIGHT IN FRONT OF CENTRAL PERK
## 1
#
# Scenes <- Scenes %>%
# separate(col=Text, into = c("Scene","Description"), sep = ":") %>%
# separate(col=Description, into=c('Location', 'Description'), sep=c(',',';',']'))
text_count_locations <- Scenes %>%
count(Scene) %>%
arrange(desc(n)) %>%
mutate(Percentage=n/sum(n)*100) %>%
head(10) %>%
mutate(Scene = fct_reorder(Scene,Percentage))
#Focus sur les meileurs et pires épisodes pour comprendre ce qui constitue un bon scenario
IMDB_Category <- IMDB %>%
select(Season_label, Episode_label, Season, Episode, viewers, Grade) %>%
filter(!is.na(Grade)) %>%
mutate(Key = Season*100+Episode,
Category = ifelse(
Grade>mean(Grade,na.rm = T) & viewers>mean(viewers,na.rm = T),
Category <- "Stars",
ifelse(
Grade<mean(Grade,na.rm = T) & viewers>mean(viewers,na.rm = T),
Category <- "High Rentability but Low Quality",
ifelse(
Grade>mean(Grade,na.rm = T) & viewers<mean(viewers,na.rm = T),
Category <- "Low Rentability but High Quality",
Category <- "Bad Episodes"))))
ggplot(IMDB_Category, aes(x = Grade, y = viewers))+
geom_point(aes(col = Category))+
scale_color_manual(values=colfunc(4))+
geom_vline(xintercept = mean(IMDB_Category$Grade,na.rm = T))+
geom_hline(yintercept = mean(IMDB_Category$viewers,na.rm = T))+
geom_label(label = "Bad Episodes", hjust = 1, alpha = 5,
aes(y = viewers %>% quantile(.25) %>% as.numeric(),
x = Grade %>% quantile(.25) %>% as.numeric()))+
geom_label(label = "High Rentability\nbut Low Quality", hjust = 1, vjust = -.01,
aes(y = viewers %>% quantile(.75) %>% as.numeric(),
x = Grade %>% quantile(.25) %>% as.numeric()))+
geom_label(label = "Low Rentability\nbut High Quality", hjust = 0, vjust = .6,
aes(y = viewers %>% quantile(.25) %>% as.numeric(),
x = Grade %>% quantile(.75) %>% as.numeric()))+
geom_label(label = "Stars", hjust = 0,
aes(y = viewers %>% quantile(.75) %>% as.numeric(),
x = Grade %>% quantile(.75) %>% as.numeric()))+
labs(title="Profitability, quality and views: Segmentation of Friends episodes",
subtitle = "The aim is that the most appreciated episodes are the most seen",
y="Viewers (million)",x="Grade", color="Category",
caption="Source : IMDB")+
theme_minimal()+
theme(legend.position = "none",
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
plot.caption = element_text(family = font, size = 10, color = purple),
axis.line = element_blank(),
axis.ticks = element_blank())
Category_filter <- (IMDB_Category$Category %>% unique())[c(1,2,4)]
BW_Analysis <- IMDB_Category %>%
filter(Category %in% Category_filter) %>%
merge(IMDB) %>%
select(Key, Grade, viewers, Category) %>%
merge(Friends) %>%
group_by(Key) %>%
mutate(Category = Category) %>%
summarise(Nb_Character = n_distinct(Character),
Nb_Scene = n_distinct(Scene),
Grade = mean(Grade),
viewers = mean(viewers),
Category = Category) %>%
unique() %>%
filter(Nb_Scene > 5,Nb_Scene < 90)
ggplot(BW_Analysis, aes(x = Nb_Scene, y = Nb_Character, col = Category))+
geom_point()+
scale_color_manual(values=colfunc(BW_Analysis$Category %>% unique() %>% length()))+
geom_smooth(aes(group=Category), method = lm, se = FALSE)+
labs(title="Profitability, quality and views: Segmentation of Friends episodes",
subtitle = "The aim is that the most appreciated episodes are the most seen",
y="Number of Character",x="Number of Scene", color="Category",
caption="Source : IMDB")+
theme(text= element_text(family = font),
legend.background = element_blank(),
legend.position = "right",
panel.background = element_blank(),
legend.text = element_text(size = 8),
axis.line = element_line(),
axis.text.y = element_text(angle = 0),
plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
plot.caption = element_text(family = font, size = 10, color = purple))
IMDB_Category %>%
select(Season,Category) %>%
table() %>%
as.data.frame.matrix()
## Bad Episodes High Rentability but Low Quality
## 1 11 6
## 2 1 12
## 3 8 9
## 4 8 2
## 5 9 0
## 6 10 0
## 7 12 2
## 8 8 5
## 9 10 5
## 10 4 1
## Low Rentability but High Quality Stars
## 1 2 5
## 2 0 11
## 3 2 6
## 4 7 7
## 5 5 10
## 6 12 3
## 7 8 2
## 8 1 10
## 9 3 6
## 10 9 3
ggplot(IMDB_Category %>% filter(Category == "Stars"|Category == "Bad Episodes"), aes(x = Season))+
geom_bar(aes(fill = Category), position = "dodge")+
coord_polar(theta = "y")+
scale_fill_manual(values=c(blue, purple))+
ylim(c(0,20))+
xlim(c(-2,11))+
geom_text(hjust = 1.1, size = 3, family = font, col = purple,
aes(y = 0, label = paste("Season",Season)))+
labs(title="Evaluation of the seasons according to the best and worst episodes",
subtitle = "The last seasons have some excellent episodes unlike the first ones which are mainly bad",
y=NULL,x=NULL, fill="Category",
caption="Source : IMDB")+
theme_minimal() +
theme(legend.position = "right",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.title = element_text(family = font, size = 16, face = "bold", color = purple),
plot.caption = element_text(family = font, size = 10, color = purple),
axis.line = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_blank(),
axis.ticks = element_blank())
# library(odbc)
# library(DBI)
# conn = dbConnect(drv = odbc::odbc(),
# Driver = "ODBC Driver 13 for SQL Server",
# server = "tcp:ibrahimmeddiouiserver.database.windows.net",
# database = "ibrahimmeddiouidatabase",
# uid = "ibrahimmeddioui",
# pwd = "181SPPum")
#
# Friends <- read.csv(file = "Friends.csv")
# IMDB <- read.csv(file = "IMDB.csv")
# Desc <- read.csv(file = "Desc.csv")
# Prod <- read.csv(file = "Prod.csv")
#
# dbWriteTable(conn = conn, name = "Friends", value = Friends, overwrite = T)
# dbWriteTable(conn = conn, name = "IMDB", value = IMDB, overwrite = T)
# dbWriteTable(conn = conn, name = "Desc", value = Desc, overwrite = T)
# dbWriteTable(conn = conn, name = "Prod", value = Prod, overwrite = T)
#
# # This is for test only
# Friends_sql <- dbReadTable(conn = conn, name = 'Friends')
vline <- function(x = 0, color = "black") {
list(
type = "line",
y0 = 0,
y1 = 1,
yref = "paper",
x0 = x,
x1 = x,
line = list(color = color, dash="dot")
)
}
IMDB_writers <- IMDB[!grepl('Story by', IMDB$`Written by`),]
IMDB_writers %>% select(`Written by`, Episode, Season_label) %>%
group_by(`Written by`)%>%
arrange(`Written by`)
## # A tibble: 186 x 3
## # Groups: Written by [55]
## `Written by` Episode Season_label
## <chr> <dbl> <chr>
## 1 Adam Chase 17 Season 2
## 2 Adam Chase 8 Season 3
## 3 Adam Chase 19 Season 3
## 4 Adam Chase 7 Season 4
## 5 Adam Chase 15 Season 5
## 6 Adam Chase 1 Season 6
## 7 Adam Chase & Ira Ungerleider 6 Season 1
## 8 Adam Chase & Ira Ungerleider 10 Season 1
## 9 Adam Chase & Ira Ungerleider 21 Season 1
## 10 Adam Chase & Ira Ungerleider 2 Season 2
## # ... with 176 more rows
writers_grade <- IMDB_writers %>% select(`Written by`, Grade) %>%
group_by(`Written by`) %>%
summarize(Count=n(), Grade=round(mean(Grade, na.rm=T), 2)) %>%
filter(Grade>=5) %>%
arrange(desc(Count))
writers_grade %>% plot_ly() %>%
add_markers(x=~Grade, y=~Count ,size=~Count, hovertemplate= paste( writers_grade$`Written by`,"%{y} Episodes:%{x} <extra></extra>"), color = (writers_grade$Grade<median(writers_grade$Grade)), colors=colfunc2(2) ) %>%
layout(title= "Grades of the episodes per writer", xaxis = list(title = "Grade", showline= T, linewidth=2, linecolor='black'),
yaxis = list(title = "Frequency", showline= T, linewidth=2, linecolor='black'), shapes= list(vline(median(writers_grade$Grade))), showlegend=F)