# Load the dialogue lines and the per-episode information from the working
# directory. TRUE/FALSE are spelled out because T/F are ordinary variables that
# can be reassigned.
friends <- read.csv(file = "friends.csv", stringsAsFactors = FALSE)
friends_info <- read.csv(file = "friends_info.csv")

# Speakers treated as the main cast; everyone else is handled as a guest.
main <- c(
  "Rachel Green", "Ross Geller", "Monica Geller",
  "Joey Tribbiani", "Phoebe Buffay", "Chandler Bing"
)
# Number of lines spoken by each main character, largest count first.
# Columns: speaker, n.
mainlines <- friends %>%
  filter(speaker %in% main) %>%
  count(speaker, sort = TRUE)

# Horizontal bar chart of the line counts, bars ordered by count.
mainlines %>%
  ggplot(aes(reorder(speaker, n), n)) +
  coord_flip() +
  geom_bar(
    stat = "identity",
    colour = "black",
    fill = rainbow(n = 6, alpha = 0.9, start = 0.52, end = 0.56)
  ) +
  labs(
    title = "Main Characters",
    subtitle = "Who had the most lines?",
    x = "",
    y = "# of Lines"
  ) +
  # accuracy = 1 prints the exact count. A bare positional `2` is matched to
  # `accuracy` by scales::comma() and rounds labels to the nearest multiple of 2.
  geom_text(
    aes(label = scales::comma(n, accuracy = 1), y = n),
    position = position_stack(vjust = 0.5),
    size = 4
  ) +
  theme_economist() +
  theme(
    axis.title = element_text(size = 8),
    axis.text = element_text(size = 8, face = "bold"),
    axis.text.y = element_text(size = 10),
    axis.line = element_line(size = 0.4, colour = "grey10"),
    plot.caption = element_text(color = "gray25", face = "italic", size = 14),
    axis.text.x = element_text(size = 10)
  )

# Number of lines per non-main speaker, largest count first. Placeholder
# speakers ("NA", "#ALL#", "Scene Directions") and missing speakers are dropped.
# Columns: speaker, n.
guestlines <- friends %>%
  filter(
    !speaker %in% main,
    !speaker %in% c("NA", "#ALL#", "Scene Directions"),
    !is.na(speaker)
  ) %>%
  count(speaker, sort = TRUE)

# Horizontal bar chart of the ten guests with the most lines.
guestlines %>%
  slice(1:10) %>%
  ggplot(aes(reorder(speaker, n), n)) +
  coord_flip() +
  geom_bar(
    stat = "identity",
    colour = "black",
    fill = rainbow(n = 10, alpha = 0.9, start = 0.06, end = 0.14)
  ) +
  labs(
    title = "Guest Stars",
    subtitle = "Who had the most lines?",
    x = "",
    y = "# of Lines"
  ) +
  # accuracy = 1 prints the exact count. A bare positional `2` is matched to
  # `accuracy` by scales::comma() and rounds labels to the nearest multiple of 2.
  geom_text(
    aes(label = scales::comma(n, accuracy = 1), y = n),
    position = position_stack(vjust = 0.5),
    size = 4
  ) +
  theme_economist() +
  theme(
    axis.title = element_text(size = 8),
    axis.text = element_text(size = 8, face = "bold"),
    axis.text.y = element_text(size = 10),
    axis.line = element_line(size = 0.4, colour = "grey10"),
    plot.caption = element_text(color = "gray25", face = "italic", size = 14),
    axis.text.x = element_text(size = 10)
  )

# Mean US viewership over every row of friends_info, drawn below as a dashed
# reference line. na.rm = TRUE keeps one missing value from turning the mean
# into NA and silently removing the line.
avgViews <- mean(friends_info$us_views_millions, na.rm = TRUE)

# Line chart of US viewers by air date, one coloured line per season (1-10).
friends_info %>%
  filter(season %in% 1:10) %>%
  ggplot(aes(
    x = as.Date(air_date),
    y = us_views_millions,
    group = factor(season),
    color = factor(season)
  )) +
  geom_line(size = 0.8) +
  labs(
    title = "US Viewers Over Time",
    subtitle = "How many views did the show have?",
    x = "Air Date",
    y = "# of Views (m)"
  ) +
  theme_economist() +
  guides(col = guide_legend(nrow = 1, title = "Season", title.theme = element_text(size = 8))) +
  theme(
    axis.text = element_text(size = 8, face = "bold"),
    axis.line.y = element_line(size = 0.4, colour = "black"),
    panel.grid.major.y = element_blank(),
    axis.title = element_text(size = 8),
    legend.text = element_text(size = 8),
    legend.position = "bottom"
  ) +
  geom_hline(yintercept = avgViews, linetype = "longdash", size = 0.4) +
  annotate(
    geom = "curve",
    x = as.Date("1996-09-08"), y = 52,
    xend = as.Date("1996-01-28"), yend = 52,
    curvature = .3, arrow = arrow(length = unit(2, "mm"))
  ) +
  annotate(
    geom = "text",
    x = as.Date("1996-09-08"), y = 52,
    label = "The One After the Superbowl", hjust = "left"
  ) +
  annotate(
    geom = "curve",
    x = as.Date("1994-04-01"), y = 33.15,
    xend = as.Date("1994-04-01"), yend = 25.5,
    curvature = .3, arrow = arrow(length = unit(2, "mm"))
  ) +
  annotate(
    geom = "curve",
    x = as.Date("2003-12-06"), y = 50,
    xend = as.Date("2004-04-18"), yend = 52,
    curvature = .3, arrow = arrow(length = unit(2, "mm"))
  ) +
  # annotate() draws this label once. geom_label() with constant aesthetics
  # inherits the plot data and stacks one identical label per row.
  # NOTE(review): the "25m avg" text is hard-coded; confirm it matches avgViews.
  annotate(
    geom = "label",
    x = as.Date("1994-04-01"), y = 35,
    label = "25m avg", hjust = "left",
    fontface = "italic", colour = "black"
  ) +
  annotate(
    geom = "text",
    x = as.Date("2003-12-06"), y = 50,
    label = "The Last One", hjust = "right"
  )

# Jittered scatter of IMDB rating against US viewership, one filled point per
# row of friends_info, filled by season. Two episodes are called out with a
# text label and a curved arrow each.
ggplot(
  friends_info,
  aes(
    x = imdb_rating,
    y = us_views_millions,
    group = factor(season),
    colour = factor(season)
  )
) +
  geom_point(
    aes(fill = factor(season)),
    position = position_jitter(),
    colour = "black",
    shape = 21,
    size = 3.5
  ) +
  # Call-outs, drawn on top of the points in this order.
  annotate("text", x = 9.5, y = 49, label = "The Last One", hjust = "right") +
  annotate(
    "curve",
    x = 9.52, y = 49, xend = 9.7, yend = 51.94,
    curvature = .3, arrow = arrow(length = unit(2, "mm"))
  ) +
  annotate("text", x = 9.5, y = 41, label = "The One Where Everybody Finds Out", hjust = "right") +
  annotate(
    "curve",
    x = 9.52, y = 41, xend = 9.69, yend = 28.22,
    curvature = -0.3, arrow = arrow(length = unit(2, "mm"))
  ) +
  # Titles, legend layout and theme.
  labs(
    title = "Ratings vs. Views",
    subtitle = "Do higher rated episodes have higher views?",
    x = "IMDB Rating",
    y = "# of Views (m)"
  ) +
  guides(fill = guide_legend(nrow = 1, title = "Season", title.theme = element_text(size = 8))) +
  theme_economist() +
  theme(
    axis.text = element_text(size = 8, face = "bold"),
    axis.line.y = element_line(size = 0.4, colour = "black"),
    panel.grid.major.y = element_blank(),
    axis.title = element_text(size = 8),
    legend.text = element_text(size = 8),
    legend.position = "bottom"
  )

# Dialogue restricted to the main characters.
friendsLines <- data.frame(friends) %>%
  filter(speaker %in% main)

# Sentiment Analysis
# One row per word of dialogue: `text` is split into lower-cased word tokens in
# a new `word` column, and the original `text` column is kept (drop = FALSE).
# as_tibble() replaces the deprecated data_frame().
tokens <- as_tibble(friendsLines) %>%
  unnest_tokens(
    output = word,
    input = text,
    format = "text",
    token = "words",
    drop = FALSE,
    to_lower = TRUE
  )

# Positive and negative word counts, and their difference, per season and speaker.
tokens %>%
  inner_join(get_sentiments("bing")) %>% # pull out only sentiment words
  group_by(season, speaker, sentiment) %>%
  count(sentiment) %>% # count the # of positive & negative words
  spread(sentiment, n, fill = 0) %>% # make data wide rather than narrow
  mutate(sentiment = positive - negative) # # of positive words - # of negative words
## Joining, by = "word"
## # A tibble: 60 x 5
## # Groups: season, speaker [60]
## season speaker negative positive sentiment
## <int> <chr> <dbl> <dbl> <dbl>
## 1 1 Chandler Bing 212 363 151
## 2 1 Joey Tribbiani 130 268 138
## 3 1 Monica Geller 154 308 154
## 4 1 Phoebe Buffay 170 297 127
## 5 1 Rachel Green 176 386 210
## 6 1 Ross Geller 225 379 154
## 7 2 Chandler Bing 233 338 105
## 8 2 Joey Tribbiani 135 303 168
## 9 2 Monica Geller 187 334 147
## 10 2 Phoebe Buffay 196 382 186
## # ... with 50 more rows
#' Net "bing" sentiment per season and speaker
#'
#' Splits the dialogue into lower-cased words, keeps the words that appear in
#' the "bing" lexicon, counts them per season, speaker and sentiment, and
#' subtracts the negative count from the positive count.
#'
#' @param file Unused. Kept (now with a default) so existing calls still work.
#' @param script_lines Data frame of dialogue with `season`, `speaker` and
#'   `text` columns. Defaults to the global `friendsLines`.
#' @param speakers Character vector of speakers to keep. Defaults to the global
#'   `main`.
#' @return A tibble with one row per season/speaker holding the per-sentiment
#'   word counts (`negative`, `positive`) and `sentiment`, their difference.
GetSentiment <- function(file = NULL, script_lines = friendsLines, speakers = main) {
  # as_tibble() replaces the deprecated data_frame().
  word_tokens <- as_tibble(script_lines) %>%
    filter(speaker %in% speakers) %>%
    unnest_tokens(
      output = word,
      input = text,
      format = "text",
      token = "words",
      drop = FALSE,
      to_lower = TRUE
    )

  word_tokens %>%
    inner_join(get_sentiments("bing")) %>% # pull out only sentiment words
    group_by(season, speaker, sentiment) %>%
    count(sentiment) %>% # count the # of positive & negative words
    spread(sentiment, n, fill = 0) %>% # make data wide rather than narrow
    mutate(sentiment = positive - negative) # # of positive words - # of negative words
}
GetSentiment()
## Joining, by = "word"
## # A tibble: 60 x 5
## # Groups: season, speaker [60]
## season speaker negative positive sentiment
## <int> <chr> <dbl> <dbl> <dbl>
## 1 1 Chandler Bing 212 363 151
## 2 1 Joey Tribbiani 130 268 138
## 3 1 Monica Geller 154 308 154
## 4 1 Phoebe Buffay 170 297 127
## 5 1 Rachel Green 176 386 210
## 6 1 Ross Geller 225 379 154
## 7 2 Chandler Bing 233 338 105
## 8 2 Joey Tribbiani 135 303 168
## 9 2 Monica Geller 187 334 147
## 10 2 Phoebe Buffay 196 382 186
## # ... with 50 more rows
# Start from an empty tibble and append the per-season/speaker sentiment table.
# tibble() replaces the deprecated data_frame().
sentiment <- tibble()
sentiments <- rbind(sentiment, GetSentiment())
## Joining, by = "word"
# Box plot of the per-season net sentiment, one box per main character.
ggplot(sentiments, aes(x = speaker, y = sentiment, color = speaker)) +
  geom_boxplot(
    colour = "black",
    fill = rainbow(n = 6, alpha = 0.9, start = 0.42, end = 0.46)
  ) +
  labs(
    title = "Character Sentiments",
    subtitle = "Who had the most positive sentiments?",
    y = "# of Sentiments"
  ) +
  theme_economist() +
  theme(
    axis.text = element_text(size = 8),
    axis.line.y = element_line(size = 0.4, colour = "black"),
    panel.grid.major.y = element_blank(),
    axis.title = element_text(size = 8),
    legend.position = "none",
    axis.title.x = element_blank()
  )

# Step 1: lines spoken by each main character within each scene.
linesperscene <-
  friends %>%
  filter(speaker %in% main) %>%
  group_by(season, episode, scene, speaker) %>%
  count(speaker)

# Step 2: row count per speaker/season/episode/scene combination from step 1.
scenesperCharacter <-
  linesperscene %>%
  group_by(speaker, season, episode, scene) %>%
  count(scene, name = "sceneCount")

# Step 3: add the per-scene counts up to one total per speaker.
sceneSum <-
  scenesperCharacter %>%
  group_by(speaker) %>%
  summarise(sceneCount = sum(sceneCount))
## `summarise()` ungrouping output (override with `.groups` argument)

# Step 4: attach each speaker's total line count (n) and sort by it, descending.
plot_all <-
  sceneSum %>%
  left_join(mainlines, by = "speaker") %>%
  arrange(desc(n))
plot_all
## # A tibble: 6 x 3
## speaker sceneCount n
## <chr> <int> <int>
## 1 Rachel Green 1461 9312
## 2 Ross Geller 1416 9157
## 3 Chandler Bing 1508 8465
## 4 Monica Geller 1440 8441
## 5 Joey Tribbiani 1451 8171
## 6 Phoebe Buffay 1341 7501
# Scatter of scene count against line count, one labelled point per main
# character. Labels come from the `speaker` column rather than hand-typed row
# names: setting row names on a tibble is deprecated, and a hard-coded name
# vector mislabels points whenever the row order changes.
ggplot(plot_all, aes(x = sceneCount, y = n)) +
  geom_point(
    aes(fill = factor(speaker)),
    colour = "black", shape = 21, size = 3.5
  ) +
  geom_text_repel(aes(label = speaker), size = 3.5) +
  labs(
    title = "Main Characters",
    subtitle = "Does having more scenes lead to more lines?",
    x = "# of Scenes",
    y = "# of Lines"
  ) +
  theme_economist() +
  theme(
    axis.text = element_text(size = 8),
    axis.line.y = element_line(size = 0.4, colour = "black"),
    panel.grid.major.y = element_blank(),
    axis.title = element_text(size = 8),
    legend.position = "none"
  )

# Step 1: lines per non-main speaker within each episode. Main characters,
# placeholder/generic speaker labels and missing speakers are excluded.
episodecount <-
  friends %>%
  filter(
    !speaker %in% c(
      main,
      "NA", "#ALL#", "Scene Directions", "Both", "Woman", "Man", "Guy",
      "Nurse", "Waiter", "Ben Geller"
    ),
    !is.na(speaker)
  ) %>%
  group_by(season, episode, speaker) %>%
  count(speaker)

# Step 2: row count per speaker/season/episode combination from step 1.
episodesperCharacter <-
  episodecount %>%
  group_by(speaker, season, episode) %>%
  count(episode, name = "episodeCount")

# Step 3: add the per-episode counts up to one total per speaker.
episodeSum <-
  episodesperCharacter %>%
  group_by(speaker) %>%
  summarise(episodeCount = sum(episodeCount))
## `summarise()` ungrouping output (override with `.groups` argument)

# Step 4: the eleven speakers with the highest episode totals.
topepisode <-
  episodeSum %>%
  arrange(desc(episodeCount)) %>%
  slice(1:11)
topepisode
## # A tibble: 11 x 2
## speaker episodeCount
## <chr> <int>
## 1 Gunther 52
## 2 Judy Geller 20
## 3 Janice Litman Goralnik 19
## 4 Mike Hannigan 17
## 5 Carol Willick 16
## 6 Jack Geller 16
## 7 Emily Waltham 13
## 8 Richard Burke 12
## 9 Susan Bunch 12
## 10 Frank Buffay Jr. 10
## 11 Charlie Wheeler 9
# Attach each top guest's total line count (n from guestlines) to their episode total.
plot_guest <-
  topepisode %>%
  left_join(guestlines, by = "speaker")
plot_guest
## # A tibble: 11 x 3
## speaker episodeCount n
## <chr> <int> <int>
## 1 Gunther 52 131
## 2 Judy Geller 20 180
## 3 Janice Litman Goralnik 19 216
## 4 Mike Hannigan 17 330
## 5 Carol Willick 16 193
## 6 Jack Geller 16 150
## 7 Emily Waltham 13 174
## 8 Richard Burke 12 281
## 9 Susan Bunch 12 104
## 10 Frank Buffay Jr. 10 179
## 11 Charlie Wheeler 9 189
# Scatter of episode count against line count, one labelled point per guest.
# Labels come from the `speaker` column rather than hand-typed row names:
# setting row names on a tibble is deprecated, and a hard-coded name vector
# mislabels points whenever the row order changes.
ggplot(plot_guest, aes(x = episodeCount, y = n)) +
  geom_point(
    aes(fill = factor(speaker)),
    colour = "black", shape = 21, size = 3.5
  ) +
  geom_text_repel(aes(label = speaker), size = 3.5) +
  labs(
    title = "Guest Stars",
    subtitle = "Does being in more episodes lead to more lines?",
    # The x aesthetic is episodeCount, so the axis is labelled in episodes.
    x = "# of Episodes",
    y = "# of Lines"
  ) +
  theme_economist() +
  theme(
    axis.text = element_text(size = 8),
    axis.line.y = element_line(size = 0.4, colour = "black"),
    panel.grid.major.y = element_blank(),
    axis.title = element_text(size = 8),
    legend.position = "none"
  )
