# Read friends.csv, keeping text columns as character vectors. `FALSE` is
# spelled out because the shorthand `F` is an ordinary variable that can be
# reassigned.
friends <- read.csv(file = "friends.csv", stringsAsFactors = FALSE)
# Read friends_info.csv with read.csv's default settings.
friends_info <- read.csv(file = "friends_info.csv")

# Names matched against the `speaker` column to pick out the six leads.
main <- c(
  "Rachel Green", "Ross Geller", "Monica Geller",
  "Joey Tribbiani", "Phoebe Buffay", "Chandler Bing"
)
# Number of rows in `friends` per lead, largest count first.
mainlines <- count(filter(friends, speaker %in% main), speaker, sort = TRUE)

# Horizontal bar chart of the line count per main character, bars ordered by
# count, with the count printed in the middle of each bar.
mainlines %>%
  ggplot(aes(reorder(speaker, n), n)) +
  coord_flip() +
  # Six fixed fill colours, one per bar (assumes `mainlines` has six rows).
  geom_bar(
    stat = "identity",
    colour = "black",
    fill = rainbow(n = 6, alpha = 0.9, start = 0.52, end = 0.56)
  ) +
  labs(
    title = "Main Characters",
    subtitle = "Who had the most lines?",
    x = "",
    y = "# of Lines"
  ) +
  # The second argument of scales::comma() is `accuracy` (the number to round
  # to). Passing 2 rounded every label to an even number, so odd counts were
  # printed wrong; accuracy = 1 prints the exact count.
  geom_text(
    aes(label = scales::comma(n, accuracy = 1), y = n),
    position = position_stack(vjust = 0.5),
    size = 4
  ) +
  theme_economist() +
  theme(
    axis.title = element_text(size = 8),
    axis.text = element_text(size = 8, face = "bold"),
    axis.text.y = element_text(size = 10),
    axis.line = element_line(size = 0.4, colour = "grey10"),
    plot.caption = element_text(color = "gray25", face = "italic", size = 14),
    axis.text.x = element_text(size = 10)
  )

# Line counts for every speaker outside `main`, largest first. Rows with a
# missing speaker and the placeholder speaker labels are dropped.
guestlines <- count(
  filter(
    friends,
    !speaker %in% main,
    !speaker %in% c("NA", "#ALL#", "Scene Directions"),
    !is.na(speaker)
  ),
  speaker,
  sort = TRUE
)

# Horizontal bar chart of the ten guest speakers with the most lines, with the
# count printed in the middle of each bar.
guestlines %>%
  slice(1:10) %>%
  ggplot(aes(reorder(speaker, n), n)) +
  coord_flip() +
  # Ten fixed fill colours, one per bar (assumes at least ten guest speakers).
  geom_bar(
    stat = "identity",
    colour = "black",
    fill = rainbow(n = 10, alpha = 0.9, start = 0.06, end = 0.14)
  ) +
  labs(
    title = "Guest Stars",
    subtitle = "Who had the most lines?",
    x = "",
    y = "# of Lines"
  ) +
  # The second argument of scales::comma() is `accuracy` (the number to round
  # to). Passing 2 rounded every label to an even number, so odd counts were
  # printed wrong; accuracy = 1 prints the exact count.
  geom_text(
    aes(label = scales::comma(n, accuracy = 1), y = n),
    position = position_stack(vjust = 0.5),
    size = 4
  ) +
  theme_economist() +
  theme(
    axis.title = element_text(size = 8),
    axis.text = element_text(size = 8, face = "bold"),
    axis.text.y = element_text(size = 10),
    axis.line = element_line(size = 0.4, colour = "grey10"),
    plot.caption = element_text(color = "gray25", face = "italic", size = 14),
    axis.text.x = element_text(size = 10)
  )

# Mean of the us_views_millions column; drawn below as a dashed reference line.
avgViews <- mean(friends_info$us_views_millions)

# Line chart of US viewers against air date, one coloured line per season,
# with the overall mean and hand-placed episode call-outs.
friends_info %>%
  filter(season %in% 1:10) %>%
  ggplot(aes(
    x = as.Date(air_date),
    y = us_views_millions,
    group = factor(season),
    color = factor(season)
  )) +
  geom_line(size = 0.8) +
  labs(
    title = "US Viewers Over Time",
    subtitle = "How many views did the show have?",
    x = "Air Date",
    y = "# of Views (m)"
  ) +
  theme_economist() +
  guides(col = guide_legend(nrow = 1, title = "Season", title.theme = element_text(size = 8))) +
  theme(
    axis.text = element_text(size = 8, face = "bold"),
    axis.line.y = element_line(size = 0.4, colour = "black"),
    panel.grid.major.y = element_blank(),
    axis.title = element_text(size = 8),
    legend.text = element_text(size = 8),
    legend.position = "bottom"
  ) +
  geom_hline(yintercept = avgViews, linetype = "longdash", size = 0.4) +
  # Call-out for "The One After the Superbowl": arrow first, then its text.
  annotate(
    geom = "curve",
    x = as.Date("1996-09-08"), y = 52,
    xend = as.Date("1996-01-28"), yend = 52,
    curvature = .3, arrow = arrow(length = unit(2, "mm"))
  ) +
  annotate(
    geom = "text",
    x = as.Date("1996-09-08"), y = 52,
    label = "The One After the Superbowl", hjust = "left"
  ) +
  # Arrow running down from below the "25m avg" label; presumably it points
  # at the dashed mean line -- confirm against the rendered plot.
  annotate(
    geom = "curve",
    x = as.Date("1994-04-01"), y = 33.15,
    xend = as.Date("1994-04-01"), yend = 25.5,
    curvature = .3, arrow = arrow(length = unit(2, "mm"))
  ) +
  # Arrow for the "The Last One" call-out.
  annotate(
    geom = "curve",
    x = as.Date("2003-12-06"), y = 50,
    xend = as.Date("2004-04-18"), yend = 52,
    curvature = .3, arrow = arrow(length = unit(2, "mm"))
  ) +
  # annotate() draws this label exactly once. A geom_label() layer with
  # constant x/y/label inherits the plot data and stacks one identical label
  # per data row on the same spot.
  annotate(
    geom = "label",
    x = as.Date("1994-04-01"), y = 35,
    label = "25m avg", hjust = "left",
    fontface = "italic", colour = "black"
  ) +
  annotate(
    geom = "text",
    x = as.Date("2003-12-06"), y = 50,
    label = "The Last One", hjust = "right"
  )

# Scatter plot of IMDB rating against US viewers, one point per row of
# friends_info, filled by season, with two hand-placed episode call-outs.
ggplot(
  friends_info,
  aes(x = imdb_rating, y = us_views_millions, group = factor(season), colour = factor(season))
) +
  # The jitter is seeded so every render places the points identically; the
  # call-out arrows below end at fixed coordinates, and unseeded jitter moved
  # the points relative to them differently on each run.
  geom_point(
    position = position_jitter(seed = 1),
    colour = "black", shape = 21, size = 3.5,
    aes(fill = factor(season))
  ) +
  labs(
    title = "Ratings vs. Views",
    subtitle = "Do higher rated episodes have higher views?",
    x = "IMDB Rating",
    y = "# of Views (m)"
  ) +
  theme_economist() +
  theme(
    axis.text = element_text(size = 8, face = "bold"),
    axis.line.y = element_line(size = 0.4, colour = "black"),
    panel.grid.major.y = element_blank(),
    axis.title = element_text(size = 8),
    legend.text = element_text(size = 8),
    legend.position = "bottom"
  ) +
  guides(fill = guide_legend(nrow = 1, title = "Season", title.theme = element_text(size = 8))) +
  # Call-out text followed by its arrow, for each of the two episodes.
  annotate(geom = "text", x = 9.5, y = 49, label = "The Last One", hjust = "right") +
  annotate(
    geom = "curve",
    x = 9.52, y = 49, xend = 9.7, yend = 51.94,
    curvature = .3, arrow = arrow(length = unit(2, "mm"))
  ) +
  annotate(geom = "text", x = 9.5, y = 41, label = "The One Where Everybody Finds Out", hjust = "right") +
  annotate(
    geom = "curve",
    x = 9.52, y = 41, xend = 9.69, yend = 28.22,
    curvature = -0.3, arrow = arrow(length = unit(2, "mm"))
  )

# Rows of `friends` whose speaker is one of the names in `main`.
friendsLines <- filter(data.frame(friends), speaker %in% main)

# Sentiment Analysis
# One row per word of each main-character line. as_tibble() replaces the
# deprecated data_frame(); drop = FALSE keeps the original `text` column next
# to the new `word` column.
tokens <- as_tibble(friendsLines) %>%
  unnest_tokens(
    output = word,
    input = text,
    format = "text",
    token = "words",
    drop = FALSE,
    to_lower = TRUE
  )

# Net sentiment per season and speaker. The result is printed, not stored.
tokens %>%
  inner_join(get_sentiments("bing")) %>% # keep only words in the bing lexicon (joins on `word`)
  group_by(season, speaker, sentiment) %>%
  count(sentiment) %>% # count the # of positive & negative words
  spread(sentiment, n, fill = 0) %>% # make data wide rather than narrow
  mutate(sentiment = positive - negative) # # of positive words - # of negative words
## Joining, by = "word"
## # A tibble: 60 x 5
## # Groups:   season, speaker [60]
##    season speaker        negative positive sentiment
##     <int> <chr>             <dbl>    <dbl>     <dbl>
##  1      1 Chandler Bing       212      363       151
##  2      1 Joey Tribbiani      130      268       138
##  3      1 Monica Geller       154      308       154
##  4      1 Phoebe Buffay       170      297       127
##  5      1 Rachel Green        176      386       210
##  6      1 Ross Geller         225      379       154
##  7      2 Chandler Bing       233      338       105
##  8      2 Joey Tribbiani      135      303       168
##  9      2 Monica Geller       187      334       147
## 10      2 Phoebe Buffay       196      382       186
## # ... with 50 more rows
#' Net bing sentiment per season and speaker for the main characters
#'
#' Tokenises the `text` column of the global `friendsLines` into words, keeps
#' the words found in the bing lexicon, and counts positive and negative words
#' for every season/speaker pair. Reads the globals `friendsLines` and `main`.
#'
#' @param file Unused. The body never read this argument; it is kept, now
#'   with a default, so that existing calls with or without it keep working.
#' @return A tibble grouped by `season` and `speaker` with the columns
#'   `negative`, `positive` and `sentiment` (`positive - negative`).
GetSentiment <- function(file = NULL) {
  # as_tibble() replaces the deprecated data_frame().
  tokens <- as_tibble(friendsLines) %>%
    filter(speaker %in% main) %>%
    unnest_tokens(
      output = word,
      input = text,
      format = "text",
      token = "words",
      drop = FALSE,
      to_lower = TRUE
    )

  # The pipeline's value is the function's return value.
  tokens %>%
    inner_join(get_sentiments("bing")) %>% # keep only words in the bing lexicon (joins on `word`)
    group_by(season, speaker, sentiment) %>%
    count(sentiment) %>% # count the # of positive & negative words
    spread(sentiment, n, fill = 0) %>% # make data wide rather than narrow
    mutate(sentiment = positive - negative) # # of positive words - # of negative words
}

GetSentiment()
## Joining, by = "word"
## # A tibble: 60 x 5
## # Groups:   season, speaker [60]
##    season speaker        negative positive sentiment
##     <int> <chr>             <dbl>    <dbl>     <dbl>
##  1      1 Chandler Bing       212      363       151
##  2      1 Joey Tribbiani      130      268       138
##  3      1 Monica Geller       154      308       154
##  4      1 Phoebe Buffay       170      297       127
##  5      1 Rachel Green        176      386       210
##  6      1 Ross Geller         225      379       154
##  7      2 Chandler Bing       233      338       105
##  8      2 Joey Tribbiani      135      303       168
##  9      2 Monica Geller       187      334       147
## 10      2 Phoebe Buffay       196      382       186
## # ... with 50 more rows
# Sentiment scores per season and speaker. The result of GetSentiment() is
# used directly; no empty tibble from the deprecated data_frame() is needed
# as an rbind() starting point.
sentiments <- GetSentiment()
## Joining, by = "word"
# Box plot of the per-season sentiment scores, one box per main character.
ggplot(sentiments, aes(x = speaker, y = sentiment, color = speaker)) +
  # Six fixed fill colours, one per box (assumes six speakers).
  geom_boxplot(
    colour = "black",
    fill = rainbow(n = 6, alpha = 0.9, start = 0.42, end = 0.46)
  ) +
  labs(
    title = "Character Sentiments",
    subtitle = "Who had the most positive sentiments?",
    y = "# of Sentiments"
  ) +
  theme_economist() +
  theme(
    axis.text = element_text(size = 8),
    axis.line.y = element_line(size = 0.4, colour = "black"),
    panel.grid.major.y = element_blank(),
    axis.title = element_text(size = 8),
    legend.position = "none",
    axis.title.x = element_blank()
  )

# Rows per main character within each season/episode/scene.
linesperscene <- count(
  group_by(
    filter(friends, speaker %in% main),
    season, episode, scene, speaker
  ),
  speaker
)

# Row count of each speaker/season/episode/scene group, stored as sceneCount.
scenesperCharacter <- count(
  group_by(linesperscene, speaker, season, episode, scene),
  scene,
  name = "sceneCount"
)

# Sum of sceneCount per speaker.
sceneSum <- summarise(
  group_by(scenesperCharacter, speaker),
  sceneCount = sum(sceneCount)
)
## `summarise()` ungrouping output (override with `.groups` argument)
# Scene totals joined with the line counts from `mainlines`, most lines first.
plot_all <- arrange(
  left_join(sceneSum, mainlines, by = "speaker"),
  desc(n)
)
plot_all
## # A tibble: 6 x 3
##   speaker        sceneCount     n
##   <chr>               <int> <int>
## 1 Rachel Green         1461  9312
## 2 Ross Geller          1416  9157
## 3 Chandler Bing        1508  8465
## 4 Monica Geller        1440  8441
## 5 Joey Tribbiani       1451  8171
## 6 Phoebe Buffay        1341  7501
# Scatter plot of scenes against lines for the main characters. Each point is
# labelled from the `speaker` column, so the label always belongs to its row.
# The earlier hard-coded row names were only right while the sort order
# happened to match, and setting row names on a tibble is deprecated.
ggplot(plot_all, aes(x = sceneCount, y = n)) +
  geom_point(colour = "black", shape = 21, size = 3.5, aes(fill = factor(speaker))) +
  geom_text_repel(aes(label = speaker), size = 3.5) +
  labs(
    title = "Main Characters",
    subtitle = "Does having more scenes lead to more lines?",
    x = "# of Scenes",
    y = "# of Lines"
  ) +
  theme_economist() +
  theme(
    axis.text = element_text(size = 8),
    axis.line.y = element_line(size = 0.4, colour = "black"),
    panel.grid.major.y = element_blank(),
    axis.title = element_text(size = 8),
    legend.position = "none"
  )

# Rows per non-main speaker within each season/episode. Missing speakers and
# the listed generic or excluded speaker labels are dropped.
episodecount <- count(
  group_by(
    filter(
      friends,
      !speaker %in% main,
      !speaker %in% c("NA", "#ALL#", "Scene Directions", "Both", "Woman", "Man", "Guy", "Nurse", "Waiter", "Ben Geller"),
      !is.na(speaker)
    ),
    season, episode, speaker
  ),
  speaker
)

# Row count of each speaker/season/episode group, stored as episodeCount.
episodesperCharacter <- count(
  group_by(episodecount, speaker, season, episode),
  episode,
  name = "episodeCount"
)

# Sum of episodeCount per speaker.
episodeSum <- summarise(
  group_by(episodesperCharacter, speaker),
  episodeCount = sum(episodeCount)
)
## `summarise()` ungrouping output (override with `.groups` argument)
# The first eleven speakers after sorting by episodeCount, largest first.
topepisode <- slice(arrange(episodeSum, desc(episodeCount)), 1:11)
topepisode
## # A tibble: 11 x 2
##    speaker                episodeCount
##    <chr>                         <int>
##  1 Gunther                          52
##  2 Judy Geller                      20
##  3 Janice Litman Goralnik           19
##  4 Mike Hannigan                    17
##  5 Carol Willick                    16
##  6 Jack Geller                      16
##  7 Emily Waltham                    13
##  8 Richard Burke                    12
##  9 Susan Bunch                      12
## 10 Frank Buffay Jr.                 10
## 11 Charlie Wheeler                   9
# Episode totals for the top speakers joined with their line counts.
plot_guest <- topepisode %>%
  left_join(guestlines, by = "speaker")
plot_guest
## # A tibble: 11 x 3
##    speaker                episodeCount     n
##    <chr>                         <int> <int>
##  1 Gunther                          52   131
##  2 Judy Geller                      20   180
##  3 Janice Litman Goralnik           19   216
##  4 Mike Hannigan                    17   330
##  5 Carol Willick                    16   193
##  6 Jack Geller                      16   150
##  7 Emily Waltham                    13   174
##  8 Richard Burke                    12   281
##  9 Susan Bunch                      12   104
## 10 Frank Buffay Jr.                 10   179
## 11 Charlie Wheeler                   9   189
# Scatter plot of episodes against lines for the top guest speakers. Each
# point is labelled from the `speaker` column, so the label always belongs to
# its row. The earlier hard-coded row names were only right while the sort
# order happened to match, and setting row names on a tibble is deprecated.
ggplot(plot_guest, aes(x = episodeCount, y = n)) +
  geom_point(colour = "black", shape = 21, size = 3.5, aes(fill = factor(speaker))) +
  geom_text_repel(aes(label = speaker), size = 3.5) +
  # The x axis shows episodeCount, so it is labelled as episodes, not scenes.
  labs(
    title = "Guest Stars",
    subtitle = "Does being in more episodes lead to more lines?",
    x = "# of Episodes",
    y = "# of Lines"
  ) +
  theme_economist() +
  theme(
    axis.text = element_text(size = 8),
    axis.line.y = element_line(size = 0.4, colour = "black"),
    panel.grid.major.y = element_blank(),
    axis.title = element_text(size = 8),
    legend.position = "none"
  )