Loading the libraries

library(tidyverse)
library(ggplot2)
library(googlesheets)
library(dplyr)
library(lubridate)
library(DT)
library(htmlwidgets)
library(plotly)
library(taucharts)

Reading the file

# Register the Google Sheet from its URL, then read its contents into TED
sheeturl <- "https://docs.google.com/spreadsheets/d/1Yv_9nDl4ocIZR0GXU3OZuBaXxER1blfwR_XHvklPpEM/edit?hl=en&hl=en&hl=en#gid=0"
tedsheet <- gs_url(sheeturl)
TED <- gs_read(tedsheet)
# Column names of the imported data
names(TED)
##  [1] "Talk ID"      "public_url"   "speaker_name" "headline"     "description" 
##  [6] "event"        "duration"     "language"     "published"    "tags"

1. Create an interactive table that shows the total number of talks given by an individual and the average duration of all their talks. Hence, there should be three columns in the table: The name, the number of talks, and the mean of the talk time (in minutes) for all their talks.

# Inspect the hour, minute, and second components of the first talk's duration
hour(TED$duration[1])
## [1] 0
minute(TED$duration[1])
## [1] 16
second(TED$duration[1])
## [1] 17
# Convert every duration to total minutes:
# seconds + 60 * minutes + 3600 * hours gives total seconds; divide by 60
TED <- TED %>%
  mutate(
    duration_minutes =
      (second(duration) + 60 * minute(duration) + 3600 * hour(duration)) / 60
  )
# Compare the original duration with the computed minutes for the first rows
TED %>%
  select(duration, duration_minutes) %>%
  head()
## # A tibble: 6 x 2
##   duration duration_minutes
##   <time>              <dbl>
## 1 16'17"               16.3
## 2 21'26"               21.4
## 3 18'36"               18.6
## 4 19'24"               19.4
## 5 19'50"               19.8
## 6 21'45"               21.8
# Per-speaker summary: number of talks given and mean talk length in minutes
TED_speaker_metrics <- TED %>%
  group_by(speaker_name) %>%
  summarise(
    Number_talks = n(),
    Mean_talk_duration = mean(duration_minutes)
  )
# Preview the first 6 speakers as a plain data frame
head(as.data.frame(TED_speaker_metrics))
##      speaker_name Number_talks Mean_talk_duration
## 1   Aakash Odedra            1           9.833333
## 2   Aala El-Khani            1          14.266667
## 3      Aaron Huey            1          15.450000
## 4    Aaron Koblin            1          18.300000
## 5 Aaron O'Connell            1           7.850000
## 6       Abe Davis            1          17.950000
# Round the mean talk duration to 2 decimal places
TED_speaker_metrics <- TED_speaker_metrics %>%
  mutate(Mean_talk_duration = round(Mean_talk_duration, 2))
# Confirm the rounding on the first 6 speakers
head(as.data.frame(TED_speaker_metrics))
##      speaker_name Number_talks Mean_talk_duration
## 1   Aakash Odedra            1               9.83
## 2   Aala El-Khani            1              14.27
## 3      Aaron Huey            1              15.45
## 4    Aaron Koblin            1              18.30
## 5 Aaron O'Connell            1               7.85
## 6       Abe Davis            1              17.95
# Render the summary as an interactive table
datatable(TED_speaker_metrics)

2. Create bar graphs to:

(a) show speakers who gave more than 3 talks, such that the height of bars corresponds to the mean talk time of each speaker and the color of the bar corresponds to the number of talks given by each speaker.

# Histogram of how many talks each speaker gave
TED_speaker_metrics %>%
  ggplot(aes(x = Number_talks)) +
  geom_histogram()

# Horizontal bars for speakers who gave more than 3 talks,
# ordered by mean talk duration
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  ggplot(aes(
    x = reorder(speaker_name, Mean_talk_duration),
    y = Mean_talk_duration
  )) +
  geom_bar(stat = "identity") +
  coord_flip()

# Same chart with a cleaner axis label and the black-and-white theme
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  ggplot(aes(
    x = reorder(speaker_name, Mean_talk_duration),
    y = Mean_talk_duration
  )) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "", y = "Mean Talk Duration") +
  theme_bw()

# Fill mapped to the numeric number of talks (continuous gradient)
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  ggplot(aes(
    x = reorder(speaker_name, Mean_talk_duration),
    y = Mean_talk_duration,
    fill = Number_talks
  )) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "", y = "Mean Talk Duration") +
  theme_bw()

# Fill mapped to the number of talks as a factor (one distinct color per value)
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  ggplot(aes(
    x = reorder(speaker_name, Mean_talk_duration),
    y = Mean_talk_duration,
    fill = as.factor(Number_talks)
  )) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "", y = "Mean Talk Duration") +
  theme_bw()

# Give the fill legend a readable title
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  ggplot(aes(
    x = reorder(speaker_name, Mean_talk_duration),
    y = Mean_talk_duration,
    fill = as.factor(Number_talks)
  )) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "", y = "Mean Talk Duration") +
  theme_bw() +
  scale_fill_discrete(name = "Number of Talks")

# Store the finished chart so it can be reused, then print it
ggobject <- TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  ggplot(aes(
    x = reorder(speaker_name, Mean_talk_duration),
    y = Mean_talk_duration,
    fill = as.factor(Number_talks)
  )) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "", y = "Mean Talk Duration") +
  theme_bw() +
  scale_fill_discrete(name = "Number of Talks")
ggobject

# Convert the stored ggplot chart into an interactive plotly widget
ggplotly(ggobject)
# Native plotly call without a trace type and without reordering the speakers
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  plot_ly(
    x = ~ Mean_talk_duration,
    y = ~ speaker_name,
    color = ~ as.factor(Number_talks)
  )
# Native plotly bar chart: speakers reordered by mean duration, with a title
# and axis labels
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  plot_ly(
    x = ~ Mean_talk_duration,
    y = ~ reorder(speaker_name, Mean_talk_duration),
    color = ~ as.factor(Number_talks),
    type = "bar"
  ) %>%
  layout(
    title = "Speakers with More Than 3 Ted Talks",
    yaxis = list(title = ""),
    xaxis = list(title = "Mean Talk Duration")
  )
# taucharts version of the bar chart for speakers with more than 3 talks
tmp <- TED_speaker_metrics %>%
  filter(Number_talks > 3)
tauchart(tmp) %>%
  tau_bar(
    "Mean_talk_duration", "speaker_name",
    color = "Number_talks",
    horizontal = TRUE # pass a logical flag, not the string "TRUE"
  ) %>%
  tau_legend() %>%
  tau_tooltip()
# Rearranging the data in descending order of the mean talk duration
tmp <- TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  arrange(-Mean_talk_duration)
# fct_inorder() sets the factor levels to the current (sorted) row order;
# presumably the chart then draws the speakers in that order
tmp$speaker_name <- fct_inorder(tmp$speaker_name)
tauchart(tmp) %>%
  tau_bar(
    "Mean_talk_duration", "speaker_name",
    color = "Number_talks",
    horizontal = TRUE
  ) %>%
  tau_legend() %>%
  tau_tooltip()

(b) show the top 20 tag terms/phrases (ranked by how often each term/phrase is used) and how frequently each appears in the dataset.

# Each talk stores its tags as a single comma-separated string
head(TED$tags)
## [1] "alternative energy,cars,global issues,climate change,environment,science,culture,sustainability,technology"
## [2] "simplicity,entertainment,interface design,software,media,computers,technology,music,performance"           
## [3] "MacArthur grant,cities,green,activism,politics,pollution,environment,inequality,business"                  
## [4] "children,teaching,creativity,parenting,culture,dance,education"                                            
## [5] "demo,Asia,global issues,visualizations,global development,statistics,math,health,economics,Google,Africa"  
## [6] "entertainment,goal-setting,potential,psychology,motivation,emotions,culture,business"
# Size the split to the talk with the most tags (number of commas + 1) instead
# of hard-coding 50 columns, so no tag is discarded when a talk has more
max_tags <- max(str_count(TED$tags, ","), 0L, na.rm = TRUE) + 1L
tag_cols <- paste0("tag", seq_len(max_tags))
TEDtags <- TED %>%
  select(tags) %>%
  # fill = "right" pads talks that have fewer tags with NA without warning
  separate(tags, into = tag_cols, sep = ",", fill = "right") %>%
  # Only the tag columns remain, so gather all of them into long format
  gather(key = "tagnum", value = "Tag") %>%
  # Drops empty strings and the NA padding (NA != "" is NA; filter() drops it)
  filter(Tag != "")
head(TEDtags)
## # A tibble: 6 x 2
##   tagnum Tag               
##   <chr>  <chr>             
## 1 tag1   alternative energy
## 2 tag1   simplicity        
## 3 tag1   MacArthur grant   
## 4 tag1   children          
## 5 tag1   demo              
## 6 tag1   entertainment
# Cleaning the data: trim surrounding whitespace and lower-case every tag,
# then drop any tag that consisted only of whitespace
TEDtags <- TEDtags %>%
  mutate(Tag = tolower(trimws(Tag))) %>%
  filter(Tag != "")
# Counting how often each tag occurs, most frequent first
tagcount <- TEDtags %>%
  group_by(Tag) %>%
  summarise(Tag_count = length(Tag)) %>%
  arrange(-Tag_count)
# Reordering the tag factor levels to follow the count order
tagcount$Tag <- fct_inorder(tagcount$Tag)
# Shows top 20 tags. head() returns at most 20 rows, so it stays valid when
# there are fewer than 20 distinct tags, where [1:20, ] would index past the end
tauchart(head(tagcount, 20)) %>%
  tau_bar("Tag_count", "Tag", horizontal = TRUE) %>%
  tau_legend() %>%
  tau_tooltip()