Assignment 6, TED TALK

library(tidyverse)
library(lubridate)
library(htmlwidgets)
library(DT)
library(plotly)
library("taucharts")
library(stringr)

For this assignment, I can read the file with the code below:

# Load the TED talks workbook from the working directory.
ted <- readxl::read_excel(path = "TED.xlsx")

Create an interactive table that shows the total number of talks given by an individual and the average duration of all their talks. Hence, there should be three columns in the table: The name, the number of talks, and the mean of the talk time (in minutes) for all their talks.

In order to compute the total number of talks given we must use the group_by function.

We also need to work with the duration column, which holds a date-time value, so that we can get at its hh:mm:ss components.

The code below illustrates how to extract the hours, minutes, and seconds from a particular line, in this case the 1st.

# Pull the hour, minute and second components out of the first talk's
# duration; the `##` lines show the values printed for this data set.
ted$duration[1] %>% hour()

## [1] 0

ted$duration[1] %>% minute()

## [1] 16

ted$duration[1] %>% second()

## [1] 17

Compute the total seconds for each talk and then divide by 60 to find the duration of each talk in minutes:

# Add each talk's length in minutes: total the seconds contributed by the
# second, minute and hour components of `duration`, then divide by 60.
TED <- mutate(
  ted,
  duration_minutes = (
    second(duration) + 60 * minute(duration) + 3600 * hour(duration)
  ) / 60
)

We can verify that the job was done by looking at the first few observations (head() shows six by default) of duration and duration_minutes.

# Preview the original duration next to the derived minutes column.
TED %>%
  select(duration, duration_minutes) %>%
  head()

Now we need to compute the total number of talks by each speaker and the average minutes per talk for each speaker too.

# One row per speaker: how many talks they gave and the mean length of those
# talks in minutes.
TED_speaker_metrics <- TED %>%
  group_by(speaker_name) %>%
  summarise(
    Number_talks = n(),
    Mean_talk_duration = mean(duration_minutes)
  )
TED_speaker_metrics

We can verify this process to make sure everything is in the right format:

# Show the first rows as a plain data.frame to inspect the column formats.
TED_speaker_metrics %>%
  as.data.frame() %>%
  head()

Now, let's round the mean talk durations to two decimal places:

# Keep two decimal places on the mean duration, then print the result.
TED_speaker_metrics <- TED_speaker_metrics %>%
  mutate(Mean_talk_duration = round(Mean_talk_duration, digits = 2))
TED_speaker_metrics

After these steps, we need to turn our data into an interactive table:

# Interactive table with exactly three columns: speaker, number of talks and
# mean talk duration in minutes. `rownames = FALSE` drops the row-index
# column that DT would otherwise display as an extra, unnamed first column.
datatable(TED_speaker_metrics, rownames = FALSE)

Create bar graphs to:
1. show speakers who gave more than 3 talks, such that the height of bars corresponds to the mean talk time of each speaker and the color of the bar corresponds to the number of talks given by each speaker.

We can use ggplot to plot a histogram of the number of talks per speaker.

# Distribution of the number of talks per speaker.
ggplot(TED_speaker_metrics, aes(x = Number_talks)) +
  geom_histogram()

Some people have given more than 3 talks, so let's filter the data to keep only those speakers:

# Horizontal bars of mean talk duration for speakers with more than three
# talks, ordered by that mean.
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  ggplot(aes(
    x = reorder(speaker_name, Mean_talk_duration),
    y = Mean_talk_duration
  )) +
  geom_col() +
  coord_flip()

# Same chart with a blank speaker-axis title, a readable duration-axis title
# and the black-and-white theme.
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  ggplot(aes(
    x = reorder(speaker_name, Mean_talk_duration),
    y = Mean_talk_duration
  )) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "Mean talk duration") +
  theme_bw()

# Fill the bars by the number of talks; Number_talks is mapped as-is here,
# without converting it to a factor.
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  ggplot(aes(
    x = reorder(speaker_name, Mean_talk_duration),
    y = Mean_talk_duration,
    fill = Number_talks
  )) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "Mean talk duration") +
  theme_bw()

# Treat the number of talks as a factor so each count gets its own discrete
# fill colour, and give the legend a readable title.
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  ggplot(aes(
    x = reorder(speaker_name, Mean_talk_duration),
    y = Mean_talk_duration,
    fill = as.factor(Number_talks)
  )) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "Mean talk duration") +
  theme_bw() +
  scale_fill_discrete(name = "Number of talks")

We can assign the previous plot to an object named ggobject:

# Store the finished bar chart (speakers with more than three talks, filled
# by number of talks) so it can be reused below.
ggobject <- TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  ggplot(aes(
    x = reorder(speaker_name, Mean_talk_duration),
    y = Mean_talk_duration,
    fill = as.factor(Number_talks)
  )) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "", y = "Mean talk duration") +
  theme_bw() +
  scale_fill_discrete(name = "Number of talks")

Now we can make an interactive version of ggobject with ggplotly:

# Convert the stored ggplot into an interactive plotly widget.
ggobject %>%
  ggplotly()

# Build the same view directly with plot_ly; no `type` is supplied here.
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  plot_ly(
    x = ~Mean_talk_duration,
    y = ~speaker_name,
    color = ~as.factor(Number_talks)
  )

# Explicit plotly bar chart: speakers ordered by mean duration, coloured by
# number of talks, with a chart title and axis titles.
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  plot_ly(
    x = ~Mean_talk_duration,
    y = ~reorder(speaker_name, Mean_talk_duration),
    color = ~as.factor(Number_talks),
    type = "bar"
  ) %>%
  layout(
    title = "Speakers with more than 3 Ted Talks",
    yaxis = list(title = ""),
    xaxis = list(title = "Mean talk duration")
  )

# Speakers with more than three talks, drawn as a horizontal taucharts bar
# chart coloured by number of talks.
tmp <- TED_speaker_metrics %>%
  filter(Number_talks > 3)

tauchart(tmp) %>%
  tau_bar(
    "Mean_talk_duration", "speaker_name",
    color = "Number_talks",
    horizontal = TRUE  # logical flag rather than the string "TRUE"
  ) %>%
  tau_legend() %>%
  tau_tooltip()

# Same subset of speakers, sorted from longest to shortest mean duration.
tmp <- TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  arrange(desc(Mean_talk_duration))
tmp

We rearrange the data in descending order of Mean_talk_duration, then convert speaker_name to a factor and specify that the factor levels follow the order in which the names appear.

# Freeze the current row order of `tmp` as the factor level order of
# speaker_name, then redraw the horizontal bar chart.
tmp$speaker_name <- fct_inorder(tmp$speaker_name)

tauchart(tmp) %>%
  tau_bar(
    "Mean_talk_duration", "speaker_name",
    color = "Number_talks",
    horizontal = TRUE  # logical flag rather than the string "TRUE"
  ) %>%
  tau_legend() %>%
  tau_tooltip()

2. Show the top 20 tag terms/phrases (ranked by how often each term/phrase is used) and how frequently each one appears in the dataset.

# Look at the raw tag strings of the first few talks.
TED$tags %>%
  head()

## [1] "alternative energy,cars,global issues,climate change,environment,science,culture,sustainability,technology"
## [2] "simplicity,entertainment,interface design,software,media,computers,technology,music,performance"           
## [3] "MacArthur grant,cities,green,activism,politics,pollution,environment,inequality,business"                  
## [4] "children,teaching,creativity,parenting,culture,dance,education"                                            
## [5] "demo,Asia,global issues,visualizations,global development,statistics,math,health,economics,Google,Africa"  
## [6] "entertainment,goal-setting,potential,psychology,motivation,emotions,culture,business"

Separate the comma-delimited tags into multiple columns, then stack them into a single Tag column:

# Split the comma-separated `tags` string of every talk into one row per tag.
# The number of tag columns is derived from the data (the most commas found
# in any one talk, plus one) rather than fixed at 50, so no tag is dropped
# when a talk carries more tags than a hard-coded limit.
max_tags <- max(1L, str_count(TED$tags, ",") + 1L, na.rm = TRUE)
tag_cols <- paste0("tag", seq_len(max_tags))

TEDtags <- TED %>%
  select(tags) %>%
  # fill = "right" pads talks that have fewer tags with NA, without warning
  separate(tags, into = tag_cols, sep = ",", fill = "right") %>%
  # only the tag1..tagN columns remain at this point, so stack all of them
  gather(tagnum, Tag, everything()) %>%
  # removes the NA padding as well as empty strings
  filter(Tag != "")
head(TEDtags)

Now we can clean the entries so that there are no leading or trailing spaces, and convert the text to lowercase:

# Normalise tags: strip leading/trailing whitespace, then lower-case.
TEDtags$Tag <- tolower(trimws(TEDtags$Tag))

# Count how often each tag occurs and list the most frequent tags first.
tagcount <- TEDtags %>%
  group_by(Tag) %>%
  summarise(Tag_count = n()) %>%
  arrange(desc(Tag_count))
tagcount

# Freeze the current row order of `tagcount` as the factor level order of Tag.
tagcount$Tag <- fct_inorder(tagcount$Tag)

# Chart the first 20 rows. head() returns at most 20 rows, whereas
# tagcount[1:20, ] indexes past the end of the table when fewer than 20
# distinct tags exist.
tauchart(head(tagcount, 20)) %>%
  tau_bar(
    "Tag_count", "Tag",
    horizontal = TRUE  # logical flag rather than the string "TRUE"
  ) %>%
  tau_legend() %>%
  tau_tooltip()

Assignment 6, TED TALK

Judge Thomas Kearns