library(tidyverse)
library(lubridate)
library(htmlwidgets)
library(DT)
library(plotly)
library("taucharts")
library(stringr)
For this assignment, I can read the file with the code below:
# Load the TED talks workbook (expected in the working directory) into `ted`.
ted <- readxl::read_excel(path = "TED.xlsx")
In order to compute the total number of talks given we must use the group_by function.
We also need to change the date and time so that we can see hh:mm:ss.
The code below illustrates how to extract the hours, minutes, and seconds from a particular line, in this case the 1st.
# Pull the hour, minute and second components out of the first talk's duration.
# The `##` lines record the output printed for that row.
lubridate::hour(ted$duration[1])
## [1] 0
lubridate::minute(ted$duration[1])
## [1] 16
lubridate::second(ted$duration[1])
## [1] 17
Compute the total seconds for each talk and then divide by 60 to find the duration of each talk in minutes:
# Add `duration_minutes`: total the seconds in each duration, then divide by 60.
TED <- ted %>%
  mutate(
    duration_minutes =
      (second(duration) + 60 * minute(duration) + 3600 * hour(duration)) / 60
  )
We can verify the result by looking at the first six observations (the default for head()) of duration and duration_minutes.
# Show the raw duration next to the derived minutes for the first rows.
TED %>%
  select(duration, duration_minutes) %>%
  head()
Now we need to compute the total number of talks by each speaker and the average minutes per talk for each speaker too.
# One row per speaker: how many talks they gave and their mean talk length
# in minutes.
TED_speaker_metrics <- TED %>%
  group_by(speaker_name) %>%
  summarise(
    Number_talks = n(),
    Mean_talk_duration = mean(duration_minutes)
  )
TED_speaker_metrics
We can verify this process to make sure everything is in the right format:
# Print the first rows as a plain data.frame to inspect the column formats.
TED_speaker_metrics %>%
  as.data.frame() %>%
  head()
Now, let's round the mean talk durations to 2 decimal places:
# Round the per-speaker mean duration to two decimal places, then print.
TED_speaker_metrics <- TED_speaker_metrics %>%
  mutate(Mean_talk_duration = round(Mean_talk_duration, digits = 2))
TED_speaker_metrics
After these steps, we need to turn our data into an interactive table:
# Render the speaker metrics as an interactive DT table.
DT::datatable(data = TED_speaker_metrics)
Create graphs to visualize the speaker metrics:
We can use ggplot to plot a histogram of the number of talks given by each speaker.
# Histogram of the number of talks per speaker.
# Number_talks is a whole-number count, so use one bin per value rather than
# geom_histogram()'s default of 30 bins, which splits the counts into
# arbitrary fractional-width bins and triggers a "pick better value" message.
TED_speaker_metrics %>%
  ggplot(aes(x = Number_talks)) +
  geom_histogram(binwidth = 1)
Some speakers have given more than 3 talks; let's filter the data to those speakers and plot their mean talk duration:
# 1. Basic horizontal bar chart: speakers with more than 3 talks, ordered by
#    their mean talk duration.
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  ggplot(aes(x = reorder(speaker_name, Mean_talk_duration),
             y = Mean_talk_duration)) +
  geom_col() +
  coord_flip()

# 2. Same chart with axis labels and the black-and-white theme.
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  ggplot(aes(x = reorder(speaker_name, Mean_talk_duration),
             y = Mean_talk_duration)) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "Mean talk duration") +
  theme_bw()

# 3. Fill the bars by the (numeric) number of talks.
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  ggplot(aes(x = reorder(speaker_name, Mean_talk_duration),
             y = Mean_talk_duration,
             fill = Number_talks)) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "Mean talk duration") +
  theme_bw()

# 4. Treat the number of talks as a factor so the fill uses a discrete scale,
#    and give the legend a readable title.
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  ggplot(aes(x = reorder(speaker_name, Mean_talk_duration),
             y = Mean_talk_duration,
             fill = as.factor(Number_talks))) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "Mean talk duration") +
  theme_bw() +
  scale_fill_discrete("Number of talks")
We can assign the previous plot to an object named ggobject:
# Store the final bar chart (speakers with more than 3 talks, filled by the
# number of talks as a factor) so it can be reused later.
ggobject <- TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  ggplot(aes(x = reorder(speaker_name, Mean_talk_duration),
             y = Mean_talk_duration,
             fill = as.factor(Number_talks))) +
  geom_col() +
  coord_flip() +
  labs(x = "", y = "Mean talk duration") +
  theme_bw() +
  scale_fill_discrete("Number of talks")
Now we can make an interactive version of ggobject with ggplotly:
# Interactive version of the stored ggplot chart.
ggplotly(ggobject)

# The same data passed straight to plotly. No trace type is specified here;
# the next chart sets type = "bar" explicitly.
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  plot_ly(
    x = ~Mean_talk_duration,
    y = ~speaker_name,
    color = ~as.factor(Number_talks)
  )

# Explicit bar chart, speakers ordered by mean duration, with title and
# axis labels.
TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  plot_ly(
    x = ~Mean_talk_duration,
    y = ~reorder(speaker_name, Mean_talk_duration),
    color = ~as.factor(Number_talks),
    type = "bar"
  ) %>%
  layout(
    title = "Speakers with more than 3 Ted Talks",
    yaxis = list(title = ""),
    xaxis = list(title = "Mean talk duration")
  )

# taucharts version. `horizontal` is a logical flag, so pass TRUE rather than
# the string "TRUE".
tmp <- TED_speaker_metrics %>%
  filter(Number_talks > 3)
tauchart(tmp) %>%
  tau_bar("Mean_talk_duration", "speaker_name", color = "Number_talks",
          horizontal = TRUE) %>%
  tau_legend() %>%
  tau_tooltip()

# Re-create `tmp` sorted from the longest to the shortest mean talk duration.
tmp <- TED_speaker_metrics %>%
  filter(Number_talks > 3) %>%
  arrange(desc(Mean_talk_duration))
tmp
Above, we rearranged the data in descending order of Mean_talk_duration. Next, we convert speaker_name to a factor and specify that its levels follow the order in which the names appear.
# Make speaker_name a factor whose levels follow the current row order of
# `tmp`.
tmp$speaker_name <- fct_inorder(tmp$speaker_name)

# `horizontal` is a logical flag, so pass TRUE rather than the string "TRUE".
tauchart(tmp) %>%
  tau_bar("Mean_talk_duration", "speaker_name", color = "Number_talks",
          horizontal = TRUE) %>%
  tau_legend() %>%
  tau_tooltip()

# Peek at the raw tags column: each entry is one comma-separated string.
head(TED$tags)
## [1] "alternative energy,cars,global issues,climate change,environment,science,culture,sustainability,technology"
## [2] "simplicity,entertainment,interface design,software,media,computers,technology,music,performance"
## [3] "MacArthur grant,cities,green,activism,politics,pollution,environment,inequality,business"
## [4] "children,teaching,creativity,parenting,culture,dance,education"
## [5] "demo,Asia,global issues,visualizations,global development,statistics,math,health,economics,Google,Africa"
## [6] "entertainment,goal-setting,potential,psychology,motivation,emotions,culture,business"
Separate the comma-delimited tags into multiple columns:
# Split the comma-separated `tags` string into one column per tag, then stack
# those columns into long form with one row per (tagnum, Tag) pair.
#
# The number of tag columns is derived from the data instead of being
# hard-coded, so no tag is dropped when a talk has many tags and no padding
# columns are created beyond what is needed.
max_tags <- max(0L, str_count(TED$tags, fixed(",")), na.rm = TRUE) + 1L
tag_cols <- paste0("tag", seq_len(max_tags))

TEDtags <- TED %>%
  select(tags) %>%
  # fill = "right" pads talks that have fewer tags with NA, without a warning
  separate(tags, into = tag_cols, sep = ",", fill = "right") %>%
  # gather() keeps the column-by-column row order (all tag1 rows, then tag2, ...)
  gather(tagnum, Tag, everything()) %>%
  # drop the NA padding and any empty entries
  filter(!is.na(Tag), Tag != "")
head(TEDtags)
Now we can clean the entries so that there are no leading or trailing spaces, and convert the text to lowercase:
# Normalise the tags: strip surrounding whitespace and lowercase them so that
# variants such as " Science" and "science" are counted together.
TEDtags$Tag <- tolower(trimws(TEDtags$Tag))

# Count how often each tag occurs, most frequent first.
tagcount <- TEDtags %>%
  # a tag made only of whitespace becomes "" after trimming; do not count it
  filter(Tag != "") %>%
  group_by(Tag) %>%
  summarise(Tag_count = n()) %>%
  arrange(desc(Tag_count))
tagcount

# Fix the factor levels to the current (descending count) row order.
tagcount$Tag <- fct_inorder(tagcount$Tag)

# Plot the 20 most frequent tags. head() avoids the all-NA rows that
# tagcount[1:20, ] would create when there are fewer than 20 distinct tags,
# and `horizontal` is passed as a logical rather than the string "TRUE".
tauchart(head(tagcount, 20)) %>%
  tau_bar("Tag_count", "Tag", horizontal = TRUE) %>%
  tau_legend() %>%
  tau_tooltip()