knitr::opts_chunk$set(echo = TRUE,message=FALSE, warning=FALSE)
library(stringr)
library(tm)
library(wordcloud)
library(wesanderson)
library(ggplot2)
library(dplyr)
library(gridExtra)
library(readxl)
library(tidyr)
library(cluster)
library(forcats)
# Read the TED Talks spreadsheet into `ted`, then list its column names.
ted <- read_xlsx(
  path = "/Users/Bryce/DataViz/TedTalks/TED Talks as of 3.29.2018.xlsx"
)
names(ted)
## [1] "Talk ID" "public_url" "speaker_name" "headline" "description"
## [6] "event" "duration" "language" "published" "tags"
# Normalise a character vector of free text.
#
# Args:
#   x: character vector (or anything `tolower()` can coerce to one).
#
# Returns:
#   A character vector the same length as `x`, lower-cased, with punctuation
#   and digits removed, every run of spaces/tabs collapsed to a single space,
#   and any leading/trailing space stripped. `NA` elements stay `NA`.
clean.text <- function(x) {
  # to lowercase
  x <- tolower(x)
  # remove punctuation marks
  x <- gsub("[[:punct:]]", "", x)
  # remove numbers
  x <- gsub("[[:digit:]]", "", x)
  # Collapse each run of spaces/tabs to ONE space. Replacing the run with ""
  # would glue neighbouring words together ("hello  world" -> "helloworld"),
  # which happens whenever punctuation or digits between two spaces were
  # stripped by the steps above.
  x <- gsub("[ \t]+", " ", x)
  # After collapsing, at most one blank can remain at either end.
  x <- gsub("^ | $", "", x)
  x
}
# Tally talks per speaker, then tabulate how many speakers share each tally.
numtalks <- data.frame(table(ted[["speaker_name"]]))
table(numtalks[["Freq"]])
##
## 1 2 3 4 5 6 7 9
## 2047 208 51 18 6 2 1 1
2047 speakers have given only one TED Talk, whereas one speaker has given a record nine TED Talks.
# Remove the 1899 date prefix from each duration entry, leaving only the
# time-of-day part. `fixed = TRUE` matches the prefix literally.
ted$duration <- gsub("1899-12-31 ", "", ted$duration, fixed = TRUE)
# Convert each duration to minutes. The arithmetic treats the three
# ":"-separated fields as hours, minutes and seconds (assumes every entry is
# "HH:MM:SS" -- TODO confirm against the spreadsheet). Entries with missing
# fields yield NA.
# vapply() pins the result to one double per talk, so an empty or malformed
# input cannot silently turn the column into a list as sapply() could.
ted$TalkTime <- vapply(
  strsplit(as.character(ted$duration), ":", fixed = TRUE),
  function(parts) {
    parts <- as.numeric(parts)
    parts[1] * 60 + parts[2] + parts[3] / 60
  },
  numeric(1)
)
# Per-speaker summary: number of talks and mean talk time (NA talk times are
# ignored), keeping only speakers with more than three talks.
speakfreqandduration <- ted %>%
  group_by(speaker_name) %>%
  summarise(
    NumTalks = n(),
    Mean.Talk.Time = mean(TalkTime, na.rm = TRUE)
  ) %>%
  filter(NumTalks > 3)
# Five-number summary (plus mean) of the per-speaker mean talk time
summary(speakfreqandduration$Mean.Talk.Time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.056 13.360 15.074 15.037 17.286 33.458
library(htmlwidgets)
suppressPackageStartupMessages(library(dplyr))
library(DT)# if (!require("DT")) install.packages('DT')
# Interactive table of the per-speaker summary, 15 rows per page, with
# reader-friendly column headers.
dt <- datatable(
  speakfreqandduration,
  options = list(pageLength = 15),
  colnames = c(
    "Speaker Name",
    "Number of Talks",
    "Average Talk Time (min)"
  )
)
dt
library(taucharts)
library(plotly)
# Horizontal bar chart of mean talk time per speaker, ordered by that mean
# and coloured by the number of talks given; rows containing NA are dropped.
gg3 <- ggplot(
  na.omit(speakfreqandduration),
  aes(
    x = reorder(speaker_name, Mean.Talk.Time),
    y = Mean.Talk.Time,
    fill = as.factor(NumTalks)
  )
) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(
    axis.text.y = element_text(size = 8),
    plot.title = element_text(hjust = 0.5),
    axis.title.y = element_blank()
  ) +
  coord_flip() +
  labs(
    x = "Speaker",
    y = "Mean Talk Time (Minutes)",
    title = "Speakers with more than 3 TED Talks"
  ) +
  guides(fill = guide_legend(title = "Number of TED Talks Given"))
# Render the static ggplot as an interactive plotly widget
ggplotly(gg3)