Assignment-6: Ted Talks

Bryce J. Parsons

5/1/2020

Ted Talks: Using Interactive Graphs to Display Information About Featured Ted Talks

#install.packages("googlesheets")

sheeturl="https://docs.google.com/spreadsheets/d/1Yv_9nDl4ocIZR0GXU3OZuBaXxER1blfwR_XHvklPpEM/edit?hl=en&hl=en&hl=en#gid=0"

library(googlesheets)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
tedsheet <- sheeturl %>% gs_url()
## Sheet-identifying info appears to be a browser URL.
## googlesheets will attempt to extract sheet key from the URL.
## Putative key: 1Yv_9nDl4ocIZR0GXU3OZuBaXxER1blfwR_XHvklPpEM
## Worksheets feed constructed with public visibility
TED <- tedsheet %>% gs_read()
## Accessing worksheet titled 'TEDTalks by date'.
## Parsed with column specification:
## cols(
##   `Talk ID` = col_double(),
##   public_url = col_character(),
##   speaker_name = col_character(),
##   headline = col_character(),
##   description = col_character(),
##   event = col_character(),
##   duration = col_time(format = ""),
##   language = col_character(),
##   published = col_character(),
##   tags = col_character()
## )

Question 1:

Create an interactive table that shows the total number of talks given by an individual and the average duration of all their talks. Hence, there should be three columns in the table: The name, the number of talks, and the mean of the talk time (in minutes) for all their talks.

Installing Lubridate Package & Filtering Out Vlaues: Hour, Minute, Second for Analysis

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
hour(TED$duration[1])
## [1] 0
minute(TED$duration[1])
## [1] 16
second(TED$duration[1])
## [1] 17

Computing total seconds for each TedTalk by dividing by 60 Seconds

TED <- TED %>% mutate(duration_minutes=(second(duration)+60*minute(duration)+3600*hour(duration))/60)
head(TED[,c("duration","duration_minutes")])
## # A tibble: 6 x 2
##   duration duration_minutes
##   <time>              <dbl>
## 1 16'17"               16.3
## 2 21'26"               21.4
## 3 18'36"               18.6
## 4 19'24"               19.4
## 5 19'50"               19.8
## 6 21'45"               21.8

Creating Ted Speaker Info Variable for Analysis

library('dplyr')
TED_speaker_info <- TED %>% group_by(speaker_name)%>%
  summarise(Number_talks=length(speaker_name),
            Mean_talk_duration=mean(duration_minutes))
head(as.data.frame(TED_speaker_info))
##      speaker_name Number_talks Mean_talk_duration
## 1   Aakash Odedra            1           9.833333
## 2   Aala El-Khani            1          14.266667
## 3      Aaron Huey            1          15.450000
## 4    Aaron Koblin            1          18.300000
## 5 Aaron O'Connell            1           7.850000
## 6       Abe Davis            1          17.950000

Cleaning Up the Data, Rounding Down to Two Decimals Places

TED_speaker_info$Mean_talk_duration <- round(TED_speaker_info$Mean_talk_duration,2)
head(as.data.frame(TED_speaker_info))
##      speaker_name Number_talks Mean_talk_duration
## 1   Aakash Odedra            1               9.83
## 2   Aala El-Khani            1              14.27
## 3      Aaron Huey            1              15.45
## 4    Aaron Koblin            1              18.30
## 5 Aaron O'Connell            1               7.85
## 6       Abe Davis            1              17.95
#much better!

Installing Packages and Loading Libraries

#install.packages('DT')
library(htmlwidgets)
library(DT)
datatable(TED_speaker_info)

Question 2:

Create bar graphs to:

A. Show speakers who gave more than 3 talks, such that the height of bars corresponds to the mean talk time of each speaker and the color of the bar corresponds to the number of talks given by each speaker. B. Show the top 20 tag terms/phrase (based on the frequency of use of each term/phrase) and how frequently they were present in the dataset.

library("ggplot2")
library('dplyr')
TED_speaker_info %>% ggplot(.,aes(Number_talks))+geom_histogram(fill = 'dark green')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Filtering for speakers with more than 3 talks & Displaying Mean Talk Duration on the X-Axis

I believe this graph is the best way to display number of talks by speaker, and also the mean talk duration. Since there are such a high number of speakers with 4 talks, it makes it clear which other speakers had more than 4 with a lighter gradiant of blue. I believe this is easier to interpret for general information, rather than an indiviudal color for each number of talks e.g., 5 talks = Green, 6 talks = Yellow, etc.

An example which you (Prof. Patil) made that is also great for processing info in a vizulaization was the use of “plotly” and the hover over info tool

TED_speaker_info %>% filter(Number_talks>3)%>% ggplot(.,aes(reorder(speaker_name, Mean_talk_duration),Mean_talk_duration, fill=Number_talks))+geom_bar(stat="identity")+coord_flip()+labs(x="Speaker",y="Mean talk duration")+theme_light()

B. Show the top 20 tag terms/phrase (based on the frequency of use of each term/phrase) and how frequently they were present in the dataset.

head(TED$tags)
## [1] "alternative energy,cars,global issues,climate change,environment,science,culture,sustainability,technology"
## [2] "simplicity,entertainment,interface design,software,media,computers,technology,music,performance"           
## [3] "MacArthur grant,cities,green,activism,politics,pollution,environment,inequality,business"                  
## [4] "children,teaching,creativity,parenting,culture,dance,education"                                            
## [5] "demo,Asia,global issues,visualizations,global development,statistics,math,health,economics,Google,Africa"  
## [6] "entertainment,goal-setting,potential,psychology,motivation,emotions,culture,business"
#TEDtags$Tag <- trimws(TEDtags$Tag)
#TEDtags$Tag <- tolower(TEDtags$Tag)
library(htmlwidgets)
library(forcats)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
tagCorpus <- Corpus(VectorSource(gsub(","," ",TED$tags)))
tagStopwords <- c(stopwords("english"), "the")


tagCorpus <- tm_map(tagCorpus, removeWords, tagStopwords)
## Warning in tm_map.SimpleCorpus(tagCorpus, removeWords, tagStopwords):
## transformation drops documents
tdmpremat_tag <- TermDocumentMatrix(tagCorpus)
tdm_tag <- as.matrix(tdmpremat_tag)
sortedMatrix_tag <- sort(rowSums(tdm_tag), decreasing = TRUE)
tdmframe_tag <- data.frame(word = names(sortedMatrix_tag), freq = sortedMatrix_tag)

top_20_tags <- top_n(tdmframe_tag,20)
## Selecting by freq
ggplot(top_20_tags, aes(reorder(word, freq), freq)) + geom_bar(stat='identity') + coord_flip() + xlab("Tags") + ylab("Frequency") + ggtitle("Most Popular Video Tags") + theme(plot.title = element_text(hjust = 0.5))