#install.packages("googlesheets")
sheeturl="https://docs.google.com/spreadsheets/d/1Yv_9nDl4ocIZR0GXU3OZuBaXxER1blfwR_XHvklPpEM/edit?hl=en&hl=en&hl=en#gid=0"
library(googlesheets)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
tedsheet <- sheeturl %>% gs_url()
## Sheet-identifying info appears to be a browser URL.
## googlesheets will attempt to extract sheet key from the URL.
## Putative key: 1Yv_9nDl4ocIZR0GXU3OZuBaXxER1blfwR_XHvklPpEM
## Worksheets feed constructed with public visibility
TED <- tedsheet %>% gs_read()
## Accessing worksheet titled 'TEDTalks by date'.
## Parsed with column specification:
## cols(
## `Talk ID` = col_double(),
## public_url = col_character(),
## speaker_name = col_character(),
## headline = col_character(),
## description = col_character(),
## event = col_character(),
## duration = col_time(format = ""),
## language = col_character(),
## published = col_character(),
## tags = col_character()
## )
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
##
## intersect, setdiff, union
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
hour(TED$duration[1])
## [1] 0
minute(TED$duration[1])
## [1] 16
second(TED$duration[1])
## [1] 17
TED <- TED %>% mutate(duration_minutes=(second(duration)+60*minute(duration)+3600*hour(duration))/60)
head(TED[,c("duration","duration_minutes")])
## # A tibble: 6 x 2
## duration duration_minutes
## <time> <dbl>
## 1 16'17" 16.3
## 2 21'26" 21.4
## 3 18'36" 18.6
## 4 19'24" 19.4
## 5 19'50" 19.8
## 6 21'45" 21.8
library('dplyr')
TED_speaker_info <- TED %>% group_by(speaker_name)%>%
summarise(Number_talks=length(speaker_name),
Mean_talk_duration=mean(duration_minutes))
head(as.data.frame(TED_speaker_info))
## speaker_name Number_talks Mean_talk_duration
## 1 Aakash Odedra 1 9.833333
## 2 Aala El-Khani 1 14.266667
## 3 Aaron Huey 1 15.450000
## 4 Aaron Koblin 1 18.300000
## 5 Aaron O'Connell 1 7.850000
## 6 Abe Davis 1 17.950000
TED_speaker_info$Mean_talk_duration <- round(TED_speaker_info$Mean_talk_duration,2)
head(as.data.frame(TED_speaker_info))
## speaker_name Number_talks Mean_talk_duration
## 1 Aakash Odedra 1 9.83
## 2 Aala El-Khani 1 14.27
## 3 Aaron Huey 1 15.45
## 4 Aaron Koblin 1 18.30
## 5 Aaron O'Connell 1 7.85
## 6 Abe Davis 1 17.95
#much better!
Installing Packages and Loading Libraries
#install.packages('DT')
library(htmlwidgets)
library(DT)
datatable(TED_speaker_info)
A. Show speakers who gave more than 3 talks, such that the height of bars corresponds to the mean talk time of each speaker and the color of the bar corresponds to the number of talks given by each speaker. B. Show the top 20 tag terms/phrase (based on the frequency of use of each term/phrase) and how frequently they were present in the dataset.
library("ggplot2")
library('dplyr')
TED_speaker_info %>% ggplot(.,aes(Number_talks))+geom_histogram(fill = 'dark green')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
TED_speaker_info %>% filter(Number_talks>3)%>% ggplot(.,aes(reorder(speaker_name, Mean_talk_duration),Mean_talk_duration, fill=Number_talks))+geom_bar(stat="identity")+coord_flip()+labs(x="Speaker",y="Mean talk duration")+theme_light()
head(TED$tags)
## [1] "alternative energy,cars,global issues,climate change,environment,science,culture,sustainability,technology"
## [2] "simplicity,entertainment,interface design,software,media,computers,technology,music,performance"
## [3] "MacArthur grant,cities,green,activism,politics,pollution,environment,inequality,business"
## [4] "children,teaching,creativity,parenting,culture,dance,education"
## [5] "demo,Asia,global issues,visualizations,global development,statistics,math,health,economics,Google,Africa"
## [6] "entertainment,goal-setting,potential,psychology,motivation,emotions,culture,business"
#TEDtags$Tag <- trimws(TEDtags$Tag)
#TEDtags$Tag <- tolower(TEDtags$Tag)
library(htmlwidgets)
library(forcats)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
tagCorpus <- Corpus(VectorSource(gsub(","," ",TED$tags)))
tagStopwords <- c(stopwords("english"), "the")
tagCorpus <- tm_map(tagCorpus, removeWords, tagStopwords)
## Warning in tm_map.SimpleCorpus(tagCorpus, removeWords, tagStopwords):
## transformation drops documents
tdmpremat_tag <- TermDocumentMatrix(tagCorpus)
tdm_tag <- as.matrix(tdmpremat_tag)
sortedMatrix_tag <- sort(rowSums(tdm_tag), decreasing = TRUE)
tdmframe_tag <- data.frame(word = names(sortedMatrix_tag), freq = sortedMatrix_tag)
top_20_tags <- top_n(tdmframe_tag,20)
## Selecting by freq
ggplot(top_20_tags, aes(reorder(word, freq), freq)) + geom_bar(stat='identity') + coord_flip() + xlab("Tags") + ylab("Frequency") + ggtitle("Most Popular Video Tags") + theme(plot.title = element_text(hjust = 0.5))