library(htmltab)
library(DT)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
# Download the CRAN "available packages by date" page and parse its first
# HTML table into a data frame.
url <- "https://cran.r-project.org/web/packages/available_packages_by_date.html"
# Select the first table explicitly rather than relying on htmltab's fallback,
# which picks the same table but emits a message on every run.
cran <- htmltab(doc = url, which = 1)
## No encoding supplied: defaulting to UTF-8.
## [1] 17753 3
## [1] "Date" "Package" "Title"
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Parse the complete publication date. With format = "%Y" alone, strptime
# fills the missing month and day from today's date, so `year1` held a
# fabricated date and turned into NA for non-leap years when the script was
# run on 29 February.
# NOTE(review): assumes `Date` is formatted as YYYY-MM-DD -- confirm against
# the scraped table.
cran$year1 <- as.Date(as.character(cran$Date), format = "%Y-%m-%d")
# Extract the year with base R so this step does not depend on a date
# package being attached; kept numeric, as before.
cran$year <- as.numeric(format(cran$year1, "%Y"))
# Tabulate the number of rows per year.
table(cran$year)
##
## 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021
## 2 1 6 19 21 40 304 371 503 730 1018 1170 1717 2191 4689 4971
library(tidytext)
### ALL
# Split every package title into one word per row (new column `word`).
cran1 <- cran %>%
  unnest_tokens(word, Title)
data(stop_words)
# Remove rows whose word appears in `stop_words`. The join key is spelled out
# so the join neither depends on which column names happen to match nor
# prints a "Joining, by = ..." message.
cran2 <- cran1 %>%
  anti_join(stop_words, by = "word")
# Count how often each remaining word occurs, most frequent first.
cran3 <- cran2 %>%
  count(word, sort = TRUE)
library(DT)
# Show the word counts as an interactive table.
datatable(cran3)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
### 2017 and after
# Keep only rows from 2017 onwards. This overwrites `cran`, so every later
# step that reads `cran` sees the filtered data.
cran <- subset(cran, year > 2016)
# Split every remaining package title into one word per row.
cran1 <- cran %>%
  unnest_tokens(word, Title)
data(stop_words)
# Remove rows whose word appears in `stop_words`, with the join key stated
# explicitly instead of being inferred (and announced) by dplyr.
cran2 <- cran1 %>%
  anti_join(stop_words, by = "word")
# Count how often each remaining word occurs, most frequent first.
cran3 <- cran2 %>%
  count(word, sort = TRUE)
library(DT)
# Show the word counts as an interactive table.
datatable(cran3)
### 2017 and after (bigram)
# NOTE(review): this header used to read "2019 and after", but `cran` here is
# whatever the `year > 2016` filter earlier in the script left behind, and no
# further filtering happens in this section. Confirm the intended cutoff.
# Split each title into overlapping two-word sequences, one per row.
bigrams <- unnest_tokens(cran, bigram, Title, token = "ngrams", n = 2)
# Count each bigram, most frequent first.
bigrams2 <- count(bigrams, bigram, sort = TRUE)
# Show the bigram counts as an interactive table.
datatable(bigrams2)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html