Course catalogs were scraped to assess the most frequent terms associated with study in the field of data science. Using rvest, stringr, tm, wordcloud, RCurl, data.table, dplyr, and XML, each webpage was scanned either for links to course material or directly for descriptive text. The method evolved as each page was analyzed, since institutions varied in their approach to web design. The resulting word frequencies were written to .csv and uploaded to a cloud database for querying with DBI and RMySQL.
Each course was accessed through a link; links were obtained with the function getlinks, then filtered and cleaned. The resulting list was scraped with the function scrape_pages. The output was cleaned, converted to a corpus, stripped of stopwords, tidied further, then written to a data frame and rendered as a word cloud to check the result.
# load packages
library(rvest)
library(stringr)
library(tm)
library(wordcloud)
library(RCurl)
library(data.table)
library(dplyr)
library(XML)
url_base<-"https://www.ischool.berkeley.edu/courses/datasci"
#extract link urls from a web page
getlinks <- function(url){
linkspage <- read_html(url)#Read html
url_ <- linkspage %>%#Grab specific text
html_nodes("a") %>%
html_attr("href")
return(url_)
}
#create dataframe and select only the urls we want.
just_text <- as.data.frame(lapply(url_base,getlinks))
colnames(just_text)[1]<-"url"
just_text$url<-as.character(just_text$url)
#check and filter, keeping only the course urls
clean_text<-filter(just_text, grepl("/courses/",url))
clean_text<-as.list(unique(clean_text$url))
clean_text<-paste(url_base,clean_text)
clean_text<-gsub(" ", "", clean_text)
#function runs through our list of urls
scrape_pages <- function(x){
tmp <- htmlParse(getURI(x))
tmp <- xpathSApply(tmp, '//div/p', xmlValue)#grab only text tagged '<p>'
return(tmp)
}
#activate function to scrape text
textblock <- sapply(clean_text, scrape_pages)
#clean and filter using gsub
omit <- c("\n", "\t", "\r")
textblock <- gsub(paste(omit,collapse="|"), " ", textblock)
textblock <- gsub('[[:punct:] ]+',' ',textblock)
textblock <- gsub("[^[:alnum:] ]", "",textblock)
#lowercase is needed for stopwords to function.
textblock <- tolower(textblock)
require(tm)
#convert to corpus
corp <- Corpus(VectorSource(textblock))
#use function to remove stopwords.
ToOmit <- function(x) removeWords(x, stopwords("english"))
#set up a function list to remove punctuation, numbers, extra space, and stopwords.
functions <- list(removePunctuation, removeNumbers, stripWhitespace, ToOmit)
#use tm_map from tm
map <- tm_map(corp, FUN = tm_reduce, tmFuns = functions)
#build a document-term matrix; keep words of 3 to 20 characters.
wordfreqs <- DocumentTermMatrix(map, control = list(wordLengths = c(3,20)))
df<-as.data.frame(apply(wordfreqs, 2, sum))
colnames(df)[1]<-"frequency"
df<-setDT(df, keep.rownames = TRUE)[]
colnames(df)[1]<-"word"
#order descending
df<-df[order(-df$frequency),]
set.seed(1973)
wordcloud(words = df$word, freq = df$frequency, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
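The corpus-to-word-cloud pipeline above is repeated for each site that follows. Purely as an illustration, the repeated steps could be collected into one helper; a minimal sketch (the name make_freq_df and its argument are hypothetical, not part of the original script):
#hypothetical helper wrapping the repeated corpus-to-frequency steps
make_freq_df <- function(textblock){
corp <- Corpus(VectorSource(tolower(textblock)))
ToOmit <- function(x) removeWords(x, stopwords("english"))
functions <- list(removePunctuation, removeNumbers, stripWhitespace, ToOmit)
map <- tm_map(corp, FUN = tm_reduce, tmFuns = functions)
wordfreqs <- DocumentTermMatrix(map, control = list(wordLengths = c(3,20)))
df <- as.data.frame(apply(wordfreqs, 2, sum))
colnames(df)[1] <- "frequency"
df <- setDT(df, keep.rownames = TRUE)[]
colnames(df)[1] <- "word"
df[order(-df$frequency),]
}
#e.g. df2 <- make_freq_df(textblock) would reproduce the steps repeated below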
The same functions were applied to the next catalog in much the same manner, though the list of urls needed more cleaning to eliminate the special-interest links found on many school sites.
url_base<-"https://cds.nyu.edu/academics/ms-in-data-science/ms-courses/"
#create data frame and review, then clean.
just_text <- as.data.frame(lapply(url_base,getlinks))
colnames(just_text)[1]<-"url"
just_text$url<-as.character(just_text$url)
clean_text<-filter(just_text, grepl("http",url))
#.html files and github links will foil the scraping function.
clean_text<-filter(clean_text, !grepl("html|github|forms|http://nyu.edu|albert.nyu.edu|admissions|academics|our-people|twitter|facebook|medium|about|opportunities|contact|linkedin|footer",url))
clean_text<-as.list(unique(clean_text$url))
textblock <- sapply(clean_text, scrape_pages)
textblock <- gsub(paste(omit,collapse="|"), " ", textblock)
textblock <- gsub('[[:punct:] ]+',' ',textblock)
textblock <- gsub("[^[:alnum:] ]", "",textblock)
textblock <- tolower(textblock)
#convert to corpus
corp <- Corpus(VectorSource(textblock))
#set function to remove stopwords
ToOmit <- function(x) removeWords(x, stopwords("english"))
#remove punctuation, numbers, trim and use stopwords function
functions <- list(removePunctuation, removeNumbers, stripWhitespace, ToOmit)
#map text blocks
map <- tm_map(corp, FUN = tm_reduce, tmFuns = functions)
#convert to matrix
wordfreqs <- DocumentTermMatrix(map, control = list(wordLengths = c(3,20)))
df2<-as.data.frame(apply(wordfreqs, 2, sum))
colnames(df2)[1]<-"frequency"
df2<-setDT(df2, keep.rownames = TRUE)[]
colnames(df2)[1]<-"word"
df2<-df2[order(-df2$frequency),]
set.seed(1973)
wordcloud(words = df2$word, freq = df2$frequency, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
As I scraped more sites I encountered errors in the scraping function; with a little research I incorporated tryCatch, which lets scrape_pages skip pages that throw errors (adapted from https://stackoverflow.com/questions/14748557/skipping-error-in-for-loop).
url_base<-"https://datascience.columbia.edu/course-inventory"
#create data frame and review, then clean.
just_text <- as.data.frame(lapply(url_base,getlinks))
colnames(just_text)[1]<-"url"
just_text$url<-as.character(just_text$url)
clean_text<-filter(just_text, grepl("http",url))
#.html files and github links will foil the scraping function.
clean_text<-filter(clean_text, !grepl("youtube|html|tumblr|github|forms|admissions|academics|twitter|facebook|medium|about|opportunities|contact|linkedin|footer",url))
clean_text<-as.list(unique(clean_text$url))
#added tryCatch to continue if errors are encountered, adapted from https://stackoverflow.com/questions/14748557/skipping-error-in-for-loop.
scrape_pages <- function(x){
tryCatch({
tmp <- htmlParse(getURI(x))
tmp <- xpathSApply(tmp, '//div/p', xmlValue)
return(tmp)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
}
textblock <- sapply(clean_text, scrape_pages)
## ERROR : XML content does not seem to be XML: ''
## ERROR : XML content does not seem to be XML: ''
## ERROR : XML content does not seem to be XML: ''
## ERROR : XML content does not seem to be XML: ''
## ERROR : XML content does not seem to be XML: ''
## ERROR : XML content does not seem to be XML: ''
## ERROR : XML content does not seem to be XML: ''
## ERROR : XML content does not seem to be XML: ''
## ERROR : XML content does not seem to be XML: ''
## ERROR : XML content does not seem to be XML: ''
## ERROR : XML content does not seem to be XML: ''
textblock <- gsub(paste(omit,collapse="|"), " ", textblock)
textblock <- gsub('[[:punct:] ]+',' ',textblock)
textblock <- gsub("[^[:alnum:] ]", "",textblock)
textblock <- tolower(textblock)
#convert to corpus
corp <- Corpus(VectorSource(textblock))
#set function to remove stopwords
ToOmit <- function(x) removeWords(x, stopwords("english"))
#remove punctuation, numbers, trim and use stopwords function
functions <- list(removePunctuation, removeNumbers, stripWhitespace, ToOmit)
#map text blocks
map <- tm_map(corp, FUN = tm_reduce, tmFuns = functions)
#convert to matrix
wordfreqs <- DocumentTermMatrix(map, control = list(wordLengths = c(3,20)))
df3<-as.data.frame(apply(wordfreqs, 2, sum))
colnames(df3)[1]<-"frequency"
df3<-setDT(df3, keep.rownames = TRUE)[]
colnames(df3)[1]<-"word"
df3<-df3[order(-df3$frequency),]
set.seed(1973)
wordcloud(words = df3$word, freq = df3$frequency, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
url_base<-"http://catalog.northeastern.edu/undergraduate/computer-information-science/data-science/data-science-bs/#programrequirementstext"
#create data frame and review, then clean.
just_text <- as.data.frame(lapply(url_base,getlinks))
colnames(just_text)[1]<-"url"
just_text$url<-as.character(just_text$url)
clean_text<-filter(just_text, grepl("search",url))
#.html files and github links will foil the scraping function.
clean_text<-filter(clean_text, !grepl("youtube|html|tumblr|github|forms|admissions|academics|twitter|facebook|medium|about|opportunities|contact|linkedin|footer",url))
clean_text<-as.list(unique(clean_text$url))
#add root to search popups
clean_text<-paste(url_base,clean_text)
clean_text<-gsub(" ", "", clean_text)
#scrape the urls
textblock <- sapply(clean_text, scrape_pages)
textblock <- gsub(paste(omit,collapse="|"), " ", textblock)
textblock <- gsub('[[:punct:] ]+',' ',textblock)
textblock <- gsub("[^[:alnum:] ]", "",textblock)
textblock <- tolower(textblock)
#convert to corpus
corp <- Corpus(VectorSource(textblock))
#set function to remove stopwords
ToOmit <- function(x) removeWords(x, stopwords("english"))
#remove punctuation, numbers, trim and use stopwords function
functions <- list(removePunctuation, removeNumbers, stripWhitespace, ToOmit)
#map text blocks
map <- tm_map(corp, FUN = tm_reduce, tmFuns = functions)
#convert to matrix
wordfreqs <- DocumentTermMatrix(map, control = list(wordLengths = c(3,20)))
df4<-as.data.frame(apply(wordfreqs, 2, sum))
colnames(df4)[1]<-"frequency"
df4<-setDT(df4, keep.rownames = TRUE)[]
colnames(df4)[1]<-"word"
df4<-df4[order(-df4$frequency),]
set.seed(1973)
wordcloud(words = df4$word, freq = df4$frequency, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
url_base<-"https://ep.jhu.edu/programs-and-courses/programs/data-science#quickset-program_textblock_content_4"
#create data frame and review, then clean.
just_text <- as.data.frame(lapply(url_base,getlinks))
colnames(just_text)[1]<-"url"
just_text$url<-as.character(just_text$url)
clean_text<-filter(just_text, grepl("programs-and-courses/",url))
#.html files and github links will foil the scraping function.
clean_text<-filter(clean_text, !grepl("request|youtube|html|tumblr|github|forms|admissions|academics|twitter|facebook|medium|about|opportunities|contact|linkedin|footer",url))
clean_text<-as.list(unique(clean_text$url))
#add root to search popups
clean_text<-paste(url_base,clean_text)
clean_text<-gsub(" ", "", clean_text)
#scrape
textblock <- sapply(clean_text, scrape_pages)
textblock <- gsub(paste(omit,collapse="|"), " ", textblock)
textblock <- gsub('[[:punct:] ]+',' ',textblock)
textblock <- gsub("[^[:alnum:] ]", "",textblock)
textblock <- tolower(textblock)
#convert to corpus
corp <- Corpus(VectorSource(textblock))
#set function to remove stopwords
ToOmit <- function(x) removeWords(x, stopwords("english"))
#remove punctuation, numbers, trim and use stopwords function
functions <- list(removePunctuation, removeNumbers, stripWhitespace, ToOmit)
#map text blocks
map <- tm_map(corp, FUN = tm_reduce, tmFuns = functions)
#convert to matrix
wordfreqs <- DocumentTermMatrix(map, control = list(wordLengths = c(3,20)))
df5<-as.data.frame(apply(wordfreqs, 2, sum))
colnames(df5)[1]<-"frequency"
df5<-setDT(df5, keep.rownames = TRUE)[]
colnames(df5)[1]<-"word"
df5<-df5[order(-df5$frequency),]
set.seed(1973)
wordcloud(words = df5$word, freq = df5$frequency, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
From this point each site was scanned only for text, pulling word frequencies from course and subject titles. The results, along with word clouds, are shown below. In each case the function was slightly adjusted to optimize the process.
url_base<-"http://catalogue.usc.edu/preview_program.php?catoid=6&poid=5602"
#alter the function to grab the link text, not the urls
getlinks2 <- function(url){
linkspage <- read_html(url)
url_ <- linkspage %>%
html_nodes("a") %>%
html_text()
return(url_)
}
#create data frame and review, then clean.
just_text <- as.data.frame(lapply(url_base,getlinks2))
colnames(just_text)[1]<-"url"
just_text$url<-as.character(just_text$url)
clean_text<-as.list(unique(just_text$url))
textblock <- gsub(paste(omit,collapse="|"), " ", clean_text)
textblock <- gsub('[[:punct:] ]+',' ',textblock)
textblock <- gsub("[^[:alnum:] ]", "",textblock)
textblock <- tolower(textblock)
#convert to corpus
corp <- Corpus(VectorSource(textblock))
#set function to remove stopwords
ToOmit <- function(x) removeWords(x, stopwords("english"))
#remove punctuation, numbers, trim and use stopwords function
functions <- list(removePunctuation, removeNumbers, stripWhitespace, ToOmit)
#map text blocks
map <- tm_map(corp, FUN = tm_reduce, tmFuns = functions)
#convert to matrix
wordfreqs <- DocumentTermMatrix(map, control = list(wordLengths = c(3,20)))
df6<-as.data.frame(apply(wordfreqs, 2, sum))
colnames(df6)[1]<-"frequency"
df6<-setDT(df6, keep.rownames = TRUE)[]
colnames(df6)[1]<-"word"
df6<-df6[order(-df6$frequency),]
set.seed(1973)
wordcloud(words = df6$word, freq = df6$frequency, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
url_base<-"https://catalog.njit.edu/graduate/computing-sciences/computer-science/data-science-ms/"
#create data frame and review, then clean.
just_text <- as.data.frame(lapply(url_base,getlinks2))
colnames(just_text)[1]<-"url"
just_text$url<-as.character(just_text$url)
clean_text<-as.list(unique(just_text$url))
textblock <- gsub(paste(omit,collapse="|"), " ", clean_text)
textblock <- gsub('[[:punct:] ]+',' ',textblock)
textblock <- gsub("[^[:alnum:] ]", "",textblock)
textblock <- tolower(textblock)
#convert to corpus
corp <- Corpus(VectorSource(textblock))
#set function to remove stopwords
ToOmit <- function(x) removeWords(x, stopwords("english"))
#remove punctuation, numbers, trim and use stopwords function
functions <- list(removePunctuation, removeNumbers, stripWhitespace, ToOmit)
#map text blocks
map <- tm_map(corp, FUN = tm_reduce, tmFuns = functions)
#convert to matrix
wordfreqs <- DocumentTermMatrix(map, control = list(wordLengths = c(3,20)))
df7<-as.data.frame(apply(wordfreqs, 2, sum))
colnames(df7)[1]<-"frequency"
df7<-setDT(df7, keep.rownames = TRUE)[]
colnames(df7)[1]<-"word"
df7<-df7[order(-df7$frequency),]
set.seed(1973)
wordcloud(words = df7$word, freq = df7$frequency, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
url_base<-"http://catalogue.usc.edu/preview_program.php?catoid=6&poid=5602"
#create data frame and review, then clean.
just_text <- as.data.frame(lapply(url_base,getlinks2))
colnames(just_text)[1]<-"url"
just_text$url<-as.character(just_text$url)
clean_text<-as.list(unique(just_text$url))
textblock <- gsub(paste(omit,collapse="|"), " ", clean_text)
textblock <- gsub('[[:punct:] ]+',' ',textblock)
textblock <- gsub("[^[:alnum:] ]", "",textblock)
textblock <- tolower(textblock)
#convert to corpus
corp <- Corpus(VectorSource(textblock))
#set function to remove stopwords
ToOmit <- function(x) removeWords(x, stopwords("english"))
#remove punctuation, numbers, trim and use stopwords function
functions <- list(removePunctuation, removeNumbers, stripWhitespace, ToOmit)
#map text blocks
map <- tm_map(corp, FUN = tm_reduce, tmFuns = functions)
#convert to matrix
wordfreqs <- DocumentTermMatrix(map, control = list(wordLengths = c(3,20)))
df8<-as.data.frame(apply(wordfreqs, 2, sum))
colnames(df8)[1]<-"frequency"
df8<-setDT(df8, keep.rownames = TRUE)[]
colnames(df8)[1]<-"word"
df8<-df8[order(-df8$frequency),]
set.seed(1973)
wordcloud(words = df8$word, freq = df8$frequency, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
url_base<-"https://statistics.stanford.edu/academics/ms-statistics-data-science"
#create data frame and review, then clean.
just_text <- as.data.frame(lapply(url_base,getlinks2))
colnames(just_text)[1]<-"url"
just_text$url<-as.character(just_text$url)
clean_text<-as.list(unique(just_text$url))
textblock <- gsub(paste(omit,collapse="|"), " ", clean_text)
textblock <- gsub('[[:punct:] ]+',' ',textblock)
textblock <- gsub("[^[:alnum:] ]", "",textblock)
textblock <- tolower(textblock)
#convert to corpus
corp <- Corpus(VectorSource(textblock))
#set function to remove stopwords
ToOmit <- function(x) removeWords(x, stopwords("english"))
#remove punctuation, numbers, trim and use stopwords function
functions <- list(removePunctuation, removeNumbers, stripWhitespace, ToOmit)
#map text blocks
map <- tm_map(corp, FUN = tm_reduce, tmFuns = functions)
#convert to matrix
wordfreqs <- DocumentTermMatrix(map, control = list(wordLengths = c(3,20)))
df9<-as.data.frame(apply(wordfreqs, 2, sum))
colnames(df9)[1]<-"frequency"
df9<-setDT(df9, keep.rownames = TRUE)[]
colnames(df9)[1]<-"word"
df9<-df9[order(-df9$frequency),]
set.seed(1973)
wordcloud(words = df9$word, freq = df9$frequency, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
A Google search was conducted, and a root url (https://www.google.com/search?q=data%20science%20course%20catalog&start=) was used to obtain ten pages of search results; ten links were generated iteratively using a loop that appended multiples of ten to the end of the root url. The resulting list of urls, each representing a different online course catalog, resisted scraping with the previous functions, so five pages were selected and scanned individually. The results are shown in the code blocks and word clouds below. Because Google results change over time, the urls were entered individually.
#build the ten Google result-page urls by appending multiples of ten to the root
urllist <- list()
for(i in 1:10){
root <- "https://www.google.com/search?q=data%20science%20course%20catalog&start="
num <- i*10
name<-paste(i)
tmp <- list(paste0(root,num))
urllist[[name]] <- tmp
}
url.df<-t(as.data.frame(urllist))
colnames(url.df)[1]<-"url"
rownames(url.df)<-NULL
#function adapted from https://stackoverflow.com/questions/32889136/how-to-get-google-search-results
getGoogleLinks <- function(google.url) {
doc <- getURL(google.url, httpheader = c("User-Agent" = "R (2.10.0)"))
html <- htmlTreeParse(doc, useInternalNodes = TRUE, error = function(...){})
nodes <- getNodeSet(html, "//h3[@class='r']//a")
return(sapply(nodes, function(x) x <- xmlAttrs(x)[["href"]]))
}
#create data frame and review, then clean.
just_text <- as.data.frame(lapply(url.df,getGoogleLinks))
#rename all columns.
colnames(just_text) <- rep("url", ncol(just_text))
#keep first column.
just_text2<-just_text[1]
#add columns as rows.
for (i in 2:ncol(just_text)){
just_text2<-rbind(just_text2,as.vector(just_text[i]))
print(just_text[i])
}
## url
## 1 /url?q=https://www.datacamp.com/courses&sa=U&ved=0ahUKEwiR8IzK25vhAhVQsZ4KHfTUCCw4FBAWCBQwAA&usg=AOvVaw0Y0Uv_u6Vty-dBX7aW5JLm
## 2 /url?q=https://sps.northwestern.edu/masters/data-science/program-courses.php&sa=U&ved=0ahUKEwiR8IzK25vhAhVQsZ4KHfTUCCw4FBAWCBowAQ&usg=AOvVaw0NuwxNMYmk8Jz1nKiwwAxC
## 3 /url?q=http://coursecatalog.web.cmu.edu/dietrichcollegeofhumanitiesandsocialsciences/departmentofstatistics/&sa=U&ved=0ahUKEwiR8IzK25vhAhVQsZ4KHfTUCCw4FBAWCB8wAg&usg=AOvVaw2qGNMy6TlNUA2BXSWvX0mZ
## 4 /url?q=https://www.learndatasci.com/best-data-science-online-courses/&sa=U&ved=0ahUKEwiR8IzK25vhAhVQsZ4KHfTUCCw4FBAWCCUwAw&usg=AOvVaw0fJ5lIKBBQEayy73UMSWxP
## 5 /url?q=http://catalog.fairfield.edu/graduate/engineering/programs/applied-data-science/&sa=U&ved=0ahUKEwiR8IzK25vhAhVQsZ4KHfTUCCw4FBAWCCowBA&usg=AOvVaw0vPVFxVANFmdJslfYn37w-
## 6 /url?q=https://datascience.smu.edu/academics/curriculum/coursedescriptions/&sa=U&ved=0ahUKEwiR8IzK25vhAhVQsZ4KHfTUCCw4FBAWCC8wBQ&usg=AOvVaw2v5eUR3kPRmJqseA5ZUwsD
## 7 /url?q=http://catalog.mit.edu/degree-charts/computer-science-economics-data-science-course-6-14/&sa=U&ved=0ahUKEwiR8IzK25vhAhVQsZ4KHfTUCCw4FBAWCDUwBg&usg=AOvVaw23esUY-3WZuvstZw651PTf
## 8 /url?q=https://www.extension.harvard.edu/academics/professional-graduate-certificates/data-science-certificate&sa=U&ved=0ahUKEwiR8IzK25vhAhVQsZ4KHfTUCCw4FBAWCDowBw&usg=AOvVaw14tnIEo5mvRkgTtx1c4QLz
## 9 /url?q=http://catalog.utc.edu/preview_program.php%3Fcatoid%3D16%26poid%3D2573%26returnto%3D529&sa=U&ved=0ahUKEwiR8IzK25vhAhVQsZ4KHfTUCCw4FBAWCD8wCA&usg=AOvVaw1FoAtL_pwSifwtqKpquUNZ
## 10 /url?q=http://catalog.wmich.edu/preview_program.php%3Fcatoid%3D29%26poid%3D8761%26returnto%3D1232&sa=U&ved=0ahUKEwiR8IzK25vhAhVQsZ4KHfTUCCw4FBAWCEEwCQ&usg=AOvVaw2MGC5UhuavrvVI-JvJR_J8
## url
## 1 /url?q=https://www.depts.ttu.edu/rawlsbusiness/graduate/ms/datascience/schedule.php&sa=U&ved=0ahUKEwjvtanK25vhAhVM654KHc78BT44HhAWCBQwAA&usg=AOvVaw3mX0BcmaxDbCNSZlwRenxy
## 2 /url?q=https://ep.jhu.edu/programs-and-courses/programs/data-science&sa=U&ved=0ahUKEwjvtanK25vhAhVM654KHc78BT44HhAWCBkwAQ&usg=AOvVaw0dVYn4S-MguKo3vdD4u_fM
## 3 /url?q=https://catalog.byu.edu/physical-and-mathematical-sciences/statistics/statistics-data-science-bs&sa=U&ved=0ahUKEwjvtanK25vhAhVM654KHc78BT44HhAWCB8wAg&usg=AOvVaw2xQ2_tfFUxV0zv5F9PJruN
## 4 /url?q=https://www.mastersindatascience.org/schools/23-great-schools-with-masters-programs-in-data-science/&sa=U&ved=0ahUKEwjvtanK25vhAhVM654KHc78BT44HhAWCCQwAw&usg=AOvVaw3o9QqsVeIorF3DKsuq3LPC
## 5 /url?q=https://www.wpi.edu/academics/departments/data-science/courses&sa=U&ved=0ahUKEwjvtanK25vhAhVM654KHc78BT44HhAWCCkwBA&usg=AOvVaw2wT5Jg2R6FPyFyEvgLYox5
## 6 /url?q=https://www.wm.edu/as/undergraduate/undergrad_catalog/data_catalog/index.php&sa=U&ved=0ahUKEwjvtanK25vhAhVM654KHc78BT44HhAWCC8wBQ&usg=AOvVaw0OdBsxHNjqUOcCZQP81rll
## 7 /url?q=http://bulletin.temple.edu/undergraduate/science-technology/computer-information-science/data-science-computational-analytics-certificate/&sa=U&ved=0ahUKEwjvtanK25vhAhVM654KHc78BT44HhAWCDEwBg&usg=AOvVaw3gJ1e-ey12rhUhFYuzrojl
## 8 /url?q=http://catalogue.usc.edu/preview_program.php%3Fcatoid%3D8%26poid%3D10302%26returnto%3D3397&sa=U&ved=0ahUKEwjvtanK25vhAhVM654KHc78BT44HhAWCDYwBw&usg=AOvVaw1O_MQngMvJa5L6Gy5rFnMp
## 9 /url?q=http://uww-public.courseleaf.com/undergraduate/letters-sciences/computer_science/data-science-minor/&sa=U&ved=0ahUKEwjvtanK25vhAhVM654KHc78BT44HhAWCDgwCA&usg=AOvVaw2HottdLBBjAfgYkgTj25cr
## 10 /url?q=https://floridapoly.edu/degree/data-science/&sa=U&ved=0ahUKEwjvtanK25vhAhVM654KHc78BT44HhAWCD0wCQ&usg=AOvVaw1Hpyy-m5n_VzzJBW7xCykg
## url
## 1 /url?q=http://simmons.smartcatalogiq.com/en/2018-2019/Undergraduate-Course-Catalog/Programs-of-Study/Program-in-Computer-Science-and-Informatics/Data-Science-and-Analytics-DSandA-BS&sa=U&ved=0ahUKEwj78cbK25vhAhUJv54KHclRBEo4KBAWCBQwAA&usg=AOvVaw1Upx986ND5FLQtp-j2I_NL
## 2 /url?q=http://catalog.yale.edu/ycps/subjects-of-instruction/statistics/&sa=U&ved=0ahUKEwj78cbK25vhAhUJv54KHclRBEo4KBAWCBkwAQ&usg=AOvVaw12aV8-K-lc9GiX5sHEdKVJ
## 3 /url?q=https://www.marquette.edu/grad/programs-data-science-certificate.php&sa=U&ved=0ahUKEwj78cbK25vhAhUJv54KHclRBEo4KBAWCB8wAg&usg=AOvVaw2gb_AqovIGzr4g5cae7vy8
## 4 /url?q=https://hilo.hawaii.edu/catalog/data-science-cert&sa=U&ved=0ahUKEwj78cbK25vhAhUJv54KHclRBEo4KBAWCCQwAw&usg=AOvVaw1SdWdbwUYuE0WxyfvxTJ3R
## 5 /url?q=http://catalog.apsu.edu/preview_program.php%3Fcatoid%3D23%26poid%3D5309%26returnto%3D933&sa=U&ved=0ahUKEwj78cbK25vhAhUJv54KHclRBEo4KBAWCCowBA&usg=AOvVaw0QMAYZdYFZL02NHk2T7mox
## 6 /url?q=https://www.nku.edu/academics/informatics/programs/undergraduate/datascience.html&sa=U&ved=0ahUKEwj78cbK25vhAhUJv54KHclRBEo4KBAWCCwwBQ&usg=AOvVaw3629uPqNYmAcpVJgAFe6Qe
## 7 /url?q=https://www.switchup.org/rankings/best-data-science-bootcamps&sa=U&ved=0ahUKEwj78cbK25vhAhUJv54KHclRBEo4KBAWCDEwBg&usg=AOvVaw0bDGkHI032dX1Vhu8yp67u
## 8 /url?q=http://catalog.pfw.edu/preview_program.php%3Fcatoid%3D49%26poid%3D10799%26returnto%3D1454&sa=U&ved=0ahUKEwj78cbK25vhAhUJv54KHclRBEo4KBAWCDcwBw&usg=AOvVaw2ePbO1l8PfS9IKeD1H817d
## 9 /url?q=http://catalogue.uci.edu/donaldbrenschoolofinformationandcomputersciences/departmentofstatistics/&sa=U&ved=0ahUKEwj78cbK25vhAhUJv54KHclRBEo4KBAWCDwwCA&usg=AOvVaw2GHo24Bn0JGqkxxCPIt0HM
## 10 /url?q=https://catalog.emich.edu/preview_program.php%3Fcatoid%3D25%26poid%3D11972&sa=U&ved=0ahUKEwj78cbK25vhAhUJv54KHclRBEo4KBAWCEIwCQ&usg=AOvVaw1Uo7-FYHLj7hcZi56L3MHj
## url
## 1 /url?q=http://catalog.nau.edu/Catalog/details%3Fplan%3DINFDSCT%26catalogYear%3D1819&sa=U&ved=0ahUKEwiw0vbK25vhAhWEqZ4KHZJbD9U4MhAWCBQwAA&usg=AOvVaw0k7XT5NZ4NnODQ7ZmtVa9Y
## 2 /url?q=https://www.jewell.edu/programs/data-science&sa=U&ved=0ahUKEwiw0vbK25vhAhWEqZ4KHZJbD9U4MhAWCBkwAQ&usg=AOvVaw3bjpQKZK-xbts1BLRFwl4h
## 3 /url?q=https://www.maryville.edu/as/science-and-mathematics/data-science/&sa=U&ved=0ahUKEwiw0vbK25vhAhWEqZ4KHZJbD9U4MhAWCB4wAg&usg=AOvVaw0rh3VQ1OODzO0ywtJ-Pr05
## 4 /url?q=https://www.pce.uw.edu/certificates/data-science&sa=U&ved=0ahUKEwiw0vbK25vhAhWEqZ4KHZJbD9U4MhAWCCMwAw&usg=AOvVaw0RcneSU1FOj6T5OdVdFAW8
## 5 /url?q=https://www.bgsu.edu/arts-and-sciences/mathematics-and-statistics/comast1.html&sa=U&ved=0ahUKEwiw0vbK25vhAhWEqZ4KHZJbD9U4MhAWCCkwBA&usg=AOvVaw2QkVTsqEHwF8BsXaoQsA0M
## 6 /url?q=https://www.codecademy.com/catalog/subject/data-science&sa=U&ved=0ahUKEwiw0vbK25vhAhWEqZ4KHZJbD9U4MhAWCC4wBQ&usg=AOvVaw1v9PLB-d2uw2dVc_8AspiL
## 7 /url?q=http://catalogue.uvm.edu/undergraduate/engineeringandmathematicalsciences/datascience/&sa=U&ved=0ahUKEwiw0vbK25vhAhWEqZ4KHZJbD9U4MhAWCDMwBg&usg=AOvVaw23sBjn5PKiLpCJAFvq8dPX
## 8 /url?q=https://www.cdm.depaul.edu/academics/Pages/Current/Requirements-BS-in-Data-Science.aspx&sa=U&ved=0ahUKEwiw0vbK25vhAhWEqZ4KHZJbD9U4MhAWCDkwBw&usg=AOvVaw3Lct9CXx_ar_msXfw4SrMe
## 9 /url?q=https://catalog.claremontmckenna.edu/preview_program.php%3Fcatoid%3D21%26poid%3D1633&sa=U&ved=0ahUKEwiw0vbK25vhAhWEqZ4KHZJbD9U4MhAWCD4wCA&usg=AOvVaw1oPBlajA4NnpWgJZXcK8uG
## 10 /url?q=https://louisville.edu/online/programs/certificate-programs/graduate-certificate-in-data-science&sa=U&ved=0ahUKEwiw0vbK25vhAhWEqZ4KHZJbD9U4MhAWCEMwCQ&usg=AOvVaw2wA0YICHNMWYLJExjAY3Jj
## url
## 1 /url?q=https://www.edx.org/professional-certificate/harvardx-data-science&sa=U&ved=0ahUKEwiH8pPL25vhAhUNvp4KHaWnBJE4PBAWCBQwAA&usg=AOvVaw20YI0x6CZZXRVU2N6kw4vw
## 2 /url?q=https://www.sice.indiana.edu/graduate/degrees/data-science/courses/index.html&sa=U&ved=0ahUKEwiH8pPL25vhAhUNvp4KHaWnBJE4PBAWCBkwAQ&usg=AOvVaw2ogv_ecxLwBmjVwybXmNlG
## 3 /url?q=https://catalog.ku.edu/engineering/electrical-engineering-computer-science/certificate-data-science/&sa=U&ved=0ahUKEwiH8pPL25vhAhUNvp4KHaWnBJE4PBAWCB4wAg&usg=AOvVaw2oxP4gcuUgAJ22Ut5qh55K
## 4 /url?q=https://www.drew.edu/academics/undergraduate-studies/launch/majors-and-minors/data-science/&sa=U&ved=0ahUKEwiH8pPL25vhAhUNvp4KHaWnBJE4PBAWCCMwAw&usg=AOvVaw3g2zOG7ptO4SMRpmFwukVC
## 5 /url?q=https://www.uml.edu/catalog/undergraduate/sciences/departments/computer-science/degree-pathways/dp-cs-data-science.aspx&sa=U&ved=0ahUKEwiH8pPL25vhAhUNvp4KHaWnBJE4PBAWCCgwBA&usg=AOvVaw0TP2dFKJDtRyzzC0-VKDGp
## 6 /url?q=https://datascience.virginia.edu/degrees/info/programs-and-courses&sa=U&ved=0ahUKEwiH8pPL25vhAhUNvp4KHaWnBJE4PBAWCC4wBQ&usg=AOvVaw0r6k8yGPFM_413ad-dUxm-
## 7 /url?q=https://catalog.mills.edu/undergraduate/majors-minors/data-science/&sa=U&ved=0ahUKEwiH8pPL25vhAhUNvp4KHaWnBJE4PBAWCDMwBg&usg=AOvVaw3-9sxLPO6EXnXfWcf7Q6v9
## 8 /url?q=https://www.datasciencedegreeprograms.net/rankings/certificate/&sa=U&ved=0ahUKEwiH8pPL25vhAhUNvp4KHaWnBJE4PBAWCDgwBw&usg=AOvVaw3u9mvE2OGhxb0Bb8P_ddUj
## 9 /url?q=https://www.baypath.edu/academics/graduate-programs/applied-data-science-ms/&sa=U&ved=0ahUKEwiH8pPL25vhAhUNvp4KHaWnBJE4PBAWCD0wCA&usg=AOvVaw0yh4KXYNvaO5XHJSdXLqxQ
## 10 /url?q=https://www.cs.purdue.edu/undergraduate/curriculum/data-science.html&sa=U&ved=0ahUKEwiH8pPL25vhAhUNvp4KHaWnBJE4PBAWCEIwCQ&usg=AOvVaw1bCaY3k_ZAU-7FWRy-psED
## url
## 1 /url?q=https://www.memphis.edu/cs/programs/grad_cert_data_science.php&sa=U&ved=0ahUKEwiKl7PL25vhAhVCoZ4KHbGVC3A4RhAWCBQwAA&usg=AOvVaw3yZC7HE6Z5XjAw-TFVMF2_
## 2 /url?q=https://engineering.tufts.edu/cs/bachelor-science-data-science&sa=U&ved=0ahUKEwiKl7PL25vhAhVCoZ4KHbGVC3A4RhAWCBkwAQ&usg=AOvVaw1UjUa3tKhQ70Lw528k4113
## 3 /url?q=https://statistics.stanford.edu/academics/ms-statistics-data-science&sa=U&ved=0ahUKEwiKl7PL25vhAhVCoZ4KHbGVC3A4RhAWCB4wAg&usg=AOvVaw29d4wZsnbYq4CK8qjSGDtI
## 4 /url?q=https://registrar.boisestate.edu/undergraduate/course-catalog/data-la/&sa=U&ved=0ahUKEwiKl7PL25vhAhVCoZ4KHbGVC3A4RhAWCCQwAw&usg=AOvVaw0uTX3hpgfP3wKZgcI78JZS
## 5 /url?q=http://catalog.gvsu.edu/preview_program.php%3Fcatoid%3D48%26poid%3D9271&sa=U&ved=0ahUKEwiKl7PL25vhAhVCoZ4KHbGVC3A4RhAWCCkwBA&usg=AOvVaw2yaXGKo7wtYAuEaTWpXHW9
## 6 /url?q=http://www.byui.edu/mathematics/data-science&sa=U&ved=0ahUKEwiKl7PL25vhAhVCoZ4KHbGVC3A4RhAWCC4wBQ&usg=AOvVaw3Uqgn97ZKbMm-bo2E0mN9B
## 7 /url?q=https://www.regis.edu/CCIS/Academics/Degrees-Programs/Graduate-Programs/MS-Data-Science.aspx&sa=U&ved=0ahUKEwiKl7PL25vhAhVCoZ4KHbGVC3A4RhAWCDMwBg&usg=AOvVaw0H3RPhK6JW4ybdB64bOhij
## 8 /url?q=https://catalog.montgomerycollege.edu/preview_program.php%3Fcatoid%3D8%26poid%3D1877%26returnto%3D13&sa=U&ved=0ahUKEwiKl7PL25vhAhVCoZ4KHbGVC3A4RhAWCDgwBw&usg=AOvVaw1laPGtbmYzY4GJcW6ATT4q
## 9 /url?q=https://www.unh.edu/analytics&sa=U&ved=0ahUKEwiKl7PL25vhAhVCoZ4KHbGVC3A4RhAWCD0wCA&usg=AOvVaw2B3rh99eP1nL1DNxeb-k9E
## 10 /url?q=http://datascience.umbc.edu/courses.php&sa=U&ved=0ahUKEwiKl7PL25vhAhVCoZ4KHbGVC3A4RhAWCD8wCQ&usg=AOvVaw3Fil5ztQBKm6C7v8U4t4vf
## url
## 1 /url?q=https://www.winona.edu/math-stat/data-science.asp&sa=U&ved=0ahUKEwivjdbL25vhAhXNv54KHSHACC44UBAWCBQwAA&usg=AOvVaw18Y8NUQ8pMtyVEpbVgrYtJ
## 2 /url?q=https://www.discoverdatascience.org/programs/masters-in-data-science/&sa=U&ved=0ahUKEwivjdbL25vhAhXNv54KHSHACC44UBAWCBkwAQ&usg=AOvVaw0C7SiyGmC7rWPV-eIMW-67
## 3 /url?q=https://academy.microsoft.com/en-us/professional-program/tracks/data-science/&sa=U&ved=0ahUKEwivjdbL25vhAhXNv54KHSHACC44UBAWCB4wAg&usg=AOvVaw3303FUGDbTxyvhnQms7aYL
## 4 /url?q=https://www.forbes.com/sites/bernardmarr/2017/05/02/the-6-best-data-science-masters-degree-courses-in-the-us/&sa=U&ved=0ahUKEwivjdbL25vhAhXNv54KHSHACC44UBAWCCMwAw&usg=AOvVaw2EvFpsXpVOzfKFfDZ5TZST
## 5 /url?q=https://www.marist.edu/computer-science-math/data-science-program&sa=U&ved=0ahUKEwivjdbL25vhAhXNv54KHSHACC44UBAWCCkwBA&usg=AOvVaw30i_5Ap2dfqul6tIwpoFFw
## 6 /url?q=https://nycdatascience.com/courses/&sa=U&ved=0ahUKEwivjdbL25vhAhXNv54KHSHACC44UBAWCC4wBQ&usg=AOvVaw33efCz_1OYKb46inxEQ7rY
## 7 /url?q=https://online.lewisu.edu/msds&sa=U&ved=0ahUKEwivjdbL25vhAhXNv54KHSHACC44UBAWCDMwBg&usg=AOvVaw1RwDeHKM2nsRZ2ERTcK2Oq
## 8 /url?q=http://www.sas.rochester.edu/dsc/undergraduate/major.html&sa=U&ved=0ahUKEwivjdbL25vhAhXNv54KHSHACC44UBAWCDgwBw&usg=AOvVaw2j8XceLcohxbJp0bv7_Wu1
## 9 /url?q=https://stat.uconn.edu/individualized-data-science-major/&sa=U&ved=0ahUKEwivjdbL25vhAhXNv54KHSHACC44UBAWCD4wCA&usg=AOvVaw1JF_qTKpakwzGaXv9JqiC0
## 10 /url?q=https://www.minerva.kgi.edu/academics/course_catalog/computational_sciences/&sa=U&ved=0ahUKEwivjdbL25vhAhXNv54KHSHACC44UBAWCEMwCQ&usg=AOvVaw0OXwr1mLtVk3TTkVvmHCZj
## url
## 1 /url?q=https://www.metrostate.edu/academics/programs/data-science-bs&sa=U&ved=0ahUKEwi59PfL25vhAhVHo54KHTYXBE04WhAWCBQwAA&usg=AOvVaw1vchkEMjCGyO8QD6PnHeN2
## 2 /url?q=https://informationscience.unt.edu/ms-data-science&sa=U&ved=0ahUKEwi59PfL25vhAhVHo54KHTYXBE04WhAWCBkwAQ&usg=AOvVaw3xI6ryONcUbdhUmeclwdWl
## 3 /url?q=http://catalog.calpoly.edu/collegesandprograms/collegeofsciencemathematics/statistics/crossdisciplinarystudiesminordatascience/&sa=U&ved=0ahUKEwi59PfL25vhAhVHo54KHTYXBE04WhAWCB4wAg&usg=AOvVaw07wDHZ9xhDqgmEJJu8zsJc
## 4 /url?q=https://www.colby.edu/catalogue/courses/DS/&sa=U&ved=0ahUKEwi59PfL25vhAhVHo54KHTYXBE04WhAWCCQwAw&usg=AOvVaw1qSPc7S13pDWP1VDMTSB2K
## 5 /url?q=https://www.skillsoft.com/courses/5608723-data-science-overview/&sa=U&ved=0ahUKEwi59PfL25vhAhVHo54KHTYXBE04WhAWCCkwBA&usg=AOvVaw1wPuGz-0Z_zKTX6fijfeoX
## 6 /url?q=https://ds.cs.umass.edu/&sa=U&ved=0ahUKEwi59PfL25vhAhVHo54KHTYXBE04WhAWCC4wBQ&usg=AOvVaw2ccyBusTlxKA8R2go--zAW
## 7 /url?q=https://programs.online.utica.edu/programs/masters-data-science/courses&sa=U&ved=0ahUKEwi59PfL25vhAhVHo54KHTYXBE04WhAWCDQwBg&usg=AOvVaw3Rq_Dh4U92kzd-vHL0E3be
## 8 /url?q=https://catalog.gmu.edu/colleges-schools/science/computational-data-sciences/computational-data-sciences-minor/&sa=U&ved=0ahUKEwi59PfL25vhAhVHo54KHTYXBE04WhAWCDkwBw&usg=AOvVaw3JNIIOmxExc55-HiPsbU_3
## 9 /url?q=https://www.jhsph.edu/courses/course/26098/2018/140.711.01/advanced-data-science-i&sa=U&ved=0ahUKEwi59PfL25vhAhVHo54KHTYXBE04WhAWCD4wCA&usg=AOvVaw04F3BiXcOxTQWSHYrMCpCZ
## 10 /url?q=https://engineering.case.edu/eecs/academics/undergraduate-program/data-science-and-analytics&sa=U&ved=0ahUKEwi59PfL25vhAhVHo54KHTYXBE04WhAWCEMwCQ&usg=AOvVaw3YcBTMdqkkZ0Mq0k9Vbc7X
## url
## 1 /url?q=http://compsci.cofc.edu/undergraduate-programs/data-science-bs.php&sa=U&ved=0ahUKEwj2ipvM25vhAhXDl54KHRjKBPY4ZBAWCBQwAA&usg=AOvVaw30S22Dxt0Du3doENx3dr7J
## 2 /url?q=https://www.eecs.psu.edu/students/undergraduate/Data-Sciences.aspx&sa=U&ved=0ahUKEwj2ipvM25vhAhXDl54KHRjKBPY4ZBAWCBswAQ&usg=AOvVaw0Zxj956UUtQIkF2lxuFcA_
## 3 /url?q=http://catalog.sps.cuny.edu/preview_program.php%3Fcatoid%3D2%26poid%3D607&sa=U&ved=0ahUKEwj2ipvM25vhAhXDl54KHRjKBPY4ZBAWCCAwAg&usg=AOvVaw1ArIfyColKZ7qOCZrQoyDD
## 4 /url?q=http://catalog.uwgb.edu/graduate/graduate-programs/data-science-ms/&sa=U&ved=0ahUKEwj2ipvM25vhAhXDl54KHRjKBPY4ZBAWCCUwAw&usg=AOvVaw09FcqlXYpYYTDlkT9rZidK
## 5 /url?q=https://www.chapman.edu/scst/graduate/ms-computational-science.aspx&sa=U&ved=0ahUKEwj2ipvM25vhAhXDl54KHRjKBPY4ZBAWCCswBA&usg=AOvVaw36Fl-L4tSaaJV-iJn8881e
## 6 /url?q=https://flatironschool.com/career-courses/data-science-bootcamp/&sa=U&ved=0ahUKEwj2ipvM25vhAhXDl54KHRjKBPY4ZBAWCDAwBQ&usg=AOvVaw2IgTI9b1VuaaTtaOm9T2kp
## 7 /url?q=https://www.sdstate.edu/mathematics-statistics/master-science-data-science&sa=U&ved=0ahUKEwj2ipvM25vhAhXDl54KHRjKBPY4ZBAWCDUwBg&usg=AOvVaw0znu-n2aCKPu7FrazmzBRi
## 8 /url?q=https://catalog.slu.edu/colleges-schools/health-outcomes-research/health-data-science-ms/&sa=U&ved=0ahUKEwj2ipvM25vhAhXDl54KHRjKBPY4ZBAWCDowBw&usg=AOvVaw2D1g2AIgQSB_1F-cgLqHSM
## 9 /url?q=http://onlinedsa.merrimack.edu/data-science/&sa=U&ved=0ahUKEwj2ipvM25vhAhXDl54KHRjKBPY4ZBAWCD8wCA&usg=AOvVaw1N3n3aXwONBhVf_T5CmLH_
## 10 /url?q=http://mbs.rutgers.edu/node/240&sa=U&ved=0ahUKEwj2ipvM25vhAhXDl54KHRjKBPY4ZBAWCEYwCQ&usg=AOvVaw1V3mq8ouQRsaP5N7TCRyLY
#tidy
just_text2$url<-as.character(trimws(just_text2$url))
clean_text<-substring(just_text2$url,8)
clean_text<-as.data.frame(gsub('&.*','',clean_text),stringsAsFactors = FALSE)
colnames(clean_text)[1]<-"url"
urlsToScrape<-clean_text
#grab url from 2nd row and scrape; Iowa State.
url_base<-"http://catalog.iastate.edu/collegeofliberalartsandsciences/datascience/"
#create data frame and review, then clean.
just_text <- as.data.frame(lapply(url_base,getlinks2))
colnames(just_text)[1]<-"url"
just_text$url<-as.character(just_text$url)
clean_text<-as.list(unique(just_text$url))
textblock <- gsub(paste(omit,collapse="|"), " ", clean_text)
textblock <- gsub('[[:punct:] ]+',' ',textblock)
textblock <- gsub("[^[:alnum:] ]", "",textblock)
textblock <- tolower(textblock)
#convert to corpus
corp <- Corpus(VectorSource(textblock))
#set function to remove stopwords
ToOmit <- function(x) removeWords(x, stopwords("english"))
#remove punctuation, numbers, trim and use stopwords function
functions <- list(removePunctuation, removeNumbers, stripWhitespace, ToOmit)
#map text blocks
map <- tm_map(corp, FUN = tm_reduce, tmFuns = functions)
#convert to matrix
wordfreqs <- DocumentTermMatrix(map, control = list(wordLengths = c(3,20)))
df10<-as.data.frame(apply(wordfreqs, 2, sum))
colnames(df10)[1]<-"frequency"
df10<-setDT(df10, keep.rownames = TRUE)[]
colnames(df10)[1]<-"word"
df10<-df10[order(-df10$frequency),]
set.seed(1973)
wordcloud(words = df10$word, freq = df10$frequency, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
#Colorado state
url_base<-"http://catalog.colostate.edu/general-catalog/courses-az/dsci/"
#create data frame and review, then clean.
just_text <- as.data.frame(lapply(url_base,getlinks2))
colnames(just_text)[1]<-"url"
just_text$url<-as.character(just_text$url)
clean_text<-as.list(unique(just_text$url))
textblock <- gsub(paste(omit,collapse="|"), " ", clean_text)
textblock <- gsub('[[:punct:] ]+',' ',textblock)
textblock <- gsub("[^[:alnum:] ]", "",textblock)
textblock <- tolower(textblock)
#convert to corpus
corp <- Corpus(VectorSource(textblock))
#set function to remove stopwords
ToOmit <- function(x) removeWords(x, stopwords("english"))
#remove punctuation, numbers, trim and use stopwords function
functions <- list(removePunctuation, removeNumbers, stripWhitespace, ToOmit)
#map text blocks
map <- tm_map(corp, FUN = tm_reduce, tmFuns = functions)
#convert to matrix
wordfreqs <- DocumentTermMatrix(map, control = list(wordLengths = c(3,20)))
df11<-as.data.frame(apply(wordfreqs, 2, sum))
colnames(df11)[1]<-"frequency"
df11<-setDT(df11, keep.rownames = TRUE)[]
colnames(df11)[1]<-"word"
df11<-df11[order(-df11$frequency),]
set.seed(1973)
wordcloud(words = df11$word, freq = df11$frequency, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
#fairfield
url_base<-"http://catalog.fairfield.edu/graduate/engineering/programs/applied-data-science/"
#create data frame and review, then clean.
just_text <- as.data.frame(lapply(url_base,getlinks2))
colnames(just_text)[1]<-"url"
just_text$url<-as.character(just_text$url)
clean_text<-as.list(unique(just_text$url))
textblock <- gsub(paste(omit,collapse="|"), " ", clean_text)
textblock <- gsub('[[:punct:] ]+',' ',textblock)
textblock <- gsub("[^[:alnum:] ]", "",textblock)
textblock <- tolower(textblock)
#convert to corpus
corp <- Corpus(VectorSource(textblock))
#set function to remove stopwords
ToOmit <- function(x) removeWords(x, stopwords("english"))
#remove punctuation, numbers, trim and use stopwords function
functions <- list(removePunctuation, removeNumbers, stripWhitespace, ToOmit)
#map text blocks
map <- tm_map(corp, FUN = tm_reduce, tmFuns = functions)
#convert to matrix
wordfreqs <- DocumentTermMatrix(map, control = list(wordLengths = c(3,20)))
df12<-as.data.frame(apply(wordfreqs, 2, sum))
colnames(df12)[1]<-"frequency"
df12<-setDT(df12, keep.rownames = TRUE)[]
colnames(df12)[1]<-"word"
df12<-df12[order(-df12$frequency),]
set.seed(1973)
wordcloud(words = df12$word, freq = df12$frequency, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
#Michigan
url_base<-"https://cse.umich.edu/eecs/undergraduate/data-science/"
#create data frame and review, then clean.
just_text <- as.data.frame(lapply(url_base,getlinks2))
colnames(just_text)[1]<-"url"
just_text$url<-as.character(just_text$url)
clean_text<-as.list(unique(just_text$url))
textblock <- gsub(paste(omit,collapse="|"), " ", clean_text)
textblock <- gsub('[[:punct:] ]+',' ',textblock)
textblock <- gsub("[^[:alnum:] ]", "",textblock)
textblock <- tolower(textblock)
#convert to corpus
corp <- Corpus(VectorSource(textblock))
#set function to remove stopwords
ToOmit <- function(x) removeWords(x, stopwords("english"))
#remove punctuation, numbers, trim and use stopwords function
functions <- list(removePunctuation, removeNumbers, stripWhitespace, ToOmit)
#map text blocks
map <- tm_map(corp, FUN = tm_reduce, tmFuns = functions)
#convert to matrix
wordfreqs <- DocumentTermMatrix(map, control = list(wordLengths = c(3,20)))
df13<-as.data.frame(apply(wordfreqs, 2, sum))
colnames(df13)[1]<-"frequency"
df13<-setDT(df13, keep.rownames = TRUE)[]
colnames(df13)[1]<-"word"
df13<-df13[order(-df13$frequency),]
set.seed(1973)
wordcloud(words = df13$word, freq = df13$frequency, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
#Hawaii
url_base<-"https://hilo.hawaii.edu/catalog/data-science-cert"
#create data frame and review, then clean.
just_text <- as.data.frame(lapply(url_base,getlinks2))
colnames(just_text)[1]<-"url"
just_text$url<-as.character(just_text$url)
clean_text<-as.list(unique(just_text$url))
textblock <- gsub(paste(omit,collapse="|"), " ", clean_text)
textblock <- gsub('[[:punct:] ]+',' ',textblock)
textblock <- gsub("[^[:alnum:] ]", "",textblock)
textblock <- tolower(textblock)
#convert to corpus
corp <- Corpus(VectorSource(textblock))
#set function to remove stopwords
ToOmit <- function(x) removeWords(x, stopwords("english"))
#remove punctuation, numbers, trim and use stopwords function
functions <- list(removePunctuation, removeNumbers, stripWhitespace, ToOmit)
#map text blocks
map <- tm_map(corp, FUN = tm_reduce, tmFuns = functions)
#convert to matrix
wordfreqs <- DocumentTermMatrix(map, control = list(wordLengths = c(3,20)))
df14<-as.data.frame(apply(wordfreqs, 2, sum))
colnames(df14)[1]<-"frequency"
df14<-setDT(df14, keep.rownames = TRUE)[]
colnames(df14)[1]<-"word"
df14<-df14[order(-df14$frequency),]
set.seed(1973)
wordcloud(words = df14$word, freq = df14$frequency, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
#37th row
url_base<-"http://catalogue.uci.edu/donaldbrenschoolofinformationandcomputersciences/departmentofstatistics/"
#create data frame and review, then clean.
just_text <- as.data.frame(lapply(url_base,getlinks2))
colnames(just_text)[1]<-"url"
just_text$url<-as.character(just_text$url)
clean_text<-as.list(unique(just_text$url))
textblock <- gsub(paste(omit,collapse="|"), " ", clean_text)
textblock <- gsub('[[:punct:] ]+',' ',textblock)
textblock <- gsub("[^[:alnum:] ]", "",textblock)
textblock <- tolower(textblock)
#convert to corpus
corp <- Corpus(VectorSource(textblock))
#set function to remove stopwords
ToOmit <- function(x) removeWords(x, stopwords("english"))
#remove punctuation, numbers, trim and use stopwords function
functions <- list(removePunctuation, removeNumbers, stripWhitespace, ToOmit)
#map text blocks
map <- tm_map(corp, FUN = tm_reduce, tmFuns = functions)
#convert to matrix
wordfreqs <- DocumentTermMatrix(map, control = list(wordLengths = c(3,20)))
df15<-as.data.frame(apply(wordfreqs, 2, sum))
colnames(df15)[1]<-"frequency"
df15<-setDT(df15, keep.rownames = TRUE)[]
colnames(df15)[1]<-"word"
df15<-df15[order(-df15$frequency),]
set.seed(1973)
wordcloud(words = df15$word, freq = df15$frequency, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
The data frames were then combined and filtered further to eliminate terms associated with institutional learning, such as “accreditation” and “instructor”. A final word cloud renders the result.
master<-do.call("rbind", list(df,df2,df3,df4,df5,df6,df7,df8,df9,df10,df11,df12,df13,df14,df15))
master<-master[order(-master$frequency),]
master_agg<-aggregate(master$frequency, by=list(master$word), FUN=sum)
colnames(master_agg)[1]<-"word"
colnames(master_agg)[2]<-"frequency"
#Let's remove some education-related words.
EducWords<-c("waived","pdf","may","term","please","electives","waive","accreditation","program","john","year", "refer","higher","find","courses","course","admission","undergraduate","graduate","schedule","students","programs","whose","requirements","back","xxx","johns","hopkins","edu","must","required","large","beyond","list","page","wide","james","curriculum","piorkowski","waiving","additional","register","prerequisites","chair","middle","nyu","college","fax","spall","one","take","unless","park","prior","applicants","otherwise","school","gpa","jhep","fees","instructors","small","floor","north","followed","learnjhu","tty","full","added","long","toward","accredited","accrediting","jhu","university","new","york","bachelor","faculty","staff","still","can","also","charles","will","degree","replace","replaced","mids","available","upon","various","including","abet","outside","times","nation","everything","center","involving","boston","completed","street","ave","part","certificate","hours","introduction","tells","semester","state","states","admitted","include","commission","massachusetts","huntington")
master_alt<-master_agg[-grep(paste(EducWords,collapse="|"),master_agg$word),]
set.seed(1974)
wordcloud(words = master_alt$word, freq = master_alt$frequency, min.freq = 100,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
#remove troublesome error in first row.
catalog_words<-master_alt[-1,]
catalog_words$frequency<-as.numeric(catalog_words$frequency)
write.csv(catalog_words,"catalog_words.csv")
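The upload of the frequency table to the cloud database is not shown; a minimal sketch of how it could be done with DBI's dbWriteTable, reusing the getSqlConnection helper defined below (the overwrite and row.names settings are assumptions):
#sketch: write the frequency table to the cloud instance
con <- getSqlConnection() #helper defined below
dbWriteTable(con, "catalog_words", catalog_words, overwrite = TRUE, row.names = FALSE)
dbDisconnect(con)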
Data were downloaded from the cloud and compiled.
#install.packages('RMySQL')
#install.packages('DBI')
library(RMySQL)
# Load the DBI library
library(DBI)
# Helper for getting new connection to Cloud SQL
getSqlConnection <- function() {
con <-dbConnect(RMySQL::MySQL(),
username = 'achan',#other ids set up are 'achan' and 'mhayes'
password = 'ac.mh.sj.607',#we all can use the same password
host = '35.202.129.190',#this is the IP address of the cloud instance
dbname = 'softskills')
return(con)
}
getSqlConnection2 <- function() {
con <-dbConnect(RMySQL::MySQL(),
username = 'achan',#other ids set up are 'achan' and 'mhayes'
password = 'ac.mh.sj.607',#we all can use the same password
host = '35.202.129.190',#this is the IP address of the cloud instance
dbname = 'job_postings')
return(con)
}
getSqlConnection3 <- function() {
con <-dbConnect(RMySQL::MySQL(),
username = 'achan',#other ids set up are 'achan' and 'mhayes'
password = 'ac.mh.sj.607',#we all can use the same password
host = '35.202.129.190',#this is the IP address of the cloud instance
dbname = 'blog_topics')
return(con)
}
connection <- getSqlConnection()
reqst <- dbSendQuery(connection,"select * from catalog_words")
catalogdata <- dbFetch(reqst)
connection2 <- getSqlConnection2()
reqst2 <- dbSendQuery(connection2,"select * from bigram_counts")
bigramdata <- dbFetch(reqst2)
reqst3 <- dbSendQuery(connection2,"select * from trigram_counts")
trigramdata <- dbFetch(reqst3)
reqst4 <- dbSendQuery(connection2,"select * from word_counts")
wordcountsdata <- dbFetch(reqst4)
connection3 <- getSqlConnection3()
dscdata <- dbGetQuery(connection3,"select * from blog_topics.dsc_data")
dscsecdata <- dbGetQuery(connection3,"select * from blog_topics.dsc_sec_data")
kgldata <- dbGetQuery(connection3,"select * from blog_topics.kgl_data")
kglsecdata <- dbGetQuery(connection3,"select * from blog_topics.kgl_sec_data")
sdcdata <- dbGetQuery(connection3,"select * from blog_topics.sdc_data")
sdcsecdata <- dbGetQuery(connection3,"select * from blog_topics.sdc_sec_data")
ssdata <- dbGetQuery(connection3,"select * from blog_topics.ss_data")
sssecdata <- dbGetQuery(connection3,"select * from blog_topics.ss_sec_data")
wbgdata <- dbGetQuery(connection3,"select * from blog_topics.wbg_data")
wbgsecdata <- dbGetQuery(connection3,"select * from blog_topics.wbg_sec_data")
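dbSendQuery leaves result sets open, so it is good practice to release them and close the connections once the fetches are complete; a short cleanup sketch (not part of the original script):
#free the pending result sets and close the connections
dbClearResult(reqst); dbClearResult(reqst2); dbClearResult(reqst3); dbClearResult(reqst4)
dbDisconnect(connection); dbDisconnect(connection2); dbDisconnect(connection3)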
Data from the blogs are tidied and transformed; two columns, word and frequency, are created.
wrdfreq<-rbind(dscdata[5:13],wbgdata[4:12],ssdata[4:12],sdcdata[4:12],kgldata[4:12])
wrdfreq<-as.data.frame(wrdfreq)
#split each bigram column into its two component words, renaming as we go
wrdfreq$data<-wrdfreq$big_data
colnames(wrdfreq)[1]<-"big"
wrdfreq$intelligence<-wrdfreq$business_intelligence
colnames(wrdfreq)[2]<-"business"
wrdfreq$mining<-wrdfreq$data_mining
colnames(wrdfreq)[3]<-"data2"
wrdfreq$learning<-wrdfreq$deep_learning
colnames(wrdfreq)[4]<-"deep"
wrdfreq$learning2<-wrdfreq$machine_learning
colnames(wrdfreq)[6]<-"machine"
wrdfreq$modeling<-wrdfreq$predictive_modeling
colnames(wrdfreq)[8]<-"predictive"
wrdfreq<-sapply(wrdfreq,as.numeric)
word.blogs<-colnames(wrdfreq)
freq.blogs<-colSums(wrdfreq,na.rm=TRUE)
blogcount<-cbind(word.blogs,freq.blogs)
rownames(blogcount)<-NULL
blogcount<-as.data.frame(gsub('2','',blogcount),stringsAsFactors = FALSE)
blogcount$freq.blogs<-as.numeric(blogcount$freq.blogs)
blogcount<-aggregate(blogcount$freq.blogs, by=list(word=blogcount$word.blogs), FUN=sum)
colnames(blogcount)[2]<-"frequency"
#omit "big" and "data""
blogcount<-blogcount[which(blogcount$word!='big'&blogcount$word!='data'),]
blogcount$proportion<-(blogcount$frequency/(sum(blogcount$frequency)))
library(wordcloud)
set.seed(1970)
wordcloud(words = blogcount$word, freq = blogcount$frequency, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
A word cloud is created from the compiled job-posting word frequency data.
colnames(wordcountsdata)[2]<-"frequency"
#omit "big" and "data"
wordcountsdata<-wordcountsdata[which(wordcountsdata$word!='big'&wordcountsdata$word!='data'&wordcountsdata$word!='e.g'&wordcountsdata$word!='5'&wordcountsdata$word!='i.e'),]
wordcountsdata$proportion<-(wordcountsdata$frequency/(sum(wordcountsdata$frequency)))
set.seed(1969)
wordcloud(words = wordcountsdata$word, freq = wordcountsdata$frequency, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.2,
colors=brewer.pal(8, "Dark2"))
A comparison is drawn between the data collected from blogs, course catalogs, and job sites.
#omit "big" and "data"
catalogdata<-catalogdata[which(catalogdata$word!='big'&catalogdata$word!='data'),]
catalogdata$proportion<-(catalogdata$frequency/(sum(catalogdata$frequency)))
wordcountsdata$genre<-"jobs"
catalogdata$genre<-"catalogs"
blogcount$genre<-"blogs"
masterlist<-rbind(wordcountsdata,blogcount,catalogdata)
library(scales)
library(ggplot2)
# expect a warning about rows with missing values being removed
ggplot(masterlist, aes(x = proportion, y = proportion, color = proportion)) +
geom_jitter(alpha = 0.2, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word),alpha=1, check_overlap = TRUE, vjust = 1.5, hjust= .6) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.3), low = "darkslategray4", high = "blue") +
facet_wrap(~genre, ncol = 3) +
theme(legend.position="none",panel.background = element_blank()) +
labs(y = "proportion", x = NULL)
How do the word frequencies of institution course catalog sites and job sites compare?
To answer the question, we’ll merge by word and find the most frequent common occurrences.
compiled<-merge(wordcountsdata,catalogdata,by="word")
compiled100<-compiled[which(compiled$frequency.x>=50&compiled$frequency.y>=50),]
ggplot(compiled100, aes(x=word, y=frequency.x)) +
geom_bar(stat='identity', position='dodge')+
theme(legend.position="none",panel.background = element_blank())+
labs(y = "frequency", x = NULL)+
ggtitle("Frequency of Common Words, Job Listings")
ggplot(compiled100, aes(x=word, y=frequency.y)) +
geom_bar(stat='identity', position='dodge')+
theme(legend.position="none",panel.background = element_blank())+
labs(y = "frequency", x = NULL)+
ggtitle("Frequency of Common Words, Course Catalogs")