Project 3 Attempt

knitr::opts_chunk$set(echo = TRUE)
library(rvest)

## Loading required package: xml2

library(tidyverse)

## -- Attaching packages ------------------------------------------------------------------------ tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts --------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
## x purrr::pluck()          masks rvest::pluck()

library(stringr)
library(dplyr)
library(purrr)
library(knitr)

My intention here is to scrape the course descriptions and curricula of the data science programs taught at several universities and colleges.

Specifying the URLs of the websites to be scraped

# Source pages, one per school; each urlN is read into the matching webpageN
# below and then scraped into a per-school text object.
url1 <- "https://www.gc.cuny.edu/Page-Elements/Academics-Research-Centers-Initiatives/Masters-Programs/Data-Science/Curriculum-and-Courses"
# NOTE(review): url2 is byte-identical to url1 (the CUNY page), yet webpage2 is
# what the `harvard` text is extracted from. Presumably a Harvard URL was
# intended here -- confirm and replace.
url2 <- "https://www.gc.cuny.edu/Page-Elements/Academics-Research-Centers-Initiatives/Masters-Programs/Data-Science/Curriculum-and-Courses"
url3 <- "https://datascience.columbia.edu/master-of-science-in-data-science"
url4 <- "https://datascience.duke.edu/mids-courses"
url5 <- "http://catalog.cofc.edu/preview_program.php?catoid=13&poid=2778&hl=%22data+science%22&returnto=search"
url6 <- "https://www.cgu.edu/academics/program/ms-information-systems-and-technology/"
url7 <- "https://www.chapman.edu/scst/graduate/ms-computational-science.aspx"
url8 <- "https://calbaptist.edu/programs/data-sciences-minor/"
url9 <- "https://mcds.cs.cmu.edu/learn-us-curriculum"
url10 <- "https://www.brown.edu/initiatives/data-science/masters-degree/curriculum"
url11 <- "https://www.american.edu/programs/shared/data-science/"

Reading the HTML code from each website

# Download and parse each page; webpageN holds the parsed document for urlN.
webpage1 <- url1 %>% read_html()
webpage2 <- url2 %>% read_html()
webpage3 <- url3 %>% read_html()
webpage4 <- url4 %>% read_html()
webpage5 <- url5 %>% read_html()
webpage6 <- url6 %>% read_html()
webpage7 <- url7 %>% read_html()
webpage8 <- url8 %>% read_html()
webpage9 <- url9 %>% read_html()
webpage10 <- url10 %>% read_html()
webpage11 <- url11 %>% read_html()

Scrape Curriculum and Course Description for Data Science

# For every parsed page, select the container that holds the curriculum
# content with a CSS selector and reduce the matched node(s) to plain text.
# Each object ends up as the character vector returned by html_text().
cuny <- webpage1 %>%
  html_nodes("article#main-content") %>%
  html_text()
harvard <- webpage2 %>%
  html_nodes("article#main-content") %>%
  html_text()
columbia <- webpage3 %>%
  html_nodes("div#content") %>%
  html_text()
duke <- webpage4 %>%
  html_nodes("div#Content.row") %>%
  html_text()
charleston <- webpage5 %>%
  html_nodes("body") %>%
  html_text()
claremont <- webpage6 %>%
  html_nodes("body") %>%
  html_text()
chapman <- webpage7 %>%
  html_nodes("div.main") %>%
  html_text()
california_bap <- webpage8 %>%
  html_nodes("div.page-main-content") %>%
  html_text()
carnegie <- webpage9 %>%
  html_nodes("div#page") %>%
  html_text()
brown <- webpage10 %>%
  html_nodes("main") %>%
  html_text()
american <- webpage11 %>%
  html_nodes("main#main-container") %>%
  html_text()

# Build one row per school: a label plus the text scraped for that school.
# The two vectors are matched purely by position, so the texts must be listed
# in exactly the same order as the labels (harvard before columbia); any
# mismatch silently attaches a school's text to another school's name.
# Each scraped object is expected to be a single string so that both columns
# have 11 entries -- TODO confirm every selector matches exactly one node.
Course_desc <- data.frame(
  school = c(
    "cuny", "harvard", "columbia", "duke", "charleston", "claremont",
    "chapman", "california_bap", "carnegie", "brown", "american"
  ),
  description = c(
    cuny, harvard, columbia, duke, charleston, claremont,
    chapman, california_bap, carnegie, brown, american
  ),
  stringsAsFactors = FALSE
)

# Clean leftover HTML markup out of the scraped text before keyword matching.
Course_desc$description <- textclean::replace_html(Course_desc$description)

# Candidate data science keywords, held as a one-column data frame and saved
# to Keywords.csv in the working directory.
Keywords <- c(
  "Machine Learning", "Cloud Computing", "Data Mining", "Data Science Seminar",
  "Numerical Linear Algebra", "Discrete Mathematics", "Algorithms",
  "Optimization", "Stochastic", "Probability", "Statistical Inference",
  "Regression", "Statistical Modeling", "Data Mining", "Software Design",
  "Natural Language Processing", "NLP", "Neural Networks", "Big Data",
  "Linear and Nonlinear Models", "Capstone", "Tableau",
  "Mathematics and Statistics", "Computer Science", "Engineering",
  "Big Data", "Hadoop", "R", "Python", "SQL", "MySQL", "NoSQL", "Apache Spark"
) %>%
  data.frame(Keywords = .) %>%
  bind_rows()
write.csv(x = Keywords, file = "Keywords.csv")

# Reload the keyword table from the copy hosted on GitHub; this assignment
# replaces the data frame built above.
Keywords <- read.csv(
  file = "https://raw.githubusercontent.com/Emahayz/Data-607-Class/master/Keywords.csv"
)

# Flatten the keyword table to a plain character vector.
Keywords <- as.character(Keywords$Keywords)

# Turn each distinct keyword into a whole-word, literal pattern:
#  * unique() stops repeated entries (e.g. "Data Mining", "Big Data") from
#    being counted twice for the same school;
#  * regex metacharacters are escaped so a keyword is never read as a pattern;
#  * \b anchors stop short terms such as "R" or "SQL" from matching inside
#    longer words ("Regression", "MySQL", "NoSQL").
# Matching stays case-sensitive.
keyword_terms <- unique(Keywords)
keyword_patterns <- paste0(
  "\\b",
  gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", keyword_terms),
  "\\b"
)

# For every school, count how many distinct keywords occur in its description
# (the second column of Course_desc). vapply() also copes with zero keywords
# or zero rows, where 1:length(x) / 1:nrow(df) would iterate over c(1, 0).
Course_desc$count <- vapply(
  as.character(Course_desc[[2]]),
  function(description_text) {
    sum(vapply(keyword_patterns, grepl, logical(1), x = description_text))
  },
  numeric(1),
  USE.NAMES = FALSE
)

Listing the course keywords

# Print the keywords in sorted order. list() has no `sort` argument, so
# list(Keywords, sort = TRUE) only wrapped the unsorted vector in a list
# together with an extra element named `sort`.
sort(Keywords)

## [[1]]
##  [1] "Machine Learning"            "Cloud Computing"            
##  [3] "Data Mining"                 "Data Science Seminar"       
##  [5] "Numerical Linear Algebra"    "Discrete Mathematics"       
##  [7] "Algorithms"                  "Optimization"               
##  [9] "Stochastic"                  "Probability"                
## [11] "Statistical Inference"       "Regression"                 
## [13] "Statistical Modeling"        "Data Mining"                
## [15] "Software Design"             "Natural Language Processing"
## [17] "NLP"                         "Neural Networks"            
## [19] "Big Data"                    "Linear and Nonlinear Models"
## [21] "Capstone"                    "Tableau"                    
## [23] "Mathematics and Statistics"  "Computer Science"           
## [25] "Engineering"                 "Big Data"                   
## [27] "Hadoop"                      "R"                          
## [29] "Python"                      "SQL"                        
## [31] "MySQL"                       "NoSQL"                      
## [33] "Apache Spark"               
## 
## $sort
## [1] TRUE

Listing the schools and the number of keywords found for each

# Render the first and third columns of Course_desc (school label and keyword
# count) as a table.
Course_desc %>%
  select(1, 3) %>%
  kable()

school	count
cuny	11
harvard	4
columbia	11
duke	14
charleston	6
claremont	4
chapman	7
california_bap	4
carnegie	12
brown	6
american	3

Bar plot of the keyword count for each school

# Reshape the school/count pairs into a single wide row with one column per
# school, then draw one bar per column, labelled with the column name.
courses <- Course_desc %>%
  select(1, 3) %>%
  spread(school, count)
barplot(height = as.numeric(courses), names.arg = names(courses))

Project 3 Attempt

Emmanuel Hayble-Gomes

10/20/2019