Web scraping of data-science-related job postings from the pharmaceutical industry

library(xml2)
library(rvest)
library(stringr)
# Posting 1: download the page, then use CSS selectors to pull out the job
# title and the text of each item in the selected bullet list.
job1 <- read_html("https://rew22.ultipro.com/SAR1004/jobboard/JobDetails.aspx?__ID=*E67125DCA9517820")
job1.title <- html_text(html_nodes(job1, "#DataCell_Req_TitleFK .PrintSmall"))
job1.skills <- html_text(html_nodes(job1, "ul:nth-child(10) li"))
job1.title
## [1] "Principal Data Scientist"
job1.skills
## [1] "Quality by Design experience is a plus "                                                                                                
## [2] "Experience with SQL, NoSQL, Historians, Hadoop is a merit "                                                                             
## [3] "Technical teaching and consulting experience is a definite plus "                                                                       
## [4] "Strong presentation, and communication skills (both written and verbal) are strongly preferred\r\n"                                     
## [5] "Strong drive and persistency and a proven record of accomplishment to drive projects on a continuous basis\r\n"                         
## [6] "Interpersonal and relationship-building skills to facilitate working at all levels within the customers’ and internal organizations\r\n"
# Posting 2: download the page, then read the title from the #job-title
# element and the skills text from the selected table cell.
job2 <- read_html("https://jobs.biogen.com/job/Cambridge-Senior-Data-Scientist-MA-02138/424526600/?feedId=127000&utm_source=Indeed&utm_campaign=Biogen_Indeed")
job2.title <- html_text(html_nodes(job2, "#job-title"))
job2.skills <- html_text(html_nodes(job2, "tr:nth-child(6) td:nth-child(2)"))
job2.title
## [1] "Senior Data Scientist"
job2.skills
## [1] "Required Skills:• Strong analytical skills for effective problem solving• Development of regression, classification, and clustering models based on customer level data• Ability to connect and explain data science technical aspects with business customers• Strong communication and the ability to clearly convey information both within the group and to external groups• Experience and understanding of Machine Learning and Deep Learning techniques• Experience with Spark (Scala or python implementations)• Advanced knowledge with one Data Science Language and familiarity with at least a second (Python, R, Scala, Java)• Working knowledge of Relational Databases• Working Knowledge of at least one NoSQL datastore (MongoDB, Cassandra, Neo4J, …)• Understanding of WebServices as well as JSON and XML formats• Experience working in a commercial organizationDesired Skills:• Additional Languages: Python, R, Java, Scala, HTML 5, R• Experience in ad hoc reporting and analysis• Knowledge of Bayesian statistics and their implementation• Experience with Visualization tools like Tableau or Qlik • Frameworks: R Shiny, Django• Experience with multiple NoSQL datastores, Including MongoDB, Neo4J, RedShift and Cassandra• Experience in the pharmaceutical industry"
# Posting 3: download the page, then take the title from the <title> element
# and the skills from the nested list items.
job3 <- read_html("https://www.indeed.com/viewjob?jk=47336e3895e744fc&q=Pharmaceutical+Data+Scientist&tk=1bsrnqg710kgf4b5&from=web")
job3.title <- html_text(html_nodes(job3, "title"))
job3.skills <- html_text(html_nodes(job3, "li li"))
job3.title
## [1] "Scientific Data Analyst, R&D Information Sciences job - FORMA Therapeutics - Watertown, MA | Indeed.com"
job3.skills
##  [1] "Work as part of the FORMA RDIS team to develop and maintain strategies for application deployment and data management practices that support science at FORMA\n"                                                                                                                 
##  [2] "Provide and coordinate day to day operation of scientific software applications that generate and load data for advancing small molecule development programs, including desktop support, training, ongoing account administration, and managing issue resolution with vendors\n"
##  [3] "Develop and maintain the ontology of biological results, provide data curation, and ensure adherence to FORMA data standards\n"                                                                                                                                                  
##  [4] "Develop and maintain application and training documentation\n"                                                                                                                                                                                                                   
##  [5] "Coordinate with internal scientists and external collaborators to ensure appropriate standards are met for data capture\n"                                                                                                                                                       
##  [6] "Work closely with biology scientists to identify gaps in informatics solutions and support, and find/implement innovative workflow solutions, including integration of data from automation platforms\n"                                                                         
##  [7] "Bridge communication between discovery biology scientists and vendors and consultants for the implementation of solutions\n"                                                                                                                                                     
##  [8] "Collaborate with the IT group, FORMA scientists, and work directly with outside software vendors in the specification, evaluation, implementation and upgrade of software that impacts discovery biology scientists\n"                                                           
##  [9] "Work with FORMA contracts and sourcing groups to establish contracts covering vendor engagements\n"                                                                                                                                                                              
## [10] "Develop and execute test plans, record results, and prioritize bugs and fixes for configurations and applications\n"                                                                                                                                                             
## [11] "Support and embody the FORMA Core Values\n"                                                                                                                                                                                                                                      
## [12] "Degree in the life sciences, with a demonstrated understanding of biological and assay execution workflows, ideally with hands-on laboratory experience, most ideally in a pharmaceutical environment\n"                                                                         
## [13] "6+ years working with informatics systems\n"                                                                                                                                                                                                                                     
## [14] "Conversant with assays in a range of formats including biochemical inhibition and activation, cellular and phenotypic screening, in-well kinetic, and ranging from manual to fully-automated HTS\n"                                                                              
## [15] "Demonstrated ability with informatics and scientific data applications at the end user, admin and systems admin level; preferred experience with biological workflow and results management systems such as ActivityBase, IDBS E-Workbook.\n"                                    
## [16] "Ability to transform and analyze data though a variety of methods and comfort with basic scripting to support data transformation prototyping, implement application-level scripts, and deal with internal BI data\n"                                                            
## [17] "Ability to read, analyze, and interpret technical documentation, and to generate internal process and training documentation\n"                                                                                                                                                  
## [18] "SQL, minor programming skills, and/or exposure to genomics data, Spotfire, Pipeline Pilot, or Dotmatics applications are a plus\n"                                                                                                                                               
## [19] "Strong customer service skills\n"                                                                                                                                                                                                                                                
## [20] "Excellent communication skills\n"                                                                                                                                                                                                                                                
## [21] "Problem solving skills\n"                                                                                                                                                                                                                                                        
## [22] "Effective working both independently and within a team environment"
# Posting 4: download the page, then take the title from the <h1> element and
# the description text from the paragraphs inside .job-details.
job4 <- read_html("https://careers.regeneron.com/job/REGEA0026304106/Data-Scientist-supporting-Immune-Oncology-efforts-?jobsource=indeed&utm_source=indeed&utm_medium=indeed&utm_campaign=indeed-feed")
job4.title <- html_text(html_nodes(job4, "h1"))
job4.skills <- html_text(html_nodes(job4, ".job-details p"))
job4.title
## [1] "Data Scientist supporting Immune-Oncology efforts!"
job4.skills
## [1] "Known for its scientific and operational excellence, Regeneron is a leading science-based biopharmaceutical company that discovers, invents, develops, manufactures, and commercializes medicines for the treatment of serious medical conditions. Regeneron commercializes medicines for eye diseases, high LDL-cholesterol, atopic dermatitis and a rare inflammatory condition and has product candidates in development in other areas of high unmet medical need, including rheumatoid arthritis, asthma, pain, cancer and infectious diseases.We are seeking a creative and self-motivated Scientist to join the Molecular Profiling team. This position plays an important role in analyzing proprietary and public profiling data to advance our immune-oncology (I/O) targets from discovery to clinical applications, and to identify I/O clinical biomarkers and strategies that help drive our development decisions.  Responsibilities include, but are not limited to:• Collaborate with scientists from the I/O focus therapeutic area and clinical departments to design profiling experiments and draft data analysis plans• Analyze of proprietary (clinical and preclinical) and public data to support Regeneron I/O targets and clinical pipeline; this including target identification, target advancement, biomarker discovery, toxicology studies, and clinical profiling data• Prepare clear, concise and easy-to-understand presentations and documentations for collaborators, senior management and government agencies• Implement and develop analysis pipelines using cutting-edge algorithms and statistically sound methodologiesThis position requires a PhD in bioinformatics, quantitative sciences, or other related disciplines with at least 3-5 years of post-PhD experience. 
Additional requirements include:• Expertise in computational and Molecular Biology• Proficient in R/MatLab and Python/Perl• Experience in analyzing I/O NGS (RNA-seq and exome-seq) and single cell sequencing data• Experience in analyzing cancer profiling data in public the domain such as TCGA• Experience in modeling (e.g., machine learning) and network building (e.g. gene regulation networks); Keen interest and determination to be at the cutting-edge of algorithms and model building• Excellent communication and presentation skills; Team spirit and commitment to share and collaborateLevel will be commensurate with experienceThis is an opportunity to join our select team that is already leading the way in the Pharmaceutical/Biotech industry. Apply today and learn more about Regeneron’s unwavering commitment to combining good science & good business.To all agencies: Please, no phone calls or emails to any employee of Regeneron about this opening. All resumes submitted by search firms/employment agencies to any employee at Regeneron via-email, the internet or in any form and/or method will be deemed the sole property of Regeneron, unless such search firms/employment agencies were engaged by Regeneron for this position and a valid agreement with Regeneron is in place. In the event a candidate who was submitted outside of the Regeneron agency engagement process is hired, no fee or payment of any kind will be paid.Regeneron is an equal opportunity employer and all qualified applicants will receive consideration for employment without regard to race, color, religion, sex, national origin, sexual orientation, gender identity, disability status, protected veteran status, or any other characteristic protected by law."
## [2] "Requisition Number: 10066BR"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
## [3] "To access this job from another computer, email this information to yourself or a friend."
# Posting 5: download the page, then read the title from #label-job-title and
# the text of each item in the selected bullet list.
job5 <- read_html("https://jobs.jnj.com/jobs/1575161128/2-Senior-Data-Scientist-Medical-Safety?lang=en-us&src=JB-10281")
job5.title <- html_text(html_nodes(job5, "#label-job-title"))
job5.skills <- html_text(html_nodes(job5, "ul:nth-child(12) li"))
job5.title
## [1] "Senior Data Scientist, Medical Safety"
job5.skills
##  [1] "Master degree required in Applied Mathematics, Computer Science/Informatics, Epidemiology, Statistics or related discipline with 5 years of overall experience or a PhD."        
##  [2] "Experience applying a minimum of two software languages, such as PL/SQL, R, JMP, Python or other related software language to provide data-driven analytical solutions required."
##  [3] "Advanced knowledge of building custom or web-based applications using JMP, Java, JSON required."                                                                                 
##  [4] "Experience with relational databases (ex: Oracle Databases, MySQL, Microsoft Access, etc.) required."                                                                            
##  [5] "Experience with systems development life cycle (SDLC) and computer system validation within a GxP environment is highly preferred."                                              
##  [6] "Experience with business intelligence/ visualization tools such as Qlik, Tableau or Spotfire is preferred. "                                                                     
##  [7] "Experience with spontaneous adverse events and working knowledge of safety surveillance is preferred."                                                                           
##  [8] "Prior training/experience with natural language processing and machine learning is preferred."                                                                                   
##  [9] "Excellent written and communications skills to report back the findings in a clear, structured manner are required."                                                             
## [10] "Willingness to work in an exploratory environment, handling non-standard tasks with minimal supervision required."                                                               
## [11] "This position is required to be based in Fort Washington, PA or Skillman, NJ."

The group created a shared document listing the top skills from each industry; it is read in here as a CSV file.

# Load the group's cleaned skills sheet from its local CSV export.
data <- read.csv(file = "/Users/christinakasman/Desktop/Clean Data - Sheet1.csv")

A SQL table was then created from the CSV data.

library(rmarkdown)
library(RMySQL)
## Loading required package: DBI
# Pull the pharmaceutical rows of the Project3 table out of MySQL.
drv <- dbDriver("MySQL")
# `con` is used below but is never created in this script. Open it here when it
# does not already exist. Credentials are read from environment variables so
# that no password has to be stored in the file.
# NOTE(review): the MYSQL_* variable names are a convention chosen here --
# adjust them to match the local setup.
if (!exists("con")) {
  con <- dbConnect(
    drv,
    dbname = Sys.getenv("MYSQL_DBNAME"),
    host = Sys.getenv("MYSQL_HOST", "localhost"),
    user = Sys.getenv("MYSQL_USER"),
    password = Sys.getenv("MYSQL_PASSWORD")
  )
}
dbListFields(con, "Project3")
## [1] "Industry"  "Skillset"  "SkillType"
query <- "SELECT * FROM Project3 WHERE Industry = 'Pharmaceutical';"
Project3 <- dbGetQuery(con, query)
library(MASS)
# Count how often each skill type occurs among the pharmaceutical postings and
# keep the skills that occur more than once.
#
# The earlier version joined the raw skill vector to its frequency table with
# cbind(). The two have different lengths, so the counts were recycled onto
# unrelated skills (the source of the "not a multiple of vector length"
# warning) and were coerced to character. Converting the table itself to a
# data frame keeps every skill on the same row as its own numeric count.
#
# NOTE: the printed table that follows in this document was produced before
# this fix and will differ when the script is re-run.
skills <- trimws(Project3$SkillType)  # strip the trailing "\r" seen in the data
skills.freq <- table(skills)
freq1 <- as.data.frame(skills.freq, responseName = "skills.freq",
                       stringsAsFactors = FALSE)
newdata <- freq1[freq1$skills.freq > 1, ]
newdata
##                             skills skills.freq
## 1                            SQL\r           2
## 5             Technical Teaching\r           5
## 18                          HTML\r           3
## 21            Good Communication\r           3
## 26       Customer Service Skills\r           2
## 27            Good Communication\r           3
## 28                             R\r           2
## 32 Build Machine Learning Models\r           2
## 36                           JMP\r           5
library(ggplot2)
# Bar chart of the skills kept in `newdata`, one bar per skill with its height
# and fill taken from the frequency column.
# Columns are named bare inside aes() so ggplot2 resolves them in `newdata`;
# writing `newdata$...` there bypasses the data argument and is discouraged by
# ggplot2.
ggplot(newdata, aes(x = skills, y = skills.freq, fill = skills.freq)) +
  geom_bar(stat = "identity") +
  xlab("Skills") +
  ylab("Frequency") +
  theme(legend.position = "none",
        # Tilt the skill names so long labels do not overlap.
        axis.text.x = element_text(angle = 65, hjust = 1)) +
  ggtitle("Top 10 Data Science Skills in Pharmaceutical Industry")