R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(rvest)
## Warning: package 'rvest' was built under R version 4.1.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1
## v readr   1.4.0
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
# Download and parse the BrainStation resume-guide page with read_html(),
# then show a summary of the parsed document.
skills_url <- "https://brainstation.io/career-guides/data-science-resume-examples#what-skills-should-you-put-on-a-data-science-resume"
skills_page <- read_html(skills_url)
print(skills_page)
## {html_document}
## <html lang="en-US">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="career-guide-template-default single single-career-guide pos ...

Performing Web Scraping to fetch the skills using the node selector

# Collect the text of every <li> found inside the <ul> elements matched by
# each positional selector. html_elements() is used for both steps;
# html_nodes() is the superseded name for the same operation in rvest >= 1.0,
# which this file already requires (html_text2()).
# NOTE(review): the nth-child selectors depend on the page layout at scrape
# time -- re-check them if the page is restructured.
skills_set1 <- skills_page %>%
  html_elements("ul:nth-child(4)") %>%
  html_elements("li") %>%
  html_text2()
skills_set2 <- skills_page %>%
  html_elements("ul:nth-child(6)") %>%
  html_elements("li") %>%
  html_text2()

# Preview entries 2 through 19 of the first scraped list.
skills_set1[2:19]
##  [1] "Data Analytics"          "Data Science"           
##  [3] "Python"                  "Cybersecurity"          
##  [5] "Data analysis"           "Data wrangling"         
##  [7] "Data modeling"           "Statistics"             
##  [9] "Data visualization"      "Programming"            
## [11] "Quantitative analysis"   "Machine learning"       
## [13] "Machine learning models" "Data mining"            
## [15] "Debugging"               "Hypothesis testing"     
## [17] "A/B tests"               "Regression"
# Preview entries 2 through 34 of the second scraped list.
print(skills_set2[2:34])
##  [1] "Web Development" "R"               "Python"          "C"              
##  [5] "C++"             "C#"              "HTML"            "Java"           
##  [9] "JavaScript"      "PHP"             "SAS"             "SQL"            
## [13] "Scala"           "MATLAB"          "SQL Server"      "NoSQL"          
## [17] "Hadoop"          "OpenRefine"      "TensorFlow"      "Cloudera"       
## [21] "Tableau"         "Microsoft Excel" "Octave"          "Spark"          
## [25] "PowerBI"         "Plotly"          "Bokeh"           "Matplotlib"     
## [29] "Seaborn"         "Keras"           "Pytorch"         "AWS"            
## [33] "Hive"
# Print every scraped entry of skills_set1, one per line.
writeLines(text = skills_set1)
## Data Courses
## Data Analytics
## Data Science
## Python
## Cybersecurity
## Data analysis
## Data wrangling
## Data modeling
## Statistics
## Data visualization
## Programming
## Quantitative analysis
## Machine learning
## Machine learning models
## Data mining
## Debugging
## Hypothesis testing
## A/B tests
## Regression
## Research the company, the role, and relevant data skills
## Reference resume templates and samples to build a resume outline
## Add relevant education experience, work experience and data projects to the correct section of your resume
## Highlight experience with machine learning and data tools
## Craft concise bullet points using the action verb + task + result format for each experience, emphasizing data-driven successes
## Have a trusted peer proofread your Data Scientist resume for grammar and spelling to make sure your experience is professionally presented
## Work on projects in a collaborative setting
## Take advantage of our flexible plans and scholarships
## Get access to VIP events and workshops

Trimming the extra spaces from the words in skills_set1

# Keep entries 6 through 19 of skills_set1 and strip leading/trailing
# whitespace from each. trimws() is vectorised, so the whole slice is cleaned
# in one call instead of growing a vector element by element in a loop.
vector1 <- trimws(skills_set1[6:19], which = "both")

Trimming the extra spaces from the words in skills_set2

# Keep entries 3 through 34 of skills_set2 and strip leading/trailing
# whitespace from each. trimws() is vectorised, so the whole slice is cleaned
# in one call instead of growing a vector element by element in a loop.
vector2 <- trimws(skills_set2[3:34], which = "both")

# Show the cleaned first skill vector.
vector1
##  [1] "Data analysis"           "Data wrangling"         
##  [3] "Data modeling"           "Statistics"             
##  [5] "Data visualization"      "Programming"            
##  [7] "Quantitative analysis"   "Machine learning"       
##  [9] "Machine learning models" "Data mining"            
## [11] "Debugging"               "Hypothesis testing"     
## [13] "A/B tests"               "Regression"
# Show the cleaned second skill vector.
print(vector2)
##  [1] "R"               "Python"          "C"               "C++"            
##  [5] "C#"              "HTML"            "Java"            "JavaScript"     
##  [9] "PHP"             "SAS"             "SQL"             "Scala"          
## [13] "MATLAB"          "SQL Server"      "NoSQL"           "Hadoop"         
## [17] "OpenRefine"      "TensorFlow"      "Cloudera"        "Tableau"        
## [21] "Microsoft Excel" "Octave"          "Spark"           "PowerBI"        
## [25] "Plotly"          "Bokeh"           "Matplotlib"      "Seaborn"        
## [29] "Keras"           "Pytorch"         "AWS"             "Hive"

Next, read the second HTML page (the source of the soft skills) with read_html():

# Download and parse the Zety data-scientist resume article.
softskills_url <- "https://zety.com/blog/data-scientist-resume-example"
softskills_page <- read_html(softskills_url)

Scraping using node selector

# Collect the text of the links (<a>) that sit in a <p> directly inside a
# first-column table cell. html_elements() replaces the superseded
# html_nodes() so the file uses one rvest selector function throughout.
# NOTE(review): the selector depends on the page layout at scrape time.
skills_set3 <- softskills_page %>%
  html_elements("td:nth-child(1) > p > a") %>%
  html_text2()

# Preview entries 3 through 9 of the scraped link texts.
skills_set3[3:9]
## [1] "Soft Skills"          "Communication"        "Collaboration"       
## [4] "Creative Thinking"    "Critical Thinking"    "Problem Solving"     
## [7] "Interpersonal Skills"

Trimming the extra spaces from the words in skills_set3

# Keep entries 3 through 9 of skills_set3 and strip leading/trailing
# whitespace from each. trimws() is vectorised, so the whole slice is cleaned
# in one call instead of growing a vector element by element in a loop.
vector3 <- trimws(skills_set3[3:9], which = "both")

# Show the cleaned third skill vector.
vector3
## [1] "Soft Skills"          "Communication"        "Collaboration"       
## [4] "Creative Thinking"    "Critical Thinking"    "Problem Solving"     
## [7] "Interpersonal Skills"

Combining the vectors

# Concatenate the three cleaned vectors into a single skill list, in the
# order vector1, vector2, vector3. The hard-coded count vectors further down
# are paired with this list by position when the data frame is built.
title_vector <- c(vector1, vector2, vector3)

# Hard-coded per-skill counts for each job site (presumably the number of
# job postings found for the skill -- TODO confirm the source and date).
# Values are kept as formatted strings with thousands separators. Each vector
# is positional: element i belongs to the i-th skill of the combined skill
# list, so the order must not be changed. Ten values per line.
#
# The redundant `x = c()` initialisations were dropped (each vector was
# overwritten immediately) and stray leading spaces inside two indeed values
# (" 41,727", " 37") were removed.

# Indeed
indeed_jobs <- c(
  "46,915", "2,660", "28,328", "2,327", "41,727", "9,444", "55,983", "75,076", "9,240", "7,400",
  "49,922", "3,706", "72,630", "5,368", "12,265", "85,808", "1,578", "2,277", "102,663", "14,354",
  "3,059", "135,162", "16,826", "19,130", "197,041", "16,762", "17,577", "57,397", "34,179", "22,990",
  "37", "9,539", "2,472", "63,047", "868,715", "117", "7,211", "18,860", "525", "169",
  "935", "305", "3,025", "5,472", "120,397", "45", "43,238", "61,936", "721,924", "114,820",
  "316,760", "1,135,873", "1,278,165"
)

# LinkedIn
linkedin_jobs <- c(
  "580,822", "3,329", "135,443", "216,875", "119,686", "510,656", "53,452", "186,554", "21,617", "112,298",
  "98,477", "3,452", "5,587", "60,006", "175,906", "898,552", "315,245", "771,041", "197,040", "812,683",
  "867,841", "800,863", "69,694", "29,898", "1,196,586", "27,913", "163,737", "263,855", "64,198", "70,445",
  "19", "16,685", "4,266", "156,649", "360,852", "627", "67,961", "60,296", "638", "180",
  "1,146", "375", "14,710", "9,081", "234,937", "38,932", "24,555", "4,571,013", "663,670", "45,321",
  "543,385", "1,915,666", "1,056,725"
)

# SimplyHired
simplyhired_jobs <- c(
  "27,586", "1,233", "13,702", "44,456", "37,409", "87,187", "33,666", "34,298", "21,562", "11,574",
  "26,019", "2,237", "6,353", "14,068", "61,044", "99,354", "193,995", "40,280", "47,118", "34,411",
  "69,038", "63,254", "9,906", "10,482", "104,201", "6,411", "11,880", "31,166", "14,219", "9,865",
  "34", "4,243", "1,038", "32,330", "497,140", "33", "23,610", "7,756", "345", "96",
  "630", "219", "1,391", "3,158", "66,652", "5,255", "22,061", "41,747", "372,740", "60,142",
  "157,402", "548,264", "547,690"
)

# ZipRecruiter
ziprecruiter_jobs <- c(
  "1,685,359", "434", "623,648", "288,970", "170,268", "2,047,037", "261,054", "317,549", "316,611", "116,528",
  "111,577", "2,220,185", "3,460", "22,966", "20,558", "457,959", "54,686", "291,574", "243,188", "78,526",
  "292,644", "202,555", "45,434", "39,475", "269,004", "27,009", "29,676", "130,988", "42,417", "36,291",
  "14", "14,633", "487", "89,157", "488,192", "13", "70,727", "17,436", "1,508", "154",
  "110,348", "168", "213,055", "13,278", "188,515", "17,266", "39,603", "5,547,167", "839,660", "145,056",
  "355,482", "34,357", "1,159,558"
)

Creating a new data frame to store the vectors

# Assemble one row per skill: the skill name followed by its count on each
# of the four job sites. All five inputs are matched up by position.
new_df <- data.frame(
  skill_title  = title_vector,
  indeed       = indeed_jobs,
  linkedin     = linkedin_jobs,
  simplyhired  = simplyhired_jobs,
  ziprecruiter = ziprecruiter_jobs
)

# Display the assembled table.
new_df
##                skill_title    indeed  linkedin simplyhired ziprecruiter
## 1            Data analysis    46,915   580,822      27,586    1,685,359
## 2           Data wrangling     2,660     3,329       1,233          434
## 3            Data modeling    28,328   135,443      13,702      623,648
## 4               Statistics     2,327   216,875      44,456      288,970
## 5       Data visualization    41,727   119,686      37,409      170,268
## 6              Programming     9,444   510,656      87,187    2,047,037
## 7    Quantitative analysis    55,983    53,452      33,666      261,054
## 8         Machine learning    75,076   186,554      34,298      317,549
## 9  Machine learning models     9,240    21,617      21,562      316,611
## 10             Data mining     7,400   112,298      11,574      116,528
## 11               Debugging    49,922    98,477      26,019      111,577
## 12      Hypothesis testing     3,706     3,452       2,237    2,220,185
## 13               A/B tests    72,630     5,587       6,353        3,460
## 14              Regression     5,368    60,006      14,068       22,966
## 15                       R    12,265   175,906      61,044       20,558
## 16                  Python    85,808   898,552      99,354      457,959
## 17                       C     1,578   315,245     193,995       54,686
## 18                     C++     2,277   771,041      40,280      291,574
## 19                      C#   102,663   197,040      47,118      243,188
## 20                    HTML    14,354   812,683      34,411       78,526
## 21                    Java     3,059   867,841      69,038      292,644
## 22              JavaScript   135,162   800,863      63,254      202,555
## 23                     PHP    16,826    69,694       9,906       45,434
## 24                     SAS    19,130    29,898      10,482       39,475
## 25                     SQL   197,041 1,196,586     104,201      269,004
## 26                   Scala    16,762    27,913       6,411       27,009
## 27                  MATLAB    17,577   163,737      11,880       29,676
## 28              SQL Server    57,397   263,855      31,166      130,988
## 29                   NoSQL    34,179    64,198      14,219       42,417
## 30                  Hadoop    22,990    70,445       9,865       36,291
## 31              OpenRefine        37        19          34           14
## 32              TensorFlow     9,539    16,685       4,243       14,633
## 33                Cloudera     2,472     4,266       1,038          487
## 34                 Tableau    63,047   156,649      32,330       89,157
## 35         Microsoft Excel   868,715   360,852     497,140      488,192
## 36                  Octave       117       627          33           13
## 37                   Spark     7,211    67,961      23,610       70,727
## 38                 PowerBI    18,860    60,296       7,756       17,436
## 39                  Plotly       525       638         345        1,508
## 40                   Bokeh       169       180          96          154
## 41              Matplotlib       935     1,146         630      110,348
## 42                 Seaborn       305       375         219          168
## 43                   Keras     3,025    14,710       1,391      213,055
## 44                 Pytorch     5,472     9,081       3,158       13,278
## 45                     AWS   120,397   234,937      66,652      188,515
## 46                    Hive        45    38,932       5,255       17,266
## 47             Soft Skills    43,238    24,555      22,061       39,603
## 48           Communication    61,936 4,571,013      41,747    5,547,167
## 49           Collaboration   721,924   663,670     372,740      839,660
## 50       Creative Thinking   114,820    45,321      60,142      145,056
## 51       Critical Thinking   316,760   543,385     157,402      355,482
## 52         Problem Solving 1,135,873 1,915,666     548,264       34,357
## 53    Interpersonal Skills 1,278,165 1,056,725     547,690    1,159,558

Checking the working directory path and exporting data frame into a CSV file

# Record the current working directory and display it; the CSV export
# below is written into this directory.
path <- getwd()
print(path)
## [1] "C:/Users/Uzma/CUNY_SPS_PROJECTS/Data_607_Project3"

Export file as csv to working directory.

# Write the data frame to skills_cleaned_data.csv inside the directory held
# in `path`. write.csv() is left at its defaults, so row names are written
# as the first column.
csv_file <- file.path(path, "skills_cleaned_data.csv")
write.csv(new_df, file = csv_file)