This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(rvest)
## Warning: package 'rvest' was built under R version 4.1.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v stringr 1.4.0
## v tidyr 1.1.3 v forcats 0.5.1
## v readr 1.4.0
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
# Download and parse an HTML page with read_html(), then print the parsed
# document as a quick check that the request returned a page.
skills_page <- rvest::read_html(
  "https://brainstation.io/career-guides/data-science-resume-examples#what-skills-should-you-put-on-a-data-science-resume"
)
skills_page
## {html_document}
## <html lang="en-US">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="career-guide-template-default single single-career-guide pos ...
# Collect the text of every <li> nested in the <ul> elements matched by each
# CSS selector. html_elements() is used throughout: html_nodes() is its
# superseded alias, and mixing the two in one pipeline was inconsistent.
# NOTE(review): the nth-child positions depend on the page layout at scrape
# time; re-check them if the page changes.
skills_set1 <- skills_page %>%
  html_elements("ul:nth-child(4)") %>%
  html_elements("li") %>%
  html_text2()
skills_set2 <- skills_page %>%
  html_elements("ul:nth-child(6)") %>%
  html_elements("li") %>%
  html_text2()
# Preview items 2 to 19 of the first list.
skills_set1[2:19]
## [1] "Data Analytics" "Data Science"
## [3] "Python" "Cybersecurity"
## [5] "Data analysis" "Data wrangling"
## [7] "Data modeling" "Statistics"
## [9] "Data visualization" "Programming"
## [11] "Quantitative analysis" "Machine learning"
## [13] "Machine learning models" "Data mining"
## [15] "Debugging" "Hypothesis testing"
## [17] "A/B tests" "Regression"
# Preview items 2 to 34 of the second scraped list.
skills_set2[2:34]
## [1] "Web Development" "R" "Python" "C"
## [5] "C++" "C#" "HTML" "Java"
## [9] "JavaScript" "PHP" "SAS" "SQL"
## [13] "Scala" "MATLAB" "SQL Server" "NoSQL"
## [17] "Hadoop" "OpenRefine" "TensorFlow" "Cloudera"
## [21] "Tableau" "Microsoft Excel" "Octave" "Spark"
## [25] "PowerBI" "Plotly" "Bokeh" "Matplotlib"
## [29] "Seaborn" "Keras" "Pytorch" "AWS"
## [33] "Hive"
# Print every item scraped into the first list, one per line. The output
# below shows skill names mixed with unrelated list items from the page.
writeLines(skills_set1)
## Data Courses
## Data Analytics
## Data Science
## Python
## Cybersecurity
## Data analysis
## Data wrangling
## Data modeling
## Statistics
## Data visualization
## Programming
## Quantitative analysis
## Machine learning
## Machine learning models
## Data mining
## Debugging
## Hypothesis testing
## A/B tests
## Regression
## Research the company, the role, and relevant data skills
## Reference resume templates and samples to build a resume outline
## Add relevant education experience, work experience and data projects to the correct section of your resume
## Highlight experience with machine learning and data tools
## Craft concise bullet points using the action verb + task + result format for each experience, emphasizing data-driven successes
## Have a trusted peer proofread your Data Scientist resume for grammar and spelling to make sure your experience is professionally presented
## Work on projects in a collaborative setting
## Take advantage of our flexible plans and scholarships
## Get access to VIP events and workshops
# Keep only the skill entries from each scraped list and strip surrounding
# whitespace. trimws() is vectorised and always returns a character vector,
# so slicing once replaces the index loops that grew the result one element
# at a time and re-wrapped each value in as.character().
# NOTE(review): the index ranges are hard-coded against the scraped output
# shown above; re-check them if the source page changes.
vector1 <- trimws(skills_set1[6:19], which = "both")
vector2 <- trimws(skills_set2[3:34], which = "both")
vector1
## [1] "Data analysis" "Data wrangling"
## [3] "Data modeling" "Statistics"
## [5] "Data visualization" "Programming"
## [7] "Quantitative analysis" "Machine learning"
## [9] "Machine learning models" "Data mining"
## [11] "Debugging" "Hypothesis testing"
## [13] "A/B tests" "Regression"
# Display the trimmed entries taken from the second scraped list.
vector2
## [1] "R" "Python" "C" "C++"
## [5] "C#" "HTML" "Java" "JavaScript"
## [9] "PHP" "SAS" "SQL" "Scala"
## [13] "MATLAB" "SQL Server" "NoSQL" "Hadoop"
## [17] "OpenRefine" "TensorFlow" "Cloudera" "Tableau"
## [21] "Microsoft Excel" "Octave" "Spark" "PowerBI"
## [25] "Plotly" "Bokeh" "Matplotlib" "Seaborn"
## [29] "Keras" "Pytorch" "AWS" "Hive"
# Download the Zety resume guide and collect the text of links that sit in a
# paragraph inside a first-child table cell. html_elements() replaces the
# superseded html_nodes() alias.
# NOTE(review): the selector and the 3:9 range depend on the page layout at
# scrape time; re-check them if the page changes.
softskills_page <- read_html("https://zety.com/blog/data-scientist-resume-example")
skills_set3 <- softskills_page %>%
  html_elements("td:nth-child(1) > p > a") %>%
  html_text2()
# Preview items 3 to 9 of the scraped link texts.
skills_set3[3:9]
## [1] "Soft Skills" "Communication" "Collaboration"
## [4] "Creative Thinking" "Critical Thinking" "Problem Solving"
## [7] "Interpersonal Skills"
# Keep items 3 to 9 of the scraped link texts with surrounding whitespace
# stripped. trimws() is vectorised and returns a character vector, so no
# loop or as.character() wrapper is needed.
# NOTE(review): the first kept item is the "Soft Skills" heading rather than
# a skill; the job-count vectors include a value for it, so it is retained.
vector3 <- trimws(skills_set3[3:9], which = "both")
vector3
## [1] "Soft Skills" "Communication" "Collaboration"
## [4] "Creative Thinking" "Critical Thinking" "Problem Solving"
## [7] "Interpersonal Skills"
# Concatenate the three skill vectors, in order, into a single vector that
# becomes the skill_title column of the final table.
title_vector <- c(vector1, vector2, vector3)
# Hard-coded Indeed posting counts, paired by position with title_vector
# when the data frame is built. Values stay comma-formatted strings.
# Stray leading spaces in two entries ("41,727" and "37") were removed so
# that no padded values end up in the exported CSV; the redundant empty
# initialisation before the real assignment was dropped.
indeed_jobs <- c(
  "46,915", "2,660", "28,328", "2,327", "41,727",
  "9,444", "55,983", "75,076", "9,240", "7,400",
  "49,922", "3,706", "72,630", "5,368", "12,265",
  "85,808", "1,578", "2,277", "102,663", "14,354",
  "3,059", "135,162", "16,826", "19,130", "197,041",
  "16,762", "17,577", "57,397", "34,179", "22,990",
  "37", "9,539", "2,472", "63,047", "868,715",
  "117", "7,211", "18,860", "525", "169",
  "935", "305", "3,025", "5,472", "120,397",
  "45", "43,238", "61,936", "721,924", "114,820",
  "316,760", "1,135,873", "1,278,165"
)
# Hard-coded LinkedIn posting counts, paired by position with title_vector
# when the data frame is built. Values stay comma-formatted strings.
linkedin_jobs <- c(
  "580,822", "3,329", "135,443", "216,875", "119,686",
  "510,656", "53,452", "186,554", "21,617", "112,298",
  "98,477", "3,452", "5,587", "60,006", "175,906",
  "898,552", "315,245", "771,041", "197,040", "812,683",
  "867,841", "800,863", "69,694", "29,898", "1,196,586",
  "27,913", "163,737", "263,855", "64,198", "70,445",
  "19", "16,685", "4,266", "156,649", "360,852",
  "627", "67,961", "60,296", "638", "180",
  "1,146", "375", "14,710", "9,081", "234,937",
  "38,932", "24,555", "4,571,013", "663,670", "45,321",
  "543,385", "1,915,666", "1,056,725"
)
# Hard-coded SimplyHired posting counts, paired by position with
# title_vector when the data frame is built. Values stay comma-formatted
# strings.
simplyhired_jobs <- c(
  "27,586", "1,233", "13,702", "44,456", "37,409",
  "87,187", "33,666", "34,298", "21,562", "11,574",
  "26,019", "2,237", "6,353", "14,068", "61,044",
  "99,354", "193,995", "40,280", "47,118", "34,411",
  "69,038", "63,254", "9,906", "10,482", "104,201",
  "6,411", "11,880", "31,166", "14,219", "9,865",
  "34", "4,243", "1,038", "32,330", "497,140",
  "33", "23,610", "7,756", "345", "96",
  "630", "219", "1,391", "3,158", "66,652",
  "5,255", "22,061", "41,747", "372,740", "60,142",
  "157,402", "548,264", "547,690"
)
# Hard-coded ZipRecruiter posting counts, paired by position with
# title_vector when the data frame is built. Values stay comma-formatted
# strings.
ziprecruiter_jobs <- c(
  "1,685,359", "434", "623,648", "288,970", "170,268",
  "2,047,037", "261,054", "317,549", "316,611", "116,528",
  "111,577", "2,220,185", "3,460", "22,966", "20,558",
  "457,959", "54,686", "291,574", "243,188", "78,526",
  "292,644", "202,555", "45,434", "39,475", "269,004",
  "27,009", "29,676", "130,988", "42,417", "36,291",
  "14", "14,633", "487", "89,157", "488,192",
  "13", "70,727", "17,436", "1,508", "154",
  "110,348", "168", "213,055", "13,278", "188,515",
  "17,266", "39,603", "5,547,167", "839,660", "145,056",
  "355,482", "34,357", "1,159,558"
)
# Combine the skill names and the per-site posting counts column-wise into
# one table (one row per skill), then display it.
new_df <- data.frame(
  skill_title = title_vector,
  indeed = indeed_jobs,
  linkedin = linkedin_jobs,
  simplyhired = simplyhired_jobs,
  ziprecruiter = ziprecruiter_jobs
)
new_df
## skill_title indeed linkedin simplyhired ziprecruiter
## 1 Data analysis 46,915 580,822 27,586 1,685,359
## 2 Data wrangling 2,660 3,329 1,233 434
## 3 Data modeling 28,328 135,443 13,702 623,648
## 4 Statistics 2,327 216,875 44,456 288,970
## 5 Data visualization 41,727 119,686 37,409 170,268
## 6 Programming 9,444 510,656 87,187 2,047,037
## 7 Quantitative analysis 55,983 53,452 33,666 261,054
## 8 Machine learning 75,076 186,554 34,298 317,549
## 9 Machine learning models 9,240 21,617 21,562 316,611
## 10 Data mining 7,400 112,298 11,574 116,528
## 11 Debugging 49,922 98,477 26,019 111,577
## 12 Hypothesis testing 3,706 3,452 2,237 2,220,185
## 13 A/B tests 72,630 5,587 6,353 3,460
## 14 Regression 5,368 60,006 14,068 22,966
## 15 R 12,265 175,906 61,044 20,558
## 16 Python 85,808 898,552 99,354 457,959
## 17 C 1,578 315,245 193,995 54,686
## 18 C++ 2,277 771,041 40,280 291,574
## 19 C# 102,663 197,040 47,118 243,188
## 20 HTML 14,354 812,683 34,411 78,526
## 21 Java 3,059 867,841 69,038 292,644
## 22 JavaScript 135,162 800,863 63,254 202,555
## 23 PHP 16,826 69,694 9,906 45,434
## 24 SAS 19,130 29,898 10,482 39,475
## 25 SQL 197,041 1,196,586 104,201 269,004
## 26 Scala 16,762 27,913 6,411 27,009
## 27 MATLAB 17,577 163,737 11,880 29,676
## 28 SQL Server 57,397 263,855 31,166 130,988
## 29 NoSQL 34,179 64,198 14,219 42,417
## 30 Hadoop 22,990 70,445 9,865 36,291
## 31 OpenRefine 37 19 34 14
## 32 TensorFlow 9,539 16,685 4,243 14,633
## 33 Cloudera 2,472 4,266 1,038 487
## 34 Tableau 63,047 156,649 32,330 89,157
## 35 Microsoft Excel 868,715 360,852 497,140 488,192
## 36 Octave 117 627 33 13
## 37 Spark 7,211 67,961 23,610 70,727
## 38 PowerBI 18,860 60,296 7,756 17,436
## 39 Plotly 525 638 345 1,508
## 40 Bokeh 169 180 96 154
## 41 Matplotlib 935 1,146 630 110,348
## 42 Seaborn 305 375 219 168
## 43 Keras 3,025 14,710 1,391 213,055
## 44 Pytorch 5,472 9,081 3,158 13,278
## 45 AWS 120,397 234,937 66,652 188,515
## 46 Hive 45 38,932 5,255 17,266
## 47 Soft Skills 43,238 24,555 22,061 39,603
## 48 Communication 61,936 4,571,013 41,747 5,547,167
## 49 Collaboration 721,924 663,670 372,740 839,660
## 50 Creative Thinking 114,820 45,321 60,142 145,056
## 51 Critical Thinking 316,760 543,385 157,402 355,482
## 52 Problem Solving 1,135,873 1,915,666 548,264 34,357
## 53 Interpersonal Skills 1,278,165 1,056,725 547,690 1,159,558
# Look up the current working directory, show it, and save the table there
# as skills_cleaned_data.csv.
path <- getwd()
path
## [1] "C:/Users/Uzma/CUNY_SPS_PROJECTS/Data_607_Project3"
write.csv(
  x = new_df,
  file = file.path(path, "skills_cleaned_data.csv")
)