suppressPackageStartupMessages(library('XML'))
suppressPackageStartupMessages(library('rvest'))
library(stringr)
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(RMySQL)
## Loading required package: DBI

Load the data into R:

# Read the cleaned job-skills CSV from the project's GitHub repository.
# check.names = FALSE keeps headers that contain spaces (e.g. "Skill Set")
# exactly as written; stringsAsFactors = FALSE keeps text as character.
# as_tibble() replaces the deprecated tbl_df() wrapper.
data_url <- "https://raw.githubusercontent.com/ntlrs/Data-607-Project-3/master/cleandata"
jobdata <- as_tibble(
  read.csv(data_url, stringsAsFactors = FALSE, check.names = FALSE)
)
head(jobdata)
## # A tibble: 6 x 3
##    Industry           `Skill Set`        `Skill Type`
##       <chr>                 <chr>               <chr>
## 1 Marketing            Soft Skill         Team Player
## 2 Marketing Programming/Technical Database Management
## 3 Marketing Programming/Technical                Java
## 4 Marketing Programming/Technical                SPSS
## 5 Marketing Programming/Technical                 SAS
## 6 Marketing Programming/Technical               STATA

Filter the data to the Marketing industry and count how often each skill appears:

# Keep only the Marketing rows, then tally each `Skill Type`;
# sort = TRUE orders the tally from most to least frequent.
mrktdata_count <- count(
  filter(jobdata, Industry == "Marketing"),
  `Skill Type`,
  sort = TRUE
)
print(mrktdata_count)
## # A tibble: 47 x 2
##                        `Skill Type`     n
##                               <chr> <int>
##  1              Predictive Anaytics     8
##  2                              SQL     8
##  3                           Python     7
##  4 Build Statisical Learning Models     6
##  5               Good Communication     6
##  6                                R     6
##  7                          Tableau     5
##  8                           Hadoop     4
##  9                              SAS     4
## 10                      Team Player     4
## # ... with 37 more rows
# Keep the ten most frequent skills. mrktdata_count is already sorted by
# descending n, so the first rows are the top skills. head() is used instead
# of [1:10, ] because out-of-range row indices would pad the result with
# all-NA rows whenever fewer than ten skills are present.
marketingdata <- head(mrktdata_count, 10)
marketingdata
## # A tibble: 10 x 2
##                        `Skill Type`     n
##                               <chr> <int>
##  1              Predictive Anaytics     8
##  2                              SQL     8
##  3                           Python     7
##  4 Build Statisical Learning Models     6
##  5               Good Communication     6
##  6                                R     6
##  7                          Tableau     5
##  8                           Hadoop     4
##  9                              SAS     4
## 10                      Team Player     4

Plot the top 10 skills for marketing:

# Bar chart of the top marketing skills: one bar per skill, height = count.
# Columns are referenced by bare name inside aes() so ggplot2 resolves them
# from the `data` argument; using marketingdata$... there bypasses data
# masking and breaks as soon as the plot is faceted or given different data.
ggplot(marketingdata, aes(x = `Skill Type`, y = n, fill = `Skill Type`)) +
  geom_col() +  # same as geom_bar(stat = "identity"): heights come from y
  labs(
    x = "Skills",
    y = "Frequency",
    title = "Top Skills of Data Scientist in Marketing"
  ) +
  theme(
    legend.position = "none",  # fill duplicates the x axis, so hide its legend
    axis.text.x = element_text(angle = 45, hjust = 1)  # keep long labels legible
  )

Conclusion:

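The ten most frequent skills in the marketing postings include both soft skills (Good Communication, Team Player) and technical skills (e.g. SQL, Python, R, Tableau, Hadoop, SAS), so a good mix of soft and technical skills is required to be a data scientist in marketing.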