Project 3

library(rvest)

## Loading required package: xml2

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(httr)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.1     v readr   1.3.1
## v tibble  2.1.3     v purrr   0.3.2
## v tidyr   0.8.3     v stringr 1.4.0
## v ggplot2 3.2.1     v forcats 0.4.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
## x purrr::pluck()          masks rvest::pluck()

library(ggplot2)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:httr':
## 
##     config

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(tidyr)

Data Science Skills

Data Science General Skills

# Load the raw job-description text. The raw.githubusercontent.com endpoint
# serves the file contents themselves; the github.com/.../blob/... address is
# an HTML page whose markup would be tokenised together with the text.
text_string <- read_file(
  "https://raw.githubusercontent.com/AlainKuiete/DATA607/master/datascienceskills"
)

# Skill keywords to look for in the text.
# - Every entry is lower-case, because the text is lower-cased before the
#   tokens are matched against this vector.
# - Empty strings are left out so that empty tokens are never counted as a
#   "skill".
# - "hadoop" and "software" are spelled out correctly so they can match.
search_word <- c(
  "python", "machine", "learning", "analytic", "team", "teams", "r", "sql",
  "communication", "analytical", "java", "scala", "hadoop", "spark",
  "computer", "science", "engineering", "solving", "tableau", "software",
  "statistical", "modeling", "quantitative", "customer", "statistics",
  "management", "models", "predictive", "problems"
)


# Word-frequency table for the skill keywords:
#   1. lower-case the text and strip punctuation,
#   2. split on whitespace into one token per row,
#   3. count the tokens and keep only those listed in `search_word`,
#   4. add each keyword's share of all keyword hits, most frequent first.
Results <- tibble(text = text_string) %>%
  mutate(
    text = tolower(text),
    text = str_remove_all(text, "[[:punct:]]"),
    tokens = str_split(text, "\\s+")
  ) %>%
  # Name the list-column explicitly: calling unnest() without a column is
  # deprecated and warns in tidyr >= 1.0.
  unnest(tokens) %>%
  # Leading/trailing whitespace produces empty tokens; never count those.
  filter(tokens != "") %>%
  count(tokens) %>%
  filter(tokens %in% search_word) %>%
  mutate(freq = n / sum(n)) %>%
  arrange(desc(n))

The table with words and frequencies.

# Turn `tokens` into a factor whose levels run from the most to the least
# frequent keyword (so plots keep that order), then print the table.
Results <- Results %>%
  mutate(tokens = reorder(tokens, -n))
Results

# Polar bar chart of the keyword frequencies: one wedge per keyword,
# no legend and no axis titles.
Results %>%
  ggplot(aes(x = tokens, y = freq, fill = tokens)) +
  geom_col(width = 0.75, colour = "black", size = 1) +
  coord_polar(theta = "x") +
  xlab("") +
  ylab("") +
  ggtitle("Word Frequency") +
  theme(legend.position = "none") +
  labs(x = NULL, y = NULL)

# Interactive version of the keyword-frequency bar chart: build the static
# ggplot, convert it with plotly, then hide the plotly logo and the
# "edit chart" link.
plotly::ggplotly(
  ggplot2::ggplot(Results, aes(x = tokens, y = freq, fill = tokens)) +
    geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
    ggtitle("Word Frequency") +
    # NULL removes both axis titles (the earlier xlab("")/ylab("") calls were
    # overridden by this anyway).
    labs(x = NULL, y = NULL) +
    theme(
      legend.position = "none",
      plot.subtitle = element_text(vjust = 1),
      plot.caption = element_text(vjust = 1),
      axis.text.x = element_text(angle = 90),
      panel.background = element_rect(fill = "honeydew1"),
      plot.background = element_rect(fill = "antiquewhite")
    )
) %>%
  # Qualify config(): both plotly and httr export a config(), and which one
  # is found otherwise depends on the order the packages were attached.
  # FALSE is used instead of F, which is an ordinary, reassignable variable.
  plotly::config(displaylogo = FALSE, showLink = FALSE)

Downloading data

# Fetch the general-skills CSV and inspect its column types.
dsg <- read.csv(
  file = "https://raw.githubusercontent.com/AlainKuiete/DATA607/master/ds_general_skills_revised.csv"
)
str(dsg)

## 'data.frame':    30 obs. of  5 variables:
##  $ Keyword    : Factor w/ 27 levels "","\"data scientist\" \"[keyword]\"",..: 14 8 25 11 10 15 27 7 13 20 ...
##  $ LinkedIn   : Factor w/ 23 levels "","1,212","1,310",..: 16 15 12 11 8 6 5 4 3 2 ...
##  $ Indeed     : Factor w/ 23 levels "","1,125","1,413",..: 11 12 8 7 6 4 3 2 23 22 ...
##  $ SimplyHired: Factor w/ 23 levels "","1,153","1,497",..: 10 11 9 8 4 3 2 23 22 21 ...
##  $ Monster    : Factor w/ 23 levels "","1,207","1,815",..: 7 10 8 4 6 3 2 22 18 17 ...

Subsetting the Data Science general skills

# Keep the first 15 rows of the general-skills table (all columns).
dskg <- dsg[seq_len(15), ]

reshaping my dataframe

# Give the columns short names, then strip the thousands separators from the
# four job-site columns and convert them to numbers.
colnames(dskg) <- c("D.Skills", "LinkedIn", "Indeed", "SimplyHired", "Monster")
dskg[c("LinkedIn", "Indeed", "SimplyHired", "Monster")] <- lapply(
  dskg[c("LinkedIn", "Indeed", "SimplyHired", "Monster")],
  function(counts) as.numeric(gsub(",", "", counts))
)

Computation

# Per-site totals over the selected skills, ignoring missing counts.
s.dskg <- dskg %>%
  summarise(
    sL = sum(LinkedIn, na.rm = TRUE),
    sI = sum(Indeed, na.rm = TRUE),
    sS = sum(SimplyHired, na.rm = TRUE),
    sM = sum(Monster, na.rm = TRUE)
  )

# Grand total across the four sites.
tsg <- sum(s.dskg)

# Each skill's combined count across the four sites as a fraction of the
# grand total; only the skill name and that fraction are kept.
g.skills <- dskg %>%
  transmute(
    D.Skills,
    pct = (LinkedIn + Indeed + SimplyHired + Monster) / tsg
  )
g.skills

Visualisation

Plotting with horizontal column

library(ggplot2)

# Horizontal bar chart of each general skill's share of the total count.
# Skills are ordered by share, so after the flip the largest is at the top.
g.skills %>%
  filter(pct > 0.001) %>%
  mutate(phrases = reorder(D.Skills, pct)) %>%
  ggplot(aes(x = phrases, y = pct, fill = phrases)) +
  geom_col(show.legend = FALSE) +
  labs(
    title = "General Skills for Data Science",
    x = NULL,
    y = "Overall percentage"
  ) +
  coord_flip()

Analysis, Machine Learning, Statistics, Computer Science and Communication are the general skills required of data scientists.

Data Science Software Skills

Downloading data

# Fetch the job-listing software-skills CSV.
dss <- read.csv(
  file = "https://raw.githubusercontent.com/AlainKuiete/DATA607/master/ds_job_listing_software.csv"
)

Subsetting the Data Science software skills

# Keep the first 30 rows and the first 5 columns, then inspect the types.
dsk <- dss[seq_len(30), seq_len(5)]
str(dsk)

## 'data.frame':    30 obs. of  5 variables:
##  $ Keyword    : Factor w/ 42 levels "","\"data scientist\" \"[keyword]\"",..: 31 33 39 37 15 18 34 40 17 35 ...
##  $ LinkedIn   : Factor w/ 40 levels "","1,024","1,040",..: 33 27 17 9 8 7 6 5 4 3 ...
##  $ Indeed     : Factor w/ 37 levels "","1,012","1,134",..: 23 22 11 5 6 4 3 2 37 34 ...
##  $ SimplyHired: Factor w/ 39 levels "","1,059","1,164",..: 16 15 14 4 3 2 39 38 37 35 ...
##  $ Monster    : Factor w/ 40 levels "","1,002","1,062",..: 18 17 5 3 4 2 39 36 35 33 ...

reshaping my dataframe

# Give the columns short names, then strip the thousands separators from the
# four job-site columns and convert them to numbers.
colnames(dsk) <- c("D.Skills", "LinkedIn", "Indeed", "SimplyHired", "Monster")
dsk[c("LinkedIn", "Indeed", "SimplyHired", "Monster")] <- lapply(
  dsk[c("LinkedIn", "Indeed", "SimplyHired", "Monster")],
  function(counts) as.numeric(gsub(",", "", counts))
)

Computation

# Per-site totals over the selected software skills. na.rm = TRUE matches the
# general-skills totals computed earlier in this file and keeps a single
# missing count from turning every total (and so every percentage) into NA.
s.dsks <- summarise(
  dsk,
  sL = sum(LinkedIn, na.rm = TRUE),
  sI = sum(Indeed, na.rm = TRUE),
  sS = sum(SimplyHired, na.rm = TRUE),
  sM = sum(Monster, na.rm = TRUE)
)

# Grand total across the four sites.
ts <- sum(s.dsks)

# Each skill's combined count across the four sites as a fraction of the
# grand total.
skills <- dsk %>%
  mutate(pct = (LinkedIn + Indeed + SimplyHired + Monster) / ts) %>%
  select(D.Skills, pct)

# The ten skills with the largest share. Sorting first means the selection
# does not depend on the row order of the CSV.
skill <- skills %>%
  arrange(desc(pct)) %>%
  head(10)
skill

Visualisation

# Order the skills from the largest to the smallest share so the bars
# descend from left to right.
skill <- mutate(skill, D.Skills = reorder(D.Skills, -pct))

# Bar chart of the selected software skills, one coloured bar per skill.
# The chart title previously misspelled "Software".
ggplot(skill, aes(x = D.Skills, y = pct, fill = D.Skills)) +
  geom_col(position = "dodge") +
  labs(
    title = "Top 10 Software Skills for Data Science",
    x = "Computer Skills",
    y = "Overall percentage"
  )

Python and R are the software skills most often recommended for data scientists.

Reference: "The Most in Demand Skills for Data Scientists" by Jeff Hale, Towards Data Science

# Reference link: Google job-search results for the query "data sciences jobs".
# NOTE(review): url1 is only assigned here; nothing in the visible code reads it.
url1 <- "https://www.google.com/search?safe=active&rlz=1C1EJFC_enUS834US834&sxsrf=ACYBGNSOYZtHkxHTh41kYm0q3DH8TnMuHw:1572152661833&ei=VSW1Xa3EMsHk_Ab9_afoAw&q=data+sciences+jobs&oq=data+sciences+jobs&gs_l=psy-ab.3..0i10l10.41861.55760..56318...1.0..1.3382.6568.0j1j7-1j1j1......0....1..gws-wiz.....10..35i362i39.rCNbLJ7wjdA&uact=5&ibp=htl;jobs&sa=X&ved=2ahUKEwjMi9q01bvlAhWCm-AKHSUxD-wQp4wCMAB6BAgFEAE#fpstate=tldetail&htidocid=n1I1nIYsv9Heqo3hAAAAAA%3D%3D&htivrt=jobs"

counting words with R

# Reference link: Codementor article on counting words with R.
# NOTE(review): url2 is only assigned here; nothing in the visible code reads it.
url2 <- "https://www.codementor.io/jhwatts2010/counting-words-with-r-ds35hzgmj"

Project 3

Alain T Kuiete

10/17/2019

Data Science Skills

Data Science General Skills

The table with words and frequencies.

Downloading data

Subsetting the Data Science general skills

reshaping my dataframe

Computation

Visualisation

Plotting with horizontal column

Analysis, Machine Learning, Statistics, Computer Science and Communication are the general skills required of data scientists.

Data Science Software Skills

Downloading data

Subsetting the Data Science software skills

reshaping my dataframe

Computation

Visualisation

Python and R are the software skills most often recommended for data scientists.

Reference: "The Most in Demand Skills for Data Scientists" by Jeff Hale, Towards Data Science

counting words with R