Load required packages

library(plyr)

library(ggplot2)

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

library(RColorBrewer)
library(wordcloud)

Step 1

Load the Indeed dataset

# Open a connection to the Indeed job-search results CSV hosted on GitHub.
indeed_url <- url(
  "https://raw.githubusercontent.com/mlforsachid/Data607-Project3/master/Data/Indeed_Job_Search_Results.csv"
)

# Read the comma-separated file (first row is the header); text columns are
# loaded as factors.
read_indeed_url <- read.csv(
  indeed_url,
  header = TRUE,
  stringsAsFactors = TRUE,
  sep = ","
)

# Preview the first rows of the raw data.
head(read_indeed_url)

Step 2: Tidying the data

# Drop the fifth column (the search radius).
read_indeed_url <- read_indeed_url[-5]
head(read_indeed_url)

# Give the six remaining columns descriptive names.
read_indeed_url <- setNames(
  read_indeed_url,
  c("Source", "Job Title", "Skills", "City", "url", "Count")
)
head(read_indeed_url)

Removing NA values from the dataset

# Keep only the rows that have no missing value in any column.
read_indeed_url <- na.omit(read_indeed_url)

# Preview the cleaned data.
head(read_indeed_url)

Aggregate the skills to get the frequency

# Sum the Count column for each distinct skill; the grouping column of the
# result is labelled "Category".
indeed_skillaggr <- with(
  read_indeed_url,
  aggregate(Count, by = list(Category = Skills), FUN = sum)
)
indeed_skillaggr

Jobs by skills

# Total job count per skill, ordered from the largest total to the smallest.
skills_count <- group_by(read_indeed_url, Skills)
skills_count <- summarise(skills_count, Total = sum(Count))
skills_count <- arrange(skills_count, desc(Total))

skills_count

Job openings by skill and city

# Total job count for every (skill, city) pair, ordered from the largest
# total to the smallest.
skills_city <- read_indeed_url %>%
  group_by(Skills, City) %>%
  summarise(Total = sum(Count)) %>%
  # summarise() only removes the last grouping level, so the result would
  # still be grouped by Skills; drop that grouping so later steps operate on
  # a plain, ungrouped table.
  ungroup() %>%
  arrange(desc(Total))

skills_city

Step 3: Data Visualization

# skills_count is sorted by Total in descending order, so the most frequent
# skills are its FIRST rows. Use head() (not tail(), which would select the
# ten least frequent skills) to pick the top ten.
plots_top <- head(skills_count, 10)

#ggplot(plots_top, aes(plots_top$Skills, plots_top$Total)) + geom_bar(stat="identity")

# Horizontal bar chart of the selected skills, one bar per skill, labelled
# with the skill name. The 8-colour "Dark2" palette is recycled when there
# are more than eight bars.
darkcols <- brewer.pal(8, "Dark2")
names <- plots_top$Skills
barplot(
  plots_top$Total,
  main = "Indeed Counts",
  horiz = TRUE,
  names.arg = names,
  las = 1,
  col = darkcols,
  cex.axis = 0.5,
  cex.names = 0.5
)

# First ten rows of the skill-by-city totals.
top10_skills <- skills_city[seq_len(10), ]

# Point chart: skill on the x axis, total on the y axis; point colour encodes
# the city and point size encodes the total.
ggplot(
  data = top10_skills,
  mapping = aes(x = Skills, y = Total, colour = City, size = Total)
) +
  geom_point()

library(wordcloud)

# Word cloud of skills weighted by their total counts. With
# random.order = FALSE the words are placed in decreasing order of frequency.
wordcloud(
  words = skills_count$Skills,
  freq = skills_count$Total,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

## Warning in wordcloud(skills_count$Skills, skills_count$Total, random.order
## = FALSE, : Machine Learning could not be fit on page. It will not be
## plotted.

Step 4: Conclusion

The top 5 skills are oriented towards Big Data; Python and R are the words most used in the dataset. Our findings show that a few skills underlie the field of data science. Certainly mathematics and statistics should play a role in our study, but they are not part of our dataset. We cannot draw definitive conclusions about in-demand skills, since some other skills are missing from the dataset.

Project3_Analysis

Hantz Angrand

October 22, 2018