library(plyr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(RColorBrewer)
library(wordcloud)
Load the Indeed dataset
# Read the Indeed job-search results from the CSV hosted on GitHub.
indeed_url <- url("https://raw.githubusercontent.com/mlforsachid/Data607-Project3/master/Data/Indeed_Job_Search_Results.csv")
read_indeed_url <- read.csv(
  indeed_url,
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
head(read_indeed_url)

# Drop the fifth column (the search radius).
read_indeed_url <- read_indeed_url[, -5]
head(read_indeed_url)

# Give the six remaining columns descriptive names.
colnames(read_indeed_url) <- c(
  "Source", "Job Title", "Skills", "City", "url", "Count"
)
head(read_indeed_url)
Remove NA values from the dataset
# Keep only the rows that have no missing values.
read_indeed_url <- na.omit(read_indeed_url)
head(read_indeed_url)

# Sum the posting counts per skill; the grouping column is labelled
# "Category" in the result.
indeed_skillaggr <- with(
  read_indeed_url,
  aggregate(Count, by = list(Category = Skills), FUN = sum)
)
indeed_skillaggr
Jobs by skill
# Total posting count per skill, largest total first.
skills_count <- read_indeed_url %>%
  group_by(Skills) %>%
  summarise(Total = sum(Count)) %>%
  arrange(desc(Total))
skills_count
Job openings by skill and city
# Total posting count for every (skill, city) pair, largest total first.
# summarise() only peels off the last grouping variable, so without the
# explicit ungroup() the result would stay grouped by Skills and every later
# verb applied to it would silently run per skill.
skills_city <- read_indeed_url %>%
  group_by(Skills, City) %>%
  summarise(Total = sum(Count)) %>%
  ungroup() %>%
  arrange(desc(Total))
skills_city
# Ten skills with the highest totals. skills_count is sorted by descending
# Total, so the largest counts sit at the head of the table; tail() would
# select the ten smallest instead.
plots_top <- head(skills_count, 10)
# ggplot2 alternative:
# ggplot(plots_top, aes(Skills, Total)) + geom_bar(stat = "identity")
darkcols <- brewer.pal(8, "Dark2")
# Bar labels; named skill_labels so the base function names() is not shadowed.
skill_labels <- as.character(plots_top$Skills)
barplot(
  plots_top$Total,
  main = "Indeed Counts",
  horiz = TRUE,
  names.arg = skill_labels,
  las = 1,
  col = darkcols,
  cex.axis = 0.5,
  cex.names = 0.5
)
# Ten (skill, city) pairs with the highest totals. head() returns at most ten
# rows, whereas indexing with 1:10 pads the result with all-NA rows when
# skills_city has fewer than ten rows.
top10_skills <- head(skills_city, 10)
ggplot(
  top10_skills,
  aes(x = Skills, y = Total, colour = City, size = Total)
) +
  geom_point()
library(wordcloud)
# Word cloud of skills sized by total posting count, most frequent first.
# With the default scale of c(4, 0.5) the run recorded a warning that
# "Machine Learning" could not be fit on the page and was not plotted, so the
# upper size bound is lowered to leave room for the largest words.
# NOTE(review): whether every word fits still depends on the size of the
# graphics device; lower the first scale value further if the warning recurs.
wordcloud(
  skills_count$Skills,
  skills_count$Total,
  scale = c(3, 0.5),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)
## Warning in wordcloud(skills_count$Skills, skills_count$Total, random.order
## = FALSE, : Machine Learning could not be fit on page. It will not be
## plotted.
The top five skills are oriented towards Big Data; Python and R are the words most used in the dataset. Our findings show that a few skills underlie the field of data science. Mathematics and statistics should certainly play a role in our study, but they are not part of our dataset. We cannot draw definite conclusions about in-demand skills, since some other skills are missing from the dataset.