Consolidated Analysis

# Packages this analysis depends on:
#   rmarkdown - make dynamic documents
#   knitr     - run R code chunks
#   ggplot2   - plotting
#   DT        - interactive HTML tables
#   tidyverse - tidy universe (dplyr verbs and the %>% pipe)
required_packages <- c("rmarkdown", "knitr", "ggplot2", "DT", "tidyverse")

# Install only what is missing, so re-running the analysis does not download
# and reinstall every package each time.
is_installed <- vapply(
  required_packages, requireNamespace, logical(1), quietly = TRUE
)
if (any(!is_installed)) {
  install.packages(required_packages[!is_installed])
}

# Attach the packages whose functions are called unqualified below
# (datatable(), ggplot(), %>%, filter(), group_by(), ...).
library(ggplot2)
library(DT)
library(tidyverse)

Read Consolidated Data from our Repository

# Pull the consolidated skill counts straight from the project repository.
ConsData <- read.csv(
  "https://raw.githubusercontent.com/DATA607/Project3/master/dataframe.csv",
  stringsAsFactors = FALSE
)
# Drop the C++ and analytics rows: the data recorded against them is spurious.
# Rows with a missing skill are dropped as well, which is what a `!=` filter
# does with NA.
ConsData <- ConsData %>%
  filter(
    !is.na(skill),
    !(skill %in% c("C++", "(C++)", "analytics", "Analytics"))
  )
# Values of `count` that are not integers become NA here, which produces:
## Warning: NAs introduced by coercion
ConsData$count <- as.integer(ConsData$count)
datatable(head(ConsData, n = 3))

A sample of where our data comes from

# Number of records contributed by each location.
RecordCounts <- summarise(group_by(ConsData, location), Number = n())
datatable(RecordCounts)

What are Top Skills

We first check which individual skills are most in demand for data scientists. Later on, we look at groupings of skills.

# Total demand per skill, highest first.
# `count` contains NAs (the integer coercion of ConsData$count warns about
# them); na.rm = TRUE stops a single unparseable row from turning a skill's
# whole total into NA.
TopSkillsData <- ConsData %>%
  group_by(skill) %>%
  summarise(Count = sum(count, na.rm = TRUE)) %>%
  arrange(desc(Count))

datatable(TopSkillsData)

A chart to represent the skills

# Keep the first ten rows of the ranked table, then re-sort ascending so the
# horizontal bar chart draws the largest bar at the top.
TopSkillsData <- TopSkillsData %>%
  head(10) %>%
  arrange(Count)
barplot(
  height = TopSkillsData$Count,
  names.arg = TopSkillsData$skill,
  main = "Top 10 Skills",
  horiz = TRUE,
  las = 1,
  cex.axis = 0.5,
  cex.names = 0.5
)

What are the Top Skill Groups

Download the Skill Groups Meta File

# The skill-to-group mapping file has no header row, so the column names are
# attached after reading.
SkillGroups <- setNames(
  read.csv(
    "https://raw.githubusercontent.com/DATA607/Project3/master/SkillGroupings.txt",
    stringsAsFactors = FALSE,
    header = FALSE
  ),
  c("Skill", "Group")
)
datatable(SkillGroups)

Join with Skill Counts data

The C++ and analytics entries are spurious data for us, so they were already filtered out when the consolidated data was read.
# Per-skill totals. na.rm = TRUE because `count` holds NAs left over from the
# integer coercion; without it any skill with one bad row sums to NA and then
# poisons its whole group total.
# No sort is needed at this stage: the grouped summary below orders the result.
TopSkillsGroupData <- ConsData %>%
  group_by(skill) %>%
  summarise(Count = sum(count, na.rm = TRUE))

# Attach each skill's group (skills without a group mapping are dropped by the
# inner join), then total the counts per group, largest group first.
TopSkillsGroupData <- inner_join(
  TopSkillsGroupData, SkillGroups,
  by = c("skill" = "Skill")
)
TopSkillsGroupData <- TopSkillsGroupData %>%
  group_by(Group) %>%
  summarise(GroupedCount = sum(Count, na.rm = TRUE)) %>%
  arrange(desc(GroupedCount))
datatable(TopSkillsGroupData)

A chart to represent Top Skill Groups

# Re-sort ascending so the vertical bars run from the smallest group to the
# largest.
TopSkillsGroupData <- arrange(TopSkillsGroupData, GroupedCount)
barplot(
  height = TopSkillsGroupData$GroupedCount,
  names.arg = TopSkillsGroupData$Group,
  main = "Top Skills by Groups",
  horiz = FALSE,
  las = 1,
  cex.axis = 0.5,
  cex.names = 0.5
)

Some Analytical questions

Which cities value programming most for data scientists?

# Tag every record with its skill group, then total the demand for skills in
# the "Programming" group per location, highest first.
TopSkillsGroupData1 <- inner_join(
  ConsData, SkillGroups,
  by = c("skill" = "Skill")
)
TopSkillsGroupDataProgByLocation <- TopSkillsGroupData1 %>%
  filter(Group == "Programming") %>%
  group_by(location) %>%
  # na.rm: `count` holds NAs from the integer coercion; one of them would
  # otherwise turn a location's whole total into NA.
  summarise(Count = sum(count, na.rm = TRUE)) %>%
  arrange(desc(Count))
datatable(TopSkillsGroupDataProgByLocation)

If I am a programmer, what is the next best skill I can learn?

# Tag every record with its skill group and keep only the skills outside the
# "Programming" group.
TopSkillsGroupData2 <- inner_join(
  ConsData, SkillGroups,
  by = c("skill" = "Skill")
)
TopSkillsGroupData2 <- TopSkillsGroupData2 %>%
  filter(Group != "Programming")

# Rank the remaining skills by total demand.
TopNonProgSkills <- TopSkillsGroupData2 %>%
  group_by(skill) %>%
  # na.rm: `count` holds NAs from the integer coercion; one of them would
  # otherwise turn a skill's whole total into NA.
  summarise(Count = sum(count, na.rm = TRUE)) %>%
  arrange(desc(Count))
datatable(TopNonProgSkills)

NY vs SF vs London vs Sydney: Comparison of the Top Skill

# Return the single most-demanded skill for one location.
#   data: data frame with `location`, `skill` and `count` columns
#   loc:  exact `location` value to keep
# Gives a one-row table with columns `skill` and `totalCount`
# (zero rows when no record matches `loc`).
top_skill_for_location <- function(data, loc) {
  data %>%
    filter(location == loc) %>%
    group_by(skill) %>%
    # na.rm: `count` holds NAs from the integer coercion; one of them would
    # otherwise turn a skill's whole total into NA.
    summarise(totalCount = sum(count, na.rm = TRUE)) %>%
    arrange(desc(totalCount)) %>%
    head(1)
}

# Same computation for each of the four cities being compared.
aggDataLocNY <- top_skill_for_location(ConsData, "New York, NY")
aggDataLocSF <- top_skill_for_location(ConsData, "San Francisco, CA")
aggDataLocLdn <- top_skill_for_location(ConsData, "London")
aggDataLocSyd <- top_skill_for_location(ConsData, "Sydney")


# Stack the four one-row results into a single table with a `city` column.
# Dodging only works between bars of the same layer, so the cities must share
# one layer; as four separate layers, bars for cities with the same top skill
# are drawn on top of each other.
top_skill_by_city <- bind_rows(
  NY = aggDataLocNY,
  SF = aggDataLocSF,
  Ldn = aggDataLocLdn,
  Syd = aggDataLocSyd,
  .id = "city"
)

ggplot(top_skill_by_city, aes(x = skill, y = totalCount, fill = city)) +
  geom_col(position = position_dodge()) +
  # Colours are named so each city gets its intended colour no matter how the
  # fill levels are sorted, and `breaks` fixes the legend order. Positional
  # labels would be matched against the alphabetically sorted levels
  # (Ldn, NY, SF, Syd) and mislabel the legend.
  # The last colour needs its leading "#" to be a valid hex colour.
  scale_fill_manual(
    values = c(
      NY = "#999999",
      SF = "#E69F00",
      Ldn = "#A69F55",
      Syd = "#444444"
    ),
    breaks = c("NY", "SF", "Ldn", "Syd")
  )

#+ scale_fill_brewer(palette="Paired")