I have tried to compare Indian and US Kagglers who took part in the Kaggle survey, finding their similarities and differences! I have also tried to predict their salaries using different models.
The notebook flows as follows:
1. Data Cleaning
2. EDA
3. Model Building
4. Model Evaluation (to be updated)
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(tidyverse)
library(dplyr)
library(grid)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggthemes)
library(RColorBrewer)
library(cowplot)
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggthemes':
##
## theme_map
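# Extend the 12-colour 'Set3' brewer palette to 25 colours by interpolation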
nb.cols <- 25
mycolors <- colorRampPalette(brewer.pal(12, "Set3"))(nb.cols)
kaggle_data <- read.csv("multiple_choice_responses.csv")
dim(kaggle_data)
## [1] 19717 246
We have 19,717 rows (observations) and 246 columns! The number of columns is very high because most of the questions are multi-select, and each option gets its own column!
There is a lot of missing data across columns. There are two types of missing data:
1. Columns for questions where only one option could be selected (age, education, designation, etc.): here a missing value means the Kaggler didn't answer the question.
2. Columns for questions where multiple options could be selected (activities at work, sources to learn Data Science, etc.): here a missing value means the Kaggler did not select that option.
Both of these possibilities are obvious, so we won't do a separate missing-data evaluation!
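For reference, a quick way to see how many blank responses each column carries (a throwaway check, not used in the analysis) could look like this; the multi-select option columns naturally show many blanks because unselected options are left empty:
#count blank ("") responses per column (illustration only)
blank_counts <- sort(colSums(kaggle_data == "", na.rm = TRUE), decreasing = TRUE)
head(blank_counts)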
DATA CLEANING
data<- kaggle_data
#Cleaning Education(Q4) column
data$Q4<-gsub("[^0-9A-Za-z///']", "", data$Q4)
data$Q4<-gsub("Mastersdegree", "Masters", data$Q4)
data$Q4<-gsub("Bachelorsdegree", "Bachelors", data$Q4)
data$Q4<-gsub("Doctoraldegree", "Doctoral", data$Q4)
data$Q4<-gsub("Noformaleducationpasthighschool", "High School", data$Q4)
data$Q4<-gsub("Somecollege/universitystudywithoutearningabachelorsdegree", "No Bachelor degree but attended college", data$Q4)
data$Q4<-gsub("Professionaldegree", "Professional Degree", data$Q4)
data$Q4<-gsub("Iprefernottoanswer", "Prefer Not to answer", data$Q4)
#Cleaning ML status of the company (Q8) column
data$Q8<-gsub("[^0-9A-Za-z///']", "", data$Q8)
data$Q8<-gsub("WehavewellestablishedMLmethodsiemodelsinproductionformorethan2years", "Model in production > 2 years", data$Q8)
data$Q8<-gsub("Idonotknow", "No idea", data$Q8)
data$Q8<-gsub("NowedonotuseMLmethods", "No use of ML models", data$Q8)
data$Q8<-gsub("WeareexploringMLmethodsandmayonedayputamodelintoproduction", "Exploring ML models", data$Q8)
data$Q8<-gsub("WerecentlystartedusingMLmethodsiemodelsinproductionforlessthan2years", "Model in production < 2 years", data$Q8)
data$Q8<-gsub("WeuseMLmethodsforgeneratinginsightsbutdonotputworkingmodelsintoproduction", "Only Insights no production", data$Q8)
#Cleaning Salary(Q10) column
data$Q10 <- gsub("\\-", "p", data$Q10)
data$Q10 <- gsub("\\,", "", data$Q10)
data$Q10 <- gsub("\\$", "", data$Q10)
data$Q10 <- gsub(">", "", data$Q10)
data$Q10 <- gsub(" ", "", data$Q10)
#As there are many classes in the salary column, prediction could be difficult with so many classes! Hence reducing them to 4 main classes:
# 1.less than 25k 2.between 25k to 50k 3.between 50k to 100k 4.more than 100k
data$Q10<-gsub("1000p1999", "<25k", data$Q10)
data$Q10<-gsub("2000p2999", "<25k", data$Q10)
data$Q10<-gsub("3000p3999", "<25k", data$Q10)
data$Q10<-gsub("4000p4999", "<25k", data$Q10)
data$Q10<-gsub("5000p7499", "<25k", data$Q10)
data$Q10<-gsub("7500p9999", "<25k", data$Q10)
data$Q10<-gsub("10000p14999", "<25k", data$Q10)
data$Q10<-gsub("15000p19999", "<25k", data$Q10)
data$Q10<-gsub("20000p24999", "<25k", data$Q10)
data$Q10<-gsub("25000p29999", "25k to 50k", data$Q10)
data$Q10<-gsub("30000p39999", "25k to 50k", data$Q10)
data$Q10<-gsub("40000p49999", "25k to 50k", data$Q10)
data$Q10<-gsub("50000p59999", "50k to 100k", data$Q10)
data$Q10<-gsub("60000p69999", "50k to 100k", data$Q10)
data$Q10<-gsub("70000p79999", "50k to 100k", data$Q10)
data$Q10<-gsub("80000p89999", "50k to 100k", data$Q10)
data$Q10<-gsub("90000p99999", "50k to 100k", data$Q10)
data$Q10<-gsub("100000p124999", ">100k", data$Q10)
data$Q10<-gsub("125000p149999", ">100k", data$Q10)
data$Q10<-gsub("150000p199999", ">100k", data$Q10)
data$Q10<-gsub("200000p249999", ">100k", data$Q10)
data$Q10<-gsub("250000p299999", ">100k", data$Q10)
data$Q10<-gsub("300000p500000", ">100k", data$Q10)
data$Q10<-gsub("500000", ">100k", data$Q10)
data$Q10<-gsub("0p999", "<25k", data$Q10)
ORDERING FACTOR VARIABLES
As the entire dataset is categorical (except the survey duration), we convert the columns to factors and order the ordinal ones!
data$Q1 <- factor(data$Q1, levels = c("18-21", "22-24", "25-29",
"30-34","35-39","40-44","45-49",
"50-54","55-59","60-69","70+"),ordered=TRUE)
data$Q2 <- factor( data$Q2 , ordered = FALSE )
data$Q3 <- factor( data$Q3 , ordered = FALSE )
data$Q4 <- factor(data$Q4, levels = c("Prefer Not to answer", "High School", "No Bachelor degree but attended college",
"Bachelors","Professional Degree","Masters","Doctoral"),ordered=TRUE)
data$Q5 <- factor(data$Q5,ordered=TRUE)
data$Q6 <- factor(data$Q6, levels = c("0-49 employees", "50-249 employees", "250-999 employees",
"1000-9,999 employees","> 10,000 employees"),ordered=TRUE)
data$Q7 <- factor(data$Q7, levels = c("0", "1-2", "3-4",
"5-9","10-14","15-19","20+"),ordered=TRUE)
data$Q11 <- factor(data$Q11, levels = c("$0 (USD)", "$1-$99","$100-$999","$1000-$9,999", "$10,000-$99,999",
"> $100,000 ($USD)"),ordered=TRUE)
data$Q15 <- factor(data$Q15, levels = c("< 1 years", "1-2 years","3-5 years","5-10 years",
"10-20 years","20+ years")
,ordered=TRUE)
data$Q23 <- factor(data$Q23, levels = c("< 1 years", "1-2 years","2-3 years","3-4 years","4-5 years",
"5-10 years","10-15 years","20+ years") ,ordered=TRUE)
data$Q8<- factor(data$Q8,ordered = FALSE)
salary_levels<- c("<25k",
"25k to 50k",
"50k to 100k",
">100k")
data$Q10<-ordered(data$Q10, levels = salary_levels)
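One practical payoff of making Q10 an ordered factor is that range comparisons and plot ordering come for free; a tiny illustration (not used later):
#flag responders reporting at least 50k; ">=" works because Q10 is ordered
table(data$Q10 >= "50k to 100k", useNA = "ifany")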
EXPLORATORY DATA ANALYSIS
The most important step in any data science project.
country<-data %>% group_by(Q3) %>% summarise(count=(n()/19716)*100) %>%
arrange(desc(count))
country$Q3<-factor(country$Q3,ordered = T,levels = rev(country$Q3))
ggplot(country,aes(Q3,count,fill=Q3)) + geom_bar(stat = "identity",colour = "black", width = 1) + coord_flip() +
labs(title = "Distribution of Kagglers by country",
x = "Country",
y = "% responders") + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 14, colour = "black"), axis.text.x = element_text(size = 5, colour = "black"),
legend.position="none", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0))
Almost 40% of users come from India and the US! I was always curious about how Data Science practices and practitioners differ between these two countries, if they really do! This entire analysis is focused on comparing the Kagglers coming from these two countries!
data %>% group_by(Q2) %>% summarise(count=n() *100 /19716) %>%
ggplot(aes(x=Q2,y=count,fill=Q2))+geom_bar(stat="identity") +
labs(title = "Distribution of Kagglers by sex",
x = "Sex",
y = "% responders") + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 14, colour = "black"), axis.text.x = element_text(size = 10, colour = "black"),
legend.position="none", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust =0))+ scale_fill_manual(values = mycolors)
We see that about 80% of responders are male and around 17%-18% are female. Very few people preferred not to share their gender or chose to self-describe! Let's look at the % of female responders by country and get a glimpse of where India and the US lie.
country_and_sex <- data %>% group_by(Q3,Q2) %>% summarise(count=n()) %>%
spread(Q2,count) %>%
mutate(Female_prop=Female * 100 /(Male+Female)) %>% arrange(Female_prop)
country_and_sex$Q3 <- factor(country_and_sex$Q3,
ordered = T,
levels = country_and_sex$Q3)
ggplot(country_and_sex,aes(Q3,Female_prop,fill=Q3)) + geom_bar(stat = "identity") + coord_flip() + theme(legend.position = "bottom") +
labs(title = "% of Female responders by Countries",
x = "Country",
y = "% Female responders") + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 14, colour = "black"), axis.text.x = element_text(size = 14, colour = "black"),
legend.position="none", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 1))
So out of all the countries, the USA comes in around 14th position and India around 28th! Although the number of responders from most countries is very small, which skews this analysis, we still get a rough picture of how active women are on Kaggle in different countries. In the US it's about 20 females for every 100 males, and for India this number drops to around 16-17. Let's compare these numbers to the global average!
#global female representation
global_female_data<-data %>% group_by(Q2) %>% filter(Q2=="Male"|Q2=="Female") %>%
summarise(count=n() *100 /sum(data$Q2 =="Male" | data$Q2=="Female"))
global_female_pct<- global_female_data[1,2]
global_female_pct<- as.numeric(global_female_pct)
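As a small robustness note, global_female_data[1,2] relies on "Female" sorting first in the grouped summary; an equivalent sketch that names the row explicitly would be:
global_female_pct <- global_female_data %>% filter(Q2 == "Female") %>% pull(count)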
country_and_sex %>% filter(Q3 %in% c("India","United States of America")) %>%
ggplot(aes(Q3,Female_prop,fill=Q3)) + geom_bar(stat = "identity") + theme(legend.position = "bottom") +
labs(title = "% of Female responders by in USA and India",
x = "Country",
y = "% Female responders by country") +
geom_hline(yintercept = global_female_pct, linetype = 2, color = "grey35", size = 1) + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 14, colour = "black"), axis.text.x = element_text(size = 14, colour = "black"),
legend.position="none", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0))+ scale_fill_manual(values = mycolors)
The USA is well above the global % of female responders, whereas India is slightly below the global average!
Let's take a broader view again and compare the countries with a good number of responders, which are mainly: US, India, Brazil, China, Russia, Japan and Germany.
(top_countries<- filter(country,count>2.5))
top_countries_list <- c("India","United States of America","Brazil","Japan","Russia","China","Germany")
country_and_sex %>% filter(Q3 %in% top_countries_list) %>%
ggplot(aes(Q3,Female_prop,fill=Q3)) + geom_bar(stat = "identity") + coord_flip() + theme(legend.position = "bottom") +
labs(title = "% of Female responders at top countries",
x = "Country",
y = "% Female responders by country") +
geom_hline(yintercept = global_female_pct, linetype = 2, color = "grey35", size = 1) +
theme_fivethirtyeight()+
theme(axis.title = element_text(size = 14, colour = "black"), axis.text.x = element_text(size = 14, colour = "black"),
legend.position="none", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0))+ scale_fill_manual(values = mycolors)
We can see that only the USA and Germany have a % of female responders above the global average!
Let's see the education background of responders in both countries!
data %>% filter(Q4!="") %>% filter (Q4!="I prefer not to answer") %>% filter (Q3 %in% c("India","United States of America")) %>%
group_by(Q4,Q3) %>% summarise(count=n()) %>%
ggplot(aes(Q4,count,fill=Q4)) + geom_bar(stat="identity",position="dodge") + theme(legend.position = "bottom") +
labs(title = "Kagglers by degree",
x = "degree",
y = "Kagglers by degree") + facet_wrap(~Q3) + coord_flip() +
theme_fivethirtyeight()+
theme(axis.title = element_text(size = 10, colour = "black"), axis.text.x = element_text(size = 8, colour = "black"),
legend.position="none", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0))+ scale_fill_manual(values = mycolors)
A high number of Indian Kagglers are educated up to the Bachelor's level, whereas a high proportion of US Kagglers hold a Master's degree! Overall, US Kagglers are more educated than Indian Kagglers, and the share of PhDs (Doctoral degree holders) is also higher in the US. Data Science is at its peak in India right now, and many undergrads want to get into these kinds of roles directly, so they showcase their skills and practice on datasets on Kaggle! Let's see if designation, age, and experience with code and ML support this claim!
us_india_data<-data %>% filter(Q3 %in% c("India","United States of America"))
india_us_data<-data %>% filter(Q3 %in% c("India","United States of America"))
mf_us_india_sum_data<-sum(us_india_data$Q2 =="Male" | us_india_data$Q2=="Female")
p1<-us_india_data %>%filter(Q5 != "") %>% group_by(Q2,Q5,Q3) %>% filter(Q2 %in% c("Male","Female")) %>%
summarise(count=n()) %>% arrange(desc(count)) %>% ggplot(aes(reorder(Q5,count),count,fill=Q5)) + geom_bar(stat="identity",position="dodge") + theme(legend.position = "bottom") +
labs(title = "Kagglers by job title",
x = "title",
y = "number of responders") + facet_wrap(~Q3) + coord_flip() +
theme_fivethirtyeight()+
theme(axis.title = element_text(size = 10, colour = "black"), axis.text.x = element_text(size = 8, colour = "black"),
legend.position="none", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0.5)) + scale_fill_manual(values = mycolors)
p2<-data %>% group_by(Q1,Q3) %>% filter (Q3 %in% c("India","United States of America"))%>% filter(Q1!="") %>%
summarise(count=n()) %>% ggplot(aes(Q1,count,fill=Q1)) + geom_bar(stat="identity",position="dodge") + theme(legend.position = "bottom") +
labs(title = "Kagglers by age",
x = "Age",
y = "number of responders") + facet_wrap(~Q3) + coord_flip() + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 10, colour = "black"), axis.text.x = element_text(size = 8, colour = "black"),
legend.position="none", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0.5)) + scale_fill_manual(values = mycolors)
india_us_data<- data %>% filter (Q3 %in% c("India","United States of America"))
p3<-india_us_data %>% group_by(Q15,Q3) %>% filter (Q3 %in% c("India","United States of America"))%>% filter(Q15!="") %>%
summarise(count=n()) %>% ggplot(aes(Q15,count,fill=Q15)) + geom_bar(stat="identity",position="dodge") + theme(legend.position = "bottom") +
labs(title = "Experience in coding to analyse data",
x = "Years of experience",
y = "number of responders") + facet_wrap(~Q3) + coord_flip() + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 5, colour = "black"), axis.text.x = element_text(size = 8, colour = "black"),
legend.position="none", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0)) + scale_fill_manual(values = mycolors)
## Warning: Factor `Q15` contains implicit NA, consider using
## `forcats::fct_explicit_na`
p4<-india_us_data %>% group_by(Q23,Q3) %>% filter (Q3 %in% c("India","United States of America"))%>% filter(Q23!="") %>%
summarise(count=n()) %>% ggplot(aes(Q23,count,fill=Q23)) + geom_bar(stat="identity",position="dodge") + theme(legend.position = "bottom") +
labs(title = "Experience of Kagglers ML methods",
x = "Years of experience",
y = "number of responders") + facet_wrap(~Q3) + coord_flip() + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 10, colour = "black"), axis.text.x = element_text(size = 8, colour = "black"),
legend.position="none", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0)) + scale_fill_manual(values = mycolors)
## Warning: Factor `Q23` contains implicit NA, consider using
## `forcats::fct_explicit_na`
grid.arrange(p1, p2,nrow=2)
grid.arrange(p3,p4,nrow=2)
As we can see, a very high number of Indian responders were students, followed by Data Scientists and Software Engineers! Data Scientists use Kaggle to apply state-of-the-art algorithms and master their craft, while many Software Engineers in India are looking to transition into Data Science, it being the 'hottest and sexiest job of the century :p'. In the US, the number of responders who are Data Scientists is almost double that of any category following it! The share of students in the US is much smaller compared to India. The age distribution of Indian Kagglers is skewed with a peak at 18-21, so the majority of Indian Kagglers are very young and inexperienced, whereas most US Kagglers are in the 25-34 age group.
Coding experience for analysing data is again skewed: in India a high proportion of responders have coding experience of '< 1 year' or '1-2 years', whereas a high proportion of Kagglers from the US are well experienced in coding, with the peak around 3-5 years! A very interesting insight here: a good number of highly experienced (more than 10 years) coders from the US are active on Kaggle, whereas this number is minimal in India! Again, where ML methods are concerned, a high proportion of Indian Kagglers have less experience compared to the US. There is also a significant number of Kagglers with 5-10 years of experience in ML methods!
We can thus say with some confidence that the majority of US Kagglers are experienced and highly educated (with Master's and Doctoral degrees) and are looking to stay ahead in the Data Science game by competing and showcasing their skills on Kaggle, whereas most Indian Kagglers are young students who want to break into the field of Data Science!
Another interesting insight to note here: many people with jobs in the data industry, like Data Engineers, Statisticians and Business Analysts, aren't very active on Kaggle! The reason might be that the whole philosophy of a Kaggle competition is based on data cleaning, feature engineering, EDA, modelling and evaluation, and these are not the main tasks of the jobs listed above! Also, the high proportion of Software Engineers tells an interesting story.
Let's quickly compare other attributes like company size, size of the DS team, incorporation of ML methods, and money spent on ML products by the company.
p5<-india_us_data %>% group_by(Q6,Q3) %>% filter (Q3 %in% c("India","United States of America"))%>% filter(Q6!="") %>%
summarise(count=n()) %>% ggplot(aes(Q6,count,fill=Q6)) + geom_bar(stat="identity",position="dodge") + theme(legend.position = "bottom") +
labs(title = "Company size",
x = "Number of employees",
y = "number of responders") + facet_wrap(~Q3) + coord_flip() + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 10, colour = "black"), axis.text.x = element_text(size = 8, colour = "black"),
legend.position="none", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0.5)) + scale_fill_manual(values = mycolors)
## Warning: Factor `Q6` contains implicit NA, consider using
## `forcats::fct_explicit_na`
p6<-india_us_data %>% group_by(Q7,Q3) %>% filter (Q3 %in% c("India","United States of America"))%>% filter(Q7!="") %>%
summarise(count=n()) %>% ggplot(aes(Q7,count,fill=Q7)) + geom_bar(stat="identity",position="dodge") + theme(legend.position = "bottom") +
labs(title = "DS team size",
x = "number of employees",
y = "number of responders") + facet_wrap(~Q3) + coord_flip() + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 10, colour = "black"), axis.text.x = element_text(size = 8, colour = "black"),
legend.position="none", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0.5)) + scale_fill_manual(values = mycolors)
## Warning: Factor `Q7` contains implicit NA, consider using
## `forcats::fct_explicit_na`
p7<-india_us_data %>% group_by(Q8,Q3) %>% filter (Q3 %in% c("India","United States of America"))%>%
filter(Q8!="") %>%
summarise(count=n()) %>% ggplot(aes(Q8,count,fill=Q8)) + geom_bar(stat="identity",position="dodge") + theme(legend.position = "bottom") +
labs(title = "Use of ML models",
x = "title",
y = "number of responders") + facet_wrap(~Q3) + coord_flip() + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 10, colour = "black"), axis.text.x = element_text(size = 8, colour = "black"),
legend.position="none", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0.5)) + scale_fill_manual(values = mycolors)
p8<-india_us_data %>% group_by(Q11,Q3) %>% filter (Q3 %in% c("India","United States of America"))%>%
filter(Q11!="") %>%
summarise(count=n()) %>% ggplot(aes(Q11,count,fill=Q11)) + geom_bar(stat="identity",position="dodge") + theme(legend.position = "bottom") +
labs(title = "Money spent on ML products at work",
x = "Amount in USD",
y = "number of responders") + facet_wrap(~Q3) + coord_flip() + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 10, colour = "black"), axis.text.x = element_text(size = 8, colour = "black"),
legend.position="none", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0.5)) + scale_fill_manual(values = mycolors)
## Warning: Factor `Q11` contains implicit NA, consider using
## `forcats::fct_explicit_na`
grid.arrange(p5, p6,nrow=2)
p7
p8
As we can see, in both India and the US, responders' companies are either very small (startups that just started) or very large (MNCs), and a high proportion of them have 20+ members handling the DS workload! We see some difference in money spent on ML products: although a high number of responders from both countries come from companies where no money is spent, when it comes to the $10,000-$99,999 and $100,000+ buckets, the proportion of employees coming from such companies is higher in the US than in India.
The above was EDA of questions where only one option could be selected! Let's now analyse the questions where more than one answer could be selected. We will only analyse a few important ones!
##Select any activities that make up an important part of your role at work
india_us_data$Q3<-factor(india_us_data$Q3,levels = c("India", "United States of America") ,ordered=FALSE)
india_us_data$Q9_Part_1<- factor(india_us_data$Q9_Part_1,levels = "Analyze and understand data to influence product or business decisions")
india_us_data$Q9_Part_2<- factor(india_us_data$Q9_Part_2,levels = "Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data")
india_us_data$Q9_Part_3<- factor(india_us_data$Q9_Part_3,levels = "Build prototypes to explore applying machine learning to new areas")
india_us_data$Q9_Part_4<- factor(india_us_data$Q9_Part_4,levels = "Build and/or run a machine learning service that operationally improves my product or workflows")
india_us_data$Q9_Part_5<- factor(india_us_data$Q9_Part_5,levels = "Experimentation and iteration to improve existing ML models")
india_us_data$Q9_Part_6<- factor(india_us_data$Q9_Part_6,levels = "Do research that advances the state of the art of machine learning")
india_us_data$Q9_Part_7<- factor(india_us_data$Q9_Part_7,levels = "None of these activities are an important part of my role at work")
india_us_data$Q9_Part_8<- factor(india_us_data$Q9_Part_8,levels = "Other")
role <- rbind(as.data.frame(table(india_us_data$Q9_Part_1, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q9_Part_2, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q9_Part_3, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q9_Part_4, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q9_Part_5, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q9_Part_6, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q9_Part_7, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q9_Part_8, india_us_data$Q3)))
names(role) <- c("role", "Country", "Freq")
ggplot(role, aes(role,Freq,fill=role))+
geom_bar(stat = "identity", colour = "black", width = 1)+
scale_fill_brewer(palette = "Paired", labels = c("Analyse data",
"Build data infra",
"Build prototypes ",
"Experimentation",
"Research",
"None of these","Other"))+
facet_grid(.~Country)+
labs(x="", y="Number of Kagglers", fill="Roles", title="Major part of role")+
theme_fivethirtyeight()+
theme(axis.title = element_text(size = 14, colour = "black"), axis.text.x = element_text(size = 0, colour = "black"),
legend.position="bottom", legend.direction='horizontal', strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 12), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines")) + scale_fill_manual(values = mycolors)
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
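As a side note, the per-option factor()/table()/rbind() pattern used above (and repeated for the later multi-select questions) could in principle be collapsed with tidyr::pivot_longer(); a minimal sketch, assuming the option columns follow the Q9_Part_* naming pattern:
#sketch: reshape all Q9 option columns at once and count non-blank answers per country
role_long <- india_us_data %>%
  select(Q3, starts_with("Q9_Part_")) %>%
  mutate_if(is.factor, as.character) %>%
  pivot_longer(-Q3, names_to = "part", values_to = "role") %>%
  filter(!is.na(role), role != "") %>%
  count(Q3, role, name = "Freq")
This gives the non-zero counts directly, at the cost of losing the explicit zero rows that table() produces.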
##Who/what are your favorite media sources that report on data science topics?
india_us_data$Q12_Part_1<- factor(india_us_data$Q12_Part_1,levels = "Twitter (data science influencers)")
india_us_data$Q12_Part_2<- factor(india_us_data$Q12_Part_2,levels = "Hacker News (https://news.ycombinator.com/)")
india_us_data$Q12_Part_3<- factor(india_us_data$Q12_Part_3,levels = "Reddit (r/machinelearning, r/datascience, etc)")
india_us_data$Q12_Part_4<- factor(india_us_data$Q12_Part_4,levels = "Kaggle (forums, blog, social media, etc)")
india_us_data$Q12_Part_5<- factor(india_us_data$Q12_Part_5,levels = "Course Forums (forums.fast.ai, etc)")
india_us_data$Q12_Part_6<- factor(india_us_data$Q12_Part_6,levels = "YouTube (Cloud AI Adventures, Siraj Raval, etc)")
india_us_data$Q12_Part_7<- factor(india_us_data$Q12_Part_7,levels = "Podcasts (Chai Time Data Science, Linear Digressions, etc)")
india_us_data$Q12_Part_8<- factor(india_us_data$Q12_Part_8,levels = "Blogs (Towards Data Science, Medium, Analytics Vidhya, KDnuggets etc)")
india_us_data$Q12_Part_9<- factor(india_us_data$Q12_Part_9,levels = "Journal Publications (traditional publications, preprint journals, etc)")
india_us_data$Q12_Part_10<- factor(india_us_data$Q12_Part_10,levels = "Slack Communities (ods.ai, kagglenoobs, etc)")
india_us_data$Q12_Part_11<- factor(india_us_data$Q12_Part_11,levels = "None")
india_us_data$Q12_Part_12<- factor(india_us_data$Q12_Part_12,levels = "Other")
sour <- rbind(as.data.frame(table(india_us_data$Q12_Part_1, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q12_Part_2, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q12_Part_3, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q12_Part_4, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q12_Part_5, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q12_Part_6, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q12_Part_7, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q12_Part_8, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q12_Part_9, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q12_Part_10, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q12_Part_11, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q12_Part_12, india_us_data$Q3)))
names(sour) <- c("sour", "country", "Freq")
library(ggthemes)
ggplot(sour, aes(sour,Freq,fill=sour))+
geom_bar(stat = "identity", colour = "black", width = 1)+
scale_fill_brewer(palette = "Pastel1", labels = c("Twitter",
"HackerNews",
"Reddit",
"Kaggle",
"Course Forums",
"Youtube","Podcasts",
"Blogs","Journal Publications",
"Slacks Communities","None","Other"))+
facet_grid(.~country)+
labs(x="", y="Number of Kagglers", fill="Sources", title="Media Sources")+
theme_fivethirtyeight()+
theme(axis.title = element_text(size = 14, colour = "black"), axis.text.x = element_text(size = 0, colour = "black"),
legend.position="bottom", legend.direction='horizontal', strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 12), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"))
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Pastel1 is 9
## Returning the palette you asked for with that many colors
###On which platforms have you begun or completed data science courses?
india_us_data$Q13_Part_1<- factor(india_us_data$Q13_Part_1,levels = "Udacity")
india_us_data$Q13_Part_2<- factor(india_us_data$Q13_Part_2,levels = "Coursera")
india_us_data$Q13_Part_3<- factor(india_us_data$Q13_Part_3,levels = "edX")
india_us_data$Q13_Part_4<- factor(india_us_data$Q13_Part_4,levels = "DataCamp")
india_us_data$Q13_Part_5<- factor(india_us_data$Q13_Part_5,levels = "DataQuest")
india_us_data$Q13_Part_6<- factor(india_us_data$Q13_Part_6,levels = "Kaggle Courses (i.e. Kaggle Learn)")
india_us_data$Q13_Part_7<- factor(india_us_data$Q13_Part_7,levels = "Fast.ai")
india_us_data$Q13_Part_8<- factor(india_us_data$Q13_Part_8,levels = "Udemy")
india_us_data$Q13_Part_9<- factor(india_us_data$Q13_Part_9,levels = "LinkedIn Learning")
india_us_data$Q13_Part_10<- factor(india_us_data$Q13_Part_10,levels = "University Courses (resulting in a university degree)")
india_us_data$Q13_Part_11<- factor(india_us_data$Q13_Part_11,levels = "None")
india_us_data$Q13_Part_12<- factor(india_us_data$Q13_Part_12,levels = "Other")
cour <- rbind(as.data.frame(table(india_us_data$Q13_Part_1, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q13_Part_2, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q13_Part_3, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q13_Part_4, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q13_Part_5, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q13_Part_6, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q13_Part_7, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q13_Part_8, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q13_Part_9, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q13_Part_10, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q13_Part_11, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q13_Part_12, india_us_data$Q3)))
names(cour) <- c("Cour", "country", "Freq")
ggplot(cour, aes(Cour,Freq,fill=Cour))+
geom_bar(stat = "identity", colour = "black", width = 1)+
facet_grid(.~country)+
labs(x="", y="Number of Kagglers", fill="Platform", title="Doing data science course on platforms")+
theme_fivethirtyeight()+
theme(axis.title = element_text(size = 14, colour = "black"), axis.text.x = element_text(size = 0, colour = "black"),
legend.position="bottom", legend.direction='horizontal', strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 12), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines")) +
scale_fill_manual(values = mycolors)+ coord_flip()
###What programming languages do you use on a regular basis?
india_us_data$Q18_Part_1<- factor(india_us_data$Q18_Part_1,levels = "Python")
india_us_data$Q18_Part_2<- factor(india_us_data$Q18_Part_2,levels = "R")
india_us_data$Q18_Part_3<- factor(india_us_data$Q18_Part_3,levels = "SQL")
india_us_data$Q18_Part_4<- factor(india_us_data$Q18_Part_4,levels = "C")
india_us_data$Q18_Part_5<- factor(india_us_data$Q18_Part_5,levels = "C++")
india_us_data$Q18_Part_6<- factor(india_us_data$Q18_Part_6,levels = "Java")
india_us_data$Q18_Part_7<- factor(india_us_data$Q18_Part_7,levels = "Javascript")
india_us_data$Q18_Part_8<- factor(india_us_data$Q18_Part_8,levels = "Typescript")
india_us_data$Q18_Part_9<- factor(india_us_data$Q18_Part_9,levels = "Bash")
india_us_data$Q18_Part_10<- factor(india_us_data$Q18_Part_10,levels = "MATLAB")
india_us_data$Q18_Part_11<- factor(india_us_data$Q18_Part_11,levels = "None")
india_us_data$Q18_Part_12<- factor(india_us_data$Q18_Part_12,levels = "Other")
lang <- rbind(as.data.frame(table(india_us_data$Q18_Part_1, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q18_Part_2, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q18_Part_3, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q18_Part_4, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q18_Part_5, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q18_Part_6, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q18_Part_7, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q18_Part_8, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q18_Part_9, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q18_Part_10, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q18_Part_11, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q18_Part_12, india_us_data$Q3)))
names(lang) <- c("lang", "country", "Freq")
library(ggthemes)
ggplot(lang, aes(lang,Freq,fill=lang))+
geom_bar(stat = "identity", colour = "black", width = 1)+
facet_grid(.~country)+
labs(x="", y="Number of Kagglers", fill="Language", title="Programming Language")+
theme_fivethirtyeight()+
theme(axis.title = element_text(size = 14, colour = "black"), axis.text.x = element_text(size = 0, colour = "black"),
legend.position="bottom", legend.direction='horizontal', strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 12), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines")) +
scale_fill_manual(values = mycolors)
###Which of the following ML algorithms do you use on a regular basis?
india_us_data$Q24_Part_1<- factor(india_us_data$Q24_Part_1,levels = "Linear or Logistic Regression")
india_us_data$Q24_Part_2<- factor(india_us_data$Q24_Part_2,levels = "Decision Trees or Random Forests")
india_us_data$Q24_Part_3<- factor(india_us_data$Q24_Part_3,levels = "Gradient Boosting Machines (xgboost, lightgbm, etc)")
india_us_data$Q24_Part_4<- factor(india_us_data$Q24_Part_4,levels = "Bayesian Approaches")
india_us_data$Q24_Part_5<- factor(india_us_data$Q24_Part_5,levels = "Evolutionary Approaches")
india_us_data$Q24_Part_6<- factor(india_us_data$Q24_Part_6,levels = "Dense Neural Networks (MLPs, etc)")
india_us_data$Q24_Part_7<- factor(india_us_data$Q24_Part_7,levels = "Convolutional Neural Networks")
india_us_data$Q24_Part_8<- factor(india_us_data$Q24_Part_8,levels = "Generative Adversarial Networks")
india_us_data$Q24_Part_9<- factor(india_us_data$Q24_Part_9,levels = "Recurrent Neural Networks")
india_us_data$Q24_Part_10<- factor(india_us_data$Q24_Part_10,levels = "Transformer Networks (BERT, gpt-2, etc)")
india_us_data$Q24_Part_11<- factor(india_us_data$Q24_Part_11,levels = "None")
india_us_data$Q24_Part_12<- factor(india_us_data$Q24_Part_12,levels = "Other")
algo <- rbind(as.data.frame(table(india_us_data$Q24_Part_1, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q24_Part_2, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q24_Part_3, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q24_Part_4, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q24_Part_5, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q24_Part_6, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q24_Part_7, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q24_Part_8, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q24_Part_9, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q24_Part_10, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q24_Part_11, india_us_data$Q3)),
as.data.frame(table(india_us_data$Q24_Part_12, india_us_data$Q3)))
names(algo) <- c("algo", "country", "Freq")
library(ggthemes)
ggplot(algo, aes(algo,Freq,fill=algo))+
geom_bar(stat = "identity", colour = "black", width = 1)+
facet_grid(.~country)+
labs(x="", y="Number of Kagglers", fill="Algorithms", title="Use of ML algo")+
theme_fivethirtyeight()+
theme(axis.title = element_text(size = 14, colour = "black"), axis.text.x = element_text(size = 0, colour = "black"),
legend.position="bottom", legend.direction='horizontal', strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 12), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines")) +
scale_fill_brewer(palette = "Paired", labels = c("Linear/Logistics",
"ST/RF",
"GBM/XGBoost ",
"Baysian",
"DenseNN",
"CNN","GAN","RNN","Transformer Networks","None","Other"))+ scale_fill_manual(values = mycolors)
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
1. As we see, a high number of Kagglers in both countries work on analysing data to get insights for products and businesses (most of which is reporting :p), followed by building prototypes to explore applying machine learning to new areas! The trend is similar in both countries. The number of Kagglers in India doing 'research' to advance the state of the art of ML is, to my surprise, not as small as I had expected.
2. The trend is also very similar for media sources used to read about DS: mainly Kaggle and blogs, all thanks to the Kaggle community, Towards Data Science and KDnuggets!
3. Similarly, no difference in trend is observed in the algorithms used, course platforms and languages. The insights are very straightforward and communicative.
Before building models to predict salary, let's visualise salary against some important aspects like country, age, sex, education, years of experience in ML, company size, etc.
india_us_data$Q3<-factor(india_us_data$Q3,levels = c("India", "United States of America") ,ordered=FALSE)
india_us_data %>% group_by(Q10,Q3)%>%
filter(Q10!="") %>%
summarise(count=n()) %>% ggplot(aes(Q10,count,fill=Q10)) + geom_bar(stat="identity",position="dodge") + theme(legend.position = "bottom") +
labs(title = "Salay distribution by country",
x = "Salary",
y = "Frquency") + facet_wrap(~Q3) + coord_flip() + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 10, colour = "black"), axis.text.x = element_text(size = 8, colour = "black"),
legend.position="none", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0.5)) + scale_fill_manual(values = mycolors)
## Warning: Factor `Q10` contains implicit NA, consider using
## `forcats::fct_explicit_na`
india_us_data %>% group_by(Q10,Q2,Q3)%>%
filter(Q10!="") %>% filter(Q2!="") %>%
summarise(count=n()) %>% ggplot(aes(Q2,count,fill=Q10)) + geom_bar(stat="identity",position="fill") + theme(legend.position = "bottom") +
labs(title = "Salay by gender",
x = "Gender",
y = "Share") + facet_wrap(~Q3) + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 10, colour = "black"), axis.text.x = element_text(size = 8, colour = "black"),
legend.position="bottom", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0.5)) + scale_fill_manual(values = mycolors)
## Warning: Factor `Q10` contains implicit NA, consider using
## `forcats::fct_explicit_na`
india_us_data %>% group_by(Q10,Q1,Q3)%>%
filter(Q10!="") %>% filter(Q1!="") %>%
summarise(count=n()) %>% ggplot(aes(Q1,count,fill=Q10)) + geom_bar(stat="identity",position="fill") + theme(legend.position = "bottom") +
labs(title = "Salay by age",
x = "Age",
y = "Share") + facet_wrap(~Q3) + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 10, colour = "black"), axis.text.x = element_text(size = 8, colour = "black"),
legend.position="bottom", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0.5)) + scale_fill_manual(values = mycolors) + coord_flip()
## Warning: Factor `Q10` contains implicit NA, consider using
## `forcats::fct_explicit_na`
india_us_data %>% group_by(Q10,Q4,Q3)%>%
filter(Q10!="") %>% filter(Q4!="") %>%
summarise(count=n()) %>% ggplot(aes(Q4,count,fill=Q10)) + geom_bar(stat="identity",position="fill") + theme(legend.position = "bottom") +
labs(title = "Salay by education",
x = "Education",
y = "Share") + facet_wrap(~Q3) + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 10, colour = "black"), axis.text.x = element_text(size = 8, colour = "black"),
legend.position="bottom", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0.5)) + scale_fill_manual(values = mycolors) + coord_flip()
## Warning: Factor `Q10` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `Q4` contains implicit NA, consider using
## `forcats::fct_explicit_na`
india_us_data %>% group_by(Q10,Q5,Q3)%>%
filter(Q10!="") %>% filter(Q5!="") %>%
summarise(count=n()) %>% ggplot(aes(Q5,count,fill=Q10)) + geom_bar(stat="identity",position="fill") + theme(legend.position = "bottom") +
labs(title = "Salay by job role",
x = "Role",
y = "Share") + facet_wrap(~Q3) + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 10, colour = "black"), axis.text.x = element_text(size = 8, colour = "black"),
legend.position="bottom", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0.5)) + scale_fill_manual(values = mycolors) + coord_flip()
## Warning: Factor `Q10` contains implicit NA, consider using
## `forcats::fct_explicit_na`
india_us_data %>% group_by(Q10,Q6,Q3)%>%
filter(Q10!="") %>% filter(Q6!="") %>%
summarise(count=n()) %>% ggplot(aes(Q6,count,fill=Q10)) + geom_bar(stat="identity",position="fill") + theme(legend.position = "bottom") +
labs(title = "Salary by company size",
x = "company size",
y = "Share") + facet_wrap(~Q3) + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 10, colour = "black"), axis.text.x = element_text(size = 8, colour = "black"),
legend.position="bottom", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0.5)) + scale_fill_manual(values = mycolors) + coord_flip()
## Warning: Factor `Q10` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `Q6` contains implicit NA, consider using
## `forcats::fct_explicit_na`
india_us_data %>% group_by(Q10,Q15,Q3)%>%
filter(Q10!="") %>% filter(Q15!="") %>%
summarise(count=n()) %>% ggplot(aes(Q15,count,fill=Q10)) + geom_bar(stat="identity",position="fill") + theme(legend.position = "bottom") +
labs(title = "Salary by years exp in coding",
x = "years",
y = "Share") + facet_wrap(~Q3) + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 10, colour = "black"), axis.text.x = element_text(size = 8, colour = "black"),
legend.position="bottom", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0.5)) + scale_fill_manual(values = mycolors) + coord_flip()
## Warning: Factor `Q10` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `Q15` contains implicit NA, consider using
## `forcats::fct_explicit_na`
india_us_data %>%filter (Q3 %in% c("India","United States of America")) %>% group_by(Q10,Q23,Q3)%>%
filter(Q10!="") %>% filter(Q23!="") %>%
summarise(count=n()) %>% ggplot(aes(Q23,count,fill=Q10)) + geom_bar(stat="identity",position="fill") + theme(legend.position = "bottom") +
labs(title = "Salary by years exp in ML",
x = "Years",
y = "Share") + facet_wrap(~Q3) + theme_fivethirtyeight()+
theme(axis.title = element_text(size = 10, colour = "black"), axis.text.x = element_text(size = 8, colour = "black"),
legend.position="bottom", strip.text.x = element_text(size = 9, face = "bold"),
legend.title = element_text(face = "bold", size = 2), legend.text = element_text(size = 12), panel.spacing = unit(0.8, "lines"),
plot.title = element_text(hjust = 0.5)) + scale_fill_manual(values = mycolors) + coord_flip()
## Warning: Factor `Q10` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `Q23` contains implicit NA, consider using
## `forcats::fct_explicit_na`
The salary trends above give a lot of interesting insights; a few important ones are:
1. 25k USD in Indian currency comes to around INR 18,00,000 p.a. (at roughly 72 INR per USD), which is considered a very good salary in India, so people with low salaries and people with very good salaries both fall into this bucket and we won't be able to separate them, because we built our classes this way! To no surprise, the highest number of Kagglers from India come from this salary slab! And I am amazed to see that people from India with high salaries are also active on the platform (or at least have an account, if not active).
2. A salary gender gap is seen in both countries; a smaller proportion of females have high salaries, and this difference is more pronounced in India.
3. Salary distribution is, as expected, highly dependent on age, company size, and years of experience in coding to analyse data and in ML!
4. It seems that education matters more in India when it comes to salary, as the proportion of people in high salary slabs in India increases as the education level advances! In contrast, in the US the highest proportion of people with high salaries is seen among 'High School' responders (probably dropouts and/or founders) and people with a Professional Degree (from providers like Udemy or Simplilearn).
BUILDING CLASSIFIERS (1) Decision Tree
#modelling only on features like age, education, country, job description, years of experience, etc.
#as these have a direct impact on salaries!!
model_data<-india_us_data %>% select(Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q10,Q11,Q15,Q23) %>%
filter(Q1!="")%>%
filter(Q2!="")%>% filter(Q3!="")%>% filter(Q4!="")%>% filter(Q5!="")%>% filter(Q6!="")%>% filter(Q7!="")%>% filter(Q8!="")%>%
filter(Q10!="")%>% filter(Q11!="")%>% filter(Q15!="")%>% filter(Q23!="") %>%
filter(Q3 %in% c("India","United States of America"))
model_data$Q3<-factor(model_data$Q3,levels = c("India", "United States of America") ,ordered=FALSE)
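One optional tidy-up (a sketch, not strictly required): the filters above keep the now-empty "" level in several factors; droplevels() discards unused levels so downstream models only see levels that actually occur.
#drop unused factor levels left behind by the filters (optional)
model_data <- droplevels(model_data)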
#sampling
library(rsample)
set.seed(123)
ames_split <- initial_split(model_data, prop = .7)
train <- training(ames_split)
test <- testing(ames_split)
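Since the salary classes are imbalanced, a stratified split is an optional refinement; rsample::initial_split() takes a strata argument so each bucket keeps roughly the same share in train and test. A sketch (illustration only; the unstratified split above is what is used below):
set.seed(123)
strat_split <- initial_split(model_data, prop = 0.7, strata = "Q10")
prop.table(table(training(strat_split)$Q10))
prop.table(table(testing(strat_split)$Q10))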
#DT Model
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
## Loading required package: rpart
library(RColorBrewer)
library(rpart.plot)
library(rpart)
#building tree
dt_tree<-rpart(Q10~.-Q10, data=train, method="class")
#visualising tree
fancyRpartPlot(dt_tree, caption = NULL)
#prediction
predict_unseen <-predict(dt_tree, test, type = 'class')
#confusion matrix
table_mat <- table(test$Q10, predict_unseen)
table_mat
## predict_unseen
## <25k 25k to 50k 50k to 100k >100k
## <25k 179 0 6 15
## 25k to 50k 36 0 5 4
## 50k to 100k 21 0 35 37
## >100k 6 0 40 135
#accuracy
accuracy_Test12 <- sum(diag(table_mat)) / sum(table_mat)
accuracy_Test12
## [1] 0.672447
#recall
recall <- diag(table_mat) / rowSums(table_mat)
mean(recall)
## [1] 0.5043001
#precision
precision <- (diag(table_mat) / colSums(table_mat))
precision[is.na(precision)] <- 0
mean(precision)
## [1] 0.4633631
#multiclass AUC-ROC
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
predict_unseen<-ordered(predict_unseen, levels = salary_levels)
multiclass.roc(test$Q10, predict_unseen)
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
##
## Call:
## multiclass.roc.default(response = test$Q10, predictor = predict_unseen)
##
## Data: predict_unseen with 4 levels of test$Q10: <25k, 25k to 50k, 50k to 100k, >100k.
## Multi-class area under the curve: 0.7842
#Cp values
printcp(dt_tree)
##
## Classification tree:
## rpart(formula = Q10 ~ . - Q10, data = train, method = "class")
##
## Variables actually used in tree construction:
## [1] Q1 Q23 Q3
##
## Root node error: 734/1213 = 0.60511
##
## n= 1213
##
## CP nsplit rel error xerror xstd
## 1 0.491826 0 1.00000 1.00000 0.023195
## 2 0.023842 1 0.50817 0.50817 0.021896
## 3 0.010000 3 0.46049 0.47411 0.021462
plotcp(dt_tree)
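A common follow-up to printcp()/plotcp() is pruning back to the cp with the lowest cross-validated error; a sketch (here the lowest xerror already sits at the final cp, so this is effectively a no-op):
#prune at the cp with minimum cross-validated error
best_cp <- dt_tree$cptable[which.min(dt_tree$cptable[, "xerror"]), "CP"]
pruned_tree <- prune(dt_tree, cp = best_cp)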
#variable importance
dt_tree$variable.importance %>%
tibble::enframe(name = "variable", value = "importance") %>%
dplyr::arrange(desc(importance)) %>%
ggplot(aes(reorder(variable, importance), importance)) +
geom_col() +
coord_flip() +
ggtitle("Variable importance")
#FOR DECISION TREE
#precision = 0.46 #recall = 0.504 #accuracy = 0.67 #auc = 0.7842
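For reference, the same kind of summary could also be pulled from caret::confusionMatrix() (a sketch, assuming the caret package is available; it is loaded later for the Naive Bayes model anyway):
library(caret)
cm_dt <- confusionMatrix(predict_unseen, test$Q10)
cm_dt$overall["Accuracy"]
#"Pos Pred Value" is per-class precision and "Sensitivity" is per-class recall
colMeans(cm_dt$byClass[, c("Pos Pred Value", "Sensitivity")], na.rm = TRUE)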
Writing out the variable name descriptions for readability:
1. Q3: Country
2. Q23: Years of experience in ML models
3. Q1: Age
4. Q15: Years of experience in writing code to analyse data
We see the variable importance is in this order:
1. Country
2. Experience in writing code
3. Age
4. Experience in ML models
5. Education
6. Money spent at work on ML products
7. Deployment of ML by the employer
8. Job title
9. DS team size
The accuracy of the model is 67.24%, well above the ~38% share of the largest salary class in the test set. The tree is self-explanatory!
BUILDING CLASSIFIERS (2) Random Forests
class(train$Q10)
## [1] "ordered" "factor"
library(randomForest) # basic implementation
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# for reproduciblity
set.seed(123)
# default RF model
m1 <- randomForest(
formula = Q10 ~ .-Q10,
data = train
)
m1
##
## Call:
## randomForest(formula = Q10 ~ . - Q10, data = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 28.52%
## Confusion matrix:
## <25k 25k to 50k 50k to 100k >100k class.error
## <25k 427 23 15 14 0.1085595
## 25k to 50k 74 16 25 9 0.8709677
## 50k to 100k 22 10 83 86 0.5870647
## >100k 13 0 55 341 0.1662592
plot(m1)
legend("top", colnames(m1$err.rate),col=1:4,cex=0.8,fill=1:4)
pred = predict(m1, newdata=test[,-9])
cm = table(test[,9], pred)
cm
## pred
## <25k 25k to 50k 50k to 100k >100k
## <25k 173 6 10 11
## 25k to 50k 30 4 7 4
## 50k to 100k 14 7 37 35
## >100k 4 2 14 161
#accuracy
accuracy_Test <- sum(diag(cm)) / sum(cm)
accuracy_Test
## [1] 0.7225434
#recall
recall <- diag(cm) / rowSums(cm)
mean(recall)
## [1] 0.5603103
#precision
precision <- (diag(cm) / colSums(cm))
precision[is.na(precision)] <- 0
mean(precision)
## [1] 0.5751206
#multiclass AUC-ROC
library(pROC)
pred<-ordered(pred, levels = salary_levels)
multiclass.roc(test$Q10, pred)
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
##
## Call:
## multiclass.roc.default(response = test$Q10, predictor = pred)
##
## Data: pred with 4 levels of test$Q10: <25k, 25k to 50k, 50k to 100k, >100k.
## Multi-class area under the curve: 0.8164
mtry <- tuneRF(train[-9],train$Q10, ntreeTry=500,
stepFactor=1.5,improve=0.01, trace=TRUE, plot=TRUE)
## mtry = 3 OOB error = 28.52%
## Searching left ...
## mtry = 2 OOB error = 28.03%
## 0.01734104 0.01
## Searching right ...
## mtry = 4 OOB error = 29.18%
## -0.04117647 0.01
#accuracy = 72.25% #precision = 0.57 #recall = 0.56 #auc = 0.81
Let's do hyperparameter tuning to find the best model.
TUNING RANDOM FOREST
library(ranger)
##
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
##
## importance
## The following object is masked from 'package:rattle':
##
## importance
hyper_grid <- expand.grid(
mtry = seq(2, 10, by = 1),
node_size = seq(3, 9, by = 2),
sample_size = c(.55, .632, .70, .80),
OOB_error = 0
)
nrow(hyper_grid)
## [1] 144
for(i in 1:nrow(hyper_grid)) {
# train model
model <- ranger(
formula = Q10 ~ .-Q10,
data = train,
num.trees = 200,
mtry = hyper_grid$mtry[i],
min.node.size = hyper_grid$node_size[i],
sample.fraction = hyper_grid$sample_size[i],
seed = 123
)
# add OOB error to grid
hyper_grid$OOB_error[i] <- (model$prediction.error)
}
hyper_grid %>%
dplyr::arrange(OOB_error) %>%
head(10)
#for node size of 7, mtry = 2 and sample size = 0.55 the OOB error is minimum! Let's build this model using 200 trees!
tuned_RF <- randomForest(
formula = Q10 ~ .-Q10,
data = train,
ntree = 200,
mtry = 2,
nodesize = 7
)
tuned_RF
##
## Call:
## randomForest(formula = Q10 ~ . - Q10, data = train, ntree = 200, mtry = 2, nodesize = 7)
## Type of random forest: classification
## Number of trees: 200
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 28.44%
## Confusion matrix:
## <25k 25k to 50k 50k to 100k >100k class.error
## <25k 439 9 17 14 0.08350731
## 25k to 50k 81 8 23 12 0.93548387
## 50k to 100k 24 9 80 88 0.60199005
## >100k 11 0 57 341 0.16625917
pred = predict(tuned_RF, newdata=test[,-9])
cm = table(test[,9], pred)
cm
## pred
## <25k 25k to 50k 50k to 100k >100k
## <25k 177 3 6 14
## 25k to 50k 32 4 4 5
## 50k to 100k 16 3 36 38
## >100k 6 0 12 163
#accuracy
accuracy_Test <- sum(diag(cm)) / sum(cm)
accuracy_Test
## [1] 0.7321773
#recall
recall <- diag(cm) / rowSums(cm)
mean(recall)
## [1] 0.5653845
#precision
precision <- (diag(cm) / colSums(cm))
precision[is.na(precision)] <- 0
mean(precision)
## [1] 0.6319581
#multiclass AUC-ROC
pred<-ordered(pred, levels = salary_levels)
multiclass.roc(test$Q10, pred)
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
##
## Call:
## multiclass.roc.default(response = test$Q10, predictor = pred)
##
## Data: pred with 4 levels of test$Q10: <25k, 25k to 50k, 50k to 100k, >100k.
## Multi-class area under the curve: 0.8071
varImpPlot(tuned_RF)
#accuracy = 73.22% #precision = 0.63 #recall = 0.57 #auc = 0.8071
#The tuned RF gives us the best accuracy and is more precise than the initial random forest model
TRAINING NAIVE BAYES CLASSIFIER
library(rsample)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
#Separate the predictors from the target (Q10)
features <- setdiff(names(train), "Q10")
x <- train[, features]
y <- train$Q10
#Use 10-fold cross-validation
train_control <- trainControl(
  method = "cv",
  number = 10
)
nb.m1 <- train(
x = x,
y = y,
method = "nb",
trControl = train_control
)
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation ... (the same warning repeats for many observations across the
## cross-validation folds; repetitive output truncated)
These warnings mean that, for some observations, the naive Bayes class-conditional probabilities multiply to zero for every class (typically because of rare factor levels), so those individual predictions are unreliable.
confusionMatrix(nb.m1)
## Cross-Validated (10 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction <25k 25k to 50k 50k to 100k >100k
## <25k 33.1 5.1 2.1 1.7
## 25k to 50k 3.1 1.8 1.2 0.6
## 50k to 100k 1.8 1.8 8.7 5.3
## >100k 1.6 1.5 4.5 26.1
##
## Accuracy (average) : 0.6974
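One way to mitigate the zero-probability warnings is Laplace smoothing. caret's "nb" method (backed by the klaR package) exposes fL, usekernel and adjust as tuning parameters, so a grid like the following could be tried (a sketch under that assumption, not run in this notebook):
#Optional sketch (requires klaR): tune Laplace smoothing and kernel settings
nb_grid <- expand.grid(
  fL        = c(0, 0.5, 1),    #Laplace correction
  usekernel = c(TRUE, FALSE),  #kernel density vs Gaussian for numeric predictors
  adjust    = c(1, 2)          #kernel bandwidth adjustment
)
nb.m2 <- train(
  x = x, y = y,
  method = "nb",
  trControl = train_control,
  tuneGrid = nb_grid
)
#nb.m2$bestTune reports the combination picked by cross-validation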
pred = predict(nb.m1, newdata=test[,-9])
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation ... (the same warning repeats for a number of test observations;
## repetitive output truncated)
cm = table(test[,9], pred)
cm
## pred
## <25k 25k to 50k 50k to 100k >100k
## <25k 174 6 8 12
## 25k to 50k 21 11 5 8
## 50k to 100k 17 10 33 33
## >100k 5 2 27 147
#accuracy
accuracy_Test <- sum(diag(cm)) / sum(cm)
accuracy_Test
## [1] 0.7032755
#recall
recall <- diag(cm) / rowSums(cm)
mean(recall)
## [1] 0.5703595
#precision
precision <- (diag(cm) / colSums(cm))
precision[is.na(precision)] <- 0
mean(precision)
## [1] 0.5920521
#multiclass AUC-ROC
pred<-ordered(pred, levels = salary_levels)
multiclass.roc(test$Q10, pred)
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
##
## Call:
## multiclass.roc.default(response = test$Q10, predictor = pred)
##
## Data: pred with 4 levels of test$Q10: <25k, 25k to 50k, 50k to 100k, >100k.
## Multi-class area under the curve: 0.7996
#accuracy = 70.32%, precision = 0.59, recall = 0.57, AUC = 0.7996
MULTICLASS LOGISTIC REGRESSION MODEL
library(tidyverse)
library(caret)
library(nnet)
multi_logistic <- nnet::multinom(Q10 ~.-Q10, data = train)
## # weights: 268 (198 variable)
## initial value 1681.575060
## iter 10 value 875.011943
## iter 20 value 767.285404
## iter 30 value 757.930800
## iter 40 value 755.387797
## iter 50 value 754.748431
## iter 60 value 754.535292
## iter 70 value 754.408570
## iter 80 value 754.315060
## iter 90 value 754.245693
## iter 100 value 754.233584
## final value 754.233584
## stopped after 100 iterations
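The optimizer stopped at the default limit of 100 iterations. If convergence is a concern, the limit can be raised (an optional sketch, not run here, using a separate name so the fitted model above is left untouched):
#Optional (not run): allow more iterations so the optimizer can fully converge
multi_logistic_converged <- nnet::multinom(Q10 ~ ., data = train, maxit = 500)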
summary(multi_logistic)
## Call:
## nnet::multinom(formula = Q10 ~ . - Q10, data = train)
##
## Coefficients:
## (Intercept) Q1.L Q1.Q Q1.C Q1^4
## 25k to 50k -3.496980 -3.5716517 -6.682417 -4.448604 -3.07985164
## 50k to 100k -2.704878 -1.5480844 -2.289620 -1.385046 0.03722388
## >100k -4.249498 0.5552242 -4.339521 -1.722280 0.01050761
## Q1^5 Q1^6 Q1^7 Q1^8 Q1^9
## 25k to 50k -2.7631865 -1.1231370 -0.98411765 -0.8686361 -0.6767385
## 50k to 100k -0.6308709 0.2206739 0.16177186 -0.9511294 -0.9487959
## >100k -0.2732254 0.2515004 0.01810242 -1.2859863 -1.2225089
## Q1^10 Q2Male Q2Prefer not to say
## 25k to 50k 0.08871047 0.2112635 -1.033526
## 50k to 100k -0.70826700 0.5327267 -1.559881
## >100k -0.05039823 0.9656991 -1.667083
## Q2Prefer to self-describe Q3United States of America Q4.L
## 25k to 50k -0.2900705 1.957733 1.8125976
## 50k to 100k -2.2121242 5.392708 1.7808147
## >100k 5.5889020 7.985055 0.7508415
## Q4.Q Q4.C Q4^4 Q4^5 Q4^6 Q5.L
## 25k to 50k -0.5757822 -2.767235 3.714942 -3.624286 2.0489251 -1.3518703
## 50k to 100k -0.2104337 -2.931805 2.706303 -1.786880 1.7124539 0.7216732
## >100k -0.4874267 -2.493760 0.933256 0.602945 0.7462192 0.7397085
## Q5.Q Q5.C Q5^4 Q5^5 Q5^6
## 25k to 50k -0.17970371 -1.6131470 -1.3378434 4.6479935 4.5696060
## 50k to 100k 0.46154790 -0.3045349 0.1535751 -0.4559838 -0.3763447
## >100k -0.03272713 -0.2820763 0.4993570 -0.8706312 0.8542564
## Q5^7 Q5^8 Q5^9 Q5^10 Q5^11
## 25k to 50k -0.3558724 0.46313298 3.4828400 -0.6508595 -2.2757145
## 50k to 100k -0.3842226 -0.07914374 -0.5566947 -0.3421191 0.8565371
## >100k 0.1739591 -0.11342774 0.8787477 0.1703615 1.0327967
## Q5^12 Q6.L Q6.Q Q6.C Q6^4
## 25k to 50k 3.1084719 0.06780350 -0.2971907 -0.3454651 -0.01278108
## 50k to 100k 0.1182718 0.07004829 -0.4559472 -0.2745858 0.30654356
## >100k 0.9828893 0.35335097 -0.6841146 0.3639670 0.13927709
## Q7.L Q7.Q Q7.C Q7^4 Q7^5
## 25k to 50k -0.4411393 -1.316411 0.4198840 0.02809503 0.4670276
## 50k to 100k -0.1527066 -1.048409 0.4228793 -0.02284568 0.3693530
## >100k -0.1907847 -1.633064 0.7172474 -0.01515568 0.6028286
## Q7^6 Q8Exploring ML models
## 25k to 50k 0.13457522 -0.58007842
## 50k to 100k 0.09135365 -0.06747227
## >100k 0.14944695 -0.48688321
## Q8Model in production < 2 years
## 25k to 50k -0.3543110
## 50k to 100k -0.2043923
## >100k -0.4412506
## Q8Model in production > 2 years Q8No idea
## 25k to 50k 0.05453651 -1.425774
## 50k to 100k 0.43784361 -1.718298
## >100k 0.88985175 -2.291591
## Q8No use of ML models Q8Only Insights no production Q11.L
## 25k to 50k -0.5647569 -0.6265962 0.622275
## 50k to 100k -0.6865376 -0.4660211 1.277046
## >100k -1.0400076 -0.8796174 2.049234
## Q11.Q Q11.C Q11^4 Q11^5 Q15.L
## 25k to 50k 0.4626867 -0.1932037 -0.13890777 0.15172485 0.09416044
## 50k to 100k 0.6749783 -1.1231973 -0.75836589 -0.12244230 0.90963433
## >100k 0.5554210 -1.5139678 -0.06109443 -0.06813856 2.55656837
## Q15.Q Q15.C Q15^4 Q15^5 Q23.L
## 25k to 50k -0.3999012 -0.5898282 -0.4204200 -0.1838940 -2.2653540
## 50k to 100k -1.3358075 -0.9803950 -0.7577060 -0.3466561 0.9280248
## >100k -0.9037341 -0.4576400 -0.5735565 0.1457970 1.3966177
## Q23.Q Q23.C Q23^4 Q23^5 Q23^6
## 25k to 50k -3.5733370 -2.8579561 -1.5532838 -0.95743055 -0.6255559
## 50k to 100k 0.8135710 -0.1032683 0.2018767 0.08041714 0.1240356
## >100k 0.3444244 -1.1384288 -0.1482702 -0.07763604 -0.3903605
## Q23^7
## 25k to 50k -0.15744663
## 50k to 100k 0.48620574
## >100k 0.07462448
##
## Std. Errors:
## (Intercept) Q1.L Q1.Q Q1.C Q1^4
## 25k to 50k 6.9582439 33.688353 36.180456 32.3608504 25.0700375
## 50k to 100k 2.2642703 1.136139 1.032380 0.9230631 0.8131339
## >100k 0.9692874 1.243127 1.140727 1.0242682 0.8849294
## Q1^5 Q1^6 Q1^7 Q1^8 Q1^9 Q1^10
## 25k to 50k 16.9808136 10.0276212 5.1155784 2.2367274 0.9220086 0.5067702
## 50k to 100k 0.7622642 0.7184388 0.7112044 0.6731619 0.6476157 0.5368212
## >100k 0.8046712 0.7408200 0.7195480 0.6896525 0.6561944 0.5481550
## Q2Male Q2Prefer not to say Q2Prefer to self-describe
## 25k to 50k 0.3372016 1.334999 155.06596
## 50k to 100k 0.3806487 1.271080 103.88676
## >100k 0.4190895 1.318926 33.07431
## Q3United States of America Q4.L Q4.Q Q4.C
## 25k to 50k 0.3741403 12.549336 0.6899901 13.553510
## 50k to 100k 0.4058271 8.100966 0.8456136 8.744211
## >100k 0.5322696 2.957515 1.0091032 3.161816
## Q4^4 Q4^5 Q4^6 Q5.L Q5.Q Q5.C
## 25k to 50k 18.707389 14.487376 6.572105 10.8922270 11.3838042 5.2512090
## 50k to 100k 12.039392 9.336694 4.255198 0.6075345 0.5437756 0.4155772
## >100k 4.206095 3.333177 1.621748 0.6867004 0.5959229 0.4564510
## Q5^4 Q5^5 Q5^6 Q5^7 Q5^8
## 25k to 50k 11.7044589 14.4414904 14.1331035 15.3945241 7.4708483
## 50k to 100k 0.5253473 0.7135409 0.7772128 0.7266839 0.4990293
## >100k 0.5417785 0.8074038 0.8718387 0.8267565 0.5552832
## Q5^9 Q5^10 Q5^11 Q5^12 Q6.L Q6.Q
## 25k to 50k 12.8643456 4.3542780 12.0999732 13.5249239 0.3797651 0.3537822
## 50k to 100k 0.6202468 0.5472872 0.6149610 0.5190295 0.4501184 0.4103725
## >100k 0.6967723 0.5841795 0.6832416 0.5851664 0.5152478 0.4721558
## Q6.C Q6^4 Q7.L Q7.Q Q7.C Q7^4
## 25k to 50k 0.3395205 0.3613802 0.9200533 2.5337391 0.9634151 0.2929388
## 50k to 100k 0.4018633 0.4349363 0.4445201 0.8361412 0.3753776 0.2855621
## >100k 0.4479607 0.5016415 0.4106157 0.3900691 0.2769148 0.3131859
## Q7^5 Q7^6 Q8Exploring ML models
## 25k to 50k 1.0201780 0.3124497 1.1963873
## 50k to 100k 0.3653256 0.1296539 0.5093192
## >100k 0.2247657 0.1021432 0.4237249
## Q8Model in production < 2 years
## 25k to 50k 1.1912776
## 50k to 100k 0.4918953
## >100k 0.3832819
## Q8Model in production > 2 years Q8No idea
## 25k to 50k 1.1849050 1.2178766
## 50k to 100k 0.4685599 0.5349939
## >100k 0.3365390 0.4621126
## Q8No use of ML models Q8Only Insights no production Q11.L
## 25k to 50k 1.2028244 1.2209048 0.3411745
## 50k to 100k 0.5435163 0.5701980 0.3875245
## >100k 0.4656957 0.4960249 0.4263490
## Q11.Q Q11.C Q11^4 Q11^5 Q15.L Q15.Q
## 25k to 50k 0.2958631 0.3380290 0.3535345 0.3144687 0.9747531 0.8747725
## 50k to 100k 0.3501314 0.4002257 0.4192797 0.3923296 0.8160165 0.7282546
## >100k 0.3731421 0.4568173 0.4773765 0.4278572 0.8105578 0.7149774
## Q15.C Q15^4 Q15^5 Q23.L Q23.Q Q23.C
## 25k to 50k 0.6694242 0.4824156 0.3471683 13.860264 13.850218 11.056762
## 50k to 100k 0.5968843 0.4684309 0.3606329 1.304351 1.179545 1.037038
## >100k 0.6081423 0.4873644 0.3789892 1.277695 1.155386 1.015513
## Q23^4 Q23^5 Q23^6 Q23^7
## 25k to 50k 7.2722642 3.9141714 1.6807464 0.6308617
## 50k to 100k 1.0011536 0.8587437 0.6472782 0.5552600
## >100k 0.9869797 0.8479116 0.6446954 0.5568983
##
## Residual Deviance: 1508.467
## AIC: 1856.467
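multinom does not print p-values. A common follow-up (a sketch, assuming two-sided Wald tests) derives them from the coefficient and standard-error matrices shown above:
#Wald z-statistics and two-sided p-values for the multinom coefficients
z <- summary(multi_logistic)$coefficients / summary(multi_logistic)$standard.errors
p_values <- 2 * (1 - pnorm(abs(z)))
round(p_values, 3)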
pred = predict(multi_logistic, newdata=test[,-9])
cm = table(test[,9], pred)
cm
## pred
## <25k 25k to 50k 50k to 100k >100k
## <25k 182 2 8 8
## 25k to 50k 26 7 4 8
## 50k to 100k 16 5 36 36
## >100k 5 1 16 159
#ACCURACY
accuracy_Test <- sum(diag(cm)) / sum(cm)
accuracy_Test
## [1] 0.7398844
#recall
recall <- diag(cm) / rowSums(cm)
mean(recall)
## [1] 0.5827763
#precision
precision <- (diag(cm) / colSums(cm))
precision[is.na(precision)] <- 0
mean(precision)
## [1] 0.6443702
#multiclass AUC-ROC
pred<-ordered(pred, levels = salary_levels)
multiclass.roc(test$Q10, pred)
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls < cases
##
## Call:
## multiclass.roc.default(response = test$Q10, predictor = pred)
##
## Data: pred with 4 levels of test$Q10: <25k, 25k to 50k, 50k to 100k, >100k.
## Multi-class area under the curve: 0.8106
#accuracy = 73.99%, precision = 0.64, recall = 0.58, AUC = 0.8106
From a model-evaluation point of view, the best performing model is the multiclass logistic regression: it has the highest accuracy (~74%) and macro precision (0.64) of the models tried, with a multiclass AUC comparable to the random forests.
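To make the comparison explicit, the test-set metrics reported above can be collected into one small summary frame (values copied from the outputs above):
#Summary of the test-set metrics reported above
model_summary <- data.frame(
  model     = c("Random forest (default)", "Random forest (tuned)",
                "Naive Bayes", "Multinomial logistic regression"),
  accuracy  = c(0.7225, 0.7321, 0.7033, 0.7399),
  precision = c(0.57, 0.63, 0.59, 0.64),
  recall    = c(0.56, 0.56, 0.57, 0.58),
  auc       = c(0.8164, 0.8071, 0.7996, 0.8106)
)
model_summary[order(-model_summary$accuracy), ]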