Since last year, Kaggle has organized a survey asking the Kaggle
community about their machine learning and data science “uses”. As an
African, I am particularly interested in the African part of the
community since, I have to admit, I know close to nothing about my
fellow Africans. Thanks to Kaggle, I can use this opportunity to know
more about most of us.
Datasets
I will be using three datasets for this kernel:
# STRING PROCESSING
# countries
multipleChoice18$Q3 <- str_replace(multipleChoice18$Q3,"Iran, Islamic Republic of...","Iran")
multipleChoice18$Q3 <- str_replace(multipleChoice18$Q3,"I do not wish to disclose my location","Won't disclose")
multipleChoice18$Q3 <- str_replace(multipleChoice18$Q3,"United Kingdom of Great Britain and Northern Ireland","UK and NI")
multipleChoice18$Q3 <- str_replace(multipleChoice18$Q3,"United States of America","USA")
continents$Country <- str_replace(continents$Country,"Iran, Islamic Republic of...","Iran")
continents$Country <- str_replace(continents$Country,"I do not wish to disclose my location","Won't disclose")
continents$Country <- str_replace(continents$Country,"United Kingdom of Great Britain and Northern Ireland","UK and NI")
continents$Country <- str_replace(continents$Country,"United States of America","USA")
# CONVERT CATEGORICAL DATA TO FACTOR
# age groups
multipleChoice18$Q2 <- factor(multipleChoice18$Q2,
levels = c("18-21","22-24","25-29",
"30-34","35-39","40-44",
"45-49","50-54","55-59",
"60-69","70-79","80+"),
labels = c("18-21","22-24","25-29",
"30-34","35-39","40-44",
"45-49","50-54","55-59",
"60-69","70-79","80+"))
# degree
multipleChoice18$Q4 <- factor(multipleChoice18$Q4,
levels = c("Doctoral degree","Master’s degree","Bachelor’s degree","Professional degree",
"No formal education past high school",
"Some college/university study without earning a bachelor’s degree",
"I prefer not to answer"),
labels = c("PhD","Master","Bachelor","Professional",
"High school","No degree","Won't disclose"))
# undergraduate major
multipleChoice18$Q5 <- factor(multipleChoice18$Q5,
levels = c("Medical or life sciences (biology, chemistry, medicine, etc.)",
"Computer science (software engineering, etc.)",
"Engineering (non-computer focused)",
"Mathematics or statistics",
"A business discipline (accounting, economics, finance, etc.)",
"Environmental science or geology",
"Social sciences (anthropology, psychology, sociology, etc.)",
"Physics or astronomy",
"Information technology, networking, or system administration",
"I never declared a major",
"Other",
"Humanities (history, literature, philosophy, etc.)") ,
labels = c("Medical/life sciences", "Computer science",
"Engineering", "Mathematics/statistics",
"A business discipline", "Physics/astronomy",
"IT/Network/Sys. admin", "No major declared",
"Humanities", "Env. science", "Social sciences", "Other"))
# In what industry is your current employer?
multipleChoice18$Q7 <- factor(multipleChoice18$Q7,
levels = c("Retail/Sales", "I am a student",
"Computers/Technology", "Accounting/Finance",
"Academics/Education",
"Insurance/Risk Assessment","Other",
"Energy/Mining", "Non-profit/Service",
"Marketing/CRM", "Government/Public Service",
"Manufacturing/Fabrication",
"Online Service/Internet-based Services",
"Broadcasting/Communications",
"Medical/Pharmaceutical",
"Online Business/Internet-based Sales",
"Military/Security/Defense",
"Shipping/Transportation",
"Hospitality/Entertainment/Sports"),
labels = c("Retail / Sales", "Student",
"Computers / Technology", "Accounting / Finance",
"Academics / Education",
"Insurance / Risk Assessment","Other",
"Energy / Mining", "Non-profit / Service",
"Marketing / CRM", "Government / Public Service",
"Manufacturing / Fabrication",
"Online Service / Internet-based Services",
"Broadcasting / Communications",
"Medical / Pharmaceutical",
"Online Business / Internet-based Sales",
"Military / Security/Defense",
"Shipping / Transportation",
"Hospitality / Entertainment/Sports"))
# experience in current role
multipleChoice18$Q8 <- factor(multipleChoice18$Q8, levels = c("0-1","1-2","2-3",
"3-4","4-5","5-10",
"10-15","15-20","20-25",
"25-30","30+"))
# yearly compensation
multipleChoice18$Q9 <- factor(multipleChoice18$Q9,
levels = c("I do not wish to disclose my approximate yearly compensation",
"0-10,000","10-20,000","20-30,000","30-40,000",
"40-50,000","50-60,000","60-70,000","70-80,000",
"80-90,000","90-100,000","100-125,000",
"125-150,000","150-200,000","200-250,000",
"250-300,000","300-400,000", "400-500,000","500,000+"),
labels = c("Won't disclose",
"0-10,000","10-20,000","20-30,000","30-40,000",
"40-50,000","50-60,000","60-70,000","70-80,000",
"80-90,000","90-100,000","100-125,000",
"125-150,000","150-200,000","200-250,000",
"250-300,000","300-400,000", "400-500,000","500,000+"))
# time spent coding
multipleChoice18$Q23 <- factor(multipleChoice18$Q23, levels = c("0% of my time",
"1% to 25% of my time",
"25% to 49% of my time",
"50% to 74% of my time",
"75% to 99% of my time",
"100% of my time"),
labels = c("0%","1% to 25%","25% to 49%",
"50% to 74%","75% to 99%","100%"))
# coding experience
multipleChoice18$Q24 <- factor(multipleChoice18$Q24,
levels = c("I have never written code and I do not want to learn",
"I have never written code but I want to learn",
"< 1 year","1-2 years","3-5 years","5-10 years",
"10-20 years","20-30 years","30-40 years", "40+ years") ,
labels = c("I don't write code and don't want to learn",
"I don't write code but want to learn",
"< 1 year", "1-2 years", "3-5 years",
"5-10 years", "10-20 years","20-30 years","30-40 years", "40+ years")
)
# For how many years have you used machine learning methods
multipleChoice18$Q25 <- factor(multipleChoice18$Q25,
levels = c("I have never studied machine learning and I do not plan to",
"I have never studied machine learning but plan to learn in the future",
"< 1 year", "1-2 years", "2-3 years", "3-4 years", "4-5 years",
"5-10 years", "10-15 years", "20+ years"),
labels = c("Never studied, do not plan to",
"Never studied, plan to learn",
"< 1 year", "1-2 years", "2-3 years", "3-4 years", "4-5 years",
"5-10 years", "10-15 years", "20+ years"))
# use of machine learning in industries
multipleChoice18$Q10 <- factor(multipleChoice18$Q10,
levels = c("I do not know",
"No (we do not use ML methods)",
"We are exploring ML methods (and may one day put a model into production)",
"We recently started using ML methods (i.e., models in production for less than 2 years)",
"We have well established ML methods (i.e., models in production for more than 2 years)",
"We use ML methods for generating insights (but do not put working models into production)"),
labels = c("I do not know", "No", "Exploring ML methods",
"Recently started", "Well established ML methods",
"For generating insights"))
# expertise in data science
multipleChoice18$Q40 <- factor(multipleChoice18$Q40,
levels = c("Independent projects are equally important as academic achievements",
"Independent projects are much more important than academic achievements",
"Independent projects are slightly more important than academic achievements",
"Independent projects are slightly less important than academic achievements",
"Independent projects are much less important than academic achievements",
"No opinion; I do not know"),
labels = c("Equally important",
"Much more important",
"Slightly more important",
"Less important",
"Much less important",
"No opinion/Don't know"))
# are you a data scientist?
multipleChoice18$Q26 <- factor(multipleChoice18$Q26,
levels = c("Definitely yes", "Probably yes", "Maybe",
"Probably not", "Definitely not"),
labels = c("Definitely yes", "Probably yes", "Maybe",
"Probably not", "Definitely not"))