For this project, my group (Team 4: Gregg Maloy, Jacob Silver, Miguel Gomez, Umer Farooq) met regularly on Google Meet to decide how to divide the work and to share what we had coded so far. We found a dataset on Kaggle that surveyed about 24,000 people with the objective of finding out “who is working with data, what’s happening with machine learning in different industries, and the best ways for new data scientists to break into the field.” The dataset contains 43 questions presented in a wide format spanning 296 columns.
The first part of this markdown is my contribution to the project: analyzing the demographic information, question 12 (programming languages used regularly), and looking for the “bright spots” in data science. During one of our meetups, Jacob shared his code, which included a function that takes the columns for a question as input and graphs them in a bar chart, as well as a function that cross-tabulates two questions and graphs the result. I used the latter to determine which programming languages various positions (data scientists, software engineers, etc.) use on a regular basis. At our final meetup, we shared what we had completed and narrowed down which results would go in the presentation.
df <- read.csv("https://raw.githubusercontent.com/LeJQC/MSDS/main/DATA%20607/Project%203/kaggle_survey_2022_responses.csv",na.strings = c("","N/A"))
#There were 23998 respondents to the survey and 43 questions separated into 296 columns
dim(df)
## [1] 23998 296
#head(df[1:6],5)
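Each multi-select question is spread across several columns (question 12, for example, spans Q12_1 through Q12_15), and the full question text sits in the first row rather than in the column names. A quick peek at that layout, using column 2 (age) and the first two Q12 columns:
#Row 1 holds the question text; later rows hold the responses
head(df[, c(2, 31:32)], 3)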
#Making a data frame with the Age of all the respondents
df2 <- df[2]
age <- data.frame(df2[-1, ])
colnames(age) <- "Age"
The majority of the respondents were between 18 and 29 years old.
age %>%
ggplot(aes(x=Age))+
geom_bar()+
coord_flip()+
theme_minimal()
### Table of age counts and percentages
age_count <- age %>%
count(Age) %>%
mutate(percent = round(n/sum(n)*100)) %>%
arrange(desc(n))
knitr::kable(age_count)
Age | n | percent |
---|---|---|
18-21 | 4559 | 19 |
25-29 | 4472 | 19 |
22-24 | 4283 | 18 |
30-34 | 2972 | 12 |
35-39 | 2353 | 10 |
40-44 | 1927 | 8 |
45-49 | 1253 | 5 |
50-54 | 914 | 4 |
55-59 | 611 | 3 |
60-69 | 526 | 2 |
70+ | 127 | 1 |
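As a quick check of the “majority” claim, the three youngest brackets can be summed from the age_count data frame built above:
#18-21, 22-24, and 25-29 together account for a bit over half of the respondents
age_count %>%
  filter(Age %in% c("18-21", "22-24", "25-29")) %>%
  summarise(n = sum(n), percent = sum(percent))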
#Making a data frame with the Gender of all the respondents
df3 <- df[3]
gender <- data.frame(df3[-1, ])
colnames(gender) <- "Gender"
Three-fourths of the respondents were men.
gender_count <- gender %>%
count(Gender) %>%
mutate(percent = round(n/sum(n)*100)) %>%
arrange(desc(n))
knitr::kable(gender_count)
Gender | n | percent |
---|---|---|
Man | 18266 | 76 |
Woman | 5286 | 22 |
Prefer not to say | 334 | 1 |
Nonbinary | 78 | 0 |
Prefer to self-describe | 33 | 0 |
#Making a data frame with the Country of all the respondents
df4 <- df[4]
country <- data.frame(df4[-1, ])
colnames(country) <- "Country"
About a third of the respondents were from India.
country_per <- country %>%
count(Country) %>%
arrange(desc(n)) %>%
mutate(percent = round(n/sum(n)*100)) %>%
head(10)
knitr::kable(country_per)
Country | n | percent |
---|---|---|
India | 8792 | 37 |
United States of America | 2920 | 12 |
Other | 1430 | 6 |
Brazil | 833 | 3 |
Nigeria | 731 | 3 |
Pakistan | 620 | 3 |
Japan | 556 | 2 |
China | 453 | 2 |
Egypt | 383 | 2 |
Mexico | 380 | 2 |
Next, I looked at the most commonly used programming languages (question 12) among the respondents.
#Subsetting the Q12 columns (columns 31 through 45)
df12 <- df[,31:45]
dfq12 <- df12 %>%
pivot_longer(
cols = everything(),
names_to = "Question 12",
values_to = "Language",
values_drop_na = TRUE
)
#Drop the first 15 rows, which contain the question text carried over from the survey's first row
dfq12 <- dfq12[16:nrow(dfq12),]
dfq12 %>%
ggplot(aes(x=Language))+
geom_bar()+
coord_flip()+
theme_minimal()
table(dfq12$Language)
##
## Bash C C# C++ Go Java Javascript
## 1674 3801 1473 4549 322 3862 3489
## Julia MATLAB None Other PHP Python R
## 296 2441 256 1342 1443 18653 4571
## SQL
## 9620
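For scale, Python's 18653 selections against the 23998 survey rows works out to roughly 78%, which can be checked directly:
#Share of survey rows that selected Python in question 12
round(sum(dfq12$Language == "Python") / nrow(df) * 100)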
#Subsetting Q11 (coding experience) together with the Q12 columns
df11 <- df[3:nrow(df),30:45]
df11_clean <- df11 %>%
filter(Q11 != "I have never written code") %>%
group_by(Q11)
dfq11 <- df11_clean %>%
pivot_longer(
cols = -1,
names_to = "Question 12",
values_to = "Language",
values_drop_na = TRUE)
It seems Python is the most used language regardless of years of experience.
table(dfq11$Q11,dfq11$Language)
##
## Bash C C# C++ Go Java Javascript Julia MATLAB None Other
## < 1 years 80 703 141 753 25 555 382 28 477 174 170
## 1-3 years 265 1263 312 1555 56 1244 1010 57 742 36 259
## 10-20 years 336 344 244 379 59 421 439 46 232 3 233
## 20+ years 338 412 269 481 55 401 457 58 195 9 268
## 3-5 years 296 655 258 863 61 724 645 46 447 19 183
## 5-10 years 359 424 249 518 66 517 556 61 348 15 229
##
## PHP Python R SQL
## < 1 years 150 4528 1000 2039
## 1-3 years 402 5846 1255 2801
## 10-20 years 198 1570 424 1009
## 20+ years 191 1332 442 842
## 3-5 years 273 3088 820 1609
## 5-10 years 229 2289 630 1320
It looks like people just starting their data science careers use more programming languages than data scientists who have more than 10 years of experience.
dfq11 %>%
ggplot(aes(x= factor(Q11, levels = c("< 1 years","1-3 years","3-5 years","5-10 years","10-20 years","20+ years")),fill=Language))+
geom_bar()+
theme_bw()+
labs(title = "Years of Experience vs Language", x= "Years of Experience")
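The raw cross-tab above partly reflects how many respondents fall into each experience bucket, so a more direct check is the average number of question 12 languages each respondent selected within a bucket. A minimal sketch, assuming dplyr is loaded and noting that the “None” option counts as a selection here:
#Count non-NA Q12 columns per respondent, then average by years of experience
df11_clean %>%
  ungroup() %>%
  mutate(n_langs = rowSums(!is.na(across(starts_with("Q12"))))) %>%
  group_by(Q11) %>%
  summarise(avg_languages = round(mean(n_langs), 2)) %>%
  arrange(desc(avg_languages))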
#Column 159 holds Q29 (yearly compensation)
df_comp <- df[159]
comp <- na.omit(data.frame(df_comp[-1,]))
colnames(comp) <- c("Compensation")
comp %>%
count(Compensation) %>%
arrange(n)
## Compensation n
## 1 >$1,000,000 23
## 2 $500,000-999,999 48
## 3 300,000-499,999 76
## 4 250,000-299,999 78
## 5 200,000-249,999 155
## 6 90,000-99,999 197
## 7 80,000-89,999 222
## 8 4,000-4,999 234
## 9 3,000-3,999 244
## 10 125,000-149,999 269
## 11 2,000-2,999 271
## 12 25,000-29,999 277
## 13 70,000-79,999 289
## 14 15,000-19,999 299
## 15 60,000-69,999 318
## 16 20,000-24,999 337
## 17 150,000-199,999 342
## 18 7,500-9,999 362
## 19 50,000-59,999 366
## 20 5,000-7,499 391
## 21 100,000-124,999 404
## 22 40,000-49,999 421
## 23 1,000-1,999 444
## 24 30,000-39,999 464
## 25 10,000-14,999 493
## 26 $0-999 1112
I created a new data frame filtered to people who made $200,000 or more. There were 380 respondents out of the 23,998 people surveyed who matched this criterion, which is about 1.6%.
df_top <- df %>%
filter(Q29 %in% c("200,000-249,999","250,000-299,999","300,000-499,999","$500,000-999,999",">$1,000,000"))
df_top <- df_top[,c(6:25,29:44,146)]
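A quick sanity check on the size of this group:
#380 of the 23998 rows remain, roughly 1.6% of respondents
nrow(df_top)
round(nrow(df_top) / nrow(df) * 100, 1)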
Instead of looking at each observation individually, I wanted to get a general sense of what these “bright spots” have in common. I dumped all the answers to questions 6 (where they learned to code), 7 (most useful platform), 12 (programming language), and 23 (job title) into one data frame and made a word cloud.
df_top_long <- df_top %>%
pivot_longer(
cols = everything(),
names_to = "Question",
values_to = "Value",
values_drop_na = TRUE)
df_top_comp <- df_top_long %>%
count(Value) %>%
arrange(desc(n))
knitr::kable(head(df_top_comp, 10))
Value | n |
---|---|
Python | 328 |
Online courses (Coursera, EdX, etc) | 229 |
Coursera | 215 |
SQL | 210 |
Kaggle (notebooks, competitions, etc) | 183 |
Video platforms (YouTube, Twitch, etc) | 170 |
Master’s degree | 156 |
20+ years | 146 |
Other | 141 |
University courses | 123 |
#Requires the wordcloud and RColorBrewer packages
df_top_long %>%
count(Value) %>%
with(wordcloud(Value, n, max.words = 10, colors = brewer.pal(10, "Spectral"), scale = c(2, .5)))
#clear question text from first row
df <- df[-1,]
graph_multicolumn_q <- function(og_df, q_list,
q_name, val_label,
q_text) {
#Create limited df with only that question's columns
q_df <- og_df %>%
select(all_of(q_list))
#Create long-pivoted df for graphing
q_df1 <- q_df %>%
pivot_longer(
cols = everything(),
names_to = q_name,
values_to = val_label,
values_drop_na = TRUE)
#Produce df of value counts
q_df1_count <- q_df1 %>%
count(!!sym(val_label))
#Graph result
q_df1 %>%
ggplot(aes(x=fct_rev(fct_infreq(!!sym(val_label)))))+
geom_bar()+
coord_flip()+
theme_minimal()+
ggtitle(q_text)+
xlab(val_label)
}
graph_cross_analysis <- function(og_df, q_demo, q_list, q_name, val_label, q_text) {
q_df <- og_df %>%
select(all_of(c(q_demo, q_list)))
q_df1 <- q_df %>%
pivot_longer(
cols= -1,
names_to=q_name,
values_to=val_label,
values_drop_na=TRUE
)
q_df2 <- q_df1[complete.cases(q_df1),]
q_df2 %>%
ggplot(aes(x=fct_rev(fct_infreq(!!sym(val_label))), fill = as.factor(!!sym(q_demo))))+
facet_wrap(q_demo)+
coord_flip()+
geom_bar()+
theme(axis.text.x = element_text(size = 7))+
theme(axis.text.y = element_text(size = 7))+
theme(legend.position = "none")+
labs(y= "Count", x = val_label)+
ggtitle(q_text)
}
q12_cols <- c("Q12_1","Q12_2","Q12_3","Q12_4","Q12_5", "Q12_6","Q12_7","Q12_8","Q12_9","Q12_10","Q12_11","Q12_12","Q12_13","Q12_14","Q12_15")
graph_multicolumn_q(df,
q12_cols,
'Question 12',
'Language',
'Regularly Used Programming Language')
graph_cross_analysis(og_df = df,
q_demo = 'Q11',
q_list = q12_cols,
q_name = 'Question 12',
val_label = 'Language',
q_text = 'Language and Years of Experience')
graph_cross_analysis(og_df = df,
q_demo = 'Q8',
q_list = q12_cols,
q_name = 'Question 12',
val_label = 'Language',
q_text = 'Language and Degree')
graph_cross_analysis(og_df = df,
q_demo = 'Q23',
q_list = q12_cols,
q_name = 'Question 12',
val_label = 'Language',
q_text = 'Language and Job Title')