For this project, my group (Team 4: Gregg Maloy, Jacob Silver, Miguel Gomez, Umer Farooq) regularly met up on Google Meet to discuss how we were going to divide the work and share what we had coded so far. We found a dataset on Kaggle that surveyed about 24000 people with the objective of finding out “who is working with data, what’s happening with machine learning in different industries, and the best ways for new data scientists to break into the field.” The dataset contains 43 questions presented in a wide format spanning 296 columns.

The first part of this markdown is my contribution to the project, which was analyzing the demographic information, question 12 (programming language), and looking for the “bright spots” in data science. During one of our meetups, Jacob shared his code, which consisted of a function that took the columns we wanted as inputs and graphed them in a bar chart. His code also contained a function that graphed the cross between two questions. In my case, I used his function to determine which programming languages people in various positions (data scientists, software engineers, etc.) used on a regular basis. At our final meetup, we shared what we had completed and narrowed down which data would go in the presentation.

Loading the libraries

Loading the data from GitHub

# Read the survey responses; empty strings and "N/A" are treated as missing
df <- read.csv(
  file = "https://raw.githubusercontent.com/LeJQC/MSDS/main/DATA%20607/Project%203/kaggle_survey_2022_responses.csv",
  na.strings = c("", "N/A")
)

Checking out the data frame

# dim() reports 23998 rows and 296 columns. The first row holds the question
# text rather than a response (it is dropped with [-1, ] in the sections below),
# so there are 23997 respondents; the 296 columns cover the 43 questions.
dim(df)
## [1] 23998   296
#head(df[1:6],5)

Age

# Column 2 holds each respondent's age bracket; row 1 is the question text,
# so it is skipped
df2 <- df[2]
age <- setNames(data.frame(df2[-1, ]), "Age")

Plotting their age

The majority of the respondents were between age 18 to 29

# Horizontal bar chart of respondents per age bracket
ggplot(age, aes(x = Age)) +
  geom_bar() +
  coord_flip() +
  theme_minimal()

### Table of the age and percentage

# Respondents per age bracket with whole-number percentages, largest first
age_count <- age %>%
  count(Age, sort = TRUE) %>%
  mutate(percent = round(n / sum(n) * 100))

knitr::kable(age_count)
Age n percent
18-21 4559 19
25-29 4472 19
22-24 4283 18
30-34 2972 12
35-39 2353 10
40-44 1927 8
45-49 1253 5
50-54 914 4
55-59 611 3
60-69 526 2
70+ 127 1

Gender

# Column 3 holds each respondent's gender; row 1 is the question text,
# so it is skipped
df3 <- df[3]
gender <- setNames(data.frame(df3[-1, ]), "Gender")

Count/percentage of the gender

Three fourths of the respondents were male

# Respondents per gender with whole-number percentages, largest first
gender_count <- gender %>%
  count(Gender, sort = TRUE) %>%
  mutate(percent = round(n / sum(n) * 100))

knitr::kable(gender_count)
Gender n percent
Man 18266 76
Woman 5286 22
Prefer not to say 334 1
Nonbinary 78 0
Prefer to self-describe 33 0

Country

# Column 4 holds each respondent's country; row 1 is the question text,
# so it is skipped
df4 <- df[4]
country <- setNames(data.frame(df4[-1, ]), "Country")

Count/percentage of the country

About a third of the respondents are from India

# Percentages are computed over all countries before keeping the ten largest
country_per <- country %>%
  count(Country, sort = TRUE) %>%
  mutate(percent = round(n / sum(n) * 100)) %>%
  head(n = 10)

knitr::kable(country_per)
Country n percent
India 8792 37
United States of America 2920 12
Other 1430 6
Brazil 833 3
Nigeria 731 3
Pakistan 620 3
Japan 556 2
China 453 2
Egypt 383 2
Mexico 380 2

Question 12

Programming Language

Looking at the most commonly used programming languages (question 12) among the respondents

# Question 12 occupies columns 31 through 45 (one column per language option)
df12 <- df[31:45]

Converting the 15 columns of question 12 into long format

# Reshape the 15 language columns into one "Language" column, one row per
# language a respondent selected (unselected options are NA and are dropped).
# The question-text row (row 1) is removed before pivoting; trimming the first
# 15 pivoted rows afterwards only works if every Q12 column has text in that
# row.
dfq12 <- df12[-1, ] %>%
  pivot_longer(
    cols = everything(),
    names_to = "Question 12",
    values_to = "Language",
    values_drop_na = TRUE
  )

Plotting the info

# Horizontal bar chart of how many respondents selected each language
ggplot(dfq12, aes(x = Language)) +
  geom_bar() +
  coord_flip() +
  theme_minimal()

Tabling the data

# Frequency of each language across all selections
table(dfq12[["Language"]])
## 
##       Bash          C         C#        C++         Go       Java Javascript 
##       1674       3801       1473       4549        322       3862       3489 
##      Julia     MATLAB       None      Other        PHP     Python          R 
##        296       2441        256       1342       1443      18653       4571 
##        SQL 
##       9620

Question 12 with Question 11

Creating a data frame to look at the relationship between years of experience writing code/programming (question 11) and the programming languages used regularly (question 12)

# Keep Q11 (column 30) together with the Q12 language columns (31:45).
# Only row 1 is question text, so drop just that row; starting at row 3
# would also discard the first respondent.
df11 <- df[-1, 30:45]

Cleaning out those who have no experience writing code

# Remove respondents who have never written code (rows with a missing Q11 are
# dropped by filter() as well). Q11 is referenced through filter()'s data mask
# instead of df11$Q11, and the stray group_by() is gone because nothing
# downstream aggregates by group.
df11_clean <- df11 %>%
  filter(Q11 != "I have never written code")

Converting data to long format

# Stack the language columns so each row is one (Q11, Language) pair;
# column 1 (Q11) is kept as the identifier
dfq11 <- pivot_longer(
  df11_clean,
  cols = -1,
  names_to = "Question 12",
  values_to = "Language",
  values_drop_na = TRUE
)

Contingency table of years of experience and programming language

It seems Python is the most used language regardless of years of experience

# Cross-tabulate years of experience (rows) against language (columns)
table(dfq11[["Q11"]], dfq11[["Language"]])
##              
##               Bash    C   C#  C++   Go Java Javascript Julia MATLAB None Other
##   < 1 years     80  703  141  753   25  555        382    28    477  174   170
##   1-3 years    265 1263  312 1555   56 1244       1010    57    742   36   259
##   10-20 years  336  344  244  379   59  421        439    46    232    3   233
##   20+ years    338  412  269  481   55  401        457    58    195    9   268
##   3-5 years    296  655  258  863   61  724        645    46    447   19   183
##   5-10 years   359  424  249  518   66  517        556    61    348   15   229
##              
##                PHP Python    R  SQL
##   < 1 years    150   4528 1000 2039
##   1-3 years    402   5846 1255 2801
##   10-20 years  198   1570  424 1009
##   20+ years    191   1332  442  842
##   3-5 years    273   3088  820 1609
##   5-10 years   229   2289  630 1320

Plotting the data

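It looks like people just starting their data science careers account for more programming-language selections than data scientists who have more than 10 years of experience. Note that the bars show raw counts, so they also reflect how many respondents fall into each experience bracket.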

# Stacked bar chart of language selections per experience bracket.
# The factor levels put the brackets in chronological rather than
# alphabetical order.
dfq11 %>%
  ggplot(aes(
    x = factor(
      Q11,
      levels = c(
        "< 1 years", "1-3 years", "3-5 years",
        "5-10 years", "10-20 years", "20+ years"
      )
    ),
    fill = Language
  )) +
  # alpha must lie in [0, 1]; the previous value of 20 was out of range
  geom_bar(alpha = 0.8) +
  theme_bw() +
  labs(title = "Years of Experience vs Language", x = "Years of Experience")

Salary

Creating a data frame of the compensation

# Column 159 holds the compensation bracket; skip the question-text row and
# discard respondents who did not answer
df_comp <- df[159]
comp <- setNames(na.omit(data.frame(df_comp[-1, ])), "Compensation")

Just wanted to see what it looks like

# Respondents per compensation bracket, smallest group first
arrange(count(comp, Compensation), n)
##        Compensation    n
## 1       >$1,000,000   23
## 2  $500,000-999,999   48
## 3   300,000-499,999   76
## 4   250,000-299,999   78
## 5   200,000-249,999  155
## 6     90,000-99,999  197
## 7     80,000-89,999  222
## 8       4,000-4,999  234
## 9       3,000-3,999  244
## 10  125,000-149,999  269
## 11      2,000-2,999  271
## 12    25,000-29,999  277
## 13    70,000-79,999  289
## 14    15,000-19,999  299
## 15    60,000-69,999  318
## 16    20,000-24,999  337
## 17  150,000-199,999  342
## 18      7,500-9,999  362
## 19    50,000-59,999  366
## 20      5,000-7,499  391
## 21  100,000-124,999  404
## 22    40,000-49,999  421
## 23      1,000-1,999  444
## 24    30,000-39,999  464
## 25    10,000-14,999  493
## 26           $0-999 1112

“Bright Spots”-Top Earners

I created a new data frame filtered to people who made $200,000 or more. Of the 23,997 people surveyed, 380 respondents matched this criterion, which is about 1.6%.

# Respondents whose compensation (Q29) is $200,000 or more. Q29 is referenced
# through filter()'s data mask instead of df$Q29.
df_top <- df %>%
  filter(Q29 %in% c(
    "200,000-249,999",
    "250,000-299,999",
    "300,000-499,999",
    "$500,000-999,999",
    ">$1,000,000"
  ))

# Keep columns 6:25 and 146 as before, plus Q11 and all fifteen Q12 language
# columns (30:45, the same span used for the experience cross-tab). The earlier
# 29:44 was shifted one column to the left, pulling in the column before Q11
# and dropping Q12_15.
df_top <- df_top[, c(6:25, 30:45, 146)]

Instead of looking at each observation individually, I wanted to get a general sense of what these “bright spots” have in common. I dumped all the answers to questions 6 (where they learned to code), 7 (most useful platform), 12 (programming language), and 23 (job title), along with questions 8 (degree) and 11 (years of coding experience), into one data frame and made a word cloud.

# Stack every selected answer of the top earners into a single "Value" column
df_top_long <- pivot_longer(
  df_top,
  cols = everything(),
  names_to = "Question",
  values_to = "Value",
  values_drop_na = TRUE
)

# Tally identical answers, most frequent first
df_top_comp <- count(df_top_long, Value, sort = TRUE)

knitr::kable(head(df_top_comp, n = 10))
Value n
Python 328
Online courses (Coursera, EdX, etc) 229
Coursera 215
SQL 210
Kaggle (notebooks, competitions, etc) 183
Video platforms (YouTube, Twitch, etc) 170
Master’s degree 156
20+ years 146
Other 141
University courses 123

Word Cloud of the answers

# Word cloud of the ten most frequent answers among the top earners
with(
  count(df_top_long, Value),
  wordcloud(
    words = Value,
    freq = n,
    max.words = 10,
    colors = brewer.pal(10, "Spectral"),
    scale = c(2, .5)
  )
)

This next portion is the code of the functions Jacob created. I used it to analyze the programming languages used by different job titles, and programming language in relation to years of experience and degree.

# Row 1 holds the question text, not a response; remove it so the helper
# functions below see respondents only
df <- df[-1, , drop = FALSE]

This function makes a bar chart of the column/question that is input

# Draw a horizontal bar chart of how often each answer was chosen for a
# question whose options are spread across several columns.
#
# og_df:     data frame holding the survey responses
# q_list:    character vector naming the columns that belong to the question
# q_name:    name given to the pivoted column that records the source column
# val_label: name given to the pivoted column of answers; also the axis label
# q_text:    plot title
#
# Returns the ggplot object.
graph_multicolumn_q <- function(og_df, q_list,
                                q_name, val_label,
                                q_text) {
  # Work on the data frame that was passed in (not the global `df`), keep only
  # this question's columns, and stack them into one answer column. Options a
  # respondent did not select are NA and are dropped.
  q_long <- og_df %>%
    select(all_of(q_list)) %>%
    pivot_longer(
      cols = everything(),
      names_to = q_name,
      values_to = val_label,
      values_drop_na = TRUE
    )

  # Order the bars by frequency; after coord_flip() the most common answer
  # ends up at the top of the chart.
  q_long %>%
    ggplot(aes(x = fct_rev(fct_infreq(!!sym(val_label))))) +
    geom_bar() +
    coord_flip() +
    theme_minimal() +
    ggtitle(q_text) +
    xlab(val_label)
}

This function graphs the question against another question

# Draw one bar chart per value of a single-column question (`q_demo`), each
# showing how often the answers to a multi-column question were chosen.
#
# og_df:     data frame holding the survey responses
# q_demo:    name of the column used to split the chart into facets
# q_list:    character vector naming the multi-column question's columns
# q_name:    name given to the pivoted column that records the source column
# val_label: name given to the pivoted column of answers; also the axis label
# q_text:    plot title
#
# Returns the ggplot object.
graph_cross_analysis <- function(og_df, q_demo, q_list, q_name, val_label, q_text) {
  # q_demo comes first so that `cols = -1` pivots everything except it
  keep_cols <- c(q_demo, q_list)
  long_df <- pivot_longer(
    select(og_df, all_of(keep_cols)),
    cols = -1,
    names_to = q_name,
    values_to = val_label,
    values_drop_na = TRUE
  )

  # Rows where any column (including q_demo) is missing are left out
  has_all_values <- complete.cases(long_df)
  plot_df <- long_df[has_all_values, ]

  ggplot(
    plot_df,
    aes(
      x = fct_rev(fct_infreq(!!sym(val_label))),
      fill = as.factor(!!sym(q_demo))
    )
  ) +
    geom_bar() +
    coord_flip() +
    facet_wrap(q_demo) +
    labs(y = "Count", x = val_label) +
    ggtitle(q_text) +
    theme(
      axis.text.x = element_text(size = 7),
      axis.text.y = element_text(size = 7),
      legend.position = "none"
    )
}

Creating a vector of the columns I want for question 12

# Column names Q12_1 ... Q12_15, one per language option of question 12
q12_cols <- paste0("Q12_", 1:15)

Inputting this into the function to create a bar graph

# Bar chart of the languages respondents use regularly
graph_multicolumn_q(
  og_df = df,
  q_list = q12_cols,
  q_name = "Question 12",
  val_label = "Language",
  q_text = "Regularly Used Programming Language"
)

Crossing Language with Years of experience

# Language usage faceted by Q11
graph_cross_analysis(
  df,
  q_demo = "Q11",
  q_list = q12_cols,
  q_name = "Question 12",
  val_label = "Language",
  q_text = "Language and Years of Experience"
)

Crossing Language with Degree

# Language usage faceted by Q8
graph_cross_analysis(
  df,
  q_demo = "Q8",
  q_list = q12_cols,
  q_name = "Question 12",
  val_label = "Language",
  q_text = "Language and Degree"
)

Crossing Language with Job title

# Language usage faceted by Q23
graph_cross_analysis(
  df,
  q_demo = "Q23",
  q_list = q12_cols,
  q_name = "Question 12",
  val_label = "Language",
  q_text = "Language and Job Title"
)