Project_3_Database

Method

1. Load the dataset from GitHub

survey_raw <- read_csv("https://raw.githubusercontent.com/catfoodlover/msdsrepo/main/Project_3_607/kaggle-survey-2018/multipleChoiceResponses.csv")

2. Filter out rows not of interest

filter down to just participants who identify as Data Scientists
filter down to just participants within the United States of America

survey_df<-filter(survey_raw,survey_raw$Q6=="Data Scientist")
survey_df <- survey_df %>% filter(Q3 %in% c("United States of America"))

3. Create participant_info table

create id column
select rows corresponding to Gender, Age, Country, and Degree Name
create id column for age and degree

survey_df <- survey_df %>% mutate(participant_id = row_number())
# Split data up into tables
# participant info
participant_info <-
  survey_df %>% select(
    participant_id,
    gender = Q1,
    age_group_name = Q2,
    country = Q3,
    degree_name = Q4
  ) %>% select(participant_id, gender, age_group_name, country, degree_name)

participant_info <- add_group_indices(participant_info, c('age_group_name'), 'age_group_id')

participant_info <- add_group_indices(participant_info, c('degree_name'), 'degree_id')

4. Create age_group table

split of age_group_id and age_group_name from participant_info and keep distinct rows

# Create the age_group table
age_group <- participant_info %>% select(age_group_id, age_group_name) %>% distinct()

5. Create education_degree table

split of degree_id and degree_name from participant_info and keep distinct rows

# Create the education_degree table
education_degree <- participant_info %>% select(degree_id, degree_name) %>% distinct()

6. Create final participant_info table

select just the required rows

# Reduce participant_info down to just the required fields
participant_info <- participant_info %>% select(participant_id, gender, age_group_id, country, highest_education = degree_name)

7. Create participant_job_activity

select participant_id and all columns for Question 11
convert data from wide to long
filter out NA values and rename columns

# create participant_job_activity

participant_job_activity <-
  survey_df %>% select(participant_id, starts_with("Q11")) %>% melt(
    id.var = c("participant_id"),
    measure.vars = c(
      "Q11_Part_1",
      "Q11_Part_2",
      "Q11_Part_3",
      "Q11_Part_4",
      "Q11_Part_5",
      "Q11_Part_6",
      "Q11_Part_7"
    )
  ) %>% filter(!is.na(value)) %>% select(participant_id, activity_description = value)

8. Create participant_job table

Select participant_id and columns related to Undergraduate Major, Job Title, Industry, and Compensation

#Create participant_job table
participant_job <-
  survey_df %>% select(
    pt_job_id = participant_id,
    under_graduate_major = Q5,
    job_title = Q6,
    industry = Q7,
    compensation = Q9
  )

9. Create software table

select participant id and columns related to analysis tools, IDE and programming languages
convert columns from wide to long
create a column for participant_id, software, software_type

# Create software table
software <-
  survey_df %>% select(
    software_id = participant_id,
    analysis_tool = starts_with("Q12"),
    ide = starts_with("Q13"),
    language = starts_with("Q16")
  ) %>% melt(id.vars = c("software_id"), mearsure.vars = c("analysis_tool", "ide", "language")) %>%
  mutate(variable = gsub('[1-9]', '', variable)) %>% filter(!is.na(value)&value != '-1'&value != 'Other')

10. Create a software_type table

group by software type and create an id column called software_type
split the columns of interest off from the software table and take distinct rows

software <- add_group_indices(software, c('variable'), 'software_type')

software_type <- software %>% select(software_type_id = software_type, software_type_name = variable) %>% distinct()

software <- software %>% select(software_id, software_type,  software_name = value) %>% distinct()

10. Save tables out to bit.io

Rather than using CREATE and INSERT statements, we are saving out the tables as csv and using the bit.io GUI to load the tables

# Save tables to load through the bit.io gui
write = FALSE

if (write == TRUE){
write.csv2(software, "/Users/williamaiken/Data607/software.csv", row.names = FALSE)
write.csv2(software_type, "/Users/williamaiken/Data607/software_type.csv", row.names = FALSE)
write.csv2(participant_job, "/Users/williamaiken/Data607/participant_job.csv", row.names = FALSE)
write.csv2(participant_job_activity, "/Users/williamaiken/Data607/participant_job_activity.csv", row.names = FALSE)
write.csv2(participant_info, "/Users/williamaiken/Data607/participant_info.csv", row.names = FALSE)
write.csv2(education_degree, "/Users/williamaiken/Data607/education_degree.csv", row.names = FALSE)
write.csv2(age_group, "/Users/williamaiken/Data607/age_group.csv", row.names = FALSE)
}

participant_id	gender	industry	compensation
1	Female	I am a student	0-10,000
2	Male	Other	125-150,000
3	Male	Medical/Pharmaceutical	125-150,000
4	Female	Non-profit/Service	125-150,000
5	Male	Online Service/Internet-based Services	125-150,000
6	Male	Insurance/Risk Assessment	125-150,000

Project_3_Database

by Team 2 (William Aiken, Bikram Barua, Catherine Cho, Eric Lehmphul, Nnaemeka Newman Okereafor)

10/16/2021

Introduction

This is where we save parse the Kaggle survey dataset into the fields of interests. And save those files out as normalized tables.

Method

1. Load the dataset from GitHub

2. Filter out rows not of interest

3. Create participant_info table

4. Create age_group table

5. Create education_degree table

6. Create final participant_info table

7. Create participant_job_activity

8. Create participant_job table

9. Create software table

10. Create a software_type table

10. Save tables out to bit.io

Results

We can now connect to bit.io and query our normalized tables