Method
1. Load the dataset from GitHub
survey_raw <- read_csv("https://raw.githubusercontent.com/catfoodlover/msdsrepo/main/Project_3_607/kaggle-survey-2018/multipleChoiceResponses.csv")
2. Filter out rows not of interest
- filter down to just participants who identify as Data Scientists
- filter down to just participants within the United States of America
survey_df<-filter(survey_raw,survey_raw$Q6=="Data Scientist")
survey_df <- survey_df %>% filter(Q3 %in% c("United States of America"))
3. Create participant_info table
- create id column
- select rows corresponding to Gender, Age, Country, and Degree Name
- create id column for age and degree
survey_df <- survey_df %>% mutate(participant_id = row_number())
# Split data up into tables
# participant info
participant_info <-
survey_df %>% select(
participant_id,
gender = Q1,
age_group_name = Q2,
country = Q3,
degree_name = Q4
) %>% select(participant_id, gender, age_group_name, country, degree_name)
participant_info <- add_group_indices(participant_info, c('age_group_name'), 'age_group_id')
participant_info <- add_group_indices(participant_info, c('degree_name'), 'degree_id')
4. Create age_group table
- split of age_group_id and age_group_name from participant_info and keep distinct rows
# Create the age_group table
age_group <- participant_info %>% select(age_group_id, age_group_name) %>% distinct()
5. Create education_degree table
- split of degree_id and degree_name from participant_info and keep distinct rows
# Create the education_degree table
education_degree <- participant_info %>% select(degree_id, degree_name) %>% distinct()
6. Create final participant_info table
- select just the required rows
# Reduce participant_info down to just the required fields
participant_info <- participant_info %>% select(participant_id, gender, age_group_id, country, highest_education = degree_name)
7. Create participant_job_activity
- select participant_id and all columns for Question 11
- convert data from wide to long
- filter out NA values and rename columns
# create participant_job_activity
participant_job_activity <-
survey_df %>% select(participant_id, starts_with("Q11")) %>% melt(
id.var = c("participant_id"),
measure.vars = c(
"Q11_Part_1",
"Q11_Part_2",
"Q11_Part_3",
"Q11_Part_4",
"Q11_Part_5",
"Q11_Part_6",
"Q11_Part_7"
)
) %>% filter(!is.na(value)) %>% select(participant_id, activity_description = value)
8. Create participant_job table
- Select participant_id and columns related to Undergraduate Major, Job Title, Industry, and Compensation
#Create participant_job table
participant_job <-
survey_df %>% select(
pt_job_id = participant_id,
under_graduate_major = Q5,
job_title = Q6,
industry = Q7,
compensation = Q9
)
9. Create software table
- select participant id and columns related to analysis tools, IDE and programming languages
- convert columns from wide to long
- create a column for participant_id, software, software_type
# Create software table
software <-
survey_df %>% select(
software_id = participant_id,
analysis_tool = starts_with("Q12"),
ide = starts_with("Q13"),
language = starts_with("Q16")
) %>% melt(id.vars = c("software_id"), mearsure.vars = c("analysis_tool", "ide", "language")) %>%
mutate(variable = gsub('[1-9]', '', variable)) %>% filter(!is.na(value)&value != '-1'&value != 'Other')
10. Create a software_type table
- group by software type and create an id column called software_type
- split the columns of interest off from the software table and take distinct rows
software <- add_group_indices(software, c('variable'), 'software_type')
software_type <- software %>% select(software_type_id = software_type, software_type_name = variable) %>% distinct()
software <- software %>% select(software_id, software_type, software_name = value) %>% distinct()
10. Save tables out to bit.io
- Rather than using CREATE and INSERT statements, we are saving out the tables as csv and using the bit.io GUI to load the tables
# Save tables to load through the bit.io gui
write = FALSE
if (write == TRUE){
write.csv2(software, "/Users/williamaiken/Data607/software.csv", row.names = FALSE)
write.csv2(software_type, "/Users/williamaiken/Data607/software_type.csv", row.names = FALSE)
write.csv2(participant_job, "/Users/williamaiken/Data607/participant_job.csv", row.names = FALSE)
write.csv2(participant_job_activity, "/Users/williamaiken/Data607/participant_job_activity.csv", row.names = FALSE)
write.csv2(participant_info, "/Users/williamaiken/Data607/participant_info.csv", row.names = FALSE)
write.csv2(education_degree, "/Users/williamaiken/Data607/education_degree.csv", row.names = FALSE)
write.csv2(age_group, "/Users/williamaiken/Data607/age_group.csv", row.names = FALSE)
}
Results
We can now connect to bit.io and query our normalized tables
bit.io Project 3 tables
- Here is a sample query where we are join the participant_info table to the participant_job table and selecting the participant_id, gender, industry and compensation
#password <- key_get("bit.io", "catfoodlover_demo_db_connection")
Sys.setenv(PGGSSENCMODE="disable")
con <- dbConnect(RPostgres::Postgres(), dbname = 'bitdotio',
host = 'db.bit.io',
port = 5432,
user = 'bitdotio',
password = 'Dukn_BBrDekxNmUvd4R4XfnvAQuk')
d <- dbGetQuery(con, 'SELECT * FROM "catfoodlover/Project 3"."participant_info" AS info
LEFT JOIN "catfoodlover/Project 3"."participant_job" AS job
ON info.participant_id = job.pt_job_id;')
d %>% select(participant_id, gender, industry, compensation) %>% head() %>% kbl() %>% kable_styling()
|
participant_id
|
gender
|
industry
|
compensation
|
|
1
|
Female
|
I am a student
|
0-10,000
|
|
2
|
Male
|
Other
|
125-150,000
|
|
3
|
Male
|
Medical/Pharmaceutical
|
125-150,000
|
|
4
|
Female
|
Non-profit/Service
|
125-150,000
|
|
5
|
Male
|
Online Service/Internet-based Services
|
125-150,000
|
|
6
|
Male
|
Insurance/Risk Assessment
|
125-150,000
|