# Import Libraries
import os
import pandas as pd
import numpy as np
from datetime import datetime as dt
# Define Function to Parse Data
def parse_data(data_folder, filename):
data = pd.read_csv(data_folder + filename)
columns = ['respondent_id', 'study_id', 'date', 'sex', 'number of children', 'q1', 'q2', 'q3_1', 'q3_2', 'q3_3', 'q3_4', 'q3_5', 'q3_6', 'q4', 'q5']
compiled_study = pd.DataFrame()
# Assuming All Data Files in Data Folder are Named by the Convention 'study_###'
study_id = filename.split('_')[1].replace('.csv', '')
for index, row_df in data.iterrows():
# Preset 'study_id' Column from Data File Name and Preset Rows with -1
new_df = pd.DataFrame(-1, index = ['0'], columns = columns)
new_df['study_id'] = study_id
# Add Data into New Data Frame
for data_column in row_df.keys():
data_column_l = data_column.lower()
if data_column_l not in columns and data_column_l != 'q3':
print('Error: Column Not Supported {}'.format(data_column))
continue
# Edge Case for Q5
if data_column_l == 'q5' and study_id == '101':
if row_df[data_column] == 2:
row_df[data_column] = 1
elif row_df[data_column] == 1:
row_df[data_column] = 0
# Edge Case for Q3
if data_column_l == 'q3':
# Extract Responses to Q3
values = row_df[data_column].split(';')
for value in values:
# Set Matching Responses to 1 in New Columns
new_df['q3_' + value.strip()] = 1
# Format Date to DAY-MONTH-YEAR (DD-MM-YYYY)
if data_column_l == 'date':
date = row_df[data_column]
if '-' in date:
# Convert YEAR-MONTH-DAY to DAY-MONTH-YEAR
date = dt.strptime(date, '%Y-%m-%d').strftime('%d-%m-%Y')
elif '/' in date:
# Convert DAY/MONTH/YEAR to DAY-MONTH-YEAR
date = dt.strptime(date, '%d/%m/%Y').strftime('%d-%m-%Y')
new_df[data_column_l] = date
# Add Remaining Columns into New Data Frame
else:
new_df[data_column_l] = row_df[data_column]
# Replace NULL Values with NaN or 0
cols_nan = ['number of children', 'q4']
new_df[cols_nan] = new_df[cols_nan].replace({-1: np.nan})
new_df[cols_nan] = new_df[cols_nan].astype('Int64')
cols_zero = ['q3_1', 'q3_2', 'q3_3', 'q3_4', 'q3_5', 'q3_6']
new_df[cols_zero] = new_df[cols_zero].replace({-1: 0})
new_df = new_df.drop(['q3'], axis = 1)
# Select Columns for a Multi-Level Index
idx = ['respondent_id', 'study_id', 'date', 'sex', 'number of children']
# Pivot Data based on Multi-Level Index
multi_indexed_df = new_df.set_index(idx)
# Stack Columns containing Responses
stacked_df = multi_indexed_df.stack(dropna = False)
# Reset Multi-Level Index
final_df = stacked_df.reset_index()
# Reformat Stacked Columns
final_df.rename(columns = {'level_5': 'question_id', 0: 'response'}, inplace = True)
final_df['number of children'] = final_df['number of children'].astype('Int64')
# Compile Data for Study
compiled_study = compiled_study.append(final_df, sort = False)
return compiled_study
# Define Main Function
def main():
data_folder = 'C:/Users/chenl/Desktop/data/'
data = pd.DataFrame()
# Read All Data Files in Data Folder
for filename in os.listdir(data_folder):
# Assuming All Data Files in Data Folder are Named by the Convention 'study_###'
study_id = filename.split('_')[1].replace('.csv', '')
data = data.append(parse_data(data_folder, filename))
data.index = range(1, len(data) + 1)
data.index.name = 'id'
print(data)
data.to_csv('C:/Users/chenl/Desktop/CLH_data/CLH_master_table.csv')
if __name__ == '__main__':
main()
# Create Respondents Table
data = pd.read_csv('C:/Users/chenl/Desktop/CLH_data/CLH_master_table.csv')
respondents = data[['respondent_id', 'study_id', 'date', 'sex', 'number of children']]
respondents = respondents.drop_duplicates()
respondents.index = range(1, len(respondents) + 1)
respondents.index.name = 'id'
respondents['number of children'] = respondents['number of children'].astype('Int64')
print(respondents)
respondents.to_csv('C:/Users/chenl/Desktop/CLH_data/CLH_respondents_table.csv')
# Create Responses Table
data = pd.read_csv('C:/Users/chenl/Desktop/CLH_data/CLH_master_table.csv')
responses = data[['respondent_id', 'study_id', 'question_id', 'response']]
responses = responses.drop_duplicates()
responses.index = range(1, len(responses) + 1)
responses.index.name = 'id'
responses['response'] = responses['response'].astype('Int64')
print(responses)
responses.to_csv('C:/Users/chenl/Desktop/CLH_data/CLH_responses_table.csv')