Code
::opts_chunk$set(echo = TRUE)
knitrlibrary(tidyverse)
library(ggplot2)
library(patchwork)
library(ggpubr)
# creates a file with the bibtex for packages used:
::write_bib(c(.packages(),
knitr"knitr", "rmarkdown"), "refs/packages.bib")
::opts_chunk$set(echo = TRUE)
knitrlibrary(tidyverse)
library(ggplot2)
library(patchwork)
library(ggpubr)
# creates a file with the bibtex for packages used:
::write_bib(c(.packages(),
knitr"knitr", "rmarkdown"), "refs/packages.bib")
This assignment is designed and answered based on the survey data we collected in class DATA2x02 students (2024).
Data wrangling and information see Section 2, you could also download the clean data set at Section 3
Note: Gratefully adopted Tarr (2024) ’s Data Cleaning styles
library(tidyverse)
library(gendercoder)
library(janitor)
library(hms)
theme_set(theme_bw())
= readxl::read_excel("DATA2x02_survey_2024_Responses.xlsx")
x
= colnames(x)
old_names
= c(
new_names "timestamp",
"target_grade",
"assignment_preference",
"trimester_or_semester",
"age",
"tendency_yes_or_no",
"pay_rent",
"urinal_choice",
"stall_choice",
"weetbix_count",
"weekly_food_spend",
"living_arrangements",
"weekly_alcohol",
"believe_in_aliens",
"height",
"commute",
"daily_anxiety_frequency",
"weekly_study_hours",
"work_status",
"social_media",
"gender",
"average_daily_sleep",
"usual_bedtime",
"sleep_schedule",
"sibling_count",
"allergy_count",
"diet_style",
"random_number",
"favourite_number",
"favourite_letter",
"drivers_license",
"relationship_status",
"daily_short_video_time",
"computer_os",
"steak_preference",
"dominant_hand",
"enrolled_unit",
"weekly_exercise_hours",
"weekly_paid_work_hours",
"assignments_on_time",
"used_r_before",
"team_role_type",
"university_year",
"favourite_anime",
"fluent_languages",
"readable_languages",
"country_of_birth",
"wam",
"shoe_size")
# Overwrite the old names:
colnames(x) = new_names
# Combine names
= bind_cols(Variable_Names = new_names, Explanations = old_names)
name_combo |> gt::gt() name_combo
Variable_Names | Explanations |
---|---|
timestamp | Timestamp |
target_grade | What final grade are you aiming to achieve in DATA2x02? |
assignment_preference | When it comes to assignments / due tasks do you: |
trimester_or_semester | Would you prefer to have a trimester system (3 main teaching sessions per year) or stick with the existing semester system (2 main teaching sessions per year)? |
age | How old are you? |
tendency_yes_or_no | Do you tend to lean towards saying "yes" or towards saying "no" to things throughout life? |
pay_rent | Do you pay rent? |
urinal_choice | You enter a public bathroom and find you're the only one there. There are three urinals on the wall for you to choose from. Which do you choose? |
stall_choice | You enter a public bathroom and there are three stalls to choose from. All three are unoccupied. Which do you choose? |
weetbix_count | How many Weet-Bix would you typically eat in one sitting? |
weekly_food_spend | What is the average amount of money you spend each week on food/beverages? |
living_arrangements | What are your current living arrangements? |
weekly_alcohol | How much alcohol do you consume each week? |
believe_in_aliens | Do you believe in the existence of aliens (Extraterrestrial Life)? |
height | How tall are you? |
commute | How do you get to university? |
daily_anxiety_frequency | How often would you say you feel anxious on a daily basis? |
weekly_study_hours | How many hours a week do you spend studying? |
work_status | Do you work? |
social_media | What is your favourite social media platform? |
gender | What is your gender? |
average_daily_sleep | How much sleep do you get (on avg, per day)? |
usual_bedtime | What time do you usually go to bed? |
sleep_schedule | How consistent would you rate your sleep schedule? |
sibling_count | How many siblings do you have? |
allergy_count | How many allergies do you know you have? |
diet_style | What is your diet style? |
random_number | Pick a number at random between 1 and 10 (inclusive) |
favourite_number | What is your favourite number (between 1 and 10 inclusive) |
favourite_letter | What is your favourite letter from the English alphabet? |
drivers_license | Do you have a driver's license? |
relationship_status | Do you currently have a partner? |
daily_short_video_time | How much time do you spend on short video apps (like tiktok or reels) every day? |
computer_os | What computer OS (operating system) are you currently using? |
steak_preference | How do you like your steak cooked? |
dominant_hand | What is your dominant hand? |
enrolled_unit | Which unit are you enrolled in? |
weekly_exercise_hours | On average, how many hours each week do you spend exercising? |
weekly_paid_work_hours | How many hours a week (on average) do you work in paid employment? |
assignments_on_time | Do you submit assignments on time? |
used_r_before | Have you ever used R before starting DATA2x02? |
team_role_type | What kind of role (active or passive) do you think you are when working as part of a team? |
university_year | Which year of university are you currently in? |
favourite_anime | What is your favourite anime? |
fluent_languages | How many languages can you speak fluently? |
readable_languages | How many languages can you read? |
country_of_birth | What country were you born in? |
wam | What is your WAM (weighted average mark)? |
shoe_size | What is your shoe size? |
Cleaning steps see below, you could download the clean data for further analysis. Download CSV Here
# Step 2: Select relevant columns and remove rows with missing values in any selected column (including enrolled_unit)
<- x %>%
df_cleaned select(target_grade, assignment_preference, weekly_alcohol, daily_anxiety_frequency,
weekly_study_hours, work_status, average_daily_sleep, sleep_schedule,
diet_style, relationship_status, enrolled_unit, weekly_exercise_hours, %>%
weekly_paid_work_hours, assignments_on_time, team_role_type, wam) drop_na() # Remove rows with missing values in any selected column
# Step 3: Clean `diet_style`
<- df_cleaned %>%
df_cleaned # Delete rows where diet_style is 'no beef'
filter(diet_style != "no beef") %>%
# Reclassify 'diet_style' values
mutate(diet_style = case_when(
%in% c("normal", "Normal", "NA", "None") ~ "Omnivorous",
diet_style TRUE ~ diet_style # Retain all other values as they are
))
# Step 4: Clean `relationship_status`
<- df_cleaned %>%
df_cleaned # Delete rows where relationship_status is 'Tears'
filter(relationship_status != "Tears") %>%
# Reclassify 'relationship_status' values
mutate(relationship_status = case_when(
== "Multiple" ~ "Yes", # Convert 'Multiple' to 'Yes'
relationship_status == "No but I would like to have one, please" ~ "No", # Convert this to 'No'
relationship_status TRUE ~ relationship_status # Retain all other values as they are
))
# Step 5: Clean the 'weekly_study_hours', 'weekly_exercise_hours', and 'wam'
<- df_cleaned %>%
df_cleaned # Convert 'weekly_study_hours' to numeric and remove outliers
mutate(weekly_study_hours = as.numeric(weekly_study_hours),
weekly_study_hours = ifelse(weekly_study_hours < 0 | weekly_study_hours > 100, NA, weekly_study_hours)) %>%
drop_na(weekly_study_hours) # Remove rows with invalid study hours
# Step 6: Clean 'average_daily_sleep' with custom rules
<- df_cleaned %>%
df_cleaned mutate(average_daily_sleep = as.numeric(str_replace_all(average_daily_sleep, "[-~]", "-")),
average_daily_sleep = ifelse(grepl(":", average_daily_sleep),
as.numeric(sub(":", ".", average_daily_sleep)), average_daily_sleep),
average_daily_sleep = ifelse(grepl("-", average_daily_sleep),
rowMeans(sapply(str_split(average_daily_sleep, "-"), as.numeric)),
average_daily_sleep),average_daily_sleep = as.numeric(average_daily_sleep),
average_daily_sleep = ifelse(average_daily_sleep < 0 | average_daily_sleep > 15, NA, average_daily_sleep)) %>%
drop_na(average_daily_sleep)
# Step 7: Clean 'weekly_exercise_hours' and 'weekly_paid_work_hours'
<- df_cleaned %>%
df_cleaned mutate(weekly_exercise_hours = as.numeric(weekly_exercise_hours),
weekly_exercise_hours = ifelse(weekly_exercise_hours < 0 | weekly_exercise_hours > 30 | weekly_exercise_hours == 900,
NA, weekly_exercise_hours),
weekly_paid_work_hours = as.numeric(weekly_paid_work_hours),
weekly_paid_work_hours = ifelse(weekly_paid_work_hours < 0 | weekly_paid_work_hours > 30, NA, weekly_paid_work_hours)) %>%
drop_na(weekly_exercise_hours, weekly_paid_work_hours)
# Step 8: Clean 'wam'
<- df_cleaned %>%
df_cleaned mutate(wam = as.numeric(wam),
wam = ifelse(wam < 0 | wam > 100, NA, wam)) %>%
drop_na(wam)
# Step 9: Create 'weekly_exercise_level' categorical variable
<- df_cleaned %>%
df_cleaned mutate(weekly_exercise_level = case_when(
< 3 ~ "Low Intensity",
weekly_exercise_hours >= 3 & weekly_exercise_hours <= 8 ~ "Moderate Intensity",
weekly_exercise_hours > 8 ~ "High Intensity"
weekly_exercise_hours
))
# Step 10: Create 'wam_level' categorical variable
<- df_cleaned %>%
df_cleaned mutate(wam_level = case_when(
<= 50 ~ "Fail",
wam > 50 & wam <= 64 ~ "Pass",
wam > 64 & wam <= 74 ~ "Credit",
wam > 74 & wam <= 84 ~ "Distinction",
wam > 84 ~ "High Distinction"
wam
))
# Step 11: Write the cleaned data to a new CSV file (ensuring row names are not saved)
write.csv(df_cleaned, "cleaned_data.csv", row.names = FALSE)
# Print a message indicating the completion of the CSV creation
#print("Cleaned CSV file created successfully: cleaned_data.csv")