DATA1001 Project Template

Author

SIDs xxx

Executive Summary (Max 50 words)

This is effectively your “Executive Summary” with a concrete insight or action. Type your recommendation here.

Evidence

EDA (Max 200 words)

Code

# =============================
# Step 0: Load Packages
# =============================
library(tidyverse)   # For wrangling and plotting (also provides read_csv)
library(naniar)      # For missing-value visualisation (gg_miss_var)

# =============================
# Step 1: Import Data
# =============================
# Replace the path if your CSV is in another folder.
# read_csv() returns a tibble; the column types it guessed are stored in the
# "spec" attribute, which str() prints below.
data <- read_csv("data1001_survey_data_2025_S2-1.csv")

# Check the structure of the data (dimensions, column names and types)
str(data)

spc_tbl_ [2,955 × 28] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ cohort                 : num [1:2955] 202400 202400 202400 202400 202400 ...
 $ consent                : chr [1:2955] "I DO NOT consent to take part in the study" "I consent to take part in the study" "I consent to take part in the study" "I consent to take part in the study" ...
 $ age                    : num [1:2955] NA 18 19 90 8 NA 21 NA 20 25 ...
 $ gender                 : chr [1:2955] NA "Female" "Female" "Female" ...
 $ country_of_birth       : chr [1:2955] NA "Australia" "Other Please Specify" "Other Please Specify" ...
 $ country_of_birth_5_TEXT: chr [1:2955] NA NA NA "somoa" ...
 $ hours_work             : num [1:2955] NA 40 40 38.5 10 NA 0 NA 1 12 ...
 $ social_media_use       : num [1:2955] NA 4 200 2 10 NA 2 NA 10 2 ...
 $ rent                   : num [1:2955] NA 400 201 1000 500 ...
 $ friends_count          : num [1:2955] NA 2 0 100 1 NA 10 NA 20 8 ...
 $ stress                 : num [1:2955] NA 10 1 5 7 NA 3 NA 10 6 ...
 $ highest_speed          : num [1:2955] NA 150 -800 140 200 NA 80 NA 1000 280 ...
 $ relationship_status    : chr [1:2955] NA "In a relationship" "Its complicated" "Its complicated" ...
 $ dates                  : num [1:2955] NA NA NA NA NA NA NA NA NA NA ...
 $ standard_drinks        : num [1:2955] NA 6 4.3 0 0 NA 0 NA 7 12 ...
 $ countries              : num [1:2955] NA NA NA NA NA NA NA NA NA NA ...
 $ drug_use_q             : chr [1:2955] NA "Have you ever used recreational drugs?" "Have you ever gotten high off recreational drugs?" "Have you ever used recreational drugs?" ...
 $ drug_use_ans           : chr [1:2955] NA "Yes" "Prefer not to say" "No" ...
 $ student_type           : chr [1:2955] NA "International" "International" "Domestic" ...
 $ mainstream_advanced    : chr [1:2955] NA "DATA1001" "DATA1901" "DATA1001" ...
 $ semesters              : num [1:2955] NA 4 8.3 0 5 NA 3 NA 3 15 ...
 $ commute                : num [1:2955] NA 70 5 60 60 NA 10 NA 30 30 ...
 $ data_interest          : num [1:2955] NA 9 4 7 6 NA 7 NA 5 10 ...
 $ mark_goal              : num [1:2955] NA 50 50 0 90 NA 88 NA 10 8 ...
 $ hours_studying         : num [1:2955] NA 5 25 10 3 NA 5 NA 1 9 ...
 $ lecture_mode           : chr [1:2955] NA "Live in the Lecture Theatre" "Other" "Other" ...
 $ study_type             : chr [1:2955] NA "I work steadily all semester" "It changes depending on the subject" "I work steadily all semester" ...
 $ learner_style          : chr [1:2955] NA "Style 1" "Style 3" "Style 1" ...
 - attr(*, "spec")=
  .. cols(
  ..   cohort = col_double(),
  ..   consent = col_character(),
  ..   age = col_double(),
  ..   gender = col_character(),
  ..   country_of_birth = col_character(),
  ..   country_of_birth_5_TEXT = col_character(),
  ..   hours_work = col_double(),
  ..   social_media_use = col_double(),
  ..   rent = col_double(),
  ..   friends_count = col_double(),
  ..   stress = col_double(),
  ..   highest_speed = col_double(),
  ..   relationship_status = col_character(),
  ..   dates = col_double(),
  ..   standard_drinks = col_double(),
  ..   countries = col_double(),
  ..   drug_use_q = col_character(),
  ..   drug_use_ans = col_character(),
  ..   student_type = col_character(),
  ..   mainstream_advanced = col_character(),
  ..   semesters = col_double(),
  ..   commute = col_double(),
  ..   data_interest = col_double(),
  ..   mark_goal = col_double(),
  ..   hours_studying = col_double(),
  ..   lecture_mode = col_character(),
  ..   study_type = col_character(),
  ..   learner_style = col_character()
  .. )
 - attr(*, "problems")=<externalptr>

Code

# Count the missing values (NA) in each column
data %>%
  is.na() %>%
  colSums()

                 cohort                 consent                     age 
                      0                       0                     113 
                 gender        country_of_birth country_of_birth_5_TEXT 
                    113                     113                    2203 
             hours_work        social_media_use                    rent 
                    113                     113                     113 
          friends_count                  stress           highest_speed 
                    113                     113                     113 
    relationship_status                   dates         standard_drinks 
                    113                     134                     113 
              countries              drug_use_q            drug_use_ans 
                    134                     114                     114 
           student_type     mainstream_advanced               semesters 
                    113                     113                     113 
                commute           data_interest               mark_goal 
                    113                     113                     113 
         hours_studying            lecture_mode              study_type 
                    113                     113                     113 
          learner_style 
                    113

Code

# Plot the amount of missing data in each variable (naniar)
data %>%
  gg_miss_var()

Code

# Numerical summaries (quartiles, mean and NA count) for key numeric variables
data %>%
  select(age, hours_work, friends_count, stress, hours_studying, mark_goal) %>%
  summary()

      age         hours_work      friends_count          stress      
 Min.   : 0.0   Min.   :   0.00   Min.   :       0   Min.   : 0.000  
 1st Qu.:18.0   1st Qu.:   0.00   1st Qu.:       3   1st Qu.: 3.000  
 Median :18.0   Median :   2.00   Median :       6   Median : 5.000  
 Mean   :19.3   Mean   :  12.56   Mean   :   35201   Mean   : 5.141  
 3rd Qu.:19.0   3rd Qu.:  14.00   3rd Qu.:      10   3rd Qu.: 7.000  
 Max.   :90.0   Max.   :3333.00   Max.   :99999999   Max.   :10.000  
 NA's   :113    NA's   :113       NA's   :113        NA's   :113     
 hours_studying    mark_goal     
 Min.   : 0.00   Min.   :  0.00  
 1st Qu.: 3.00   1st Qu.: 75.00  
 Median : 4.00   Median : 85.00  
 Mean   : 5.21   Mean   : 81.11  
 3rd Qu.: 6.00   3rd Qu.: 90.00  
 Max.   :60.00   Max.   :100.00  
 NA's   :113     NA's   :113

Code

# Categorical summaries
# Frequency of each gender response; an NA count is shown only if NAs exist
table(data[["gender"]], useNA = "ifany")


                   Female                      Male Non-binary / third gender 
                     1560                      1216                        39 
        Prefer not to say                      <NA> 
                       27                       113

Code

# Frequency of each student type; an NA count is shown only if NAs exist
table(data[["student_type"]], useNA = "ifany")


     Domestic International          <NA> 
         1670          1172           113

Code

# =============================
# Step 5: Visualisations
# =============================

# Histogram of Age
# Rows with a missing age are dropped up front so ggplot2 does not have to
# remove them (with a warning) while drawing.
# NOTE(review): the numerical summary reports ages from 0 to 90; consider
# whether implausible values should be filtered before interpreting the plot.
data %>%
  drop_na(age) %>%
  ggplot(aes(x = age)) +
  geom_histogram(binwidth = 2, fill = "steelblue", colour = "black") +
  labs(
    title = "Age Distribution of Students",
    x = "Age",
    y = "Number of Students"
  )

Code

# Boxplot of Study Hours by Student Type
# Rows missing either variable are dropped first so that missing student
# types are not plotted as their own category and ggplot2 has no rows to
# remove (with a warning) while drawing.
data %>%
  drop_na(student_type, hours_studying) %>%
  ggplot(aes(x = student_type, y = hours_studying)) +
  geom_boxplot(fill = "orange") +
  labs(
    title = "Study Hours by Student Type",
    x = "Student Type",
    y = "Hours Studying per Week"
  )

Code

# Scatterplot: Study Hours vs Mark Goal
# Rows missing either variable are dropped first so ggplot2 has no rows to
# remove (with a warning) while drawing.
data %>%
  drop_na(hours_studying, mark_goal) %>%
  ggplot(aes(x = hours_studying, y = mark_goal)) +
  geom_point(alpha = 0.5, colour = "darkblue") +
  # Spell out the model formula: y ~ x is what method = "lm" uses by default,
  # so the fitted line is unchanged, but ggplot2 no longer prints a message.
  geom_smooth(method = "lm", formula = y ~ x, colour = "red") +
  labs(
    title = "Relationship between Study Hours and Mark Goal",
    x = "Hours Studying per Week",
    y = "Mark Goal (%)"
  )

Discuss the data and consider the following:

Source
Structure
Limitations
Assumptions
Data cleaning

Research Question 1 (Max 350 words for both RQ1 + RQ2 combined)

Type your research question evidence here.

Code

# Put the code that produces your graphs inside code chunks like this one;
# whatever it outputs is displayed in your html.

sum(2, 2)

[1] 4

Research Question 2 (Linear Model)

Type your research question evidence here. Include testing assumptions of the linear model (residual plot).

Articles (Max 50 words)

Include any relevant literature to your analysis.

Acknowledgements

The Acknowledgements section includes a list of group meetings (date, time and attendance), the contribution of each group member, and all resources used (e.g. URL of a Stack Overflow post, URL of an Ed post, date and details of a drop-in session with a tutor).

Include an AI usage statement giving the name and version of each AI tool, its publisher, its URL, and a brief description of the context in which the tool was used. If no AI was used, this must be clearly stated.

Include a short statement explaining how one of the Shared Values and one of the Ethical Principles have been adhered to.