# Load packages
library(tidyverse)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Data Cleaning Code Chunk ----
# Reads the raw survey export, shortens the column names, drops the unused
# column and invalid rows, and fixes column types. The result stays in `data`.

# Load the survey responses. The long dotted column names below are the
# survey questions as they appear after read.csv() has imported them.
data <- read.csv("data1.csv")

# Rename each variable for easier coding (new_name = old_name).
data <- rename(
  data,
  year = What.year.of.University.are.you.in.,
  faculty = What.university.faculty.are.you.studying.in.,
  dedication = On.a.scale.of.one.to.10..where.one.is.low.dedication.and.10.is.extremely.dedicated..how.dedicated.are.you.to.succeeding.in.your.degree..,
  study = How.many.hours.do.you.spend.studying.per.week.outside.of.class.,
  screentime = Check.your.screen.time.on.the.devices.you.use.most.when.studying..How.many.hours.do.you.spend.not.studying.in.a.week..E.g..on.your.phone.scrolling.Instagram.etc.,
  rstatus = What.is.your.relationship.status.,
  type = Regarding.your.time.at.USYD..are.you.a.full.time..part.time.or.exchange.student..,
  physint = How.would.you.describe.the.intensity.of.any.physical.activity.you.perform.in.a.week.,
  physext = How.would.you.describe.the.extent.of.physical.activity.in.your.life.,
  transport = How.long.does.transport.to.university.take.for.you.per.day..If.you.travel.for.less.than.1.hour..express.as.a.decimal..,
  WAM = What.is.your.WAM.to.the.nearest.whole.number.
)

# Remove the physical-activity *extent* column (the 10th column). It is dropped
# by name so the step stays correct even if the column order changes.
data <- select(data, -physext)

# Remove all responses that report a WAM of 0.
data <- filter(data, WAM != 0)

# Remove rows 2-4 of the filtered data, selected by position.
# NOTE(review): presumably these are the three pre-distribution test entries
# mentioned in the report; confirm they are still rows 2-4 after the filter.
data <- data[-c(2, 3, 4), ]

# Change screen time, study and transport to numeric so they can be plotted
# on continuous axes; entries that are not numbers become NA with a warning.
data$screentime <- as.numeric(data$screentime)
data$study <- as.numeric(data$study)
data$transport <- as.numeric(data$transport)

# Treat the dedication rating as a categorical variable.
data$dedication <- as.factor(data$dedication)
#Checks the structure of data after changes have been made
str(data)## 'data.frame': 42 obs. of 11 variables:
## $ Timestamp : chr "8/30/2022" "9/5/2022" "9/6/2022" "9/6/2022" ...
## $ year : chr "1st year" "1st year" "1st year" "1st year" ...
## $ faculty : chr "Medicine and Health" "Science" "Science" "Science" ...
## $ dedication: Factor w/ 9 levels "1","3","4","5",..: 9 7 1 8 6 8 9 8 6 6 ...
## $ study : num 35 22 0 15 12 12 12 8 30 65 ...
## $ screentime: num 14 8 0 20 5 35 2 8 5 5 ...
## $ rstatus : chr "Single" "Single" "Single" "Single" ...
## $ type : chr "Full Time" "Full Time" "Full Time" "Full Time" ...
## $ physint : chr "Moderate" "Moderate" "Moderate" "Moderate" ...
## $ transport : num 0.6 0.75 0.8 2 85 0.25 3 1.5 2 0.5 ...
## $ WAM : int 68 62 73 82 70 80 90 75 80 85 ...
This report investigates external factors impacting university students’ WAM.
The main discoveries included a positive relationship between study time and WAM. Moreover, an inverse relationship between screen-time and WAM was observed. These results are applicable in improving students’ outlook on their academics and study optimisation techniques usable by academic institutions.
head(data, 3)## Timestamp year faculty dedication study screentime rstatus
## 1 8/30/2022 1st year Medicine and Health 10 35 14 Single
## 5 9/5/2022 1st year Science 8 22 8 Single
## 6 9/6/2022 1st year Science 1 0 0 Single
## type physint transport WAM
## 1 Full Time Moderate 0.60 68
## 5 Full Time Moderate 0.75 62
## 6 Full Time Moderate 0.80 73
tail(data, 3)## Timestamp year faculty dedication study screentime rstatus type
## 43 9/6/2022 3rd year Science 8 10 20 Single Full Time
## 44 9/6/2022 3rd year Engineering 9 6 5 Partner Full Time
## 45 9/6/2022 3rd year Science 4 6 30 Single Full Time
## physint transport WAM
## 43 Heavy 0.2 85
## 44 Light 10.0 78
## 45 Limited 0.2 70
str(data)## 'data.frame': 42 obs. of 11 variables:
## $ Timestamp : chr "8/30/2022" "9/5/2022" "9/6/2022" "9/6/2022" ...
## $ year : chr "1st year" "1st year" "1st year" "1st year" ...
## $ faculty : chr "Medicine and Health" "Science" "Science" "Science" ...
## $ dedication: Factor w/ 9 levels "1","3","4","5",..: 9 7 1 8 6 8 9 8 6 6 ...
## $ study : num 35 22 0 15 12 12 12 8 30 65 ...
## $ screentime: num 14 8 0 20 5 35 2 8 5 5 ...
## $ rstatus : chr "Single" "Single" "Single" "Single" ...
## $ type : chr "Full Time" "Full Time" "Full Time" "Full Time" ...
## $ physint : chr "Moderate" "Moderate" "Moderate" "Moderate" ...
## $ transport : num 0.6 0.75 0.8 2 85 0.25 3 1.5 2 0.5 ...
## $ WAM : int 68 62 73 82 70 80 90 75 80 85 ...
summary(data)## Timestamp year faculty dedication
## Length:42 Length:42 Length:42 7 :15
## Class :character Class :character Class :character 8 :11
## Mode :character Mode :character Mode :character 9 : 6
## 10 : 3
## 5 : 2
## 6 : 2
## (Other): 3
## study screentime rstatus type
## Min. : 0.00 Min. : 0.00 Length:42 Length:42
## 1st Qu.: 8.00 1st Qu.: 5.00 Class :character Class :character
## Median :12.00 Median :10.00 Mode :character Mode :character
## Mean :14.95 Mean :15.40
## 3rd Qu.:20.00 3rd Qu.:20.75
## Max. :65.00 Max. :47.00
##
## physint transport WAM
## Length:42 Min. : 0.000 Min. :50.00
## Class :character 1st Qu.: 0.325 1st Qu.:70.00
## Mode :character Median : 0.815 Median :75.00
## Mean : 4.075 Mean :73.83
## 3rd Qu.: 1.475 3rd Qu.:80.00
## Max. :85.000 Max. :91.00
##
Our data consists of 42 observations of 11 variables. Variable types include character, numeric, integer and factor. The screen time, study and transport variables were converted to numeric to aid graphical summaries, while dedication was converted to a factor.
The data were collected through a Google Forms survey.
The survey was conducted between the dates 30/8/2022 and 9/9/2022.
A limitation of the data collection was selection bias due to most surveyed individuals being a part of the DATA1001 course.
75% of the surveyed demographic was first year undergraduate students and hence furthered this selection bias.
Another limitation was the skew produced by the disproportionate number of Science students surveyed. Some students are also more likely to exaggerate responses because the survey was online, introducing extreme values and skew into the collected data. As a result, the sample may not be representative of the general undergraduate student population.
Assumptions stem from the limitations where we assumed there were few data entry errors. We also assumed that the questions were not misleading and hence universally understood in the same capacity. An assumption regarding the data being truly representative of the entire undergraduate population was also made.
Each of these assumptions was made in an effort to simplify data cleaning, analysis and classification.
All 0 WAM values were removed, as it was unlikely that a student achieved a WAM of 0, and to retain the anonymity the students were likely attempting to maintain. We also removed the “Physical Extent” data column, as this data was not used in our analysis. Three sample entries were made prior to the large-scale distribution of our survey and were therefore removed before our analysis. Finally, all attributes were renamed to simplify coding and analysis.
Consent Bias: surveyed individuals were not given a choice and hence may be inclined to provide exaggerated or inaccurate data.
Observational Studies: Analysis does not produce causative relationships but provides links to existing evidence.
Survivor Bias: surveyed individuals were required to enter all data, minimising the null data entries we need to clean or analyse.
Selection Bias: the survey was distributed to a very specific group of students in DATA1001 and accounted only for participants with internet availability.
# Scatterplot of WAM against weekly study hours. Points are coloured by
# faculty; colour is mapped only in geom_point(), so geom_smooth() fits one
# least-squares line through all students rather than one per faculty.
ggplot(data, aes(x = study, y = WAM)) +
  geom_point(aes(colour = faculty)) +
  geom_smooth(method = "lm", se = FALSE) + # se = FALSE hides the confidence band
  labs(
    title = "Was your grandma right when she said study longer for more marks?",
    x = "Study Time (hours)",
    y = "Weighted Average Mark"
  )
## `geom_smooth()` using formula 'y ~ x'
# Residual plot for the linear model of WAM explained by study time.
# The response is WAM and the predictor is study, matching the axes of the
# study-time scatterplot and the title below; fitting study ~ WAM would show
# the residuals of a different regression.
# Passing the lm object to ggplot() exposes the fitted values and residuals
# as the columns .fitted and .resid.
ggplot(lm(WAM ~ study, data = data)) +
  geom_point(aes(x = .fitted, y = .resid)) +
  labs(
    title = "Residual Plot of WAM vs study time",
    x = "Fitted WAM",
    y = "Residual"
  )
The above residual plot shows no apparent trend, which suggests that
the linear fit drawn on the scatterplot in 4.2.1 is well suited to the
comparison being made.
# Bar chart of how many students chose each dedication rating.
ggplot(data, aes(x = dedication)) +
  geom_bar() +
  labs(
    title = "Do you really want to study, or do you have to study?",
    x = "Dedication (relative scale)",
    y = "Number of Students"
  )

# Box plots of WAM for each faculty; coord_flip() draws the boxes horizontally
# so the faculty names are easy to read.
# The geom_smooth(method = "lm") layer was removed: faculty is categorical, so
# there is no x-range within a group to fit a line over, and
# `col = data$study` passed a whole data column as a constant colour.
ggplot(data, aes(x = faculty, y = WAM)) +
  geom_boxplot() +
  coord_flip() +
  labs(
    title = "Are Business students really that lazy?",
    x = "Faculty",
    y = "WAM"
  )
# Bar chart of dedication ratings, with each bar split by faculty.
# dedication is mapped to y and the plot is then flipped, so the "x" label
# (the student count) is the one shown on the vertical axis.
ggplot(data, aes(y = dedication)) +
  geom_bar(aes(fill = faculty)) +
  coord_flip() +
  labs(
    title = "How smart is each faculty?",
    x = "Number of Students",
    y = "Dedication"
  )

# Histogram of WAM with one bar per faculty placed side by side in each bin.
# preserve = "single" keeps every bar the same width even when a bin does not
# contain all faculties. The default of 30 bins is used, hence the message.
ggplot(data, aes(y = WAM)) +
  geom_histogram(
    aes(fill = faculty),
    position = position_dodge2(preserve = "single")
  ) +
  coord_flip() +
  labs(title = "WAM? Who is that?", x = "Number of Students")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Scatterplot of WAM against weekly screen time. Points are coloured by
# faculty; colour is mapped only in geom_point(), so geom_smooth() fits one
# least-squares line through all students rather than one per faculty.
ggplot(data, aes(x = screentime, y = WAM)) +
  geom_point(aes(colour = faculty)) +
  geom_smooth(method = "lm", se = FALSE) + # se = FALSE hides the confidence band
  labs(
    title = "More Carrots or Less Screentime?",
    x = "Screen time (hours)"
  )
## `geom_smooth()` using formula 'y ~ x'
Article 1 suggests no correlation between students’ screen time and GPA/WAM. Our results suggested the contrary: in graph 4.2.7 we observed a weak negative correlation.
Article 2 suggests a strong interaction between study time and motivation, aligning with our results.
Both media and scientific journals should be used to compare results; however, according to the above comparison, media studies are less reliable for appropriate study comparison.
| Date | Contributions | |
|---|---|---|
| 15/8/22 | Reading the assessment task and allocating tasks specific to each individual’s strengths. Devising ideas for research topic | |
| 22/8/22 | Begin forming the survey and understanding the types of information we need to collect before proceeding with analysis. | |
| 29/8/22 | Finalising survey details and making questions compulsory | |
| 6/9/22 | Distributed survey | |
| 7/9/22 | Began coding for graphs | |
| 12/9/22 | Started writing IDA, Executive Summary and Research Articles | |
| 19/9/22 | Continued writing IDA, Research Article Summary | |
| 21/9/22 | Finalising details | |
| 23/9/22 | Finalising submission details and submitting the assignment | |
C. (2022). Change or modify x axis tick labels in R using ggplot2. Retrieved 22 September 2022, from https://stackoverflow.com/questions/20529252/change-or-modify-x-axis-tick-labels-in-r-using-ggplot2
4 Types of Data Biases (And How to Avoid Them). (2022). Retrieved 22 September 2022, from https://harbour.space/fintech/articles/data-bias
College Students’ Relationship Between Entitlement, Screentime, Smartphone Addiction and Academic Success - ProQuest. (2022). Retrieved 22 September 2022, from https://www.proquest.com/docview/2423812205?fromopenview=true&pq-origsite=gscholar
Contreras, V. (2022). Scrolling for answers on screen time. Retrieved 22 September 2022, from https://ohsmagnet.com/30616/news/scrolling-for-answers-on-screen-time/
curve, g., & Vazquez, C. (2022). ggplot2: histogram with normal curve. Retrieved 22 September 2022, from https://stackoverflow.com/questions/6967664/ggplot2-histogram-with-normal-curve
Dodge overlapping objects side-to-side — position_dodge. (2022). Retrieved 22 September 2022, from https://ggplot2.tidyverse.org/reference/position_dodge.html
ggplot, H. (2022). How to change legend title in ggplot. Retrieved 22 September 2022, from https://stackoverflow.com/questions/14622421/how-to-change-legend-title-in-ggplot
Histogram with density curves in R. (2022). Retrieved 22 September 2022, from https://r-charts.com/distribution/histogram-curves/#:~:text=A%20basic%20histogram%20can%20be,setting%20prob%20%3D%20TRUE%20as%20argument.
Interpreting Residual Plots to Improve Your Regression. (2022). Retrieved 22 September 2022, from https://www.qualtrics.com/support/stats-iq/analyses/regression-guides/interpreting-residual-plots-improve-regression/
Markdown, H., Himel, A., & Mwangi, K. (2022). How to center the title in R Markdown. Retrieved 22 September 2022, from https://stackoverflow.com/questions/19697402/how-to-center-the-title-in-r-markdown
Nonis, S., & Hudson, G. (2006). Academic Performance of College Students: Influence of Time Spent Studying and Working. Journal Of Education For Business, 81(3), 151-159. doi: 10.3200/joeb.81.3.151-159
O’Flaherty, M., Baxter, J., & Campbell, A. (2022). Do extracurricular activities contribute to better adolescent outcomes? A fixed‐effects panel data approach. Journal Of Adolescence, 94(6), 855-866. doi: 10.1002/jad.12069
Overlaying facetted histograms with normal curve using ggplot2 - sesa blog. (2022). Retrieved 22 September 2022, from https://data-se.netlify.app/2021/06/23/overlaying-facetted-histograms-with-normal-curve-using-ggplot2/
R Markdown Theme Gallery. (2022). Retrieved 22 September 2022, from https://www.datadreaming.org/post/r-markdown-theme-gallery/
R, H. (2022). How to build a trendline on a graph in R. Retrieved 22 September 2022, from https://stackoverflow.com/questions/35742754/how-to-build-a-trendline-on-a-graph-in-r
Change Colors in ggplot2 Line Plot in R (Example) | Modify Color of Lines. (2022). Retrieved 22 September 2022, from https://statisticsglobe.com/change-colors-in-ggplot2-line-plot-in-r
colours, g., Diggs, B., & Hohenstein, S. (2022). ggplot: colour points by groups based on user defined colours. Retrieved 22 September 2022, from https://stackoverflow.com/questions/21536835/ggplot-colour-points-by-groups-based-on-user-defined-colours
fa function - RDocumentation. (2022). Retrieved 22 September 2022, from https://www.rdocumentation.org/packages/psych/versions/2.2.5/topics/fa
Holtz, Y. (2022). Grouped boxplot with ggplot2. Retrieved 22 September 2022, from https://r-graph-gallery.com/265-grouped-boxplot-with-ggplot2.html
Plot Grouped Data: Box plot, B. (2022). Plot Grouped Data: Box plot, Bar Plot and More - Articles - STHDA. Retrieved 22 September 2022, from http://www.sthda.com/english/articles/32-r-graphics-essentials/132-plot-grouped-data-box-plot-bar-plot-and-more/
Zach, V. (2022). How to Overlay Normal Curve on Histogram in R (2 Examples) - Statology. Retrieved 22 September 2022, from https://www.statology.org/overlay-normal-curve-histogram-in-r/