library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
# Load the Stroop task data straight from the course website.
stroopdata <- read_csv(file = "https://uoepsy.github.io/data/strooptask.csv")
## Parsed with column specification:
## cols(
## id = col_double(),
## age = col_double(),
## practice = col_character(),
## matching = col_double(),
## mismatching = col_double(),
## height = col_double()
## )
# Stroop effect = score on the mismatching set minus score on the matching set.
stroopdata <- mutate(
  stroopdata,
  stroop_effect = mismatching - matching
)
Recall that above we visualised the relationship between practice (categorical) and Stroop-effect (numeric) by plotting two density curves overlaid on one another, or two histograms side-by-side.
What other way might we visualise this relationship?
# Side-by-side boxplots: one box of Stroop effect per practice group.
ggplot(data = stroopdata, mapping = aes(x = practice, y = stroop_effect)) +
  geom_boxplot()
Check what other variables there are in the data - use the function names(), and give it your dataframe:
# List the column names of the data frame.
stroopdata %>%
  names()
## [1] "id" "age" "practice" "matching"
## [5] "mismatching" "height" "stroop_effect"
We also have information recorded on the participants’ heights (measured in cm).
Produce a visualisation of the relationship between height and the Stroop-effect. Before you do so, think about what you expect to see.
Calculate the covariance.
# Scatterplot of the Stroop effect against participant height.
ggplot(data = stroopdata, mapping = aes(x = height, y = stroop_effect)) +
  # size 2 makes the dots a tad easier to read
  geom_point(size = 2, colour = "brown", fill = "white") +
  labs(
    x = "Height of a participant",
    y = "Difference in matching and mismatching sets"
  )
# Sample covariance between height and the Stroop effect.
with(stroopdata, cov(height, stroop_effect))
## [1] -3.024087
# Load the Middle Meadow Walk cycling counts.
edicycle <- read_csv(file = "https://uoepsy.github.io/data/cycling_mmwalk.csv")
## Parsed with column specification:
## cols(
## month = col_character(),
## day = col_double(),
## hour = col_double(),
## cyclists = col_double()
## )
The data contains information on the total number of cyclists travelling in either direction on Middle Meadow Walk for each hour of each day in 2012.
Plot the relationship between hour of day and the number of cyclists.
# Check how the hour column is currently stored.
class(edicycle[["hour"]])
## [1] "numeric"
# Treat hour as a categorical variable so that each hour gets its own bar.
edicycle$hour <- as.factor(edicycle$hour)

# Plot the average number of cyclists for each hour of the day.
# The data hold one row per hour per day, so the counts are averaged within
# each hour before plotting. Dividing a stacked total by a hard-coded 365 is
# avoided: 2012 was a leap year, and any missing days would also make a fixed
# divisor wrong. Averaging over the rows actually present is always correct.
edicycle %>%
  group_by(hour) %>%
  # na.rm = TRUE mirrors geom_col(), which drops missing values when stacking.
  summarise(cyclists = mean(cyclists, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = hour, y = cyclists)) +
  geom_col(colour = "cadetblue4", fill = "cadetblue4", size = 0.7) +
  ggtitle("Number of cyclists per hour") +
  xlab("Hour of the day") +
  ylab("Average number of cyclists")
Using group_by() and summarise(), you can aggregate the data into grouped averages (using mean()), or grouped totals (using sum()).
Which month had the highest total number of cyclists?
Which hour of the day had the highest average number of cyclists?
# Total number of cyclists in each month, largest total first.
monthly_totals <- summarise(
  group_by(edicycle, month),
  Cyclists = sum(cyclists)
)
arrange(monthly_totals, desc(Cyclists))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
## month Cyclists
## <chr> <dbl>
## 1 Aug 55165
## 2 Mar 50997
## 3 Oct 49160
## 4 May 46364
## 5 Sep 44351
## 6 Nov 43962
## 7 Jun 39379
## 8 Feb 38781
## 9 Apr 37707
## 10 Jul 36707
## 11 Jan 33517
## 12 Dec 23507
# The output shows that August had the highest total number of cyclists (55,165).
# Average number of cyclists for each hour of the day, busiest hour first.
hourly_means <- summarise(
  group_by(edicycle, hour),
  Cyclists = mean(cyclists)
)
arrange(hourly_means, desc(Cyclists))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 24 x 2
## hour Cyclists
## <fct> <dbl>
## 1 17 144.
## 2 8 139.
## 3 18 113.
## 4 16 108.
## 5 9 104.
## 6 15 84.2
## 7 13 76.6
## 8 14 76.0
## 9 19 71.0
## 10 12 69.2
## # ... with 14 more rows
# The highest average number of cyclists was in hour 17 (5pm), at about 144 cyclists.
Another dataset, available at https://uoepsy.github.io/data/cycling_invrow.csv, contains information on the total number of cyclists each month from 2014 to 2016 on Inverleith Row, Edinburgh. It also contains data on the monthly rainfall (in millimetres) measured at the nearby Royal Botanic Gardens.
Read the data into R, and produce a visualisation of the relationship between monthly rainfall and monthly cyclists.
# Load the Inverleith Row monthly cycling and rainfall data.
invrow <- read_csv(file = "https://uoepsy.github.io/data/cycling_invrow.csv")
## Parsed with column specification:
## cols(
## year = col_double(),
## month = col_character(),
## cyclists = col_double(),
## rainfall_mm = col_double()
## )
# Scatterplot of monthly cyclist counts against monthly rainfall (mm).
ggplot(data = invrow, mapping = aes(x = rainfall_mm, y = cyclists)) +
  geom_point(colour = "cadetblue4", size = 2)
# Load the pulse-rate data.
pulse <- read_csv(file = "https://uoepsy.github.io/data/pulse.csv")
## Parsed with column specification:
## cols(
## active = col_double(),
## rest = col_double(),
## smoke = col_character(),
## sex = col_character(),
## exercise = col_double(),
## hgt = col_double(),
## wgt = col_double()
## )
# Convert height from inches to centimetres and weight from pounds (lb) to
# kilograms, rounding both to two decimal places.
pulse <- mutate(
  pulse,
  hgt = round(hgt * 2.54, digits = 2),
  wgt = round(wgt * 0.4535924, digits = 2)
)
Explore visually the relationship between active and resting pulse rate and the variables which influence resting pulse rate.
# Boxplots of resting heart rate by sex, in one panel per smoking status.
# The x axis is blanked because the fill legend already identifies each sex.
ggplot(data = pulse, mapping = aes(x = sex, y = rest)) +
  geom_boxplot(mapping = aes(fill = sex)) +
  facet_wrap(
    ~smoke,
    labeller = labeller(smoke = c("N" = "Non-smoker", "Y" = "Smoker"))
  ) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  scale_fill_discrete(name = "", labels = c("Female", "Male")) +
  labs(
    y = "Resting heart rate",
    title = "The relationship of sex and smoking to resting heart rate"
  )
# Sex seems to make a difference to resting heart rate (probably due to size differences), and smoking appears to increase a person's resting heart rate.
# Scatterplot of active against resting heart rate, coloured by sex.
# Both axes start at 40; the upper limits are left for ggplot to choose.
ggplot(data = pulse, mapping = aes(x = rest, y = active, colour = sex)) +
  geom_point(size = 2) +
  xlim(40, NA) +
  ylim(40, NA) +
  labs(
    x = "Resting heart rate",
    y = "Active heart rate",
    title = "Relationship between the resting and active heart rate in males and females",
    colour = "Sex"
  )
# Visualising the relationship between resting heart rate and active heart rate in males and females.
# Boxplots of resting heart rate for each exercise level.
# exercise is numeric in the data, so it is converted to a factor to get one
# box per level; the legend is hidden because it would duplicate the x axis.
pulse %>%
  ggplot(aes(x = factor(exercise), y = rest, fill = factor(exercise))) +
  geom_boxplot() +
  theme(legend.position = "none") +
  labs(
    x = "Amount of exercise | Scale from 1 to 3",
    y = "Resting heart rate | bpm",
    title = "Relationship between the amount of exercise and resting heart rate"
  )
Create a two-way contingency table to assess the relationship between Smoker/Non-Smoker and Male/Female.
What percentage of Males are Smokers?
# Two-way table of sex by smoking status, converted to row proportions
# (margin = 1), so each row shows the split of smokers within one sex.
sex_by_smoke <- with(pulse, table(sex, smoke))
prop.table(sex_by_smoke, margin = 1)
## smoke
## sex N Y
## F 0.91818182 0.08181818
## M 0.86065574 0.13934426
# About 14 percent of males are smokers (a proportion of 0.139).
The average resting pulse was 68, with a standard deviation of 9.95. Heights of participants in the study ranged from 152.4 centimeters to 198.12 centimeters, with a median of 172.72 and an IQR of 15.24.