library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.6 ✔ dplyr 1.0.8
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.3 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(skimr)
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(stats)
Shortcuts:
Commenting out multiple lines of code: cmd + shift + c
The shortcut for inserting a code chunk into an R Markdown file is cmd + option + i
We will be using a data set built into the reshape2 package called french_fries (see here: https://rdrr.io/cran/reshape2/man/french_fries.html). The variables are:
- time in weeks from start of study
- treatment (type of oil)
- subject
- replicate
- potato-y flavour
- buttery flavour
- grassy flavour
- rancid flavour
- painty flavour
df1 <- french_fries # load the built-in data set - if we were importing our own data, we would read it in from a csv file instead
head(df1) # shows the first 6 rows
## time treatment subject rep potato buttery grassy rancid painty
## 61 1 1 3 1 2.9 0.0 0.0 0.0 5.5
## 25 1 1 3 2 14.0 0.0 0.0 1.1 0.0
## 62 1 1 10 1 11.0 6.4 0.0 0.0 0.0
## 26 1 1 10 2 9.9 5.9 2.9 2.2 0.0
## 63 1 1 15 1 1.2 0.1 0.0 1.1 5.1
## 27 1 1 15 2 8.8 3.0 3.6 1.5 2.3
View(df1) # opens the entire data set in a spreadsheet-style viewer
There are many ways to inspect our data - see below!
str(df1) # shows the structure of the data: the type of each variable and its first few values
## 'data.frame': 696 obs. of 9 variables:
## $ time : Factor w/ 10 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ treatment: Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ subject : Factor w/ 12 levels "3","10","15",..: 1 1 2 2 3 3 4 4 5 5 ...
## $ rep : num 1 2 1 2 1 2 1 2 1 2 ...
## $ potato : num 2.9 14 11 9.9 1.2 8.8 9 8.2 7 13 ...
## $ buttery : num 0 0 6.4 5.9 0.1 3 2.6 4.4 3.2 0 ...
## $ grassy : num 0 0 0 2.9 0 3.6 0.4 0.3 0 3.1 ...
## $ rancid : num 0 1.1 0 2.2 1.1 1.5 0.1 1.4 4.9 4.3 ...
## $ painty : num 5.5 0 0 0 5.1 2.3 0.2 4 3.2 10.3 ...
summary(df1) # descriptive info for each variable: counts per level for factors; min, quartiles, mean, max and number of NAs for numeric columns
## time treatment subject rep potato
## 1 : 72 1:232 10 : 60 Min. :1.0 Min. : 0.000
## 2 : 72 2:232 15 : 60 1st Qu.:1.0 1st Qu.: 4.000
## 3 : 72 3:232 16 : 60 Median :1.5 Median : 7.200
## 4 : 72 19 : 60 Mean :1.5 Mean : 6.953
## 5 : 72 51 : 60 3rd Qu.:2.0 3rd Qu.: 9.900
## 6 : 72 52 : 60 Max. :2.0 Max. :14.900
## (Other):264 (Other):336 NA's :1
## buttery grassy rancid painty
## Min. : 0.000 Min. : 0.0000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.400 1st Qu.: 0.000
## Median : 0.700 Median : 0.0000 Median : 2.700 Median : 0.600
## Mean : 1.824 Mean : 0.6642 Mean : 3.852 Mean : 2.522
## 3rd Qu.: 2.925 3rd Qu.: 0.9000 3rd Qu.: 6.500 3rd Qu.: 4.100
## Max. :11.200 Max. :11.1000 Max. :14.900 Max. :13.100
## NA's :4 NA's :1 NA's :1 NA's :2
summary(df1$treatment) # get descriptive info for only the variable "treatment" - the $ operator picks out the column that you would like to use
## 1 2 3
## 232 232 232
You could also use the describe functions in the psych package, which are almost identical to the summary() function
describe(df1) # describe() is a function from the psych package; the factor columns (time, treatment, subject) are marked with * in the output
## vars n mean sd median trimmed mad min max range skew kurtosis
## time* 1 696 5.36 2.82 5.0 5.34 2.97 1 10.0 9.0 0.04 -1.19
## treatment* 2 696 2.00 0.82 2.0 2.00 1.48 1 3.0 2.0 0.00 -1.50
## subject* 3 696 6.47 3.41 6.5 6.46 3.71 1 12.0 11.0 0.01 -1.21
## rep 4 696 1.50 0.50 1.5 1.50 0.74 1 2.0 1.0 0.00 -2.00
## potato 5 695 6.95 3.58 7.2 7.00 4.45 0 14.9 14.9 -0.11 -0.95
## buttery 6 692 1.82 2.41 0.7 1.36 1.04 0 11.2 11.2 1.45 1.32
## grassy 7 695 0.66 1.32 0.0 0.36 0.00 0 11.1 11.1 3.48 17.87
## rancid 8 695 3.85 3.78 2.7 3.39 4.00 0 14.9 14.9 0.80 -0.47
## painty 9 694 2.52 3.39 0.6 1.88 0.89 0 13.1 13.1 1.29 0.57
## se
## time* 0.11
## treatment* 0.03
## subject* 0.13
## rep 0.02
## potato 0.14
## buttery 0.09
## grassy 0.05
## rancid 0.14
## painty 0.13
# describeBy() reports the same descriptive statistics separately for each level of the grouping variable (here, treatment)
describeBy(df1, group = df1$treatment)
##
## Descriptive statistics by group
## group: 1
## vars n mean sd median trimmed mad min max range skew kurtosis
## time* 1 232 5.36 2.83 5.0 5.34 2.97 1 10.0 9.0 0.04 -1.20
## treatment* 2 232 1.00 0.00 1.0 1.00 0.00 1 1.0 0.0 NaN NaN
## subject* 3 232 6.47 3.42 6.5 6.46 3.71 1 12.0 11.0 0.01 -1.22
## rep 4 232 1.50 0.50 1.5 1.50 0.74 1 2.0 1.0 0.00 -2.01
## potato 5 232 6.89 3.74 7.2 6.91 4.45 0 14.9 14.9 -0.08 -1.05
## buttery 6 231 1.78 2.47 0.6 1.27 0.89 0 11.2 11.2 1.55 1.57
## grassy 7 232 0.65 1.38 0.0 0.34 0.00 0 11.1 11.1 4.26 26.29
## rancid 8 232 4.07 3.87 2.8 3.64 4.15 0 14.3 14.3 0.70 -0.72
## painty 9 232 2.58 3.33 1.2 1.93 1.78 0 13.1 13.1 1.40 1.14
## se
## time* 0.19
## treatment* 0.00
## subject* 0.22
## rep 0.03
## potato 0.25
## buttery 0.16
## grassy 0.09
## rancid 0.25
## painty 0.22
## ------------------------------------------------------------
## group: 2
## vars n mean sd median trimmed mad min max range skew kurtosis
## time* 1 232 5.36 2.83 5.00 5.34 2.97 1 10.0 9.0 0.04 -1.20
## treatment* 2 232 2.00 0.00 2.00 2.00 0.00 2 2.0 0.0 NaN NaN
## subject* 3 232 6.47 3.42 6.50 6.46 3.71 1 12.0 11.0 0.01 -1.22
## rep 4 232 1.50 0.50 1.50 1.50 0.74 1 2.0 1.0 0.00 -2.01
## potato 5 232 7.00 3.58 7.05 7.03 4.60 0 14.1 14.1 -0.08 -1.04
## buttery 6 230 1.97 2.49 0.80 1.52 1.19 0 10.1 10.1 1.29 0.74
## grassy 7 232 0.66 1.25 0.00 0.36 0.00 0 7.1 7.1 2.59 7.55
## rancid 8 232 3.62 3.80 2.20 3.12 3.26 0 14.9 14.9 0.89 -0.34
## painty 9 231 2.46 3.51 0.10 1.78 0.15 0 12.7 12.7 1.31 0.47
## se
## time* 0.19
## treatment* 0.00
## subject* 0.22
## rep 0.03
## potato 0.24
## buttery 0.16
## grassy 0.08
## rancid 0.25
## painty 0.23
## ------------------------------------------------------------
## group: 3
## vars n mean sd median trimmed mad min max range skew kurtosis
## time* 1 232 5.36 2.83 5.0 5.34 2.97 1 10.0 9.0 0.04 -1.20
## treatment* 2 232 3.00 0.00 3.0 3.00 0.00 3 3.0 0.0 NaN NaN
## subject* 3 232 6.47 3.42 6.5 6.46 3.71 1 12.0 11.0 0.01 -1.22
## rep 4 232 1.50 0.50 1.5 1.50 0.74 1 2.0 1.0 0.00 -2.01
## potato 5 231 6.97 3.44 7.3 7.04 4.00 0 14.5 14.5 -0.19 -0.80
## buttery 6 231 1.72 2.27 0.7 1.29 1.04 0 10.2 10.2 1.49 1.60
## grassy 7 231 0.68 1.33 0.0 0.38 0.00 0 10.5 10.5 3.29 15.47
## rancid 8 231 3.87 3.69 2.8 3.40 4.15 0 13.3 13.3 0.79 -0.35
## painty 9 231 2.53 3.36 0.5 1.92 0.74 0 11.8 11.8 1.15 0.05
## se
## time* 0.19
## treatment* 0.00
## subject* 0.22
## rep 0.03
## potato 0.23
## buttery 0.15
## grassy 0.09
## rancid 0.24
## painty 0.22
For categorical variables (factors or character strings), it is useful to get a count of how many cases fall into each category.
table(df1$treatment) # raw count of how many rows fall into each treatment category
##
## 1 2 3
## 232 232 232
# Histogram of the potato flavour scores, using 20 bins.
# The bin count is set on geom_histogram() itself: adding a separate
# stat_bin() layer would draw a second histogram on top of one that still
# uses the default of 30 bins.
ggplot(df1, aes(x = potato)) +
  geom_histogram(bins = 20)
## Warning: Removed 1 rows containing non-finite values (stat_bin).
less than is <; less than or equal to is <=; greater than is >; greater than or equal to is >=; equals is ==; not equal to is !=
or is ‘|’ and and is ‘&’
skim(df1) # for each variable, it gives you the n missing, complete rate, min, max, empty n, number of unique values, mean, sd, quartiles and a mini histogram
| Name | df1 |
| Number of rows | 696 |
| Number of columns | 9 |
| _______________________ | |
| Column type frequency: | |
| factor | 3 |
| numeric | 6 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| time | 0 | 1 | FALSE | 10 | 1: 72, 2: 72, 3: 72, 4: 72 |
| treatment | 0 | 1 | FALSE | 3 | 1: 232, 2: 232, 3: 232 |
| subject | 0 | 1 | FALSE | 12 | 10: 60, 15: 60, 16: 60, 19: 60 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| rep | 0 | 1.00 | 1.50 | 0.50 | 1 | 1.0 | 1.5 | 2.00 | 2.0 | ▇▁▁▁▇ |
| potato | 1 | 1.00 | 6.95 | 3.58 | 0 | 4.0 | 7.2 | 9.90 | 14.9 | ▅▇▇▇▂ |
| buttery | 4 | 0.99 | 1.82 | 2.41 | 0 | 0.0 | 0.7 | 2.92 | 11.2 | ▇▂▁▁▁ |
| grassy | 1 | 1.00 | 0.66 | 1.32 | 0 | 0.0 | 0.0 | 0.90 | 11.1 | ▇▁▁▁▁ |
| rancid | 1 | 1.00 | 3.85 | 3.78 | 0 | 0.4 | 2.7 | 6.50 | 14.9 | ▇▃▂▂▁ |
| painty | 2 | 1.00 | 2.52 | 3.39 | 0 | 0.0 | 0.6 | 4.10 | 13.1 | ▇▂▁▁▁ |
# Use skim_without_charts() instead of skim() to leave out the mini histograms
# subset() keeps only the rows that meet a condition (here, treatment 1)
df1_treat1 <- subset(df1, treatment == 1)
filter() is the Tidyverse way of subsetting the data
# Keep only the rows where treatment is 2
df1_treat2 <- dplyr::filter(df1, treatment == 2)
Subsetting by treatment == 1 AND rep ==1
# A row is kept only when both conditions hold (& means AND)
df1_treat1_rep1 <- dplyr::filter(df1, treatment == 1 & rep == 1)
Selecting variables (columns) using the select() function
# Keep only the listed columns, in the order given
df1_special <- dplyr::select(df1, c(subject, treatment, rep, potato, buttery))
head(df1_special, n = 6)
## subject treatment rep potato buttery
## 61 3 1 1 2.9 0.0
## 25 3 1 2 14.0 0.0
## 62 10 1 1 11.0 6.4
## 26 10 1 2 9.9 5.9
## 63 15 1 1 1.2 0.1
## 27 15 1 2 8.8 3.0
Deleting variables using the select() function
# Negating a selection drops those columns and keeps everything else
df1_delete <- dplyr::select(df1, -c(subject, treatment, rep, potato, buttery))
head(df1_delete, n = 6)
## time grassy rancid painty
## 61 1 0.0 0.0 5.5
## 25 1 0.0 1.1 0.0
## 62 1 0.0 0.0 0.0
## 26 1 2.9 2.2 0.0
## 63 1 0.0 1.1 5.1
## 27 1 3.6 1.5 2.3
Pipes are a way of telling R, “then do this” without having to run separate lines of code.
Shortcut for piping is shift + command + m
# Start from df1, then keep only the treatment 1 rows, then keep only the
# five listed columns; the result is stored as df1_final.
df1_final <- df1 %>%
  dplyr::filter(treatment == 1) %>%
  dplyr::select(c(subject, treatment, rep, potato, buttery))
head(df1_final, n = 6)
## subject treatment rep potato buttery
## 61 3 1 1 2.9 0.0
## 25 3 1 2 14.0 0.0
## 62 10 1 1 11.0 6.4
## 26 10 1 2 9.9 5.9
## 63 15 1 1 1.2 0.1
## 27 15 1 2 8.8 3.0
# Build a histogram of the buttery scores in df1_final and store it in an
# object, so the plot can be displayed now and reused afterwards.
pl_hist_buttery <- ggplot(data = df1_final, mapping = aes(x = buttery)) +
  geom_histogram() +
  theme_minimal()
pl_hist_buttery # display the buttery histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
The arguments are your data frame and the path + file name where you want the file to be saved
# Export df1_final as a CSV file at the given path
write.csv(
  x = df1_final,
  file = "~/Documents/R Scripts General/output/final-data.csv"
)
I’m saving my histogram plot from above as an image file.
# Save the buttery histogram as a PNG image.
# The destination is passed as `filename`, which is ggsave()'s actual
# argument name, rather than as `file`, which only worked through partial
# argument matching. Width and height use ggsave()'s default unit.
ggsave(
  filename = "~/Documents/R Scripts General/output/pl_hist_buttery.png",
  plot = pl_hist_buttery,
  width = 4,
  height = 3.5
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
Let’s re-code treatment into a new variable, where the numeric codes are replaced with descriptive character values (e.g., we want treatment 1 to be labelled peanut oil, and so on).
table(df1$treatment) # check which treatment levels exist in the data and how many rows each has
##
## 1 2 3
## 232 232 232
Base-R method for recoding values from one variable into another
# Look up each row's treatment code in a named vector of labels
# (1 = "Peanut Oil", 2 = "Canola Oil", 3 = "Vegetable Oil"). A code that is
# not listed is left as NA, and the new column is a character vector.
df1$treatment_name <- unname(
  c("1" = "Peanut Oil", "2" = "Canola Oil", "3" = "Vegetable Oil")[
    as.character(df1$treatment)
  ]
)
View(df1) # check your work - specifically the new treatment_name column!
df2 <- na.omit(df1_special) # drop every row that contains an NA and save the result as a new data set
Rename columns - rename the individual variables using new_name = old_name. In the example below, the original variable “potato” is now named “potato_flavor”
# new_name = old_name; the new names do not need to be quoted
df2 <- dplyr::rename(df2, potato_flavor = potato, buttery_flavor = buttery)
head(df2, n = 6)
## subject treatment rep potato_flavor buttery_flavor
## 61 3 1 1 2.9 0.0
## 25 3 1 2 14.0 0.0
## 62 10 1 1 11.0 6.4
## 26 10 1 2 9.9 5.9
## 63 15 1 1 1.2 0.1
## 27 15 1 2 8.8 3.0
Summarize your data by a grouping variable
# Mean potato flavour score for each of the 3 treatments
df2 %>%
  dplyr::group_by(treatment) %>%
  dplyr::summarise(mean_potato = mean(potato_flavor))
## # A tibble: 3 × 2
## treatment mean_potato
## <fct> <dbl>
## 1 1 6.87
## 2 2 7.01
## 3 3 6.97
A more complicated summarizing scenario…
# Mean potato and buttery scores for every treatment x repetition combination
df2 %>%
  dplyr::group_by(treatment, rep) %>%
  dplyr::summarise(
    mean_potato = mean(potato_flavor),
    mean_buttery = mean(buttery_flavor)
  )
## `summarise()` has grouped output by 'treatment'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 4
## # Groups: treatment [3]
## treatment rep mean_potato mean_buttery
## <fct> <dbl> <dbl> <dbl>
## 1 1 1 6.74 1.80
## 2 1 2 7.00 1.76
## 3 2 1 7.18 1.99
## 4 2 2 6.84 1.96
## 5 3 1 6.94 1.81
## 6 3 2 7.00 1.63
Generate a scatterplot using potato, and buttery variables at repetition == 1
# Scatterplot of potato flavour (x axis) against buttery flavour (y axis),
# using only the rows from the first repetition.
df1_final %>%
  dplyr::filter(rep == 1) %>%
  ggplot(mapping = aes(x = potato, y = buttery)) +
  geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
I can make the same graph as above but make it look a little nicer with some aesthetics…
# Potato vs. buttery scatterplot for the first repetition, with styling:
# larger blue points, axis labels, a title and a minimal theme.
df1_final %>%
  dplyr::filter(rep == 1) %>% # first repetition only
  ggplot(mapping = aes(x = potato, y = buttery)) + # potato on x, buttery on y
  geom_point(colour = "dodgerblue", size = 2) + # scatterplot with blue points
  labs(
    x = "Potato Flavor",
    y = "Buttery Flavor",
    title = "Scatterplot of Potato Flavor and Buttery Flavor at the First Taste"
  ) +
  theme_minimal() # minimal theme (no grey background)
## Warning: Removed 1 rows containing missing values (geom_point).