This file contains commonly used R functions for data analysis, data management, and visualization. All code examples will use iris, a pre-loaded R data set. Type ?iris in the console to learn more information about the dataset.
Package: base R
#Explanation: Specifies the hierarchy (levels) among a set of values or a column. The code below sets setosa to the 1st level, versicolor to the 2nd, and virginica to the 3rd level. A factor is helpful when dealing with categorical data since it assigns numerical levels to the categories.
# Turn Species into a factor so R treats it as a categorical variable.
iris[["Species"]] <- factor(iris[["Species"]])
# List the categories (levels) of the factor
levels(iris[["Species"]])
## [1] "setosa" "versicolor" "virginica"
# Confirm that the column has class "factor"
class(iris[["Species"]])
## [1] "factor"
Package: base R
#Explanation: A for loop performs a set of operations on each value in a sequence.
# Loop over the numeric columns of the iris dataset only: calling mean() on the
# categorical Species column would return NA together with a warning.
numeric_cols <- names(iris)[vapply(iris, is.numeric, logical(1))]
for (col in numeric_cols) {
  # Calculate the mean of the column
  col_mean <- mean(iris[[col]])
  # Print the column name and mean
  cat("Mean of", col, ":", col_mean, "\n")
}
## Mean of Sepal.Length : 5.843333
## Mean of Sepal.Width : 3.057333
## Mean of Petal.Length : 3.758
## Mean of Petal.Width : 1.199333
Package: base R
#Explanation: Makes a new function that can execute a specific task through a combination of specified functions
# Define a function that calculates the mean of a numeric column in a data frame.
#   df     : a data frame
#   column : name (or position) of the column to average
#   na.rm  : drop missing values before averaging? Defaults to FALSE, which is
#            the same behaviour as calling mean() directly
# Returns the mean of the column. Stops with an error when the column does not
# exist or is not numeric.
mean_column <- function(df, column, na.rm = FALSE) {
  values <- df[[column]]
  # df[[column]] is NULL for an unknown column name; report that separately
  # instead of the misleading "must be numeric" message
  if (is.null(values)) {
    stop("Column '", column, "' not found in data frame", call. = FALSE)
  }
  if (!is.numeric(values)) {
    stop("Column must be numeric", call. = FALSE)
  }
  mean(values, na.rm = na.rm)
}
# Use mean_column() to average the Sepal.Length column of iris
sepal_length_mean <- mean_column(df = iris, column = "Sepal.Length")
sepal_length_mean
## [1] 5.843333
Package: base R
#Explanation: If a condition is met, then an action or function is executed. If the condition is not met, a different action or function is executed.
# Derive a numeric "SpeciesCode" column from "Species":
# setosa -> 1, versicolor -> 2, anything else -> 3
iris[["SpeciesCode"]] <- ifelse(
  iris[["Species"]] == "setosa",
  1,
  ifelse(iris[["Species"]] == "versicolor", 2, 3)
)
# Show the first 10 rows, including the new column
head(iris, n = 10)
Package: base R
#Explanation: Identifies if value is missing (NA) or not
#Count all missing values in iris: is.na() flags every cell, sum() counts the TRUE flags
sum(is.na(iris))
## [1] 0
#Count all non-missing values in iris by negating the is.na() test
#(900 cells = 150 rows x 6 columns at this point in the script)
sum(!is.na(iris)) # ! = not
## [1] 900
Package: base R
#Explanation: Applies a function on each column or value of a dataset and returns a list
# Compute the mean of the four numeric measurement columns; lapply() returns
# the results as a list with one element per column
means <- lapply(X = iris[1:4], FUN = mean)
# Display the list of means
print(means)
## $Sepal.Length
## [1] 5.843333
##
## $Sepal.Width
## [1] 3.057333
##
## $Petal.Length
## [1] 3.758
##
## $Petal.Width
## [1] 1.199333
Package: base R
#Explanation: Combines two dataframes which have one column with identical values
# Normally two different datasets are merged; here iris is merged with itself
merged_iris <- merge(
  x = iris, # first dataset
  y = iris, # second dataset
  by = "Species", # key column shared by both datasets
  suffixes = c("_1", "_2") # suffixes that distinguish the original columns
)
head(merged_iris) # show the first 6 rows
Package: base R
#Explanation: Loads .csv files into R
#iris2 <- read.csv("name_of_file.csv")
#make sure your working directory is set to your file location.
Package: dplyr
#Explanation: Changes column names
library(dplyr) # part of tidyverse
# Give every column a shorter name (syntax: new_name = old_name)
iris_renamed <- iris %>%
  rename(
    SL = Sepal.Length, # Sepal.Length becomes SL
    SW = Sepal.Width, # Sepal.Width becomes SW
    PL = Petal.Length, # Petal.Length becomes PL
    PW = Petal.Width, # Petal.Width becomes PW
    Class = Species # Species becomes Class
  )
iris_renamed
Package: base R
#Explanation: Applies a function on each column or value of a dataset and returns a vector or matrix
# Take the mean of every column; sapply() simplifies the result to a named vector
column_means <- sapply(iris, mean, na.rm = TRUE)
# Show the column means
column_means # Species is categorical, so NA is returned
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species SpeciesCode
## 5.843333 3.057333 3.758000 1.199333 NA 2.000000
Package: base R
#Explanation: Writes one or more character strings to text. Useful in working with strings.
# Print the column names of the dataset to the console, one name per line
writeLines(text = names(iris))
## Sepal.Length
## Sepal.Width
## Petal.Length
## Petal.Width
## Species
## SpeciesCode
Package: readxl
#Explanation: Loads .xlsx files into R
library(readxl)
# Read the Excel file with read_excel() from readxl. (read.xlsx() and its
# sheetIndex/startRow/endRow/colIndex arguments belong to the separate 'xlsx'
# package and are not available after library(readxl).)
# example <- read_excel("name_of_your_file.xlsx",
#                       sheet = 1,         #will read only the first sheet
#                       range = "A2:E100") #will read rows 2-100 of columns A-E (columns 1-5)
Package: xlsx
#Explanation: Saves a dataframe as an .xlsx file
# write.xlsx(df, #name of object
# file = "example.xlsx", #name of the file being created
# sheetName = "Sheet1", #name of the sheet being created
# row.names = FALSE) #not including row names
Package: stringr
#Explanation: Combines multiple values (characters, special characters, and/or numbers) together
library(stringr)
# Build a "<species> - <petal width>" label for every row of iris
iris[["Species_PetalWidth"]] <- str_c(
  iris[["Species"]],
  iris[["Petal.Width"]],
  sep = " - "
)
# Show the first rows, including the new column
head(iris)
Package: stringr
#Explanation: Locates and stores all occurrences of specified characters, special characters, or numbers within a cell or cells.
library(stringr)
# Extract the petal length from the iris data frame using str_match
# NOTE(review): str_match() returns a character matrix, so the new column is
# stored as a matrix column -- summary(iris) later labels it
# "Petal.Length.Extracted.V1". Consider str_match(...)[, 1] or str_extract()
# if a plain character vector is wanted -- confirm intent.
# NOTE(review): the pattern requires a decimal part, so values that print
# without one (e.g. "4") presumably yield NA -- verify.
iris$Petal.Length.Extracted <- str_match(iris$Petal.Length, #variable
"\\d+\\.\\d+") #searching for one or more digits (\\d+), followed by a literal period (\\.), followed by one or more digits (\\d+)
# View the first few rows of the modified iris data frame
head(iris)
Package: stringr
#Explanation: Replaces characters, special characters, or numbers in a column (or cell) with specified characters, special characters, or numbers
library(stringr)
# Shorten the species name "versicolor" to "vers"
iris$Species <- str_replace(
  string = iris$Species, # variable of interest
  pattern = "versicolor", # text to search for
  replacement = "vers" # replacement text
)
# Tabulate the species to confirm the renamed value
table(iris$Species)
##
## setosa vers virginica
## 50 50 50
Package: stringr
#Explanation: Shows all occurrences of specified characters, special characters, or numbers within a cell or cells
library(stringr)
# Use str_view() to visualize the matches of the "setosa" pattern in the Species
# column of the iris data frame. str_view_all() is deprecated since stringr
# 1.5.0, where str_view() highlights every match; match = NA keeps the
# non-matching elements in the output, as str_view_all() did by default.
str_view(iris$Species, "setosa", match = NA)
## [1] │ <setosa>
## [2] │ <setosa>
## [3] │ <setosa>
## [4] │ <setosa>
## [5] │ <setosa>
## [6] │ <setosa>
## [7] │ <setosa>
## [8] │ <setosa>
## [9] │ <setosa>
## [10] │ <setosa>
## [11] │ <setosa>
## [12] │ <setosa>
## [13] │ <setosa>
## [14] │ <setosa>
## [15] │ <setosa>
## [16] │ <setosa>
## [17] │ <setosa>
## [18] │ <setosa>
## [19] │ <setosa>
## [20] │ <setosa>
## ... and 130 more
Package: dplyr
#Explanation: Sorts rows in a dataframe based on one or more columns
library(tidyverse) # dplyr is part of tidyverse
# Order the rows of iris from the smallest to the largest Petal.Length
iris_sorted <- iris %>%
  arrange(Petal.Length)
# Show the first rows of the sorted data frame
head(iris_sorted)
Package: dplyr
#Explanation: Adds columns of a dataframe to another dataframe
library(tidyverse) #dplyr is part of tidyverse
# Fix the random seed so the simulated columns are reproducible between runs
set.seed(123)
# Create a new data frame with two columns of simulated measurements, one row
# per row of iris. The names differ from the existing iris columns: duplicated
# names would force bind_cols() to rename the clashing columns on both sides.
new_data <- data.frame(
  Sim.Sepal.Width = runif(nrow(iris), 2.5, 4),
  Sim.Sepal.Length = runif(nrow(iris), 4, 7)
)
# Combine the iris and new_data data frames by column (rows are paired by position)
iris_combined <- bind_cols(iris, new_data)
# View the first few rows of the combined data frame
head(iris_combined)
Package: dplyr
#Explanation: Adds rows of a dataframe to another dataframe
library(tidyverse) # dplyr is part of tidyverse
# Build a data frame holding two extra observations
new_data <- data.frame(
  Sepal.Width = c(3.5, 2.8),
  Sepal.Length = c(6.2, 5.1),
  Petal.Length = c(4.3, 1.5),
  Petal.Width = c(1.3, 0.3),
  Species = c("versicolor", "setosa")
)
# Append the new observations underneath the rows of iris
iris_combined <- iris %>%
  bind_rows(new_data)
# Show the last rows, where the appended observations appear
tail(iris_combined)
Package: dplyr
#Explanation: Searches for values that fit one or more user-defined conditions
library(tidyverse) # dplyr is part of tidyverse
# Keep only the rows whose Species is "setosa"
setosa <- iris %>%
  filter(Species == "setosa") # variable == value to search for
# Keep only the rows whose Sepal.Length exceeds 6.0
long_sepal <- iris %>%
  filter(Sepal.Length > 6.0)
# Keep only the rows whose Petal.Length lies between 4.5 and 5.0 (inclusive)
long_petal <- iris %>%
  filter(Petal.Length >= 4.5 & Petal.Length <= 5.0)
Package: dplyr
#Explanation: Creates a new column
library(tidyverse) # dplyr is part of tidyverse
# Add three derived columns to iris in a single mutate() call
iris <- iris %>%
  mutate(
    # product of Sepal.Length and Sepal.Width
    sepal_area = Sepal.Length * Sepal.Width,
    # ratio of Petal.Length to Petal.Width
    petal_ratio = Petal.Length / Petal.Width,
    # average of Sepal.Length and Petal.Length
    avg_length = (Sepal.Length + Petal.Length) / 2
  )
Package: dplyr
#Explanation: Selects column(s) within a dataframe. Helpful when making a new dataframe with less columns.
library(tidyverse) # dplyr is part of tidyverse
# Keep only the columns Sepal.Length, Sepal.Width, and Species
iris_select <- iris %>%
  select(Sepal.Length, Sepal.Width, Species)
# Keep every column except Sepal.Length and Sepal.Width (without pipes)
iris_select <- select(iris, -Sepal.Length, -Sepal.Width)
# Keep Petal.Length and Petal.Width, followed by every column whose name starts with "Sepal"
iris_select <- iris %>%
  select(Petal.Length, Petal.Width, starts_with("Sepal"))
Package: base R
#Explanation: Regresses one column (the dependent variable) on one or more other columns (the independent variables).
# Fit a linear regression with Petal.Length as the dependent variable and
# Sepal.Length and Sepal.Width as the independent variables
model <- lm(
  formula = Petal.Length ~ Sepal.Length + Sepal.Width,
  data = iris
)
# Show coefficients, residuals and fit statistics of the model
summary(model)
##
## Call:
## lm(formula = Petal.Length ~ Sepal.Length + Sepal.Width, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.25582 -0.46922 -0.05741 0.45530 1.75599
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.52476 0.56344 -4.481 1.48e-05 ***
## Sepal.Length 1.77559 0.06441 27.569 < 2e-16 ***
## Sepal.Width -1.33862 0.12236 -10.940 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6465 on 147 degrees of freedom
## Multiple R-squared: 0.8677, Adjusted R-squared: 0.8659
## F-statistic: 482 on 2 and 147 DF, p-value: < 2.2e-16
Package: base R
#Explanation: Converts a frequency or contingency table into proportions
# Share of observations that belongs to each species
prop.table(table(iris[["Species"]]))
##
## setosa vers virginica
## 0.3333333 0.3333333 0.3333333
# Cross-tabulate Species with Petal.Width; margin = 1 makes every row (species) sum to 1
prop.table(
  table(iris[["Species"]], iris[["Petal.Width"]]),
  margin = 1
)
##
## 0.1 0.2 0.3 0.4 0.5 0.6 1 1.1 1.2 1.3 1.4 1.5 1.6
## setosa 0.10 0.58 0.14 0.14 0.02 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## vers 0.00 0.00 0.00 0.00 0.00 0.00 0.14 0.06 0.10 0.26 0.14 0.20 0.06
## virginica 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.04 0.02
##
## 1.7 1.8 1.9 2 2.1 2.2 2.3 2.4 2.5
## setosa 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## vers 0.02 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## virginica 0.02 0.22 0.10 0.12 0.12 0.06 0.16 0.06 0.06
Package: base R
#Explanation: Lists the quartiles, median, mean, and range of one or more columns
# Print summary statistics for every column of the dataset
summary(object = iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species SpeciesCode Species_PetalWidth Petal.Length.Extracted.V1
## Length:150 Min. :1 Length:150 Length:150
## Class :character 1st Qu.:1 Class :character Class :character
## Mode :character Median :2 Mode :character Mode :character
## Mean :2
## 3rd Qu.:3
## Max. :3
## sepal_area petal_ratio avg_length
## Min. :10.00 Min. : 2.125 Min. :2.700
## 1st Qu.:15.66 1st Qu.: 2.802 1st Qu.:3.362
## Median :17.66 Median : 3.300 Median :5.050
## Mean :17.82 Mean : 4.311 Mean :4.801
## 3rd Qu.:20.32 3rd Qu.: 4.667 3rd Qu.:5.800
## Max. :30.02 Max. :15.000 Max. :7.300
Package: base R
#Explanation: Creates a table that shows the amount of times a value is repeated in one or more columns
# Count how many observations fall into each of the 3 Species
table(iris[["Species"]])
##
## setosa vers virginica
## 50 50 50
Package: base R
#Explanation: Creates a contingency table which shows the count for two or more variables
# Cross-tabulate Species (rows) against Sepal.Width (columns)
xtabs(
  formula = ~ Species + Sepal.Width, # the two variables to count
  data = iris # dataset that holds them
)
## Sepal.Width
## Species 2 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3 3.1 3.2 3.3 3.4 3.5 3.6 3.7
## setosa 0 0 1 0 0 0 0 0 1 6 4 5 2 9 6 3 3
## vers 1 2 3 3 4 3 5 6 7 8 3 3 1 1 0 0 0
## virginica 0 1 0 0 4 2 4 8 2 12 4 5 3 2 0 1 0
## Sepal.Width
## Species 3.8 3.9 4 4.1 4.2 4.4
## setosa 4 2 1 1 1 1
## vers 0 0 0 0 0 0
## virginica 2 0 0 0 0 0
Package: base R
#Explanation: Groups data by one or more variables and then applies a function (i.e. mean)
# Mean Sepal.Length within each Species group
# (formula: value ~ grouping variable; FUN: function applied to each group)
aggregate(Sepal.Length ~ Species, data = iris, FUN = mean)
Package: epiDisplay
#Explanation: Creates a table that shows the frequency, percent, and cumulative percent for categorical data
# Load epiDisplay, the package this section uses for tab1()
library(epiDisplay)
# One-way frequency table of Species. The argument is kept as iris$Species
# because the printed table header ("iris$Species :") echoes that expression.
# NOTE(review): tab1() presumably also draws a bar chart by default -- confirm.
tab1(iris$Species)
## iris$Species :
## Frequency Percent Cum. percent
## setosa 50 33.3 33.3
## vers 50 33.3 66.7
## virginica 50 33.3 100.0
## Total 150 100.0 100.0
Package: gtsummary
#Explanation: Creates presentation style table that shows the subgroup count, percentages, and total count.
library(gtsummary)
# The built-in 'mtcars' dataset is used here because it illustrates the function better than iris.
tbl_summary(
  data = mtcars,
  include = c("mpg", "cyl", "disp")
)
| Characteristic | N = 32¹ |
|---|---|
| mpg | 19.2 (15.4, 22.8) |
| cyl | |
| &nbsp;&nbsp;&nbsp;&nbsp;4 | 11 (34%) |
| &nbsp;&nbsp;&nbsp;&nbsp;6 | 7 (22%) |
| &nbsp;&nbsp;&nbsp;&nbsp;8 | 14 (44%) |
| disp | 196 (121, 326) |
| ¹ Median (IQR); n (%) | |
Package: dplyr
#Explanation: Counts the number of rows in a dataframe or the number of rows that meet specified criteria
library(dplyr) # dplyr is part of tidyverse
# Total number of observations (rows) in the dataset
iris %>% count()
# Number of observations within each Species subgroup
iris %>% group_by(Species) %>% count()
Package: dplyr
#Explanation: Group_by subgroups a variable. Summarize can apply a variety of statistical functions for the specified groups (mean, count, sum, etc.).
library(dplyr) # dplyr is part of tidyverse
# Mean of each of the four measurements within every Species group
iris %>%
  group_by(Species) %>%
  summarize(
    mean_sepal_length = mean(Sepal.Length),
    mean_sepal_width = mean(Sepal.Width),
    mean_petal_length = mean(Petal.Length),
    mean_petal_width = mean(Petal.Width)
  )
Package: base R
#Explanation: Saves plots as .png files
# Open the PNG device first so that the plot is drawn into the file
png("iris_boxplot.png")
# Create a simple bar plot of Petal.Length
barplot(iris$Petal.Length)
# Close the PNG device to finish writing the file. dev.off must be *called*:
# without the parentheses R only prints the function definition and the device
# stays open. invisible() hides the device number that dev.off() returns.
invisible(dev.off())
Package: ggplot2
#Explanation: Creates a bar chart showing the count of each category in a column
library(ggplot2) # ggplot2 is part of tidyverse
# Bar chart of Species; aes() (aesthetics) maps variables to visual properties:
# x places one bar per Species, fill colors each bar by Species
ggplot(iris, aes(x = Species, fill = Species)) +
  geom_bar() # draws the bars
Package: ggplot2
#Explanation: Creates a scatter plot for two columns
library(ggplot2) # ggplot2 is part of tidyverse
# Scatter plot of Sepal.Width against Sepal.Length, with points colored by Species
ggplot(
  iris,
  aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point() # draws the points
Package: ggplot2
#Explanation: Adjusts scale of axes for scatterplots, bar charts, histograms and other ggplot visuals
library(ggplot2) # ggplot2 is part of tidyverse
# Scatter plot with customized axis titles, tick marks and limits
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_point() +
  # x-axis: title, a tick every 0.5 from 4 to 8, and limits 4-8
  scale_x_continuous(
    name = "Length (cm)",
    breaks = seq(4, 8, 0.5),
    limits = c(4, 8)
  ) +
  # y-axis: title, a tick every 0.5 from 2 to 5, and limits 2-5
  scale_y_continuous(
    name = "Width (cm)",
    breaks = seq(2, 5, 0.5),
    limits = c(2, 5)
  )