Purpose

This file contains links and notes to YouTube videos describing R functions and concepts. Datasets used in this document are either preloaded in R or are custom and posted below each header. It is recommended to install and load tidyverse, because pipes (%>%) are used frequently and many of the packages covered in this document are part of tidyverse.

#load tidyverse
library(tidyverse)

Visualizations

Simple Tables

Package: Base R, Dataset: iris

Frequency Table: A frequency table shows the count for one variable

Video Link: 3:13

table(iris$Species)
## 
##     setosa versicolor  virginica 
##         50         50         50

A proportion table shows the proportion for one variable

Video Link: 7:15

prop.table(table(iris$Species)) 
## 
##     setosa versicolor  virginica 
##  0.3333333  0.3333333  0.3333333

Table with percentages

Video Link: 5:45

#multiply prop.table formula by 100
prop.table(table(iris$Species))*100 
## 
##     setosa versicolor  virginica 
##   33.33333   33.33333   33.33333

A contingency table shows the count for two or more variables

Video Link: 8:32

#The petal width count for 3 different flower species are shown. 
xtabs(~Petal.Width +Species, iris) 
##            Species
## Petal.Width setosa versicolor virginica
##         0.1      5          0         0
##         0.2     29          0         0
##         0.3      7          0         0
##         0.4      7          0         0
##         0.5      1          0         0
##         0.6      1          0         0
##         1        0          7         0
##         1.1      0          3         0
##         1.2      0          5         0
##         1.3      0         13         0
##         1.4      0          7         1
##         1.5      0         10         2
##         1.6      0          3         1
##         1.7      0          1         1
##         1.8      0          1        11
##         1.9      0          0         5
##         2        0          0         6
##         2.1      0          0         6
##         2.2      0          0         3
##         2.3      0          0         8
##         2.4      0          0         3
##         2.5      0          0         3

Presentation Summary Tables

Package: gtsummary, Dataset: CO2

Install/load gtsummary

Video Link: 0:10

library(gtsummary)

Basic Summary Table

Video Link: 0:44

CO2 %>% select(!c(Plant,conc)) %>% #Removing ‘Plant’ and ‘conc’ variables
  tbl_summary()   #creates summary table 
Characteristic N = 84¹
Type
    Quebec 42 (50%)
    Mississippi 42 (50%)
Treatment
    nonchilled 42 (50%)
    chilled 42 (50%)
uptake 28 (18, 37)
1 n (%); Median (IQR)

Summary split by categorical variable

Video Link: 1:42

CO2 %>% select(!c(Plant,conc)) %>% 
  tbl_summary(by = Type) #organizes the data by the categorical variable 'Type'
Characteristic Quebec, N = 42¹ Mississippi, N = 42¹
Treatment
    nonchilled 21 (50%) 21 (50%)
    chilled 21 (50%) 21 (50%)
uptake 37 (30, 40) 19 (14, 28)
1 n (%); Median (IQR)

Summary split by categorical variable with p-values

Video Link: 2:13

CO2 %>% select(!c(Plant,conc)) %>% 
  tbl_summary(by = Type) %>% 
  add_p() # adds p-values to the table 
Characteristic Quebec, N = 42¹ Mississippi, N = 42¹ p-value²
Treatment >0.9
    nonchilled 21 (50%) 21 (50%)
    chilled 21 (50%) 21 (50%)
uptake 37 (30, 40) 19 (14, 28) <0.001
1 n (%); Median (IQR)
2 Pearson’s Chi-squared test; Wilcoxon rank sum test

Summary including overall, extra heading, and other statistics

Video Link: 3:02

CO2 %>% select(!c(Plant,conc)) %>% 
  tbl_summary(by = Type, 
              statistic = list(all_continuous() ~ "{mean} ({sd})", #adds mean and standard deviation
              all_categorical() ~ "{n} / {N} ({p}%)" ), #add percentage 
              digits = all_continuous() ~ 2) %>% #rounds decimal to 2 digits
  add_overall() %>% #adds total column 
  modify_spanning_header(c("stat_1", "stat_2") ~ "**Location**") #adds title 'Location'
Characteristic Overall, N = 84¹ Location
Quebec, N = 42¹ Mississippi, N = 42¹
Treatment
    nonchilled 42 / 84 (50%) 21 / 42 (50%) 21 / 42 (50%)
    chilled 42 / 84 (50%) 21 / 42 (50%) 21 / 42 (50%)
uptake 27.21 (10.81) 33.54 (9.67) 20.88 (7.82)
1 n / N (%); Mean (SD)

Create crosstab with p-values

Video Link: 4:27

#Crosstab shows the relationship between two categorical variables 

CO2 %>% #dataset
  tbl_cross(row = Type, col = Treatment, percent = "cell") %>%  #creates cross tables w/ Type and Treatment 
    add_p() #adds p-values
Treatment Total p-value¹
nonchilled chilled
Type >0.9
    Quebec 21 (25%) 21 (25%) 42 (50%)
    Mississippi 21 (25%) 21 (25%) 42 (50%)
Total 42 (50%) 42 (50%) 84 (100%)
1 Pearson’s Chi-squared test

Barplot (1)

Package: Base R, Dataset: custom

Load Custom data

Video Link: 0:10

values <- c(.4, .75, 0.2, 0.6, 0.5) 

Make simple barplot

Video Link: 0:30

barplot (values) 

Add color to barplot

Video Link: 1:23

#Add color by using a hex color code (a '#' followed by six hexadecimal digits)
barplot(values, col = "#1b98e0") 

#Add color by using a color name
barplot(values, col = "darkgreen")

Change bar orientation to horizontal

Video Link: 2:00

barplot(values, horiz = TRUE)

Add labels to barplot

Video Link: 2:27

#Create a vector called group that contains the labels for the barplot (the letters A to E)
group <- LETTERS[1:5]

#Pass the new vector to the 'names.arg' argument so each bar gets its label
barplot(values, names.arg = group)

Creating a stacked bar plot

Video Link: 3:33

# Build the 2 x 5 matrix `data` directly: one named row per group and one
# named column per bar label (A-E), so every bar has a Group 1 and a Group 2 value
data <- rbind(
  "Group 1" = c(A = 0.2, B = 0.3, C = 0.7, D = 0.1, E = 0.3),
  "Group 2" = c(A = 0.4, B = 0.1, C = 0.1, D = 0.2, E = 0.3)
)

# Draw the stacked bar plot; the two colors correspond to the two matrix rows
barplot(
  data,
  col = c("#1b98e0", "#353436")
)

Add legend to bar plot

Video Link: 5:04

#legend must be executed with barplot in the same code chunk
barplot(data, col = c("#1b98e0", "#353436"))

legend("topright",  #location of legend
       legend = c("Group 1", "Group 2"), #names of legend
       fill = c("#1b98e0", "#353436"))  #fill colors for legend 

Grouped barchart (columns side-by-side)

Video Link: 6:25

#The groups should be some type of categorical data 
barplot(data, 
        col = c ("#1b98e0", "#353436"),
        beside = TRUE) #places columns beside each other 

Manually grouped barchart: Color subgroups by mean then create grouped barplot

Video Link: 24:28

# Uses the diamonds dataset that is loaded together with ggplot2

# Compute the mean price for every clarity/color combination.
# The formula comes first and the data frame is passed as `data =`, so the call
# reaches aggregate()'s formula method on every R version. Passing the data
# frame first with a formula as the second argument is only accepted by newer
# R releases (older ones stop with "'by' must be a list").
diamonds_m_cl_co <- aggregate(price ~ clarity + color, data = diamonds, FUN = mean)

# Plot the subgroup means as a grouped bar plot
ggplot(diamonds_m_cl_co, aes(x = clarity, y = price, fill = color)) +
  geom_bar(stat = "identity", position = "dodge") # "dodge" places the bars side by side

Barplot (2)

Custom Data and load ggplot2

Video Link: 7:48

#Custom data 
values <- c(.4, .75, 0.2, 0.6, 0.5) 

#load ggplot2
library(ggplot2) #automatically loaded with tidyverse 

ggplot2 only takes data frames as input, not matrices like base R

Video Link: 8:09

# A data frame stores data differently from a matrix: every column is its own
# vector, so different columns may hold different types.
data_ggp <- data.frame(group, values)

# If unsure what kind of object you have, run 'class(name_of_data)'.
# (typeof() reports the internal storage type, which is "list" for every data
# frame, so it cannot tell a data frame apart from a plain list.)
class(data_ggp) # should now say "data.frame"
## [1] "data.frame"

Create a bar plot in ggplot2

Video Link: 8:47

ggplot(data_ggp, aes(x = group, y = values)) + 
  geom_bar(stat = "identity") #specifies a bar chart based on our data

Ordering Bars of a Barplot

Custom data

Video Link: 0:15

data <- data.frame(
  x = c("A", "B", "C", "D", "E"),  
  y = c(0.5, 2, 1.2, -0.2, 0.7))

Manually ordering bars

Video Link: 1:55

# Create duplicate of data
data1 <- data 

# Change factor levels of data

#changes the factor ordering to ‘B’, ‘D’, ‘E’, ‘C’, ‘A’
data1$x <- factor(data1$x, levels = c("B", "D", "E", "C", "A")) 

#Default factor ordering is ‘A’, ‘B’, ‘C’, ‘D’, ‘E’

Order bars in increasing order

Video Link: 3:30

# Step 1: Duplicate the data so the original 'data' object stays unchanged
data2 <- data 

#Step 2: Change factor levels to increasing order of y

#order(data2$y) gives the row positions sorted by y, so the levels of x follow the size of y
#Replace x and y with variable names used in your plot
data2$x <- factor(data2$x, levels = data2$x[order(data2$y)])

#Step 3: Create plot with new data; bars are drawn in the order of the factor levels
 ggplot(data2, aes(x, y)) + 
   geom_bar(stat = "identity")

Order bars in decreasing order

Video Link: 4:30

#Step 1: Duplicate data 
data3 <- data         

#Step 2: Change factor levels to decreasing order 
 data3$x <- factor(data3$x, levels = data3$x[order(data3$y, decreasing = TRUE)])
#Replace ‘TRUE’ with ‘FALSE’ to organize in increasing order

#Step 3: Create plot with new data 
ggplot(data3, aes(x, y)) + 
  geom_bar(stat = "identity")

Scatterplots, Changing X-Axis Range, and Facet Layers

Package: ggplot2, Dataset: custom

Custom data

Video Link: 2:22

data <- data.frame(
  x = 1:9, 
  y = c(3, 1, 4, 3, 5, 2, 1, 2, 3), 
  group = rep(LETTERS[1:3], each = 3))  

Create base layer for a variety of plots

Video Link: 3:22

#shows just x and y axes, and tick marks
ggplot(data, aes(x = x, y = y)) 

Create scatter plot

Video Link: 3:55

ggplot(data, aes(x = x, y = y)) + 
  geom_point() #specifies  a scatter plot

Scatter plot: Change point size

Video Link: 4:45

ggplot(data, aes(x = x, y = y)) + 
  geom_point(size =3) #changes point size 

Scatter plot: Specify the colors by group

Video Link: 5:10

ggplot(data, aes(x = x, y = y, col = group)) + #col = group specifies the color grouping
  geom_point(size =3) + 
  theme(legend.position = "none") #removes default legend  

To save a plot, assign an object to ggplot2 code

Video Link: 6:08

#Assigning the ggplot2 code to a name stores the plot instead of drawing it
ggp_simple <- ggplot(data, aes(x = x, y = y, col = group)) +
  geom_point(size =3 ) #this saves the plot to ggp_simple

ggp_simple #the plot won't appear until the object 'ggp_simple' is called 

Change the x-axis range

Video Link: 7:01

ggp_simple + 
  scale_x_continuous(limits = c(-3, 15)) #changes the numeric range of x-axis 

Manually change colors of points by groups

Video Link: 7:52

ggp_simple + 
  scale_color_manual(
    breaks = c("A", "B", "C"),  
    values = c("#1b98e0", #selects three different colors. One for each group A, B, C. 
               "#353436", 
               "#e32f08"))

Add multiple scale layers by adding a ‘+’ between layers

Video Link: 8:58

ggp_simple +  
  scale_x_continuous(limits = c(-3, 15)) + # the '+' allows multiple layers to be added
  scale_color_manual(
    breaks = c("A", "B", "C"),  
    values = c("#1b98e0", 
               "#353436", 
               "#e32f08"))

Facet layers: Creates side-by-side subplots of a variable or dataset

Video Link: 9:45

#Facet layers are helpful when your original plot has too much data 
ggp_simple +  
  scale_x_continuous(limits = c(-3, 15)) + 
  scale_color_manual(
    breaks = c("A", "B", "C"),  
    values = c("#1b98e0", 
               "#353436", 
               "#e32f08")) + 
    facet_wrap(group ~ .) + #adds facet layer
    theme_bw() # changes  graph background from gray to white 

Density Plots

Package: ggplot2, Dataset: diamonds

Add regression line to facet wrapped subplots

Video Link: 18:04

ggplot(diamonds, aes(x = price, y = carat)) +
  geom_point () + 
facet_wrap(clarity ~ . ) + 
  geom_smooth(method = "lm", formula = y ~ x) #adds regression line

Density plot: Helpful plot for showing the distribution of a numeric variable

Video Link: 19:44

#For density plots only input one column/variable since density will be graphed on the y-axis
ggplot(diamonds, aes(x = depth)) + #variable is depth
  geom_density()

Density plot: Draw density plots by group

Video Link: 20:36

#‘fill’ colors the area between the density line and the x-axis, as opposed to coloring just the line 
ggplot(diamonds, aes(x = depth, fill =cut)) + # adds multiple density graphs to plot 
  geom_density(alpha = .3) #makes color more translucent. Can be set to any number lower than 1

Pie Charts

Package: ggplot2, Dataset: diamonds

Step 1: Assign colors to object

Video Link: 0:33

#create vector called 'colors' that has 7 different colors
colors <- c("#FFFFFF","#F5FCC2","#E0ED87","#CCDE57", 
            "#B3C732","#94A813","#718200")

Step 2: Shape dataset to have 3 columns: categorical variable, count, and percentage

Video Link: 0:37

data <- diamonds %>%   
  group_by(color) %>%   #categorical column 
  summarize(
    counts = n(),      #count column
    percentage = n()/nrow(diamonds))  #percentage column

Step 3: Create Pie Chart

Video Link: 0:56

#Build the pie chart from the summarized data: one slice per diamond color, sized by its percentage
pie <- ggplot(data = data, aes(x="", y = percentage, fill = color)) + 
  geom_col(color = "black") +  #draws one stacked bar; 'color' sets the outline color of the slices 
  coord_polar("y", start = 0) +  #wraps the bar into a circle, creating the pie chart. Following code is formatting.
  geom_text(aes(label = paste0(round(percentage*100), "%")), #adds % labels to pie slices
            position = position_stack(vjust = 0.5)) + #centers each label within its slice
  theme(panel.background = element_blank(),  #removes default background
        axis.line = element_blank(),        #removes axis line   
        axis.text= element_blank(),         #removes axis text
        axis.ticks = element_blank(),       #removes axis ticks
        axis.title=element_blank(),         #removes axis titles
        plot.title = element_text(hjust = 0.5, size = 18)) + #centers the plot title and sets its size
  ggtitle("Pie chart of Diamond Color") + #title
  scale_fill_manual(values = colors) #fills the slices with the colors stored in the 'colors' vector

pie #call pie chart 

Bubble Plots

Package: ggplot2, Datasets: mtcars

Note: A bubble plot is a scatter plot in which a third numeric variable is mapped to the size aesthetic.

Video Link: 0:05

Subset mtcars dataset

Video Link: 0:25

data <- mtcars %>% mutate(
  cyl = factor(cyl), #creates factor levels 4,6, and 8 for 'cyl' var. Needed for color grouping for code chunks below
  Model = rownames(mtcars)) #creates 'Model' var that contains all the names of cars

Create basic bubble plot

Video Link: 0:30

plot1 <- data %>% 
  ggplot(aes(x = wt, y = mpg, size = hp)) + #‘size = hp’ is the 3rd variable that makes this a bubble plot 
  geom_point(alpha = 0.5) #'alpha .5' makes bubbles more transparent
  
plot1

Add color and custom bubble size to bubble plot

Video Link: 1:09

#‘color = cyl’ adds color by grouping
plot2 <- data %>% 
  ggplot(aes(x = wt, y = mpg, size = hp, color = cyl, label = Model)) + 
  geom_point(alpha = 0.5) + 
  scale_size(range = c(.1, 15))

plot2

Convert ggplot bubble plot into plotly plot

Video Link: 1:40

#Plotly maps are interactive. You can filter data by clicking on the legend and obtain point-specific data by hovering the cursor over a bubble point .

#Step 1: Load/install plotly
library(plotly)

#Step 2: Convert ggplot to plotly plot
p <- ggplotly(plot2, width=500, height=500) %>%  #converts ggplot to plotly
  layout(xaxis = list(range = c(1, 6)),  #formatting
         yaxis = list(range = c(8, 35)), 
         legend = list(x = 0.825, y = .975))

p

Data Management

Using the Which and Order Command

Package: Base R, Dataset: iris

Find a row’s values at a specified variable’s maximum value

Video Link: 0:23

#Returns the entire row where the variable 'Sepal.Length' has its highest value
#which.max() gives the position of the (first) maximum; that position is used as the row index
iris[which.max(iris$Sepal.Length), ] #The comma before the last bracket is required: it selects the whole row (all columns)

Find a row’s values at a specified variable’s minimum value

Video Link: 1:22

#Returns the entire row where the variable 'Sepal.Length' has its lowest value
#which.min() gives the position of the (first) minimum; that position is used as the row index
iris[which.min(iris$Sepal.Length), ]

‘order’ can select the value at any rank of a variable (e.g., the 10th or 23rd highest value)

Video Link: 1:40

#Returns the entire row holding the 11th highest value of 'Sepal.Length'
#order() sorts in increasing order by default, so 'decreasing = TRUE' is needed to count from the top
#(without it, position 11 would be the 11th lowest value)
iris[order(iris$Sepal.Length, decreasing = TRUE)[11], ]

Using File Paths

(Package: Base R)

The best way to specify a file/directory path is with the function ‘file.path’

Video Link: 0:38

#change the words in quotation marks to a directory path on your computer 
my_directory <- file.path("C:", "Users", "Joach", "Desktop")

#my_directory can  be called or used in code to refer to the directory path

Function file.path can also be used to specify a specific file

Video Link: 1:44

#If ‘my_file.csv’ wasn’t added, this would specify a directory path
My_file <- file.path("C:", "Users", "Joach", "Desktop", "my_file.csv")

Handling NAs in R

Package: Base R, Dataset: airquality

Find missing values (NA)

Video Link: 1:05

#Returns matrix where TRUE is a missing value, FALSE is a numeric value
is.na(airquality)
##        Ozone Solar.R  Wind  Temp Month   Day
##   [1,] FALSE   FALSE FALSE FALSE FALSE FALSE
##   [2,] FALSE   FALSE FALSE FALSE FALSE FALSE
##   [3,] FALSE   FALSE FALSE FALSE FALSE FALSE
##   [4,] FALSE   FALSE FALSE FALSE FALSE FALSE
##   [5,]  TRUE    TRUE FALSE FALSE FALSE FALSE
##   [6,] FALSE    TRUE FALSE FALSE FALSE FALSE
##   [7,] FALSE   FALSE FALSE FALSE FALSE FALSE
##   [8,] FALSE   FALSE FALSE FALSE FALSE FALSE
##   [9,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [10,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [11,] FALSE    TRUE FALSE FALSE FALSE FALSE
##  [12,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [13,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [14,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [15,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [16,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [17,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [18,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [19,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [20,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [21,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [22,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [23,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [24,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [25,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [26,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [27,]  TRUE    TRUE FALSE FALSE FALSE FALSE
##  [28,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [29,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [30,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [31,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [32,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [33,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [34,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [35,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [36,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [37,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [38,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [39,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [40,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [41,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [42,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [43,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [44,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [45,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [46,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [47,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [48,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [49,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [50,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [51,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [52,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [53,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [54,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [55,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [56,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [57,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [58,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [59,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [60,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [61,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [62,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [63,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [64,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [65,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [66,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [67,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [68,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [69,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [70,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [71,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [72,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [73,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [74,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [75,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [76,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [77,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [78,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [79,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [80,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [81,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [82,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [83,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [84,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [85,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [86,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [87,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [88,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [89,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [90,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [91,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [92,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [93,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [94,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [95,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [96,] FALSE    TRUE FALSE FALSE FALSE FALSE
##  [97,] FALSE    TRUE FALSE FALSE FALSE FALSE
##  [98,] FALSE    TRUE FALSE FALSE FALSE FALSE
##  [99,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [100,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [101,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [102,]  TRUE   FALSE FALSE FALSE FALSE FALSE
## [103,]  TRUE   FALSE FALSE FALSE FALSE FALSE
## [104,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [105,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [106,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [107,]  TRUE   FALSE FALSE FALSE FALSE FALSE
## [108,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [109,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [110,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [111,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [112,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [113,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [114,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [115,]  TRUE   FALSE FALSE FALSE FALSE FALSE
## [116,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [117,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [118,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [119,]  TRUE   FALSE FALSE FALSE FALSE FALSE
## [120,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [121,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [122,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [123,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [124,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [125,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [126,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [127,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [128,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [129,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [130,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [131,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [132,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [133,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [134,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [135,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [136,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [137,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [138,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [139,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [140,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [141,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [142,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [143,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [144,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [145,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [146,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [147,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [148,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [149,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [150,]  TRUE   FALSE FALSE FALSE FALSE FALSE
## [151,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [152,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [153,] FALSE   FALSE FALSE FALSE FALSE FALSE

Count the number of missing values in a data set

Video Link: 2:00

#Returns the total amount of missing values 
sum(is.na(airquality))
## [1] 44

Remove all missing values from a dataset

Video Link: 3:10

#Deletes entire row where the missing value is present.
na.omit(airquality)

Calculate the mean of a variable that has missing values

Video Link: 4:15

mean(airquality$Ozone, na.rm = TRUE) 
## [1] 42.12931

Data Analysis

Using lapply and sapply

Package: Base R, Dataset: mtcars

Note 1: If the data input is a column, sapply and lapply will apply the function to each value in the specified column.

Note 2: If the data input is a dataset, sapply and lapply will apply the function to each column.

Note 3: Using these functions is often quicker than using a for loop.

lapply runs a function on each element of a dataset and returns a list

Video Link: 0:38

#Step 1: Load data (0:45)
data <- mtcars 

#Step 2: Create demo function (0:50)
# Classify a single mpg value as "High", "Medium" or "Low".
#   mpg:     one numeric value (miles per gallon)
#   returns: "High"   if mpg is greater than 30,
#            "Medium" if mpg is greater than 20 and at most 30,
#            "Low"    for everything else (20 or below)
mpg_category <- function(mpg) {
  if (mpg > 30) {
    "High"
  } else if (mpg > 20) {
    "Medium"
  } else {
    # catch-all branch: reached only when mpg is 20 or below
    "Low"
  }
}

#Step 3: Use lapply (1:10)

#applies function created above to every value of data$mpg and returns a list. 
lapply(X = data$mpg, FUN = mpg_category)
## [[1]]
## [1] "Medium"
## 
## [[2]]
## [1] "Medium"
## 
## [[3]]
## [1] "Medium"
## 
## [[4]]
## [1] "Medium"
## 
## [[5]]
## [1] "Low"
## 
## [[6]]
## [1] "Low"
## 
## [[7]]
## [1] "Low"
## 
## [[8]]
## [1] "Medium"
## 
## [[9]]
## [1] "Medium"
## 
## [[10]]
## [1] "Low"
## 
## [[11]]
## [1] "Low"
## 
## [[12]]
## [1] "Low"
## 
## [[13]]
## [1] "Low"
## 
## [[14]]
## [1] "Low"
## 
## [[15]]
## [1] "Low"
## 
## [[16]]
## [1] "Low"
## 
## [[17]]
## [1] "Low"
## 
## [[18]]
## [1] "High"
## 
## [[19]]
## [1] "High"
## 
## [[20]]
## [1] "High"
## 
## [[21]]
## [1] "Medium"
## 
## [[22]]
## [1] "Low"
## 
## [[23]]
## [1] "Low"
## 
## [[24]]
## [1] "Low"
## 
## [[25]]
## [1] "Low"
## 
## [[26]]
## [1] "Medium"
## 
## [[27]]
## [1] "Medium"
## 
## [[28]]
## [1] "High"
## 
## [[29]]
## [1] "Low"
## 
## [[30]]
## [1] "Low"
## 
## [[31]]
## [1] "Low"
## 
## [[32]]
## [1] "Medium"

sapply runs a function on each element of a dataset and returns a vector or matrix

Video Link: 1:57

#applies function to each value of mpg
#sapply() simplifies the result when it can: every call here returns one string, so the output is a character vector
#vapply(data$mpg, mpg_category, character(1)) gives the same vector and additionally checks the return type
sapply(X = data$mpg, FUN = mpg_category) #returns a vector
##  [1] "Medium" "Medium" "Medium" "Medium" "Low"    "Low"    "Low"    "Medium"
##  [9] "Medium" "Low"    "Low"    "Low"    "Low"    "Low"    "Low"    "Low"   
## [17] "Low"    "High"   "High"   "High"   "Medium" "Low"    "Low"    "Low"   
## [25] "Low"    "Medium" "Medium" "High"   "Low"    "Low"    "Low"    "Medium"