Open a new script file: File > New File > R Markdown
A New R Markdown wizard will pop up, where you can add a title and author
Write by editing the template
Knit document to generate report
New File (Top left corner) >> R Markdown OR
File >> New File >> R Markdown
Windows: Press Ctrl + Alt + I to insert a new code chunk
Mac: Press Cmd + Option + I to insert a new code chunk
# In order to run your code, please type your code inside this code chunk
It is good practice to keep a set of related data, analyses, and text self-contained in a single folder (This is called the working directory).
Before we start any analysis, make sure to set the path to the directory where we are working.
Please refer to the attached PowerPoint Presentation
R comes with a built in reference manual.
Type: help("anova") OR
Type: ? anova
packages: A bundle of functions, documentation, and datasets.
Note: You cannot use the contents of a package until you load a package in your current R session. You should update your packages from time to time to receive the latest improvements from package authors.
# install.packages("tidyverse")
# install.packages("readxl")
# install.packages("SmartEDA")
# install.packages("patchwork")
library(tidyverse) # For data analysis and plotting
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.5 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl) # For importing excel files
library(xlsx) # For importing excel files
library(SmartEDA) # For Exploratory data analysis
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(patchwork) # For combining plots/charts/graphs
# mydata <- read_excel("mydataset.xlsx") # To import an Excel (.xlsx) file
#
# mydata <- read_csv("mydataset.csv") # To import a Comma Separated Values (.csv) file
# haven provides read_dta() for importing Stata (.dta) datasets.
# Install it once from the console. The call uses straight quotes (curly
# quotes are a syntax error in R) and is kept commented out, like the other
# install.packages() calls in this script, so the package is not reinstalled
# every time the document runs.
# install.packages("haven")
library(haven) # For importing Stata (.dta) files
# mydata <- read_dta("mtcars.dta") # Import a STATA dataset (.dta)
# Import the diabetes data set from Excel and convert every character column
# to a factor, so that summary() reports a count per level for those columns.
# mutate(across(where(...))) replaces mutate_if(), which dplyr has superseded.
diabetes <- read_excel("diabetes_dataset.xlsx") |> # Press Enter to go to the next line
  mutate(across(where(is.character), as.factor)) # Press Ctrl + Enter
# Open the documentation for the two column types used above
help("factor")
help("character")

# Print a short per-column summary of the data set (run with Ctrl + Enter)
diabetes |>
  summary()
## Outcome Age BMI Pregnancies
## Diabetic :567 Min. :16.00 Min. :15.10 Min. : 0.000
## Not Diabetic:340 1st Qu.:29.00 1st Qu.:25.00 1st Qu.: 6.000
## Median :34.00 Median :31.50 Median : 8.000
## Mean :34.79 Mean :31.95 Mean : 8.499
## 3rd Qu.:40.00 3rd Qu.:37.75 3rd Qu.:11.000
## Max. :56.00 Max. :62.60 Max. :21.000
## PedigreeFunction Insulin Glucose SkinThickness
## Min. :0.004918 Min. : 50.0 Min. :140.0 Min. :12.97
## 1st Qu.:0.356315 1st Qu.:155.0 1st Qu.:150.0 1st Qu.:23.97
## Median :0.500633 Median :200.0 Median :152.0 Median :27.16
## Mean :0.494279 Mean :202.7 Mean :152.6 Mean :27.63
## 3rd Qu.:0.627481 3rd Qu.:249.0 3rd Qu.:155.0 3rd Qu.:31.41
## Max. :0.994079 Max. :426.0 Max. :168.0 Max. :45.32
## BloodPressure BMIcategories
## Min. : 56.00 Normal :177
## 1st Qu.: 80.50 Obese :511
## Median : 89.00 Overweight :165
## Mean : 89.31 Underweight: 54
## 3rd Qu.: 98.00
## Max. :119.00
# SmartEDA::ExpReport(diabetes, op_file = "diabetes.html") # The SmartEDA package gives a comprehensive summary of your dataset (great package to use exploratory data analysis). # Press Ctrl + Enter
# Bar chart: one bar per value of Pregnancies, bar height = number of rows.
# Run the whole statement with Ctrl + Enter.
ggplot(data = diabetes, mapping = aes(x = Pregnancies)) +
  geom_bar()

# Open the documentation for ggplot()
help("ggplot")
# Stacked bar chart of Pregnancies: mapping Outcome to `fill` colours each
# bar by outcome group. Run with Ctrl + Enter.
plot1 <- ggplot(
  data = diabetes,
  mapping = aes(x = Pregnancies, fill = Outcome)
) +
  geom_bar()

# Display the stored plot
plot1
# Dodged bar chart of Pregnancies: the bars for each Outcome level are placed
# next to each other instead of being stacked. Run with Ctrl + Enter.
plot2 <- ggplot(
  data = diabetes,
  mapping = aes(x = Pregnancies, fill = Outcome)
) +
  geom_bar(position = "dodge")

# Display the stored plot
plot2
# Faceted bar chart of Pregnancies: facet_wrap() draws one sub plot per
# Outcome level (two panels here). The legend is dropped because the panel
# titles already identify the groups. Run with Ctrl + Enter.
plot3 <- ggplot(
  data = diabetes,
  mapping = aes(x = Pregnancies, fill = Outcome)
) +
  facet_wrap(vars(Outcome)) +
  geom_bar(show.legend = FALSE)

# Display the stored plot
plot3
# Combine the three bar charts into a single figure using the patchwork
# operators; the two statements below show two alternative arrangements.
# NOTE(review): per patchwork's documentation `/` stacks plots vertically and
# `|` places them side by side -- confirm against the rendered figures.
plot1 / plot2 / plot3 # Press Ctrl + Enter
# OR
plot3 | (plot1 / plot2) # Press Ctrl + Enter
# Boxplot of Age for each Outcome group (run with Ctrl + Enter)
ggplot(data = diabetes, mapping = aes(x = Outcome, y = Age)) +
  geom_boxplot()
# Boxplot of Age by Outcome, split further by BMI category: mapping
# BMIcategories to `fill` gives one coloured box per category within each
# Outcome group. Run with Ctrl + Enter.
ggplot(
  data = diabetes,
  mapping = aes(x = Outcome, y = Age, fill = BMIcategories)
) +
  geom_boxplot()
# Boxplot of Age by Outcome, drawn in a separate panel for each BMI category
# (four sub plots). Run with Ctrl + Enter.
diabetes |> # Press Enter to go to the next line
  ggplot(aes(x = Outcome,
             y = Age,
             fill = BMIcategories)) + # fill colours the boxes by BMI category
  # The legend is removed because the panel titles already name the
  # categories. Spell out FALSE: `F` is an ordinary variable that can be
  # reassigned, whereas FALSE is a reserved word.
  geom_boxplot(show.legend = FALSE) +
  facet_wrap(~ BMIcategories) # one sub plot per BMI category
# Boxplot of Age by Outcome with each box coloured by its Outcome group.
# The legend is dropped because the x axis already labels the groups.
ggplot(
  data = diabetes,
  mapping = aes(x = Outcome, y = Age, fill = Outcome)
) +
  geom_boxplot(show.legend = FALSE)
# Boxplot of Age by Outcome with the individual participant data points
# overlaid on top of the boxes. Run with Ctrl + Enter.
diabetes |> # Press Enter to go to the next line
  ggplot(aes(x = Outcome,
             y = Age,
             fill = Outcome)) + # fill colours the boxes by Outcome
  # Hide the boxplot's own outlier markers: geom_jitter() below already draws
  # every observation, so outliers would otherwise be plotted twice.
  geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
  # Jitter horizontally only (height = 0) so every point keeps its true Age.
  geom_jitter(height = 0)
# Boxplot of Glucose for each BMI category, with one coloured box per
# Outcome group inside every category, plus a plot title.
# Run with Ctrl + Enter.
ggplot(
  data = diabetes,
  mapping = aes(x = BMIcategories, y = Glucose, fill = Outcome)
) +
  geom_boxplot() +
  ggtitle("Distribution of Glucose stratified by BMI categories and Outcome")
# BMIcategories + Insulin + Outcome:
# boxplot of Insulin for each BMI category, coloured by Outcome, with a title.
# theme_classic() replaces the default grey background.
# Run with Ctrl + Enter.
ggplot(
  data = diabetes,
  mapping = aes(x = BMIcategories, y = Insulin, fill = Outcome)
) +
  geom_boxplot() +
  theme_classic() +
  ggtitle("Distribution of Insulin stratified by BMI categories and Outcome")
# Scatter plot of Insulin against Age; mapping Outcome to `colour` colours
# each point by outcome group. Run with Ctrl + Enter.
ggplot(
  data = diabetes,
  mapping = aes(x = Age, y = Insulin, colour = Outcome)
) +
  geom_point()
# Scatter plot of Insulin against Age, coloured by Outcome and drawn in a
# separate panel for each Outcome level. Run with Ctrl + Enter.
ggplot(
  data = diabetes,
  mapping = aes(x = Age, y = Insulin, colour = Outcome)
) +
  facet_wrap(vars(Outcome)) +
  geom_point()