0. Important Things about R

# I am a comment, since there is a "#" sign before me.
# Always be coding.
# Make sure your solution is correct, clean, and performant (CCP, often in that order).
# R is case-sensitive!!!!!
# A quicker start with R (done in 2 hours): https://www.w3schools.com/r/

1. Assignments

x <- 100  # Create an R object named "x" and assign it the value 100. 
          # If the object x already exists, replace its original value with 100
          # You can also write the code as x = 100.

print(x)  # Call the command or function "print" to print the value of x.
## [1] 100
x         # Equivalent to printing x
## [1] 100
y = 200   # Create another R object named "y" and assign it the value 200. 
          # What is the difference between "=" and "<-"?

y = y - 5 # Update the value of y by subtracting 5 from it.
          # Keep in mind: this is not an equation!!!
y         # Print the value of y
## [1] 195
300 -> z  # Create a third R object named "z" and assign it 300

a = c(3, 10, 21, 45, 70, 80, 92, 105) # Create a fourth R object named "a" and assign it an array of values.
                                      # The object "a" is called a (numeric) vector. Vectors are building blocks in R.
                                      # Here, "c" is called an R function which concatenates values. 
                                      # We will introduce R functions later.

b = 1:10   # Create a fifth R object and assign it 10 consecutive integers from 1 to 10

j = "Hello World" # Create a sixth R object and assign it the string "Hello World".
                  # Note that a string must be enclosed in a pair of quotation marks

m = j # Create a seventh R object m and assign it the value of j. That is, m is a copy of j

2. R infix operation symbols: +, -, *, /, ==, !=, <, >, <=, >=

2a. Arithmetic operators: +, -, *, /

p = 2 + 3  # Create an R object "p" and assign it the sum of 2 and 3, so p takes the value of 5
           # Leaving some spaces around "+" sign increases readability.

q = 2 - 3  # Create an R object "q" and assign it the difference of 2 and 3, so q takes the value of -1

r = 2 * 3  # Create an R object "r" and assign it the product of 2 and 3, so r takes the value of 6

s = 2 / 3  # Create an R object "s" and assign it the ratio of 2 and 3, so s takes the value of 2/3

w = 1 - 2 * 3 / 4  # Create an R object "w" and assign it the value of the mathematical expression "1 - 2 * 3 / 4", 
                   # so w takes the value of -0.5

2b. Relational operators: ==, !=, <, >, <=, >=

2==3
## [1] FALSE
2.0==2
## [1] TRUE
2!=3
## [1] TRUE
2<3
## [1] TRUE
2>3
## [1] FALSE
2<=3
## [1] TRUE
2>=3
## [1] FALSE

3. Logical Values (TRUE and FALSE) and Logical Operators (&, |)

u = TRUE   # Assign the reserved word TRUE to an object.
           # TRUE is numerically equal to 1

v = FALSE  # Assign the reserved word FALSE to an object.
           # FALSE is numerically equal to 0

k = (3 > 2)  # Since 3 is greater than 2, the right hand side is TRUE. 
             # k is an R object taking the value of TRUE.
             # The parentheses are optional, since R knows to evaluate the value of "3>2" first and then assign it to k

e = (4 < 6)  # Since 4 is less than 6, the right hand side is TRUE.

f = (4 <= 6) # Since 4 is less than or equal to 6, the right hand side is TRUE.

g = (12 >= 17) # Since 12 is not greater than or equal to 17, the right hand side is FALSE.

h = (100 == 100.00) # Since 100 is equal to 100.00, the right hand side is TRUE.

f & g # Since f is TRUE and g is FALSE, the result is FALSE
## [1] FALSE
f | g # Since f is TRUE and g is FALSE, the result is TRUE
## [1] TRUE

4. Vectorization

x = c(3, 6, 10, 2, 3)  # x is a vector

y = x + 5  # Add 5 to each of the values in x. The result is a new vector and is assigned to the R object y.

z = x * 10 # Multiply each of the values in x by 10. The result is a new vector and is assigned to the R object z.

a = (x <= 4) # Compare each value in the vector x with 4. 
             # If a value is less than or equal to 4, the result is TRUE. Otherwise the result is FALSE.
             # Since there are 5 values in the vector x, the object a is a vector of 5 TRUE's or FALSE's.
             # In fact, the result is the vector TRUE, FALSE, FALSE, TRUE, TRUE

5. The if-else conditional

x = 82  # The score to be graded

# A score below 60 gets "F"; any other score gets "Pass".
if (x < 60){
  grade = "F"
} else {
  grade = "Pass"
}

grade # Print the value of grade
## [1] "Pass"
x = 78

if (x < 60){
  grade = "F"
} else if (x < 70){
  grade = "D"
} else  if (x < 80){
  grade = "C"
} else  if (x < 90){
  grade = "B"
} else {
  grade = "A"
}

grade # Print the value of grade
## [1] "C"

6. The “for” Loops and “while” Loops

s = 0 # The object s will hold the sum of values in a vector. It is initialized to 0.

# Note: the loop variable x overwrites any earlier object named x.
for (x in c(2, 5, 1, 8, 12)){
  s = s + x   # Update s by adding each of the values sequentially
}

s # The value of s is the sum of 2, 5, 1, 8, 12
## [1] 28
x = 10
while (x > 4){  # When x is greater than 4, perform the task within the pair of braces.
  print(x)  # Print x
  x = x - 1 # Update x
}
## [1] 10
## [1] 9
## [1] 8
## [1] 7
## [1] 6
## [1] 5

7. R Functions

c() # Called with no arguments, the function "c" has nothing to combine and returns NULL
## NULL
x = c(1, 3, 0, 1, 4, 2, 2, 5) # The function "c" creates a vector using the given values.
                              # The vector is assigned to the R object x.

length(x) # Length of x (the number of values in x)
## [1] 8
is.numeric(x) # Check whether x is numeric 
## [1] TRUE
mean(x) # The "mean" is an R function calculating the mean of values in x.
## [1] 2.25
sd(x)   # The "sd" is an R function calculating the standard deviation of values in x.
## [1] 1.669046
sqrt(x) # The "sqrt" is an R function calculating the square root of each value in x.
## [1] 1.000000 1.732051 0.000000 1.000000 2.000000 1.414214 1.414214 2.236068
summary(x)  # Calculate the 5-number summary of the data in x, along with the mean.
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    1.00    2.00    2.25    3.25    5.00
grade = c("A", "A", "B", "C", "B", "B")
is.character(grade) # Check whether "grade" is character
## [1] TRUE
s1 = "Hello"
s2 = "World"
paste(s1, s2, "2023!") # Paste the three strings together
## [1] "Hello World 2023!"
s1 = c("23", "45", "8", "120")
as.numeric(s1)  # Convert the character vector s1 to a numeric vector
## [1]  23  45   8 120
d = c(2, 3, 2, 2, 3, 4, 0, 1, 4, 5, 0, 2, 4, 5, 5, 1, 2, 2, 1, 4, 3, 4, 5, 0, 0, 5, 4, 1, 5, 0, 2)

table(d)    # Create a frequency table for values in d
## d
## 0 1 2 3 4 5 
## 5 4 7 3 6 6
barplot(table(d))  # Create a bar plot based on the frequency table of values in d

hist(d)     # Create a histogram

boxplot(d)  # Create a box-plot

8. Define Your Own R Functions

# A user-defined function must choose a name without any space in it.
# The word "function" must be used, followed by a pair of parentheses within which 
# is a list of parameters separated by commas.
# Within a pair of braces is the so-called function body consisting of one or more lines of code.

# Example 1.
# Input: a numeric vector
# Output: the sum
# Add up the elements of x one at a time and return the running total.
# x: a numeric vector. An empty vector gives 0, because the total starts at 0.
findSumOf <- function(x) {
  total <- 0  # Running total

  for (value in x) {  # "value" takes each element of x in turn
    total <- total + value  # Fold the current element into the total
  }

  return(total)
}

# Use the defined function
x = c(1, 3, 0, 1, 4, 2, 2, 5) 
findSumOf(x)
## [1] 18
# Example 2.
# Input: a numeric or character vector
# Output: A plot
# Draw a plot chosen by the type of x:
#   - numeric x: a histogram of the values;
#   - any other x: a bar plot of the frequency table of the values.
plotOf <- function(x) {
  if (!is.numeric(x)) {
    freq <- table(x)  # Count how often each distinct value occurs
    barplot(freq)
  } else {
    hist(x)
  }
}

# Use the defined function 
x = c(23, 12, 10, 8, 9, 20, 16, 13, 22, 3, 9, 21, 20, 40, 25, 2, 8, 28)
y = c("A", "C", "C", "B", "A", "B", "B", "A", "B", "D")
plotOf(x)

plotOf(y)

# Example 3.
# Input: a numeric vector
# Output: normalization or standardization of the vector
# Normalization transforms the data to have range between 0 and 1
# Standardization transforms the data to have mean 0 and standard deviation 1.
# It's good to use normalization when a machine learning method involves linear combination.


# Rescale a numeric vector.
# x:      a numeric vector.
# method: "normalization" (the default) rescales x as (x - min) / (max - min);
#         "standardization" rescales x as (x - mean) / sd.
# Returns a numeric vector of the same length as x.
# NOTE: once defined, this function masks base::transform() in the workspace.
transform <- function(x, method = c("normalization", "standardization")) {
  # Validate the requested method: only the first element of "method" is used,
  # and an unrecognized value stops with an error instead of silently falling
  # through to standardization.
  method <- match.arg(method[1], c("normalization", "standardization"))

  if (method == "normalization") {
    (x - min(x)) / (max(x) - min(x))
  } else {
    (x - mean(x)) / sd(x)
  }
}

# Use the defined function 
x = c(23, 12, 10, 8, 9, 20, 16, 13, 22, 3, 9, 21, 20, 40, 25, 2, 8, 28)

transform(x) # Default is normalization
##  [1] 0.55263158 0.26315789 0.21052632 0.15789474 0.18421053 0.47368421
##  [7] 0.36842105 0.28947368 0.52631579 0.02631579 0.18421053 0.50000000
## [13] 0.47368421 1.00000000 0.60526316 0.00000000 0.15789474 0.68421053
transform(x, method = "normalization")
##  [1] 0.55263158 0.26315789 0.21052632 0.15789474 0.18421053 0.47368421
##  [7] 0.36842105 0.28947368 0.52631579 0.02631579 0.18421053 0.50000000
## [13] 0.47368421 1.00000000 0.60526316 0.00000000 0.15789474 0.68421053
transform(x, method = "standardization")
##  [1]  0.716950527 -0.418699108 -0.625180860 -0.831662612 -0.728421736
##  [6]  0.407227900 -0.005735604 -0.315458232  0.613709651 -1.347866991
## [11] -0.728421736  0.510468775  0.407227900  2.472045418  0.923432279
## [16] -1.451107867 -0.831662612  1.233154907

9. R Data Types and Data Structures:

# Data types: numeric, integer, complex, character, and logical.
# Data structures: vectors, data frames, matrices, lists, and factors.
# Vector: of the same type
# List: of elements of same or different types
# Factor: stored as a vector of integer values corresponding to character values when displayed.

Name = c("Tom", "Jerry", "David", "Jessica", "Allison", "Amy", "Emma") # A character vector

Score = c(92, 85, 79, 98, 88, 90, 85) # A numeric vector

Section = c("One", "Two", "One", "Two", "Two", "One", "Two")

# Create a data frame with 3 columns
D = data.frame(Name, Score, Section) 
# Print D
D  # A question: How can you suppress the rownames?
##      Name Score Section
## 1     Tom    92     One
## 2   Jerry    85     Two
## 3   David    79     One
## 4 Jessica    98     Two
## 5 Allison    88     Two
## 6     Amy    90     One
## 7    Emma    85     Two
# The "str" function allows you to see the structure of data frame D
str(D)
## 'data.frame':    7 obs. of  3 variables:
##  $ Name   : chr  "Tom" "Jerry" "David" "Jessica" ...
##  $ Score  : num  92 85 79 98 88 90 85
##  $ Section: chr  "One" "Two" "One" "Two" ...
# The "summary" function allows you to generate a summary of each column of data frame D
summary(D)
##      Name               Score         Section         
##  Length:7           Min.   :79.00   Length:7          
##  Class :character   1st Qu.:85.00   Class :character  
##  Mode  :character   Median :88.00   Mode  :character  
##                     Mean   :88.14                     
##                     3rd Qu.:91.00                     
##                     Max.   :98.00
# Extract the scores
D$Score 
## [1] 92 85 79 98 88 90 85
# Or
D[,"Score"]   # This way of extracting a column from a data frame can be very useful
## [1] 92 85 79 98 88 90 85
# Extract the second column. The result is a vector.
D[, 2]   
## [1] 92 85 79 98 88 90 85
# Extract the second row. The result is a one-row data frame (not a vector).
D[2, ]   
##    Name Score Section
## 2 Jerry    85     Two
# Display the first few rows of data frame D
head(D)
##      Name Score Section
## 1     Tom    92     One
## 2   Jerry    85     Two
## 3   David    79     One
## 4 Jessica    98     Two
## 5 Allison    88     Two
## 6     Amy    90     One
# Display the last few rows of data frame D
tail(D)
##      Name Score Section
## 2   Jerry    85     Two
## 3   David    79     One
## 4 Jessica    98     Two
## 5 Allison    88     Two
## 6     Amy    90     One
## 7    Emma    85     Two
# Subset D to make a data frame containing data such that Score >90
subset(D, Score>90)
##      Name Score Section
## 1     Tom    92     One
## 4 Jessica    98     Two
# Subset D to make a data frame containing data such that Score = 98
subset(D, Score == 98)
##      Name Score Section
## 4 Jessica    98     Two
# Add a new column called "Major" to D
D$Major = c("DS", "DA", "Math", "DS", "CS", "EE", "IS")

# Number of rows in data frame D
nrow(D)
## [1] 7
# Number of columns in data frame D
ncol(D)
## [1] 4
# Dimension (Number of rows and columns) of data frame D
dim(D)
## [1] 7 4
# Extract column names of data frame D
colnames(D)  # or just: names(D)
## [1] "Name"    "Score"   "Section" "Major"
# Extract row names of data frame D
rownames(D) 
## [1] "1" "2" "3" "4" "5" "6" "7"
# Change the name of the 3rd column of data frame D
colnames(D)[3] = "section"


# Create a matrix of 3 rows using values in a vector, with elements placed column by column
M = matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), nrow = 3) 

# Print M
M  
##      [,1] [,2] [,3] [,4]
## [1,]    1    4    7   10
## [2,]    2    5    8   11
## [3,]    3    6    9   12
# Extract the second column of matrix M. The result is a vector.
M[, 2]   
## [1] 4 5 6
# Extract the second row of matrix M. The result is a vector.
M[2, ]   
## [1]  2  5  8 11
# Create a list
L = list(StudentRecords = D, M = M, greeting = "Hello!")
# Print L
L
## $StudentRecords
##      Name Score section Major
## 1     Tom    92     One    DS
## 2   Jerry    85     Two    DA
## 3   David    79     One  Math
## 4 Jessica    98     Two    DS
## 5 Allison    88     Two    CS
## 6     Amy    90     One    EE
## 7    Emma    85     Two    IS
## 
## $M
##      [,1] [,2] [,3] [,4]
## [1,]    1    4    7   10
## [2,]    2    5    8   11
## [3,]    3    6    9   12
## 
## $greeting
## [1] "Hello!"
# Extract the matrix in list L
L$M
##      [,1] [,2] [,3] [,4]
## [1,]    1    4    7   10
## [2,]    2    5    8   11
## [3,]    3    6    9   12
L$hobby = "soccer"

length(L) # Number of elements in list L
## [1] 4
# Create a factor with 3 levels
x = c(1, 3, 3, 2, 1, 1, 2, 1, 3, 2, 1, 3, 3, 2)
f = factor(x)
levels(f) # Automatically use different values as levels
## [1] "1" "2" "3"
levels(f) = c("Medium", "Low", "High") # Levels can be changed to whatever you like

# We can do above using one-line code
# Levels are the different values in data and corresponding labels can be chosen arbitrarily.
# Factors can be very useful when creating bar graphs with bars in order you determine.
factor(x, levels = 1:3, labels = c("Medium", "Low", "High"))
##  [1] Medium High   High   Low    Medium Medium Low    Medium High   Low   
## [11] Medium High   High   Low   
## Levels: Medium Low High

An exercise: refer to https://www.thinkbiosolution.com/wp-content/uploads/2019/11/TBS_whitepaper_20190701_R_vs_spo2.pdf. Can you create a data frame based on data from the three trials?

10. The Pipe %>%

# You need to install the "tidyverse" package before using the pipe. 
# To install an R package, find "Package" tab in the right side of this RStudio screen.
# Click it and then click "Install" tab. Type the name of the package.

# An example of using the pipe %>%
library(tidyverse) # Load the package first
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.0 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
625 %>% sqrt %>% sqrt  # Send the number 625 to the square root function;
                       # the result, 25, is then sent to sqrt again.
## [1] 5

# Another example:
# Create a frequency table from the given data and then make a bar plot based on the table.
c(2, 4, 1, 5, 2, 5, 1, 4, 2) %>% table %>% barplot  

11. Some Useful Functions

# The "fix" function: 
# the function invokes edit on x and then assigns 
# the new (edited) version of x in the user's workspace.

# The "na.omit" function:
# The function simply removes missing values from a vector or rows of 
# a data frame where there are missing values, 
# though there are other ways to deal with the missing values.
D=data.frame(x=c(6,8,NA, 7,9, NA),
             y=c(3, 9, 1, NA, 0, 5)
            )
D
##    x  y
## 1  6  3
## 2  8  9
## 3 NA  1
## 4  7 NA
## 5  9  0
## 6 NA  5
na.omit(D)
##   x y
## 1 6 3
## 2 8 9
## 5 9 0
# The "ifelse" function
x = c(95, 84, 90, 73, 65)
ifelse (x >= 90, "Excellent!", "Study hard!")
## [1] "Excellent!"  "Study hard!" "Excellent!"  "Study hard!" "Study hard!"
# The "apply" function
D = data.frame(x1 = 1:5, x2 = 8:12, x3 = c(20, 25, 34, 48, 90))
apply(D, 2, mean) # Find the mean of each column of data frame D. Here "2" indicates action on columns (1 would mean rows).
##   x1   x2   x3 
##  3.0 10.0 43.4
# The "aggregate" function
aggregate(Sepal.Length ~ Species, iris, mean) # Find the mean of Sepal.Length for each Species
                                              # in data frame "iris"
##      Species Sepal.Length
## 1     setosa        5.006
## 2 versicolor        5.936
## 3  virginica        6.588

# The "as.Date" function to handle dates: 
x=c("12/23/2022", "12/24/2022", "12/25/2022")
as.Date(x, format = "%m/%d/%Y")  
## [1] "2022-12-23" "2022-12-24" "2022-12-25"
y=c("23Dec2022", "24deC2022", "25December2022")
as.Date(y, format = "%d%b%Y") 
## [1] "2022-12-23" "2022-12-24" "2022-12-25"
z = c("Jan2023", "Feb2023", "Mar2023", "Apr2023")
z = paste0("01", z) # Prepend a day of month so that z looks like y
z = as.Date(z, "%d%b%Y") # "%Y" reads the 4-digit year; "%y" would read only "20" and give the year 2020
format(z, "%m/%Y")
## [1] "01/2023" "02/2023" "03/2023" "04/2023"
format(z, "%m") # Extract only month
## [1] "01" "02" "03" "04"
w = c("Oct 23, 2022", "Oct 24, 2022", "Oct 25, 2022") # How would you handle these?
 
# The "gsub" function for global substitution within a string
s = c("230,956", "361,087", "73,925")  # a vector of 3 strings
gsub(",", "", s)  # Replace every comma in each string of vector s with the empty string, i.e. remove all commas.
## [1] "230956" "361087" "73925"
# The "substring" function for extracting a piece of a string
x="Have a great semester!"
substring(x, 3)  # a substring from the 3rd character to the end
## [1] "ve a great semester!"
substring(x, 3, 9) # a substring from the 3rd character to the 9th
## [1] "ve a gr"
# The "gregexpr" for locating all occurrences of a character in a string
## For example, the lowercase character "t" occurs at positions 16, 17, 21, 34, and 38 of the string "I am perhaps better than This instructor"; the capital "T" at position 26 is not matched.
x = "I am perhaps better than This instructor"
gregexpr("t",x)[[1]] %>% unlist() # Locate positions of all t's
## [1] 16 17 21 34 38
## attr(,"match.length")
## [1] 1 1 1 1 1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
y = "The house has 3 bedrooms and 2 bathrooms."
gregexpr("[0-9]",y) %>% unlist() # Locate all digits
## [1] 15 30
## Can you write a function called "indexc", to locate the first occurrence 
## of a character in every string in a character vector?

12. Read data remotely and fix anomalies

13. Make a data pipeline by watching the video:

https://www.youtube.com/watch?v=9w4WFe48rEg

14. Write a journal article using R Markdown or Quarto with the “rticles” package

  • To produce a pdf output, you might need to install the “tinytex” package.

15. Web Scrape Dynamic Tables in R

The Rvest & RSelenium Tutorials:

16. How to Create Your Own Package in RStudio in just 15 minutes?

Tutorials:

17. Image Analysis with R

The complete reference is here.

You must install packages:

  • BiocManager: this package manages all bioconductor packages.

  • EBImage: this package allows you to do image analysis.

library(EBImage)

img1 = readImage("MonaLisa1.jpeg")
display(img1)
img2 = readImage("MonaLisa2.jpeg")
display(img2)
img3 = readImage("MonaLisa3.jpeg")
display(img3)
img4 = readImage("MonaLisa4.jpeg")
display(img4)

par(mfrow=c(2,2))
hist(img1)
hist(img2)
hist(img3)
hist(img4)
par(mfrow=c(1,1))

left4 = img4[1:300 , ,  ]
display(left4)
right4 = img4[450:725 , ,  ]
display(right4)

# Draw the histograms of the images on a 2-by-3 grid of panels,
# titling each panel with the name of the object it shows.
par(mfrow=c(2,3))
hist(img1, main = "img1")
hist(img2, main = "img2")
hist(img3, main = "img3")
hist(left4, main = "left4")
hist(right4, main = "right4")
par(mfrow=c(1,1))  # Restore the single-panel layout

# Create your own image
x = Image(rnorm(300*300*3),dim=c(300,300,3), colormode='Color')
display(x)
hist(x)

18. The JSON

  • JSON is a syntax for storing and exchanging data.

  • JSON is a string, written with JavaScript object notation. Strings are easy to read and store.

  • In R, a JSON string can be converted to an R list.

  • In R, an R list can be converted to a JSON string.

# The following is a string in JSON.
x =  '{"p1":{ "name":"John", "age":30, "city":"New York"}, "p2":{ "name":"Amy", "age":44, "city":"St. Cloud"}}'

# Use the function fromJSON() to convert x to an R list
L=jsonlite::fromJSON(x)

L
## $p1
## $p1$name
## [1] "John"
## 
## $p1$age
## [1] 30
## 
## $p1$city
## [1] "New York"
## 
## 
## $p2
## $p2$name
## [1] "Amy"
## 
## $p2$age
## [1] 44
## 
## $p2$city
## [1] "St. Cloud"
# You can convert it back with the toJSON() function
jsonlite::toJSON(L)
## {"p1":{"name":["John"],"age":[30],"city":["New York"]},"p2":{"name":["Amy"],"age":[44],"city":["St. Cloud"]}}

Appendix: Answers to questions

print(pressure, row.names = FALSE) # Suppress row names
##  temperature pressure
##            0   0.0002
##           20   0.0012
##           40   0.0060
##           60   0.0300
##           80   0.0900
##          100   0.2700
##          120   0.7500
##          140   1.8500
##          160   4.2000
##          180   8.8000
##          200  17.3000
##          220  32.1000
##          240  57.0000
##          260  96.0000
##          280 157.0000
##          300 247.0000
##          320 376.0000
##          340 558.0000
##          360 806.0000