# I am a comment, since there is a "#" sign before me.
# Always be coding.
# Make sure your solution is correct, clean, and performant (CCP, often in that order).
# R is case-sensitive!!!!!
# A quicker start with R (done in 2 hours): https://www.w3schools.com/r/
# --- Creating objects and doing arithmetic ---
# Three assignment forms appear below: "<-", "=", and "->" (value on the left, name on the right).
x <- 100 # Create an R object named "x" and assign it the value 100.
# If the object x already exists, its original value is replaced with 100.
# You can also write the code as x = 100.
print(x) # Call the function "print" to print the value of x.
## [1] 100
x # Typing the name alone is equivalent to printing x
## [1] 100
y = 200 # Create another R object named "y" and assign it the value 200.
# What is the difference between "=" and "<-"?
y = y - 5 # Update the value of y by subtracting 5 from it.
# Keep in mind: this is an assignment, not an equation!!!
y # Print the value of y
## [1] 195
300 -> z # Create a third R object named "z" and assign it 300
a = c(3, 10, 21, 45, 70, 80, 92, 105) # Create a fourth R object named "a" and assign it an array of values.
# The object "a" is called a (numeric) vector. Vectors are building blocks in R.
# Here, "c" is an R function which concatenates values.
# We will introduce R functions later.
b = 1:10 # Create a fifth R object and assign it the 10 consecutive integers from 1 to 10
j = "Hello World" # Create a sixth R object and assign it the string "Hello World".
# Note that a string must be enclosed in a pair of quotation marks
m = j # Create a seventh R object m and assign it the value of j. That is, m is a copy of j
p = 2 + 3 # Create an R object "p" and assign it the sum of 2 and 3, so p takes the value of 5
# Leaving some spaces around the "+" sign increases readability.
q = 2 - 3 # Create an R object "q" and assign it the difference of 2 and 3, so q takes the value of -1
r = 2 * 3 # Create an R object "r" and assign it the product of 2 and 3, so r takes the value of 6
s = 2 / 3 # Create an R object "s" and assign it the ratio of 2 and 3, so s takes the value of 2/3
w = 1 - 2 * 3 / 4 # Create an R object "w" and assign it the value of the mathematical expression "1 - 2 * 3 / 4",
# so w takes the value of -0.5 (multiplication and division are evaluated before subtraction)
# --- Comparison operators: each comparison evaluates to TRUE or FALSE ---
# "==" tests equality (a single "=" would be an assignment), "!=" tests inequality.
2==3
## [1] FALSE
2.0==2
## [1] TRUE
2!=3
## [1] TRUE
2<3
## [1] TRUE
2>3
## [1] FALSE
2<=3
## [1] TRUE
2>=3
## [1] FALSE
u = TRUE # Assign the reserved word TRUE to an object.
# TRUE is numerically equal to 1
v = FALSE # Assign the reserved word FALSE to an object.
# FALSE is numerically equal to 0
k = (3 > 2) # Since 3 is greater than 2, the right hand side is TRUE.
# k is an R object taking the value of TRUE.
# The parentheses are optional, since R evaluates "3>2" first and then assigns the result to k
e = (4 < 6) # Since 4 is less than 6, the right hand side is TRUE.
f = (4 <= 6) # Since 4 is less than or equal to 6, the right hand side is TRUE.
g = (12 >= 17) # Since 12 is not greater than or equal to 17, the right hand side is FALSE.
h = (100 == 100.00) # Since 100 is equal to 100.00, the right hand side is TRUE.
# --- Logical operators: "&" is AND, "|" is OR ---
f & g # Since f is TRUE and g is FALSE, the result is FALSE
## [1] FALSE
f | g # Since f is TRUE and g is FALSE, the result is TRUE
## [1] TRUE
# --- Arithmetic and comparisons act on every element of a vector ---
x = c(3, 6, 10, 2, 3) # x is a vector
y = x + 5 # Add 5 to each of the values in x. The result is a new vector and is assigned to the R object y.
z = x * 10 # Multiply each of the values in x by 10. The result is a new vector and is assigned to the R object z.
a = (x <= 4) # Compare each value in the vector x with 4.
# If a value is less than or equal to 4, the result is TRUE. Otherwise the result is FALSE.
# Since there are 5 values in the vector x, the object a is a vector of 5 TRUE's or FALSE's.
# In fact, the result is the array TRUE, FALSE, FALSE, TRUE, TRUE
# --- Conditional execution with if / else ---
# Two-way decision: a score below 60 is an "F", anything else is a "Pass".
x = 82
if (x < 60){
grade = "F"
} else {
grade = "Pass"
}
# NOTE(review): the next line prints y (the vector x + 5 from the previous example),
# not the result of the if/else above; that result is stored in "grade" (here "Pass").
y # Print the value of y
## [1] 8 11 15 7 8
# Multi-way decision: the conditions are checked from top to bottom and
# only the branch of the first TRUE condition is executed.
x = 78
if (x < 60){
grade = "F"
} else if (x < 70){
grade = "D"
} else if (x < 80){
grade = "C"
} else if (x < 90){
grade = "B"
} else {
grade = "A"
}
grade # Print the value of grade: 78 is not below 70 but is below 80, hence "C"
## [1] "C"
# --- The "for" loop: repeat a task once for every value in a vector ---
s = 0 # The object s will hold the sum of values in a vector. It is initialized to 0.
for (x in c(2, 5, 1, 8, 12)){ # x takes the values 2, 5, 1, 8, 12 in turn
s = s + x # Update s by adding each of the values sequentially
}
s # The value of s is the sum of 2, 5, 1, 8, 12
## [1] 28
# --- The "while" loop: repeat a task as long as a condition stays TRUE ---
x = 10
while (x > 4){ # While x is greater than 4, perform the task within the pair of braces.
print(x) # Print x
x = x - 1 # Update x; without this line the condition would never become FALSE
}
## [1] 10
## [1] 9
## [1] 8
## [1] 7
## [1] 6
## [1] 5
# --- Built-in functions that work on vectors ---
c() # Called without any values, the function "c" returns NULL (an empty object)
## NULL
x = c(1, 3, 0, 1, 4, 2, 2, 5) # The function "c" creates a vector using the given values.
# The vector is assigned to the R object x.
length(x) # Length of x (the number of values in x)
## [1] 8
is.numeric(x) # Check whether x is numeric
## [1] TRUE
mean(x) # "mean" is an R function calculating the mean of the values in x.
## [1] 2.25
sd(x) # "sd" is an R function calculating the standard deviation of the values in x.
## [1] 1.669046
sqrt(x) # "sqrt" is an R function calculating the square root of each value in x.
## [1] 1.000000 1.732051 0.000000 1.000000 2.000000 1.414214 1.414214 2.236068
summary(x) # Calculate the 5-number summary of the data in x, along with the mean.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 1.00 2.00 2.25 3.25 5.00
grade = c("A", "A", "B", "C", "B", "B") # A character vector
is.character(grade) # Check whether "grade" is character
## [1] TRUE
# --- Working with strings: pasting and converting to numbers ---
s1 = "Hello"
s2 = "World"
paste(s1, s2, "2023!") # Paste the three strings together, separated by single spaces
## [1] "Hello World 2023!"
s1 = c("23", "45", "8", "120") # A character vector whose strings look like numbers
as.numeric(s1) # Convert the character vector s1 to a numeric vector
## [1]  23  45   8 120
# --- Summarizing and plotting a numeric vector ---
d = c(2, 3, 2, 2, 3, 4, 0, 1, 4, 5, 0, 2, 4, 5, 5, 1, 2, 2, 1, 4, 3, 4, 5, 0, 0, 5, 4, 1, 5, 0, 2)
table(d) # Create a frequency table for values in d
# In the output, the first row lists the distinct values and the second row their counts.
## d
## 0 1 2 3 4 5
## 5 4 7 3 6 6
barplot(table(d)) # Create a bar plot based on the frequency table of values in d
hist(d) # Create a histogram
boxplot(d) # Create a box-plot
# A user-defined function must choose a name without any space in it.
# The word "function" must be used, followed by a pair of parentheses within which
# is a list of parameters separated by commas.
# Within a pair of braces is the so-called function body, consisting of one or more lines of code.
# Example 1.
# Input: a numeric vector
# Output: the sum
# findSumOf: add up the values of a numeric vector.
#   x: a numeric vector.
# Returns the sum of the values in x (0 when x has no values).
findSumOf = function(x){
  # Fold the values of x from left to right, starting from a running total of 0.
  Reduce(function(total, value) total + value, x, 0)
}
# Try the function on a small vector
x = c(1, 3, 0, 1, 4, 2, 2, 5)
findSumOf(x)
## [1] 18
# Example 2.
# Input: a numeric or character vector
# Output: A plot
# plotOf: draw a plot suited to the type of the data.
#   x: a numeric or character vector.
# A numeric vector is shown as a histogram; anything else is tabulated
# and shown as a bar plot of the frequencies.
plotOf = function(x){
  if (!is.numeric(x)){
    return(barplot(table(x)))
  }
  hist(x)
}
# Try the function on a numeric vector and on a character vector
x = c(23, 12, 10, 8, 9, 20, 16, 13, 22, 3, 9, 21, 20, 40, 25, 2, 8, 28)
y = c("A", "C", "C", "B", "A", "B", "B", "A", "B", "D")
plotOf(x)
plotOf(y)
# Example 3.
# Input: a numeric vector
# Output: normalization or standardization of the vector
# Normalization transforms the data to have range between 0 and 1
# Standardization transforms the data to have mean 0 and standard deviation 1.
# It's good to use normalization when a machine learning method involves linear combination.
# transform: rescale a numeric vector.
#   x:      a numeric vector.
#   method: "normalization" (the default) maps the values onto the range 0 to 1;
#           "standardization" gives the values mean 0 and standard deviation 1.
# Returns a numeric vector of the same length as x.
# match.arg() checks "method" against the allowed choices, so a misspelled method
# raises an error instead of silently being treated as standardization.
# Note: defining this function masks base R's own transform() in the workspace.
transform = function(x, method = c("normalization", "standardization")){
  method = match.arg(method) # With no method given, the first choice is used
  if (method == "normalization"){
    bounds = range(x) # bounds[1] is the minimum, bounds[2] the maximum
    (x - bounds[1]) / (bounds[2] - bounds[1])
  } else {
    (x - mean(x)) / sd(x)
  }
}
# Use the defined function
x = c(23, 12, 10, 8, 9, 20, 16, 13, 22, 3, 9, 21, 20, 40, 25, 2, 8, 28)
transform(x) # Default is normalization
## [1] 0.55263158 0.26315789 0.21052632 0.15789474 0.18421053 0.47368421
## [7] 0.36842105 0.28947368 0.52631579 0.02631579 0.18421053 0.50000000
## [13] 0.47368421 1.00000000 0.60526316 0.00000000 0.15789474 0.68421053
transform(x, method = "normalization")
## [1] 0.55263158 0.26315789 0.21052632 0.15789474 0.18421053 0.47368421
## [7] 0.36842105 0.28947368 0.52631579 0.02631579 0.18421053 0.50000000
## [13] 0.47368421 1.00000000 0.60526316 0.00000000 0.15789474 0.68421053
transform(x, method = "standardization")
## [1] 0.716950527 -0.418699108 -0.625180860 -0.831662612 -0.728421736
## [6] 0.407227900 -0.005735604 -0.315458232 0.613709651 -1.347866991
## [11] -0.728421736 0.510468775 0.407227900 2.472045418 0.923432279
## [16] -1.451107867 -0.831662612 1.233154907
# Data types: numeric, integer, complex, character, and logical.
# Data structures: vectors, data frames, matrices, lists, and factors.
# Vector: of the same type
# List: of elements of same or different types
# Factor: stored as a vector of integer values corresponding to character values when displayed.
# --- Data frames: a table whose columns are vectors of equal length ---
Name = c("Tom", "Jerry", "David", "Jessica", "Allison", "Amy", "Emma") # A character vector
Score = c(92, 85, 79, 98, 88, 90, 85) # A numeric vector
Section = c("One", "Two", "One", "Two", "Two", "One", "Two") # Another character vector
# Create a data frame with 3 columns; the column names are taken from the object names
D = data.frame(Name, Score, Section)
# Print D
D # A question: How can you suppress the rownames?
## Name Score Section
## 1 Tom 92 One
## 2 Jerry 85 Two
## 3 David 79 One
## 4 Jessica 98 Two
## 5 Allison 88 Two
## 6 Amy 90 One
## 7 Emma 85 Two
# The "str" function allows you to see the structure of data frame D:
# the number of rows (obs.) and columns (variables), and the type of each column
str(D)
## 'data.frame': 7 obs. of 3 variables:
## $ Name : chr "Tom" "Jerry" "David" "Jessica" ...
## $ Score : num 92 85 79 98 88 90 85
## $ Section: chr "One" "Two" "One" "Two" ...
# The "summary" function allows you to generate a summary of each column of data frame D
summary(D)
## Name Score Section
## Length:7 Min. :79.00 Length:7
## Class :character 1st Qu.:85.00 Class :character
## Mode :character Median :88.00 Mode :character
## Mean :88.14
## 3rd Qu.:91.00
## Max. :98.00
# Extract the scores by column name using "$"
D$Score
## [1] 92 85 79 98 88 90 85
# Or
D[,"Score"] # This way of extracting a column from a data frame can be very useful
## [1] 92 85 79 98 88 90 85
# Extract the second column. The result is a vector.
D[, 2]
## [1] 92 85 79 98 88 90 85
# Extract the second row. The result is a data frame with one row (note the column names in the output).
D[2, ]
## Name Score Section
## 2 Jerry 85 Two
# Display the first few rows of data frame D (here 6 of the 7 rows are shown)
head(D)
## Name Score Section
## 1 Tom 92 One
## 2 Jerry 85 Two
## 3 David 79 One
## 4 Jessica 98 Two
## 5 Allison 88 Two
## 6 Amy 90 One
# Display the last few rows of data frame D (here rows 2 to 7 are shown)
tail(D)
## Name Score Section
## 2 Jerry 85 Two
## 3 David 79 One
## 4 Jessica 98 Two
## 5 Allison 88 Two
## 6 Amy 90 One
## 7 Emma 85 Two
# Subset D to make a data frame containing the rows such that Score > 90
subset(D, Score>90)
## Name Score Section
## 1 Tom 92 One
## 4 Jessica 98 Two
# Subset D to make a data frame containing the rows such that Score equals 98 (note the "==")
subset(D, Score == 98)
## Name Score Section
## 4 Jessica 98 Two
# Add a new column called "Major" to D; one value is supplied for each of the 7 rows
D$Major = c("DS", "DA", "Math", "DS", "CS", "EE", "IS")
# Number of rows in data frame D
nrow(D)
## [1] 7
# Number of columns in data frame D
ncol(D)
## [1] 4
# Dimension (number of rows and columns) of data frame D
dim(D)
## [1] 7 4
# Extract column names of data frame D
colnames(D) # or just: names(D)
## [1] "Name" "Score" "Section" "Major"
# Extract row names of data frame D
rownames(D)
## [1] "1" "2" "3" "4" "5" "6" "7"
# Change the name of the 3rd column of data frame D from "Section" to "section"
colnames(D)[3] = "section"
# --- Matrices: values of one type arranged in rows and columns ---
# Create a matrix of 3 rows using values in a vector, with elements placed column by column.
# The 12 values fill 3 rows, so the matrix has 4 columns.
M = matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), nrow = 3)
# Print M
M
## [,1] [,2] [,3] [,4]
## [1,] 1 4 7 10
## [2,] 2 5 8 11
## [3,] 3 6 9 12
# Extract the second column of the matrix. The result is a vector.
M[, 2]
## [1] 4 5 6
# Extract the second row of the matrix. The result is a vector.
M[2, ]
## [1]  2  5  8 11
# --- Lists: a collection of named elements that may have different types ---
# Create a list holding a data frame (D), a matrix (M), and a string
L = list(StudentRecords = D, M = M, greeting = "Hello!")
# Print L; each element is shown under its name, prefixed by "$"
L
## $StudentRecords
## Name Score section Major
## 1 Tom 92 One DS
## 2 Jerry 85 Two DA
## 3 David 79 One Math
## 4 Jessica 98 Two DS
## 5 Allison 88 Two CS
## 6 Amy 90 One EE
## 7 Emma 85 Two IS
##
## $M
## [,1] [,2] [,3] [,4]
## [1,] 1 4 7 10
## [2,] 2 5 8 11
## [3,] 3 6 9 12
##
## $greeting
## [1] "Hello!"
# Extract the matrix in list L by its name
L$M
## [,1] [,2] [,3] [,4]
## [1,] 1 4 7 10
## [2,] 2 5 8 11
## [3,] 3 6 9 12
# Add a fourth element called "hobby" to the list
L$hobby = "soccer"
length(L) # Number of elements in list L
## [1] 4
# --- Factors: categorical data with a fixed set of levels ---
# Create a factor with 3 levels
x = c(1, 3, 3, 2, 1, 1, 2, 1, 3, 2, 1, 3, 3, 2)
f = factor(x)
levels(f) # The distinct values in the data are automatically used as levels
## [1] "1" "2" "3"
# Levels can be changed to whatever you like; the new names replace "1", "2", "3" in that order,
# so 1 becomes "Medium", 2 becomes "Low", and 3 becomes "High".
levels(f) = c("Medium", "Low", "High")
# We can do the above using one line of code:
# "levels" are the different values in the data and the corresponding "labels" can be chosen arbitrarily.
# Factors can be very useful when creating bar graphs with bars in an order you determine.
factor(x, levels = 1:3, labels = c("Medium", "Low", "High"))
## [1] Medium High High Low Medium Medium Low Medium High Low
## [11] Medium High High Low
## Levels: Medium Low High
# You need to install the "tidyverse" package before using the pipe.
# To install an R package, find "Package" tab in the right side of this RStudio screen.
# Click it and then click "Install" tab. Type the name of the package.
# --- The pipe %>%: send the value on the left to the function on the right ---
library(tidyverse) # Load the package first
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.0
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# An example of using the pipe %>%:
# send the number 625 to the square root function, which gives 25;
# the result 25 is then sent to the square root function again, which gives 5.
625 %>% sqrt %>% sqrt # Same as sqrt(sqrt(625))
## [1] 5
# Another example:
# Create a frequency table from the given data and then make a bar plot based on the table.
c(2, 4, 1, 5, 2, 5, 1, 4, 2) %>% table %>% barplot
# The "fix" function:
# the function invokes edit on x and then assigns
# the new (edited) version of x in the user's workspace.
# The "na.omit" function:
# The function simply removes missing values from a vector or rows of
# a data frame where there are missing values,
# though there are other ways to deal with the missing values.
# In the data frame below, NA marks a missing value.
D=data.frame(x=c(6,8,NA, 7,9, NA),
y=c(3, 9, 1, NA, 0, 5)
)
D
## x y
## 1 6 3
## 2 8 9
## 3 NA 1
## 4 7 NA
## 5 9 0
## 6 NA 5
# Rows 3, 4 and 6 each contain an NA, so only rows 1, 2 and 5 are kept
na.omit(D)
## x y
## 1 6 3
## 2 8 9
## 5 9 0
# The "ifelse" function:
# for each value of x, give the first string when the condition is TRUE and the second when it is FALSE
x = c(95, 84, 90, 73, 65)
ifelse (x >= 90, "Excellent!", "Study hard!")
## [1] "Excellent!" "Study hard!" "Excellent!" "Study hard!" "Study hard!"
# The "apply" function
D = data.frame(x1 = 1:5, x2 = 8:12, x3 = c(20, 25, 34, 48, 90))
apply(D, 2, mean) # Find the mean of each column of data frame D. Here "2" indicates action on columns.
## x1 x2 x3
## 3.0 10.0 43.4
# The "aggregate" function
aggregate(Sepal.Length ~ Species, iris, mean) # Find the mean of Sepal.Length for each Species in data frame "iris"
## Species Sepal.Length
## 1 setosa 5.006
## 2 versicolor 5.936
## 3 virginica 6.588
# The "as.Date" function to handle dates:
# the "format" string describes how the dates are written in the input strings.
x=c("12/23/2022", "12/24/2022", "12/25/2022")
as.Date(x, format = "%m/%d/%Y") # month/day/4-digit year
## [1] "2022-12-23" "2022-12-24" "2022-12-25"
y=c("23Dec2022", "24deC2022", "25December2022")
as.Date(y, format = "%d%b%Y") # day, month name, 4-digit year; "Dec", "deC" and "December" are all recognized here
## [1] "2022-12-23" "2022-12-24" "2022-12-25"
# Strings holding only a month and a year cannot be converted directly,
# so a day ("01") is pasted in front of each of them first.
z = c("Jan2023", "Feb2023", "Mar2023", "Apr2023")
z = paste0("01", z) # Make z look like y, e.g. "01Jan2023"
# Use "%Y" (4-digit year). With "%y" (2-digit year) only the "20" of "2023"
# would be read as the year and every date would land in 2020.
z = as.Date(z, "%d%b%Y")
format(z, "%m/%Y") # Show month and year only
## [1] "01/2023" "02/2023" "03/2023" "04/2023"
format(z, "%m") # Extract only month
## [1] "01" "02" "03" "04"
w = c("Oct 23, 2022", "Oct 24, 2022", "Oct 25, 2022") # How would you handle these?
# The "gsub" function for global substitution within a string
s = c("230,956", "361,087", "73,925") # a vector of 3 strings
gsub(",", "", s) # Replace every comma in each of the strings in vector s with nothing (i.e. remove it).
## [1] "230956" "361087" "73925"
# The "substring" function for extracting a piece of a string
x="Have a great semester!"
substring(x, 3) # a substring from the 3rd character to the end
## [1] "ve a great semester!"
substring(x, 3, 9) # a substring from the 3rd character to the 9th
## [1] "ve a gr"
# The "gregexpr" function for locating all occurrences of a character in a string
## For example, the lowercase "t" occurs at positions 16, 17, 21, 34 and 38 of the string below.
## The capital "T" in "This" (position 26) is not reported, since the search is for lowercase "t".
x = "I am perhaps better than This instructor"
# gregexpr returns a list; [[1]] takes its first element, which holds the positions
# together with some extra attributes (these are printed below the positions).
gregexpr("t",x)[[1]] %>% unlist() # Locate positions of all t's
## [1] 16 17 21 34 38
## attr(,"match.length")
## [1] 1 1 1 1 1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
y = "The house has 3 bedrooms and 2 bathrooms."
# "[0-9]" matches any single digit. Here the whole list is sent to unlist(),
# and only the positions are printed.
gregexpr("[0-9]",y) %>% unlist() # Locate all digits
## [1] 15 30
## Can you write a function called "indexc", to locate the first occurrence
## of a character in every string in a character vector?
# American Statistical Association survey data: https://ww2.amstat.org/censusatschool/. How would you deal with issues (inconsistency, extra spaces, …) in variables such as “height”? This is called data cleaning.
# New York police stop-and-frisk data: https://www.nyc.gov/site/nypd/stats/reports-analysis/stopfrisk.page. Can you propose questions that can be answered by the data?
# The rvest & RSelenium tutorials:
# Tutorials:
# https://www.youtube.com/watch?v=rsQoEgWeJMk (Change package license to GPL-3)
# https://www.youtube.com/watch?v=EpTkT6Rkgbs (formal: watch from 30:46)
# The complete reference is here.
# You must install the following packages:
# BiocManager: this package manages all Bioconductor packages.
# EBImage: this package allows you to do image analysis.
# --- Image analysis with the EBImage package ---
library(EBImage) # Load the package first
# Read each of the four image files (expected in the working directory) and display it.
img1 = readImage("MonaLisa1.jpeg")
display(img1)
img2 = readImage("MonaLisa2.jpeg")
display(img2)
img3 = readImage("MonaLisa3.jpeg")
display(img3)
img4 = readImage("MonaLisa4.jpeg")
display(img4)
# Draw the histograms of the four images in a 2-by-2 grid of plots.
par(mfrow=c(2,2))
hist(img1)
hist(img2)
hist(img3)
hist(img4)
par(mfrow=c(1,1)) # Go back to one plot per screen
# Cut two pieces out of img4 by restricting its first index to 1:300 and to 450:725.
# NOTE(review): presumably the first index runs left to right, given the names
# "left4" and "right4", and img4 is at least 725 wide; confirm with dim(img4).
left4 = img4[1:300 , , ]
display(left4)
right4 = img4[450:725 , , ]
display(right4)
# Compare the histograms in a 2-by-3 grid; each title names the image that is plotted.
par(mfrow=c(2,3))
hist(img1, main = "img1")
hist(img2, main = "img2")
hist(img3, main = "img3")
hist(left4, main = "left4")
hist(right4, main = "right4")
par(mfrow=c(1,1)) # Go back to one plot per screen
# Create your own image: 300 x 300 x 3 random values from the normal distribution,
# treated as a color image
x = Image(rnorm(300*300*3),dim=c(300,300,3), colormode='Color')
display(x)
hist(x)
# JSON is a syntax for storing and exchanging data.
# A JSON document is a string written in JavaScript Object Notation; a string is easy to read and store.
# In R, a JSON string can be converted to an R list.
# In R, an R list can be converted to a JSON string.
# The following is a string in JSON.
# It describes two persons, "p1" and "p2", each with a name, an age and a city.
# Single quotes enclose the R string because the JSON text itself contains double quotes.
x = '{"p1":{ "name":"John", "age":30, "city":"New York"}, "p2":{ "name":"Amy", "age":44, "city":"St. Cloud"}}'
# Use the function fromJSON() to convert x to an R list.
# Writing jsonlite::fromJSON calls the function from the jsonlite package without loading the package.
L=jsonlite::fromJSON(x)
L
## $p1
## $p1$name
## [1] "John"
##
## $p1$age
## [1] 30
##
## $p1$city
## [1] "New York"
##
##
## $p2
## $p2$name
## [1] "Amy"
##
## $p2$age
## [1] 44
##
## $p2$city
## [1] "St. Cloud"
# You can convert it back with the toJSON() function.
# Note that the result is not exactly the original string: every single value
# comes back wrapped in square brackets, e.g. ["John"] instead of "John".
# NOTE(review): toJSON's auto_unbox = TRUE option is meant to avoid this; check ?jsonlite::toJSON.
jsonlite::toJSON(L)
## {"p1":{"name":["John"],"age":[30],"city":["New York"]},"p2":{"name":["Amy"],"age":[44],"city":["St. Cloud"]}}
# "pressure" is used here without being created in this script; it is printed without its row names.
print(pressure, row.names = FALSE) # Suppress row names
## temperature pressure
## 0 0.0002
## 20 0.0012
## 40 0.0060
## 60 0.0300
## 80 0.0900
## 100 0.2700
## 120 0.7500
## 140 1.8500
## 160 4.2000
## 180 8.8000
## 200 17.3000
## 220 32.1000
## 240 57.0000
## 260 96.0000
## 280 157.0000
## 300 247.0000
## 320 376.0000
## 340 558.0000
## 360 806.0000