order
sort A-Z
sort(c (3, 1, 2, 3, 5, 4, 2), decreasing = FALSE)
## [1] 1 2 2 3 3 4 5
sort Z-A
sort(c (3, 1, 2, 3, 5, 4, 2), decreasing = TRUE)
## [1] 5 4 3 3 2 2 1
rank
rank(c (3, 1, 2, 3, 5, 4, 2))
## [1] 4.5 1.0 2.5 4.5 7.0 6.0 2.5
reorder
c(3, 1, 2, 3, 5, 4, 2)[order(c (3, 1, 2, 3, 5, 4, 2))]
## [1] 1 2 2 3 3 4 5
index
order(c (3, 1, 2, 3, 5, 4, 2))
## [1] 2 3 7 1 4 6 5
operation
addition
c(0, 2, 2.4) + c(1, -2, 0.6)
## [1] 1 0 3
subtraction
c(0, 2, 2.4) - c(1, -2, 0.6)
## [1] -1.0 4.0 1.8
multiplication
c(0, 2, 2.4) * c(1, -2, 0.6)
## [1] 0.00 -4.00 1.44
division
c(0, 2, 2.4) / c(1, -2, 0.6)
## [1] 0 -1 4
vectorized operations
# Vectorized AND
c(TRUE, TRUE, FALSE, FALSE) & c(T, F, T, F)
## [1] TRUE FALSE FALSE FALSE
# Vectorized OR d
c(TRUE, TRUE, FALSE, FALSE) | c(T, F, T, F)
## [1] TRUE TRUE TRUE FALSE
# exclusive logical OR
xor(c(TRUE, TRUE, FALSE, FALSE), c(T, F, T, F))
## [1] FALSE TRUE TRUE FALSE
# Using %in% operator
x <- c(1, 2, 3, 4, 5)
y <- c(2, 4, 6)
x %in% y
## [1] FALSE TRUE FALSE TRUE FALSE
range
# range in R (mathematical)
x=c(1,2,3,4,5)
range(x)
## [1] 1 5
#[1] 1 5
# range in R (NA)
x=c(1,2,NA,4,5)
range(x,na.rm=FALSE)
## [1] NA NA
#[1] NA NA
# range in R (na.rm)
range(x,na.rm=TRUE)
## [1] 1 5
#[1] 1 5
# range in R (alphabetical)
x=c("a","b","c","d","e","f","g")
range(x)
## [1] "a" "g"
#[1] "a" "g"
sample
set.seed(seed = 10)
#random sampling without replacement
sample(x = 1:10, size = 10, replace = FALSE)
## [1] 9 7 8 6 3 2 10 5 4 1
set.seed(seed = 10)
#random sampling with replacement
sample(x = 1:10, size = 10, replace = TRUE)
## [1] 9 10 7 8 6 7 3 8 10 7
sequence
seq(from=1,to=10)
## [1] 1 2 3 4 5 6 7 8 9 10
seq(1.0,10.0)
## [1] 1 2 3 4 5 6 7 8 9 10
seq(-1,-10)
## [1] -1 -2 -3 -4 -5 -6 -7 -8 -9 -10
seq(from=1,to=10,by=2)
## [1] 1 3 5 7 9
seq(from=5,to=50,by=5)
## [1] 5 10 15 20 25 30 35 40 45 50
seq(5,50,5)
## [1] 5 10 15 20 25 30 35 40 45 50
seq.int(from=5,to=50,length.out=10)
## [1] 5 10 15 20 25 30 35 40 45 50
seq(from=-5,to=-50,length.out=10)
## [1] -5 -10 -15 -20 -25 -30 -35 -40 -45 -50
y<-c(5,10,15,20)
seq(1,25,along.with = y)
## [1] 1 9 17 25
df<-c(-1,-5,-5,-1)
seq(-5,5,along.with = df)
## [1] -5.000000 -1.666667 1.666667 5.000000
seq_len(10)
## [1] 1 2 3 4 5 6 7 8 9 10
seq.int(-5,5)
## [1] -5 -4 -3 -2 -1 0 1 2 3 4 5
string and substring
collapse
library(stringr)
str_flatten(string = c("pear", "orange", "berry", "nut"))
## [1] "pearorangeberrynut"
combination
library(stringr)
str_pad(string = c("pear", "orange", "berry", "nut"), width = 6, side = "both", pad = "Z")
## [1] "ZpearZ" "orange" "berryZ" "ZnutZZ"
concatenate
library(stringr)
cat("This line has a \t tab in it. \n Now, we have skipped a line and included ê.")
## This line has a tab in it.
## Now, we have skipped a line and included ê.
duplicate
library(stringr)
str_dup(string = c("pear", "orange", "berry", "nut"), times = c(1, 2, 3, 4))
## [1] "pear" "orangeorange" "berryberryberry" "nutnutnutnut"
print
library(stringr)
print("This is a string.")
## [1] "This is a string."
noquote("This is a string without printed quotes.")
## [1] This is a string without printed quotes.
join
library(stringr)
stringr::str_c("Number: ", c(1, 2, 3, 4, 5, 6))
## [1] "Number: 1" "Number: 2" "Number: 3" "Number: 4" "Number: 5" "Number: 6"
length
library(stringr)
str_length(string = c("pear", "orange", "berry", "nut"))
## [1] 4 6 5 3
lower and upper
library(stringr)
tolower("ChanGE cAsE 1234")
## [1] "change case 1234"
toupper("ChanGE cAsE 1234")
## [1] "CHANGE CASE 1234"
chartr(old = "EhC", new = "eHX", "ChanGE cAsE 1234")
## [1] "XHanGe cAse 1234"
chartr(old = "abcX", new = "DEFx", "abcdefghijklmnopqrstuvwXyz")
## [1] "DEFdefghijklmnopqrstuvwxyz"
split
library(stringr)
strsplit("Age , Height , Weight , Other", split = ",")
## [[1]]
## [1] "Age " " Height " " Weight " " Other"
substring
library(stringr)
substring("abcdefghijklmnopqrstuvwxyz", first = 2, last = 5)
## [1] "bcde"
trim
library(stringr)
x <- " <--Extra Spaces Here--> <--Here--> "
str_trim(string = x, side = "both")
## [1] "<--Extra Spaces Here--> <--Here-->"
str_squish(string = x)
## [1] "<--Extra Spaces Here--> <--Here-->"
truncation
library(stringr)
x <- "This string has a lot of characters in it."
str_trunc(string = x, width = 11, side = "right", ellipsis = ".....")
## [1] "This s....."
for loop & while loop
#certain
for (i in 1:10) {
print(i)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
#uncertain
i <- 1
while(i<=10) {
print(i)
i <- i + 1
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
x <- 0
i <- 0
while (x < 2) {
x <- rnorm(1,0,1)
i <- i + 1
}
print(x)
## [1] 2.137767
print(i)
## [1] 46
if-else
x <- 0
if (x < 0) {
print("Negative number")
} else if (x > 0) {
print("Positive number")
} else
print("Zero")
## [1] "Zero"
f_to_c_message <- function(F) {
C <- (F - 32) * 5 / 9
if (C < -20) {
print("so cold")
}
else if (C > 30) {
print("so hot")
}
else {
print(C)
}
}
f_to_c_message(-10)
## [1] "so cold"
f_to_c_message(50)
## [1] 10
f_to_c_message(110)
## [1] "so hot"
system time
x <- 1:10
y <- 11:20
z <- NULL
system.time(for (i in seq_along(x)) {
z <- c(z, x[i] + y[i])
})
## user system elapsed
## 0.005 0.000 0.006
z
## [1] 12 14 16 18 20 22 24 26 28 30
x <- 1:10
y <- 11:20
z <- NULL
system.time(for (i in range(x)) {
z <- c(z, x[i] + y[i])
})
## user system elapsed
## 0.004 0.000 0.005
z
## [1] 12 30
x <- 1:10
y <- 11:20
z <- vector(length=10)
system.time(for (i in 1:length(x)){
z[i] <- x[i] + y[i]
})
## user system elapsed
## 0.008 0.000 0.009
z
## [1] 12 14 16 18 20 22 24 26 28 30
x <- 1:10
y <- 11:20
z <- vector(mode = "double", length=10)
system.time(for (i in 1:length(x)){
z[i] <- x[i] + y[i]
})
## user system elapsed
## 0.005 0.000 0.005
z
## [1] 12 14 16 18 20 22 24 26 28 30
distribution
x=rnorm(50)
y=x+rnorm(50,mean=50,sd=.1)
cor(x,y)
## [1] 0.9945578
set.seed(100)
x = rnorm(100)
x
## [1] -0.50219235 0.13153117 -0.07891709 0.88678481 0.11697127 0.31863009
## [7] -0.58179068 0.71453271 -0.82525943 -0.35986213 0.08988614 0.09627446
## [13] -0.20163395 0.73984050 0.12337950 -0.02931671 -0.38885425 0.51085626
## [19] -0.91381419 2.31029682 -0.43808998 0.76406062 0.26196129 0.77340460
## [25] -0.81437912 -0.43845057 -0.72022155 0.23094453 -1.15772946 0.24707599
## [31] -0.09111356 1.75737562 -0.13792961 -0.11119350 -0.69001432 -0.22179423
## [37] 0.18290768 0.41732329 1.06540233 0.97020202 -0.10162924 1.40320349
## [43] -1.77677563 0.62286739 -0.52228335 1.32223096 -0.36344033 1.31906574
## [49] 0.04377907 -1.87865588 -0.44706218 -1.73859795 0.17886485 1.89746570
## [55] -2.27192549 0.98046414 -1.39882562 1.82487242 1.38129873 -0.83885188
## [61] -0.26199577 -0.06884403 -0.37888356 2.58195893 0.12983414 -0.71302498
## [67] 0.63799424 0.20169159 -0.06991695 -0.09248988 0.44890327 -1.06435567
## [73] -1.16241932 1.64852175 -2.06209602 0.01274972 -1.08752835 0.27053949
## [79] 1.00845187 -2.07440475 0.89682227 -0.04999577 -1.34534931 -1.93121153
## [85] 0.70958158 -0.15790503 0.21636787 0.81736208 1.72717575 -0.10377029
## [91] -0.55712229 1.42830143 -0.89295740 -1.15757124 -0.53029645 2.44568276
## [97] -0.83249580 0.41351985 -1.17868314 -1.17403476
mean
mean(x)
## [1] 0.002912563
variance
var(x)
## [1] 1.04185
standard deviation
sqrt(var(x))
## [1] 1.02071
sd(x)
## [1] 1.02071
bias
#install.packages("SimDesign")
library(SimDesign)
actual_temp <- c(68.3, 70, 72.4, 71, 67, 70)
predicted_temp <- c(67.9, 69, 71.5, 70, 67, 69)
bias(actual_temp,predicted_temp)
## [1] 0.7166667
actual_temp <- c(150, 203, 137, 247, 116, 287)
predicted_temp <- c(200, 300, 150, 250, 150, 300)
bias(actual_temp,predicted_temp)
## [1] -35
graphics
x=rnorm(100)
y=rnorm(100)
plot(x,y)

plot(x,y,xlab="this is the x-axis",ylab="this is the y-axis",
main="Plot of X vs Y")

dual axis
library(latticeExtra)
## Loading required package: lattice
# create data
set.seed(1)
x <- 1:100
var1 <- cumsum(rnorm(100))
var2 <- var1^2
data <- data.frame(x,var1,var2)
# usual line chart
xyplot(var1 + var2 ~ x, data, type = "l", col=c("steelblue", "#69b3a2") , lwd=2)

#library
library(latticeExtra)
# create data
set.seed(1)
x <- 1:100
var1 <- cumsum(rnorm(100))
var2 <- var1^2
data <- data.frame(x,var1,var2)
# --> construct separate plots for each series
obj1 <- xyplot(var1 ~ x, data, type = "l" , lwd=2, col="steelblue")
obj2 <- xyplot(var2 ~ x, data, type = "l", lwd=2, col="#69b3a2")
# --> Make the plot with second y axis:
doubleYScale(obj1, obj2, add.ylab2 = TRUE, use.style=FALSE )

#library
library(latticeExtra)
# create data
set.seed(1)
x <- 1:100
var1 <- cumsum(rnorm(100))
var2 <- var1^2
data <- data.frame(x,var1,var2)
# --> construct separate plots for each series
obj1 <- xyplot(var1 ~ x, data, type = "l" , lwd=2)
obj2 <- xyplot(var2 ~ x, data, type = "l", lwd=2)
# --> Make the plot with second y axis AND legend:
doubleYScale(obj1, obj2, text = c("obj1", "obj2") , add.ylab2 = TRUE)

#library
library(latticeExtra)
# create data
set.seed(1)
x <- 1:100
var1 <- cumsum(rnorm(100))
var2 <- var1^2
data <- data.frame(x,var1,var2)
# --> construct separate plots for each series
obj1 <- xyplot(var1 ~ x, data, type = "l" , lwd=2)
obj2 <- xyplot(var2 ~ x, data, type = "l", lwd=2)
# --> Make the plot with second y axis AND legend:
doubleYScale(obj1, obj2, text = c("obj1", "obj2") , add.ylab2 = TRUE)

filter
library(tibble)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Create a sample dataset using the `tibble` function from the `tibble` package.
my_data <- tibble(
Name = c("Alice", "Bob", "Carol", "Dave"),
Age = c(25, 30, 35, 40),
Gender = c("Female", "Male", "Female", "Male")
)
my_data
## # A tibble: 4 × 3
## Name Age Gender
## <chr> <dbl> <chr>
## 1 Alice 25 Female
## 2 Bob 30 Male
## 3 Carol 35 Female
## 4 Dave 40 Male
#Create a filter to the dataset using the `filter` function from `dplyr`.
my_filtered_data <- my_data %>%
filter(Age > 30)
my_filtered_data
## # A tibble: 2 × 3
## Name Age Gender
## <chr> <dbl> <chr>
## 1 Carol 35 Female
## 2 Dave 40 Male
#Create a nested filter function by applying two `filter` functions sequentially using `%>%`.
my_nested_filtered_data <- my_data %>%
filter(Gender == "Female") %>%
filter(Age > 30)
my_nested_filtered_data
## # A tibble: 1 × 3
## Name Age Gender
## <chr> <dbl> <chr>
## 1 Carol 35 Female
time
#define two datetimes
first <- "2022-08-20 08:15:22"
second <- "2022-01-01 20:04:48"
#calculate time difference in days
difftime(first, second)
## Time difference of 230.4657 days
#Time difference of 230.5073 days
#calculate time difference in seconds
difftime(first, second, units="secs")
## Time difference of 19912234 secs
#Time difference of 19915834 secs
#calculate time difference in minutes
difftime(first, second, units="mins")
## Time difference of 331870.6 mins
#Time difference of 331930.6 mins
#calculate time difference in hours
difftime(first, second, units="hours")
## Time difference of 5531.176 hours
#Time difference of 5532.176 hours
#calculate time difference in weeks
difftime(first, second, units="weeks")
## Time difference of 32.92367 weeks
#Time difference of 32.92962 weeks
library(hms)
#define two datetimes
first <- "2022-01-01 20:15:22"
second <- "2022-01-01 08:04:48"
#calculate difference between datetimes in hours, minutes, seconds
as_hms(difftime(first, second))
## 12:10:34
Graph
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(igraphdata)
data(UKfaculty)
plot(UKfaculty)

Clustering
clustering <- cluster_leiden(as.undirected(UKfaculty))
plot(clustering, UKfaculty, edge.arrow.size = 0.5, cex = 0.5)

clustering <- cluster_leiden(as.undirected(UKfaculty), objective_function = "modularity")
plot(clustering, UKfaculty, edge.arrow.size = 0.5, cex = 0.5)

clustering <- cluster_leiden(as.undirected(UKfaculty), objective_function = "modularity", resolution_parameter = 0.8)
plot(clustering, UKfaculty, edge.arrow.size = 0.5, cex = 0.5)

Data Manipulation and Data Processing
create table
# Example data
gender <- c("Male", "Female", "Male", "Female", "Male", "Male")
occupation <- c("Engineer", "Teacher", "Engineer", "Doctor", "Teacher", "Doctor")
# Create a contingency table
my_table <- table(gender, occupation)
# Display the table
print(my_table)
## occupation
## gender Doctor Engineer Teacher
## Female 1 0 1
## Male 1 2 1
create randomly-generated table
# Set the seed for reproducibility
set.seed(123)
# Generate random gender and occupation data
n <- 6 # Number of data points
genders <- sample(c("Male", "Female"), n, replace = TRUE)
occupations <- sample(c("Engineer", "Teacher", "Doctor"), n, replace = TRUE)
# Create a contingency table
my_table <- table(genders, occupations)
# Display the table
print(my_table)
## occupations
## genders Doctor Engineer Teacher
## Female 0 1 1
## Male 1 0 3
create data frame
# Example data
name <- c("John", "Jane", "Michael", "Emma")
age <- c(25, 30, 22, 28)
city <- c("New York", "Los Angeles", "Chicago", "San Francisco")
# Create a data frame
my_data_frame <- data.frame(Name = name, Age = age, City = city)
# Display the table
print(my_data_frame)
## Name Age City
## 1 John 25 New York
## 2 Jane 30 Los Angeles
## 3 Michael 22 Chicago
## 4 Emma 28 San Francisco
create randomly-generated data frame
# Set the seed for reproducibility
set.seed(123)
# Create a 5x5 data frame
data <- data.frame(
Letters = c(sample(letters, 3, replace = TRUE), rep(sample(letters, 1), 2)),
Numbers1 = round(runif(5, min = 0, max = 10), 1),
Numbers2 = c(1.1, 2.2, 3.3, 4.4, 5.5), # Non-random values
Numbers3 = c(0.5, 1.0, 1.5, 2.0, 2.5), # Non-random values
Numbers4 = c(0.1, 0.2, 0.3, 0.4, 0.5) # Non-random values
)
# Print the data frame
print(data)
## Letters Numbers1 Numbers2 Numbers3 Numbers4
## 1 o 0.5 1.1 0.5 0.1
## 2 s 5.3 2.2 1.0 0.2
## 3 n 8.9 3.3 1.5 0.3
## 4 c 5.5 4.4 2.0 0.4
## 5 c 4.6 5.5 2.5 0.5
countif
# Count the occurrences of each letter
letter_counts <- table(data$Letters)
# Print the letter counts
print(letter_counts)
##
## c n o s
## 2 1 1 1
# Count the occurrences of each letter
letter_counts <- table(data$Letters)
# Create a data frame to display the counts
letter_counts_df <- data.frame(Letter = names(letter_counts), Count = as.vector(letter_counts))
# Print the data frame with counts
print(letter_counts_df)
## Letter Count
## 1 c 2
## 2 n 1
## 3 o 1
## 4 s 1
sumif
# Load the dplyr package
library(dplyr)
# Assuming you have already created the 'data' data frame as shown in your code
# Calculate the sum for each letter
result <- data %>%
group_by(Letters) %>%
summarize(
Sum_Numbers1 = sum(Numbers1),
Sum_Numbers2 = sum(Numbers2),
Sum_Numbers3 = sum(Numbers3),
Sum_Numbers4 = sum(Numbers4)
)
# Print the result
print(result)
## # A tibble: 4 × 5
## Letters Sum_Numbers1 Sum_Numbers2 Sum_Numbers3 Sum_Numbers4
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 c 10.1 9.9 4.5 0.9
## 2 n 8.9 3.3 1.5 0.3
## 3 o 0.5 1.1 0.5 0.1
## 4 s 5.3 2.2 1 0.2
add row
# Example data frame
name <- c("John", "Jane", "Michael", "Emma")
age <- c(25, 30, 22, 28)
city <- c("New York", "Los Angeles", "Chicago", "San Francisco")
my_table <- data.frame(Name = name, Age = age, City = city)
# New row data
new_row <- data.frame(Name = "Jake", Age = 35, City = "Boston")
# Add the new row to the data frame
row_added_table <- rbind(my_table, new_row)
# Display the updated table
print(row_added_table)
## Name Age City
## 1 John 25 New York
## 2 Jane 30 Los Angeles
## 3 Michael 22 Chicago
## 4 Emma 28 San Francisco
## 5 Jake 35 Boston
delete row
# Remove the second row
row_deleted_table <- my_table[-2, ]
# Display the updated table
print(row_deleted_table)
## Name Age City
## 1 John 25 New York
## 3 Michael 22 Chicago
## 4 Emma 28 San Francisco
add column
# Example data frame
name <- c("John", "Jane", "Michael", "Emma")
age <- c(25, 30, 22, 28)
city <- c("New York", "Los Angeles", "Chicago", "San Francisco")
column_added_table <- data.frame(Name = name, Age = age, City = city)
# New column data
new_column <- c("USA", "USA", "USA", "USA")
# Add the new column to the data frame
column_added_table$Country <- new_column
# Display the updated table
print(column_added_table)
## Name Age City Country
## 1 John 25 New York USA
## 2 Jane 30 Los Angeles USA
## 3 Michael 22 Chicago USA
## 4 Emma 28 San Francisco USA
delete column
# Remove the 'City' column
column_deleted_table <- my_table[, -which(names(my_table) == "City")]
# Display the updated table
print(column_deleted_table)
## Name Age
## 1 John 25
## 2 Jane 30
## 3 Michael 22
## 4 Emma 28
left join
# left join
# Sample data frames
df1 <- data.frame(ID = c(1, 2, 3, 4, 5),
Name = c("Alice", "Bob", "Charlie", "David", "Eve"))
df2 <- data.frame(ID = c(1, 3, 5, 6),
Age = c(25, 30, 27, 22))
# Perform a left join
result <- merge(df1, df2, by = "ID", all.x = TRUE)
print(df1)
## ID Name
## 1 1 Alice
## 2 2 Bob
## 3 3 Charlie
## 4 4 David
## 5 5 Eve
print(df2)
## ID Age
## 1 1 25
## 2 3 30
## 3 5 27
## 4 6 22
print(result)
## ID Name Age
## 1 1 Alice 25
## 2 2 Bob NA
## 3 3 Charlie 30
## 4 4 David NA
## 5 5 Eve 27
right join
# right join
# Sample data frames
df1 <- data.frame(ID = c(1, 2, 3, 4, 5),
Name = c("Alice", "Bob", "Charlie", "David", "Eva"))
df2 <- data.frame(ID = c(3, 4, 5, 6, 7),
Age = c(25, 30, 28, 22, 27))
# Perform right join on 'ID' column
result <- merge(x = df1, y = df2, by = "ID", all.x = FALSE, all.y = TRUE)
print(result)
## ID Name Age
## 1 3 Charlie 25
## 2 4 David 30
## 3 5 Eva 28
## 4 6 <NA> 22
## 5 7 <NA> 27
inner join
# inner join
# Sample data frame 1
df1 <- data.frame(ID = c(1, 2, 3, 4, 5),
Name = c("Alice", "Bob", "Charlie", "David", "Eva"))
# Sample data frame 2
df2 <- data.frame(ID = c(2, 4, 5, 6, 7),
Age = c(25, 30, 28, 22, 27))
# Inner join based on 'ID'
inner_joined <- merge(df1, df2, by = "ID")
print(inner_joined)
## ID Name Age
## 1 2 Bob 25
## 2 4 David 30
## 3 5 Eva 28
outer join
# outer join
# Sample data frames
df1 <- data.frame(ID = c(1, 2, 3, 4),
Name = c("Alice", "Bob", "Charlie", "David"))
df2 <- data.frame(ID = c(2, 3, 5),
Age = c(25, 30, 22))
# Perform an outer join on 'ID' column
merged_df <- merge(df1, df2, by = "ID", all = TRUE)
print(merged_df)
## ID Name Age
## 1 1 Alice NA
## 2 2 Bob 25
## 3 3 Charlie 30
## 4 4 David NA
## 5 5 <NA> 22
semi join
# semi join
# Load the dplyr package
library(dplyr)
# Example data frame 1
df1 <- data.frame(
ID = c(1, 2, 3, 4, 5),
Name = c("Alice", "Bob", "Charlie", "David", "Eve")
)
# Example data frame 2
df2 <- data.frame(
ID = c(2, 4, 5),
Age = c(25, 30, 27)
)
# Perform the semi-join based on the "ID" column
left_semi_join_result <- semi_join(df1, df2, by = "ID")
right_semi_join_result <- semi_join(df2, df1, by = "ID")
# Print the result
print(left_semi_join_result)
## ID Name
## 1 2 Bob
## 2 4 David
## 3 5 Eve
print(right_semi_join_result)
## ID Age
## 1 2 25
## 2 4 30
## 3 5 27
anti join
# anti join
# Load the dplyr package
library(dplyr)
# Sample data: two data frames
left_df <- data.frame(ID = c(1, 2, 3, 4, 5),
Value = c("A", "B", "C", "D", "E"))
right_df <- data.frame(ID = c(3, 4),
Value = c("C", "D"))
# Performing the anti-join
left_anti_join_result <- anti_join(left_df, right_df, by = "ID")
right_anti_join_result <- anti_join(left_df, right_df, by = "ID")
# Output the result
print(left_anti_join_result)
## ID Value
## 1 1 A
## 2 2 B
## 3 5 E
print(right_anti_join_result)
## ID Value
## 1 1 A
## 2 2 B
## 3 5 E
Machine Learning
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(ISLR)
names(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
Simple Linear Regression
lm.fit=lm(medv~lstat,data=Boston)
lm.fit
##
## Call:
## lm(formula = medv ~ lstat, data = Boston)
##
## Coefficients:
## (Intercept) lstat
## 34.55 -0.95
names(lm.fit)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
coef(lm.fit)
## (Intercept) lstat
## 34.5538409 -0.9500494
confint(lm.fit)
## 2.5 % 97.5 %
## (Intercept) 33.448457 35.6592247
## lstat -1.026148 -0.8739505
predict(lm.fit,data.frame(lstat=(c(5,10 ,15))),interval ="confidence")
## fit lwr upr
## 1 29.80359 29.00741 30.59978
## 2 25.05335 24.47413 25.63256
## 3 20.30310 19.73159 20.87461
predict(lm.fit,data.frame(lstat=(c(5,10 ,15))),interval ="prediction")
## fit lwr upr
## 1 29.80359 17.565675 42.04151
## 2 25.05335 12.827626 37.27907
## 3 20.30310 8.077742 32.52846
Multiple Linear Regression
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
lm.fit1=lm(medv~lstat+age,data=Boston)
summary(lm.fit1)
##
## Call:
## lm(formula = medv ~ lstat + age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.981 -3.978 -1.283 1.968 23.158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.22276 0.73085 45.458 < 2e-16 ***
## lstat -1.03207 0.04819 -21.416 < 2e-16 ***
## age 0.03454 0.01223 2.826 0.00491 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.173 on 503 degrees of freedom
## Multiple R-squared: 0.5513, Adjusted R-squared: 0.5495
## F-statistic: 309 on 2 and 503 DF, p-value: < 2.2e-16
names(lm.fit1)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
coef(lm.fit1)
## (Intercept) lstat age
## 33.22276053 -1.03206856 0.03454434
confint(lm.fit1)
## 2.5 % 97.5 %
## (Intercept) 31.78687150 34.65864956
## lstat -1.12674848 -0.93738865
## age 0.01052507 0.05856361
vif(lm.fit1)
## lstat age
## 1.569395 1.569395
lm.fit2=lm(medv~.,data=Boston)
summary(lm.fit2)
##
## Call:
## lm(formula = medv ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.595 -2.730 -0.518 1.777 26.199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 ***
## crim -1.080e-01 3.286e-02 -3.287 0.001087 **
## zn 4.642e-02 1.373e-02 3.382 0.000778 ***
## indus 2.056e-02 6.150e-02 0.334 0.738288
## chas 2.687e+00 8.616e-01 3.118 0.001925 **
## nox -1.777e+01 3.820e+00 -4.651 4.25e-06 ***
## rm 3.810e+00 4.179e-01 9.116 < 2e-16 ***
## age 6.922e-04 1.321e-02 0.052 0.958229
## dis -1.476e+00 1.995e-01 -7.398 6.01e-13 ***
## rad 3.060e-01 6.635e-02 4.613 5.07e-06 ***
## tax -1.233e-02 3.760e-03 -3.280 0.001112 **
## ptratio -9.527e-01 1.308e-01 -7.283 1.31e-12 ***
## black 9.312e-03 2.686e-03 3.467 0.000573 ***
## lstat -5.248e-01 5.072e-02 -10.347 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7338
## F-statistic: 108.1 on 13 and 492 DF, p-value: < 2.2e-16
names(lm.fit2)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
coef(lm.fit2)
## (Intercept) crim zn indus chas
## 3.645949e+01 -1.080114e-01 4.642046e-02 2.055863e-02 2.686734e+00
## nox rm age dis rad
## -1.776661e+01 3.809865e+00 6.922246e-04 -1.475567e+00 3.060495e-01
## tax ptratio black lstat
## -1.233459e-02 -9.527472e-01 9.311683e-03 -5.247584e-01
confint(lm.fit2)
## 2.5 % 97.5 %
## (Intercept) 26.432226009 46.486750761
## crim -0.172584412 -0.043438304
## zn 0.019448778 0.073392139
## indus -0.100267941 0.141385193
## chas 0.993904193 4.379563446
## nox -25.271633564 -10.261588893
## rm 2.988726773 4.631003640
## age -0.025262320 0.026646769
## dis -1.867454981 -1.083678710
## rad 0.175692169 0.436406789
## tax -0.019723286 -0.004945902
## ptratio -1.209795296 -0.695699168
## black 0.004034306 0.014589060
## lstat -0.624403622 -0.425113133
vif(lm.fit2)
## crim zn indus chas nox rm age dis
## 1.792192 2.298758 3.991596 1.073995 4.393720 1.933744 3.100826 3.955945
## rad tax ptratio black lstat
## 7.484496 9.008554 1.799084 1.348521 2.941491
lm.fit3=lm(medv~.-age,data=Boston)
summary(lm.fit3)
##
## Call:
## lm(formula = medv ~ . - age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.6054 -2.7313 -0.5188 1.7601 26.2243
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.436927 5.080119 7.172 2.72e-12 ***
## crim -0.108006 0.032832 -3.290 0.001075 **
## zn 0.046334 0.013613 3.404 0.000719 ***
## indus 0.020562 0.061433 0.335 0.737989
## chas 2.689026 0.859598 3.128 0.001863 **
## nox -17.713540 3.679308 -4.814 1.97e-06 ***
## rm 3.814394 0.408480 9.338 < 2e-16 ***
## dis -1.478612 0.190611 -7.757 5.03e-14 ***
## rad 0.305786 0.066089 4.627 4.75e-06 ***
## tax -0.012329 0.003755 -3.283 0.001099 **
## ptratio -0.952211 0.130294 -7.308 1.10e-12 ***
## black 0.009321 0.002678 3.481 0.000544 ***
## lstat -0.523852 0.047625 -10.999 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.74 on 493 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7343
## F-statistic: 117.3 on 12 and 493 DF, p-value: < 2.2e-16
names(lm.fit3)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
coef(lm.fit3)
## (Intercept) crim zn indus chas
## 36.436926648 -0.108005604 0.046333661 0.020562177 2.689026199
## nox rm dis rad tax
## -17.713539860 3.814393564 -1.478611555 0.305785940 -0.012328692
## ptratio black lstat
## -0.952211173 0.009320653 -0.523851840
confint(lm.fit3)
## 2.5 % 97.5 %
## (Intercept) 26.45557192 46.418281372
## crim -0.17251263 -0.043498576
## zn 0.01958627 0.073081052
## indus -0.10014145 0.141265800
## chas 1.00009928 4.377953122
## nox -24.94259876 -10.484480962
## rm 3.01181752 4.616969603
## dis -1.85312219 -1.104100922
## rad 0.17593586 0.435636023
## tax -0.01970656 -0.004950825
## ptratio -1.20821163 -0.696210713
## black 0.00405936 0.014581947
## lstat -0.61742541 -0.430278274
vif(lm.fit3)
## crim zn indus chas nox rm dis rad
## 1.792172 2.265290 3.991592 1.071227 4.084846 1.851068 3.620246 7.441492
## tax ptratio black lstat
## 9.000474 1.788084 1.343044 2.599229
Interaction Terms
summary(lm(medv~lstat*age,data=Boston))
##
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.806 -4.045 -1.333 2.085 27.552
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.0885359 1.4698355 24.553 < 2e-16 ***
## lstat -1.3921168 0.1674555 -8.313 8.78e-16 ***
## age -0.0007209 0.0198792 -0.036 0.9711
## lstat:age 0.0041560 0.0018518 2.244 0.0252 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared: 0.5557, Adjusted R-squared: 0.5531
## F-statistic: 209.3 on 3 and 502 DF, p-value: < 2.2e-16
Non-Linear Terms
lm.fit4=lm(medv~lstat+I(lstat^2),data=Boston)
summary(lm.fit4)
##
## Call:
## lm(formula = medv ~ lstat + I(lstat^2), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.2834 -3.8313 -0.5295 2.3095 25.4148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.862007 0.872084 49.15 <2e-16 ***
## lstat -2.332821 0.123803 -18.84 <2e-16 ***
## I(lstat^2) 0.043547 0.003745 11.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.524 on 503 degrees of freedom
## Multiple R-squared: 0.6407, Adjusted R-squared: 0.6393
## F-statistic: 448.5 on 2 and 503 DF, p-value: < 2.2e-16
lm.fit5=lm(Boston$medv~poly(Boston$lstat,5))
summary(lm.fit5)
##
## Call:
## lm(formula = Boston$medv ~ poly(Boston$lstat, 5))
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5433 -3.1039 -0.7052 2.0844 27.1153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.5328 0.2318 97.197 < 2e-16 ***
## poly(Boston$lstat, 5)1 -152.4595 5.2148 -29.236 < 2e-16 ***
## poly(Boston$lstat, 5)2 64.2272 5.2148 12.316 < 2e-16 ***
## poly(Boston$lstat, 5)3 -27.0511 5.2148 -5.187 3.10e-07 ***
## poly(Boston$lstat, 5)4 25.4517 5.2148 4.881 1.42e-06 ***
## poly(Boston$lstat, 5)5 -19.2524 5.2148 -3.692 0.000247 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.215 on 500 degrees of freedom
## Multiple R-squared: 0.6817, Adjusted R-squared: 0.6785
## F-statistic: 214.2 on 5 and 500 DF, p-value: < 2.2e-16
summary(lm(medv~log(rm),data=Boston))
##
## Call:
## lm(formula = medv ~ log(rm), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.487 -2.875 -0.104 2.837 39.816
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -76.488 5.028 -15.21 <2e-16 ***
## log(rm) 54.055 2.739 19.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.915 on 504 degrees of freedom
## Multiple R-squared: 0.4358, Adjusted R-squared: 0.4347
## F-statistic: 389.3 on 1 and 504 DF, p-value: < 2.2e-16
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Smarket,family=binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Smarket)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000 0.240736 -0.523 0.601
## Lag1 -0.073074 0.050167 -1.457 0.145
## Lag2 -0.042301 0.050086 -0.845 0.398
## Lag3 0.011085 0.049939 0.222 0.824
## Lag4 0.009359 0.049974 0.187 0.851
## Lag5 0.010313 0.049511 0.208 0.835
## Volume 0.135441 0.158360 0.855 0.392
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.2 on 1249 degrees of freedom
## Residual deviance: 1727.6 on 1243 degrees of freedom
## AIC: 1741.6
##
## Number of Fisher Scoring iterations: 3
coef(glm.fit)
## (Intercept) Lag1 Lag2 Lag3 Lag4 Lag5
## -0.126000257 -0.073073746 -0.042301344 0.011085108 0.009358938 0.010313068
## Volume
## 0.135440659
summary(glm.fit)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000257 0.24073574 -0.5233966 0.6006983
## Lag1 -0.073073746 0.05016739 -1.4565986 0.1452272
## Lag2 -0.042301344 0.05008605 -0.8445733 0.3983491
## Lag3 0.011085108 0.04993854 0.2219750 0.8243333
## Lag4 0.009358938 0.04997413 0.1872757 0.8514445
## Lag5 0.010313068 0.04951146 0.2082966 0.8349974
## Volume 0.135440659 0.15835970 0.8552723 0.3924004
Geospatial Analysis
library(leaflet)
leaflet() %>%
addTiles() %>%
setView(lng = c(-74.0060), lat = c(40.7128), zoom = 5)
leaflet(options = leafletOptions(zoomControl = FALSE)) %>%
addProviderTiles(providers$CartoDB.DarkMatter) %>%
setView(lng = -74.0060, lat = 40.7128, zoom = 13)
leaflet(options = leafletOptions(zoomControl = FALSE)) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
setView(lng = -74.0060, lat = 40.7128, zoom = 13)
library(ggmap)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:latticeExtra':
##
## layer
## The legacy packages maptools, rgdal, and rgeos, underpinning this package
## will retire shortly. Please refer to R-spatial evolution reports on
## https://r-spatial.org/r/2023/05/15/evolution4.html for details.
## This package is now running under evolution status 0
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
usa <- map_data('usa')
ggplot(data=usa, aes(x=long, y=lat, group=group)) +
geom_polygon(fill='lightblue') +
theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(),
axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
ggtitle('U.S. Map') +
coord_fixed(1.3)

#install.packages("maps")
library(maps)
maps::map("state", interior=FALSE)
maps::map("state", boundary=FALSE, col="gray", add=TRUE)

state <- map_data("state")
ggplot(data=state, aes(x=long, y=lat, fill=region, group=group)) +
geom_polygon(color = "white") +
guides(fill=FALSE) +
theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(),
axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
ggtitle('U.S. Map with States') +
coord_fixed(1.3)

new_york <- subset(state, region=="new york")
counties <- map_data("county")
new_york_county <- subset(counties, region=="new york")
ny_map <- ggplot(data=new_york, mapping=aes(x=long, y=lat, group=group)) +
coord_fixed(1.3) +
geom_polygon(color="black", fill="gray") +
geom_polygon(data=new_york_county, fill=NA, color="white") +
geom_polygon(color="black", fill=NA) +
ggtitle('New York State Counties') +
theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(),
axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank())
ny_map

points = sf::st_read('~/Downloads/Collisions/Collisions.shp')
## Reading layer `Collisions' from data source
## `/Users/sunlei/Downloads/Collisions/Collisions.shp' using driver `ESRI Shapefile'
## replacing null geometries with empty geometries
## Simple feature collection with 236811 features and 38 fields (with 7705 geometries empty)
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -122.4253 ymin: 47.49557 xmax: -122.2389 ymax: 47.73414
## Geodetic CRS: WGS 84
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
setView(lng = -122.330412, lat = 47.609056, zoom = 14) %>%
addCircleMarkers(data = points, fillColor = 'red', fillOpacity = 0.6, stroke = FALSE,
radius = 4, clusterOptions = markerClusterOptions())
x = sf::st_read('~/Downloads/2016_Traffic_Flow_Counts/2016_Traffic_Flow_Counts.shp')
## Reading layer `2016_Traffic_Flow_Counts' from data source
## `/Users/sunlei/Downloads/2016_Traffic_Flow_Counts/2016_Traffic_Flow_Counts.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 1865 features and 14 fields
## Geometry type: MULTILINESTRING
## Dimension: XY
## Bounding box: xmin: -122.4134 ymin: 47.49599 xmax: -122.2372 ymax: 47.73414
## Geodetic CRS: WGS 84
my_breaks = function(x) { cut(x, breaks = c(0, 1e4, 2.5e4, 5e4, 7.5e4, 1e5, 1.25e5),
labels = c(1, 2, 4, 8, 12, 16)) }
leaflet() %>%
addProviderTiles(providers$CartoDB.DarkMatter) %>%
setView(lng = -122.330412, lat = 47.609056, zoom = 11) %>%
addPolylines(data = x, weight = ~my_breaks(AWDT_ROUND), color = 'orange', opacity = 1)