Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
summary(cars)
speed dist
Min. : 4.0 Min. : 2.00
1st Qu.:12.0 1st Qu.: 26.00
Median :15.0 Median : 36.00
Mean :15.4 Mean : 42.98
3rd Qu.:19.0 3rd Qu.: 56.00
Max. :25.0 Max. :120.00
x = cars
##Variables
# Numeric data type (floating-point number)
num_variable <- 42.5
# Integer data type
int_variable <- 10L
# Character data type (text)
char_variable <- "Hello, R!"
# Logical data type (TRUE or FALSE)
logical_variable <- TRUE
# Vector data type (1-dimensional array)
numeric_vector <- c(1, 2, 3, 4, 5)
char_vector <- c("apple", "banana", "orange", 'red', 'blue')
logical_vector <- c(TRUE, FALSE, TRUE)
# Matrix data type (2-dimensional array)
matrix_data <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 2, ncol = 3)
# List data type (collection of different data types)
list_data <- list(name = "John", age = 30, married = TRUE)
print(list_data)
$name
[1] "John"
$age
[1] 30
$married
[1] TRUE
list_data$age <- 35
print(list_data)
$name
[1] "John"
$age
[1] 35
$married
[1] TRUE
print(list_data)
$name
[1] "John"
$age
[1] 35
$married
[1] TRUE
# Addition
result_add <- 10 + 5
# Subtraction
result_sub <- 20 - 8
# Multiplication
result_mul <- 6 * 7
# Division
result_div <- 15 / 3
# Modulo (remainder after division)
result_mod <- 17 %% 5
# Exponentiation (raising to a power)
result_exp <- 2^3
# Combined Arithmetic Operations
result_combined <- (5 + 3) * 2
# Using paste()
sequence1 <- "ATCG"
sequence2 <- "GCTA"
combined_sequence <- paste(sequence1, sequence2, sep = "")
print(combined_sequence) # "ATCGGCTA"
[1] "ATCGGCTA"
# Using substr()
gene_sequence <- "ATCGATCGATCG"
gene_segment <- substr(gene_sequence, start = 1, stop = 4)
print(gene_segment) # "ATCG"
[1] "ATCG"
# Using nchar()
dna_sequence <- "ATCGATCGATCG"
sequence_length <- nchar(dna_sequence)
print(sequence_length) # 12
[1] 12
# To Uppercase
uppercase_dna <- toupper("atcg")
print(uppercase_dna) # "ATCG"
[1] "ATCG"
# Using grepl()
gene_sequence <- "ATCGATCGATCG"
motif <- "ATC"
matches <- grepl(motif, gene_sequence)
print(matches) # TRUE
[1] TRUE
# Using strsplit()
genes <- "GeneX,GeneY,GeneZ"
split_genes <- strsplit(genes, ",")[[1]]
print(split_genes)
[1] "GeneX" "GeneY" "GeneZ"
# ["GeneX", "GeneY", "GeneZ"]
library(stringi)
dna_sequence <- "ATCGATCG"
reversed_sequence <- stri_reverse(dna_sequence)
print(reversed_sequence) # "GCTAGCTA"
[1] "GCTAGCTA"
# DNA sequence
dna_sequence <- "ATCGATCGATCGATCGATCG"
# Substring to find frequency
substring_to_find <- "ATC"
# Using gregexpr to find positions of the substring
matches <- gregexpr(substring_to_find, dna_sequence)
match_positions = unlist(matches)
print(match_positions)
[1] 1 5 9 13 17
# Counting the number of matches
frequency <- sum(unlist(matches) != -1)
# Print the frequency
print(frequency)
[1] 5
# Create a numeric vector named ages
ages <- c(25, 30, 22, 28, 35)
print(ages)
[1] 25 30 22 28 35
print(ages[1])
[1] 25
print(ages[2])
[1] 30
# Append the age 40 to the ages vector
ages <- c(ages, 40)
print(ages)
[1] 25 30 22 28 35 40
print(ages[1:3])
[1] 25 30 22
print(ages[2:4])
[1] 30 22 28
print(ages[2:8])
[1] 30 22 28 35 40 NA NA
# Insert the age 45 at the third position in the ages vector
ages <- c(ages[1:2], 45, ages[3:length(ages)])
print(ages)
[1] 25 30 45 22 28 35 40
# Remove the second element from the ages vector
ages <- ages[-2]
print(ages)
[1] 25 45 22 28 35 40
# Create a new vector named young_ages by slicing the ages vector
young_ages <- ages[ages < 30]
print(young_ages)
[1] 25 22 28
# Create a new vector with age group 25-35
age_25_35 <- ages[ages <= 35]
age_25_35 <- age_25_35[age_25_35 >= 25]
print(age_25_35)
[1] 25 28 35
print(sort(ages))
[1] 22 25 28 35 40 45
# Create a numeric vector named scores
scores <- c(85, 92, 78, 89, 80, 95)
print(scores)
[1] 85 92 78 89 80 95
# Create a new vector adjusted_scores by adding 5 to each score
adjusted_scores <- scores + 5
print(adjusted_scores)
[1] 90 97 83 94 85 100
# Create a logical vector pass
pass <- scores >= 80
print(pass)
[1] TRUE TRUE FALSE TRUE TRUE TRUE
# Calculate and print the length of the ages vector
length_ages <- length(ages)
print(length_ages)
[1] 6
# Calculate and print the sum of the elements in the ages vector
sum_ages <- sum(scores)
mean_scores <- mean(scores)
median_scores <- median(scores)
#sum
print(sum_ages)
[1] 519
# Mean
print(mean_scores)
[1] 86.5
# Median
print(median_scores)
[1] 87
# Min
print(min(scores))
[1] 78
# Max
print(max(scores))
[1] 95
#correlation
print(cor(scores, ages))
[1] 0.679584
# standard deviation
print(sd(scores))
[1] 6.715653
# variance
print(var(scores))
[1] 45.1
# Quantiles
print(quantile(scores))
0% 25% 50% 75% 100%
78.00 81.25 87.00 91.25 95.00
# Vector Addition Example
vec1 <- c(1, 2, 3, 4, 5)
vec2 <- c(5, 4, 3, 2, 1)
result_addition <- vec1 + vec2
print("Vector Addition:")
[1] "Vector Addition:"
print(result_addition)
[1] 6 6 6 6 6
result_subtraction <- vec1 - vec2
print("Vector Subtraction:")
[1] "Vector Subtraction:"
print(result_subtraction)
[1] -4 -2 0 2 4
# 1. Create a list
my_list <- list(
name = "John",
age = 25,
city = "New York",
is_student = TRUE
)
# Display the original list
print("Original List:")
[1] "Original List:"
print(my_list)
$name
[1] "John"
$age
[1] 25
$city
[1] "New York"
$is_student
[1] TRUE
# 2. Delete an item
my_list$age <- NULL
# Display the modified list
print(my_list)
$name
[1] "John"
$city
[1] "New York"
$is_student
[1] TRUE
# 3. Add an item
my_list$occupation <- "Engineer"
# Display the list after adding 'occupation'
print("List after adding 'occupation':")
[1] "List after adding 'occupation':"
print(my_list)
$name
[1] "John"
$city
[1] "New York"
$is_student
[1] TRUE
$occupation
[1] "Engineer"
# 4. Change the value of an item
my_list$name <- "Jane"
# Display the list after changing 'name'
print("List after changing 'name':")
[1] "List after changing 'name':"
print(my_list)
$name
[1] "Jane"
$city
[1] "New York"
$is_student
[1] TRUE
$occupation
[1] "Engineer"
# 5. Length of list
list_length <- length(my_list)
print(paste("Length of the list:", list_length))
[1] "Length of the list: 4"
# 6. Merge 2 lists
another_list <- list(country = "USA", language = "English")
merged_list <- c(my_list, another_list)
# Display the merged list
print("Merged List:")
[1] "Merged List:"
print(merged_list)
$name
[1] "Jane"
$city
[1] "New York"
$is_student
[1] TRUE
$occupation
[1] "Engineer"
$country
[1] "USA"
$language
[1] "English"
# Given data frame
data <- data.frame(
ID = c(1, 2, 3, 4, 5),
Name = c("Alice", "Bob", "Charlie", "David", "Eva"),
Age = c(25, 30, 22, 28, 35),
Score = c(85, 92, 78, 89, 81)
)
print(data)
NA
# Create a scatter plot of Age vs. Score
plot(data$Age, data$Score, xlab = "Age", ylab = "Score", main = "Age vs. Score")
# Create a line plot of ID vs. Score
plot(data$ID, data$Score, type = "l", xlab = "ID", ylab = "Score", main = "ID vs. Score")
# Create a histogram of Age distribution
hist(data$Age, xlab = "Age", ylab = "Frequency", main = "Age Distribution")
# Create a pie chart of Age distribution
age_freq <- table(cut(data$Age, breaks = c(20, 25, 30, 40)))
age_freq
(20,25] (25,30] (30,40]
2 2 1
pie(age_freq, labels = c("20-25", "26-30", "31-40"), main = "Age Distribution")
# Create a boxplot of Scores
boxplot(data$Score, xlab = "Scores", main = "Score Distribution")
# Create a bar plot of Scores by Name
barplot(data$Score, names.arg = data$Name, xlab = "Name", ylab = "Score", main = "Individual Scores")
# Create a bar plot of Mean Scores by Name
mean_scores <- tapply(data$Score, data$Name, mean)
barplot(mean_scores, xlab = "Name", ylab = "Mean Score", main = "Mean Scores by Name")
selected_rows <- data[2:4, ]
print(selected_rows)
selected_rows <- data[2, ]
print(selected_rows)
selected_rows <- data[,2:3 ]
print(selected_rows)
selected_rows <- data[2:4,2:3 ]
print(selected_rows)
# Select rows where age is greater than 25 and score is less than 90
selected_rows <- data[data$Age > 25 & data$Score < 90, ]
# Print the selected rows
print(selected_rows)
# Sort data by Age in descending order
sorted_data <- data[order(data$Age, decreasing = TRUE), ]
print("Sorted Data:")
[1] "Sorted Data:"
print(sorted_data)
# Check if any value in Age column is above 40
any_above_40 <- any(data$Age > 40)
print("Any value above 40 in Age column?")
[1] "Any value above 40 in Age column?"
print(any_above_40)
[1] FALSE
# Calculate the average age
average_age <- mean(data$Age)
print("Average Age:")
[1] "Average Age:"
print(average_age)
[1] 28
# Calculate the maximum score
max_score <- max(data$Score)
print("Maximum Score:")
[1] "Maximum Score:"
print(max_score)
[1] 92
# Subset rows with specific IDs
specific_ids <- data[data$ID_number %in% c(2, 4), ]
print("Rows with Specific IDs:")
[1] "Rows with Specific IDs:"
print(specific_ids)
print(data)
major = c('Math', 'Biology', 'CS', 'Physics', 'Chemistry')
data2 = cbind(data, major)
print(data2)
colnames(data2)[5] = 'Major'
print(data2)
v = c()
for(i in 1:5) {
v[i] = i
}
v
[1] 1 2 3 4 5
# Using for loop to print numbers
for (i in 1:5) {
print(i)
}
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
# Using while loop to print numbers
i <- 1
while (i <= 5) {
print(i)
i <- i + 1
}
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
# Vector
my_vector <- c("apple", "banana", "orange")
# Using for loop to iterate vector
for (fruit in my_vector) {
print(fruit)
}
[1] "apple"
[1] "banana"
[1] "orange"
# Using while loop to iterate vector
i <- 1
while (i <= length(my_vector)) {
print(my_vector[i])
i <- i + 1
}
[1] "apple"
[1] "banana"
[1] "orange"
# List
my_list <- list(name = "John", age = 25, city = "New York")
# Using for loop to iterate list
for (item in my_list) {
print(item)
}
[1] "John"
[1] 25
[1] "New York"
# Using while loop to iterate list
i <- 1
while (i <= length(my_list)) {
print(my_list[[i]])
i <- i + 1
}
[1] "John"
[1] 25
[1] "New York"
data <- data.frame(
ID = c(1, 2, 3, 4, 5),
Name = c("Alice", "Bob", "Charlie", "David", "Eva"),
Age = c(25, 30, 22, 28, 35),
Score = c(85, 92, 78, 89, 81)
)
# Using a for loop to print column names
for (col_name in names(data)) {
print(col_name)
}
[1] "ID"
[1] "Name"
[1] "Age"
[1] "Score"
for(i in data$Name) {
print(i)
}
[1] "Alice"
[1] "Bob"
[1] "Charlie"
[1] "David"
[1] "Eva"
for(i in 1:ncol(data)) {
print(data$ID[i])
print(i)
}
[1] 1
[1] 1
[1] 2
[1] 2
[1] 3
[1] 3
[1] 4
[1] 4
# Nested for loop example
for (i in 1:4) {
for (j in 1:3) {
cat("i:", i, ", j:", j, "\n")
}
}
i: 1 , j: 1
i: 1 , j: 2
i: 1 , j: 3
i: 2 , j: 1
i: 2 , j: 2
i: 2 , j: 3
i: 3 , j: 1
i: 3 , j: 2
i: 3 , j: 3
i: 4 , j: 1
i: 4 , j: 2
i: 4 , j: 3
# Define the size of the multiplication table
table_size <- 3
# Nested for loop to generate the multiplication table
for (i in 1:table_size) {
cat("Multiplication table for", i, ":\n")
for (j in 1:10) {
result <- i * j
cat(i, "x", j, "=", result, "\n")
}
cat("\n")
}
Multiplication table for 1 :
1 x 1 = 1
1 x 2 = 2
1 x 3 = 3
1 x 4 = 4
1 x 5 = 5
1 x 6 = 6
1 x 7 = 7
1 x 8 = 8
1 x 9 = 9
1 x 10 = 10
Multiplication table for 2 :
2 x 1 = 2
2 x 2 = 4
2 x 3 = 6
2 x 4 = 8
2 x 5 = 10
2 x 6 = 12
2 x 7 = 14
2 x 8 = 16
2 x 9 = 18
2 x 10 = 20
Multiplication table for 3 :
3 x 1 = 3
3 x 2 = 6
3 x 3 = 9
3 x 4 = 12
3 x 5 = 15
3 x 6 = 18
3 x 7 = 21
3 x 8 = 24
3 x 9 = 27
3 x 10 = 30
print(30>50)
[1] FALSE
print(30<50)
[1] TRUE
# Define a variable
temperature <- 25
# Simple if-else statement
if (temperature > 30) {
cat("It's a hot day!\n")
} else {
cat("It's not too hot today.\n")
}
It's not too hot today.
scores = c(100, 80, 75, 60, 55, 20, 98)
for(i in scores) {
if (i >= 80) {
cat("Score", i , "is A+ \n")
}
else if (i >= 70) {
cat("Score", i , "is A \n")
}
else if (i >= 60) {
cat("Score", i , "is B \n")
}
else {
cat("Score", i , "is fail\n")
}
}
Score 100 is A+
Score 80 is A+
Score 75 is A
Score 60 is B
Score 55 is fail
Score 20 is fail
Score 98 is A+
DNA_vector = c('AGTC', 'AGTCCA', 'AGTGAC', 'TGAC', 'GATC')
pattern <- "TC"
for(DNA in DNA_vector) {
matches <- grepl(pattern, DNA)
if(matches) {
cat(DNA, '\n')
}
}
AGTC
AGTCCA
GATC
# Load necessary libraries
library(ggplot2)
# Create a sample dataset
data <- data.frame(
work_year = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
salary = c(50000, 60000, 55000, 45000, 55000, 58000, 48000, 59000, 60000)
)
# Create a violin plot
ggplot(data, aes(x = factor(work_year), y = salary)) +
geom_violin(fill = "lightblue", color = "black") +
labs(title = "Violin Plot of Salary by Work Year",
x = "Work Year",
y = "Salary")
library(datasets)
data(iris)
head(iris)
iris = read.csv('datasets/iris.csv')
iris
iris_dataset = iris
# Convert species labels to numeric values
species_numeric <- as.numeric(factor(iris$variety))
species_numeric
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[47] 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[93] 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[139] 3 3 3 3 3 3 3 3 3 3 3 3
# Create a scatter plot using the plot function
plot(iris$sepal.length, iris$sepal.width,
col = species_numeric, # Set scatter icon colors based on species
pch = 1, # Set scatter plot symbol
cex = 1, # Set scatter plot size
xlim = c(4, 8), # Set x-axis limits
ylim = c(1, 5), # Set y-axis limits
xlab = "Sepal Length", # Set x-axis label
ylab = "Sepal Width", # Set y-axis label
main = "Scatter Plot of Sepal Length vs Sepal Width", # Set main title
col.main = "blue", # Set main title color
col.axis = "green", # Set axis text color
cex.main = 1.2, # Set main title size
cex.lab = 1.2,
cex.axis = 2) # Set axis label size)
library(ggplot2)
# install install.packages("colorspace") if error occurs -> could not find function "ggplot"
ggplot(iris, aes(x = sepal.length)) +
geom_histogram(binwidth = 0.1, fill = "lightblue", color = "black") +
labs(title = "Histogram of Sepal Length",
x = "Sepal Length",
y = "Frequency")
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
geom_point() +
labs(title = "Scatter Plot of Sepal Length vs. Sepal Width",
x = "Sepal Length",
y = "Sepal Width")
Error in `geom_point()`:
! Problem while computing aesthetics.
ℹ Error occurred in the 1st layer.
Caused by error:
! object 'Sepal.Length' not found
Backtrace:
1. base (local) `<fn>`(x)
2. ggplot2:::print.ggplot(x)
4. ggplot2:::ggplot_build.ggplot(x)
5. ggplot2:::by_layer(...)
12. ggplot2 (local) f(l = layers[[i]], d = data[[i]])
13. l$compute_aesthetics(d, plot)
14. ggplot2 (local) compute_aesthetics(..., self = self)
15. ggplot2:::scales_add_defaults(...)
16. base::lapply(aesthetics[new_aesthetics], eval_tidy, data = data)
17. rlang (local) FUN(X[[i]], ...)
# Load the ggplot2 library
library(ggplot2)
# Plotting the scatter plot
ggplot(iris, aes(x = sepal.length, y = sepal.width, color = variety)) +
geom_point() +
scale_color_manual(values = c("Setosa" = "red", "Versicolor" = "purple", "Virginica" = "blue")) +
labs(title = "Scatter Plot of Sepal Length vs Sepal Width",
x = "Sepal Length",
y = "Sepal Width")
# Load the ggplot2 library
library(ggplot2)
# Plotting the scatter plot with various customizations
scatter_plot = ggplot(iris, aes(x = sepal.length, y = sepal.width, color = variety)) +
geom_point(size = 4) + # Set scatter point size
scale_color_manual(values = c("Setosa" = "red", "Versicolor" = "purple", "Virginica" = "blue")) + # Set class color
labs(title = "Scatter Plot of Sepal Length vs Sepal Width",
x = "Sepal Length",
y = "Sepal Width",
caption = "Source: Iris Dataset") + # Add a caption
#theme_minimal() + # Use a minimal theme -> background white
theme(legend.position = "right", # Move legend to the bottom
text = element_text(size = 12, color = "black"), # Set label text size and color
axis.text = element_text(size = 10, color = "black"), # Set axis text size and color
axis.text.x = element_text(size = 10, color = "green"), # Set x-axis text size and color
axis.text.y = element_text(size = 10, color = "blue"), # Set y-axis text size and color
axis.title = element_text(size = 14, color = "black"), # Set axis title size and color
plot.title = element_text(size = 18, color = "black"),
panel.background = element_rect(fill = "lightgray"), # Set background color
panel.grid = element_blank() # Remove grid lines
) + # Set plot title size and color
xlim(4, 8) + # Set x-axis limits
ylim(1, 5) + # Set y-axis limits
coord_fixed() # Square graph
# Save the plot with 300 dpi
ggsave("scatter_plot.png", plot = scatter_plot, dpi = 700)
Saving 7 x 7 in image
scatter_plot
pairs(iris[, 1:4], col = iris$Species)
ggplot(iris, aes(x = Species, y = Sepal.Length, fill = Species)) +
geom_boxplot() +
labs(title = "Box Plot of Sepal Length by Species",
x = "Species",
y = "Sepal Length")
cor_matrix <- cor(iris[, 1:4])
print(cor_matrix)
Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length 1.0000000 -0.1175698 0.8717538 0.8179411
Sepal.Width -0.1175698 1.0000000 -0.4284401 -0.3661259
Petal.Length 0.8717538 -0.4284401 1.0000000 0.9628654
Petal.Width 0.8179411 -0.3661259 0.9628654 1.0000000
cor_matrix <- cor(iris[, 1:4])
print(cor_matrix)
Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length 1.0000000 -0.1175698 0.8717538 0.8179411
Sepal.Width -0.1175698 1.0000000 -0.4284401 -0.3661259
Petal.Length 0.8717538 -0.4284401 1.0000000 0.9628654
Petal.Width 0.8179411 -0.3661259 0.9628654 1.0000000
library(reshape2)
melted_cormat <- melt(cor_matrix)
print(melted_cormat)
library(ggplot2)
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() #heat map
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) +
geom_tile()+ #heat map
geom_text(aes(Var2, Var1, label = round(value, digits = 2)), color = "black", size = 4) +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1),
name="Correlation")
cormat = cor_matrix
# Get upper triangle of the correlation matrix
cormat[lower.tri(cormat)]<- NA
upper_tri = cormat
# Melt the correlation matrix
library(reshape2)
melted_cormat <- melt(upper_tri, na.rm = TRUE)
melted_cormat
# Heatmap
library(ggplot2)
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile()+ #heat map
geom_text(aes(Var2, Var1, label = round(value, digits = 2)), color = "black", size = 4) +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1),
name="Correlation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1))+
coord_fixed() # square figure shape
library(ggcorrplot)
cor_matrix <- cor(iris[, 1:4])
print(cor_matrix)
Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length 1.0000000 -0.1175698 0.8717538 0.8179411
Sepal.Width -0.1175698 1.0000000 -0.4284401 -0.3661259
Petal.Length 0.8717538 -0.4284401 1.0000000 0.9628654
Petal.Width 0.8179411 -0.3661259 0.9628654 1.0000000
ggcorrplot(cor_matrix, type = "lower")
ggcorrplot(cor_matrix, type = "upper")
ggcorrplot(cor_matrix,
type = "upper",
colors = c("#6D9EC1", "white", "#E46726"))
ggcorrplot(cor_matrix,
type = "upper",
colors = c("#6D9EC1", "white", "#E46726"),
lab = TRUE)
library(ggplot2)
library(GGally)
ggpairs(iris, aes(colour = Species))
lm_model <- lm(Sepal.Length ~ Petal.Length, data = iris)
summary(lm_model)
Call:
lm(formula = Sepal.Length ~ Petal.Length, data = iris)
Residuals:
Min 1Q Median 3Q Max
-1.24675 -0.29657 -0.01515 0.27676 1.00269
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.30660 0.07839 54.94 <2e-16 ***
Petal.Length 0.40892 0.01889 21.65 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4071 on 148 degrees of freedom
Multiple R-squared: 0.76, Adjusted R-squared: 0.7583
F-statistic: 468.6 on 1 and 148 DF, p-value: < 2.2e-16
#define data
x <- iris$Petal.Length
y <- iris$Sepal.Length
#plot x vs. y
plot(x, y, pch=16, cex=0.5, col=iris$Species)
#fit polynomial regression model
fit <- lm(y ~ x)
#use model to get predicted values
pred <- predict(fit)
ix <- sort(x, index.return=T)$ix # sort and return the index
ix
[1] 23 14 15 36 3 17 37 39 41 42 43 1 2 5 7 9 13 18 29 34 38 46 48 50 4 8 10 11
[29] 16 20 22 28 32 33 35 40 49 12 26 27 30 31 44 47 6 19 21 24 25 45 99 58 94 61 80 65
[57] 82 81 60 70 83 54 63 72 90 93 68 89 100 62 95 96 97 75 98 66 76 88 91 52 56 67 69 79
[85] 85 86 107 55 59 92 51 57 64 74 87 71 77 127 139 53 73 122 124 128 78 114 120 147 84 102 111 115
[113] 134 142 143 150 146 148 112 116 140 149 113 117 138 104 129 133 135 137 141 121 125 145 105 109 130 103 144 101
[141] 126 110 131 136 108 132 106 118 123 119
conf_interval <- predict(fit, newdata=data.frame(x=x), interval="confidence",level = 0.95)
#add polynomial curve to plot
lines(x[ix], pred[ix], col='red', lwd=2)
lines(x[ix], conf_interval[ix,2], col="blue", lty=2)
lines(x[ix], conf_interval[ix,3], col="blue", lty=2)
ggplot(iris, aes(x = Petal.Length, y = Sepal.Length)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE, color = "blue") +
labs(title = "Linear Regression: Sepal Length vs. Petal Length",
x = "Petal Length",
y = "Sepal Length")
x=c(1,2,3,4,5,6,7,8,9,0)
y=c(13,28,43,35,96,84,101,110,108,13)
lm.out <- lm(y ~ x)
newx = seq(min(x),max(x),by = 0.05)
conf_interval <- predict(lm.out, newdata=data.frame(x=newx), interval="confidence",
level = 0.95)
conf_interval
fit lwr upr
1 6.290909 -12.0738277 24.65565
2 6.922121 -11.2978658 25.14211
3 7.553333 -10.5223815 25.62905
4 8.184545 -9.7473863 26.11648
5 8.815758 -8.9728920 26.60441
6 9.446970 -8.1989108 27.09285
7 10.078182 -7.4254554 27.58182
8 10.709394 -6.6525386 28.07133
9 11.340606 -5.8801736 28.56139
10 11.971818 -5.1083743 29.05201
11 12.603030 -4.3371547 29.54322
12 13.234242 -3.5665293 30.03501
13 13.865455 -2.7965129 30.52742
14 14.496667 -2.0271210 31.02045
15 15.127879 -1.2583693 31.51413
16 15.759091 -0.4902741 32.00846
17 16.390303 0.2771479 32.50346
18 17.021515 1.0438794 32.99915
19 17.652727 1.8099027 33.49555
20 18.283939 2.5751997 33.99268
21 18.915152 3.3397515 34.49055
22 19.546364 4.1035389 34.98919
23 20.177576 4.8665419 35.48861
24 20.808788 5.6287403 35.98884
25 21.440000 6.3901128 36.48989
26 22.071212 7.1506380 36.99179
27 22.702424 7.9102936 37.49455
28 23.333636 8.6690568 37.99822
29 23.964848 9.4269040 38.50279
30 24.596061 10.1838112 39.00831
31 25.227273 10.9397535 39.51479
32 25.858485 11.6947054 40.02226
33 26.489697 12.4486408 40.53075
34 27.120909 13.2015327 41.04029
35 27.752121 13.9533536 41.55089
36 28.383333 14.7040751 42.06259
37 29.014545 15.4536682 42.57542
38 29.645758 16.2021030 43.08941
39 30.276970 16.9493490 43.60459
40 30.908182 17.6953749 44.12099
41 31.539394 18.4401485 44.63864
42 32.170606 19.1836371 45.15758
43 32.801818 19.9258070 45.67783
44 33.433030 20.6666238 46.19944
45 34.064242 21.4060524 46.72243
46 34.695455 22.1440569 47.24685
47 35.326667 22.8806006 47.77273
48 35.957879 23.6156462 48.30011
49 36.589091 24.3491554 48.82903
50 37.220303 25.0810894 49.35952
51 37.851515 25.8114088 49.89162
52 38.482727 26.5400733 50.42538
53 39.113939 27.2670421 50.96084
54 39.745152 27.9922738 51.49803
55 40.376364 28.7157262 52.03700
56 41.007576 29.4373569 52.57779
57 41.638788 30.1571228 53.12045
58 42.270000 30.8749805 53.66502
59 42.901212 31.5908861 54.21154
60 43.532424 32.3047954 54.76005
61 44.163636 33.0166639 55.31061
62 44.794848 33.7264472 55.86325
63 45.426061 34.4341004 56.41802
64 46.057273 35.1395788 56.97497
65 46.688485 35.8428377 57.53413
66 47.319697 36.5438327 58.09556
67 47.950909 37.2425194 58.65930
68 48.582121 37.9388540 59.22539
69 49.213333 38.6327930 59.79387
70 49.844545 39.3242936 60.36480
71 50.475758 40.0133136 60.93820
72 51.106970 40.6998117 61.51413
73 51.738182 41.3837475 62.09262
74 52.369394 42.0650817 62.67371
75 53.000606 42.7437760 63.25744
76 53.631818 43.4197938 63.84384
77 54.263030 44.0930996 64.43296
78 54.894242 44.7636596 65.02483
79 55.525455 45.4314417 65.61947
80 56.156667 46.0964156 66.21692
81 56.787879 46.7585530 66.81720
82 57.419091 47.4178275 67.42035
83 58.050303 48.0742149 68.02639
84 58.681515 48.7276935 68.63534
85 59.312727 49.3782435 69.24721
86 59.943939 50.0258480 69.86203
87 60.575152 50.6704922 70.47981
88 61.206364 51.3121641 71.10056
89 61.837576 51.9508542 71.72430
90 62.468788 52.5865559 72.35102
91 63.100000 53.2192650 72.98074
92 63.731212 53.8489801 73.61344
93 64.362424 54.4757027 74.24915
94 64.993636 55.0994368 74.88784
95 65.624848 55.7201891 75.52951
96 66.256061 56.3379692 76.17415
97 66.887273 56.9527890 76.82176
98 67.518485 57.5646632 77.47231
99 68.149697 58.1736089 78.12579
100 68.780909 58.7796456 78.78217
101 69.412121 59.3827954 79.44145
102 70.043333 59.9830823 80.10358
103 70.674545 60.5805326 80.76856
104 71.305758 61.1751747 81.43634
105 71.936970 61.7670390 82.10690
106 72.568182 62.3561574 82.78021
107 73.199394 62.9425639 83.45622
108 73.830606 63.5262938 84.13492
109 74.461818 64.1073839 84.81625
110 75.093030 64.6858723 85.50019
111 75.724242 65.2617985 86.18669
112 76.355455 65.8352027 86.87571
113 76.986667 66.4061264 87.56721
114 77.617879 66.9746116 88.26115
115 78.249091 67.5407012 88.95748
116 78.880303 68.1044388 89.65617
117 79.511515 68.6658680 90.35716
118 80.142727 69.2250333 91.06042
119 80.773939 69.7819792 91.76590
120 81.405152 70.3367502 92.47355
121 82.036364 70.8893912 93.18334
122 82.667576 71.4399469 93.89520
123 83.298788 71.9884618 94.60911
124 83.930000 72.5349805 95.32502
125 84.561212 73.0795471 96.04288
126 85.192424 73.6222054 96.76264
127 85.823636 74.1629989 97.48427
128 86.454848 74.7019707 98.20773
129 87.086061 75.2391633 98.93296
130 87.717273 75.7746188 99.65993
131 88.348485 76.3083785 100.38859
132 88.979697 76.8404834 101.11891
133 89.610909 77.3709736 101.85084
134 90.242121 77.8998886 102.58435
135 90.873333 78.4272673 103.31940
136 91.504545 78.9531478 104.05594
137 92.135758 79.4775676 104.79395
138 92.766970 80.0005632 105.53338
139 93.398182 80.5221706 106.27419
140 94.029394 81.0424250 107.01636
141 94.660606 81.5613606 107.75985
142 95.291818 82.0790112 108.50463
143 95.923030 82.5954096 109.25065
144 96.554242 83.1105879 109.99790
145 97.185455 83.6245773 110.74633
146 97.816667 84.1374085 111.49592
147 98.447879 84.6491112 112.24665
148 99.079091 85.1597146 112.99847
149 99.710303 85.6692469 113.75136
150 100.341515 86.1777357 114.50529
151 100.972727 86.6852081 115.26025
152 101.603939 87.1916900 116.01619
153 102.235152 87.6972071 116.77310
154 102.866364 88.2017841 117.53094
155 103.497576 88.7054451 118.28971
156 104.128788 89.2082138 119.04936
157 104.760000 89.7101128 119.80989
158 105.391212 90.2111645 120.57126
159 106.022424 90.7113904 121.33346
160 106.653636 91.2108116 122.09646
161 107.284848 91.7094485 122.86025
162 107.916061 92.2073209 123.62480
163 108.547273 92.7044482 124.39010
164 109.178485 93.2008491 125.15612
165 109.809697 93.6965418 125.92285
166 110.440909 94.1915441 126.69027
167 111.072121 94.6858731 127.45837
168 111.703333 95.1795457 128.22712
169 112.334545 95.6725780 128.99651
170 112.965758 96.1649859 129.76653
171 113.596970 96.6567847 130.53715
172 114.228182 97.1479893 131.30837
173 114.859394 97.6386142 132.08017
174 115.490606 98.1286736 132.85254
175 116.121818 98.6181810 133.62546
176 116.753030 99.1071498 134.39891
177 117.384242 99.5955929 135.17289
178 118.015455 100.0835228 135.94739
179 118.646667 100.5709518 136.72238
180 119.277879 101.0578918 137.49787
181 119.909091 101.5443541 138.27383
plot(x, y, xlab="x", ylab="y", main="Regression")
abline(lm.out, col="lightblue")
lines(newx, conf_interval[,2], col="blue", lty=2)
lines(newx, conf_interval[,3], col="blue", lty=2)
ggplot(iris, aes(x = Petal.Length, y = Sepal.Length, color=Species)) +
geom_point() +
geom_smooth(method = "lm", formula=y~poly(x, 2), level=0.95, se = TRUE, color = "blue", fill='lightblue') +
labs(title = "Polynomial Regression: Sepal Length vs. Petal Length",
x = "Petal Length",
y = "Sepal Length")
#define data
x <- iris$Petal.Length
y <- iris$Sepal.Length
#plot x vs. y
plot(x, y, pch=16, cex=1.5)
#fit polynomial regression model
fit <- lm(y ~ x + I(x^2) + I(x^3))
summary(fit)
Call:
lm(formula = y ~ x + I(x^2) + I(x^3))
Residuals:
Min 1Q Median 3Q Max
-1.06434 -0.24523 0.00707 0.19869 0.92755
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.64817 0.45873 10.133 <2e-16 ***
x 0.27811 0.48046 0.579 0.564
I(x^2) -0.04428 0.13454 -0.329 0.743
I(x^3) 0.01055 0.01123 0.939 0.349
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.364 on 146 degrees of freedom
Multiple R-squared: 0.8106, Adjusted R-squared: 0.8067
F-statistic: 208.3 on 3 and 146 DF, p-value: < 2.2e-16
#use model to get predicted values
ix <- sort(x, index.return=T)$ix
conf_interval <- predict(fit, newdata=data.frame(x=x), interval="confidence",level = 0.95)
#conf_interval
#add polynomial curve to plot
#lines(x[ix], pred[ix], col='red', lwd=2)
lines(x[ix], conf_interval[ix,1], col="blue")
lines(x[ix], conf_interval[ix,2], col="blue", lty=2)
lines(x[ix], conf_interval[ix,3], col="blue", lty=2)
# Load necessary libraries
library(stats)
# Load the iris dataset (it's built-in)
data(iris)
# Perform polynomial regression (quadratic model)
poly_model <- lm(Sepal.Length ~ poly(Sepal.Width, 2) + poly(Petal.Length, 2) + poly(Petal.Width, 2), data = iris)
# Print summary of polynomial regression
summary(poly_model)
Call:
lm(formula = Sepal.Length ~ poly(Sepal.Width, 2) + poly(Petal.Length,
2) + poly(Petal.Width, 2), data = iris)
Residuals:
Min 1Q Median 3Q Max
-0.85830 -0.21065 0.00061 0.19278 0.77325
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.84333 0.02509 232.877 < 2e-16 ***
poly(Sepal.Width, 2)1 2.99803 0.40359 7.428 9.12e-12 ***
poly(Sepal.Width, 2)2 0.34547 0.31951 1.081 0.28141
poly(Petal.Length, 2)1 12.74168 1.78665 7.132 4.54e-11 ***
poly(Petal.Length, 2)2 1.59442 0.58991 2.703 0.00771 **
poly(Petal.Width, 2)1 -2.82015 1.72498 -1.635 0.10427
poly(Petal.Width, 2)2 -0.95176 0.67450 -1.411 0.16040
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3073 on 143 degrees of freedom
Multiple R-squared: 0.8678, Adjusted R-squared: 0.8623
F-statistic: 156.5 on 6 and 143 DF, p-value: < 2.2e-16
set.seed(123)
k <- 3
kmeans_result <- kmeans(iris[, 1:4], centers = k)
print(kmeans_result)
K-means clustering with 3 clusters of sizes 50, 62, 38
Cluster means:
Sepal.Length Sepal.Width Petal.Length Petal.Width
1 5.006000 3.428000 1.462000 0.246000
2 5.901613 2.748387 4.393548 1.433871
3 6.850000 3.073684 5.742105 2.071053
Clustering vector:
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2
[57] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3 3
[113] 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3 3 3 2 3 3 3 2 3 3 2
Within cluster sum of squares by cluster:
[1] 15.15100 39.82097 23.87947
(between_SS / total_SS = 88.4 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
[8] "iter" "ifault"
ggplot(iris, aes(x = Petal.Length, y = Sepal.Length, color = factor(kmeans_result$cluster))) +
geom_point() +
labs(title = "K-Means Clustering: Sepal Length vs. Petal Length",
x = "Petal Length",
y = "Sepal Length")
# Load necessary libraries
library(stats)
library(ggplot2) # For data visualization
# Load the iris dataset (it's built-in)
data(iris)
# Select only the numeric columns for clustering
data_for_clustering <- iris[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")]
# Perform K-Means clustering with 3 clusters
k <- 3 # Number of clusters
kmeans_result <- kmeans(data_for_clustering, centers = k)
library(cluster)
clusplot(iris, kmeans_result$cluster, color=T, shade=T, labels=0, lines=0)
# Load necessary libraries
library(glmnet)
Loading required package: Matrix
Loaded glmnet 4.1-7
# Load the iris dataset (it's built-in)
data(iris)
# Prepare data matrix and response vector
X <- model.matrix(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, data = iris)
y <- iris$Sepal.Length
# Perform lasso regression with cross-validation using glmnet
lasso_model <- cv.glmnet(X, y, alpha = 1)
# Print summary of lasso regression
print(lasso_model)
Call: cv.glmnet(x = X, y = y, alpha = 1)
Measure: Mean-Squared Error
Lambda Index Measure SE Nonzero
min 0.000611 77 0.1048 0.01041 3
1se 0.013171 44 0.1143 0.01135 3
# Load necessary libraries
library(e1071) # For SVM
library(caret) # For model evaluation
Loading required package: lattice
library(ggplot2)
library(lattice)
# Load the iris dataset (it's built-in)
data(iris)
# Split the data into training and testing sets
set.seed(123)
train_indices <- createDataPartition(iris$Species, p = 0.8, list = FALSE)
train_data <- iris[train_indices, ]
test_data <- iris[-train_indices, ]
# Train an SVM classifier with a linear kernel
svm_model <- svm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = train_data, kernel = "linear")
# Predict using the trained model
predictions <- predict(svm_model, newdata = test_data)
# Confusion matrix
conf_matrix <- confusionMatrix(predictions, test_data$Species)
# Print confusion matrix
print("Confusion Matrix:")
[1] "Confusion Matrix:"
print(conf_matrix)
Confusion Matrix and Statistics
Reference
Prediction setosa versicolor virginica
setosa 10 0 0
versicolor 0 10 1
virginica 0 0 9
Overall Statistics
Accuracy : 0.9667
95% CI : (0.8278, 0.9992)
No Information Rate : 0.3333
P-Value [Acc > NIR] : 2.963e-13
Kappa : 0.95
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: setosa Class: versicolor Class: virginica
Sensitivity 1.0000 1.0000 0.9000
Specificity 1.0000 0.9500 1.0000
Pos Pred Value 1.0000 0.9091 1.0000
Neg Pred Value 1.0000 1.0000 0.9524
Prevalence 0.3333 0.3333 0.3333
Detection Rate 0.3333 0.3333 0.3000
Detection Prevalence 0.3333 0.3667 0.3000
Balanced Accuracy 1.0000 0.9750 0.9500
# Accuracy
accuracy <- conf_matrix$overall["Accuracy"]
print(paste("Accuracy:", accuracy))
[1] "Accuracy: 0.966666666666667"
# F1 Score
f1_score <- conf_matrix$byClass["F1"]
print(paste("F1 Score:", f1_score))
[1] "F1 Score: NA"
# Load necessary libraries
library(e1071) # For SVM
library(caret) # For model evaluation
library(ggplot2)
library(lattice)
# Load the iris dataset (it's built-in)
data(iris)
# Split the data into training and testing sets
set.seed(123)
train_indices <- createDataPartition(iris$Species, p = 0.8, list = FALSE)
train_data <- iris[train_indices, ]
test_data <- iris[-train_indices, ]
# Train an SVM classifier with a linear kernel
svm_model <- svm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = train_data, kernel = "linear")
# Predict using the trained model
predictions <- predict(svm_model, newdata = test_data)
# Confusion matrix
conf_matrix <- confusionMatrix(predictions, test_data$Species)
# Print confusion matrix
print("Confusion Matrix:")
[1] "Confusion Matrix:"
print(conf_matrix)
Confusion Matrix and Statistics
Reference
Prediction setosa versicolor virginica
setosa 10 0 0
versicolor 0 10 1
virginica 0 0 9
Overall Statistics
Accuracy : 0.9667
95% CI : (0.8278, 0.9992)
No Information Rate : 0.3333
P-Value [Acc > NIR] : 2.963e-13
Kappa : 0.95
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: setosa Class: versicolor Class: virginica
Sensitivity 1.0000 1.0000 0.9000
Specificity 1.0000 0.9500 1.0000
Pos Pred Value 1.0000 0.9091 1.0000
Neg Pred Value 1.0000 1.0000 0.9524
Prevalence 0.3333 0.3333 0.3333
Detection Rate 0.3333 0.3333 0.3000
Detection Prevalence 0.3333 0.3667 0.3000
Balanced Accuracy 1.0000 0.9750 0.9500
# Accuracy
accuracy <- conf_matrix$overall["Accuracy"]
print(paste("Accuracy:", accuracy))
[1] "Accuracy: 0.966666666666667"
# F1 Score
f1_score <- conf_matrix$byClass["F1"]
print(paste("F1 Score:", f1_score))
[1] "F1 Score: NA"
# Plot the confusion matrix
conf_plot <- plot(conf_matrix$table, col = c("lightblue", "lightcoral"),
main = "Confusion Matrix for SVM on Iris Dataset",
xlab = "Predicted", ylab = "Actual")
# Display the plot
conf_plot
NULL
cm = conf_matrix
plt <- as.data.frame(cm$table)
plt$Prediction <- factor(plt$Prediction, levels=rev(levels(plt$Prediction)))
ggplot(plt, aes(Prediction,Reference, fill= Freq)) +
geom_tile() +
geom_text(aes(label=Freq)) +
scale_fill_gradient(low="white", high="skyblue") +
labs(x = "Reference",
y = "Prediction")
rev(levels(plt$Prediction))
[1] "setosa" "versicolor" "virginica"
# Import caret library
library(caret)
# Create Data
actual <- factor(rep(c(1, 2),
times=c(16, 24)))
predicted <- factor(rep(c(1, 2, 1, 2),
times=c(12, 4, 7, 17)))
# create confusion matrix
confusionMatrix(predicted, actual,
mode = "everything",
positive="1")
Confusion Matrix and Statistics
Reference
Prediction 1 2
1 12 7
2 4 17
Accuracy : 0.725
95% CI : (0.5611, 0.854)
No Information Rate : 0.6
P-Value [Acc > NIR] : 0.07095
Kappa : 0.4444
Mcnemar's Test P-Value : 0.54649
Sensitivity : 0.7500
Specificity : 0.7083
Pos Pred Value : 0.6316
Neg Pred Value : 0.8095
Precision : 0.6316
Recall : 0.7500
F1 : 0.6857
Prevalence : 0.4000
Detection Rate : 0.3000
Detection Prevalence : 0.4750
Balanced Accuracy : 0.7292
'Positive' Class : 1
# Calculate probability
prob_setosa <- sum(iris$Species == "setosa") / nrow(iris)
print("Probability of Setosa:")
[1] "Probability of Setosa:"
print(prob_setosa)
[1] 0.3333333
# Normal distribution
hist(iris$Sepal.Length, probability = TRUE, main = "Histogram of Sepal Length")
lines(density(iris$Sepal.Length), col = "blue")
# Principal Component Analysis (PCA)
pca <- prcomp(iris[, -5])
summary(pca)
Importance of components:
PC1 PC2 PC3 PC4
Standard deviation 2.0563 0.49262 0.2797 0.15439
Proportion of Variance 0.9246 0.05307 0.0171 0.00521
Cumulative Proportion 0.9246 0.97769 0.9948 1.00000
setosa <- iris$Sepal.Length[iris$Species == "setosa"]
virginica <- iris$Sepal.Length[iris$Species == "virginica"]
t_test_result <- t.test(setosa, virginica)
print(t_test_result)
Welch Two Sample t-test
data: setosa and virginica
t = -15.386, df = 76.516, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-1.78676 -1.37724
sample estimates:
mean of x mean of y
5.006 6.588
setosa <- iris$Sepal.Length[iris$Species == "setosa"]
virginica <- iris$Sepal.Length[iris$Species == "virginica"]
t_test_result <- t.test(setosa, virginica, alternative = "two.sided")
print(t_test_result)
Welch Two Sample t-test
data: setosa and virginica
t = -15.386, df = 76.516, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-1.78676 -1.37724
sample estimates:
mean of x mean of y
5.006 6.588
# https://www.statology.org/interpret-t-test-results-in-r/
anova_result <- aov(Sepal.Length ~ Species, data = iris)
summary(anova_result)
Df Sum Sq Mean Sq F value Pr(>F)
Species 2 63.21 31.606 119.3 <2e-16 ***
Residuals 147 38.96 0.265
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
posthoc <- TukeyHSD(anova_result)
print(posthoc)
# Load necessary libraries
library(stats)
# Load the iris dataset (it's built-in)
data(iris)
# Chi-square test (testing independence between species and petal length)
chisq.test(table(iris$Species, cut(iris$Petal.Length, breaks = c(1, 2, 3, 4, 5))))
# Load necessary libraries
library(ggplot2)
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(reshape)
Attaching package: ‘reshape’
The following object is masked from ‘package:dplyr’:
rename
The following object is masked from ‘package:plotly’:
rename
# Load the iris dataset (it's built-in)
data(iris)
# Pairwise scatter plot with color by Species
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
geom_point() +
facet_grid(~Species) +
labs(title = "Pairwise Scatter Plot",
x = "Sepal Length", y = "Sepal Width")
# Load required library
library(ggplot2)
library(stats)
iris = read.csv('Class 4/iris.csv')
# Apply PCA
iris_pca <- prcomp(iris[, -5], center = TRUE, scale = TRUE)
iris_pca
Standard deviations (1, .., p=4):
[1] 1.7083611 0.9560494 0.3830886 0.1439265
Rotation (n x k) = (4 x 4):
PC1 PC2 PC3 PC4
sepal.length 0.5210659 -0.37741762 0.7195664 0.2612863
sepal.width -0.2693474 -0.92329566 -0.2443818 -0.1235096
petal.length 0.5804131 -0.02449161 -0.1421264 -0.8014492
petal.width 0.5648565 -0.06694199 -0.6342727 0.5235971
summary(iris_pca)
Importance of components:
PC1 PC2 PC3 PC4
Standard deviation 1.7084 0.9560 0.38309 0.14393
Proportion of Variance 0.7296 0.2285 0.03669 0.00518
Cumulative Proportion 0.7296 0.9581 0.99482 1.00000
# Extract PC scores
pc_scores <- as.data.frame(iris_pca$x[, 1:2])
pc_scores
# Combine PC scores with Species
pc_data <- cbind(pc_scores, Species = iris$variety)
pc_data
# Plot PCA (2D)
ggplot(pc_data, aes(PC1, PC2, color = Species)) +
geom_point() +
labs(title = "PCA (2D) of Iris Dataset",
x = "Principal Component 1",
y = "Principal Component 2") +
theme_minimal()
# http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/118-principal-component-analysis-in-r-prcomp-vs-princomp/
# https://www.datacamp.com/tutorial/pca-analysis-r
# http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/112-pca-principal-component-analysis-essentials/
fviz_cos2(iris_pca, choice = "var", axes = 1:2)
# Contributions of variables to PC1
fviz_contrib(iris_pca, choice = "var", axes = 1)
# Contributions of variables to PC2
fviz_contrib(iris_pca, choice = "var", axes = 2)
var <- get_pca_var(iris_pca)
var
Principal Component Analysis Results for variables
===================================================
Name Description
1 "$coord" "Coordinates for the variables"
2 "$cor" "Correlations between variables and dimensions"
3 "$cos2" "Cos2 for the variables"
4 "$contrib" "contributions of the variables"
library("corrplot")
corrplot(var$cos2, is.corr=FALSE)
var <- get_pca_var(iris_pca)
var$contrib
Dim.1 Dim.2 Dim.3 Dim.4
sepal.length 27.150969 14.24440565 51.777574 6.827052
sepal.width 7.254804 85.24748749 5.972245 1.525463
petal.length 33.687936 0.05998389 2.019990 64.232089
petal.width 31.906291 0.44812296 40.230191 27.415396
library(factoextra)
fviz_eig(iris_pca, addlabels = TRUE)
# Graph of the variables
fviz_pca_var(iris_pca, col.var = "contrib", # Color by contributions to the PC
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE ) # Avoid text overlapping)
fviz_pca_ind(iris_pca,
col.ind = "cos2", # Color by the quality of representation
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
fviz_pca_ind(iris_pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = "cos2", # Color by the quality of representation
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
fviz_pca_ind(iris_pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = iris$variety, # color by groups
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
addEllipses = TRUE, # Concentration ellipses
legend.title = "Groups"
)
# Load required library
library(plotly)
Loading required package: ggplot2
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
iris = read.csv('datasets/iris.csv')
# Plot 3D Scatter Plot
plot_ly(data = iris, x = ~sepal.length, y = ~sepal.width, z = ~petal.length, color = ~variety,
type = "scatter3d", mode = "markers") %>%
layout(scene = list(xaxis = list(title = 'Sepal Length'),
yaxis = list(title = 'Sepal Width'),
zaxis = list(title = 'Petal Length')),
margin = list(l = 0, r = 0, b = 0, t = 0))
NA
library(plotly)
fig <- iris %>%
plot_ly(y = ~sepal.width,type = 'violin')
fig
NA
fig <- iris %>%
plot_ly(
y = ~sepal.width,
type = 'box'
)
fig
fig <- iris %>%
plot_ly(
y = ~sepal.width,
type = 'bar'
)
fig
fig <- iris %>%
plot_ly(
x = ~sepal.width,
type = 'histogram'
)
fig
# Load required library
library(plotly)
# Create Boxplot
boxplot = plot_ly(iris, x = ~variety, y = ~sepal.length, type = "box") %>%
layout(
xaxis = list(title = "Category"), # Set X-axis title
yaxis = list(title = "Value") # Set Y-axis title
)
# Print the plot
boxplot
# Load required library
library(plotly)
# Load Iris dataset
data(iris)
# 1. Scatter Plot
scatter_plot <- plot_ly(iris, x = ~Sepal.Length, y = ~Sepal.Width, color = ~Species, type = "scatter", mode = "markers") %>%
layout(xaxis = list(title = "Sepal Length"), yaxis = list(title = "Sepal Width"), title = "Scatter Plot")
scatter_plot
# 2. Box Plot
box_plot <- plot_ly(iris, x = ~Species, y = ~Petal.Length, type = "box") %>%
layout(xaxis = list(title = "Species"), yaxis = list(title = "Petal Length"), title = "Box Plot")
box_plot
# 3. Violin Plot
violin_plot <- plot_ly(iris, x = ~Species, y = ~Petal.Width, type = "violin") %>%
layout(xaxis = list(title = "Species"), yaxis = list(title = "Petal Width"), title = "Violin Plot")
violin_plot
# 4. Histogram
histogram_plot <- plot_ly(iris, x = ~Sepal.Length, type = "histogram") %>%
layout(xaxis = list(title = "Sepal Length"), yaxis = list(title = "Frequency"), title = "Histogram")
histogram_plot
# 5. Heatmap of Correlation Matrix
correlation_matrix <- cor(iris[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")])
heatmap_plot <- plot_ly(z = correlation_matrix, type = "heatmap", colorscale = "Viridis") %>%
layout(xaxis = list(title = colnames(correlation_matrix)), yaxis = list(title = colnames(correlation_matrix)), title = "Correlation Heatmap")
heatmap_plot
# 6. Bar Plot
bar_plot <- plot_ly(iris, x = ~Species, type = "bar", marker = list(color = "blue")) %>%
layout(xaxis = list(title = "Species"), yaxis = list(title = "Count"), title = "Bar Plot")
bar_plot
# 7. Line Plot after Sorting a Column
sorted_data <- iris[order(iris$Sepal.Length), ]
line_plot <- plot_ly(sorted_data, x = ~Sepal.Length, y = ~Sepal.Width, type = "scatter", mode = "lines") %>%
layout(xaxis = list(title = "Sepal Length (Sorted)"), yaxis = list(title = "Sepal Width"), title = "Line Plot (Sorted)")
line_plot
# Display plots
#subplot(scatter_plot, box_plot, violin_plot, histogram_plot, heatmap_plot, bar_plot, line_plot, nrows = 4)
# Load required libraries
library(ggplot2)
library(plotly)
# Use plot_ly to create a 3D scatter plot
plot_ly(data = iris, x = ~sepal.length, y = ~sepal.width, z = ~petal.length, color = ~variety, type = "scatter3d", mode = "markers") %>%
layout(scene = list(xaxis = list(title = "Sepal Length"),
yaxis = list(title = "Sepal Width"),
zaxis = list(title = "Petal Length")))
NA
# Load required libraries
library(plotly)
# Create sample data for the surface plot
x <- seq(-10, 10, length.out = 100)
y <- seq(-10, 10, length.out = 100)
z <- outer(x, y, function(x, y) sin(sqrt(x^2 + y^2)) / sqrt(x^2 + y^2))
# Define a custom color scale
color_scale <- c("#440154", "#482878", "#3E4989", "#31688E", "#26838E", "#1F9E89", "#35B779", "#6DCD59", "#B4DE2C", "#FDE725")
color_scale <- c("purple", "white", "red")
# Create a 3D surface plot
plot_ly(z = ~z, x = ~x, y = ~y, type = "surface", colors = color_scale) %>%
layout(scene = list(
xaxis = list(title = 'X-axis'),
yaxis = list(title = 'Y-axis'),
zaxis = list(title = 'Z-axis')
))
NA
library(reshape2)
library(tidyverse)
library(tidymodels)
library(plotly)
library(kernlab)
library(pracma) #For meshgrid()
data(iris)
mesh_size <- .02
margin <- 0
model <- svm_rbf(cost = 1.0) %>%
set_engine("kernlab") %>%
set_mode("regression") %>%
fit(Petal.Width ~ Sepal.Width + Sepal.Length, data = iris)
x_min <- min(X$Sepal.Width) - margin
x_max <- max(X$Sepal.Width) - margin
y_min <- min(X$Sepal.Length) - margin
y_max <- max(X$Sepal.Length) - margin
xrange <- seq(x_min, x_max, mesh_size)
yrange <- seq(y_min, y_max, mesh_size)
xy <- meshgrid(x = xrange, y = yrange)
xx <- xy$X
yy <- xy$Y
dim_val <- dim(xx)
xx1 <- matrix(xx, length(xx), 1)
yy1 <- matrix(yy, length(yy), 1)
final <- cbind(xx1, yy1)
pred <- model %>%
predict(final)
pred <- pred$.pred
pred <- matrix(pred, dim_val[1], dim_val[2])
fig <- plot_ly(iris, x = ~Sepal.Width, y = ~Sepal.Length, z = ~Petal.Width ) %>%
add_markers(size = 5) %>%
add_surface(x=xrange, y=yrange, z=pred, alpha = 0.65, type = 'mesh3d', name = 'pred_surface')
fig
NA
NA