A scalar in R is a single value, such as a number or a string.
# Numeric scalar
num_scalar <- 42
# Character scalar
char_scalar <- "Hello, world!"
num_scalar
[1] 42
char_scalar
[1] "Hello, world!"
Vectors store a collection of elements of the same type.
# Numeric vector
num_vec <- c(1, 2, 3, 4)
# Character vector
char_vec <- c("A", "B", "C")
num_vec
[1] 1 2 3 4
char_vec
[1] "A" "B" "C"
Lists can store elements of different types.
my_list <- list(name = "Alice", age = 30, scores = c(85, 90, 88))
my_list
$name
[1] "Alice"
$age
[1] 30
$scores
[1] 85 90 88
A matrix is a two-dimensional structure where all elements are of the same type.
mat <- matrix(1:6, nrow = 2, ncol = 3)
mat
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
An array can have more than two dimensions.
arr <- array(1:12, dim = c(3, 2, 2))
arr
, , 1
[,1] [,2]
[1,] 1 4
[2,] 2 5
[3,] 3 6
, , 2
[,1] [,2]
[1,] 7 10
[2,] 8 11
[3,] 9 12
A data frame is a table-like structure where columns can have different types.
df <- data.frame(Name = c("Alice", "Bob"), Age = c(25, 30))
df
Factors handle categorical data.
gender <- factor(c("Male", "Female", "Female"))
gender
[1] Male Female Female
Levels: Female Male
Functions are objects that perform specific tasks.
# Add two values.
#
# a, b: the values to add. `+` is vectorised, so vectors are accepted as well
#       as single numbers.
# Returns a + b (the value of the last expression; no explicit return() is
# needed).
add <- function(a, b) {
  a + b
}
add(5, 7)
[1] 12
You can inspect the type or structure of an object using built-in functions.
class(num_vec) # Class of the vector
[1] "numeric"
str(my_list) # Structure of the list
List of 3
$ name : chr "Alice"
$ age : num 30
$ scores: num [1:3] 85 90 88
typeof(mat) # Type of the matrix
[1] "integer"
# Count rows per EmailID, then evaluate `email_count > 1` on those counts.
# NOTE(review): this call runs before tidyverse is attached (see the
# require()/library() calls further down), so `%>%` is not yet available and
# the call fails with the error recorded below.
# NOTE(review): `email_data` is not created anywhere in the visible part of
# this file - confirm where it comes from.
# NOTE(review): the second summarise() yields a logical value per EmailID
# rather than a single summary; if the goal is to keep only EmailIDs that
# occur more than once, filter(email_count > 1) is presumably what was
# intended - confirm.
email_data %>% group_by(EmailID) %>% summarise(email_count = n()) %>%
summarise(email_count > 1)
Error in email_data %>% group_by(EmailID) %>% summarise(email_count = n()) %>% :
could not find function "%>%"
Error during wrapup: not that many frames on the stack
Error: no more error handlers available (recursive errors?); invoking 'abort' restart
# Install tidyverse only when require() fails to attach it.
if (!require("tidyverse")) {
  install.packages("tidyverse")
}
Loading required package: tidyverse
── Attaching core tidyverse packages ────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4
── Conflicts ──────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyverse)
# Restore the objects saved in the .RData file into the current workspace.
# NOTE(review): presumably this is what creates retail_transaction_records and
# retail_transaction_details, which are used below - confirm.
load("HW1_Retail_Transactions_Data.RData")
# Preview the first rows of each table.
head(retail_transaction_records)
head(retail_transaction_details)
# Method 1: dplyr's left_join keeps every row of retail_transaction_records;
# detail columns are NA where a TransactionID has no match.
retail_data <- left_join(
  retail_transaction_records,
  retail_transaction_details,
  by = "TransactionID"
)
# Method 2: base R's merge. all.x = TRUE is needed for left-join semantics:
# merge() defaults to an inner join and would silently drop records without
# matching details. Row order can still differ from Method 1 because merge()
# sorts the result on the key by default.
retail_data2 <- merge(
  retail_transaction_records,
  retail_transaction_details,
  by = "TransactionID",
  all.x = TRUE
)
mean(retail_data$Price)
[1] 16.75
# Derive two columns on retail_data:
#   Revenue         - Price times Quantity for each row
#   ProductCategory - "CategoryA" for products P0, P1 and P2, otherwise
#                     "CategoryB"
retail_data <- retail_data %>%
  mutate(Revenue = Price * Quantity) %>%
  mutate(
    ProductCategory = ifelse(
      ProductID %in% c("P0", "P1", "P2"),
      "CategoryA",
      "CategoryB"
    )
  )
# Sum Quantity within each ProductCategory, then print the result.
units_per_category <- retail_data %>%
  group_by(ProductCategory) %>%
  summarise(TotalUnits = sum(Quantity))
units_per_category
# Sum Revenue per ProductID, sort from highest to lowest total, and keep the
# first row, i.e. the product with the largest total revenue.
top_product <- retail_data %>%
  group_by(ProductID) %>%
  summarise(TotalRevenue = sum(Revenue)) %>%
  arrange(desc(TotalRevenue)) %>%
  slice(1)
top_product
# Count the rows of retail_data for each PaymentMethod, then print the result.
transactions_per_payment <- retail_data %>%
  group_by(PaymentMethod) %>%
  summarise(NumTransactions = n())
transactions_per_payment
# Mean of DiscountApplied for every ProductID/Channel combination.
# NOTE(review): this is the share of discounted rows provided DiscountApplied
# is coded 0/1 (or logical) - confirm the coding.
# .groups = "drop" returns an ungrouped result; without it the output stays
# grouped by ProductID and summarise() prints a "grouped output" message.
discount_proportion <- retail_data %>%
  group_by(ProductID, Channel) %>%
  summarise(
    ProportionDiscounted = mean(DiscountApplied),
    .groups = "drop"
  )
discount_proportion
most_recent_date <- max(retail_data$TransactionDate)
most_recent_date
[1] "2024-04-09 UTC"
# Number of rows where a discount was applied and the payment method is Cash.
discounted_cash_purchases <- nrow(
  filter(retail_data, DiscountApplied == 1, PaymentMethod == "Cash")
)
discounted_cash_purchases
[1] 4