# Create a vector packages with the names of all required packages. uncomment as needed. This step has to be done only once.
# install.packages(c("libr", "readxl" , "dplyr", "sqldf","stringr","haven","DT"))
packages <- c("libr", "readxl" , "dplyr", "sqldf","stringr","haven","DT")
# Use lapply() to loop over each package and load it with library().
# character.only = TRUE tells library() to treat the name as a string (character) rather than a variable name.
lapply(packages, library, character.only = TRUE)
## Loading required package: common
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## [[1]]
## [1] "libr" "common" "stats" "graphics" "grDevices" "utils"
## [7] "datasets" "methods" "base"
##
## [[2]]
## [1] "readxl" "libr" "common" "stats" "graphics" "grDevices"
## [7] "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "dplyr" "readxl" "libr" "common" "stats" "graphics"
## [7] "grDevices" "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "sqldf" "RSQLite" "gsubfn" "proto" "dplyr" "readxl"
## [7] "libr" "common" "stats" "graphics" "grDevices" "utils"
## [13] "datasets" "methods" "base"
##
## [[5]]
## [1] "stringr" "sqldf" "RSQLite" "gsubfn" "proto" "dplyr"
## [7] "readxl" "libr" "common" "stats" "graphics" "grDevices"
## [13] "utils" "datasets" "methods" "base"
##
## [[6]]
## [1] "haven" "stringr" "sqldf" "RSQLite" "gsubfn" "proto"
## [7] "dplyr" "readxl" "libr" "common" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[7]]
## [1] "DT" "haven" "stringr" "sqldf" "RSQLite" "gsubfn"
## [7] "proto" "dplyr" "readxl" "libr" "common" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
Arithmetic Operators
Arithmetic operators in R are used for performing basic mathematical operations.
+ Addition, used to sum two numeric values.
- Subtraction, used to subtract one numeric value from another.
* Multiplication, used to multiply two numeric values.
/ Division, used to divide one numeric value by another. ^ or **
Exponentiation, used to raise a number to the power of another.
%% Modulo, used to find the remainder after division of one number by another.
%/% Integer division, used to return the quotient of division without the remainder.
Logical Operators
Logical operators are used to combine or modify logical values (TRUE or FALSE).
Logical operators are used to combine or modify logical values (TRUE or FALSE).
& Element-wise AND, used to return TRUE if both elements are TRUE.
| Element-wise OR, used to return TRUE if at least one of the elements is TRUE.
&& Logical AND, evaluates only the first element and returns TRUE if both are TRUE.
|| Logical OR, evaluates only the first element and returns TRUE if at least one is TRUE.
! Logical NOT, used to invert the logical value (turn TRUE to FALSE and vice versa).
Comparison (Boolean) Operators
Comparison operators are used to compare values, returning TRUE or FALSE.
== Equal to, checks if two values are the same.
!= Not equal to, checks if two values are different.
> Greater than, checks if the left value is larger than the right value.
< Less than, checks if the left value is smaller than the right value.
>= Greater than or equal to, checks if the left value is larger than or equal to the right value.
<= Less than or equal to, checks if the left value is smaller than or equal to the right value.
Colon Operator
The colon operator is used to generate a sequence of numbers in a specified range.
: Generates a sequence of numbers from a starting value to an ending value.
Pipe Operator
The pipe operator is used to pass the output of one function directly into the next function in a more readable format.
%>% Pipe operator, allows chaining functions together to streamline data manipulation and analysis. These operators are fundamental to performing various operations in R, ranging from mathematical computations to logical evaluations and data manipulation.
# =====================================================================
# 📄 SCRIPT: Filtering Data in R using Different Methods (with Explanations)
# =====================================================================
# ------------------------------------------------------------
# STEP 1: Load Required Libraries
# ------------------------------------------------------------
# These libraries provide different approaches to manipulate and query data:
library(readxl) # To read Excel files (.xlsx)
library(dplyr) # For tidyverse-style data manipulation (e.g., filter, select)
library(data.table) # For fast and memory-efficient data handling
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(sqldf) # Allows SQL-style queries on data frames
# -----------------------------------------------------------
# STEP 2: Load Data from an Excel File
# ------------------------------------------------------------
# Define the full file path to the Excel file (Needs change once you upload the data to local disk)
pathname_xl <- "D://SAS COURSES//Essentials 1//LWPG1V2//LWPG1V2//data//pg1v2//storm.xlsx"
# Read the Excel file into an R data frame
Data_ex <- read_excel(pathname_xl)
# Select only the relevant columns from the dataset
# (like doing KEEP statement in SAS or SELECT in SQL)
Data_excel <- Data_ex %>%
select(Season, Name, Basin, MinPressure, StartDate)
# ------------------------------------------------------------
# STEP 3: Filter CHARACTER Data (e.g., Basin == "SP")
# ------------------------------------------------------------
# 1. Using Base R's `subset()` function
# Filters rows where Basin equals "SP"
Data_storm_base <- subset(Data_excel, Basin == "SP")
# Show the first 5 rows of filtered data
print(head(Data_storm_base, 5))
## # A tibble: 5 × 5
## Season Name Basin MinPressure StartDate
## <dbl> <chr> <chr> <dbl> <dttm>
## 1 1980 <NA> SP 998 1980-03-27 00:00:00
## 2 1980 OFA SP 980 1979-12-09 00:00:00
## 3 1980 PAUL SP 985 1980-01-02 00:00:00
## 4 1980 PENI SP 970 1980-01-02 00:00:00
## 5 1980 RAE SP 990 1980-02-02 00:00:00
# 2. Using `dplyr` and the `filter()` function
# `%>%` is the pipe operator (reads as "then")
Data_storm_dplyr <- Data_excel %>%
filter(Basin == "SP") # Keeps only rows where Basin equals "SP"
# Print first 5 rows of the result
print(head(Data_storm_dplyr, 5))
## # A tibble: 5 × 5
## Season Name Basin MinPressure StartDate
## <dbl> <chr> <chr> <dbl> <dttm>
## 1 1980 <NA> SP 998 1980-03-27 00:00:00
## 2 1980 OFA SP 980 1979-12-09 00:00:00
## 3 1980 PAUL SP 985 1980-01-02 00:00:00
## 4 1980 PENI SP 970 1980-01-02 00:00:00
## 5 1980 RAE SP 990 1980-02-02 00:00:00
# 3. Using `data.table` for efficient filtering
# Convert the data frame to a data.table object
Data_excel_dt <- as.data.table(Data_excel)
# Filter rows where Basin == "SP"
Data_storm_dt <- Data_excel_dt[Basin == "SP"]
# Display the result
print(head(Data_storm_dt, 5))
## Season Name Basin MinPressure StartDate
## <num> <char> <char> <num> <POSc>
## 1: 1980 <NA> SP 998 1980-03-27
## 2: 1980 OFA SP 980 1979-12-09
## 3: 1980 PAUL SP 985 1980-01-02
## 4: 1980 PENI SP 970 1980-01-02
## 5: 1980 RAE SP 990 1980-02-02
# 4. Using SQL-style syntax with `sqldf`
# This runs a SQL query on the R data frame
Data_storm_sqldf <- sqldf("SELECT * FROM Data_excel WHERE Basin = 'SP'")
# Print the first 5 rows
print(head(Data_storm_sqldf, 5))
## Season Name Basin MinPressure StartDate
## 1 1980 <NA> SP 998 1980-03-27 05:30:00
## 2 1980 OFA SP 980 1979-12-09 05:30:00
## 3 1980 PAUL SP 985 1980-01-02 05:30:00
## 4 1980 PENI SP 970 1980-01-02 05:30:00
## 5 1980 RAE SP 990 1980-02-02 05:30:00
# ------------------------------------------------------------
# STEP 4: Filter NUMERIC Data (e.g., MinPressure > 90)
# ------------------------------------------------------------
# 1. Base R using `subset()`
# Filters rows where MinPressure is greater than 90
Data_storm_base <- subset(Data_excel, MinPressure > 90)
print(head(Data_storm_base, 5))
## # A tibble: 5 × 5
## Season Name Basin MinPressure StartDate
## <dbl> <chr> <chr> <dbl> <dttm>
## 1 1980 ALLEN NA 899 1980-07-31 00:00:00
## 2 1980 BONNIE NA 975 1980-08-14 00:00:00
## 3 1980 CHARLEY NA 989 1980-08-20 00:00:00
## 4 1980 DANIELLE NA 1004 1980-09-04 00:00:00
## 5 1980 EARL NA 985 1980-09-04 00:00:00
# 2. `dplyr` using `filter()`
# Applies the same numeric condition using dplyr
Data_storm_dplyr <- Data_excel %>%
filter(MinPressure > 90)
print(head(Data_storm_dplyr, 5))
## # A tibble: 5 × 5
## Season Name Basin MinPressure StartDate
## <dbl> <chr> <chr> <dbl> <dttm>
## 1 1980 ALLEN NA 899 1980-07-31 00:00:00
## 2 1980 BONNIE NA 975 1980-08-14 00:00:00
## 3 1980 CHARLEY NA 989 1980-08-20 00:00:00
## 4 1980 DANIELLE NA 1004 1980-09-04 00:00:00
## 5 1980 EARL NA 985 1980-09-04 00:00:00
# 3. `data.table` version
# Filters using efficient syntax from data.table
Data_storm_dt <- Data_excel_dt[MinPressure > 90]
print(head(Data_storm_dt, 5))
## Season Name Basin MinPressure StartDate
## <num> <char> <char> <num> <POSc>
## 1: 1980 ALLEN NA 899 1980-07-31
## 2: 1980 BONNIE NA 975 1980-08-14
## 3: 1980 CHARLEY NA 989 1980-08-20
## 4: 1980 DANIELLE NA 1004 1980-09-04
## 5: 1980 EARL NA 985 1980-09-04
# 4. `sqldf` SQL-style syntax
# Filters the data using an SQL WHERE clause
Data_storm_sqldf <- sqldf("SELECT * FROM Data_excel WHERE MinPressure > 90")
print(head(Data_storm_sqldf, 5))
## Season Name Basin MinPressure StartDate
## 1 1980 ALLEN NA 899 1980-07-31 05:30:00
## 2 1980 BONNIE NA 975 1980-08-14 05:30:00
## 3 1980 CHARLEY NA 989 1980-08-20 05:30:00
## 4 1980 DANIELLE NA 1004 1980-09-04 05:30:00
## 5 1980 EARL NA 985 1980-09-04 05:30:00
# ------------------------------------------------------------
# CONCLUSION
# ------------------------------------------------------------
# This script shows how to filter data in R using 4 approaches:
# 1. Base R (subset)
# 2. dplyr (filter)
# 3. data.table (bracket syntax)
# 4. sqldf (SQL queries)
#
# Both CHARACTER (Basin == "SP") and NUMERIC (MinPressure > 90)
# filtering are demonstrated — similar to WHERE clause in SAS.
# ------------------------------------------------------------
operator_comparison <- data.frame(
Type = c("Assignment", "Arithmetic", "Relational", "Logical (Boolean)",
"Membership / IN", "Sequence", "String", "Indexing", "Formula", "Pipe"),
R_Example = c("<-, =, ->", "+, -, *, /, ^", "==, !=, >, <, >=, <=", "&, |, !, &&, ||",
"%in%", ":", "paste(), str_c()", "[ ], [[ ]], $", "y ~ x", "%>%"),
SAS_Equivalent = c("=", "+, -, *, /, **", "=, ^=, >, <, >=, <=", "AND, OR, NOT",
"IN (...)", "DO loops (`DO i = 1 to 5;`)", "||", "dataset.variable",
"MODEL y = x", "N/A"),
Explanation = c("R uses `<-` (preferred) to assign values. SAS uses `=`.",
"Basic math operations. `^` in R = `**` in SAS.",
"R uses `==` for equality (like most languages), SAS uses `=`.",
"R uses `&` and `|` for element-wise logic, SAS uses words.",
"Used to test if a value belongs to a set.",
"R uses `1:5` to create sequences; SAS uses `DO`.",
"R uses functions; SAS uses `||` to concatenate strings.",
"R uses these to access or subset data. SAS uses `.` notation.",
"Used in modeling functions (like `lm()`) in R.",
"No equivalent in SAS; `%>%` passes results to the next function.")
)
datatable(operator_comparison)