Installing Required Packages

# Create a vector packages with the names of all required packages. uncomment as needed. This step has to be done only once.

# install.packages(c("libr", "readxl" , "dplyr", "sqldf","stringr","haven","DT"))

packages <- c("libr", "readxl" , "dplyr", "sqldf","stringr","haven","DT")

# Use lapply() to loop over each package and load it with library().
# character.only = TRUE tells library() to treat the name as a string (character) rather than a variable name.

lapply(packages, library, character.only = TRUE)
## Loading required package: common
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## [[1]]
## [1] "libr"      "common"    "stats"     "graphics"  "grDevices" "utils"    
## [7] "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "readxl"    "libr"      "common"    "stats"     "graphics"  "grDevices"
##  [7] "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "dplyr"     "readxl"    "libr"      "common"    "stats"     "graphics" 
##  [7] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[4]]
##  [1] "sqldf"     "RSQLite"   "gsubfn"    "proto"     "dplyr"     "readxl"   
##  [7] "libr"      "common"    "stats"     "graphics"  "grDevices" "utils"    
## [13] "datasets"  "methods"   "base"     
## 
## [[5]]
##  [1] "stringr"   "sqldf"     "RSQLite"   "gsubfn"    "proto"     "dplyr"    
##  [7] "readxl"    "libr"      "common"    "stats"     "graphics"  "grDevices"
## [13] "utils"     "datasets"  "methods"   "base"     
## 
## [[6]]
##  [1] "haven"     "stringr"   "sqldf"     "RSQLite"   "gsubfn"    "proto"    
##  [7] "dplyr"     "readxl"    "libr"      "common"    "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[7]]
##  [1] "DT"        "haven"     "stringr"   "sqldf"     "RSQLite"   "gsubfn"   
##  [7] "proto"     "dplyr"     "readxl"    "libr"      "common"    "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"

Operators in R

Arithmetic Operators

Logical Operators

Comparison (Boolean) Operators

Colon Operator

Pipe Operator

# =====================================================================
# 📄 SCRIPT: Filtering Data in R using Different Methods (with Explanations)
# =====================================================================

# ------------------------------------------------------------
# STEP 1: Load Required Libraries
# ------------------------------------------------------------

# These libraries provide different approaches to manipulate and query data:
library(readxl)     # To read Excel files (.xlsx)
library(dplyr)      # For tidyverse-style data manipulation (e.g., filter, select)
library(data.table) # For fast and memory-efficient data handling
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(sqldf)      # Allows SQL-style queries on data frames

# -----------------------------------------------------------
# STEP 2: Load Data from an Excel File
# ------------------------------------------------------------

# Define the full file path to the Excel file (Needs change once you upload the data to local disk)
pathname_xl <- "D://SAS COURSES//Essentials 1//LWPG1V2//LWPG1V2//data//pg1v2//storm.xlsx"

# Read the Excel file into an R data frame
Data_ex <- read_excel(pathname_xl)

# Select only the relevant columns from the dataset
# (like doing KEEP statement in SAS or SELECT in SQL)
Data_excel <- Data_ex %>%
  select(Season, Name, Basin, MinPressure, StartDate)

# ------------------------------------------------------------
# STEP 3: Filter CHARACTER Data (e.g., Basin == "SP")
# ------------------------------------------------------------

# 1. Using Base R's `subset()` function
# Filters rows where Basin equals "SP"
Data_storm_base <- subset(Data_excel, Basin == "SP")

# Show the first 5 rows of filtered data
print(head(Data_storm_base, 5))
## # A tibble: 5 × 5
##   Season Name  Basin MinPressure StartDate          
##    <dbl> <chr> <chr>       <dbl> <dttm>             
## 1   1980 <NA>  SP            998 1980-03-27 00:00:00
## 2   1980 OFA   SP            980 1979-12-09 00:00:00
## 3   1980 PAUL  SP            985 1980-01-02 00:00:00
## 4   1980 PENI  SP            970 1980-01-02 00:00:00
## 5   1980 RAE   SP            990 1980-02-02 00:00:00
# 2. Using `dplyr` and the `filter()` function
# `%>%` is the pipe operator (reads as "then")
Data_storm_dplyr <- Data_excel %>%
  filter(Basin == "SP")  # Keeps only rows where Basin equals "SP"

# Print first 5 rows of the result
print(head(Data_storm_dplyr, 5))
## # A tibble: 5 × 5
##   Season Name  Basin MinPressure StartDate          
##    <dbl> <chr> <chr>       <dbl> <dttm>             
## 1   1980 <NA>  SP            998 1980-03-27 00:00:00
## 2   1980 OFA   SP            980 1979-12-09 00:00:00
## 3   1980 PAUL  SP            985 1980-01-02 00:00:00
## 4   1980 PENI  SP            970 1980-01-02 00:00:00
## 5   1980 RAE   SP            990 1980-02-02 00:00:00
#  3. Using `data.table` for efficient filtering
# Convert the data frame to a data.table object
Data_excel_dt <- as.data.table(Data_excel)

# Filter rows where Basin == "SP"
Data_storm_dt <- Data_excel_dt[Basin == "SP"]

# Display the result
print(head(Data_storm_dt, 5))
##    Season   Name  Basin MinPressure  StartDate
##     <num> <char> <char>       <num>     <POSc>
## 1:   1980   <NA>     SP         998 1980-03-27
## 2:   1980    OFA     SP         980 1979-12-09
## 3:   1980   PAUL     SP         985 1980-01-02
## 4:   1980   PENI     SP         970 1980-01-02
## 5:   1980    RAE     SP         990 1980-02-02
#  4. Using SQL-style syntax with `sqldf`
# This runs a SQL query on the R data frame
Data_storm_sqldf <- sqldf("SELECT * FROM Data_excel WHERE Basin = 'SP'")

# Print the first 5 rows
print(head(Data_storm_sqldf, 5))
##   Season Name Basin MinPressure           StartDate
## 1   1980 <NA>    SP         998 1980-03-27 05:30:00
## 2   1980  OFA    SP         980 1979-12-09 05:30:00
## 3   1980 PAUL    SP         985 1980-01-02 05:30:00
## 4   1980 PENI    SP         970 1980-01-02 05:30:00
## 5   1980  RAE    SP         990 1980-02-02 05:30:00
# ------------------------------------------------------------
# STEP 4: Filter NUMERIC Data (e.g., MinPressure > 90)
# ------------------------------------------------------------

# 1. Base R using `subset()`
# Filters rows where MinPressure is greater than 90
Data_storm_base <- subset(Data_excel, MinPressure > 90)
print(head(Data_storm_base, 5))
## # A tibble: 5 × 5
##   Season Name     Basin MinPressure StartDate          
##    <dbl> <chr>    <chr>       <dbl> <dttm>             
## 1   1980 ALLEN    NA            899 1980-07-31 00:00:00
## 2   1980 BONNIE   NA            975 1980-08-14 00:00:00
## 3   1980 CHARLEY  NA            989 1980-08-20 00:00:00
## 4   1980 DANIELLE NA           1004 1980-09-04 00:00:00
## 5   1980 EARL     NA            985 1980-09-04 00:00:00
# 2. `dplyr` using `filter()`
# Applies the same numeric condition using dplyr
Data_storm_dplyr <- Data_excel %>%
  filter(MinPressure > 90)
print(head(Data_storm_dplyr, 5))
## # A tibble: 5 × 5
##   Season Name     Basin MinPressure StartDate          
##    <dbl> <chr>    <chr>       <dbl> <dttm>             
## 1   1980 ALLEN    NA            899 1980-07-31 00:00:00
## 2   1980 BONNIE   NA            975 1980-08-14 00:00:00
## 3   1980 CHARLEY  NA            989 1980-08-20 00:00:00
## 4   1980 DANIELLE NA           1004 1980-09-04 00:00:00
## 5   1980 EARL     NA            985 1980-09-04 00:00:00
# 3. `data.table` version
# Filters using efficient syntax from data.table
Data_storm_dt <- Data_excel_dt[MinPressure > 90]
print(head(Data_storm_dt, 5))
##    Season     Name  Basin MinPressure  StartDate
##     <num>   <char> <char>       <num>     <POSc>
## 1:   1980    ALLEN     NA         899 1980-07-31
## 2:   1980   BONNIE     NA         975 1980-08-14
## 3:   1980  CHARLEY     NA         989 1980-08-20
## 4:   1980 DANIELLE     NA        1004 1980-09-04
## 5:   1980     EARL     NA         985 1980-09-04
# 4. `sqldf` SQL-style syntax
# Filters the data using an SQL WHERE clause
Data_storm_sqldf <- sqldf("SELECT * FROM Data_excel WHERE MinPressure > 90")
print(head(Data_storm_sqldf, 5))
##   Season     Name Basin MinPressure           StartDate
## 1   1980    ALLEN    NA         899 1980-07-31 05:30:00
## 2   1980   BONNIE    NA         975 1980-08-14 05:30:00
## 3   1980  CHARLEY    NA         989 1980-08-20 05:30:00
## 4   1980 DANIELLE    NA        1004 1980-09-04 05:30:00
## 5   1980     EARL    NA         985 1980-09-04 05:30:00
# ------------------------------------------------------------
# CONCLUSION
# ------------------------------------------------------------
# This script shows how to filter data in R using 4 approaches:
#   1. Base R (subset)
#   2. dplyr (filter)
#   3. data.table (bracket syntax)
#   4. sqldf (SQL queries)
#
# Both CHARACTER (Basin == "SP") and NUMERIC (MinPressure > 90)
# filtering are demonstrated — similar to WHERE clause in SAS.
# ------------------------------------------------------------

R vs SAS Comparison

operator_comparison <- data.frame(
  Type = c("Assignment", "Arithmetic", "Relational", "Logical (Boolean)", 
           "Membership / IN", "Sequence", "String", "Indexing", "Formula", "Pipe"),
  R_Example = c("<-, =, ->", "+, -, *, /, ^", "==, !=, >, <, >=, <=", "&, |, !, &&, ||", 
                "%in%", ":", "paste(), str_c()", "[ ], [[ ]], $", "y ~ x", "%>%"),
  SAS_Equivalent = c("=", "+, -, *, /, **", "=, ^=, >, <, >=, <=", "AND, OR, NOT", 
                     "IN (...)", "DO loops (`DO i = 1 to 5;`)", "||", "dataset.variable", 
                     "MODEL y = x", "N/A"),
  Explanation = c("R uses `<-` (preferred) to assign values. SAS uses `=`.",
                  "Basic math operations. `^` in R = `**` in SAS.",
                  "R uses `==` for equality (like most languages), SAS uses `=`.",
                  "R uses `&` and `|` for element-wise logic, SAS uses words.",
                  "Used to test if a value belongs to a set.",
                  "R uses `1:5` to create sequences; SAS uses `DO`.",
                  "R uses functions; SAS uses `||` to concatenate strings.",
                  "R uses these to access or subset data. SAS uses `.` notation.",
                  "Used in modeling functions (like `lm()`) in R.",
                  "No equivalent in SAS; `%>%` passes results to the next function.")
)


datatable(operator_comparison)