Unnormalized Data: Contains non-atomic values.
# Unnormalized data
df_unnormalized <- data.frame(
OrderID = c(1, 2, 3),
CustomerName = c("Alice", "Bob", "Charlie"),
Items = c("Laptop, Mouse", "Tablet", "Phone, Headphones"),
TotalPrice = c(1500, 600, 1000),
Address = c("123 Elm St", "456 Oak St", "789 Pine St")
)
print("Unnormalized Data:")
## [1] "Unnormalized Data:"
print(df_unnormalized)
## OrderID CustomerName Items TotalPrice Address
## 1 1 Alice Laptop, Mouse 1500 123 Elm St
## 2 2 Bob Tablet 600 456 Oak St
## 3 3 Charlie Phone, Headphones 1000 789 Pine St
First Normal Form (1NF): Ensures atomic values, no repeating groups
# First Normal Form (1NF)
df_1NF <- data.frame(
OrderID = c(1, 1, 2, 3, 3),
CustomerName = c("Alice", "Alice", "Bob", "Charlie", "Charlie"),
Item = c("Laptop", "Mouse", "Tablet", "Phone", "Headphones"),
TotalPrice = c(1000, 500, 600, 800, 200),
Address = c("123 Elm St", "123 Elm St", "456 Oak St", "789 Pine St", "789 Pine St")
)
print("First Normal Form (1NF):")
## [1] "First Normal Form (1NF):"
print(df_1NF)
## OrderID CustomerName Item TotalPrice Address
## 1 1 Alice Laptop 1000 123 Elm St
## 2 1 Alice Mouse 500 123 Elm St
## 3 2 Bob Tablet 600 456 Oak St
## 4 3 Charlie Phone 800 789 Pine St
## 5 3 Charlie Headphones 200 789 Pine St
Second Normal Form (2NF): Eliminates partial dependencies by separating data into customer and order tables.
# Second Normal Form (2NF)
df_customers <- data.frame(
CustomerID = c(1, 2, 3),
CustomerName = c("Alice", "Bob", "Charlie"),
Address = c("123 Elm St", "456 Oak St", "789 Pine St")
)
df_orders <- data.frame(
OrderID = c(1, 1, 2, 3, 3),
CustomerID = c(1, 1, 2, 3, 3), # Foreign key reference to customers
Item = c("Laptop", "Mouse", "Tablet", "Phone", "Headphones"),
TotalPrice = c(1000, 500, 600, 800, 200)
)
print("Customers Table:")
## [1] "Customers Table:"
print(df_customers)
## CustomerID CustomerName Address
## 1 1 Alice 123 Elm St
## 2 2 Bob 456 Oak St
## 3 3 Charlie 789 Pine St
print("Orders Table:")
## [1] "Orders Table:"
print(df_orders)
## OrderID CustomerID Item TotalPrice
## 1 1 1 Laptop 1000
## 2 1 1 Mouse 500
## 3 2 2 Tablet 600
## 4 3 3 Phone 800
## 5 3 3 Headphones 200
Third Normal Form (3NF): Ensures no transitive dependencies (already satisfied in this case).
print("Final Tables in Third Normal Form (3NF):")
## [1] "Final Tables in Third Normal Form (3NF):"
print("Customers Table:")
## [1] "Customers Table:"
print(df_customers)
## CustomerID CustomerName Address
## 1 1 Alice 123 Elm St
## 2 2 Bob 456 Oak St
## 3 3 Charlie 789 Pine St
print("Orders Table:")
## [1] "Orders Table:"
print(df_orders)
## OrderID CustomerID Item TotalPrice
## 1 1 1 Laptop 1000
## 2 1 1 Mouse 500
## 3 2 2 Tablet 600
## 4 3 3 Phone 800
## 5 3 3 Headphones 200
Character Manipulation
# Load required libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
# Load the dataset (replace with the actual path)
df_majors <- read.csv("/Users/aribarazzaq/Downloads/majors-list.csv", stringsAsFactors = FALSE)
# Display the first few rows and check the column names
print(head(df_majors))
## FOD1P Major Major_Category
## 1 1100 GENERAL AGRICULTURE Agriculture & Natural Resources
## 2 1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources
## 3 1102 AGRICULTURAL ECONOMICS Agriculture & Natural Resources
## 4 1103 ANIMAL SCIENCES Agriculture & Natural Resources
## 5 1104 FOOD SCIENCE Agriculture & Natural Resources
## 6 1105 PLANT SCIENCE AND AGRONOMY Agriculture & Natural Resources
print(colnames(df_majors))
## [1] "FOD1P" "Major" "Major_Category"
# Assuming the column name for majors is 'Major' (or adjust as needed)
majors_filtered <- df_majors %>%
filter(str_detect(Major, "DATA|STATISTICS"))
# Display the filtered majors
print(majors_filtered)
## FOD1P Major Major_Category
## 1 6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS Business
## 2 2101 COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 3 3702 STATISTICS AND DECISION SCIENCE Computers & Mathematics
(.)\1\1 = triplets of the same character “(.)(.)\2\1” = four-character palindrome structure (..)\1 = two-character sequences that repeat “(.).\1.\1” = character repeated with one in between “(.)(.)(.).*\3\2\1” = same three characters in reverse order, possibly with other characters in between.
Start and end with the same character: ^(.)\1$ example will be radar
Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.) (?i)(..).*\1 example will be banana
Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.) (?i)(.)(?=.\1.\1) example will be committee