Here I have three tables which demonstrate UNF, INF, 2NF and 3NF.
# Unnormalized Data (UNF)
unnormalized_data <- tibble(
EmployeeID = c(1, 2, 3, 4),
Name = c("Alice", "Bob", "Yvonne", "Emily"),
Department = c("HR", "Finance", "IT", "Management"),
Projects = c("Recruitment, Payroll", "Budgeting, Forecasting", "Software, database", "Commissioning, Opening"),
Skills = c("Communication, Management", "Analysis, Excel", "Development, testing", "Resource, Planning")
)
print(unnormalized_data)## # A tibble: 4 × 5
## EmployeeID Name Department Projects Skills
## <dbl> <chr> <chr> <chr> <chr>
## 1 1 Alice HR Recruitment, Payroll Communication, Management
## 2 2 Bob Finance Budgeting, Forecasting Analysis, Excel
## 3 3 Yvonne IT Software, database Development, testing
## 4 4 Emily Management Commissioning, Opening Resource, Planning
1NF
# Data in First Normal form (1NF)
first_normal_form <- tibble(
EmployeeID = c(1, 1, 2, 2, 3, 3, 4, 4),
Name = c("Alice", "Alice", "Bob", "Bob", "Yvonne", "Yvonne", "Emily", "Emily"),
Department = c("HR", "HR", "Finance", "Finance", "IT", "IT", "Management", "Management"),
Project = c("Recruitment", "Payroll", "Budgeting", "Forecasting", "Software", "Database", "Commissioning", "Opening"),
Skill = c("Communication", "Management", "Analysis", "Excel", "Development", "Testing", "Resource", "Planning")
)
print(first_normal_form)## # A tibble: 8 × 5
## EmployeeID Name Department Project Skill
## <dbl> <chr> <chr> <chr> <chr>
## 1 1 Alice HR Recruitment Communication
## 2 1 Alice HR Payroll Management
## 3 2 Bob Finance Budgeting Analysis
## 4 2 Bob Finance Forecasting Excel
## 5 3 Yvonne IT Software Development
## 6 3 Yvonne IT Database Testing
## 7 4 Emily Management Commissioning Resource
## 8 4 Emily Management Opening Planning
2NF
# Create Employee-Department table
employee_department <- tibble(
EmployeeID = c(1, 2, 3, 4),
Name = c("Alice", "Bob", "Yvonne", "Emily"),
Department = c("HR", "Finance", "IT", "Management")
)
# Create Employee-Project table
employee_project <- tibble(
EmployeeID = c(1, 1, 2, 2, 3, 3, 4, 4),
Project = c("Recruitment", "Payroll", "Budgeting", "Forecasting", "Software", "Database", "Comissioning", "Opening")
)
# Create Employee-Skill table
employee_skill <- tibble(
EmployeeID = c(1, 1, 2, 2, 3, 3, 4, 4),
Skill = c("Communication", "Management", "Analysis", "Excel", "Development", "Testing", "Resource", "Planning")
)
print(employee_department)## # A tibble: 4 × 3
## EmployeeID Name Department
## <dbl> <chr> <chr>
## 1 1 Alice HR
## 2 2 Bob Finance
## 3 3 Yvonne IT
## 4 4 Emily Management
## # A tibble: 8 × 2
## EmployeeID Project
## <dbl> <chr>
## 1 1 Recruitment
## 2 1 Payroll
## 3 2 Budgeting
## 4 2 Forecasting
## 5 3 Software
## 6 3 Database
## 7 4 Comissioning
## 8 4 Opening
## # A tibble: 8 × 2
## EmployeeID Skill
## <dbl> <chr>
## 1 1 Communication
## 2 1 Management
## 3 2 Analysis
## 4 2 Excel
## 5 3 Development
## 6 3 Testing
## 7 4 Resource
## 8 4 Planning
3NF
# Create Employee table
employee <- tibble(
EmployeeID = c(1, 2, 3, 4),
Name = c("Alice", "Bob", "Yvonne", "Emily")
)
# Create Department table
department <- tibble(
EmployeeID = c(1, 2, 3, 4),
Department = c("HR", "Finance", "IT", "Management")
)
# Create Project table
project <- tibble(
EmployeeID = c(1, 1, 2, 2, 3, 3, 4, 4),
Project = c("Recruitment", "Payroll", "Budgeting", "Forecasting", "Software", "Database", "Commissioning", "Opening")
)
# Create Skill table
skill <- tibble(
EmployeeID = c(1, 1, 2, 2, 3, 3, 4, 4),
Skill = c("Communication", "Management", "Analysis", "Excel", "Development", "Testing", "Resource", "Planning")
)
print(employee)## # A tibble: 4 × 2
## EmployeeID Name
## <dbl> <chr>
## 1 1 Alice
## 2 2 Bob
## 3 3 Yvonne
## 4 4 Emily
## # A tibble: 4 × 2
## EmployeeID Department
## <dbl> <chr>
## 1 1 HR
## 2 2 Finance
## 3 3 IT
## 4 4 Management
## # A tibble: 8 × 2
## EmployeeID Project
## <dbl> <chr>
## 1 1 Recruitment
## 2 1 Payroll
## 3 2 Budgeting
## 4 2 Forecasting
## 5 3 Software
## 6 3 Database
## 7 4 Commissioning
## 8 4 Opening
## # A tibble: 8 × 2
## EmployeeID Skill
## <dbl> <chr>
## 1 1 Communication
## 2 1 Management
## 3 2 Analysis
## 4 2 Excel
## 5 3 Development
## 6 3 Testing
## 7 4 Resource
## 8 4 Planning
Identifying majors that contain either “DATA” or “STATISTICS”
# Install and load necessary packages
library(dplyr)
library(stringr)
# Load the dataset from the url given
url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
majors_data <- read.csv(url)
# Display the first few rows of the dataset to understand its structure
head(majors_data)## FOD1P Major Major_Category
## 1 1100 GENERAL AGRICULTURE Agriculture & Natural Resources
## 2 1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources
## 3 1102 AGRICULTURAL ECONOMICS Agriculture & Natural Resources
## 4 1103 ANIMAL SCIENCES Agriculture & Natural Resources
## 5 1104 FOOD SCIENCE Agriculture & Natural Resources
## 6 1105 PLANT SCIENCE AND AGRONOMY Agriculture & Natural Resources
# Filter majors that contain either "DATA" or "STATISTICS" as requested
filtered_majors <- majors_data %>%
filter(str_detect(Major, "DATA|STATISTICS"))
# Display the filtered majors based on the above criteria
print(filtered_majors)## FOD1P Major Major_Category
## 1 6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS Business
## 2 2101 COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 3 3702 STATISTICS AND DECISION SCIENCE Computers & Mathematics
(.)\1\1: This expression matches any character that appears three times in a row
“(.)(.)\2\1”: This expression matches a four-character string where the first two characters are mirrored by the second two characters in reverse order.
(..)\1: This expression matches any four-character string where the first two characters are repeated.
“(.).\1.\1”: This expression matches any five-character string where the first, third, and fifth characters are the same, and the second and fourth characters can be any character.
“(.)(.)(.).*\3\2\1” : This expression matches any string where three consecutive characters are followed somewhere later in the string (not necessarily immediately) by those three characters in reverse order.
Start and end with the same character: ^(.)\1.*\1$
Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.): ().*\1
Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.): (.(.).\2.\2.)