Normalization
Provide an example of at least three dataframes in R that demonstrate normalization. The dataframes can contain any data, either real or synthetic. Although normalization is typically done in SQL and relational databases, you are expected to show this example in R, as it is our main work environment in this course.
student_denormalized_data <- data.frame(
Student_ID = c(1, 1, 1, 2, 2, 3, 3, 4, 5),
Student_Name = c("Amy", "Amy", "Amy", "Jim","Jim", "Fiona","Fiona", "Jack", "Bob"),
Major = c("Computer Science", "Math", "Music", "Chemistry", "Biology", "Music", "Math", "Math", "Physics" ),
Course_ID = c("CS_101", "MATH_101", "MUSIC_101", "CH_101", "BIO_101","MUSIC_101", "MATH_101", "MATH_101", "PH_101"),
Grade = c("A", "A", "B", "B", "A", "A", "A","B","C"),
Fee = c("5000", "4000", "4000", "5000", "4000", "4000", "4000", "4000", "3000"),
Teacher_ID = c ("001","002","003","004","005","003","002", "002","006"),
Teacher_Name = c ("Miss Darlene", "Ms Rose", "Mr Jack", "Mr Sam", "Ms Lori", "Mr Jack", "Ms Rose", "Ms Rose", "Mr Thomas")
)
print(student_denormalized_data)
## Student_ID Student_Name Major Course_ID Grade Fee Teacher_ID
## 1 1 Amy Computer Science CS_101 A 5000 001
## 2 1 Amy Math MATH_101 A 4000 002
## 3 1 Amy Music MUSIC_101 B 4000 003
## 4 2 Jim Chemistry CH_101 B 5000 004
## 5 2 Jim Biology BIO_101 A 4000 005
## 6 3 Fiona Music MUSIC_101 A 4000 003
## 7 3 Fiona Math MATH_101 A 4000 002
## 8 4 Jack Math MATH_101 B 4000 002
## 9 5 Bob Physics PH_101 C 3000 006
## Teacher_Name
## 1 Miss Darlene
## 2 Ms Rose
## 3 Mr Jack
## 4 Mr Sam
## 5 Ms Lori
## 6 Mr Jack
## 7 Ms Rose
## 8 Ms Rose
## 9 Mr Thomas
Students <- data.frame(
Student_ID = unique(student_denormalized_data$Student_ID),
Student_Name = unique(student_denormalized_data$Student_Name)
)
Students
Student_Course <- data.frame(
Student_ID = student_denormalized_data$Student_ID,
Major = student_denormalized_data$Major,
Course_ID = student_denormalized_data$Course_ID,
Grade = student_denormalized_data$Grade
)
print(Student_Course)
## Student_ID Major Course_ID Grade
## 1 1 Computer Science CS_101 A
## 2 1 Math MATH_101 A
## 3 1 Music MUSIC_101 B
## 4 2 Chemistry CH_101 B
## 5 2 Biology BIO_101 A
## 6 3 Music MUSIC_101 A
## 7 3 Math MATH_101 A
## 8 4 Math MATH_101 B
## 9 5 Physics PH_101 C
courses_fee <- data.frame(
Course_ID = unique(student_denormalized_data$Course_ID),
Major = unique(student_denormalized_data$Major),
Fee = unique(student_denormalized_data$Fee),
Teacher_Name = unique(student_denormalized_data$Teacher_ID)
)
print(courses_fee)
## Course_ID Major Fee Teacher_Name
## 1 CS_101 Computer Science 5000 001
## 2 MATH_101 Math 4000 002
## 3 MUSIC_101 Music 3000 003
## 4 CH_101 Chemistry 5000 004
## 5 BIO_101 Biology 4000 005
## 6 PH_101 Physics 3000 006
teachers <- data.frame(
Teacher_ID = unique(student_denormalized_data$Teacher_ID),
Teacher_Name = unique(student_denormalized_data$Teacher_Name)
)
print(teachers)
## Teacher_ID Teacher_Name
## 1 001 Miss Darlene
## 2 002 Ms Rose
## 3 003 Mr Jack
## 4 004 Mr Sam
## 5 005 Ms Lori
## 6 006 Mr Thomas
Character Manipulation
#read cvs
majors_data <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/refs/heads/master/college-majors/majors-list.csv")
head(majors_data)
#filter the data
majors_filtered <- majors_data[grep("DATA|STATISTICS", majors_data$Major, ignore.case = TRUE), ]
majors_filtered
(.)\1\1 #(.) means any single character, \1\1 means match the same character twice more. Example “aaa”,“bbb”
“(.)(.)\2\1” #This is for example “abba”, “wccw”, first(.) captures the first character, second (.) captures the second character. \2 match the second captured character, and \1 match the first captured character again.
(..)\1 #This is for example “abab”,“wcwc”. (..) captures two characters, and \1 try to repeat the captured again.
“(.).\1.\1” #(.) means any single character, and . means any character, \1 match the first captured character and so on. Example in this case “acaca”
“(.)(.)(.).\3\2\1” #(.) captures first, second, and third characters. \3\2\1 matches the third, second, and first captured characters. ”.” means matches any sequence characters.
Start and end with the same character. (.)\1
Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.) (..)\1
Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)
(.)\1\1\1