#Daniel Hanasab Assignment #3
# Creating three dataframes demonstrating normalization
#Loading the dplyr package, which provides tools for data manipulation.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Denormalized table: every fact lives in one data frame, so the course name
# and the professor teaching it are repeated on each student's row.
denormalized <- data.frame(
  StudentID = c(1, 2, 3, 4),
  Name      = c("Alice", "Bob", "Charlie", "David"),
  Course    = c("Math", "Math", "Science", "Math"),
  Professor = c("Dr. Smith", "Dr. Smith", "Dr. Brown", "Dr. Smith")
)
# Normalized tables: the data is split up to reduce redundancy and improve
# data integrity. `students` holds only each student ID and the name that
# belongs to it.
students <- data.frame(
  StudentID = c(1, 2, 3, 4),
  Name      = c("Alice", "Bob", "Charlie", "David")
)
# One row per course: a unique CourseID, the course name, and the professor
# who teaches it. Each course/professor pairing is stored exactly once.
courses <- data.frame(
  CourseID   = c(101, 102),
  CourseName = c("Math", "Science"),
  Professor  = c("Dr. Smith", "Dr. Brown")
)
# Junction table: links each StudentID to the CourseID the student is enrolled
# in. A course's name or professor can now be changed in a single `courses`
# row instead of on every enrolled student's row.
enrollments <- data.frame(
  StudentID = c(1, 2, 3, 4),
  CourseID  = c(101, 101, 102, 101)
)
# Print the three normalized tables one after another (students, courses,
# enrollments). invisible() keeps lapply()'s return value from being echoed.
invisible(lapply(list(students, courses, enrollments), print))
## StudentID Name
## 1 1 Alice
## 2 2 Bob
## 3 3 Charlie
## 4 4 David
## CourseID CourseName Professor
## 1 101 Math Dr. Smith
## 2 102 Science Dr. Brown
## StudentID CourseID
## 1 1 101
## 2 2 101
## 3 3 102
## 4 4 101
# Character Manipulation
## Finding Majors with "DATA" or "STATISTICS"
#stringr is a package for handling text and string operations.
library(stringr)
# Fetch FiveThirtyEight's list of college majors straight from GitHub.
majors <- read.csv(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
)
# Upper-case the major names so capitalization cannot affect the search, then
# keep only the rows whose name contains "DATA" or "STATISTICS".
data_stats_majors <- majors[
  grepl("DATA|STATISTICS", toupper(majors[["Major"]])),
]
data_stats_majors
## FOD1P Major Major_Category
## 44 6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS Business
## 52 2101 COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 59 3702 STATISTICS AND DECISION SCIENCE Computers & Mathematics
# Regular Expressions
## Explanation of Expressions
#(.) → Captures any single character.
#\1\1 → Matches that same character two more times.
#(.)(.) → Captures two characters separately.
#\\2\\1 → Matches the second captured character again, followed by the first captured character.
# (.)\1\1 matches any character repeated three times (e.g., "aaa", "111").
# "(.)(.)\\2\\1" matches a four-character palindrome (e.g., "abba").
# (..)\1 matches two-character sequences that repeat (e.g., "tata", "bobo").
# "(.).\\1.\\1" matches patterns like "a_a_a" or "abaca", where the same character appears three times with exactly one arbitrary character between occurrences.
# "(.)(.)(.).*\\3\\2\\1" matches sequences where three characters appear in reverse order later in the string (e.g., "xyz...zyx").
## Constructing Regular Expressions
# Words that start and end with the same character.
#   ^     -> start of the string
#   (.)   -> capture the first character
#   .*    -> any characters in between
#   \\1$  -> the captured character again, at the very end of the string
# The backreference has to be written "\\1" inside an R string literal: a bare
# "\1" is an octal escape for the control character U+0001, not a
# backreference. The pattern also contains no spaces, because whitespace in a
# regex matches literal spaces. Note that it needs at least two characters,
# so a one-character string does not match.
pattern1 <- "^(.).*\\1$"

# Words that contain a repeated pair of characters.
#   (..)  -> capture two consecutive characters
#   .*    -> any characters in between
#   \\1   -> the same two-character sequence again
pattern2 <- "(..).*\\1"

# Words with one character repeated at least three times.
#   (.)        -> capture a single character
#   .*\\1.*\\1 -> that character occurs two more times later in the string
pattern3 <- "(.).*\\1.*\\1"

# Sample words to test the patterns against.
sample_words <- c("level", "church", "eleven", "test", "banana")

# Collect the words matching each pattern (value = TRUE returns the words
# themselves rather than their positions).
matches1 <- grep(pattern1, sample_words, value = TRUE)
matches2 <- grep(pattern2, sample_words, value = TRUE)
matches3 <- grep(pattern3, sample_words, value = TRUE)
list(StartEndSame = matches1, RepeatedPair = matches2, ThreeRepeats = matches3)
## $StartEndSame
## [1] "level" "test"
##
## $RepeatedPair
## [1] "church" "banana"
##
## $ThreeRepeats
## [1] "eleven" "banana"