#Daniel Hanasab Assignment #3




# Creating three dataframes demonstrating normalization


#Loading the dplyr package, which provides tools for data manipulation.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Denormalized layout: a single wide table that stores every fact together.
# Each row holds a student's ID and name along with the course taken and the
# professor teaching it, so the course/professor pairing is repeated row by row.
denormalized <- local({
  student_ids <- as.numeric(1:4)
  student_names <- c("Alice", "Bob", "Charlie", "David")
  course_taken <- c("Math", "Math", "Science", "Math")
  taught_by <- c("Dr. Smith", "Dr. Smith", "Dr. Brown", "Dr. Smith")

  data.frame(
    StudentID = student_ids,
    Name = student_names,
    Course = course_taken,
    Professor = taught_by
  )
})

# Normalized layout, table 1 of 3: students.
# Normalizing reduces redundancy and improves data integrity; this table keeps
# only each student ID and the name that belongs to it.
students <- data.frame(
  StudentID = as.numeric(seq_len(4)),
  Name = c("Alice", "Bob", "Charlie", "David")
)

# Normalized layout, table 2 of 3: courses.
# Every course gets a unique CourseID and is stored once together with the
# professor who teaches it, which again removes redundancy.
courses <- local({
  course_ids <- c(101, 102)
  course_names <- c("Math", "Science")
  instructors <- c("Dr. Smith", "Dr. Brown")

  data.frame(
    CourseID = course_ids,
    CourseName = course_names,
    Professor = instructors
  )
})


# Normalized layout, table 3 of 3: enrollments (junction table).
# Each row pairs a StudentID with a CourseID, linking the students table to the
# courses table. A course name or professor can now be changed in one place
# (the courses table) instead of in several rows.
enrollments <- data.frame(
  StudentID = as.numeric(1:4),
  CourseID = c(101, 101, 102, 101)
)


# Show the three normalized tables one after another (students, courses,
# enrollments). invisible() keeps the list returned by lapply() from being
# echoed after the tables themselves have been printed.
invisible(lapply(list(students, courses, enrollments), print))
##   StudentID    Name
## 1         1   Alice
## 2         2     Bob
## 3         3 Charlie
## 4         4   David
##   CourseID CourseName Professor
## 1      101       Math Dr. Smith
## 2      102    Science Dr. Brown
##   StudentID CourseID
## 1         1      101
## 2         2      101
## 3         3      102
## 4         4      101

# Character Manipulation

## Finding Majors with "DATA" or "STATISTICS"

#stringr is a package for handling text and string operations.
library(stringr)


# Fetch FiveThirtyEight's list of college majors directly from GitHub and
# parse the CSV into a data frame.
majors <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
)

# Keep only the majors whose name mentions "DATA" or "STATISTICS".
# toupper(majors$Major) upper-cases every major name first, so the search does
# not depend on how the names are capitalized.
# grepl("DATA|STATISTICS", ...) flags the names containing either word, and the
# resulting TRUE/FALSE mask selects the matching rows of `majors`.

data_stats_majors <- majors[
  grepl("DATA|STATISTICS", toupper(majors$Major)),
]

data_stats_majors
##    FOD1P                                         Major          Major_Category
## 44  6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS                Business
## 52  2101      COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 59  3702               STATISTICS AND DECISION SCIENCE Computers & Mathematics
# Regular Expressions

## Explanation of Expressions

#(.) → Captures any single character.
#\1\1 → Matches that same character two more times.
#(.)(.) → Captures two characters separately.
#\\2\\1 → Matches the second captured character again, followed by the first captured character (i.e., the captured pair in reverse order).

# (.)\1\1 matches any character repeated three times (e.g., "aaa", "111").
# "(.)(.)\\2\\1" matches a four-character palindrome (e.g., "abba").
# (..)\1 matches two-character sequences that repeat (e.g., "tata", "bobo").
# "(.).\\1.\\1" matches the same character at positions 1, 3 and 5 of a five-character span, with any single character in between (e.g., "abaca" or "a_a_a").
# "(.)(.)(.).*\\3\\2\\1" matches sequences where three characters appear in reverse order later in the string (e.g., "xyz...zyx").


## Constructing Regular Expressions
# Words that start and end with the same character
#
# ^        → Start of the string.
# (.)      → Captures the first character.
# (.*\\1)? → Optionally: any characters, followed by the captured character
#            again. Because this group is optional, a one-character word
#            (which trivially starts and ends with the same character) also
#            matches.
# $        → End of the string.
#
# Inside an R string literal the backreference must be written "\\1": a bare
# "\1" is read as an octal escape (the character with code 1), not as a
# backreference. The pattern also must not contain literal spaces, otherwise
# only words that themselves contain spaces could match.
pattern1 <- "^(.)(.*\\1)?$"


# Words that contain a repeated pair of letters
#
# (..) → Captures two consecutive characters.
# .*   → Allows any characters in between.
# \\1  → Requires the same two-character sequence to appear again.
pattern2 <- "(..).*\\1"


# Words with one letter repeated at least three times
#
# (.)         → Captures a single character.
# .*\\1.*\\1  → Requires that character to appear two more times later on,
#               i.e. at least three times in total.
pattern3 <- "(.).*\\1.*\\1"

# Test with sample words
sample_words <- c("level", "church", "eleven", "test", "banana")


# Find the words that match each pattern.
# perl = TRUE selects the PCRE engine. These patterns combine backreferences
# with backtracking, and with the default engine "banana" was not reported for
# pattern2 / pattern3 even though it contains "an" twice and "a" three times.
matches1 <- grep(pattern1, sample_words, value = TRUE, perl = TRUE)
matches2 <- grep(pattern2, sample_words, value = TRUE, perl = TRUE)
matches3 <- grep(pattern3, sample_words, value = TRUE, perl = TRUE)

list(StartEndSame = matches1, RepeatedPair = matches2, ThreeRepeats = matches3)
## $StartEndSame
## [1] "level" "test"
##
## $RepeatedPair
## [1] "church" "banana"
##
## $ThreeRepeats
## [1] "eleven" "banana"