#load libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#Part 1: Provide an example of at least three dataframes in R that demonstrate normalization. The dataframes can contain any data, either real or synthetic. Although normalization is typically done in SQL and relational databases, you are expected to show this example in R, as it is our main work environment in this course.

# Denormalized grocery list: one row per listed item, so a product's name,
# cost, sell price and category are repeated on every row for that product.
# rep(..., times = ) spells out how often each product occurs (3, 2, 2, 1, 1).
grocerylist_denormalized <- data.frame(
  productID = rep(c(1, 2, 3, 4, 5), times = c(3, 2, 2, 1, 1)),
  productName = rep(
    c("Lemon", "Sweet Potatoes", "Lime", "Steak", "Chicken"),
    times = c(3, 2, 2, 1, 1)
  ),
  productCost = rep(c(0.10, 0.50, 0.15, 8.50, 4.50), times = c(3, 2, 2, 1, 1)),
  productSellPrice = rep(
    c(0.50, 2.00, 0.35, 18.20, 9.97),
    times = c(3, 2, 2, 1, 1)
  ),
  CategoryID = rep(c(1, 2, 1, 4, 3), times = c(3, 2, 2, 1, 1))
)
print(grocerylist_denormalized)
##   productID    productName productCost productSellPrice CategoryID
## 1         1          Lemon        0.10             0.50          1
## 2         1          Lemon        0.10             0.50          1
## 3         1          Lemon        0.10             0.50          1
## 4         2 Sweet Potatoes        0.50             2.00          2
## 5         2 Sweet Potatoes        0.50             2.00          2
## 6         3           Lime        0.15             0.35          1
## 7         3           Lime        0.15             0.35          1
## 8         4          Steak        8.50            18.20          4
## 9         5        Chicken        4.50             9.97          3
# Category lookup table: each CategoryID appears exactly once with its name,
# so category names are stored in one place instead of on every product row.
groceryCatgories_normalized <- data.frame(
  CategoryID = as.numeric(1:4),
  CategoryName = c(
    "Fruit",
    "Vegetable",
    "Poultry",
    "Red Meat"
  )
)
print(groceryCatgories_normalized)
##   CategoryID CategoryName
## 1          1        Fruit
## 2          2    Vegetable
## 3          3      Poultry
## 4          4     Red Meat
# Product lookup table: one row per distinct (productID, productName) pair.
# The pair is de-duplicated as a unit. Calling unique() on each column
# separately only lines up when every column happens to have the same number
# of distinct values in the same order; two products sharing a name would
# make data.frame() error out or pair IDs with the wrong names.
Product <- unique(grocerylist_denormalized[c("productID", "productName")])
names(Product) <- c("ProductID", "ProductName")
rownames(Product) <- NULL # renumber rows 1..n after dropping duplicates
print(Product)
##   ProductID    ProductName
## 1         1          Lemon
## 2         2 Sweet Potatoes
## 3         3           Lime
## 4         4          Steak
## 5         5        Chicken
# Product details table: one row per distinct product with its sell price.
# Whole rows are de-duplicated so ID, name and price always stay together.
# Per-column unique() breaks as soon as two different products share a sell
# price (or a name), because the de-duplicated columns then differ in length
# or order.
ProductInfo <- unique(
  grocerylist_denormalized[c("productID", "productName", "productSellPrice")]
)
names(ProductInfo) <- c("ProductID", "ProductName", "ProductPrice")
rownames(ProductInfo) <- NULL # renumber rows 1..n after dropping duplicates
print(ProductInfo)
##   ProductID    ProductName ProductPrice
## 1         1          Lemon         0.50
## 2         2 Sweet Potatoes         2.00
## 3         3           Lime         0.35
## 4         4          Steak        18.20
## 5         5        Chicken         9.97

#Part 2 Character Manipulation: using original data file from github

# Read the college-majors list directly from the raw GitHub URL.
majors_list <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/refs/heads/master/college-majors/majors-list.csv")
# View() opens a GUI data viewer, so it is only called in an interactive
# session; this keeps Rscript / knitr runs from failing on a missing display.
if (interactive()) {
  View(majors_list)
}

# Identify the majors whose name contains "DATA" or "STATISTICS".

datastats_majors <- majors_list$Major[str_detect(majors_list$Major, "DATA|STATISTICS")]

# Wrap the matching names in a one-column data frame for display.
datastats_majors <- data.frame(datastats_majors)
if (interactive()) {
  View(datastats_majors)
}

#Part 3 Describe, in words, what these expressions will match:

(.)\1\1 : This expression will match the same character appearing three times in a row. For example: “aaa” or “rrr”

# Pattern: capture one character, then require two more copies of it.
expression <- "(.)\\1\\1"
str_view(string = expression)
## [1] │ (.)\1\1
# Three hand-made strings are appended to `words` as extra test cases.
str_view(string = c(words, "aaa", "rrr", "444"), pattern = expression)
## [981] │ <aaa>
## [982] │ <rrr>
## [983] │ <444>

“(.)(.)\2\1” : This expression will match a pair of characters followed by the same pair of characters reversed. For example: “bccb” or “effe”

# Pattern: two captured characters followed by the same two in reverse order.
expression <- "(.)(.)\\2\\1"
str_view(string = expression)
## [1] │ (.)(.)\2\1
str_view(string = words, pattern = expression)
##  [19] │ after<noon>
##  [43] │ <appa>rent
##  [53] │ <arra>nge
## [107] │ b<otto>m
## [112] │ br<illi>ant
## [174] │ c<ommo>n
## [230] │ d<iffi>cult
## [259] │ <effe>ct
## [329] │ f<ollo>w
## [422] │ in<deed>
## [470] │ l<ette>r
## [521] │ m<illi>on
## [581] │ <oppo>rtunity
## [582] │ <oppo>se
## [877] │ tom<orro>w

(..)\1 : This expression will match any two characters that are immediately repeated. For example: “b2b2” or “c3c3”

# Pattern: a captured pair of characters immediately followed by the same pair.
expression <- "(..)\\1"
str_view(string = expression)
## [1] │ (..)\1
str_view(string = words, pattern = expression)
## [696] │ r<emem>ber

“(.).\1.\1” : This expression will match a character followed by any character, the original character, any other character, and the original character again. For example: “abaca” or “b3b.b”

# Pattern: a captured character that recurs at every second position
# (x ? x ? x), with any single character in between.
expression <- "(.).\\1.\\1"
str_view(string = expression)
## [1] │ (.).\1.\1
str_view(string = words, pattern = expression)
## [265] │ <eleve>n

“(.)(.)(.).*\3\2\1” : This expression will match three characters followed by zero or more characters of any kind, followed by the same three characters in reverse order. For example: “abccba” or “abc1cba”

# Pattern: three captured characters, anything (possibly nothing) in between,
# then the same three characters in reverse order.
expression <- "(.)(.)(.).*\\3\\2\\1"
str_view(string = expression)
## [1] │ (.)(.)(.).*\3\2\1
str_view(string = words, pattern = expression)
## [598] │ <paragrap>h

#Part 4 Construct regular expressions to match words that:

  1. Start and end with the same character.
# Anchored back-reference: the first character is captured and must also be
# the last one. The trailing group is optional so that one-character words
# (e.g. "a"), which trivially start and end with the same character, match
# too; the previous pattern "^(.).*\\1$" required at least two characters.
expression <- "^(.)(.*\\1)?$"
str_view(expression)
## [1] │ ^(.)(.*\1)?$
str_view(words, expression)
##   [1] │ <a>
##  [36] │ <america>
##  [49] │ <area>
## [209] │ <dad>
## [213] │ <dead>
## [223] │ <depend>
## [258] │ <educate>
## [266] │ <else>
## [268] │ <encourage>
## [270] │ <engine>
## [278] │ <europe>
## [283] │ <evidence>
## [285] │ <example>
## [287] │ <excuse>
## [288] │ <exercise>
## [291] │ <expense>
## [292] │ <experience>
## [296] │ <eye>
## [386] │ <health>
## [394] │ <high>
## ... and 17 more
  1. Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)
# Capture two consecutive letters and require the same pair again later in
# the string. [[:alpha:]] replaces `.` so that digits, spaces and punctuation
# are not counted as "letters".
expression <- "([[:alpha:]][[:alpha:]]).*\\1"
str_view(expression)
## [1] │ ([[:alpha:]][[:alpha:]]).*\1
str_view(words, expression)
##  [48] │ ap<propr>iate
## [152] │ <church>
## [181] │ c<ondition>
## [217] │ <decide>
## [275] │ <environmen>t
## [487] │ l<ondon>
## [598] │ pa<ragra>ph
## [603] │ p<articular>
## [617] │ <photograph>
## [638] │ p<repare>
## [641] │ p<ressure>
## [696] │ r<emem>ber
## [698] │ <repre>sent
## [699] │ <require>
## [739] │ <sense>
## [858] │ the<refore>
## [903] │ u<nderstand>
## [946] │ w<hethe>r
  1. Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)
# Capture one letter and require it to occur twice more anywhere later in the
# string. [[:alpha:]] replaces `.` so that only letters (not digits, spaces
# or punctuation) can be the repeated character.
expression <- "([[:alpha:]]).*\\1.*\\1"
str_view(expression)
## [1] │ ([[:alpha:]]).*\1.*\1
str_view(words, expression)
##  [48] │ a<pprop>riate
##  [62] │ <availa>ble
##  [86] │ b<elieve>
##  [90] │ b<etwee>n
## [119] │ bu<siness>
## [221] │ d<egree>
## [229] │ diff<erence>
## [233] │ di<scuss>
## [265] │ <eleve>n
## [275] │ e<nvironmen>t
## [283] │ <evidence>
## [288] │ <exercise>
## [291] │ <expense>
## [292] │ <experience>
## [423] │ <indivi>dual
## [598] │ p<aragra>ph
## [684] │ r<eceive>
## [696] │ r<emembe>r
## [698] │ r<eprese>nt
## [845] │ t<elephone>
## ... and 2 more