Load necessary library

# Non-interactive setup: pick the first CRAN mirror so install.packages()
# never opens a mirror-selection prompt while the document is knitting.
chooseCRANmirror(graphics = FALSE, ind = 1)  # Selects the first mirror

# Install dplyr only when it is missing, so re-running the document does not
# download and reinstall the package every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Unnormalized data

This dataframe contains redundant data since customer and product details are repeated for each order.

# Denormalized order log: one row per order, with the customer's name and the
# product's name/price copied onto every row that mentions them.
orders <- local({
  order_ids <- c(1, 2, 3, 4, 5)

  # Customer details, repeated once per order
  buyer_ids <- c(101, 102, 101, 103, 103)
  buyer_names <- c("Jessica", "Regina", "Jessica", "Natasha", "Natasha")

  # Product details, repeated once per order
  item_ids <- c(201, 202, 204, 203, 202)
  item_names <- c("Laptop", "Phone", "Headphones", "Tablet", "Phone")
  item_prices <- c(1000, 500, 300, 700, 500)

  data.frame(
    OrderID = order_ids,
    CustomerID = buyer_ids,
    CustomerName = buyer_names,
    ProductID = item_ids,
    ProductName = item_names,
    Price = item_prices
  )
})

print(orders)
##   OrderID CustomerID CustomerName ProductID ProductName Price
## 1       1        101      Jessica       201      Laptop  1000
## 2       2        102       Regina       202       Phone   500
## 3       3        101      Jessica       204  Headphones   300
## 4       4        103      Natasha       203      Tablet   700
## 5       5        103      Natasha       202       Phone   500

Normalized Data

I split the original dataframe into three separate tables:

Customers Table: contains unique customer data. Products Table: contains unique product data. Orders Normalized Table: contains one row per order, referencing the customer and product only by CustomerID and ProductID.

Although CustomerIDs 101 and 103 are repeated in the last table, this table is still normalized because each row represents a unique order/purchase, not just a customer. The IDs for Jessica (101) and Natasha (103) appear multiple times in the table because they each placed multiple orders.

# Customers table: exactly one row per customer, keyed by CustomerID.
customers <- local({
  ids <- c(101, 102, 103)
  first_names <- c("Jessica", "Regina", "Natasha")
  emails <- c(
    "jessica_mp@gmail.com",
    "regina_ca@gmail.com",
    "natasha_mh@gmail.com"
  )
  home_cities <- c("Stockholm", "Philadelphia", "Kingston")

  # Parse the ISO-formatted strings up front so the column is a Date vector
  signup_dates <- as.Date(c("2024-03-10", "2024-06-21", "2024-09-05"))

  data.frame(
    CustomerID = ids,
    CustomerName = first_names,
    Email = emails,
    City = home_cities,
    RegistrationDate = signup_dates
  )
})

print(customers)
##   CustomerID CustomerName                Email         City RegistrationDate
## 1        101      Jessica jessica_mp@gmail.com    Stockholm       2024-03-10
## 2        102       Regina  regina_ca@gmail.com Philadelphia       2024-06-21
## 3        103      Natasha natasha_mh@gmail.com     Kingston       2024-09-05
# Products table: exactly one row per product, keyed by ProductID.
products <- local({
  ids <- c(201, 202, 203, 204)
  labels <- c("Laptop", "Phone", "Tablet", "Headphones")
  unit_prices <- c(1000, 500, 700, 300)
  categories <- c("Electronics", "Electronics", "Electronics", "Accessories")

  data.frame(
    ProductID = ids,
    ProductName = labels,
    Price = unit_prices,
    Category = categories
  )
})

print(products)
##   ProductID ProductName Price    Category
## 1       201      Laptop  1000 Electronics
## 2       202       Phone   500 Electronics
## 3       203      Tablet   700 Electronics
## 4       204  Headphones   300 Accessories
# Normalized orders table: one row per order, holding only the keys that
# point into the customers and products tables.
order_keys <- list(
  OrderID = as.numeric(seq_len(5)),
  CustomerID = c(101, 102, 101, 103, 103),
  ProductID = c(201, 202, 204, 203, 202)
)
orders_normalized <- data.frame(
  OrderID = order_keys$OrderID,
  CustomerID = order_keys$CustomerID,
  ProductID = order_keys$ProductID
)
rm(order_keys)  # scratch list only; keep the workspace as it was

print(orders_normalized)
##   OrderID CustomerID ProductID
## 1       1        101       201
## 2       2        102       202
## 3       3        101       204
## 4       4        103       203
## 5       5        103       202

Summary of all orders

# Per-customer summary: re-attach customer and product details to each order
# through the ID keys, then collapse to one row per customer listing the
# distinct products bought and the total amount spent.
summary_orders <- orders_normalized %>%
  left_join(customers, by = "CustomerID") %>%
  left_join(products, by = "ProductID") %>%
  group_by(CustomerID, CustomerName) %>%
  summarise(
    ProductsPurchased = paste(unique(ProductName), collapse = ", "),
    TotalSpent = sum(Price),
    # Drop all grouping in the same step: this silences the "grouped output"
    # message and makes a trailing ungroup() unnecessary.
    .groups = "drop"
  )

print(summary_orders)
## # A tibble: 3 × 4
##   CustomerID CustomerName ProductsPurchased  TotalSpent
##        <dbl> <chr>        <chr>                   <dbl>
## 1        101 Jessica      Laptop, Headphones       1300
## 2        102 Regina       Phone                     500
## 3        103 Natasha      Tablet, Phone            1200
library(stringr)
library(readr)

# Load the majors dataset from GitHub (fivethirtyeight/data, college-majors)
url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
majors_table <- read_csv(url, show_col_types = FALSE)

# Extracting the Major column
majors <- majors_table$Major

# Finding majors with "DATA" or "STATISTICS" (case-insensitive).
# str_subset() keeps only the matching strings and drops NA entries, whereas
# majors[str_detect(...)] would carry any NA major through as an NA result.
data_stats_majors <- str_subset(
  majors,
  regex("DATA|STATISTICS", ignore_case = TRUE)
)

print(data_stats_majors)
## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "COMPUTER PROGRAMMING AND DATA PROCESSING"     
## [3] "STATISTICS AND DECISION SCIENCE"
# Back-reference patterns (list names) paired with a plain-English description
# of what each one matches (list values). Angle brackets in the examples mark
# the part of the word that the pattern matches.
expressions <- list(
  "(.)\\1\\1" = "Matches a string with any character that appears three times in a row (e.g <aaa>, <bbb>)",
  "(.)(.)\\2\\1" = "Matches a string where the first character is followed by the second character, then the second character repeats, and the first character repeats at the end (e.g <abba>, <deed>).",
  "(..)\\1" = "Match a string with a repeated pair of letters (e.g <coco>nut, b<anan>a)",
  # The pattern spans five characters (x ? x ? x), so three-letter words such
  # as "dad" or "mom" do not match it.
  "(.).\\1.\\1" = "Matches a string where the same character appears three times with exactly one character between each occurrence (e.g <abaca>, b<anana>)",
  "(.)(.)(.).*\\3\\2\\1" = "Matches a string where three characters are captured, followed by any characters, and then the same first three characters appear in reverse order. (e.g <abcxyzcba>, <reviver>)"
)

print(expressions)
## $`(.)\\1\\1`
## [1] "Matches a string with any character that appears three times in a row (e.g <aaa>, <bbb>)"
## 
## $`(.)(.)\\2\\1`
## [1] "Matches a string where the first character is followed by the second character, then the second character repeats, and the first character repeats at the end (e.g <abba>, <deed>)."
## 
## $`(..)\\1`
## [1] "Match a string with a repeated pair of letters (e.g <coco>nut, b<anan>a)"
## 
## $`(.).\\1.\\1`
## [1] "Matches a string where the same character appears three times with exactly one character between each occurrence (e.g <abaca>, b<anana>)"
## 
## $`(.)(.)(.).*\\3\\2\\1`
## [1] "Matches a string where three characters are captured, followed by any characters, and then the same first three characters appear in reverse order. (e.g <abcxyzcba>, <reviver>)"
# Patterns (list names) built to satisfy each plain-English requirement
# (list values).
regex_examples <- list(
  # NOTE(review): this needs at least two characters, so a one-letter word
  # such as "a" does not match.
  "^(.).*\\1$" = "Start and end with the same character. (e.g <area>, <river>)",
  # The ".*" lets the two occurrences of the pair be separated: "church" has
  # "ch" at both ends, which the adjacent-only form "(..)\\1" cannot match.
  "(..).*\\1" = "Contain a repeated pair of letters (e.g. church contains ch repeated twice.)",
  "(.).*\\1.*\\1" = "Contain one letter repeated in at least three places (e.g. eleven contains three Es.)"
)

print(regex_examples)
## $`^(.).*\\1$`
## [1] "Start and end with the same character. (e.g <area>, <river>)"
## 
## $`(..).*\\1`
## [1] "Contain a repeated pair of letters (e.g. church contains ch repeated twice.)"
## 
## $`(.).*\\1.*\\1`
## [1] "Contain one letter repeated in at least three places (e.g. eleven contains three Es.)"