# Select the first CRAN mirror non-interactively so that installs never prompt.
chooseCRANmirror(graphics = FALSE, ind = 1)
# Install dplyr only when it is missing: an unconditional install.packages()
# re-downloads the package on every run and fails when offline.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
##
## The downloaded binary packages are in
## /var/folders/nz/h7z329n55nxfs2dv7hmbhc400000gn/T//RtmpipV24q/downloaded_packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
This dataframe contains redundant data since customer and product details are repeated for each order.
# Denormalized order log: one row per order, with the customer and product
# attributes repeated on every row that references them.
orders <- local({
  order_ids <- c(1, 2, 3, 4, 5)
  customer_ids <- c(101, 102, 101, 103, 103)
  customer_names <- c("Jessica", "Regina", "Jessica", "Natasha", "Natasha")
  product_ids <- c(201, 202, 204, 203, 202)
  product_names <- c("Laptop", "Phone", "Headphones", "Tablet", "Phone")
  prices <- c(1000, 500, 300, 700, 500)

  data.frame(
    OrderID = order_ids,
    CustomerID = customer_ids,
    CustomerName = customer_names,
    ProductID = product_ids,
    ProductName = product_names,
    Price = prices
  )
})
print(orders)
## OrderID CustomerID CustomerName ProductID ProductName Price
## 1 1 101 Jessica 201 Laptop 1000
## 2 2 102 Regina 202 Phone 500
## 3 3 101 Jessica 204 Headphones 300
## 4 4 103 Natasha 203 Tablet 700
## 5 5 103 Natasha 202 Phone 500
I split the original dataframe into three separate tables:
Customers table: contains the unique customer data. Products table: contains the unique product data. Normalized orders table: contains one row per order, referencing the customer and the product by CustomerID and ProductID.
Although CustomerIDs 101 and 103 are repeated in the last table, this table is still normalized because each row represents a unique order/purchase, not just a customer. Jessica and Natasha appear multiple times in the table because they each placed multiple orders.
# Customer dimension table: one row per CustomerID.
customers <- data.frame(
  CustomerID = c(101, 102, 103),
  CustomerName = c("Jessica", "Regina", "Natasha"),
  Email = c("jessica_mp@gmail.com", "regina_ca@gmail.com", "natasha_mh@gmail.com"),
  City = c("Stockholm", "Philadelphia", "Kingston")
)
# Registration dates are parsed from ISO strings and appended as the last
# column, stored as Date rather than character.
customers$RegistrationDate <- as.Date(
  c("2024-03-10", "2024-06-21", "2024-09-05")
)
print(customers)
## CustomerID CustomerName Email City RegistrationDate
## 1 101 Jessica jessica_mp@gmail.com Stockholm 2024-03-10
## 2 102 Regina regina_ca@gmail.com Philadelphia 2024-06-21
## 3 103 Natasha natasha_mh@gmail.com Kingston 2024-09-05
# Product dimension table: one row per ProductID.
products <- local({
  # Prices keyed by product name, so each name/price pair is read together.
  price_by_product <- c(Laptop = 1000, Phone = 500, Tablet = 700, Headphones = 300)

  data.frame(
    ProductID = c(201, 202, 203, 204),
    ProductName = names(price_by_product),
    Price = unname(price_by_product),
    Category = c("Electronics", "Electronics", "Electronics", "Accessories")
  )
})
print(products)
## ProductID ProductName Price Category
## 1 201 Laptop 1000 Electronics
## 2 202 Phone 500 Electronics
## 3 203 Tablet 700 Electronics
## 4 204 Headphones 300 Accessories
# Normalized order table: one row per order, holding only the keys that link
# to the customers and products tables.
orders_normalized <- data.frame(
  OrderID = as.numeric(seq_len(5)),
  CustomerID = c(101, 102, 101, 103, 103),
  ProductID = c(201, 202, 204, 203, 202)
)
print(orders_normalized)
## OrderID CustomerID ProductID
## 1 1 101 201
## 2 2 102 202
## 3 3 101 204
## 4 4 103 203
## 5 5 103 202
# Per-customer summary rebuilt from the normalized tables: join each order to
# its customer and product rows, then collapse to one row per customer.
summary_orders <- orders_normalized %>%
  left_join(customers, by = "CustomerID") %>%
  left_join(products, by = "ProductID") %>%
  group_by(CustomerID, CustomerName) %>%
  summarise(
    # Distinct product names bought by the customer, in order of appearance.
    ProductsPurchased = paste(unique(ProductName), collapse = ", "),
    # Total of the list prices over all of the customer's orders.
    TotalSpent = sum(Price),
    # Drop the grouping explicitly: this returns an ungrouped tibble without a
    # trailing ungroup() and without the "grouped output by" message.
    .groups = "drop"
  )
print(summary_orders)
## # A tibble: 3 × 4
## CustomerID CustomerName ProductsPurchased TotalSpent
## <dbl> <chr> <chr> <dbl>
## 1 101 Jessica Laptop, Headphones 1300
## 2 102 Regina Phone 500
## 3 103 Natasha Tablet, Phone 1200
library(stringr)
library(readr)
# Load the majors dataset from GitHub
url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
majors_table <- read_csv(url, show_col_types = FALSE)
# Extract the Major column as a vector
majors <- majors_table$Major
# Keep the majors whose name contains "DATA" or "STATISTICS" (case-insensitive).
# str_subset() omits missing values, whereas majors[str_detect(...)] would
# return an NA element for every NA entry in the column.
data_stats_majors <- str_subset(
  majors,
  regex("DATA|STATISTICS", ignore_case = TRUE)
)
print(data_stats_majors)
## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "COMPUTER PROGRAMMING AND DATA PROCESSING"
## [3] "STATISTICS AND DECISION SCIENCE"
# Backreference patterns (list names) paired with a plain-English description
# of what each one matches (list values). Angle brackets in the examples mark
# the part of the word that the pattern matches.
expressions <- list(
  "(.)\\1\\1" = "Matches a string with any character that appears three times in a row (e.g <aaa>, <bbb>)",
  "(.)(.)\\2\\1" = "Matches a string where the first character is followed by the second character, then the second character repeats, and the first character repeats at the end (e.g <abba>, <deed>).",
  "(..)\\1" = "Match a string with a repeated pair of letters (e.g <coco>nut, b<anan>a)",
  # This pattern spans five characters (x ? x ? x), so three-letter words such
  # as "dad" or "mom" do not match it.
  "(.).\\1.\\1" = "Matches a string where the same character appears three times, each separated by exactly one other character (e.g <abaca>, b<anana>)",
  "(.)(.)(.).*\\3\\2\\1" = "Matches a string where three characters are captured, followed by any characters, and then the same first three characters appear in reverse order. (e.g <abcxyzcba>, <reviver>)"
)
print(expressions)
## $`(.)\\1\\1`
## [1] "Matches a string with any character that appears three times in a row (e.g <aaa>, <bbb>)"
##
## $`(.)(.)\\2\\1`
## [1] "Matches a string where the first character is followed by the second character, then the second character repeats, and the first character repeats at the end (e.g <abba>, <deed>)."
##
## $`(..)\\1`
## [1] "Match a string with a repeated pair of letters (e.g <coco>nut, b<anan>a)"
##
## $`(.).\\1.\\1`
## [1] "Matches a string where the same character appears three times, each separated by exactly one other character (e.g <abaca>, b<anana>)"
##
## $`(.)(.)(.).*\\3\\2\\1`
## [1] "Matches a string where three characters are captured, followed by any characters, and then the same first three characters appear in reverse order. (e.g <abcxyzcba>, <reviver>)"
# Regular expressions (list names) that answer each word-matching task
# (list values).
regex_examples <- list(
  # The part after the first character is optional, so a one-character string
  # such as "a" also counts as starting and ending with the same character.
  "^(.)(.*\\1)?$" = "Start and end with the same character. (e.g <area>, <river>)",
  # ".*" lets the repeated pair occur anywhere later in the string; "(..)\\1"
  # only matches adjacent pairs and therefore misses "church".
  "(..).*\\1" = "Contain a repeated pair of letters (e.g. church contains ch repeated twice.)",
  "(.).*\\1.*\\1" = "Contain one letter repeated in at least three places (e.g. eleven contains three Es.)"
)
print(regex_examples)
## $`^(.)(.*\\1)?$`
## [1] "Start and end with the same character. (e.g <area>, <river>)"
##
## $`(..).*\\1`
## [1] "Contain a repeated pair of letters (e.g. church contains ch repeated twice.)"
##
## $`(.).*\\1.*\\1`
## [1] "Contain one letter repeated in at least three places (e.g. eleven contains three Es.)"