# Load libraries
library(dplyr)
library(knitr)
knitr::opts_chunk$set(echo = TRUE)

Introduction

In Week 3, we work with data normalization and character manipulation. In this assignment, Part 1 will be normalizing data and Part 2 will involve working with a College Majors dataset from the fivethirtyeight website.

Part 1: Normalization

Normalization in data science means organizing data in a way that reduces redundancy and improves data integrity. This process involves breaking down larger tables into smaller tables. Here are three dataframes to illustrate the normalization process:

Example 1: Normalizating Employee Data

The original employee dataframe contains information about employees, departments, and salaries.

Create original Employee dataframe

employee_df <- data.frame(
  employee_id = c(1, 2, 3, 4),
  name = c("Gary Larsen", "William Pace", "Jennifer Fisher", "Samuel Morales"),
  department = c("Finance", "IT", "Finance", "HR"),
  salary = c(95000, 85000, 120000, 65000)
)

Display original Employee dataframe

knitr::kable(employee_df)

employee_id	name	department	salary
1	Gary Larsen	Finance	95000
2	William Pace	IT	85000
3	Jennifer Fisher	Finance	120000
4	Samuel Morales	HR	65000

We normalize the original Employee dataframe into separate Employees, Departments, and Salaries tables.

Create normalized Employees table

employees_df <- data.frame(
  employee_id = c(1, 2, 3, 4),
  name = c("Gary Larsen", "William Pace", "Jennifer Fisher", "Samuel Morales")
)

Display normalized Employees table

knitr::kable(employees_df)

employee_id	name
1	Gary Larsen
2	William Pace
3	Jennifer Fisher
4	Samuel Morales

Create normalized Departments table

departments_df <- data.frame(
  department_id = c(1,2,3),
  department = c("Finance", "IT", "HR")
)

Display normalized Departments table

knitr::kable(departments_df)

department_id	department
1	Finance
2	IT
3	HR

Create normalized Salaries table

salaries_df <- data.frame(
  employee_id = c(1, 2, 3, 4),
  department_id = c(1, 2, 1, 3),
  salary = c(95000, 85000, 120000, 65000)
)

Display normalized Salaries table

knitr::kable(salaries_df)

employee_id	department_id	salary
1	1	95000
2	2	85000
3	1	120000
4	3	65000

Example 2: Normalizing Product Inventory Data

The original product inventory dataframe contains information such as the product ID, name, category, quantity, and price.

Create original Product Inventory dataframe

inventory_df <- data.frame(
  product_id = c(1,2,3,4),
  product_name = c("Keyboard", "Dumbbell", "Running Shoes", "Water Bottle"),
  category = c("Computer Accessories", "Fitness Equipment", "Sportswear", "Hydration"),
  quantity = c(50, 30, 20, 100),
  price = c(40, 100, 150, 20)
)

Display original Product Inventory dataframe

knitr::kable(inventory_df)

product_id	product_name	category	quantity	price
1	Keyboard	Computer Accessories	50	40
2	Dumbbell	Fitness Equipment	30	100
3	Running Shoes	Sportswear	20	150
4	Water Bottle	Hydration	100	20

We normalize the original Product Inventory dataframe into separate Products, Categories, and Inventory tables.

Create normalized Products table

products_df <- data.frame(
  product_id = c(1, 2, 3, 4), 
  product_name = c("Keyboard", "Dumbbell", "Running Shoes", "Water Bottle")
)

Display normalized Products table

knitr::kable(products_df)

product_id	product_name
1	Keyboard
2	Dumbbell
3	Running Shoes
4	Water Bottle

Create normalized Categories table

categories_df <- data.frame(
  category_id = c(1, 2, 3, 4),
  category = c("Computer Accessories", "Fitness Equipment", "Sportswear", "Hydration")
)

Display normalized Categories table

knitr::kable(categories_df)

category_id	category
1	Computer Accessories
2	Fitness Equipment
3	Sportswear
4	Hydration

Create normalized Inventory table

inventory_details_df <- data.frame(
  product_id = c(1, 2, 3, 4),
  category_id = c(1, 2, 3, 4),
  quantity = c(50, 30, 20, 100),
  price = c(40, 100, 150, 20)
)

Display normalized Inventory table

knitr::kable(inventory_details_df)

product_id	category_id	quantity	price
1	1	50	40
2	2	30	100
3	3	20	150
4	4	100	20

Example 3: Normalizing Sales Data

The original Sales dataframe contains information such as the Sale ID, Product name, Customer, and Sale amount.

Create original Sales dataframe

sales_df <- data.frame(
  sale_id = c(1, 2, 3, 4),
  product_name = c("Soda", "Bagel", "Coffee", "Cheese"), 
  customer = c("James", "Shirley", "Mike", "Roman"),
  sale_amount = c(2.50, 4.50, 1.50, 6.00)
)

Display original Sales dataframe

knitr::kable(sales_df)

sale_id	product_name	customer	sale_amount
1	Soda	James	2.5
2	Bagel	Shirley	4.5
3	Coffee	Mike	1.5
4	Cheese	Roman	6.0

Create normalized Products table

products_sales_df <- data.frame(
  product_id = c(1, 2, 3, 4), 
  product_name = c("Soda", "Bagel", "Coffee", "Cheese")
)

Display normalized Products table

knitr::kable(products_sales_df)

product_id	product_name
1	Soda
2	Bagel
3	Coffee
4	Cheese

Create normalized Customers table

customers_df <- data.frame(
  customer_id = c(1, 2, 3, 4),
  customer = c("James", "Shirley", "Mike", "Roman")
)

Display normalized Customers table

knitr::kable(customers_df)

Create normalized Sales table

sales_details_df <- data.frame(
  sale_id = c(1, 2, 3, 4),
  product_id = c(1, 2, 3, 4),
  customer_id = c(1, 2, 3, 4),
  sale_amount = c(2.50, 4.50, 1.50, 6.00)
)

Display normalized Sales table

knitr::kable(sales_details_df)

sale_id	product_id	customer_id	sale_amount
1	1	1	2.5
2	2	2	4.5
3	3	3	1.5
4	4	4	6.0

Part 2: Character Manipulation

Source: [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/]

Using the 173 majors listed in fivethirtyeight College Majors dataset, provide code that identifies the majors that contain either “DATA” or “STATISTICS”

Load the College Majors dataset

majors_df <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/refs/heads/master/college-majors/all-ages.csv")

Find majors containing “DATA” or “STATISTICS”

selected_majors <- majors_df[grep("DATA|STATITICS", majors_df$Major, ignore.case = TRUE), ]

Display selected majors

knitr::kable(selected_majors)

	Major_code	Major	Major_category	Total	Employed	Employed_full_time_year_round	Unemployed	Unemployment_rate	Median	P25th	P75th
20	2101	COMPUTER PROGRAMMING AND DATA PROCESSING	Computers & Mathematics	29317	22828	18747	2265	0.0902642	60000	40000	85000

Clean selected majors for better readability

cleaned_majors <- selected_majors %>%
  select(Major, Employed, Unemployment_rate, Median) %>%
  rename(
    Major_Name = Major,
    Employed_Graduates = Employed,
    Unemployment_Rate = Unemployment_rate,
    Median_Salary = Median
  )

knitr::kable(cleaned_majors)

	Major_Name	Employed_Graduates	Unemployment_Rate	Median_Salary
20	COMPUTER PROGRAMMING AND DATA PROCESSING	22828	0.0902642	60000

Part 2: Regular Expressions

Describe, in words, what these expressions will match:

(.)\1\1

Description: Looks for the same character repeated 3x in a row

Example: “aaa”, “111”, “!!!”

"(.)(.)\\2\\1"

Description: Looks for two different characters next to each other, followed by those same two characters but mirrored in reverse (like a symmetrical reflection).

Example: “abba”, “7887”

(..)\1

Description: Looks for two characters repeated one after the other.

Example: “abab”, “7878”

"(.).\\1.\\1"

Description: Matches single character, any character, first character, any character, first character.

Example: “abaca”, “12131”

"(.)(.)(.).*\\3\\2\\1"

Description: Looks for 3 characters, followed by any sequence of characters, an then those 3 characters in reverse.

Example: “abc…cba”, “123…321”

Construct regular expressions to match words that:

Question 1: Start and end with the same character.

Regex Answer: ^(.).*\1$

Question 2: Contain a repeated pair of letters (e.g. “church contains”ch” repeated twice)

Regex Answer: (..).*\1

Question 3: Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s)

Regex Answer: (.).?\1.?\1

week3

Emin Allen

2024-09-21

Introduction

Part 1: Normalization

Example 1: Normalizating Employee Data

Create original Employee dataframe

Display original Employee dataframe

Create normalized Employees table

Display normalized Employees table

Create normalized Departments table

Display normalized Departments table

Create normalized Salaries table

Display normalized Salaries table

Example 2: Normalizing Product Inventory Data

Create original Product Inventory dataframe

Display original Product Inventory dataframe

Create normalized Products table

Display normalized Products table

Create normalized Categories table

Display normalized Categories table

Create normalized Inventory table

Display normalized Inventory table

Example 3: Normalizing Sales Data

Create original Sales dataframe

Display original Sales dataframe

Create normalized Products table

Display normalized Products table

Create normalized Customers table

Display normalized Customers table

Create normalized Sales table

Display normalized Sales table

Part 2: Character Manipulation

Load the College Majors dataset

Find majors containing “DATA” or “STATISTICS”

Display selected majors

Clean selected majors for better readability

Part 2: Regular Expressions

Construct regular expressions to match words that: