# Load libraries
library(dplyr)
library(knitr)
knitr::opts_chunk$set(echo = TRUE)

Introduction

In Week 3, we work with data normalization and character manipulation. In this assignment, Part 1 will be normalizing data and Part 2 will involve working with a College Majors dataset from the fivethirtyeight website.

Part 1: Normalization

Normalization in data science means organizing data in a way that reduces redundancy and improves data integrity. This process involves breaking down larger tables into smaller tables. Here are three dataframes to illustrate the normalization process:

Example 1: Normalizating Employee Data

The original employee dataframe contains information about employees, departments, and salaries.

Create original Employee dataframe

employee_df <- data.frame(
  employee_id = c(1, 2, 3, 4),
  name = c("Gary Larsen", "William Pace", "Jennifer Fisher", "Samuel Morales"),
  department = c("Finance", "IT", "Finance", "HR"),
  salary = c(95000, 85000, 120000, 65000)
)

Display original Employee dataframe

knitr::kable(employee_df)
employee_id name department salary
1 Gary Larsen Finance 95000
2 William Pace IT 85000
3 Jennifer Fisher Finance 120000
4 Samuel Morales HR 65000

We normalize the original Employee dataframe into separate Employees, Departments, and Salaries tables.

Create normalized Employees table

employees_df <- data.frame(
  employee_id = c(1, 2, 3, 4),
  name = c("Gary Larsen", "William Pace", "Jennifer Fisher", "Samuel Morales")
)

Display normalized Employees table

knitr::kable(employees_df)
employee_id name
1 Gary Larsen
2 William Pace
3 Jennifer Fisher
4 Samuel Morales

Create normalized Departments table

departments_df <- data.frame(
  department_id = c(1,2,3),
  department = c("Finance", "IT", "HR")
)

Display normalized Departments table

knitr::kable(departments_df)
department_id department
1 Finance
2 IT
3 HR

Create normalized Salaries table

salaries_df <- data.frame(
  employee_id = c(1, 2, 3, 4),
  department_id = c(1, 2, 1, 3),
  salary = c(95000, 85000, 120000, 65000)
)

Display normalized Salaries table

knitr::kable(salaries_df)
employee_id department_id salary
1 1 95000
2 2 85000
3 1 120000
4 3 65000

Example 2: Normalizing Product Inventory Data

The original product inventory dataframe contains information such as the product ID, name, category, quantity, and price.

Create original Product Inventory dataframe

inventory_df <- data.frame(
  product_id = c(1,2,3,4),
  product_name = c("Keyboard", "Dumbbell", "Running Shoes", "Water Bottle"),
  category = c("Computer Accessories", "Fitness Equipment", "Sportswear", "Hydration"),
  quantity = c(50, 30, 20, 100),
  price = c(40, 100, 150, 20)
)

Display original Product Inventory dataframe

knitr::kable(inventory_df)
product_id product_name category quantity price
1 Keyboard Computer Accessories 50 40
2 Dumbbell Fitness Equipment 30 100
3 Running Shoes Sportswear 20 150
4 Water Bottle Hydration 100 20

We normalize the original Product Inventory dataframe into separate Products, Categories, and Inventory tables.

Create normalized Products table

products_df <- data.frame(
  product_id = c(1, 2, 3, 4), 
  product_name = c("Keyboard", "Dumbbell", "Running Shoes", "Water Bottle")
)

Display normalized Products table

knitr::kable(products_df)
product_id product_name
1 Keyboard
2 Dumbbell
3 Running Shoes
4 Water Bottle

Create normalized Categories table

categories_df <- data.frame(
  category_id = c(1, 2, 3, 4),
  category = c("Computer Accessories", "Fitness Equipment", "Sportswear", "Hydration")
)

Display normalized Categories table

knitr::kable(categories_df)
category_id category
1 Computer Accessories
2 Fitness Equipment
3 Sportswear
4 Hydration

Create normalized Inventory table

inventory_details_df <- data.frame(
  product_id = c(1, 2, 3, 4),
  category_id = c(1, 2, 3, 4),
  quantity = c(50, 30, 20, 100),
  price = c(40, 100, 150, 20)
)

Display normalized Inventory table

knitr::kable(inventory_details_df)
product_id category_id quantity price
1 1 50 40
2 2 30 100
3 3 20 150
4 4 100 20

Example 3: Normalizing Sales Data

The original Sales dataframe contains information such as the Sale ID, Product name, Customer, and Sale amount.

Create original Sales dataframe

sales_df <- data.frame(
  sale_id = c(1, 2, 3, 4),
  product_name = c("Soda", "Bagel", "Coffee", "Cheese"), 
  customer = c("James", "Shirley", "Mike", "Roman"),
  sale_amount = c(2.50, 4.50, 1.50, 6.00)
)

Display original Sales dataframe

knitr::kable(sales_df)
sale_id product_name customer sale_amount
1 Soda James 2.5
2 Bagel Shirley 4.5
3 Coffee Mike 1.5
4 Cheese Roman 6.0

Create normalized Products table

products_sales_df <- data.frame(
  product_id = c(1, 2, 3, 4), 
  product_name = c("Soda", "Bagel", "Coffee", "Cheese")
)

Display normalized Products table

knitr::kable(products_sales_df)
product_id product_name
1 Soda
2 Bagel
3 Coffee
4 Cheese

Create normalized Customers table

customers_df <- data.frame(
  customer_id = c(1, 2, 3, 4),
  customer = c("James", "Shirley", "Mike", "Roman")
)

Display normalized Customers table

knitr::kable(customers_df)

Create normalized Sales table

sales_details_df <- data.frame(
  sale_id = c(1, 2, 3, 4),
  product_id = c(1, 2, 3, 4),
  customer_id = c(1, 2, 3, 4),
  sale_amount = c(2.50, 4.50, 1.50, 6.00)
)

Display normalized Sales table

knitr::kable(sales_details_df)
sale_id product_id customer_id sale_amount
1 1 1 2.5
2 2 2 4.5
3 3 3 1.5
4 4 4 6.0

Part 2: Character Manipulation

Source: [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/]

Using the 173 majors listed in fivethirtyeight College Majors dataset, provide code that identifies the majors that contain either “DATA” or “STATISTICS”

Load the College Majors dataset

majors_df <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/refs/heads/master/college-majors/all-ages.csv")

Find majors containing “DATA” or “STATISTICS”

selected_majors <- majors_df[grep("DATA|STATITICS", majors_df$Major, ignore.case = TRUE), ]

Display selected majors

knitr::kable(selected_majors)
Major_code Major Major_category Total Employed Employed_full_time_year_round Unemployed Unemployment_rate Median P25th P75th
20 2101 COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics 29317 22828 18747 2265 0.0902642 60000 40000 85000

Clean selected majors for better readability

cleaned_majors <- selected_majors %>%
  select(Major, Employed, Unemployment_rate, Median) %>%
  rename(
    Major_Name = Major,
    Employed_Graduates = Employed,
    Unemployment_Rate = Unemployment_rate,
    Median_Salary = Median
  )
knitr::kable(cleaned_majors)
Major_Name Employed_Graduates Unemployment_Rate Median_Salary
20 COMPUTER PROGRAMMING AND DATA PROCESSING 22828 0.0902642 60000

Part 2: Regular Expressions

Describe, in words, what these expressions will match:

(.)\1\1

Description: Looks for the same character repeated 3x in a row

Example: “aaa”, “111”, “!!!”

"(.)(.)\\2\\1"

Description: Looks for two different characters next to each other, followed by those same two characters but mirrored in reverse (like a symmetrical reflection).

Example: “abba”, “7887”

(..)\1

Description: Looks for two characters repeated one after the other.

Example: “abab”, “7878”

"(.).\\1.\\1"

Description: Matches single character, any character, first character, any character, first character.

Example: “abaca”, “12131”

"(.)(.)(.).*\\3\\2\\1"

Description: Looks for 3 characters, followed by any sequence of characters, an then those 3 characters in reverse.

Example: “abc…cba”, “123…321”

Construct regular expressions to match words that:

Question 1: Start and end with the same character.

Regex Answer: ^(.).*\1$

Question 2: Contain a repeated pair of letters (e.g. “church contains”ch” repeated twice)

Regex Answer: (..).*\1

Question 3: Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s)

Regex Answer: (.).?\1.?\1