# Load libraries
library(dplyr)
library(knitr)
knitr::opts_chunk$set(echo = TRUE)
In Week 3, we work with data normalization and character manipulation. In this assignment, Part 1 will be normalizing data and Part 2 will involve working with a College Majors dataset from the fivethirtyeight website.
Normalization in data science means organizing data in a way that reduces redundancy and improves data integrity. This process involves breaking down larger tables into smaller tables. Here are three dataframes to illustrate the normalization process:
The original employee dataframe contains information about employees, departments, and salaries.
employee_df <- data.frame(
employee_id = c(1, 2, 3, 4),
name = c("Gary Larsen", "William Pace", "Jennifer Fisher", "Samuel Morales"),
department = c("Finance", "IT", "Finance", "HR"),
salary = c(95000, 85000, 120000, 65000)
)
knitr::kable(employee_df)
| employee_id | name | department | salary |
|---|---|---|---|
| 1 | Gary Larsen | Finance | 95000 |
| 2 | William Pace | IT | 85000 |
| 3 | Jennifer Fisher | Finance | 120000 |
| 4 | Samuel Morales | HR | 65000 |
We normalize the original Employee dataframe into separate Employees, Departments, and Salaries tables.
employees_df <- data.frame(
employee_id = c(1, 2, 3, 4),
name = c("Gary Larsen", "William Pace", "Jennifer Fisher", "Samuel Morales")
)
knitr::kable(employees_df)
| employee_id | name |
|---|---|
| 1 | Gary Larsen |
| 2 | William Pace |
| 3 | Jennifer Fisher |
| 4 | Samuel Morales |
departments_df <- data.frame(
department_id = c(1,2,3),
department = c("Finance", "IT", "HR")
)
knitr::kable(departments_df)
| department_id | department |
|---|---|
| 1 | Finance |
| 2 | IT |
| 3 | HR |
salaries_df <- data.frame(
employee_id = c(1, 2, 3, 4),
department_id = c(1, 2, 1, 3),
salary = c(95000, 85000, 120000, 65000)
)
knitr::kable(salaries_df)
| employee_id | department_id | salary |
|---|---|---|
| 1 | 1 | 95000 |
| 2 | 2 | 85000 |
| 3 | 1 | 120000 |
| 4 | 3 | 65000 |
The original product inventory dataframe contains information such as the product ID, name, category, quantity, and price.
inventory_df <- data.frame(
product_id = c(1,2,3,4),
product_name = c("Keyboard", "Dumbbell", "Running Shoes", "Water Bottle"),
category = c("Computer Accessories", "Fitness Equipment", "Sportswear", "Hydration"),
quantity = c(50, 30, 20, 100),
price = c(40, 100, 150, 20)
)
knitr::kable(inventory_df)
| product_id | product_name | category | quantity | price |
|---|---|---|---|---|
| 1 | Keyboard | Computer Accessories | 50 | 40 |
| 2 | Dumbbell | Fitness Equipment | 30 | 100 |
| 3 | Running Shoes | Sportswear | 20 | 150 |
| 4 | Water Bottle | Hydration | 100 | 20 |
We normalize the original Product Inventory dataframe into separate Products, Categories, and Inventory tables.
products_df <- data.frame(
product_id = c(1, 2, 3, 4),
product_name = c("Keyboard", "Dumbbell", "Running Shoes", "Water Bottle")
)
knitr::kable(products_df)
| product_id | product_name |
|---|---|
| 1 | Keyboard |
| 2 | Dumbbell |
| 3 | Running Shoes |
| 4 | Water Bottle |
categories_df <- data.frame(
category_id = c(1, 2, 3, 4),
category = c("Computer Accessories", "Fitness Equipment", "Sportswear", "Hydration")
)
knitr::kable(categories_df)
| category_id | category |
|---|---|
| 1 | Computer Accessories |
| 2 | Fitness Equipment |
| 3 | Sportswear |
| 4 | Hydration |
inventory_details_df <- data.frame(
product_id = c(1, 2, 3, 4),
category_id = c(1, 2, 3, 4),
quantity = c(50, 30, 20, 100),
price = c(40, 100, 150, 20)
)
knitr::kable(inventory_details_df)
| product_id | category_id | quantity | price |
|---|---|---|---|
| 1 | 1 | 50 | 40 |
| 2 | 2 | 30 | 100 |
| 3 | 3 | 20 | 150 |
| 4 | 4 | 100 | 20 |
The original Sales dataframe contains information such as the Sale ID, Product name, Customer, and Sale amount.
sales_df <- data.frame(
sale_id = c(1, 2, 3, 4),
product_name = c("Soda", "Bagel", "Coffee", "Cheese"),
customer = c("James", "Shirley", "Mike", "Roman"),
sale_amount = c(2.50, 4.50, 1.50, 6.00)
)
knitr::kable(sales_df)
| sale_id | product_name | customer | sale_amount |
|---|---|---|---|
| 1 | Soda | James | 2.5 |
| 2 | Bagel | Shirley | 4.5 |
| 3 | Coffee | Mike | 1.5 |
| 4 | Cheese | Roman | 6.0 |
products_sales_df <- data.frame(
product_id = c(1, 2, 3, 4),
product_name = c("Soda", "Bagel", "Coffee", "Cheese")
)
knitr::kable(products_sales_df)
| product_id | product_name |
|---|---|
| 1 | Soda |
| 2 | Bagel |
| 3 | Coffee |
| 4 | Cheese |
customers_df <- data.frame(
customer_id = c(1, 2, 3, 4),
customer = c("James", "Shirley", "Mike", "Roman")
)
knitr::kable(customers_df)
sales_details_df <- data.frame(
sale_id = c(1, 2, 3, 4),
product_id = c(1, 2, 3, 4),
customer_id = c(1, 2, 3, 4),
sale_amount = c(2.50, 4.50, 1.50, 6.00)
)
knitr::kable(sales_details_df)
| sale_id | product_id | customer_id | sale_amount |
|---|---|---|---|
| 1 | 1 | 1 | 2.5 |
| 2 | 2 | 2 | 4.5 |
| 3 | 3 | 3 | 1.5 |
| 4 | 4 | 4 | 6.0 |
Source: [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/]
Using the 173 majors listed in fivethirtyeight College Majors dataset, provide code that identifies the majors that contain either “DATA” or “STATISTICS”
majors_df <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/refs/heads/master/college-majors/all-ages.csv")
selected_majors <- majors_df[grep("DATA|STATITICS", majors_df$Major, ignore.case = TRUE), ]
knitr::kable(selected_majors)
| Major_code | Major | Major_category | Total | Employed | Employed_full_time_year_round | Unemployed | Unemployment_rate | Median | P25th | P75th | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 20 | 2101 | COMPUTER PROGRAMMING AND DATA PROCESSING | Computers & Mathematics | 29317 | 22828 | 18747 | 2265 | 0.0902642 | 60000 | 40000 | 85000 |
cleaned_majors <- selected_majors %>%
select(Major, Employed, Unemployment_rate, Median) %>%
rename(
Major_Name = Major,
Employed_Graduates = Employed,
Unemployment_Rate = Unemployment_rate,
Median_Salary = Median
)
knitr::kable(cleaned_majors)
| Major_Name | Employed_Graduates | Unemployment_Rate | Median_Salary | |
|---|---|---|---|---|
| 20 | COMPUTER PROGRAMMING AND DATA PROCESSING | 22828 | 0.0902642 | 60000 |
Describe, in words, what these expressions will match:
(.)\1\1
Description: Looks for the same character repeated 3x in a row
Example: “aaa”, “111”, “!!!”
"(.)(.)\\2\\1"
Description: Looks for two different characters next to each other, followed by those same two characters but mirrored in reverse (like a symmetrical reflection).
Example: “abba”, “7887”
(..)\1
Description: Looks for two characters repeated one after the other.
Example: “abab”, “7878”
"(.).\\1.\\1"
Description: Matches single character, any character, first character, any character, first character.
Example: “abaca”, “12131”
"(.)(.)(.).*\\3\\2\\1"
Description: Looks for 3 characters, followed by any sequence of characters, an then those 3 characters in reverse.
Example: “abc…cba”, “123…321”
Question 1: Start and end with the same character.
Regex Answer: ^(.).*\1$
Question 2: Contain a repeated pair of letters (e.g. “church contains”ch” repeated twice)
Regex Answer: (..).*\1
Question 3: Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s)
Regex Answer: (.).?\1.?\1