library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
Below I have attempted to create three tables, each in third normal form (3NF), that describe financial aid awards for students.
# Students table: one row per student, identified by student_id.
students <- data.frame(
  student_id    = as.numeric(1:6),  # stored as double: 1, 2, ..., 6
  first_name    = c("Alice", "Bob", "Charlie", "Peter", "Tony", "Emily"),
  last_name     = c(
    "Wunderland", "Dylan", "Chocolate",
    "Parker", "Stark", "Clarke"
  ),
  student_email = c(
    "alice_w@aol.com",
    "bobby@yahoo.com",
    "charlie_chocolate@aol.com",
    "spidey@gmail.com",
    "iron_man@gmail.com",
    "Em_C@yahoo.com"
  )
)

# Echo the table to the console.
print(students)
## student_id first_name last_name student_email
## 1 1 Alice Wunderland alice_w@aol.com
## 2 2 Bob Dylan bobby@yahoo.com
## 3 3 Charlie Chocolate charlie_chocolate@aol.com
## 4 4 Peter Parker spidey@gmail.com
## 5 5 Tony Stark iron_man@gmail.com
## 6 6 Emily Clarke Em_C@yahoo.com
# Awards table: one row per award, identified by award_id.
# Each award has a funding source, a type (Grant / Scholarship / Loan)
# and a display name.
awards <- data.frame(
  # Plain numerics: R discards leading zeros in numeric literals, so
  # writing 001 only suggested zero-padded ids that were never stored.
  award_id = c(1, 2, 3, 4),
  award_source = c("Federal", "State", "Institutional", "Private"),
  # Fixed spelling: "Scholarhip" -> "Scholarship".
  award_type = c("Grant", "Grant", "Scholarship", "Loan"),
  award_name = c("Pell Grant", "TAP Grant", "Vallone Scholarship", "Wells Loan")
)
print(awards)
## award_id award_source award_type award_name
## 1 1 Federal Grant Pell Grant
## 2 2 State Grant TAP Grant
## 3 3 Institutional Scholarship Vallone Scholarship
## 4 4 Private Loan Wells Loan
# Award activity: one row per award given to a student.
# student_id and award_id use the same id values as the `students` and
# `awards` tables, so the three tables can be joined on those columns.
stdnt_award <- data.frame(
  student_id = c(1, 1, 2, 3, 4, 5, 5, 6, 6, 6),
  # Plain numerics: leading zeros in numeric literals (001) are discarded by R.
  award_id = c(1, 2, 3, 2, 4, 1, 2, 1, 2, 3),
  award_amt = c(
    2000.00, 506.50, 350.00, 2360.60, 10890.90,
    460.20, 3200.45, 1260.50, 450.00, 250.00
  ),
  # Stored as Date rather than character so that date arithmetic, range
  # filters and chronological sorting work; it prints the same as before.
  award_date = as.Date(
    c(
      "2025-01-14", "2025-01-14", "2025-01-22", "2025-01-04", "2025-01-06",
      "2025-01-08", "2025-02-03", "2025-02-04", "2025-02-10", "2025-02-12"
    ),
    format = "%Y-%m-%d"
  )
)
print(stdnt_award)
## student_id award_id award_amt award_date
## 1 1 1 2000.00 2025-01-14
## 2 1 2 506.50 2025-01-14
## 3 2 3 350.00 2025-01-22
## 4 3 2 2360.60 2025-01-04
## 5 4 4 10890.90 2025-01-06
## 6 5 1 460.20 2025-01-08
## 7 5 2 3200.45 2025-02-03
## 8 6 1 1260.50 2025-02-04
## 9 6 2 450.00 2025-02-10
## 10 6 3 250.00 2025-02-12
# Download the FiveThirtyEight college-majors list from GitHub and read
# the CSV into `majors_data`.
majors_data <- readr::read_csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/refs/heads/master/college-majors/majors-list.csv"
)
## Rows: 174 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): FOD1P, Major, Major_Category
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the rows whose Major name contains "DATA" or "STATISTICS"
# (the regex alternation matches either word anywhere in the string).
filtered_majors <- dplyr::filter(
  majors_data,
  stringr::str_detect(string = Major, pattern = "DATA|STATISTICS")
)

# Show the matching majors.
print(filtered_majors)
## # A tibble: 3 × 3
## FOD1P Major Major_Category
## <chr> <chr> <chr>
## 1 6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS Business
## 2 2101 COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 3 3702 STATISTICS AND DECISION SCIENCE Computers & Mathematics
The two exercises below are taken from R for Data Science, 14.3.5.1 in the on-line version:
The expression `(.)\1\1` places a character into the first capture group, then requires that same character to repeat two more times consecutively, so it matches strings such as “aaa” and “aaalex”.
The expression `(.)(.)\2\1` takes a character and places it into the first capture group, places the next character into the second capture group, and then requires the following two characters to match group 2 and then group 1. In other words, it matches a pair of characters followed by the same pair in reverse order, such as “abba”.
The expression `(..)\1` takes two characters and stores them together in the first capture group, then requires the same two characters to follow immediately, such as “abab”.
The expression `(.).\1.\1` takes a character and stores it in capture group 1, matches any single character, repeats the character from group 1, matches any single character again, and finally repeats the character from group 1 once more, for example “abaca”.
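The expression `(.)(.)(.).*\3\2\1` takes a character and stores it in capture group 1, the next character in capture group 2, and the following one in capture group 3. It then matches any number of characters (including none) before requiring the character from group 3, then group 2, then group 1. In other words, three characters are later followed by the same three in reverse order, such as “abccba” or “abc123cba”.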
“^(.).*\1$”
“(..).*\1”
“(.).\1.\1”