knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)

Normalization

Provide an example of at least three dataframes in R that demonstrate normalization. The dataframes can contain any data, either real or synthetic. Although normalization is typically done in SQL and relational databases, you are expected to show this example in R, as it is our main work environment in this course.

To demonstrate normalization, we will load a table scraped from Wikipedia on the judges that have been on the television show Drag Race.

drag_race_judges_raw <- read.csv("https://raw.githubusercontent.com/mraynolds/data_607/refs/heads/main/drag_race_judges_raw.csv")

The table must be atomized and flattened. Additionally the “X” added to the season number is removed and all rows with an empty judge_type are filtered out as this indicates that the person listed was not a judge at that time. Finally, the judges names are split into first and last name. The table should now meet first normal form (1NF).

drag_race_judges_flat <- drag_race_judges_raw |> 
  pivot_longer(
    cols = starts_with("X"),
    names_to = "season",
    values_to = "judge_type") |> 
  mutate(season = str_remove_all(season,"X")) |> 
  filter(judge_type != "") |> 
  relocate(season) |> 
  separate_wider_delim(
    Judge,
    delim = " ",
    names = c("judge_name_last", "judge_name_first"),
    too_few = ,
    too_many = "merge")

This data can now be extracted to increase the normalization form. This can be started by creating a table of judge_type. A judge type key will be assigned:

judge_type <- drag_race_judges_flat |> 
  select(judge_type) |> 
  distinct() |> 
  mutate(judge_type_key = row_number()) |> 
  relocate(judge_type_key)

The same transformation can be performed for the judges themselves. Separating them out and assigning them a judge key.

judge_list <- drag_race_judges_flat |> 
  select(judge_name_first,judge_name_last) |> 
  distinct() |> 
  mutate(judge_key = row_number()) |> 
  relocate(judge_key)

Finally, these new tables can be used to create a table that contains all the original table information but transformed to third normal form. This is done by joining the judge_list and judge_type tables to the original table and then selecting the appropriate columns

drag_race_judges <- drag_race_judges_flat |> 
  left_join(judge_list) |> 
  left_join(judge_type) |> 
  select(season, judge_key, judge_type_key)
## Joining with `by = join_by(judge_name_last, judge_name_first)`
## Joining with `by = join_by(judge_type)`

While this is a simple example using only three variables these three final tables (drag_race_judges, judge_list, judge_type) should now contain all of the data in the starting table but at third normal form.

2. Character Manipulation

Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors that contain either “DATA” or “STATISTICS”

The below code loads the .csv file from the 538 website.

college_majors <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/refs/heads/master/college-majors/majors-list.csv")

glimpse(college_majors)
## Rows: 174
## Columns: 3
## $ FOD1P          <chr> "1100", "1101", "1102", "1103", "1104", "1105", "1106",…
## $ Major          <chr> "GENERAL AGRICULTURE", "AGRICULTURE PRODUCTION AND MANA…
## $ Major_Category <chr> "Agriculture & Natural Resources", "Agriculture & Natur…

The code below then creates a new data frame and filters the college major list to only majors that contain the words “Data” and “Statistics”. It does this by finding the row indices for majors with those words and then slicing the college_majors data frame to only those rows.

data_stat_majors <- college_majors |> 
  slice(grep("data | statistics", college_majors$Major, ignore.case = TRUE, value = FALSE))

glimpse(data_stat_majors)
## Rows: 2
## Columns: 3
## $ FOD1P          <chr> "6212", "2101"
## $ Major          <chr> "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS", "COMPU…
## $ Major_Category <chr> "Business", "Computers & Mathematics"

An alternative method would be to use filter and str_detect together:

data_majors_alt <- college_majors |> 
  filter(str_detect(Major, regex("Data | Statistics", ignore_case = TRUE)))

3. Describe, in words, what these expressions will match:

(.)\1\1 If quoted and escaped correctly this will look for three of the same characters repeated, for example “eee”.

“(.)(.)\2\1” This will look for a four character pattern where the first two characters are repeated in reverse order, for example “abba”.

(..)\1 If quoted and escaped correctly this will look for a four character pattern where any two characters are then immediately repeated, for example “rber”.

“(.).\1.\1” This will look for a five character pattern where a captured character is repeated 3 times with any single character between them. For example “n”.

“(.)(.)(.).*\3\2\1” This will look for a three character pattern that is then repeated in reverse order with any number of characters between them. For example the “par” in “h”.

4. Construct regular expressions to match words that:

Start and end with the same character.

str_view(words, "^(.).*\\1$")
##  [36] │ <america>
##  [49] │ <area>
## [209] │ <dad>
## [213] │ <dead>
## [223] │ <depend>
## [258] │ <educate>
## [266] │ <else>
## [268] │ <encourage>
## [270] │ <engine>
## [278] │ <europe>
## [283] │ <evidence>
## [285] │ <example>
## [287] │ <excuse>
## [288] │ <exercise>
## [291] │ <expense>
## [292] │ <experience>
## [296] │ <eye>
## [386] │ <health>
## [394] │ <high>
## [450] │ <knock>
## ... and 16 more

Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)

str_view(words, "(..).*\\1")
##  [48] │ ap<propr>iate
## [152] │ <church>
## [181] │ c<ondition>
## [217] │ <decide>
## [275] │ <environmen>t
## [487] │ l<ondon>
## [598] │ pa<ragra>ph
## [603] │ p<articular>
## [617] │ <photograph>
## [638] │ p<repare>
## [641] │ p<ressure>
## [696] │ r<emem>ber
## [698] │ <repre>sent
## [699] │ <require>
## [739] │ <sense>
## [858] │ the<refore>
## [903] │ u<nderstand>
## [946] │ w<hethe>r

Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

str_view(words, "(.).*\\1.*\\1")
##  [48] │ a<pprop>riate
##  [62] │ <availa>ble
##  [86] │ b<elieve>
##  [90] │ b<etwee>n
## [119] │ bu<siness>
## [221] │ d<egree>
## [229] │ diff<erence>
## [233] │ di<scuss>
## [265] │ <eleve>n
## [275] │ e<nvironmen>t
## [283] │ <evidence>
## [288] │ <exercise>
## [291] │ <expense>
## [292] │ <experience>
## [423] │ <indivi>dual
## [598] │ p<aragra>ph
## [684] │ r<eceive>
## [696] │ r<emembe>r
## [698] │ r<eprese>nt
## [845] │ t<elephone>
## ... and 2 more