Library

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)

Load Data

children_supply <- read.csv ("https://raw.githubusercontent.com/JaydeeJan/Children-Supply/refs/heads/main/Childcare_Need___Supply__All_.csv")

head(children_supply)
##   Geographic.Unit Geographic.ID Geographic.Name State.Median.Income.Bracket
## 1          County         53001    Adams County                <=60% of SMI
## 2          County         53001    Adams County       >60% and <=75% of SMI
## 3          County         53001    Adams County       >75% and <=85% of SMI
## 4          County         53001    Adams County                 >85% of SMI
## 5          County         53001    Adams County                <=60% of SMI
## 6          County         53001    Adams County       >60% and <=75% of SMI
##   Age.Group Childcare.Subsidized Private.Care.Estimate
## 1    Infant                   18                    14
## 2    Infant                    0                     0
## 3    Infant                    0                     0
## 4    Infant                    0                     4
## 5 Preschool                  220                    23
## 6 Preschool                    0                     2
##   Estimated.Children.Receiving.Childcare Estimate.of.Unserved Percent.Need.Met
## 1                                     32                  245             11.6
## 2                                      0                   14              0.0
## 3                                      0                    8              0.0
## 4                                      4                   38              9.5
## 5                                    243                  469             34.1
## 6                                      2                   60              3.2
tail(children_supply)
##       Geographic.Unit Geographic.ID Geographic.Name State.Median.Income.Bracket
## 15627        ZIP Code         99403           99403       >75% and <=85% of SMI
## 15628        ZIP Code         99403           99403                 >85% of SMI
## 15629        ZIP Code         99403           99403                <=60% of SMI
## 15630        ZIP Code         99403           99403       >60% and <=75% of SMI
## 15631        ZIP Code         99403           99403       >75% and <=85% of SMI
## 15632        ZIP Code         99403           99403                 >85% of SMI
##        Age.Group Childcare.Subsidized Private.Care.Estimate
## 15627 School Age                    0                     0
## 15628 School Age                    0                     6
## 15629    Toddler                   60                     6
## 15630    Toddler                    0                     6
## 15631    Toddler                    0                     1
## 15632    Toddler                    0                    23
##       Estimated.Children.Receiving.Childcare Estimate.of.Unserved
## 15627                                      0                   99
## 15628                                      6                  223
## 15629                                     66                  199
## 15630                                      6                   51
## 15631                                      1                   15
## 15632                                     23                   67
##       Percent.Need.Met
## 15627              0.0
## 15628              2.6
## 15629             24.9
## 15630             10.5
## 15631              6.2
## 15632             25.6
glimpse(children_supply)
## Rows: 15,632
## Columns: 10
## $ Geographic.Unit                        <chr> "County", "County", "County", "…
## $ Geographic.ID                          <int> 53001, 53001, 53001, 53001, 530…
## $ Geographic.Name                        <chr> "Adams County", "Adams County",…
## $ State.Median.Income.Bracket            <chr> "<=60% of SMI", ">60% and <=75%…
## $ Age.Group                              <chr> "Infant", "Infant", "Infant", "…
## $ Childcare.Subsidized                   <int> 18, 0, 0, 0, 220, 0, 0, NA, 154…
## $ Private.Care.Estimate                  <int> 14, 0, 0, 4, 23, 2, 0, 11, 58, …
## $ Estimated.Children.Receiving.Childcare <int> 32, 0, 0, 4, 243, 2, 0, NA, 212…
## $ Estimate.of.Unserved                   <int> 245, 14, 8, 38, 469, 60, 20, 10…
## $ Percent.Need.Met                       <dbl> 11.6, 0.0, 0.0, 9.5, 34.1, 3.2,…

Replacing N/A to empty values

children_supply[children_supply == ""] <- NA
head(children_supply,10)
##    Geographic.Unit Geographic.ID Geographic.Name State.Median.Income.Bracket
## 1           County         53001    Adams County                <=60% of SMI
## 2           County         53001    Adams County       >60% and <=75% of SMI
## 3           County         53001    Adams County       >75% and <=85% of SMI
## 4           County         53001    Adams County                 >85% of SMI
## 5           County         53001    Adams County                <=60% of SMI
## 6           County         53001    Adams County       >60% and <=75% of SMI
## 7           County         53001    Adams County       >75% and <=85% of SMI
## 8           County         53001    Adams County                 >85% of SMI
## 9           County         53001    Adams County                <=60% of SMI
## 10          County         53001    Adams County       >60% and <=75% of SMI
##     Age.Group Childcare.Subsidized Private.Care.Estimate
## 1      Infant                   18                    14
## 2      Infant                    0                     0
## 3      Infant                    0                     0
## 4      Infant                    0                     4
## 5   Preschool                  220                    23
## 6   Preschool                    0                     2
## 7   Preschool                    0                     0
## 8   Preschool                   NA                    11
## 9  School Age                  154                    58
## 10 School Age                    0                     5
##    Estimated.Children.Receiving.Childcare Estimate.of.Unserved Percent.Need.Met
## 1                                      32                  245             11.6
## 2                                       0                   14              0.0
## 3                                       0                    8              0.0
## 4                                       4                   38              9.5
## 5                                     243                  469             34.1
## 6                                       2                   60              3.2
## 7                                       0                   20              0.0
## 8                                      NA                  100               NA
## 9                                     212                 2178              8.9
## 10                                      5                  219              2.2
tail(children_supply,10)
##       Geographic.Unit Geographic.ID Geographic.Name State.Median.Income.Bracket
## 15623        ZIP Code         99403           99403       >75% and <=85% of SMI
## 15624        ZIP Code         99403           99403                 >85% of SMI
## 15625        ZIP Code         99403           99403                <=60% of SMI
## 15626        ZIP Code         99403           99403       >60% and <=75% of SMI
## 15627        ZIP Code         99403           99403       >75% and <=85% of SMI
## 15628        ZIP Code         99403           99403                 >85% of SMI
## 15629        ZIP Code         99403           99403                <=60% of SMI
## 15630        ZIP Code         99403           99403       >60% and <=75% of SMI
## 15631        ZIP Code         99403           99403       >75% and <=85% of SMI
## 15632        ZIP Code         99403           99403                 >85% of SMI
##        Age.Group Childcare.Subsidized Private.Care.Estimate
## 15623  Preschool                    0                     6
## 15624  Preschool                    0                    61
## 15625 School Age                   63                    15
## 15626 School Age                   NA                     2
## 15627 School Age                    0                     0
## 15628 School Age                    0                     6
## 15629    Toddler                   60                     6
## 15630    Toddler                    0                     6
## 15631    Toddler                    0                     1
## 15632    Toddler                    0                    23
##       Estimated.Children.Receiving.Childcare Estimate.of.Unserved
## 15623                                      6                   30
## 15624                                     61                   65
## 15625                                     78                  965
## 15626                                     NA                  156
## 15627                                      0                   99
## 15628                                      6                  223
## 15629                                     66                  199
## 15630                                      6                   51
## 15631                                      1                   15
## 15632                                     23                   67
##       Percent.Need.Met
## 15623             16.7
## 15624             48.4
## 15625              7.5
## 15626               NA
## 15627              0.0
## 15628              2.6
## 15629             24.9
## 15630             10.5
## 15631              6.2
## 15632             25.6

First Normal Form (1NF) sort the data by Percent Need Met from min. to max.

1. Using row order to convey information is not permitted.

2. Mixing data types within the same column is not permitted.

3. Having a table without a primary key is not permitted.

4. Repeating groups are not permitted.

percent_met <- children_supply %>% arrange(`Percent.Need.Met`)
filtered_percentmet <- percent_met %>% filter(`Percent.Need.Met` !=0)
head(filtered_percentmet,10)
##    Geographic.Unit Geographic.ID                 Geographic.Name
## 1           County         53039                Klickitat County
## 2           County         53019                    Ferry County
## 3           County         53047                 Okanogan County
## 4  School District       5301630        Colville School District
## 5  School District       5301660        Concrete School District
## 6  School District       5304260        Lakewood School District
## 7  School District       5304560       Riverview School District
## 8  School District       5307650 San Juan Island School District
## 9  School District       5308550          Sultan School District
## 10 School District       5310140            Yelm School District
##    State.Median.Income.Bracket  Age.Group Childcare.Subsidized
## 1        >60% and <=75% of SMI School Age                    0
## 2                  >85% of SMI School Age                    0
## 3        >75% and <=85% of SMI School Age                    0
## 4                  >85% of SMI School Age                    0
## 5                  >85% of SMI School Age                    0
## 6        >60% and <=75% of SMI School Age                    0
## 7        >60% and <=75% of SMI School Age                    0
## 8                 <=60% of SMI School Age                    0
## 9                  >85% of SMI School Age                    0
## 10       >75% and <=85% of SMI School Age                    0
##    Private.Care.Estimate Estimated.Children.Receiving.Childcare
## 1                      1                                      1
## 2                      1                                      1
## 3                      1                                      1
## 4                      1                                      1
## 5                      1                                      1
## 6                      1                                      1
## 7                      1                                      1
## 8                      1                                      1
## 9                      2                                      2
## 10                     1                                      1
##    Estimate.of.Unserved Percent.Need.Met
## 1                   246              0.4
## 2                   185              0.5
## 3                   201              0.5
## 4                   209              0.5
## 5                   216              0.5
## 6                   216              0.5
## 7                   193              0.5
## 8                   213              0.5
## 9                   369              0.5
## 10                  221              0.5
tail(filtered_percentmet,10)
##      Geographic.Unit Geographic.ID Geographic.Name State.Median.Income.Bracket
## 7376        ZIP Code         99140           99140                <=60% of SMI
## 7377        ZIP Code         99140           99140       >60% and <=75% of SMI
## 7378        ZIP Code         99140           99140                 >85% of SMI
## 7379        ZIP Code         99140           99140                 >85% of SMI
## 7380        ZIP Code         99140           99140       >75% and <=85% of SMI
## 7381        ZIP Code         99155           99155       >75% and <=85% of SMI
## 7382        ZIP Code         99158           99158       >75% and <=85% of SMI
## 7383        ZIP Code         99173           99173       >75% and <=85% of SMI
## 7384        ZIP Code         99204           99204       >75% and <=85% of SMI
## 7385        ZIP Code         99357           99357                 >85% of SMI
##      Age.Group Childcare.Subsidized Private.Care.Estimate
## 7376    Infant                    0                     3
## 7377    Infant                    0                     1
## 7378    Infant                    0                     1
## 7379 Preschool                    0                     1
## 7380   Toddler                    0                     2
## 7381 Preschool                    0                     1
## 7382 Preschool                    0                     1
## 7383 Preschool                    0                     1
## 7384   Toddler                    0                     1
## 7385    Infant                    0                     5
##      Estimated.Children.Receiving.Childcare Estimate.of.Unserved
## 7376                                      3                    0
## 7377                                      1                    0
## 7378                                      1                    0
## 7379                                      1                    0
## 7380                                      2                    0
## 7381                                      1                    0
## 7382                                      1                    0
## 7383                                      1                    0
## 7384                                      1                    0
## 7385                                      5                    0
##      Percent.Need.Met
## 7376              100
## 7377              100
## 7378              100
## 7379              100
## 7380              100
## 7381              100
## 7382              100
## 7383              100
## 7384              100
## 7385              100

Second Normal Form (2NF)

Each non-key attribute in the table must be dependent on the entire primary key.

# Geographic Table
geographic_table <- filtered_percentmet %>%
  select(`Geographic.Unit`, `Geographic.ID`, `Geographic.Name`)
head(geographic_table,10)
##    Geographic.Unit Geographic.ID                 Geographic.Name
## 1           County         53039                Klickitat County
## 2           County         53019                    Ferry County
## 3           County         53047                 Okanogan County
## 4  School District       5301630        Colville School District
## 5  School District       5301660        Concrete School District
## 6  School District       5304260        Lakewood School District
## 7  School District       5304560       Riverview School District
## 8  School District       5307650 San Juan Island School District
## 9  School District       5308550          Sultan School District
## 10 School District       5310140            Yelm School District
# Childcare-related info. table
childcare_table <- filtered_percentmet %>%
  select(`Geographic.ID`, `State.Median.Income.Bracket`, `Age.Group`, `Estimated.Children.Receiving.Childcare`, `Estimate.of.Unserved`, `Percent.Need.Met`)
head(childcare_table, 10)
##    Geographic.ID State.Median.Income.Bracket  Age.Group
## 1          53039       >60% and <=75% of SMI School Age
## 2          53019                 >85% of SMI School Age
## 3          53047       >75% and <=85% of SMI School Age
## 4        5301630                 >85% of SMI School Age
## 5        5301660                 >85% of SMI School Age
## 6        5304260       >60% and <=75% of SMI School Age
## 7        5304560       >60% and <=75% of SMI School Age
## 8        5307650                <=60% of SMI School Age
## 9        5308550                 >85% of SMI School Age
## 10       5310140       >75% and <=85% of SMI School Age
##    Estimated.Children.Receiving.Childcare Estimate.of.Unserved Percent.Need.Met
## 1                                       1                  246              0.4
## 2                                       1                  185              0.5
## 3                                       1                  201              0.5
## 4                                       1                  209              0.5
## 5                                       1                  216              0.5
## 6                                       1                  216              0.5
## 7                                       1                  193              0.5
## 8                                       1                  213              0.5
## 9                                       2                  369              0.5
## 10                                      1                  221              0.5
tail(childcare_table, 10)
##      Geographic.ID State.Median.Income.Bracket Age.Group
## 7376         99140                <=60% of SMI    Infant
## 7377         99140       >60% and <=75% of SMI    Infant
## 7378         99140                 >85% of SMI    Infant
## 7379         99140                 >85% of SMI Preschool
## 7380         99140       >75% and <=85% of SMI   Toddler
## 7381         99155       >75% and <=85% of SMI Preschool
## 7382         99158       >75% and <=85% of SMI Preschool
## 7383         99173       >75% and <=85% of SMI Preschool
## 7384         99204       >75% and <=85% of SMI   Toddler
## 7385         99357                 >85% of SMI    Infant
##      Estimated.Children.Receiving.Childcare Estimate.of.Unserved
## 7376                                      3                    0
## 7377                                      1                    0
## 7378                                      1                    0
## 7379                                      1                    0
## 7380                                      2                    0
## 7381                                      1                    0
## 7382                                      1                    0
## 7383                                      1                    0
## 7384                                      1                    0
## 7385                                      5                    0
##      Percent.Need.Met
## 7376              100
## 7377              100
## 7378              100
## 7379              100
## 7380              100
## 7381              100
## 7382              100
## 7383              100
## 7384              100
## 7385              100

Fourth Normal Form (4NF)

The only kinds of multivalued dependency allowed in a table are multivalued dependencies on the key.

median_income <- unique(childcare_table[, c("Geographic.ID", "State.Median.Income.Bracket")])
income_sorted <- median_income[order(median_income$Geographic.ID), ]
head(income_sorted, 10)
##      Geographic.ID State.Median.Income.Bracket
## 309          53001       >60% and <=75% of SMI
## 337          53001       >75% and <=85% of SMI
## 2324         53001                <=60% of SMI
## 2455         53001                 >85% of SMI
## 850          53002       >75% and <=85% of SMI
## 2018         53002                <=60% of SMI
## 2348         53002                 >85% of SMI
## 2556         53002       >60% and <=75% of SMI
## 310          53003                 >85% of SMI
## 947          53003       >75% and <=85% of SMI
tail(income_sorted, 10)
##      Geographic.ID State.Median.Income.Bracket
## 2845       5310110       >60% and <=75% of SMI
## 2963       5310110                 >85% of SMI
## 4003       5310110                <=60% of SMI
## 10         5310140       >75% and <=85% of SMI
## 273        5310140                 >85% of SMI
## 1249       5310140                <=60% of SMI
## 1413       5310170       >60% and <=75% of SMI
## 1511       5310170       >75% and <=85% of SMI
## 1731       5310170                 >85% of SMI
## 2928       5310170                <=60% of SMI
age_group <- unique(childcare_table[, c("Geographic.ID", "Age.Group")])
age_sorted <- age_group[order(age_group$Geographic.ID), ]
head(age_sorted, 10)
##      Geographic.ID  Age.Group
## 309          53001 School Age
## 594          53001  Preschool
## 1917         53001    Toddler
## 2231         53001     Infant
## 850          53002 School Age
## 1493         53002     Infant
## 2556         53002  Preschool
## 2941         53002    Toddler
## 310          53003 School Age
## 947          53003     Infant
tail(age_sorted, 10)
##      Geographic.ID  Age.Group
## 2963       5310110 School Age
## 6243       5310110  Preschool
## 10         5310140 School Age
## 1362       5310140  Preschool
## 2663       5310140     Infant
## 3762       5310140    Toddler
## 1413       5310170    Toddler
## 1511       5310170 School Age
## 1731       5310170  Preschool
## 4354       5310170     Infant

Character Manipulation

2. Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors that contain either “DATA” or “STATISTICS”

listed_majors <- read.csv ("https://raw.githubusercontent.com/fivethirtyeight/data/refs/heads/master/college-majors/majors-list.csv")

head(listed_majors)
##   FOD1P                                 Major                  Major_Category
## 1  1100                   GENERAL AGRICULTURE Agriculture & Natural Resources
## 2  1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources
## 3  1102                AGRICULTURAL ECONOMICS Agriculture & Natural Resources
## 4  1103                       ANIMAL SCIENCES Agriculture & Natural Resources
## 5  1104                          FOOD SCIENCE Agriculture & Natural Resources
## 6  1105            PLANT SCIENCE AND AGRONOMY Agriculture & Natural Resources
tail(listed_majors)
##     FOD1P                            Major Major_Category
## 169  5503                      CRIMINOLOGY Social Science
## 170  5504                        GEOGRAPHY Social Science
## 171  5505          INTERNATIONAL RELATIONS Social Science
## 172  5506 POLITICAL SCIENCE AND GOVERNMENT Social Science
## 173  5507                        SOCIOLOGY Social Science
## 174  5599    MISCELLANEOUS SOCIAL SCIENCES Social Science
# use grepl to filter rows where the Major column contains "DATA" or "STATISTICS"
filtered_majors <- listed_majors[grepl("DATA|STATISTICS", listed_majors$Major, ignore.case = TRUE), ]

print(filtered_majors)
##    FOD1P                                         Major          Major_Category
## 44  6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS                Business
## 52  2101      COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 59  3702               STATISTICS AND DECISION SCIENCE Computers & Mathematics

Describe, in words, what these expressions will match:

(.)\1\1

1. (.) - will match any single character in first group

2. \1 - refers to the match contained in the first parenthesis

3. \1\1 - character repeated two or more times.

“(.)(.)\2\1”

1. (.)(.) - will match any single character in first group and second group

2. \2 refers to the match contained in the second parenthesis

3. \1 refers to the match contained in the first parenthesis

(..)\1

1. (..) - match with two characters

2. \1 - refers to the match contained in the first parenthesis

“(.).\1.\1”

1. (.) - will match any single character in first group

2. . - match any character

3. \1 - refers to the match contained in the first parenthesis

4. . - match any character

5. \1 - refers to the match contained in the first parenthesis

“(.)(.)(.).*\3\2\1”

1. (.) - will match any single character in first group

2. (.) - will match any single character in second group

3. (.) - will match any single character in third group

4. .* - will match any sequence of characters

5. \3 - refers to the match contained in the third parenthesis

6. \2 - refers to the match contained in the second parenthesis

7. \1 - refers to the match contained in the first parenthesis

Construct regular expressions to match words that:

Start and end with the same character.

(.)\1

Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)

“(..).*\1”

Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

“(.)\1.*\1”