Overview

Normalization

# Load the employee sample data from GitHub and inspect its structure
df <- read.csv(
  file = "https://raw.githubusercontent.com/yli1048/yli1048/refs/heads/607/Employee%20Sample%20Data.csv",
  header = TRUE
)
glimpse(df)
## Rows: 19
## Columns: 14
## $ EEID          <chr> "E02387", "E04105", "E02572", "E02832", "E01639", "E0064…
## $ Full.Name     <chr> "Emily Davis", "Theodore Dinh", "Luna Sanders", "Penelop…
## $ Job.Title     <chr> "Sr. Manger", "Technical Architect", "Director", "Comput…
## $ Department    <chr> "IT", "IT", "Finance", "IT", "Finance", "Sales", "IT", "…
## $ Business.Unit <chr> "Research & Development", "Manufacturing", "Speciality P…
## $ Gender        <chr> "Female", "Male", "Female", "Female", "Male", "Male", "F…
## $ Ethnicity     <chr> "Black", "Asian", "Caucasian", "Caucasian", "Asian", "As…
## $ Age           <int> 55, 59, 50, 26, 55, 57, 27, 25, 29, 34, 36, 27, 59, 51, …
## $ Hire.Date     <chr> "4/8/2016", "11/29/1997", "10/26/2006", "9/27/2019", "11…
## $ Annual.Salary <int> 141604, 99975, 163099, 84913, 95409, 50994, 119746, 4133…
## $ Bonus..       <chr> "15%", "0%", "20%", "7%", "0%", "0%", "10%", "0%", "6%",…
## $ Country       <chr> "United States", "China", "United States", "United State…
## $ City          <chr> "Seattle", "Chongqing", "Chicago", "Chicago", "Phoenix",…
## $ Exit.Date     <chr> "10/16/2021", "", "", "", "", "", "", "5/20/2021", "", "…
# Create data frame for employee information.
# Columns are selected by name rather than by position so the subset stays
# correct (or fails loudly) if the column order of the source CSV changes.
df1 <- df[, c(
  "EEID", "Full.Name", "Job.Title", "Department", "Business.Unit",
  "Hire.Date", "Exit.Date"
)]
print(df1)
##      EEID       Full.Name                Job.Title      Department
## 1  E02387     Emily Davis               Sr. Manger              IT
## 2  E04105   Theodore Dinh      Technical Architect              IT
## 3  E02572    Luna Sanders                 Director         Finance
## 4  E02832 Penelope Jordan Computer Systems Manager              IT
## 5  E01639       Austin Vo              Sr. Analyst         Finance
## 6  E00644    Joshua Gupta   Account Representative           Sales
## 7  E01550     Ruby Barnes                  Manager              IT
## 8  E04332     Luke Martin                  Analyst         Finance
## 9  E04533   Easton Bailey                  Manager      Accounting
## 10 E03838 Madeline Walker              Sr. Analyst         Finance
## 11 E00591    Savannah Ali               Sr. Manger Human Resources
## 12 E03344   Camila Rogers        Controls Engineer     Engineering
## 13 E00530       Eli Jones                  Manager Human Resources
## 14 E04239    Everleigh Ng               Sr. Manger         Finance
## 15 E03496     Robert Yang              Sr. Analyst      Accounting
## 16 E00549     Isabella Xi           Vice President       Marketing
## 17 E00163    Bella Powell                 Director         Finance
## 18 E00884    Camila Silva               Sr. Manger       Marketing
## 19 E04116    David Barnes                 Director              IT
##             Business.Unit  Hire.Date  Exit.Date
## 1  Research & Development   4/8/2016 10/16/2021
## 2           Manufacturing 11/29/1997           
## 3     Speciality Products 10/26/2006           
## 4           Manufacturing  9/27/2019           
## 5           Manufacturing 11/20/1995           
## 6               Corporate  1/24/2017           
## 7               Corporate   7/1/2020           
## 8           Manufacturing  5/16/2020  5/20/2021
## 9           Manufacturing  1/25/2019           
## 10    Speciality Products  6/13/2018           
## 11          Manufacturing  2/11/2009           
## 12    Speciality Products 10/21/2021           
## 13          Manufacturing  3/14/1999           
## 14 Research & Development  6/10/2021           
## 15    Speciality Products  11/4/2017   3/9/2020
## 16 Research & Development  3/13/2013           
## 17 Research & Development   3/4/2002           
## 18    Speciality Products  12/1/2003           
## 19              Corporate  11/3/2013
# Create data frame for employee's salary.
# Columns are selected by name rather than by position so the subset stays
# correct (or fails loudly) if the column order of the source CSV changes.
# "Bonus.." is the name read.csv gives the bonus-percentage column.
df2 <- df[, c("EEID", "Annual.Salary", "Bonus..")]
print(df2)
##      EEID Annual.Salary Bonus..
## 1  E02387        141604     15%
## 2  E04105         99975      0%
## 3  E02572        163099     20%
## 4  E02832         84913      7%
## 5  E01639         95409      0%
## 6  E00644         50994      0%
## 7  E01550        119746     10%
## 8  E04332         41336      0%
## 9  E04533        113527      6%
## 10 E03838         77203      0%
## 11 E00591        157333     15%
## 12 E03344        109851      0%
## 13 E00530        105086      9%
## 14 E04239        146742     10%
## 15 E03496         97078      0%
## 16 E00549        249270     30%
## 17 E00163        175837     20%
## 18 E00884        154828     13%
## 19 E04116        186503     24%
# Histogram of the annual salary distribution
hist(
  df2[["Annual.Salary"]],
  main = "Annual Salary",
  xlab = "Salary"
)

# Create data frame for workforce diversity.
# Columns are selected by name rather than by position so the subset stays
# correct (or fails loudly) if the column order of the source CSV changes.
df3 <- df[, c("EEID", "Gender", "Ethnicity", "Age", "Country", "City")]
print(df3)
##      EEID Gender Ethnicity Age       Country      City
## 1  E02387 Female     Black  55 United States   Seattle
## 2  E04105   Male     Asian  59         China Chongqing
## 3  E02572 Female Caucasian  50 United States   Chicago
## 4  E02832 Female Caucasian  26 United States   Chicago
## 5  E01639   Male     Asian  55 United States   Phoenix
## 6  E00644   Male     Asian  57         China Chongqing
## 7  E01550 Female Caucasian  27 United States   Phoenix
## 8  E04332   Male     Black  25 United States     Miami
## 9  E04533   Male Caucasian  29 United States    Austin
## 10 E03838 Female Caucasian  34 United States   Chicago
## 11 E00591 Female     Asian  36 United States     Miami
## 12 E03344 Female Caucasian  27 United States   Seattle
## 13 E00530   Male Caucasian  59 United States    Austin
## 14 E04239 Female     Asian  51         China  Shanghai
## 15 E03496   Male     Asian  31 United States    Austin
## 16 E00549 Female     Asian  41 United States   Seattle
## 17 E00163 Female     Black  65 United States   Phoenix
## 18 E00884 Female    Latino  64 United States   Seattle
## 19 E04116   Male Caucasian  64 United States  Columbus
# Histogram of the employee age distribution
hist(
  df3[["Age"]],
  main = "Age",
  xlab = "Age"
)

Character Manipulation

2. Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors that contain either “DATA” or “STATISTICS”
library(stringr)

# Read the FiveThirtyEight list of college majors and inspect its structure
majors <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv",
  header = TRUE
)
glimpse(majors)
## Rows: 174
## Columns: 3
## $ FOD1P          <chr> "1100", "1101", "1102", "1103", "1104", "1105", "1106",…
## $ Major          <chr> "GENERAL AGRICULTURE", "AGRICULTURE PRODUCTION AND MANA…
## $ Major_Category <chr> "Agriculture & Natural Resources", "Agriculture & Natur…
# Keep only the rows whose major name contains "DATA" or "STATISTICS"
majors[str_detect(majors[["Major"]], pattern = "DATA|STATISTICS"), ]
##    FOD1P                                         Major          Major_Category
## 44  6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS                Business
## 52  2101      COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 59  3702               STATISTICS AND DECISION SCIENCE Computers & Mathematics
The majors that contain either “DATA” or “STATISTICS” are “COMPUTER PROGRAMMING AND DATA PROCESSING”, “MANAGEMENT INFORMATION SYSTEMS AND STATISTICS”, and “STATISTICS AND DECISION SCIENCE”.

3. Describe, in words, what these expressions will match:

(.)\1\1

The word contains the same character three times in a row. For example: AAA.

“(.)(.)\\2\\1”

Two characters immediately followed by the same two characters in reverse order (a four-character palindrome). The match is case-sensitive. For example: abba.

(..)\1

A pair of characters that is immediately repeated. The match is case-sensitive. For example: haha.

“(.).\\1.\\1”

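The same character appears three times, with exactly one arbitrary character between each occurrence. The pattern is not anchored, so this can occur anywhere in the string, and the match is case-sensitive. For example: abaca.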

“(.)(.)(.).*\\3\\2\\1”

Three characters, followed by zero or more arbitrary characters, followed by the same three characters in reverse order. The pattern is not anchored, so the match can occur anywhere in the string. For example: qwerewq.

4. Construct regular expressions to match words that:

Start and end with the same character.
# ^(.) captures the first character; the word must then either end with that
# same character (.*\\1$) or consist of just that character, optionally
# doubled (\\1?$).
str_subset(string = words, pattern = "^(.)((.*\\1$)|\\1?$)")
##  [1] "a"          "america"    "area"       "dad"        "dead"      
##  [6] "depend"     "educate"    "else"       "encourage"  "engine"    
## [11] "europe"     "evidence"   "example"    "excuse"     "exercise"  
## [16] "expense"    "experience" "eye"        "health"     "high"      
## [21] "knock"      "level"      "local"      "nation"     "non"       
## [26] "rather"     "refer"      "remember"   "serious"    "stairs"    
## [31] "test"       "tonight"    "transport"  "treat"      "trust"     
## [36] "window"     "yesterday"
Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)
# Search the whole `words` corpus, as the neighbouring answers do, instead of
# only the single example "church". A captured two-letter group followed later
# by a backreference to it finds any repeated pair of letters.
# Re-knit the document to refresh the printed result.
str_subset(words, "([A-Za-z][A-Za-z]).*\\1")
## [1] "church"
Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)
# A captured lowercase letter must reappear at least twice more, with any
# (possibly empty) run of characters between the occurrences.
str_subset(string = words, pattern = "([a-z]).*\\1.*\\1")
##  [1] "appropriate" "available"   "believe"     "between"     "business"   
##  [6] "degree"      "difference"  "discuss"     "eleven"      "environment"
## [11] "evidence"    "exercise"    "expense"     "experience"  "individual" 
## [16] "paragraph"   "receive"     "remember"    "represent"   "telephone"  
## [21] "therefore"   "tomorrow"