# Import data: read the employee sample CSV directly from GitHub.
df <- read.csv(
  file = "https://raw.githubusercontent.com/yli1048/yli1048/refs/heads/607/Employee%20Sample%20Data.csv",
  header = TRUE
)
# Preview each column with its type and first few values.
# NOTE(review): glimpse() is not base R; presumably dplyr (or tibble) is
# attached before this point -- confirm.
glimpse(df)
## Rows: 19
## Columns: 14
## $ EEID <chr> "E02387", "E04105", "E02572", "E02832", "E01639", "E0064…
## $ Full.Name <chr> "Emily Davis", "Theodore Dinh", "Luna Sanders", "Penelop…
## $ Job.Title <chr> "Sr. Manger", "Technical Architect", "Director", "Comput…
## $ Department <chr> "IT", "IT", "Finance", "IT", "Finance", "Sales", "IT", "…
## $ Business.Unit <chr> "Research & Development", "Manufacturing", "Speciality P…
## $ Gender <chr> "Female", "Male", "Female", "Female", "Male", "Male", "F…
## $ Ethnicity <chr> "Black", "Asian", "Caucasian", "Caucasian", "Asian", "As…
## $ Age <int> 55, 59, 50, 26, 55, 57, 27, 25, 29, 34, 36, 27, 59, 51, …
## $ Hire.Date <chr> "4/8/2016", "11/29/1997", "10/26/2006", "9/27/2019", "11…
## $ Annual.Salary <int> 141604, 99975, 163099, 84913, 95409, 50994, 119746, 4133…
## $ Bonus.. <chr> "15%", "0%", "20%", "7%", "0%", "0%", "10%", "0%", "6%",…
## $ Country <chr> "United States", "China", "United States", "United State…
## $ City <chr> "Seattle", "Chongqing", "Chicago", "Chicago", "Phoenix",…
## $ Exit.Date <chr> "10/16/2021", "", "", "", "", "", "", "5/20/2021", "", "…
# Create a data frame of employee information: ID, name, job title,
# department, business unit, hire date and exit date.
# Columns are selected by name instead of by position so the subset stays
# correct (or fails loudly) if the column order of the source CSV changes.
df1 <- df[, c(
  "EEID", "Full.Name", "Job.Title", "Department", "Business.Unit",
  "Hire.Date", "Exit.Date"
)]
print(df1)
## EEID Full.Name Job.Title Department
## 1 E02387 Emily Davis Sr. Manger IT
## 2 E04105 Theodore Dinh Technical Architect IT
## 3 E02572 Luna Sanders Director Finance
## 4 E02832 Penelope Jordan Computer Systems Manager IT
## 5 E01639 Austin Vo Sr. Analyst Finance
## 6 E00644 Joshua Gupta Account Representative Sales
## 7 E01550 Ruby Barnes Manager IT
## 8 E04332 Luke Martin Analyst Finance
## 9 E04533 Easton Bailey Manager Accounting
## 10 E03838 Madeline Walker Sr. Analyst Finance
## 11 E00591 Savannah Ali Sr. Manger Human Resources
## 12 E03344 Camila Rogers Controls Engineer Engineering
## 13 E00530 Eli Jones Manager Human Resources
## 14 E04239 Everleigh Ng Sr. Manger Finance
## 15 E03496 Robert Yang Sr. Analyst Accounting
## 16 E00549 Isabella Xi Vice President Marketing
## 17 E00163 Bella Powell Director Finance
## 18 E00884 Camila Silva Sr. Manger Marketing
## 19 E04116 David Barnes Director IT
## Business.Unit Hire.Date Exit.Date
## 1 Research & Development 4/8/2016 10/16/2021
## 2 Manufacturing 11/29/1997
## 3 Speciality Products 10/26/2006
## 4 Manufacturing 9/27/2019
## 5 Manufacturing 11/20/1995
## 6 Corporate 1/24/2017
## 7 Corporate 7/1/2020
## 8 Manufacturing 5/16/2020 5/20/2021
## 9 Manufacturing 1/25/2019
## 10 Speciality Products 6/13/2018
## 11 Manufacturing 2/11/2009
## 12 Speciality Products 10/21/2021
## 13 Manufacturing 3/14/1999
## 14 Research & Development 6/10/2021
## 15 Speciality Products 11/4/2017 3/9/2020
## 16 Research & Development 3/13/2013
## 17 Research & Development 3/4/2002
## 18 Speciality Products 12/1/2003
## 19 Corporate 11/3/2013
# Create a data frame of each employee's salary: ID, annual salary and bonus.
# Columns are selected by name instead of by position so the subset stays
# correct (or fails loudly) if the column order of the source CSV changes.
# "Bonus.." is the name read.csv produced for the bonus column, as shown in
# the glimpse() output of df.
df2 <- df[, c("EEID", "Annual.Salary", "Bonus..")]
print(df2)
## EEID Annual.Salary Bonus..
## 1 E02387 141604 15%
## 2 E04105 99975 0%
## 3 E02572 163099 20%
## 4 E02832 84913 7%
## 5 E01639 95409 0%
## 6 E00644 50994 0%
## 7 E01550 119746 10%
## 8 E04332 41336 0%
## 9 E04533 113527 6%
## 10 E03838 77203 0%
## 11 E00591 157333 15%
## 12 E03344 109851 0%
## 13 E00530 105086 9%
## 14 E04239 146742 10%
## 15 E03496 97078 0%
## 16 E00549 249270 30%
## 17 E00163 175837 20%
## 18 E00884 154828 13%
## 19 E04116 186503 24%
# Histogram showing how annual salaries are distributed
hist(
  x = df2[["Annual.Salary"]],
  xlab = "Salary",
  main = "Annual Salary"
)
# Create a data frame for workforce diversity: ID, gender, ethnicity, age,
# country and city.
# Columns are selected by name instead of by position so the subset stays
# correct (or fails loudly) if the column order of the source CSV changes.
df3 <- df[, c("EEID", "Gender", "Ethnicity", "Age", "Country", "City")]
print(df3)
## EEID Gender Ethnicity Age Country City
## 1 E02387 Female Black 55 United States Seattle
## 2 E04105 Male Asian 59 China Chongqing
## 3 E02572 Female Caucasian 50 United States Chicago
## 4 E02832 Female Caucasian 26 United States Chicago
## 5 E01639 Male Asian 55 United States Phoenix
## 6 E00644 Male Asian 57 China Chongqing
## 7 E01550 Female Caucasian 27 United States Phoenix
## 8 E04332 Male Black 25 United States Miami
## 9 E04533 Male Caucasian 29 United States Austin
## 10 E03838 Female Caucasian 34 United States Chicago
## 11 E00591 Female Asian 36 United States Miami
## 12 E03344 Female Caucasian 27 United States Seattle
## 13 E00530 Male Caucasian 59 United States Austin
## 14 E04239 Female Asian 51 China Shanghai
## 15 E03496 Male Asian 31 United States Austin
## 16 E00549 Female Asian 41 United States Seattle
## 17 E00163 Female Black 65 United States Phoenix
## 18 E00884 Female Latino 64 United States Seattle
## 19 E04116 Male Caucasian 64 United States Columbus
# Histogram showing how employee ages are distributed
hist(
  x = df3[["Age"]],
  xlab = "Age",
  main = "Age"
)
library(stringr)
# Read the college majors list from the fivethirtyeight GitHub data repository
majors <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv",
  header = TRUE
)
# Preview each column with its type and first few values
glimpse(majors)
## Rows: 174
## Columns: 3
## $ FOD1P <chr> "1100", "1101", "1102", "1103", "1104", "1105", "1106",…
## $ Major <chr> "GENERAL AGRICULTURE", "AGRICULTURE PRODUCTION AND MANA…
## $ Major_Category <chr> "Agriculture & Natural Resources", "Agriculture & Natur…
# Keep only the rows whose Major name contains "DATA" or "STATISTICS"
# (the regex alternation matches either word anywhere in the name).
majors[str_detect(string = majors[["Major"]], pattern = "DATA|STATISTICS"), ]
## FOD1P Major Major_Category
## 44 6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS Business
## 52 2101 COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 59 3702 STATISTICS AND DECISION SCIENCE Computers & Mathematics
The word contains the same character three times in a row. For example: AAA.
Two consecutive characters are repeated in reverse order. For example: alla.
Two consecutive characters are repeated. For example: haha.
The word has the same character in the first, third, and fifth positions. For example: abaca.
The first three characters are repeated in reverse order at the end, separated by any number of characters. For example: qwerewq.
# Words that start and end with the same character. The second alternative
# (`\\1?$`) also accepts one-character words such as "a".
# `words` is presumably the word list shipped with stringr -- confirm.
str_subset(string = words, pattern = "^(.)((.*\\1$)|\\1?$)")
## [1] "a" "america" "area" "dad" "dead"
## [6] "depend" "educate" "else" "encourage" "engine"
## [11] "europe" "evidence" "example" "excuse" "exercise"
## [16] "expense" "experience" "eye" "health" "high"
## [21] "knock" "level" "local" "nation" "non"
## [26] "rather" "refer" "remember" "serious" "stairs"
## [31] "test" "tonight" "transport" "treat" "trust"
## [36] "window" "yesterday"
# Strings containing a pair of letters that occurs again later in the string
# (in "church" the pair "ch" appears twice).
# NOTE(review): this searches only the literal "church", whereas the
# neighbouring calls search `words` -- confirm that is intentional.
str_subset(string = "church", pattern = "([A-Za-z][A-Za-z]).*\\1")
## [1] "church"
# Words in which one lowercase letter occurs at least three times
# (the captured letter must reappear twice more, with anything in between).
str_subset(string = words, pattern = "([a-z]).*\\1.*\\1")
## [1] "appropriate" "available" "believe" "between" "business"
## [6] "degree" "difference" "discuss" "eleven" "environment"
## [11] "evidence" "exercise" "expense" "experience" "individual"
## [16] "paragraph" "receive" "remember" "represent" "telephone"
## [21] "therefore" "tomorrow"