library(tidyverse, quietly=TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Load the FiveThirtyEight list of college majors.
major_data <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv")
# Preview the data frame that was just loaded. The bare name `data` that was
# evaluated here before did not refer to any loaded object yet, so R resolved
# it to the utils::data() function and printed that function's source code
# instead of the majors list.
head(major_data)
# Load the all-ages college-majors table from the FiveThirtyEight repository.
# NOTE(review): the object name `data` is shared with the utils::data()
# function; a more specific name would be easier to read.
data <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/all-ages.csv"
)
# Min-max scaling of a numeric vector.
#
# x: numeric vector; missing values are ignored when the minimum and maximum
#    are computed, and stay NA in the result.
# Returns a vector of the same length as x in which the smallest value maps
# to 0 and the largest to 1. If all non-missing values are equal the result
# is NaN (0 / 0).
normalize_fun <- function(x) {
  lowest <- min(x, na.rm = TRUE)
  highest <- max(x, na.rm = TRUE)
  (x - lowest) / (highest - lowest)
}
# Rescale the three head-count columns to the [0, 1] range.
# The columns are selected by name rather than by position so the code keeps
# working if the column order of the CSV changes, and lapply() is used
# instead of apply() so each column is processed as its own vector without a
# detour through a matrix.
normalize_data <- as.data.frame(
  lapply(
    data[c("Total", "Employed", "Employed_full_time_year_round")],
    normalize_fun
  )
)
normalize_data
## Total Employed Employed_full_time_year_round
## 1 0.0402907423 0.0377205889 0.0376543047
## 2 0.0297746253 0.0320340039 0.0325786995
## 3 0.0101114538 0.0105524828 0.0112042000
## 4 0.0324092616 0.0338666313 0.0329382946
## 5 0.0070115991 0.0067104253 0.0059996151
## 6 0.0246748437 0.0261595661 0.0257876655
## 7 0.0013424694 0.0014594718 0.0015214434
## 8 0.0019714115 0.0020825311 0.0020538712
## 9 0.0332285203 0.0365972971 0.0330935861
## 10 0.0214830346 0.0198630970 0.0198731769
## 11 0.0258856293 0.0273895345 0.0255389929
## 12 0.0936511771 0.0914945178 0.0835411195
## 13 0.0324704577 0.0315805221 0.0255054582
## 14 0.3156821571 0.3354167145 0.3067888155
## 15 0.1331921871 0.1330040384 0.1208869050
## 16 0.0669046373 0.0718184237 0.0641781858
## 17 0.0590920421 0.0620258523 0.0569878310
## 18 0.0191422037 0.0204500307 0.0186597369
## 19 0.0805436777 0.0921226772 0.0974348021
## 20 0.0086254459 0.0090679356 0.0091080235
## 21 0.2501978460 0.2783281610 0.2888931538
## 22 0.0241609246 0.0275833374 0.0291550650
## 23 0.0118438481 0.0131216462 0.0139622998
## 24 0.0158196721 0.0180963455 0.0179854315
## 25 0.0127931886 0.0135560027 0.0127364776
## 26 0.4602430414 0.3579407762 0.3047891158
## 27 0.0005257738 0.0006889353 0.0007093878
## 28 0.0000000000 0.0000000000 0.0000000000
## 29 0.4627530427 0.3476131218 0.2583167337
## 30 0.0212783000 0.0194274654 0.0146525986
## 31 0.0894760653 0.0816224703 0.0697779642
## 32 0.0495601891 0.0475871114 0.0361349250
## 33 0.0173274671 0.0147613207 0.0122396482
## 34 0.0710855163 0.0543982633 0.0453100179
## 35 0.0471924447 0.0453821785 0.0363835977
## 36 0.0399299737 0.0328500161 0.0260740002
## 37 0.0274488532 0.0243923897 0.0189852814
## 38 0.0573670170 0.0466890730 0.0343384972
## 39 0.0735202239 0.0653094514 0.0483224655
## 40 0.0714991506 0.0529396414 0.0465508017
## 41 0.1604183634 0.1520162726 0.1604145095
## 42 0.0202933952 0.0184673761 0.0192943165
## 43 0.0097247329 0.0096807947 0.0090430178
## 44 0.0055079693 0.0051940027 0.0052040689
## 45 0.0051106752 0.0048382723 0.0041835823
## 46 0.0594819670 0.0553379523 0.0558806701
## 47 0.1141249567 0.1110707355 0.1132105551
## 48 0.0486249461 0.0540820585 0.0567159420
## 49 0.2144269642 0.2076041287 0.2173172140
## 50 0.0058267657 0.0057023102 0.0057597131
## 51 0.0034026312 0.0035517781 0.0036171039
## 52 0.0012393011 0.0011169167 0.0011644278
## 53 0.0435645734 0.0424075590 0.0432963884
## 54 0.0060984636 0.0056079588 0.0055605686
## 55 0.1855532992 0.1788065482 0.1862259073
## 56 0.0033391924 0.0023150096 0.0022540475
## 57 0.0026753268 0.0025177376 0.0027477814
## 58 0.0043888176 0.0039092084 0.0041959644
## 59 0.0023805603 0.0024769370 0.0027761569
## 60 0.0055220668 0.0053168295 0.0054393277
## 61 0.0174969578 0.0180262195 0.0186251703
## 62 0.0112094592 0.0121594318 0.0126699242
## 63 0.0143224503 0.0109579388 0.0108399616
## 64 0.0295730947 0.0307045840 0.0325358783
## 65 0.0255504925 0.0271617311 0.0289806845
## 66 0.0086353783 0.0096467942 0.0104112334
## 67 0.0198006225 0.0219324529 0.0232627609
## 68 0.0235156422 0.0187704056 0.0151344664
## 69 0.0749559292 0.0646698168 0.0507488298
## 70 0.0177491114 0.0141119110 0.0113259567
## 71 0.1280446661 0.1020410505 0.0789736938
## 72 0.0022216427 0.0024556867 0.0026053879
## 73 0.0207108744 0.0203012785 0.0186561254
## 74 0.3512370903 0.3006452446 0.2482269174
## 75 0.0182034363 0.0184542009 0.0147217317
## 76 0.1918625850 0.1714645634 0.1525565563
## 77 0.0140308877 0.0121037560 0.0094758733
## 78 0.0044205370 0.0023796106 0.0016700279
## 79 0.2681920622 0.2471781703 0.2175602115
## 80 0.0233653753 0.0217186747 0.0185782217
## 81 0.0037611571 0.0033116495 0.0027034124
## 82 0.0082665997 0.0079599440 0.0063318666
## 83 0.0137681610 0.0149670238 0.0126833381
## 84 0.0012707001 0.0013833957 0.0012407838
## 85 0.0213029707 0.0186705291 0.0169721677
## 86 0.0008391235 0.0008453376 0.0007666547
## 87 0.0133247296 0.0127085400 0.0098612644
## 88 0.0169807960 0.0145445674 0.0129283993
## 89 0.0036140942 0.0031854226 0.0022457928
## 90 0.0086485146 0.0088426822 0.0079528822
## 91 0.1379026847 0.1187510253 0.1076953873
## 92 0.0053557800 0.0057987867 0.0056833571
## 93 0.0071801286 0.0073594100 0.0069004087
## 94 0.0006148446 0.0000671510 0.0003172898
## 95 0.0137140137 0.0145411674 0.0128695846
## 96 0.0173604681 0.0176896145 0.0136233414
## 97 0.0199089171 0.0180143193 0.0139395994
## 98 0.0015340676 0.0018623778 0.0020358140
## 99 0.0014424337 0.0017149006 0.0013135283
## 100 0.0190556961 0.0177737657 0.0154698134
## 101 0.1115028160 0.1212079871 0.1048083079
## 102 0.0651584659 0.0583287220 0.0486696786
## 103 0.0738419039 0.0694184128 0.0621748747
## 104 0.0020697738 0.0018615278 0.0019955724
## 105 0.0007381980 0.0008109121 0.0006985535
## 106 0.0037342436 0.0041480620 0.0041299268
## 107 0.0979349040 0.0835490241 0.0785305199
## 108 0.0338039559 0.0315380215 0.0300104577
## 109 0.0018810591 0.0019707545 0.0020260116
## 110 0.0026737248 0.0027157906 0.0027936982
## 111 0.0385195799 0.0337051289 0.0308101312
## 112 0.0015417572 0.0018589778 0.0017603136
## 113 0.1363477912 0.1304637754 0.1227060333
## 114 0.0031302926 0.0034289513 0.0031703186
## 115 0.4747276133 0.4481105493 0.3795735522
## 116 0.0037310396 0.0030851211 0.0024093389
## 117 0.0016795285 0.0015453231 0.0011370842
## 118 0.0048819108 0.0049211486 0.0040215840
## 119 0.0049895646 0.0044141160 0.0038889929
## 120 0.0027153766 0.0022971593 0.0021322908
## 121 0.0101585524 0.0095324675 0.0075298291
## 122 0.2418191069 0.2600516128 0.2664749514
## 123 0.0167376136 0.0154647062 0.0155647423
## 124 0.0039684549 0.0041034363 0.0036645684
## 125 0.0254364307 0.0254621307 0.0226312767
## 126 0.1014916469 0.0950267456 0.0840570379
## 127 0.0400392296 0.0334365249 0.0288620233
## 128 0.2419712961 0.2269338427 0.2215802478
## 129 0.0450771744 0.0428861161 0.0335104481
## 130 0.0232894409 0.0246682188 0.0245938303
## 131 0.0362136724 0.0349265972 0.0319750750
## 132 0.0240218717 0.0234059499 0.0211516227
## 133 0.2391966458 0.2295620820 0.2170303633
## 134 0.2153596440 0.1945177580 0.1730503830
## 135 0.0043208931 0.0045964437 0.0043084346
## 136 0.0288198380 0.0329647678 0.0334433787
## 137 0.0042709110 0.0047239456 0.0048289963
## 138 0.0398072611 0.0413624684 0.0425250904
## 139 0.1824877271 0.1638267742 0.1318966038
## 140 0.0552434163 0.0567719237 0.0414932536
## 141 0.0877462342 0.0812663149 0.0593558965
## 142 0.0168994148 0.0168328017 0.0115493494
## 143 0.1609236318 0.1608598049 0.1370165780
## 144 0.0420080779 0.0451182495 0.0351907944
## 145 0.0283411628 0.0254166550 0.0195512439
## 146 0.0251871607 0.0243558391 0.0184956748
## 147 0.0019592363 0.0020991064 0.0013976230
## 148 0.0327190868 0.0326005374 0.0271646517
## 149 0.0232548379 0.0203582294 0.0128938328
## 150 0.0339987581 0.0356444329 0.0341543143
## 151 0.0198390703 0.0211597913 0.0123949397
## 152 0.0520948610 0.0509952374 0.0469666319
## 153 0.0096497597 0.0074448363 0.0062188804
## 154 0.5663029290 0.5628014889 0.4882925216
## 155 0.0569309548 0.0520913288 0.0454735641
## 156 0.0800169427 0.0840161060 0.0655329876
## 157 0.0174120522 0.0174469358 0.0143523341
## 158 0.0241103016 0.0217254748 0.0178420062
## 159 0.6876762592 0.6712915858 0.6725269838
## 160 0.5692912851 0.5671000031 0.5643806838
## 161 0.0023603752 0.0027004904 0.0029856198
## 162 1.0000000000 1.0000000000 1.0000000000
## 163 0.0175591151 0.0194861163 0.0206424113
## 164 0.0234374650 0.0240090339 0.0244431822
## 165 0.3563560959 0.3776746712 0.3631131755
## 166 0.2608530159 0.2844095769 0.2889039881
## 167 0.0592346194 0.0600903734 0.0595230541
## 168 0.0268070952 0.0276088378 0.0257541308
## 169 0.0635856300 0.0688089537 0.0626355898
## 170 0.0494301073 0.0565198950 0.0604429366
## 171 0.0321542244 0.0322915578 0.0312182227
## 172 0.2275190845 0.2026957303 0.1821553110
## 173 0.0049181158 0.0044179410 0.0036686958
# Upper-case the major names, then keep those whose name contains
# "DATA" or "STATISTICS".
major_data[["Major"]] <- toupper(major_data[["Major"]])
data_mjrs <- major_data[["Major"]] %>%
  str_subset(pattern = "(DATA)|(STATISTICS)")
data_mjrs
## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "COMPUTER PROGRAMMING AND DATA PROCESSING"
## [3] "STATISTICS AND DECISION SCIENCE"
# Same search on the all-ages table: upper-case the major names, then keep
# those whose name contains "DATA" or "STATISTICS".
data[["Major"]] <- toupper(data[["Major"]])
data_mjrs <- data[["Major"]] %>%
  str_subset(pattern = "(DATA)|(STATISTICS)")
data_mjrs
## [1] "COMPUTER PROGRAMMING AND DATA PROCESSING"
## [2] "STATISTICS AND DECISION SCIENCE"
## [3] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
“(.)\1\1” = It matches any single character repeated three times in a row (e.g. “aaa”).
library("stringr")
# The test string contains a run of five "a"s, so the pattern finds one
# non-overlapping triple.
testData <- "fdraaaaaoijh"
str_extract_all(string = testData, pattern = regex("(.)\\1\\1"))
## [[1]]
## [1] "aaa"
“(.)(.)\2\1” = It matches a pair of characters immediately followed by the same pair in reversed order (e.g. “abba”).
# Example string containing "abba": a pair of characters followed by the same
# pair reversed.
Search_in <- c("cdsabbavdf")
# Search the string defined above. The call previously searched `testData`
# from the preceding example, so `Search_in` was never used and the reported
# match came from the wrong input.
str_extract_all(Search_in, regex("(.)(.)\\2\\1"))
## [[1]]
## [1] "abba"
# "(..)\\1" matches any two characters immediately followed by the same two
# characters, such as "abab".
Search_in <- "scfababthlb"
str_extract_all(string = Search_in, pattern = regex("(..)\\1"))
## [[1]]
## [1] "abab"
“(.).\1.\1” = It matches a single character that occurs three times, with exactly one arbitrary character between consecutive occurrences (e.g. “babab” or “abaca”).
# The example string contains "babab": "b" three times, each separated by one
# other character.
Data <- "scfacbababthlb"
str_extract_all(string = Data, pattern = regex("(.).\\1.\\1"))
## [[1]]
## [1] "babab"
“(.)(.)(.).*\3\2\1” = It matches any three characters, followed by zero or more arbitrary characters, followed by the same three characters in reverse order (e.g. “abc…cba”).
# The example string starts with "app" and ends with "ppa", so the whole
# string is matched.
Data <- "applebccfdscbaelppa"
str_extract_all(string = Data, pattern = regex("(.)(.)(.).*\\3\\2\\1"))
## [[1]]
## [1] "applebccfdscbaelppa"
Start and end with the same character.
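Anchor the pattern with ^ at the start and $ at the end, capture the first character in a group with (.), and use the backreference \1 to require that same character at the end of the string.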
# Words that start and end with the same character.
# The group "(.*\\1)?" is optional so that a one-character word, which
# trivially starts and ends with the same character, is matched as well.
str_view("churchc", "^(.)(.*\\1)?$")
## [1] │ <churchc>
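Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)
Capture two consecutive letters with ([A-Za-z][A-Za-z]) and use the backreference \1 to require the same pair again later in the word.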
# Two captured letters, any characters, then the same two letters again.
str_view(string = "church", pattern = "([A-Za-z][A-Za-z]).*\\1")
## [1] │ <church>
Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)
# One captured letter that appears at least two more times later in the word.
# The pattern must not end in an extra ".": that would demand a further
# character after the third occurrence and miss words such as "banana",
# where the third occurrence is the last letter.
str_view("eleven", "([A-Za-z]).*\\1.*\\1")
## [1] │ <eleve>n
You can also embed plots, for example:
Note that the echo = FALSE
parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.