Week 3 assignment

1. Reading the data from GitHub into RStudio

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)
# Download the all-ages majors CSV from GitHub into a base data.frame,
# then print a per-column summary of it.
Majors <- read.csv(
  file = "https://raw.githubusercontent.com/SalouaDaouki/Data607/main/all-ages.csv"
)
summary(object = Majors)

   Major_code      Major           Major_category         Total        
 Min.   :1100   Length:173         Length:173         Min.   :   2396  
 1st Qu.:2403   Class :character   Class :character   1st Qu.:  24280  
 Median :3608   Mode  :character   Mode  :character   Median :  75791  
 Mean   :3880                                         Mean   : 230257  
 3rd Qu.:5503                                         3rd Qu.: 205763  
 Max.   :6403                                         Max.   :3123510  
    Employed       Employed_full_time_year_round   Unemployed    
 Min.   :   1492   Min.   :   1093               Min.   :     0  
 1st Qu.:  17281   1st Qu.:  12722               1st Qu.:  1101  
 Median :  56564   Median :  39613               Median :  3619  
 Mean   : 166162   Mean   : 126308               Mean   :  9725  
 3rd Qu.: 142879   3rd Qu.: 111025               3rd Qu.:  8862  
 Max.   :2354398   Max.   :1939384               Max.   :147261  
 Unemployment_rate     Median           P25th           P75th       
 Min.   :0.00000   Min.   : 35000   Min.   :24900   Min.   : 45800  
 1st Qu.:0.04626   1st Qu.: 46000   1st Qu.:32000   1st Qu.: 70000  
 Median :0.05472   Median : 53000   Median :36000   Median : 80000  
 Mean   :0.05736   Mean   : 56816   Mean   :38697   Mean   : 82506  
 3rd Qu.:0.06904   3rd Qu.: 65000   3rd Qu.:42000   3rd Qu.: 95000  
 Max.   :0.15615   Max.   :125000   Max.   :78000   Max.   :210000

# Open Majors in the data viewer for a visual check of the rows and columns.
tibble::view(x = Majors)

2. Getting the majors that contain “DATA” and “STATISTICS”

# Keep only the major names that contain "DATA" or "STATISTICS"
# (case-sensitive regex alternation), then display them.
data_stat_mjrs <- str_subset(
  string = Majors[["Major"]],
  pattern = "DATA|STATISTICS"
)
print(data_stat_mjrs)

[1] "COMPUTER PROGRAMMING AND DATA PROCESSING"     
[2] "STATISTICS AND DECISION SCIENCE"              
[3] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"

# Base-R version of the same filter: value = TRUE returns the matching
# major names themselves, and ignore.case = TRUE makes the match
# case-insensitive.
data_stat <- grep(
  pattern = "DATA|STATISTICS",
  x = Majors[["Major"]],
  ignore.case = TRUE,
  value = TRUE
)
print(data_stat)

[1] "COMPUTER PROGRAMMING AND DATA PROCESSING"     
[2] "STATISTICS AND DECISION SCIENCE"              
[3] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"

3. Transforming the data below:

[1] “bell pepper” “bilberry” “blackberry” “blood orange”
[5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”
[9] “elderberry” “lime” “lychee” “mulberry”
[13] “olive” “salal berry”

# Enter the names from the prompt as a character vector, one element per
# line, and print the result.
Food <- c(
  "bell pepper",
  "bilberry",
  "blackberry",
  "blood orange",
  "blueberry",
  "cantaloupe",
  "chili pepper",
  "cloudberry",
  "elderberry",
  "lime",
  "lychee",
  "mulberry",
  "olive",
  "salal berry"
)
print(Food)

 [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
 [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
[11] "lychee"       "mulberry"     "olive"        "salal berry"

# Flatten Food with unlist(); the printed result is the same 14 strings,
# because Food is already a plain character vector.
unlist(x = Food, use.names = TRUE, recursive = TRUE)

 [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
 [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
[11] "lychee"       "mulberry"     "olive"        "salal berry"

# Pass Food through purrr's as_vector(), store the result as Food_list,
# and print it.
Food_list <- purrr::as_vector(.x = Food)
print(Food_list)

 [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
 [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
[11] "lychee"       "mulberry"     "olive"        "salal berry"

4. Explaining the expressions:

library (stringr)
# Sample strings used to try out each back-reference pattern.
# NOTE: the variable name `list` is kept because later chunks refer to it;
# calls to base::list() still work, since R skips non-function objects
# when looking up a function.
list <- c("banana", "555555", "AA1ab", "aaapple", "church", "aaaa", "cooc",
          "abba", "1212")

# In an R string literal "\1" is the octal escape for the control character
# U+0001, not a regex back-reference, so the backslash has to be doubled.
# "(.)\\1\\1": any character followed by two more copies of itself,
# i.e. the same character three times in a row.
str_view(list, "(.)\\1\\1")

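“(.)\1\1” matches any single character followed by two more copies of itself, i.e. the same character three times in a row (for example “555” in “555555” or “aaa” in “aaapple”). Note that inside an R string the backslashes must be doubled: "(.)\\1\\1".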

# Back-references need a doubled backslash inside an R string; a single
# "\2" / "\1" is read as an octal character escape instead.
# "(.)(.)\\2\\1": two characters followed by the same two in reverse
# order, e.g. "abba" or "cooc".
str_view(list, "(.)(.)\\2\\1")

“(.)(.)\2\1” matches a pair of characters followed immediately by the same pair in reverse order, i.e. a four-character palindrome such as “abba” or “cooc”.

# Back-references need a doubled backslash inside an R string; a single
# "\1" is read as an octal character escape instead.
# "(..)\\1": any two characters immediately followed by the same two
# characters, e.g. "anan" in "banana" or "1212".
str_view(list, "(..)\\1")

“(..)\1” matches any two characters that are immediately repeated (for example “anan” in “banana” or “1212”); the two characters do not need to be the same letter.

# Back-references need a doubled backslash inside an R string; a single
# "\3" / "\2" / "\1" is read as an octal character escape instead.
# "(.)(.)(.).*\\3\\2\\1": three characters, then any run of characters
# (possibly empty), then the same three characters in reverse order.
str_view(list, "(.)(.)(.).*\\3\\2\\1")

“(.)(.)(.).*\3\2\1” matches three characters, followed by zero or more of any characters, followed by the same three characters in reverse order (for example “abccba” or “abc123cba”).

# Highlight every place where a character is immediately followed by a
# copy of itself.
str_view(string = list, pattern = "(.)\\1")

[2] │ <55><55><55>
[3] │ <AA>1ab
[4] │ <aa>a<pp>le
[6] │ <aa><aa>
[7] │ c<oo>c
[8] │ a<bb>a

4. strings

Strings that start and end with the same two characters:

# Whole-string match: the first two characters are captured and must
# appear again as the last two characters.
str_view(string = list, pattern = "^(..).*\\1$")

[2] │ <555555>
[5] │ <church>
[6] │ <aaaa>
[9] │ <1212>

Strings that contain a pair of characters repeated back-to-back:

# Highlight any two characters that are immediately followed by the same
# two characters.
str_view(string = list, pattern = "(..)\\1")

[1] │ b<anan>a
[2] │ <5555>55
[6] │ <aaaa>
[9] │ <1212>

One letter repeated three times, with exactly one character between each occurrence:

# Back-references need a doubled backslash inside an R string; a single
# "\1" is read as an octal character escape, so the original pattern could
# never refer back to the captured letter.
# ".([A-Za-z]).\\1.\\1.": a captured letter that reappears twice more, with
# exactly one arbitrary character before, between, and after each
# occurrence.
# NOTE(review): if the goal is "a letter appearing in at least three places
# anywhere in the string", the usual pattern is "([A-Za-z]).*\\1.*\\1".
str_view(list, ".([A-Za-z]).\\1.\\1.")