library(stringr)
library(stringi)
# Source text: names and phone numbers run together with no delimiters.
raw.data <-"555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# Pull out the names: every run of two or more letters, periods, commas or
# spaces. str_extract_all() returns a one-element list for a single input
# string, so that element is taken directly.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Pull out the phone numbers: an optional three-digit area code (optionally
# parenthesised), then three digits and four digits, each group optionally
# preceded by a dash or a space.
phone <- str_extract_all(
  raw.data,
  "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"
)[[1]]
phone
## [1] "555-1239" "(636) 555-0113" "555-6542" "555 8904"
## [5] "636-555-3226" "5553642"
# Combine names and phone numbers (using a different method than the book),
# then give the two columns readable names.
combined.data <- list(name, phone)
combined_df <- setNames(
  as.data.frame(combined.data),
  c("user name", "phone number")
)
combined_df
## user name phone number
## 1 Moe Szyslak 555-1239
## 2 Burns, C. Montgomery (636) 555-0113
## 3 Rev. Timothy Lovejoy 555-6542
## 4 Ned Flanders 555 8904
## 5 Simpson, Homer 636-555-3226
## 6 Dr. Julius Hibbert 5553642
# Put "Last, First" names into "First Last" order.
# Only entries that contain a comma are touched. Whitespace around both parts
# is discarded, so the result has no leading blank (splitting on "," alone
# used to yield " C. Montgomery Burns"). Everything after the first comma is
# kept rather than only the second piece. The operation is vectorised, so an
# empty `name` vector is handled without error.
has_comma <- grepl(",", name, fixed = TRUE)
name[has_comma] <- trimws(sub(
  "^\\s*([^,]*?)\\s*,\\s*(.*?)\\s*$",
  "\\2 \\1",
  name[has_comma],
  perl = TRUE
))
name
## [1] "Moe Szyslak" "C. Montgomery Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Dr. Julius Hibbert"
## [1] 5551239 6365550113 5556542 5558904 6365553226 5553642
# Normalise every phone number to "ddd-dddd" (7 digits) or "ddd-ddd-dddd"
# (10 digits).
#
# Punctuation is stripped first so that the digit counts below are reliable;
# this is a no-op if `phone` already holds digits only.
#
# Each sub() is applied to the WHOLE vector, so every result stays in the
# position of the number it came from. The previous ifelse() passed subsetted
# vectors (phone[nchar(phone) == 7] / phone[nchar(phone) == 10]) as its
# yes/no arguments; those are shorter than the test vector, get recycled, and
# the formatted numbers ended up attached to the wrong people.
#
# The patterns are anchored at both ends, so the 10-digit rule cannot match a
# 7-digit number and vice versa; values of any other length are left as their
# bare digits.
digits <- gsub("[^0-9]", "", phone)
phone <- sub("^(\\d{3})(\\d{3})(\\d{4})$", "\\1-\\2-\\3", digits)
phone <- sub("^(\\d{3})(\\d{4})$", "\\1-\\2", phone)
phone
## [1] "555-1239" "636-555-0113" "555-6542" "555-8904"
## [5] "636-555-3226" "555-3642"
# NOTE(review): the name/phone table below is pasted output of a statement
# that is not visible in this file; its phone column is shown in the corrected
# order - confirm by re-running.
## name phone
## 1 Moe Szyslak 555-1239
## 2 C. Montgomery Burns 636-555-0113
## 3 Rev. Timothy Lovejoy 555-6542
## 4 Ned Flanders 555-8904
## 5 Homer Simpson 636-555-3226
## 6 Dr. Julius Hibbert 555-3642
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
# Construct a vector of titles, one entry per name.
#
# A title is two or more letters followed by a literal period at the very
# start of the name (e.g. "Rev.", "Dr."); a single-letter initial such as "C."
# does not qualify. The title actually found is stored, instead of labelling
# every title that is not "Rev." as "Dr.". Names without a title get the
# string "NA", the same placeholder this script used before.
#
# The result is computed straight from `name`, so it does not rely on
# `title_present`, which no visible statement in this file defines.
title_pattern <- "^\\s*[[:alpha:]]{2,}\\."
has_title <- grepl(title_pattern, name)
title <- rep("NA", length(name))
# regmatches() returns only the successful matches, in order, which lines up
# with the TRUE positions of has_title.
title[has_title] <- trimws(regmatches(name, regexpr(title_pattern, name)))
title
## [1] "NA" "NA" "Rev." "NA" "NA" "Dr."
# Clean up the name column by removing a leading title.
# The period is escaped and the pattern is anchored at the start of the
# string: an unescaped "Rev." / "Dr." matches ANY character after the letters
# anywhere in the name (e.g. it would turn "Andrew" into "Anw"). The
# whitespace following the title is removed as well, so the remaining name has
# no leading blank. Single-letter initials ("C.") are not treated as titles.
name <- sub("^\\s*[[:alpha:]]{2,}\\.\\s*", "", name)
# Reconstruct the combined data frame with the new title column.
combined_df <- data.frame(title, name, phone)
combined_df
## title name phone
## 1 NA Moe Szyslak 555-1239
## 2 NA C. Montgomery Burns 636-555-3226
## 3 Rev. Timothy Lovejoy 555-8904
## 4 NA Ned Flanders 555-3642
## 5 NA Homer Simpson 636-555-0113
## 6 Dr. Julius Hibbert 555-6542
# Regular expression for a literal ".".
dot <- "\\."
# Flag names that still contain a period. Titles have been stripped from
# `name` by this point, so a remaining period marks an abbreviated
# (initial-style) name part.
#
# The flag is "yes" when a period is present and "n/a" otherwise. The previous
# version compared the extracted match (".") with the pattern string ("\\."),
# which are never equal, so the "n/a" branch could not be reached and names
# without a period came out as NA instead.
middle <- rep("n/a", length(name))
middle[grepl(dot, name)] <- "yes"
# Update the data frame to add the new column.
combined_df <- data.frame(title, name, phone, middle)
combined_df
## title name phone middle
## 1 NA Moe Szyslak 555-1239 n/a
## 2 NA C. Montgomery Burns 636-555-3226 yes
## 3 Rev. Timothy Lovejoy 555-8904 n/a
## 4 NA Ned Flanders 555-3642 n/a
## 5 NA Homer Simpson 636-555-0113 n/a
## 6 Dr. Julius Hibbert 555-6542 n/a
## [1] "12$" "1$" "578$" "487$"
## [1] "drt" "txt" "few" "data" "txt" "html" "html" "load"
## [1] "Lenovo.txt" "phone8.txt"
## [1] "02/24/1954" "01/08/2019"
## [1] "<html> table </html>"
# Scrambled text: the hidden message is spelled by its upper-case letters,
# with periods standing in for the spaces between words.
message_raw<-"clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Keep only the runs of upper-case letters and periods, glue them into one
# string, then turn each period into a space.
message <- str_replace_all(
  paste(
    str_extract_all(message_raw, "[[:upper:].]{1,}")[[1]],
    collapse = ''
  ),
  "[.]",
  " "
)
message
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"