Data 607 Week2

The final table of parsed names is presented first, followed by the code that produces it.

##      First_Name Last_Name Second_Name_Boolean Title_Boolean Title
## 1           Moe   Szyslak                  No            No      
## 2 C. Montgomery     Burns                 Yes            No      
## 3   Rev Timothy   Lovejoy                  No           Yes   Rev
## 4           Ned  Flanders                  No            No      
## 5         Homer   Simpson                  No            No      
## 6     Dr Julius   Hibbert                  No           Yes    Dr

library(stringr)

# Raw phone-book text: names are interleaved with phone numbers and appear as
# "First Last" or "Last, First", optionally with a title ("Rev.", "Dr.") or a
# leading initial ("C.").
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is any run of two or more letters, periods, commas or spaces.
# Requiring at least two characters skips the single spaces that occur
# inside phone numbers such as "555 8904".
full.names <- str_trim(unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}")))

# Rewrite "Last, First" as "First Last" so every name has the same word order.
ordered.names <- str_replace(full.names, "^([^,]+),\\s*(.+)$", "\\2 \\1")

# A title is a leading word of two or more letters that ends in a period.
# Anchoring at the start keeps names that merely contain "Dr" or "Rev"
# (e.g. "Drew") from being mistaken for titles.
title.pattern <- "^([[:alpha:]]{2,})\\."
titles <- str_match(ordered.names, title.pattern)[, 2]
has.title <- !is.na(titles)

# A second name shows up as a single capital initial followed by a period.
has.second.name <- str_detect(ordered.names, "\\b[A-Z]\\.")

# Drop the period after the title, then split each name into its last word
# (the last name) and everything before it (title, initial and first name).
plain.names <- str_replace(ordered.names, title.pattern, "\\1")
last.names <- str_extract(plain.names, "\\S+$")
first.names <- str_replace(plain.names, "\\s+\\S+$", "")

# Character columns throughout: people without a title get an empty string
# rather than a coerced numeric placeholder.
final.answer <- data.frame(
  First_Name = first.names,
  Last_Name = last.names,
  Second_Name_Boolean = ifelse(has.second.name, "Yes", "No"),
  Title_Boolean = ifelse(has.title, "Yes", "No"),
  Title = ifelse(has.title, titles, ""),
  stringsAsFactors = FALSE
)
final.answer

The first regular expression, `[0-9]+\$`, matches one or more digits immediately followed by a dollar sign.

# Sample text mixing digit runs that end in "$" with unrelated characters.
example1 <- "990$ gog akd kkd2$5$  %%%5$"
# Pull out every run of digits that is directly followed by a literal "$".
extraction1 <- str_extract_all(example1, "[0-9]+\\$")[[1]]
print(extraction1)

## [1] "990$" "2$"   "5$"   "5$"

The second regular expression, `\b[a-z]{1,4}\b`, matches whole words made up of one to four lowercase letters.

# Sample sentence containing capitalised words and a five-letter word,
# neither of which should be returned.
example2 <- "I am for.. each panda Now."
# Pull out every whole word consisting of one to four lowercase letters.
extraction2 <- str_extract_all(example2, "\\b[a-z]{1,4}\\b")[[1]]
print(extraction2)

## [1] "am"   "for"  "each"

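The third regular expression, `.*?\.txt$`, matches any string that ends in .txt, such as a text file name. Because the pattern is anchored to the end of the string, the match runs from the start of the string through the final .txt.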

# Sample string that contains ".txt" in the middle and again at the very end.
example3 <- "I am.txt$ for.. each panda Now.txt"
# The "$" anchor ties the match to the end of the string, so the lazy ".*?"
# still has to stretch from the first character to the closing ".txt".
extraction3 <- str_extract_all(example3, ".*?\\.txt$")[[1]]
print(extraction3)

## [1] "I am.txt$ for.. each panda Now.txt"

# Smallest possible case: the string is nothing but the ".txt" suffix.
example4 <- ".txt"
# ".*?" may match zero characters, so a bare ".txt" is matched as well.
extraction4 <- str_extract_all(example4, ".*?\\.txt$")[[1]]
print(extraction4)

## [1] ".txt"

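The fourth regular expression, `\d{2}/\d{2}/\d{4}`, matches dates written as two digits, two digits and four digits separated by slashes, such as mm/dd/yyyy. A date with a one-digit month or day, like 3/17/1980, is not matched.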

# Three dates; the middle one has a one-digit month and should be skipped.
example5 <- "05/26/2004 3/17/1980 07/16/2000"
# Pull out every two-digit/two-digit/four-digit group separated by slashes.
extraction5 <- str_extract_all(example5, "\\d{2}/\\d{2}/\\d{4}")[[1]]
print(extraction5)

## [1] "05/26/2004" "07/16/2000"

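The last regular expression, `<(.+?)>.+?</\1>`, matches an HTML-style element: an opening tag, some content, and a closing tag with the same name, like the color code around this statement.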

# One complete <fred>...</fred> element followed by an unterminated "</fred".
example6 <- "<fred>aa aa</fred> ate </fred"
# The backreference "\\1" requires the closing tag name to repeat the opening
# tag name captured by "(.+?)".
extraction6 <- str_extract_all(example6, "<(.+?)>.+?</\\1>")[[1]]
print(extraction6)

## [1] "<fred>aa aa</fred>"

Data 607 Week2

Dan Wigodsky

February 19, 2018

Our final answer is presented first, followed by the code to accomplish it.

The first code is to find numbers followed by dollar signs.

The second code finds words that are all lower case and between 1 and 4 letters.

The third code is to find text files and will find anything ending in .txt.

The fourth code is to find dates in mm/dd/yyyy.

The last code finds html codes, like the color code around this statement.