Editing Text Variables

Run these R code chunks whenever you need to clean text variables.

# Create a folder for the data if it does not exist yet
if (!dir.exists("./data")) {
  dir.create("./data")
}

# Get data from the web: CSV export from data.baltimorecity.gov, saved locally
# as cameras.csv. The query parameter is spelled `accessType`.
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl, destfile = "./data/cameras.csv")
CameraData <- read.csv("./data/cameras.csv")
# Show the column names as loaded
names(CameraData)
## [1] "address"                      "direction"                   
## [3] "street"                       "crossStreet"                 
## [5] "intersection"                 "Location.1"                  
## [7] "X2010.Census.Neighborhoods"   "X2010.Census.Wards.Precincts"
## [9] "Zip.Codes"
# Lowercase every column name (the result is printed, not stored)
tolower(x = names(CameraData))
## [1] "address"                      "direction"                   
## [3] "street"                       "crossstreet"                 
## [5] "intersection"                 "location.1"                  
## [7] "x2010.census.neighborhoods"   "x2010.census.wards.precincts"
## [9] "zip.codes"
# Split each column name at every literal dot; the result is a list with one
# character vector of pieces per name
splitNames <- strsplit(names(CameraData), split = ".", fixed = TRUE)
# Double brackets extract the pieces of the 6th name
splitNames[[6]]
## [1] "Location" "1"
#' Select the first element of a vector
#'
#' @param x A vector, e.g. the pieces of one split column name.
#' @return The first element of `x`.
firstElement <- function(x) {
  x[1L]
}
# Keep only the first piece of every split name. vapply() fixes the result to
# exactly one character string per name, whereas sapply() picks its return
# type from the input.
vapply(splitNames, firstElement, character(1))
## [1] "address"      "direction"    "street"       "crossStreet"  "intersection"
## [6] "Location"     "X2010"        "X2010"        "Zip"
# Read the data: download the reviews and solutions CSV files, then load them
if (!dir.exists("./data")) {
  dir.create("./data")
}
# Keep each URL on a single line: a line break inside the quotes becomes part
# of the string and can make the download fail
fileUrl1 <- "https://raw.githubusercontent.com/DataScienceSpecialization/courses/master/03_GettingData/04_01_editingTextVariables/data/reviews.csv"
fileUrl2 <- "https://raw.githubusercontent.com/DataScienceSpecialization/courses/master/03_GettingData/04_01_editingTextVariables/data/solutions.csv"
download.file(fileUrl1, destfile = "./data/reviews.csv")
download.file(fileUrl2, destfile = "./data/solutions.csv")
reviews <- read.csv("./data/reviews.csv")
solutions <- read.csv("./data/solutions.csv")
# Preview the first two rows
head(reviews, 2)
##   id solution_id reviewer_id      start       stop time_left accept
## 1  1           3          27 1304095698 1304095758      1754      1
## 2  2           4          22 1304095188 1304095206      2306      1
# Column names as loaded
names(x = reviews)
## [1] "id"          "solution_id" "reviewer_id" "start"       "stop"       
## [6] "time_left"   "accept"
# sub() replaces only the first underscore it finds in each name
sub("_", "", names(reviews), fixed = TRUE)
## [1] "id"         "solutionid" "reviewerid" "start"      "stop"      
## [6] "timeleft"   "accept"
testName <- "I_am_Wonder_Woman"

# sub() replaces only the first underscore
sub("_", "", testName, fixed = TRUE)
## [1] "Iam_Wonder_Woman"
# gsub() replaces every underscore
gsub("_", "", testName, fixed = TRUE)
## [1] "IamWonderWoman"
# First check whether any records match: tally the TRUE/FALSE results
table(grepl("Alameda", CameraData[["intersection"]]))
## 
## FALSE  TRUE 
##    77     3
# Then inspect: grep() returns the positions of the matching entries
grep("Alameda", CameraData[["intersection"]])
## [1] 65 69 79
# Subset: keep only the rows whose intersection does NOT match
CameraData2 <- CameraData[!grepl("Alameda", CameraData[["intersection"]]), ]

# value = TRUE returns the matching strings instead of their positions
grep("Alameda", CameraData[["intersection"]], value = TRUE)
## [1] "E 33rd  & The Alameda"    "The Alameda  & 33rd St"  
## [3] "Harford \n & The Alameda"
# Check whether a value exists by counting its matches
length(grep("JeffStreet", CameraData[["intersection"]]))
## [1] 0
# A count of 0 means the value does not exist

Using the stringr Library

library(stringr)
# Count the characters in a string
nchar(x = "Linda Angulo Lopez")
## [1] 18
# Join strings; paste() puts a single space between the pieces by default
paste("Linda", "Lopez", "!?that's not my name")
## [1] "Linda Lopez !?that's not my name"
# substr() keeps characters 1 through 12 before the pieces are joined
paste("My name is", substr("Linda Angulo Lopez", start = 1, stop = 12))
## [1] "My name is Linda Angulo"
# Join with no separator
paste0("@", "lindangulopez")
# Trim excess whitespace from both the start and the end of the string
str_trim(string = "    Tweet me            ", side = "both")
## [1] "Tweet me"

Reminder: Lists

# A list can hold different kinds of objects: here two named vectors and an
# unnamed 5 x 5 matrix
myList <- list(
  letters = c("a", "b", "c"),
  numbers = 1:3,
  matrix(1:25, nrow = 5, ncol = 5)
)

# Inspect list elements: `$` returns the element itself,
# single brackets return a one-element list
myList$letters
## [1] "a" "b" "c"
myList[1]
## $letters
## [1] "a" "b" "c"
myList$numbers
## [1] 1 2 3
myList[2]
## $numbers
## [1] 1 2 3
# The third element has no name, so it is reached by position
myList[3]
## [[1]]
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    6   11   16   21
## [2,]    2    7   12   17   22
## [3,]    3    8   13   18   23
## [4,]    4    9   14   19   24
## [5,]    5   10   15   20   25
# Select one cell of the matrix: double brackets extract the matrix,
# then [row, column] indexes into it
myList[[3]][2, 3]
## [1] 12

by Linda