This document provides information about:

library(readxl)
library(dplyr)
library(tidyr)
library(stringr)
library(stringi)

Let's load the dataset and see what it looks like.

# Preview the first rows of the loaded data to inspect its columns and values.
head(dataset)
## # A tibble: 6 x 5
##   CustomerNo Name   Surname   Phone      Email                       
##        <dbl> <chr>  <chr>     <chr>      <chr>                       
## 1          5 çigdem tu?       5321639886 saim_tuncel@hotmail.com     
## 2         23 müjde  öxtürk    5352426442 ecankaya@iski.gov.tr        
## 3         27 itir   nohutlu   5374760424 emr.ays@hotmail.com         
## 4         29 seyda  bekiroglu 5424248191 mehmetkahraman1974@mynet.com
## 5         31 berk   güler     5449190312 msa3eedm@hotmail.com        
## 6         32 numan  akgül     5546874250 sdkyucel@hotmail.com

As you can see, the data contains Turkish letters such as ö, ü and ğ, and all of the text is in lowercase. To standardize it, we need to apply a few processing steps.

Turkish Letter Transformation

# Transliterate Turkish-specific letters to their closest ASCII equivalents.
# chartr() maps the i-th character of `source` to the i-th character of
# `change`, so both strings must keep the same length and ordering.
# Both cases of each letter are listed, including "Ğ", so upper-case input is
# handled as well as the lower-case sample data.
# NOTE: the variable name `source` shadows base::source() as a value; it is
# kept unchanged in case later code refers to it.
source <- "şçöüİğÇŞÖÜıĞ"
change <- "scouIgCSOUiG"
stopifnot(nchar(source) == nchar(change))

# Only character columns are transliterated: passing a numeric column such as
# CustomerNo through chartr() would silently coerce it to character.
# stringsAsFactors = FALSE keeps text columns as character on R < 4.0, so that
# later is.character() checks still recognise them.
dataset <- data.frame(
  lapply(dataset, function(x) {
    if (is.character(x)) chartr(source, change, x) else x
  }),
  stringsAsFactors = FALSE
)

The dataset now uses only English letters.

# Preview the data again to check the result of the transliteration.
head(dataset)
##   CustomerNo   Name   Surname      Phone                        Email
## 1          5 cigdem       tu? 5321639886      saim_tuncel@hotmail.com
## 2         23  mujde    oxturk 5352426442         ecankaya@iski.gov.tr
## 3         27   itir   nohutlu 5374760424          emr.ays@hotmail.com
## 4         29  seyda bekiroglu 5424248191 mehmetkahraman1974@mynet.com
## 5         31   berk     guler 5449190312         msa3eedm@hotmail.com
## 6         32  numan     akgul 5546874250         sdkyucel@hotmail.com

Capital Letter Transformation

# Upper-case every character column; all other columns are returned untouched.
#
# ASCII letters are mapped explicitly with chartr() before toupper() is
# called. toupper() follows the case rules of the session locale, and with
# Turkish case rules a lower-case "i" does not become "I" (presumably the
# cause of results such as "iTiR" instead of "ITIR" -- verify against the
# locale in use). After the chartr() pass no ASCII lower-case letter is left,
# so toupper() only has to deal with any remaining non-ASCII letters.
#
# stringsAsFactors = FALSE keeps the columns as character on R < 4.0.
dataset <- data.frame(
  lapply(dataset, function(x) {
    if (!is.character(x)) {
      return(x)
    }
    toupper(
      chartr("abcdefghijklmnopqrstuvwxyz", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", x)
    )
  }),
  stringsAsFactors = FALSE
)
head(dataset)
##   CustomerNo   Name   Surname      Phone                        Email
## 1          5 CIGDEM       TU? 5321639886      SAIM_TUNCEL@HOTMAIL.COM
## 2         23  MUJDE    OXTURK 5352426442         ECANKAYA@ISKI.GOV.TR
## 3         27   iTiR   NOHUTLU 5374760424          EMR.AYS@HOTMAIL.COM
## 4         29  SEYDA BEKIROGLU 5424248191 MEHMETKAHRAMAN1974@MYNET.COM
## 5         31   BERK     GULER 5449190312         MSA3EEDM@HOTMAIL.COM
## 6         32  NUMAN     AKGUL 5546874250         SDKYUCEL@HOTMAIL.COM

Name-surname Standardization

# Clean the name and surname columns (columns 2 and 3) by deleting, in order:
#   1. every run of whitespace (inner spaces too, not only leading/trailing),
#   2. punctuation characters and literal spaces,
#   3. digits.
# Reduce() feeds the result of each pass into the next one, starting from the
# current contents of the two columns.
dataset[, 2:3] <- Reduce(
  function(cols, pattern) {
    as.data.frame(lapply(cols, function(x) gsub(pattern, "", x)))
  },
  c("\\s+", "[[:punct:] ]+", "[[:digit:]]+"),
  dataset[, 2:3]
)
# Show the first 20 cleaned name/surname pairs.
head(dataset[, 2:3], 20)
##         Name   Surname
## 1     CIGDEM        TU
## 2      MUJDE    OXTURK
## 3       iTiR   NOHUTLU
## 4      SEYDA BEKIROGLU
## 5       BERK     GULER
## 6      NUMAN     AKGUL
## 7       ARZU    KAPLAN
## 8       UFUK   OZDILEK
## 9      SALIM   KARAMAN
## 10    GUNDEM     TULAZ
## 11      TUNC    TUNCAY
## 12      EMEL     MUZAC
## 13 KADIRKAAN  ESMEROLU
## 14       ALI     CRAKL
## 15     SEMIH      EREN
## 16    TAHSIN     COKUN
## 17      HSAN   BILGILI
## 18     FATIH     SANAY
## 19     RASIM    CELEBI
## 20    CUNEYT     AKYUZ