library(readxl)
library(dplyr)
library(tidyr)
library(stringr)
library(stringi)
Let's load the dataset and take a look at it.
# Preview the first six rows of the raw data.
head(dataset, n = 6L)
## # A tibble: 6 x 5
## CustomerNo Name Surname Phone Email
## <dbl> <chr> <chr> <chr> <chr>
## 1 5 çigdem tu? 5321639886 saim_tuncel@hotmail.com
## 2 23 müjde öxtürk 5352426442 ecankaya@iski.gov.tr
## 3 27 itir nohutlu 5374760424 emr.ays@hotmail.com
## 4 29 seyda bekiroglu 5424248191 mehmetkahraman1974@mynet.com
## 5 31 berk güler 5449190312 msa3eedm@hotmail.com
## 6 32 numan akgül 5546874250 sdkyucel@hotmail.com
As you can see, the data contains Turkish letters such as ö, ü and ğ, and the names are all in lowercase. To standardize them, we need to apply a few cleaning steps.
# Replace Turkish-specific letters with their closest ASCII equivalents.
#
# `source` holds the letters to replace and `change` the replacement at the
# same position (chartr() maps character i of `source` to character i of
# `change`). The letters are written as \u escapes so the mapping does not
# depend on the encoding this script is read with.
source <- paste0(
  "\u015f\u00e7\u00f6\u00fc\u0130\u011f", # s-cedilla, c-cedilla, o/u-umlaut, dotted I, soft g
  "\u00c7\u015e\u00d6\u00dc\u0131\u011e"  # upper-case C/S-cedilla, O/U-umlaut, dotless i, soft G
)
change <- "scouIgCSOUiG"

# Only character columns are transliterated; other columns (e.g. numeric IDs)
# are returned unchanged instead of being coerced to text by chartr().
# stringsAsFactors = FALSE keeps text columns as character on R < 4.0, where
# data.frame() would otherwise turn them into factors.
dataset <- data.frame(
  lapply(dataset, function(x) {
    if (is.character(x)) {
      # enc2utf8() so the \u-escaped mapping and the data share one encoding.
      chartr(source, change, enc2utf8(x))
    } else {
      x
    }
  }),
  stringsAsFactors = FALSE
)
The Turkish letters in the dataset have now been replaced with their English equivalents.
# Preview the first six rows after transliteration.
head(dataset, n = 6L)
## CustomerNo Name Surname Phone Email
## 1 5 cigdem tu? 5321639886 saim_tuncel@hotmail.com
## 2 23 mujde oxturk 5352426442 ecankaya@iski.gov.tr
## 3 27 itir nohutlu 5374760424 emr.ays@hotmail.com
## 4 29 seyda bekiroglu 5424248191 mehmetkahraman1974@mynet.com
## 5 31 berk guler 5449190312 msa3eedm@hotmail.com
## 6 32 numan akgul 5546874250 sdkyucel@hotmail.com
# Upper-case every character column; all other columns pass through unchanged.
#
# stri_trans_toupper() with a fixed locale is used instead of base toupper()
# so the result does not depend on the machine's locale settings; base
# toupper() can leave some letters (notably the Turkish dotted/dotless i) in
# lower case.
# stringsAsFactors = FALSE keeps text columns as character on R < 4.0.
dataset <- data.frame(
  lapply(dataset, function(x) {
    if (is.character(x)) {
      stringi::stri_trans_toupper(x, locale = "en")
    } else {
      x
    }
  }),
  stringsAsFactors = FALSE
)
head(dataset)
## CustomerNo Name Surname Phone Email
## 1 5 CIGDEM TU? 5321639886 SAIM_TUNCEL@HOTMAIL.COM
## 2 23 MUJDE OXTURK 5352426442 ECANKAYA@ISKI.GOV.TR
## 3 27 iTiR NOHUTLU 5374760424 EMR.AYS@HOTMAIL.COM
## 4 29 SEYDA BEKIROGLU 5424248191 MEHMETKAHRAMAN1974@MYNET.COM
## 5 31 BERK GULER 5449190312 MSA3EEDM@HOTMAIL.COM
## 6 32 NUMAN AKGUL 5546874250 SDKYUCEL@HOTMAIL.COM
# Clean the Name and Surname columns (columns 2 and 3) by deleting, in order:
#   1. every run of whitespace (not just leading/trailing),
#   2. every run of punctuation characters or spaces,
#   3. every run of digits.
dataset[, 2:3] <- as.data.frame(lapply(dataset[, 2:3], function(column) {
  for (pattern in c("\\s+", "[[:punct:] ]+", "[[:digit:]]+")) {
    column <- gsub(pattern, "", column)
  }
  column
}))
# Show the first 20 cleaned names.
head(dataset[, 2:3], n = 20)
## Name Surname
## 1 CIGDEM TU
## 2 MUJDE OXTURK
## 3 iTiR NOHUTLU
## 4 SEYDA BEKIROGLU
## 5 BERK GULER
## 6 NUMAN AKGUL
## 7 ARZU KAPLAN
## 8 UFUK OZDILEK
## 9 SALIM KARAMAN
## 10 GUNDEM TULAZ
## 11 TUNC TUNCAY
## 12 EMEL MUZAC
## 13 KADIRKAAN ESMEROLU
## 14 ALI CRAKL
## 15 SEMIH EREN
## 16 TAHSIN COKUN
## 17 HSAN BILGILI
## 18 FATIH SANAY
## 19 RASIM CELEBI
## 20 CUNEYT AKYUZ