library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
#Step-1 read Dataset
df <- read.csv("E:/dbda036/R-Files1/Assignment/Day 2/DataCleaningExercise-Dataset1.csv", header=T, stringsAsFactors=F)
View(df)
#Step -2 Define Functions
detectNAs <- function(inp) {
return(sum(is.na(inp)))
}
detectZeros <- function(inp) {
if (class(inp) != "numeric") {
return ("Non Numeric Column")
}
sum(inp==0)
}
detectZeros(0)
## [1] 1
detectSpaces <- function(inp) {
if (class(inp) != "character") {
return ("Non Character Column")
}
sum(trimws(inp)=="")
}
#Step -3 Finding Which Data Needs Cleaning
lapply(df, FUN=detectNAs)
## $Office.Title
## [1] 0
##
## $Office.Description
## [1] 0
##
## $Office.Address.1
## [1] 0
##
## $Office.Address.2
## [1] 0
##
## $City
## [1] 0
##
## $State
## [1] 0
##
## $Zip.Code
## [1] 0
##
## $Office.Phone
## [1] 0
##
## $Parish
## [1] 0
##
## $Candidate.Name
## [1] 0
##
## $Candidate.Address.1
## [1] 0
##
## $Candidate.Address.2
## [1] 0
##
## $Candidate.City
## [1] 0
##
## $Candidate.State
## [1] 0
##
## $Candidate.Zip.Code
## [1] 0
##
## $Phone
## [1] 0
##
## $Ethnicity
## [1] 0
##
## $Sex
## [1] 0
##
## $Party.Code
## [1] 0
##
## $Office.Level
## [1] 0
##
## $Expiration.Date
## [1] 0
##
## $Commissioned.Date
## [1] 0
##
## $Salutation
## [1] 0
lapply(df, FUN=detectZeros)
## $Office.Title
## [1] "Non Numeric Column"
##
## $Office.Description
## [1] "Non Numeric Column"
##
## $Office.Address.1
## [1] "Non Numeric Column"
##
## $Office.Address.2
## [1] "Non Numeric Column"
##
## $City
## [1] "Non Numeric Column"
##
## $State
## [1] "Non Numeric Column"
##
## $Zip.Code
## [1] "Non Numeric Column"
##
## $Office.Phone
## [1] "Non Numeric Column"
##
## $Parish
## [1] "Non Numeric Column"
##
## $Candidate.Name
## [1] "Non Numeric Column"
##
## $Candidate.Address.1
## [1] "Non Numeric Column"
##
## $Candidate.Address.2
## [1] "Non Numeric Column"
##
## $Candidate.City
## [1] "Non Numeric Column"
##
## $Candidate.State
## [1] "Non Numeric Column"
##
## $Candidate.Zip.Code
## [1] "Non Numeric Column"
##
## $Phone
## [1] "Non Numeric Column"
##
## $Ethnicity
## [1] "Non Numeric Column"
##
## $Sex
## [1] "Non Numeric Column"
##
## $Party.Code
## [1] "Non Numeric Column"
##
## $Office.Level
## [1] "Non Numeric Column"
##
## $Expiration.Date
## [1] "Non Numeric Column"
##
## $Commissioned.Date
## [1] "Non Numeric Column"
##
## $Salutation
## [1] "Non Numeric Column"
lapply(df, FUN=detectSpaces)
## $Office.Title
## [1] 0
##
## $Office.Description
## [1] 284
##
## $Office.Address.1
## [1] 2357
##
## $Office.Address.2
## [1] 6862
##
## $City
## [1] 2358
##
## $State
## [1] 10
##
## $Zip.Code
## [1] 2366
##
## $Office.Phone
## [1] 2397
##
## $Parish
## [1] 1013
##
## $Candidate.Name
## [1] 911
##
## $Candidate.Address.1
## [1] 911
##
## $Candidate.Address.2
## [1] 6862
##
## $Candidate.City
## [1] 912
##
## $Candidate.State
## [1] 911
##
## $Candidate.Zip.Code
## [1] 912
##
## $Phone
## [1] 1016
##
## $Ethnicity
## [1] 1096
##
## $Sex
## [1] 1103
##
## $Party.Code
## [1] 1051
##
## $Office.Level
## [1] "Non Character Column"
##
## $Expiration.Date
## [1] 1104
##
## $Commissioned.Date
## [1] 966
##
## $Salutation
## [1] 2822
#Step -4 Data cleaning
################
# Office Title
################
df$Office.Title <- str_replace_all(df$Office.Title, pattern="[[:punct:]]", "")
df$Office.Title <- trimws(toupper(df$Office.Title))
df$Office.Title <- trimws(toupper(df$Office.Title))
df$Office.Title[df$Office.Title==" "] <- NA
#####################
# Office Description
#####################
df$Office.Description <- str_replace_all(df$Office.Description, pattern="[[:punct:]]", "")
df$Office.Description <- trimws(toupper(df$Office.Description))
df$Office.Description <- trimws(toupper(df$Office.Description))
df$Office.Description[df$Office.Description==""] <- NA
#####################
# Office Address 1
#####################
df$Office.Address.1 <- str_replace_all(df$Office.Address.1, pattern="[[:punct:]]", "")
df$Office.Address.1 <- trimws(toupper(df$Office.Address.1))
df$Office.Address.1 <- trimws(toupper(df$Office.Address.1))
df$Office.Address.1[df$Office.Address.1==""] <- NA
#####################
# Office Address 2
#####################
df$Office.Address.2 <- str_replace_all(df$Office.Address.2, pattern="[[:punct:]]", "")
df$Office.Address.2 <- trimws(toupper(df$Office.Address.2))
df$Office.Address.2 <- trimws(toupper(df$Office.Address.2))
df$Office.Address.2[df$Office.Address.2==""] <- NA
####################
# City
#####################
df$City <- str_replace_all(df$City, pattern="St.", "")
df$City <- str_replace_all(df$City, pattern="[[:punct:]]", "")
df$City <- trimws(toupper(df$City))
df$City[df$City==""] <- NA
#####################
# State
#####################
df$State[df$State==" "] <- NA
summarise(group_by(df, State), n())
## # A tibble: 2 x 2
## State `n()`
## <chr> <int>
## 1 LA 6855
## 2 <NA> 10
#####################
# Zip Code
#####################
getZipcode <- function(value){
l=strsplit(value,'')[[1]]
if (length(l) ==10) {
if(l[6]=='-'){
return(value)
}
}else if(length(l)==5){
return(value)
}
else {
return(NA)
}
}
df$Zip.Code <- lapply(df$Zip.Code,getZipcode)
#####################
# Parish
#####################
df$Parish <- str_replace_all(df$Parish, pattern="[[:punct:]]", "")
df$Parish[df$Parish==""] <- NA
df$Parish <- str_replace_all(df$Parish, pattern="ST.", "")
df$Parish <- trimws(toupper(df$Parish))
#####################
# Candidate Name
#####################
df$Candidate.Name <- str_replace_all(df$Candidate.Name, pattern="[[:punct:]]", "")
df$Candidate.Name[df$Candidate.Name==""] <- NA
df$Candidate.Name <- trimws(toupper(df$Candidate.Name))
#####################
# Candidate Address 1
#####################
df$Candidate.Address.1 <- str_replace_all(df$Candidate.Address.1, pattern="[[:punct:]]", "")
df$Candidate.Address.1[df$Candidate.Address.1==""] <- NA
df$Candidate.Address.1 <- trimws(toupper(df$Candidate.Address.1))
#####################
# Candidate Address 2
#####################
df$Candidate.Address.2 <- str_replace_all(df$Candidate.Address.2, pattern="[[:punct:]]", "")
df$Candidate.Address.2[df$Candidate.Address.2==""] <- NA
df$Candidate.Address.2 <- trimws(toupper(df$Candidate.Address.2))
#####################
# Candidate City
#####################
df$Candidate.City <- str_replace_all(df$Candidate.City, pattern="St.", "")
df$Candidate.City <- str_replace_all(df$Candidate.City, pattern="[[:punct:]]", "")
df$Candidate.City <- trimws(toupper(df$Candidate.City))
df$Candidate.City[df$Candidate.City==""] <- NA
#####################
# Candidate State
#####################
df$Candidate.State <- str_replace_all(df$Candidate.State, pattern="[[:punct:]]", "")
df$Candidate.State[df$Candidate.State==""] <- NA
#####################
# Candidate Zip Code
#####################
getZipcode <- function(value){
l=strsplit(value,'')[[1]]
if (length(l) ==10) {
if(l[6]=='-'){
return(value)
}
}else if(length(l)==5){
return(value)
}
else {
return(NA)
}
}
df$Candidate.Zip.Code <- lapply(df$Candidate.Zip.Code,getZipcode)
#####################
# Phone
#####################
getPhone <- function(value){
l=strsplit(value,'')[[1]]
if (length(l) ==12) {
if(l[4]=='-' && l[8]=='-' ){
return(value)
}
}else {
return(NA)
}
}
df$Phone <- lapply(df$Phone,getPhone)
df$Candidate.State <- str_replace_all(df$Candidate.State, pattern="[[:punct:]]", "")
df$Phone[df$Phone==""] <- NA
#####################
#Ethnicity
#####################
df$Ethnicity<- factor(df$Ethnicity,levels = c("A","B","H","I","O","W"),labels = c(1,2,3,4,5,6),ordered = TRUE)
df$Ethnicity[df$Ethnicity==""] <- NA
#####################
# Salutation
#####################
df$Salutation <- str_replace_all(df$Salutation, pattern="Chief", "")
df$Salutation <- str_replace_all(df$Salutation, pattern="Judge", "")
df$Salutation <- str_replace_all(df$Salutation, pattern="Mr.", "")
df$Salutation <- str_replace_all(df$Salutation, pattern="Ms.", "")
Salcheck <- function(g,n){
if (g==0 && n!=""){
return(paste("Mr.",n))
}else if (g==1 && n!=""){
return(paste("Ms.",n))
}else{
return("NA")
}
}
df$Salutation<-mapply(Salcheck,df$Sex,df$Salutation)
#####################
# Sex
#####################
df$Sex<- factor(df$Sex,levels = c("M","F"),labels = c(0,1),ordered = TRUE)
df$Sex[df$Sex==""] <- NA
#####################
# Party Code
#####################
df$Party.Code<- factor(df$Party.Code,levels = c("D","L","N","O","R"),labels = c(1,2,3,4,5),ordered = TRUE)
df$Party.Code[df$Party.Code==""] <- NA
#####################
# Office Level
#####################
df$Office.Level[df$Office.Level==""] <- NA
#####################
# Expiration Date
#####################
df$Expiration.Date[df$Expiration.Date==""] <- NA
df$Expiration.Date <- as.Date(df$Expiration.Date,format=" %m/%d/%Y")
#####################
# Commissioned Date
#####################
df$Commissioned.Date[df$Commissioned.Date==""] <- NA
df$Commissioned.Date <- as.Date(df$Commissioned.Date,format="%d-%m-%Y")