DATA CLEANING

load libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)

#Step-1 read Dataset

df <- read.csv("E:/dbda036/R-Files1/Assignment/Day 2/DataCleaningExercise-Dataset1.csv", header=T, stringsAsFactors=F)
View(df)

#Step -2 Define Functions

detectNAs <- function(inp) {
  return(sum(is.na(inp)))
}

detectZeros <- function(inp) {
  if (class(inp) != "numeric") {
    return ("Non Numeric Column")
  }
  sum(inp==0)
}
detectZeros(0)
## [1] 1
detectSpaces <- function(inp) {
  if (class(inp) != "character") {
    return ("Non Character Column")
  }
  sum(trimws(inp)=="")
}

#Step -3 Finding Which Data Needs Cleaning

lapply(df, FUN=detectNAs)
## $Office.Title
## [1] 0
## 
## $Office.Description
## [1] 0
## 
## $Office.Address.1
## [1] 0
## 
## $Office.Address.2
## [1] 0
## 
## $City
## [1] 0
## 
## $State
## [1] 0
## 
## $Zip.Code
## [1] 0
## 
## $Office.Phone
## [1] 0
## 
## $Parish
## [1] 0
## 
## $Candidate.Name
## [1] 0
## 
## $Candidate.Address.1
## [1] 0
## 
## $Candidate.Address.2
## [1] 0
## 
## $Candidate.City
## [1] 0
## 
## $Candidate.State
## [1] 0
## 
## $Candidate.Zip.Code
## [1] 0
## 
## $Phone
## [1] 0
## 
## $Ethnicity
## [1] 0
## 
## $Sex
## [1] 0
## 
## $Party.Code
## [1] 0
## 
## $Office.Level
## [1] 0
## 
## $Expiration.Date
## [1] 0
## 
## $Commissioned.Date
## [1] 0
## 
## $Salutation
## [1] 0
lapply(df, FUN=detectZeros)
## $Office.Title
## [1] "Non Numeric Column"
## 
## $Office.Description
## [1] "Non Numeric Column"
## 
## $Office.Address.1
## [1] "Non Numeric Column"
## 
## $Office.Address.2
## [1] "Non Numeric Column"
## 
## $City
## [1] "Non Numeric Column"
## 
## $State
## [1] "Non Numeric Column"
## 
## $Zip.Code
## [1] "Non Numeric Column"
## 
## $Office.Phone
## [1] "Non Numeric Column"
## 
## $Parish
## [1] "Non Numeric Column"
## 
## $Candidate.Name
## [1] "Non Numeric Column"
## 
## $Candidate.Address.1
## [1] "Non Numeric Column"
## 
## $Candidate.Address.2
## [1] "Non Numeric Column"
## 
## $Candidate.City
## [1] "Non Numeric Column"
## 
## $Candidate.State
## [1] "Non Numeric Column"
## 
## $Candidate.Zip.Code
## [1] "Non Numeric Column"
## 
## $Phone
## [1] "Non Numeric Column"
## 
## $Ethnicity
## [1] "Non Numeric Column"
## 
## $Sex
## [1] "Non Numeric Column"
## 
## $Party.Code
## [1] "Non Numeric Column"
## 
## $Office.Level
## [1] "Non Numeric Column"
## 
## $Expiration.Date
## [1] "Non Numeric Column"
## 
## $Commissioned.Date
## [1] "Non Numeric Column"
## 
## $Salutation
## [1] "Non Numeric Column"
lapply(df, FUN=detectSpaces)
## $Office.Title
## [1] 0
## 
## $Office.Description
## [1] 284
## 
## $Office.Address.1
## [1] 2357
## 
## $Office.Address.2
## [1] 6862
## 
## $City
## [1] 2358
## 
## $State
## [1] 10
## 
## $Zip.Code
## [1] 2366
## 
## $Office.Phone
## [1] 2397
## 
## $Parish
## [1] 1013
## 
## $Candidate.Name
## [1] 911
## 
## $Candidate.Address.1
## [1] 911
## 
## $Candidate.Address.2
## [1] 6862
## 
## $Candidate.City
## [1] 912
## 
## $Candidate.State
## [1] 911
## 
## $Candidate.Zip.Code
## [1] 912
## 
## $Phone
## [1] 1016
## 
## $Ethnicity
## [1] 1096
## 
## $Sex
## [1] 1103
## 
## $Party.Code
## [1] 1051
## 
## $Office.Level
## [1] "Non Character Column"
## 
## $Expiration.Date
## [1] 1104
## 
## $Commissioned.Date
## [1] 966
## 
## $Salutation
## [1] 2822

#Step -4 Data cleaning

################
# Office Title
################

df$Office.Title <- str_replace_all(df$Office.Title, pattern="[[:punct:]]", "")
df$Office.Title <- trimws(toupper(df$Office.Title))

df$Office.Title <- trimws(toupper(df$Office.Title))
df$Office.Title[df$Office.Title==" "] <- NA
#####################
# Office Description
#####################

df$Office.Description <- str_replace_all(df$Office.Description, pattern="[[:punct:]]", "")
df$Office.Description <- trimws(toupper(df$Office.Description))
df$Office.Description <- trimws(toupper(df$Office.Description))
df$Office.Description[df$Office.Description==""] <- NA
#####################
# Office Address 1
#####################
df$Office.Address.1 <- str_replace_all(df$Office.Address.1, pattern="[[:punct:]]", "")
df$Office.Address.1 <- trimws(toupper(df$Office.Address.1))
df$Office.Address.1 <- trimws(toupper(df$Office.Address.1))
df$Office.Address.1[df$Office.Address.1==""] <- NA
#####################
# Office Address 2
#####################
df$Office.Address.2 <- str_replace_all(df$Office.Address.2, pattern="[[:punct:]]", "")
df$Office.Address.2 <- trimws(toupper(df$Office.Address.2))
df$Office.Address.2 <- trimws(toupper(df$Office.Address.2))
df$Office.Address.2[df$Office.Address.2==""] <- NA
####################
# City
#####################

df$City <- str_replace_all(df$City, pattern="St.", "")
df$City <- str_replace_all(df$City, pattern="[[:punct:]]", "")
df$City <- trimws(toupper(df$City))
df$City[df$City==""] <- NA
#####################
# State
#####################
df$State[df$State=="  "] <- NA
summarise(group_by(df, State), n())
## # A tibble: 2 x 2
##   State `n()`
##   <chr> <int>
## 1 LA     6855
## 2 <NA>     10
#####################
# Zip Code
#####################
getZipcode <- function(value){
  l=strsplit(value,'')[[1]]
  if (length(l) ==10) {
    if(l[6]=='-'){
      return(value)
    } 
  }else if(length(l)==5){
      return(value)
    }
  else {
    return(NA)
  }
  }

df$Zip.Code <- lapply(df$Zip.Code,getZipcode)
#####################
# Parish
#####################
df$Parish <- str_replace_all(df$Parish, pattern="[[:punct:]]", "")
df$Parish[df$Parish==""] <- NA
df$Parish <- str_replace_all(df$Parish, pattern="ST.", "")
df$Parish <- trimws(toupper(df$Parish))
#####################
# Candidate Name
#####################
df$Candidate.Name <- str_replace_all(df$Candidate.Name, pattern="[[:punct:]]", "")
df$Candidate.Name[df$Candidate.Name==""] <- NA
df$Candidate.Name <- trimws(toupper(df$Candidate.Name))
#####################
# Candidate Address 1
#####################
df$Candidate.Address.1 <- str_replace_all(df$Candidate.Address.1, pattern="[[:punct:]]", "")
df$Candidate.Address.1[df$Candidate.Address.1==""] <- NA
df$Candidate.Address.1 <- trimws(toupper(df$Candidate.Address.1))
#####################
# Candidate Address 2
#####################
df$Candidate.Address.2 <- str_replace_all(df$Candidate.Address.2, pattern="[[:punct:]]", "")
df$Candidate.Address.2[df$Candidate.Address.2==""] <- NA
df$Candidate.Address.2 <- trimws(toupper(df$Candidate.Address.2))
#####################
# Candidate City
#####################
df$Candidate.City <- str_replace_all(df$Candidate.City, pattern="St.", "")
df$Candidate.City <- str_replace_all(df$Candidate.City, pattern="[[:punct:]]", "")
df$Candidate.City <- trimws(toupper(df$Candidate.City))
df$Candidate.City[df$Candidate.City==""] <- NA
#####################
# Candidate State
#####################

df$Candidate.State <- str_replace_all(df$Candidate.State, pattern="[[:punct:]]", "")
df$Candidate.State[df$Candidate.State==""] <- NA
#####################
# Candidate Zip Code
#####################
getZipcode <- function(value){
  l=strsplit(value,'')[[1]]
  if (length(l) ==10) {
    if(l[6]=='-'){
      return(value)
    } 
  }else if(length(l)==5){
    return(value)
  }
  else {
    return(NA)
  }
}

df$Candidate.Zip.Code <- lapply(df$Candidate.Zip.Code,getZipcode)
#####################
# Phone
#####################
getPhone <- function(value){
  l=strsplit(value,'')[[1]]
  if (length(l) ==12) {
    if(l[4]=='-' && l[8]=='-' ){
      return(value)
    } 
  }else {
    return(NA)
  }
}

df$Phone <- lapply(df$Phone,getPhone)

df$Candidate.State <- str_replace_all(df$Candidate.State, pattern="[[:punct:]]", "")
df$Phone[df$Phone==""] <- NA
#####################
#Ethnicity
#####################
df$Ethnicity<- factor(df$Ethnicity,levels = c("A","B","H","I","O","W"),labels = c(1,2,3,4,5,6),ordered = TRUE)
df$Ethnicity[df$Ethnicity==""] <- NA
#####################
# Salutation
#####################

df$Salutation <- str_replace_all(df$Salutation, pattern="Chief", "")
df$Salutation <- str_replace_all(df$Salutation, pattern="Judge", "")
df$Salutation <- str_replace_all(df$Salutation, pattern="Mr.", "")
df$Salutation <- str_replace_all(df$Salutation, pattern="Ms.", "")

Salcheck <- function(g,n){
  if (g==0 && n!=""){
    return(paste("Mr.",n))
  }else if (g==1 && n!=""){
    return(paste("Ms.",n))
  }else{
    return("NA")
  }
}
df$Salutation<-mapply(Salcheck,df$Sex,df$Salutation)
#####################
# Sex
#####################
df$Sex<- factor(df$Sex,levels = c("M","F"),labels = c(0,1),ordered = TRUE)
df$Sex[df$Sex==""] <- NA
#####################
# Party Code
#####################
df$Party.Code<- factor(df$Party.Code,levels = c("D","L","N","O","R"),labels = c(1,2,3,4,5),ordered = TRUE)
df$Party.Code[df$Party.Code==""] <- NA
#####################
# Office Level
#####################

df$Office.Level[df$Office.Level==""] <- NA
#####################
# Expiration Date
#####################
df$Expiration.Date[df$Expiration.Date==""] <- NA
df$Expiration.Date <- as.Date(df$Expiration.Date,format=" %m/%d/%Y")
#####################
# Commissioned Date
#####################

df$Commissioned.Date[df$Commissioned.Date==""] <- NA
df$Commissioned.Date <- as.Date(df$Commissioned.Date,format="%d-%m-%Y")