Set up install packages

library("stringi")
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.1
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.4.1
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.1
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
library("sqldf")
## Warning: package 'sqldf' was built under R version 3.4.1
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite

Read in the dirty data

dirtydata <- read.csv(file="C:/Users/fangy/Desktop/All/3study/harrisburg/anly510/dirty_data.csv", header=TRUE)

Remove html column

dirtydata$Strange.HTML<-NULL

Remove the special characters, leading spaces and capitalize the street names for street2 and street

dirtydata$street2<-gsub( "[[:punct:]]"," ",dirtydata$Street.2)
dirtydata$street2<-gsub("\\s+", " ",str_trim(dirtydata$street2))
dirtydata$street2<-stri_trans_general(dirtydata$street2,id="Title")


dirtydata$street<-iconv(dirtydata$street,to="ASCII//TRANSLIT")
dirtydata$street<-gsub( "[[:punct:]]"," ",dirtydata$street)
dirtydata$street<-trimws(dirtydata$street)
dirtydata$street<-stri_trans_general(dirtydata$street,id="Title")
summary(dirtydata)
##       Year              Area                                Street    
##  Min.   :2011             :5265   BroadÌ´Street                : 128  
##  1st Qu.:2011   Birmingham:   2    BroadÌ´Street,Ì´Birmingham  :  35  
##  Median :2011   Coventry  :   2    BridgeÌ´Street,Ì´Walsall    :  31  
##  Mean   :2011   Dudley    :   2   HurstÌ´Street                :  31  
##  3rd Qu.:2012   Sandwell  :   2    BroadÌ´Street,Ì´CityÌ´Centre:  23  
##  Max.   :2012   Solihull  :   2   MarketÌ´Street               :  23  
##                 (Other)   :   4   (Other)                      :5008  
##                        Street.2      street2             street         
##  Broad Street              : 128   Length:5279        Length:5279       
##   Broad Street, Birmingham :  35   Class :character   Class :character  
##   Bridge Street, Walsall   :  31   Mode  :character   Mode  :character  
##  Hurst Street              :  31                                        
##   Broad Street, City Centre:  23                                        
##  Market Street             :  23                                        
##  (Other)                   :5008

Abbreviation

invisible(stri_replace_first_regex(dirtydata$street,
pattern= c("Road", "Roads","Avenue","St","Lane"), replacement=c("Rd.","Ave","St.","Rds.","Ln")))
## Warning in stri_replace_first_regex(dirtydata$street, pattern = c("Road", :
## longer object length is not a multiple of shorter object length
invisible(stri_replace_first_regex(dirtydata$street2,
pattern= c("Road", "Roads","Avenue","St","Lane"), replacement=c("Rd.","Ave","St.","Rds.","Ln")))
## Warning in stri_replace_first_regex(dirtydata$street2, pattern =
## c("Road", : longer object length is not a multiple of shorter object length

Fill the area

ddata<-data.frame(dirtydata$Year, dirtydata$Area,dirtydata$street2,dirtydata$street)
names(ddata)<-c("year","area","street2","street")
summary(ddata)
##       year              area                          street2    
##  Min.   :2011             :5265   Broad Street            : 162  
##  1st Qu.:2011   Birmingham:   2   Bridge Street Walsall   :  59  
##  Median :2011   Coventry  :   2   Broad Street Birmingham :  47  
##  Mean   :2011   Dudley    :   2   Broad St                :  45  
##  3rd Qu.:2012   Sandwell  :   2   Broad Street City Centre:  44  
##  Max.   :2012   Solihull  :   2   Hurst Street            :  38  
##                 (Other)   :   4   (Other)                 :4884  
##                       street    
##  Broad Street            : 162  
##  Bridge Street Walsall   :  59  
##  Broad Street Birmingham :  47  
##  Broad St                :  45  
##  Broad Street City Centre:  44  
##  Hurst Street            :  38  
##  (Other)                 :4884
cdata<-sqldf("SELECT year, area, street, street2,
                   CASE
                   WHEN (street2 LIKE '%Birmingham%' or street2 LIKE '%Acocks%') THEN 'Birmingham' 
WHEN street2 LIKE '%Coventry%' THEN 'Coventry' 
WHEN street2 LIKE '%Dudley%' THEN 'Dudley' 
WHEN street2 LIKE '%Sandwell%' THEN 'Sandwell' 
WHEN street2 LIKE '%Solihull%' THEN 'Solihull' 
WHEN street2 LIKE '%Walsall%' THEN 'Walsall' 
WHEN street2 LIKE '%Wolverhampton%' THEN 'Wolverhampton' 
else 'A' 
END area1 
FROM ddata") 

Remove the identical column for street and street2

cdata1 <- sqldf("SELECT year, area, area1,street, street2,
                CASE WHEN street=street2 then '1' else '0' 
                END AS equal
                FROM cdata")

cdata2 <- sqldf("SELECT year, area, area1,street, street2,equal,
                CASE WHEN equal='1' then ' ' else street2 
                END AS street22
                FROM cdata1")

Final clean data

cleandata <- data.frame(cdata2$year, cdata2$area1, cdata2$street, cdata2$street22 )

write.csv(cleandata, file="C:/Users/fangy/Desktop/All/3study/harrisburg/anly510/cleandata.csv")
summary(cleandata)
##   cdata2.year          cdata2.area1                   cdata2.street 
##  Min.   :2011   A            :4115   Broad Street            : 162  
##  1st Qu.:2011   Birmingham   : 318   Bridge Street Walsall   :  59  
##  Median :2011   Walsall      : 273   Broad Street Birmingham :  47  
##  Mean   :2011   Coventry     : 176   Broad St                :  45  
##  3rd Qu.:2012   Wolverhampton: 172   Broad Street City Centre:  44  
##  Max.   :2012   Solihull     : 135   Hurst Street            :  38  
##                 (Other)      :  90   (Other)                 :4884  
##                                 cdata2.street22
##                                         :5252  
##  Arcadian Åäìýå Birmingham             :   2  
##  Asda Stores WåäìýåTon                 :   2  
##  Lichfield Street City Centre WåäìýåTon:   2  
##  Mcdonalds Åäìýå Small Heath           :   2  
##  Belmont Road Penn WåäìýåTon           :   1  
##  (Other)                                :  18