Set up install packages
library("stringi")
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.1
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.4.1
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.1
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library("sqldf")
## Warning: package 'sqldf' was built under R version 3.4.1
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
Read in the dirty data
dirtydata <- read.csv(file="C:/Users/fangy/Desktop/All/3study/harrisburg/anly510/dirty_data.csv", header=TRUE)
Remove html column
dirtydata$Strange.HTML<-NULL
Remove the special characters, leading spaces and capitalize the street names for street2 and street
dirtydata$street2<-gsub( "[[:punct:]]"," ",dirtydata$Street.2)
dirtydata$street2<-gsub("\\s+", " ",str_trim(dirtydata$street2))
dirtydata$street2<-stri_trans_general(dirtydata$street2,id="Title")
dirtydata$street<-iconv(dirtydata$street,to="ASCII//TRANSLIT")
dirtydata$street<-gsub( "[[:punct:]]"," ",dirtydata$street)
dirtydata$street<-trimws(dirtydata$street)
dirtydata$street<-stri_trans_general(dirtydata$street,id="Title")
summary(dirtydata)
## Year Area Street
## Min. :2011 :5265 BroadÌ´Street : 128
## 1st Qu.:2011 Birmingham: 2 BroadÌ´Street,Ì´Birmingham : 35
## Median :2011 Coventry : 2 BridgeÌ´Street,Ì´Walsall : 31
## Mean :2011 Dudley : 2 HurstÌ´Street : 31
## 3rd Qu.:2012 Sandwell : 2 BroadÌ´Street,Ì´CityÌ´Centre: 23
## Max. :2012 Solihull : 2 MarketÌ´Street : 23
## (Other) : 4 (Other) :5008
## Street.2 street2 street
## Broad Street : 128 Length:5279 Length:5279
## Broad Street, Birmingham : 35 Class :character Class :character
## Bridge Street, Walsall : 31 Mode :character Mode :character
## Hurst Street : 31
## Broad Street, City Centre: 23
## Market Street : 23
## (Other) :5008
Abbreviation
invisible(stri_replace_first_regex(dirtydata$street,
pattern= c("Road", "Roads","Avenue","St","Lane"), replacement=c("Rd.","Ave","St.","Rds.","Ln")))
## Warning in stri_replace_first_regex(dirtydata$street, pattern = c("Road", :
## longer object length is not a multiple of shorter object length
invisible(stri_replace_first_regex(dirtydata$street2,
pattern= c("Road", "Roads","Avenue","St","Lane"), replacement=c("Rd.","Ave","St.","Rds.","Ln")))
## Warning in stri_replace_first_regex(dirtydata$street2, pattern =
## c("Road", : longer object length is not a multiple of shorter object length
Fill the area
ddata<-data.frame(dirtydata$Year, dirtydata$Area,dirtydata$street2,dirtydata$street)
names(ddata)<-c("year","area","street2","street")
summary(ddata)
## year area street2
## Min. :2011 :5265 Broad Street : 162
## 1st Qu.:2011 Birmingham: 2 Bridge Street Walsall : 59
## Median :2011 Coventry : 2 Broad Street Birmingham : 47
## Mean :2011 Dudley : 2 Broad St : 45
## 3rd Qu.:2012 Sandwell : 2 Broad Street City Centre: 44
## Max. :2012 Solihull : 2 Hurst Street : 38
## (Other) : 4 (Other) :4884
## street
## Broad Street : 162
## Bridge Street Walsall : 59
## Broad Street Birmingham : 47
## Broad St : 45
## Broad Street City Centre: 44
## Hurst Street : 38
## (Other) :4884
cdata<-sqldf("SELECT year, area, street, street2,
CASE
WHEN (street2 LIKE '%Birmingham%' or street2 LIKE '%Acocks%') THEN 'Birmingham'
WHEN street2 LIKE '%Coventry%' THEN 'Coventry'
WHEN street2 LIKE '%Dudley%' THEN 'Dudley'
WHEN street2 LIKE '%Sandwell%' THEN 'Sandwell'
WHEN street2 LIKE '%Solihull%' THEN 'Solihull'
WHEN street2 LIKE '%Walsall%' THEN 'Walsall'
WHEN street2 LIKE '%Wolverhampton%' THEN 'Wolverhampton'
else 'A'
END area1
FROM ddata")
Remove the identical column for street and street2
cdata1 <- sqldf("SELECT year, area, area1,street, street2,
CASE WHEN street=street2 then '1' else '0'
END AS equal
FROM cdata")
cdata2 <- sqldf("SELECT year, area, area1,street, street2,equal,
CASE WHEN equal='1' then ' ' else street2
END AS street22
FROM cdata1")
Final clean data
cleandata <- data.frame(cdata2$year, cdata2$area1, cdata2$street, cdata2$street22 )
write.csv(cleandata, file="C:/Users/fangy/Desktop/All/3study/harrisburg/anly510/cleandata.csv")
summary(cleandata)
## cdata2.year cdata2.area1 cdata2.street
## Min. :2011 A :4115 Broad Street : 162
## 1st Qu.:2011 Birmingham : 318 Bridge Street Walsall : 59
## Median :2011 Walsall : 273 Broad Street Birmingham : 47
## Mean :2011 Coventry : 176 Broad St : 45
## 3rd Qu.:2012 Wolverhampton: 172 Broad Street City Centre: 44
## Max. :2012 Solihull : 135 Hurst Street : 38
## (Other) : 90 (Other) :4884
## cdata2.street22
## :5252
## Arcadian Åäìýå Birmingham : 2
## Asda Stores WåäìýåTon : 2
## Lichfield Street City Centre WåäìýåTon: 2
## Mcdonalds Åäìýå Small Heath : 2
## Belmont Road Penn WåäìýåTon : 1
## (Other) : 18