This is an exercise to import an Excel file and clean it up.
This loads a CSV file. This could be improved by directly importing an Excel xls file.
products.orig <- read_csv("refine_original.csv")
products.norm <- products.orig # We'll clean up this version
head(products.orig)
## # A tibble: 6 x 6
## company `Product code / number` address city
## <chr> <chr> <chr> <chr>
## 1 Phillips p-5 Groningensingel 147 arnhem
## 2 phillips p-43 Groningensingel 148 arnhem
## 3 philips x-3 Groningensingel 149 arnhem
## 4 phllips x-34 Groningensingel 150 arnhem
## 5 phillps x-12 Groningensingel 151 arnhem
## 6 phillipS p-23 Groningensingel 152 arnhem
## # ... with 2 more variables: country <chr>, name <chr>
To correct any problems in the company names in the data, I compared entered names to a list of correct names. The entry with the most letters in common (in any order) was deemed the most correct. Then I substituted the correct name for the name in the data.
real.names <- c("philips", "akzo", "van houten", "unilever")
for(r in 1:dim(products.norm)[1]) {
products.norm[r, "company"] <-
real.names[which.min(adist(products.norm[r, "company"], real.names))]
}
head(products.norm)
## # A tibble: 6 x 6
## company `Product code / number` address city
## <chr> <chr> <chr> <chr>
## 1 philips p-5 Groningensingel 147 arnhem
## 2 philips p-43 Groningensingel 148 arnhem
## 3 philips x-3 Groningensingel 149 arnhem
## 4 philips x-34 Groningensingel 150 arnhem
## 5 philips x-12 Groningensingel 151 arnhem
## 6 philips p-23 Groningensingel 152 arnhem
## # ... with 2 more variables: country <chr>, name <chr>
Create a column for product code and a separate column for the number, then remove the original combined column.
pid <- products.norm$`Product code / number`
products.norm <- add_column(products.norm,
str_split_fixed(pid, "-", 2)[,1],
.after="company")
names(products.norm)[2] <- "code"
products.norm <- add_column(products.norm,
str_split_fixed(pid, "-", 2)[,2],
.after="code")
names(products.norm)[3] <- "number"
products.norm$`Product code / number` <- NULL
head(products.norm)
## # A tibble: 6 x 7
## company code number address city country
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 philips p 5 Groningensingel 147 arnhem the netherlands
## 2 philips p 43 Groningensingel 148 arnhem the netherlands
## 3 philips x 3 Groningensingel 149 arnhem the netherlands
## 4 philips x 34 Groningensingel 150 arnhem the netherlands
## 5 philips x 12 Groningensingel 151 arnhem the netherlands
## 6 philips p 23 Groningensingel 152 arnhem the netherlands
## # ... with 1 more variables: name <chr>
This uses the product code column as an index into a named vector to create a new column with the product categories.
category <- c("p" = "Smartphone",
"v" = "TV",
"x" = "Laptop",
"q" = "Tablet")
products.norm <- add_column(products.norm, category[products.norm$code],
.after="code")
names(products.norm)[3] <- "category"
head(products.norm)
## # A tibble: 6 x 8
## company code category number address city
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 philips p Smartphone 5 Groningensingel 147 arnhem
## 2 philips p Smartphone 43 Groningensingel 148 arnhem
## 3 philips x Laptop 3 Groningensingel 149 arnhem
## 4 philips x Laptop 34 Groningensingel 150 arnhem
## 5 philips x Laptop 12 Groningensingel 151 arnhem
## 6 philips p Smartphone 23 Groningensingel 152 arnhem
## # ... with 2 more variables: country <chr>, name <chr>
# used underscore in var name so the col name would have an underscore,
# and be consistent with the "dummy variable" columns we create next
full_address <- paste(products.norm$address,
products.norm$city,
products.norm$country, sep=", ")
products.norm <- add_column(products.norm, full_address)
head(products.norm)
## # A tibble: 6 x 9
## company code category number address city
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 philips p Smartphone 5 Groningensingel 147 arnhem
## 2 philips p Smartphone 43 Groningensingel 148 arnhem
## 3 philips x Laptop 3 Groningensingel 149 arnhem
## 4 philips x Laptop 34 Groningensingel 150 arnhem
## 5 philips x Laptop 12 Groningensingel 151 arnhem
## 6 philips p Smartphone 23 Groningensingel 152 arnhem
## # ... with 3 more variables: country <chr>, name <chr>, full_address <chr>
Create a column per company that is true if the row matches that company
for(company1 in real.names) {
colx <- paste0("company_", company1)
products.norm <- mutate(products.norm,
colx = (products.norm$company == company1))
names(products.norm)[dim(products.norm)[2]] <- colx
}
head(products.norm)
## # A tibble: 6 x 13
## company code category number address city
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 philips p Smartphone 5 Groningensingel 147 arnhem
## 2 philips p Smartphone 43 Groningensingel 148 arnhem
## 3 philips x Laptop 3 Groningensingel 149 arnhem
## 4 philips x Laptop 34 Groningensingel 150 arnhem
## 5 philips x Laptop 12 Groningensingel 151 arnhem
## 6 philips p Smartphone 23 Groningensingel 152 arnhem
## # ... with 7 more variables: country <chr>, name <chr>,
## # full_address <chr>, company_philips <lgl>, company_akzo <lgl>,
## # `company_van houten` <lgl>, company_unilever <lgl>
write_csv(products.norm, "refine_clean.csv")