Overview

This is an exercise to import an Excel file and clean it up.

  1. Import the data
  2. Clean up brand names
  3. Separate product code and product number
  4. Add product categories
  5. Add full address for geocoding
  6. Create dummy variables for company and product category
  7. Save results as refine_clean.csv

Import data set

This loads a CSV file. This could be improved by directly importing an Excel xls file.

products.orig <- read_csv("refine_original.csv")
products.norm <- products.orig  # We'll clean up this version
head(products.orig)
## # A tibble: 6 x 6
##    company `Product code / number`             address   city
##      <chr>                   <chr>               <chr>  <chr>
## 1 Phillips                     p-5 Groningensingel 147 arnhem
## 2 phillips                    p-43 Groningensingel 148 arnhem
## 3  philips                     x-3 Groningensingel 149 arnhem
## 4  phllips                    x-34 Groningensingel 150 arnhem
## 5  phillps                    x-12 Groningensingel 151 arnhem
## 6 phillipS                    p-23 Groningensingel 152 arnhem
## # ... with 2 more variables: country <chr>, name <chr>

Normalize company names

To correct any problems in the company names in the data, I compared entered names to a list of correct names. The entry with the most letters in common (in any order) was deemed the most correct. Then I substituted the correct name for the name in the data.

real.names <- c("philips", "akzo", "van houten", "unilever")
for(r in 1:dim(products.norm)[1]) {
  products.norm[r, "company"] <- 
    real.names[which.min(adist(products.norm[r, "company"], real.names))]
}
head(products.norm)
## # A tibble: 6 x 6
##   company `Product code / number`             address   city
##     <chr>                   <chr>               <chr>  <chr>
## 1 philips                     p-5 Groningensingel 147 arnhem
## 2 philips                    p-43 Groningensingel 148 arnhem
## 3 philips                     x-3 Groningensingel 149 arnhem
## 4 philips                    x-34 Groningensingel 150 arnhem
## 5 philips                    x-12 Groningensingel 151 arnhem
## 6 philips                    p-23 Groningensingel 152 arnhem
## # ... with 2 more variables: country <chr>, name <chr>

Separate product code and number

Create a column for product code and a separate column for the number, then remove the original combined column.

pid <- products.norm$`Product code / number`
products.norm <- add_column(products.norm, 
                            str_split_fixed(pid, "-", 2)[,1],
                            .after="company")
names(products.norm)[2] <- "code"
products.norm <- add_column(products.norm, 
                            str_split_fixed(pid, "-", 2)[,2],
                            .after="code")
names(products.norm)[3] <- "number"
products.norm$`Product code / number` <- NULL
head(products.norm)
## # A tibble: 6 x 7
##   company  code number             address   city         country
##     <chr> <chr>  <chr>               <chr>  <chr>           <chr>
## 1 philips     p      5 Groningensingel 147 arnhem the netherlands
## 2 philips     p     43 Groningensingel 148 arnhem the netherlands
## 3 philips     x      3 Groningensingel 149 arnhem the netherlands
## 4 philips     x     34 Groningensingel 150 arnhem the netherlands
## 5 philips     x     12 Groningensingel 151 arnhem the netherlands
## 6 philips     p     23 Groningensingel 152 arnhem the netherlands
## # ... with 1 more variables: name <chr>

Add product categories

This uses the product code column as an index into a named vector to create a new column with the product categories.

category <- c("p" = "Smartphone",
              "v" = "TV",
              "x" = "Laptop",
              "q" = "Tablet")
products.norm <- add_column(products.norm, category[products.norm$code], 
                            .after="code")
names(products.norm)[3] <- "category"
head(products.norm)
## # A tibble: 6 x 8
##   company  code   category number             address   city
##     <chr> <chr>      <chr>  <chr>               <chr>  <chr>
## 1 philips     p Smartphone      5 Groningensingel 147 arnhem
## 2 philips     p Smartphone     43 Groningensingel 148 arnhem
## 3 philips     x     Laptop      3 Groningensingel 149 arnhem
## 4 philips     x     Laptop     34 Groningensingel 150 arnhem
## 5 philips     x     Laptop     12 Groningensingel 151 arnhem
## 6 philips     p Smartphone     23 Groningensingel 152 arnhem
## # ... with 2 more variables: country <chr>, name <chr>

Add full address for geocoding

# used underscore in var name so the col name would have an underscore,
# and be consistent with the "dummy variable" columns we create next
full_address <- paste(products.norm$address,
                       products.norm$city,
                       products.norm$country, sep=", ")
products.norm <- add_column(products.norm, full_address)
head(products.norm)
## # A tibble: 6 x 9
##   company  code   category number             address   city
##     <chr> <chr>      <chr>  <chr>               <chr>  <chr>
## 1 philips     p Smartphone      5 Groningensingel 147 arnhem
## 2 philips     p Smartphone     43 Groningensingel 148 arnhem
## 3 philips     x     Laptop      3 Groningensingel 149 arnhem
## 4 philips     x     Laptop     34 Groningensingel 150 arnhem
## 5 philips     x     Laptop     12 Groningensingel 151 arnhem
## 6 philips     p Smartphone     23 Groningensingel 152 arnhem
## # ... with 3 more variables: country <chr>, name <chr>, full_address <chr>

Create dummy variables for company and product category

Create a column per company that is true if the row matches that company

for(company1 in real.names) {
  colx <- paste0("company_", company1)
  products.norm <- mutate(products.norm, 
                          colx = (products.norm$company == company1))
  names(products.norm)[dim(products.norm)[2]] <- colx
}
head(products.norm)
## # A tibble: 6 x 13
##   company  code   category number             address   city
##     <chr> <chr>      <chr>  <chr>               <chr>  <chr>
## 1 philips     p Smartphone      5 Groningensingel 147 arnhem
## 2 philips     p Smartphone     43 Groningensingel 148 arnhem
## 3 philips     x     Laptop      3 Groningensingel 149 arnhem
## 4 philips     x     Laptop     34 Groningensingel 150 arnhem
## 5 philips     x     Laptop     12 Groningensingel 151 arnhem
## 6 philips     p Smartphone     23 Groningensingel 152 arnhem
## # ... with 7 more variables: country <chr>, name <chr>,
## #   full_address <chr>, company_philips <lgl>, company_akzo <lgl>,
## #   `company_van houten` <lgl>, company_unilever <lgl>

Save results as refine_clean.csv

write_csv(products.norm, "refine_clean.csv")