title: “Assignment 2” author: “Anuranjan Saxena” date: “2023-11-21” output: html_document: toc: yes toc_float: yes theme: united highlight: tango

Clear the workspace

  rm(list = ls()) # Clear environment
  gc()            # Clear unused memory
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 509376 27.3    1133240 60.6   644240 34.5
## Vcells 901270  6.9    8388608 64.0  1635216 12.5
  cat("\f")       # Clear the console

Prepare needed libraries

    packages <- c("stringr")    
  for (i in 1:length(packages)) {
    if (!packages[i] %in% rownames(installed.packages())) {
      install.packages(packages[i]
                       , repos = "http://cran.rstudio.com/"
                       , dependencies = TRUE
                       )
    }
    library(packages[i], character.only = TRUE)
  }
## Warning: package 'stringr' was built under R version 4.2.3
  rm(packages)
  
  # Set working directory and path to data
  # setwd("")
  
  # Load data
  sales <- read.csv("C:/Users/91976/Desktop/Software tools/R/R week 2/r.hw02.csv"
                                    , check.names = FALSE
                                    , stringsAsFactors = FALSE
                                    , na.strings = "NA"
                                    )

  
  if (! nrow(sales) == 2417909) {
    stop("Wrong number of sales records! Make sure you have downloaded the data file correctly.")
  }
  
  str(sales)
## 'data.frame':    2417909 obs. of  16 variables:
##  $                     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Date                : chr  "08/16/2019" "08/19/2019" "08/19/2019" "08/19/2019" ...
##  $ Store Number        : int  2514 2561 2573 5360 2603 5669 5102 2662 5128 2592 ...
##  $ Store Name          : chr  "Hy-Vee Drugstore / Marion" "Hy-Vee Food Store / Fleur / DSM" "Hy-Vee Food Store / Muscatine" "Super Saver Liquor" ...
##  $ Address             : chr  "2790 7th Avenue" "4605 Fleur Drive" "2400 2nd Ave" "1610 Rockingham Rd" ...
##  $ City                : chr  "Marion" "Des Moines" "Muscatine" "Davenport" ...
##  $ Zip Code            : chr  "52302" "50321" "52761" "52802" ...
##  $ Store Location      : chr  "POINT (-91.582229 42.033186)" "POINT (-93.64494300000001 41.542748)" "POINT (-91.035138 41.451349)" "POINT (-90.599327 41.520231)" ...
##  $ County              : chr  "LINN" "POLK" "MUSCATINE" "SCOTT" ...
##  $ Category Name       : chr  "American Dry Gins" "American Brandies" "Imported Dry Gins" "Tequila, 100% Agave, GinBrandylike" ...
##  $ Item Description    : chr  "Burnetts Gin London Dry" "Paramount Blackberry Brandy" "Bombay Dry Gin" "Jose Cuervo Especial Silver" ...
##  $ Bottle Volume (ml)  : int  750 750 750 200 1750 375 1000 1750 750 1750 ...
##  $ State Bottle Retail : num  8.07 8.22 16.5 4.5 37.49 ...
##  $ Bottles Sold        : int  12 12 2 48 2 3 48 6 12 6 ...
##  $ Sale (Dollars)      : chr  "$ 96.84" "$ 98.64" "$ 33" "$ 216" ...
##  $ Volume Sold (Liters): num  9 9 1.5 9.6 3.5 1.12 48 10.5 9 10.5 ...
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
  stargazer(sales,
            
            type='text',
            title='Descriptive Statistics',   
            summary= TRUE,
            digits = 2,
            out = 'summary.text'
  )
## 
## Descriptive Statistics
## ======================================================================
## Statistic                N         Mean      St. Dev.   Min     Max   
## ----------------------------------------------------------------------
##                      2,417,909 1,208,955.00 697,990.30   1   2,417,909
## Store Number         2,417,909   3,668.79    1,022.44  2,106   9,929  
## Bottle Volume (ml)   2,417,909    767.54      420.20    50     3,000  
## State Bottle Retail  2,417,909    15.63       14.47    0.00  2,623.68 
## Bottles Sold         2,417,909     9.52       20.20      1     3,600  
## Volume Sold (Liters) 2,417,909     6.50       17.45    0.05  4,704.00 
## ----------------------------------------------------------------------

Rename variables

 colnames(sales)
##  [1] ""                     "Date"                 "Store Number"        
##  [4] "Store Name"           "Address"              "City"                
##  [7] "Zip Code"             "Store Location"       "County"              
## [10] "Category Name"        "Item Description"     "Bottle Volume (ml)"  
## [13] "State Bottle Retail"  "Bottles Sold"         "Sale (Dollars)"      
## [16] "Volume Sold (Liters)"
  sales <- sales[,-1]
  
  colnames(sales)
##  [1] "Date"                 "Store Number"         "Store Name"          
##  [4] "Address"              "City"                 "Zip Code"            
##  [7] "Store Location"       "County"               "Category Name"       
## [10] "Item Description"     "Bottle Volume (ml)"   "State Bottle Retail" 
## [13] "Bottles Sold"         "Sale (Dollars)"       "Volume Sold (Liters)"
  #Storing old names just in case they're needed somewhere else
  old.names <- colnames(sales)
  old.names  
##  [1] "Date"                 "Store Number"         "Store Name"          
##  [4] "Address"              "City"                 "Zip Code"            
##  [7] "Store Location"       "County"               "Category Name"       
## [10] "Item Description"     "Bottle Volume (ml)"   "State Bottle Retail" 
## [13] "Bottles Sold"         "Sale (Dollars)"       "Volume Sold (Liters)"
  new.names <- tolower(old.names)
  new.names
##  [1] "date"                 "store number"         "store name"          
##  [4] "address"              "city"                 "zip code"            
##  [7] "store location"       "county"               "category name"       
## [10] "item description"     "bottle volume (ml)"   "state bottle retail" 
## [13] "bottles sold"         "sale (dollars)"       "volume sold (liters)"
 #Assigning them to our data
  colnames(sales) <- new.names
  colnames(sales)
##  [1] "date"                 "store number"         "store name"          
##  [4] "address"              "city"                 "zip code"            
##  [7] "store location"       "county"               "category name"       
## [10] "item description"     "bottle volume (ml)"   "state bottle retail" 
## [13] "bottles sold"         "sale (dollars)"       "volume sold (liters)"
  colnames(sales) <- tolower(gsub(" |-", ".", colnames(sales)))  
  colnames(sales)
##  [1] "date"                 "store.number"         "store.name"          
##  [4] "address"              "city"                 "zip.code"            
##  [7] "store.location"       "county"               "category.name"       
## [10] "item.description"     "bottle.volume.(ml)"   "state.bottle.retail" 
## [13] "bottles.sold"         "sale.(dollars)"       "volume.sold.(liters)"
  head(sales)  #Checking if the varibales have been corrected
##         date store.number                           store.name
## 1 08/16/2019         2514            Hy-Vee Drugstore / Marion
## 2 08/19/2019         2561      Hy-Vee Food Store / Fleur / DSM
## 3 08/19/2019         2573        Hy-Vee Food Store / Muscatine
## 4 08/19/2019         5360                   Super Saver Liquor
## 5 08/16/2019         2603 Hy-Vee Wine and Spirits / Bettendorf
## 6 08/20/2019         5669                Guddi Mart / Waterloo
##                 address       city zip.code
## 1       2790 7th Avenue     Marion    52302
## 2      4605 Fleur Drive Des Moines    50321
## 3          2400 2nd Ave  Muscatine    52761
## 4    1610 Rockingham Rd  Davenport    52802
## 5 2890 Devils Glen Road Bettendorf    52722
## 6         306 Byron Ave   Waterloo    50702
##                         store.location     county
## 1         POINT (-91.582229 42.033186)       LINN
## 2 POINT (-93.64494300000001 41.542748)       POLK
## 3         POINT (-91.035138 41.451349)  MUSCATINE
## 4         POINT (-90.599327 41.520231)      SCOTT
## 5         POINT (-90.483701 41.552338)      SCOTT
## 6         POINT (-92.353141 42.480288) BLACK HAWK
##                        category.name            item.description
## 1                  American Dry Gins     Burnetts Gin London Dry
## 2                  American Brandies Paramount Blackberry Brandy
## 3                  Imported Dry Gins              Bombay Dry Gin
## 4 Tequila, 100% Agave, GinBrandylike Jose Cuervo Especial Silver
## 5                  Imported Dry Gins               Tanqueray Gin
## 6 Tequila, 100% Agave, GinBrandylike            Don Julio Blanco
##   bottle.volume.(ml) state.bottle.retail bottles.sold sale.(dollars)
## 1                750                8.07           12        $ 96.84
## 2                750                8.22           12        $ 98.64
## 3                750               16.50            2           $ 33
## 4                200                4.50           48          $ 216
## 5               1750               37.49            2        $ 74.98
## 6                375               21.74            3        $ 65.22
##   volume.sold.(liters)
## 1                 9.00
## 2                 9.00
## 3                 1.50
## 4                 9.60
## 5                 3.50
## 6                 1.12

Q2 - store registry

Q2.1 - Filter out unique store records and drop store info from main data

  # Select the necessary columns
  selected_columns <- c("store.number", "store.name", "address", "store.location", "city", "zip.code", "county")

  # Create the new dataframe 'stores' with unique combinations
  stores <- sales[, selected_columns]
  
  stores <- unique(stores)
  
  n_rows <- nrow(stores)
  n_unique_store_numbers <- length(unique(stores$store.number))
  
  # Printing results
  print(paste("Number of rows in 'stores':", n_rows))
## [1] "Number of rows in 'stores': 5207"
  print(paste("Number of unique values in 'store.number':", n_unique_store_numbers))
## [1] "Number of unique values in 'store.number': 2278"

Q2.2 - Filter out store GPS coordinates from location variable

  sales <- subset(sales, select = -c(store.name, address, store.location, city, zip.code, county))
  ncol(sales)
## [1] 9

Q2.3 - Drop location variable

  # Use strsplit to extract coordinates
  coordinates <- strsplit(stores$store.location, "[(), ]")

  # Extracting latitude and longitude
  latitude <- as.numeric(sapply(coordinates, function(x) as.numeric(x[3])))
  longitude <- as.numeric(sapply(coordinates, function(x) as.numeric(x[4])))
  
  # Add new variables to the 'stores' dataframe
  stores$latitude <- latitude
  stores$longitude <- longitude
  
  # Check the data types
  str(stores)
## 'data.frame':    5207 obs. of  9 variables:
##  $ store.number  : int  2514 2561 2573 5360 2603 5669 5102 2662 5128 2592 ...
##  $ store.name    : chr  "Hy-Vee Drugstore / Marion" "Hy-Vee Food Store / Fleur / DSM" "Hy-Vee Food Store / Muscatine" "Super Saver Liquor" ...
##  $ address       : chr  "2790 7th Avenue" "4605 Fleur Drive" "2400 2nd Ave" "1610 Rockingham Rd" ...
##  $ store.location: chr  "POINT (-91.582229 42.033186)" "POINT (-93.64494300000001 41.542748)" "POINT (-91.035138 41.451349)" "POINT (-90.599327 41.520231)" ...
##  $ city          : chr  "Marion" "Des Moines" "Muscatine" "Davenport" ...
##  $ zip.code      : chr  "52302" "50321" "52761" "52802" ...
##  $ county        : chr  "LINN" "POLK" "MUSCATINE" "SCOTT" ...
##  $ latitude      : num  -91.6 -93.6 -91 -90.6 -90.5 ...
##  $ longitude     : num  42 41.5 41.5 41.5 41.6 ...

Q2.4 - Average GPS coordinates for stores

  stores <- stores[, !(names(stores) %in% c("store.location"))]

Q2.5 - Removing duplicates

coordinates <- aggregate(cbind(latitude, longitude) ~ store.number, data = stores, FUN = mean)
  
  stores <- stores[, !(names(stores) %in% c("latitude", "longitude"))]
  
  stores <- merge(stores, coordinates, by = "store.number", all.x = TRUE)
  
  sum(is.na(coordinates))  # No NA
## [1] 0
  summary(coordinates)     
##   store.number     latitude        longitude    
##  Min.   :2106   Min.   :-96.56   Min.   :40.39  
##  1st Qu.:4132   1st Qu.:-94.02   1st Qu.:41.56  
##  Median :4745   Median :-93.11   Median :41.94  
##  Mean   :4625   Mean   :-93.09   Mean   :42.00  
##  3rd Qu.:5312   3rd Qu.:-91.67   3rd Qu.:42.50  
##  Max.   :9929   Max.   :-90.18   Max.   :45.01
  summary(stores$latitude) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -96.56  -94.03  -93.22  -93.12  -91.67  -90.18     344
#  stores <- unique(stores$store.number)


  sum(is.na(coordinates)) 
## [1] 0
  summary(coordinates)
##   store.number     latitude        longitude    
##  Min.   :2106   Min.   :-96.56   Min.   :40.39  
##  1st Qu.:4132   1st Qu.:-94.02   1st Qu.:41.56  
##  Median :4745   Median :-93.11   Median :41.94  
##  Mean   :4625   Mean   :-93.09   Mean   :42.00  
##  3rd Qu.:5312   3rd Qu.:-91.67   3rd Qu.:42.50  
##  Max.   :9929   Max.   :-90.18   Max.   :45.01
  summary(stores$latitude)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -96.56  -94.03  -93.22  -93.12  -91.67  -90.18     344

Q2.6 - Fix address, city and county names

  stores <- unique(stores)
  
  dim(stores)
## [1] 5192    8

Q2.7 - Remove duplicates

  stores <- merge(x = stores[, !(names(stores) %in% c("latitude", "longitude"))],
                  y = coordinates,
                  by = "store.number",
                  all.x = TRUE)
  
  # Keeping only unique rows in the merged 'stores' data
  stores <- unique(stores)
  
  # Fixing address, city and county names
  
  stores$address <- gsub(pattern     = "[.,]" ,  
                         replacement = "" ,  
                         x           = stores$address
  ) 
  
  stores[, c("address", "city", "county")] <- sapply(X = stores[, c("address", 
                                                                    "city",
                                                                    "county")
  ], 
  FUN = str_to_title 
  )
  
  head(stores[, c("address", "city", "county")])  
##                 address        city     county
## 1          2217 College Cedar Falls Black Hawk
## 2          2217 College Cedar Falls Black Hawk
## 3          2217 College Cedar Falls Black Hawk
## 4 1119 Market St Box 71      Gowrie    Webster
## 5 1119 Market St Box 71      Gowrie    Webster
## 6 1119 Market St Box 71      Gowrie    Webster

Q2.8 - Fill in missing country names

  dim(stores)
## [1] 5192    8

Q2.9 - Remove duplicates

#Some of the stores with the same store.number are recorded twice because one of the
#records is missing county value

  counties <- unique(  (stores[! is.na(stores$county), 
                               c("store.number", "county")
  ]
  )
  ) 
  
  stores <- merge( x       = stores[ , !colnames(stores) %in% c("county")] , 
                   y       = counties , 
                   by      = c("store.number") , 
                   all.x   = TRUE ) 
  
  rm(counties)
  
  stores <- unique(stores)
  
  dim(stores)
## [1] 2792    8
  stores <- unique(stores)

Q2.11

  stores$dup <- as.integer(duplicated(stores$store.number) | duplicated(stores$store.number, fromLast = TRUE))

  total_dup_1 <- sum(stores$dup == 1)
  

  cat("Total rows with dup = 1:", total_dup_1, "\n")
## Total rows with dup = 1: 905

Q2.11 - Check for proper zip/city/county combinations

    geo <- read.csv("C:/Users/91976/Desktop/Software tools/R/R week 2/iowa.geographies.csv"
                    , check.names = FALSE
                    , stringsAsFactors = FALSE
                    , na.strings = "NA"
                    )

    
    geo$match <- 1L
    
    stores <- merge(x = stores,y = geo , by.x = c("zip.code", "city", "county") , by.y = c("zipcode", "city", "county") , 
                    all.x = TRUE, 
                    all.y = FALSE 
    )
    
    sum(is.na(stores$match))
## [1] 382

Q3 cleaning sales data

Q3.1 - Convert sales and prices to proper format

# Remove $ character and convert to numeric for state.bottle.retail
sales$state.bottle.retail <- as.numeric(gsub("[^0-9.]", "", sales$state.bottle.retail))

Q3.2 - Create subcategory variable

    sales$subcategory <- sales$category.name
    
    sales$category.name <- NULL

Q3.3 - Create new category variable

    # Create an empty (NA) variable category
    sales$category <- NA
    
    # Match Tequila
    sales$category[grepl("\\btequila\\b", sales$subcategory, ignore.case = TRUE)] <- "Tequila"
    
    # Match Gin
    sales$category[grepl("\\bgin\\b|\\bgins\\b", sales$subcategory, ignore.case = TRUE)] <- "Gin"
    
    # Match Brandy
    sales$category[grepl("\\bbrandy\\b|\\bbrandies\\b", sales$subcategory, ignore.case = TRUE)] <- "Brandy"
    
    # Check the number of records per category
    table(sales$category)
## 
##  Brandy     Gin Tequila 
##  960725  648208  808976

Q4 - export cleand data

  # Export cleaned data in sales
  write.csv(sales, file = "sales.csv", row.names = FALSE)

  # Export cleaned data in stores
  write.csv(stores, file = "stores.csv", row.names = FALSE)
  
  # Check the size of the resulting files
  file.info("sales.csv")$size
## [1] 245159443
  file.info("stores.csv")$size
## [1] 297573