title: “Assignment 2” author: “Anuranjan Saxena” date: “2023-11-21”
output: html_document: toc: yes toc_float: yes theme: united highlight:
tango
—
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 509376 27.3 1133240 60.6 644240 34.5
## Vcells 901270 6.9 8388608 64.0 1635216 12.5
packages <- c("stringr")
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i]
, repos = "http://cran.rstudio.com/"
, dependencies = TRUE
)
}
library(packages[i], character.only = TRUE)
}## Warning: package 'stringr' was built under R version 4.2.3
rm(packages)
# Set working directory and path to data
# setwd("")
# Load data
sales <- read.csv("C:/Users/91976/Desktop/Software tools/R/R week 2/r.hw02.csv"
, check.names = FALSE
, stringsAsFactors = FALSE
, na.strings = "NA"
)
if (! nrow(sales) == 2417909) {
stop("Wrong number of sales records! Make sure you have downloaded the data file correctly.")
}
str(sales)## 'data.frame': 2417909 obs. of 16 variables:
## $ : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Date : chr "08/16/2019" "08/19/2019" "08/19/2019" "08/19/2019" ...
## $ Store Number : int 2514 2561 2573 5360 2603 5669 5102 2662 5128 2592 ...
## $ Store Name : chr "Hy-Vee Drugstore / Marion" "Hy-Vee Food Store / Fleur / DSM" "Hy-Vee Food Store / Muscatine" "Super Saver Liquor" ...
## $ Address : chr "2790 7th Avenue" "4605 Fleur Drive" "2400 2nd Ave" "1610 Rockingham Rd" ...
## $ City : chr "Marion" "Des Moines" "Muscatine" "Davenport" ...
## $ Zip Code : chr "52302" "50321" "52761" "52802" ...
## $ Store Location : chr "POINT (-91.582229 42.033186)" "POINT (-93.64494300000001 41.542748)" "POINT (-91.035138 41.451349)" "POINT (-90.599327 41.520231)" ...
## $ County : chr "LINN" "POLK" "MUSCATINE" "SCOTT" ...
## $ Category Name : chr "American Dry Gins" "American Brandies" "Imported Dry Gins" "Tequila, 100% Agave, GinBrandylike" ...
## $ Item Description : chr "Burnetts Gin London Dry" "Paramount Blackberry Brandy" "Bombay Dry Gin" "Jose Cuervo Especial Silver" ...
## $ Bottle Volume (ml) : int 750 750 750 200 1750 375 1000 1750 750 1750 ...
## $ State Bottle Retail : num 8.07 8.22 16.5 4.5 37.49 ...
## $ Bottles Sold : int 12 12 2 48 2 3 48 6 12 6 ...
## $ Sale (Dollars) : chr "$ 96.84" "$ 98.64" "$ 33" "$ 216" ...
## $ Volume Sold (Liters): num 9 9 1.5 9.6 3.5 1.12 48 10.5 9 10.5 ...
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(sales,
type='text',
title='Descriptive Statistics',
summary= TRUE,
digits = 2,
out = 'summary.text'
)##
## Descriptive Statistics
## ======================================================================
## Statistic N Mean St. Dev. Min Max
## ----------------------------------------------------------------------
## 2,417,909 1,208,955.00 697,990.30 1 2,417,909
## Store Number 2,417,909 3,668.79 1,022.44 2,106 9,929
## Bottle Volume (ml) 2,417,909 767.54 420.20 50 3,000
## State Bottle Retail 2,417,909 15.63 14.47 0.00 2,623.68
## Bottles Sold 2,417,909 9.52 20.20 1 3,600
## Volume Sold (Liters) 2,417,909 6.50 17.45 0.05 4,704.00
## ----------------------------------------------------------------------
## [1] "" "Date" "Store Number"
## [4] "Store Name" "Address" "City"
## [7] "Zip Code" "Store Location" "County"
## [10] "Category Name" "Item Description" "Bottle Volume (ml)"
## [13] "State Bottle Retail" "Bottles Sold" "Sale (Dollars)"
## [16] "Volume Sold (Liters)"
## [1] "Date" "Store Number" "Store Name"
## [4] "Address" "City" "Zip Code"
## [7] "Store Location" "County" "Category Name"
## [10] "Item Description" "Bottle Volume (ml)" "State Bottle Retail"
## [13] "Bottles Sold" "Sale (Dollars)" "Volume Sold (Liters)"
#Storing old names just in case they're needed somewhere else
old.names <- colnames(sales)
old.names ## [1] "Date" "Store Number" "Store Name"
## [4] "Address" "City" "Zip Code"
## [7] "Store Location" "County" "Category Name"
## [10] "Item Description" "Bottle Volume (ml)" "State Bottle Retail"
## [13] "Bottles Sold" "Sale (Dollars)" "Volume Sold (Liters)"
## [1] "date" "store number" "store name"
## [4] "address" "city" "zip code"
## [7] "store location" "county" "category name"
## [10] "item description" "bottle volume (ml)" "state bottle retail"
## [13] "bottles sold" "sale (dollars)" "volume sold (liters)"
## [1] "date" "store number" "store name"
## [4] "address" "city" "zip code"
## [7] "store location" "county" "category name"
## [10] "item description" "bottle volume (ml)" "state bottle retail"
## [13] "bottles sold" "sale (dollars)" "volume sold (liters)"
## [1] "date" "store.number" "store.name"
## [4] "address" "city" "zip.code"
## [7] "store.location" "county" "category.name"
## [10] "item.description" "bottle.volume.(ml)" "state.bottle.retail"
## [13] "bottles.sold" "sale.(dollars)" "volume.sold.(liters)"
## date store.number store.name
## 1 08/16/2019 2514 Hy-Vee Drugstore / Marion
## 2 08/19/2019 2561 Hy-Vee Food Store / Fleur / DSM
## 3 08/19/2019 2573 Hy-Vee Food Store / Muscatine
## 4 08/19/2019 5360 Super Saver Liquor
## 5 08/16/2019 2603 Hy-Vee Wine and Spirits / Bettendorf
## 6 08/20/2019 5669 Guddi Mart / Waterloo
## address city zip.code
## 1 2790 7th Avenue Marion 52302
## 2 4605 Fleur Drive Des Moines 50321
## 3 2400 2nd Ave Muscatine 52761
## 4 1610 Rockingham Rd Davenport 52802
## 5 2890 Devils Glen Road Bettendorf 52722
## 6 306 Byron Ave Waterloo 50702
## store.location county
## 1 POINT (-91.582229 42.033186) LINN
## 2 POINT (-93.64494300000001 41.542748) POLK
## 3 POINT (-91.035138 41.451349) MUSCATINE
## 4 POINT (-90.599327 41.520231) SCOTT
## 5 POINT (-90.483701 41.552338) SCOTT
## 6 POINT (-92.353141 42.480288) BLACK HAWK
## category.name item.description
## 1 American Dry Gins Burnetts Gin London Dry
## 2 American Brandies Paramount Blackberry Brandy
## 3 Imported Dry Gins Bombay Dry Gin
## 4 Tequila, 100% Agave, GinBrandylike Jose Cuervo Especial Silver
## 5 Imported Dry Gins Tanqueray Gin
## 6 Tequila, 100% Agave, GinBrandylike Don Julio Blanco
## bottle.volume.(ml) state.bottle.retail bottles.sold sale.(dollars)
## 1 750 8.07 12 $ 96.84
## 2 750 8.22 12 $ 98.64
## 3 750 16.50 2 $ 33
## 4 200 4.50 48 $ 216
## 5 1750 37.49 2 $ 74.98
## 6 375 21.74 3 $ 65.22
## volume.sold.(liters)
## 1 9.00
## 2 9.00
## 3 1.50
## 4 9.60
## 5 3.50
## 6 1.12
# Select the necessary columns
selected_columns <- c("store.number", "store.name", "address", "store.location", "city", "zip.code", "county")
# Create the new dataframe 'stores' with unique combinations
stores <- sales[, selected_columns]
stores <- unique(stores)
n_rows <- nrow(stores)
n_unique_store_numbers <- length(unique(stores$store.number))
# Printing results
print(paste("Number of rows in 'stores':", n_rows))## [1] "Number of rows in 'stores': 5207"
## [1] "Number of unique values in 'store.number': 2278"
sales <- subset(sales, select = -c(store.name, address, store.location, city, zip.code, county))
ncol(sales)## [1] 9
# Use strsplit to extract coordinates
coordinates <- strsplit(stores$store.location, "[(), ]")
# Extracting latitude and longitude
latitude <- as.numeric(sapply(coordinates, function(x) as.numeric(x[3])))
longitude <- as.numeric(sapply(coordinates, function(x) as.numeric(x[4])))
# Add new variables to the 'stores' dataframe
stores$latitude <- latitude
stores$longitude <- longitude
# Check the data types
str(stores)## 'data.frame': 5207 obs. of 9 variables:
## $ store.number : int 2514 2561 2573 5360 2603 5669 5102 2662 5128 2592 ...
## $ store.name : chr "Hy-Vee Drugstore / Marion" "Hy-Vee Food Store / Fleur / DSM" "Hy-Vee Food Store / Muscatine" "Super Saver Liquor" ...
## $ address : chr "2790 7th Avenue" "4605 Fleur Drive" "2400 2nd Ave" "1610 Rockingham Rd" ...
## $ store.location: chr "POINT (-91.582229 42.033186)" "POINT (-93.64494300000001 41.542748)" "POINT (-91.035138 41.451349)" "POINT (-90.599327 41.520231)" ...
## $ city : chr "Marion" "Des Moines" "Muscatine" "Davenport" ...
## $ zip.code : chr "52302" "50321" "52761" "52802" ...
## $ county : chr "LINN" "POLK" "MUSCATINE" "SCOTT" ...
## $ latitude : num -91.6 -93.6 -91 -90.6 -90.5 ...
## $ longitude : num 42 41.5 41.5 41.5 41.6 ...
coordinates <- aggregate(cbind(latitude, longitude) ~ store.number, data = stores, FUN = mean)
stores <- stores[, !(names(stores) %in% c("latitude", "longitude"))]
stores <- merge(stores, coordinates, by = "store.number", all.x = TRUE)
sum(is.na(coordinates)) # No NA## [1] 0
## store.number latitude longitude
## Min. :2106 Min. :-96.56 Min. :40.39
## 1st Qu.:4132 1st Qu.:-94.02 1st Qu.:41.56
## Median :4745 Median :-93.11 Median :41.94
## Mean :4625 Mean :-93.09 Mean :42.00
## 3rd Qu.:5312 3rd Qu.:-91.67 3rd Qu.:42.50
## Max. :9929 Max. :-90.18 Max. :45.01
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -96.56 -94.03 -93.22 -93.12 -91.67 -90.18 344
## [1] 0
## store.number latitude longitude
## Min. :2106 Min. :-96.56 Min. :40.39
## 1st Qu.:4132 1st Qu.:-94.02 1st Qu.:41.56
## Median :4745 Median :-93.11 Median :41.94
## Mean :4625 Mean :-93.09 Mean :42.00
## 3rd Qu.:5312 3rd Qu.:-91.67 3rd Qu.:42.50
## Max. :9929 Max. :-90.18 Max. :45.01
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -96.56 -94.03 -93.22 -93.12 -91.67 -90.18 344
stores <- merge(x = stores[, !(names(stores) %in% c("latitude", "longitude"))],
y = coordinates,
by = "store.number",
all.x = TRUE)
# Keeping only unique rows in the merged 'stores' data
stores <- unique(stores)
# Fixing address, city and county names
stores$address <- gsub(pattern = "[.,]" ,
replacement = "" ,
x = stores$address
)
stores[, c("address", "city", "county")] <- sapply(X = stores[, c("address",
"city",
"county")
],
FUN = str_to_title
)
head(stores[, c("address", "city", "county")]) ## address city county
## 1 2217 College Cedar Falls Black Hawk
## 2 2217 College Cedar Falls Black Hawk
## 3 2217 College Cedar Falls Black Hawk
## 4 1119 Market St Box 71 Gowrie Webster
## 5 1119 Market St Box 71 Gowrie Webster
## 6 1119 Market St Box 71 Gowrie Webster
#Some of the stores with the same store.number are recorded twice because one of the
#records is missing county value
counties <- unique( (stores[! is.na(stores$county),
c("store.number", "county")
]
)
)
stores <- merge( x = stores[ , !colnames(stores) %in% c("county")] ,
y = counties ,
by = c("store.number") ,
all.x = TRUE )
rm(counties)
stores <- unique(stores)
dim(stores)## [1] 2792 8
stores$dup <- as.integer(duplicated(stores$store.number) | duplicated(stores$store.number, fromLast = TRUE))
total_dup_1 <- sum(stores$dup == 1)
cat("Total rows with dup = 1:", total_dup_1, "\n")## Total rows with dup = 1: 905
geo <- read.csv("C:/Users/91976/Desktop/Software tools/R/R week 2/iowa.geographies.csv"
, check.names = FALSE
, stringsAsFactors = FALSE
, na.strings = "NA"
)
geo$match <- 1L
stores <- merge(x = stores,y = geo , by.x = c("zip.code", "city", "county") , by.y = c("zipcode", "city", "county") ,
all.x = TRUE,
all.y = FALSE
)
sum(is.na(stores$match))## [1] 382
# Create an empty (NA) variable category
sales$category <- NA
# Match Tequila
sales$category[grepl("\\btequila\\b", sales$subcategory, ignore.case = TRUE)] <- "Tequila"
# Match Gin
sales$category[grepl("\\bgin\\b|\\bgins\\b", sales$subcategory, ignore.case = TRUE)] <- "Gin"
# Match Brandy
sales$category[grepl("\\bbrandy\\b|\\bbrandies\\b", sales$subcategory, ignore.case = TRUE)] <- "Brandy"
# Check the number of records per category
table(sales$category)##
## Brandy Gin Tequila
## 960725 648208 808976