# Global knitr chunk option: echo each chunk's source code in the knitted
# document alongside its output.
knitr::opts_chunk$set(echo = TRUE)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

1 Original Data Loading

# Load the three raw CSV inputs into data frames.
#
# Spaces inside an R string literal must NOT be backslash-escaped: "\ " is not
# a recognised escape sequence and makes the script fail at parse time
# ("'\ ' is an unrecognized escape in character string"). The paths below
# therefore contain plain spaces.
# NOTE(review): these are absolute, machine-specific paths; the script only
# runs where this directory layout exists.
df_brand_tags <-
  read.csv("/Users/apple/Quantitative Marketing Research/EDA I/EDA I Data/df_brand_tags.csv")

df_projects <-
  read.csv("/Users/apple/Quantitative Marketing Research/EDA I/EDA I Data/df_projects.csv")

df_users_thru_jan23 <-
  read.csv("/Users/apple/Quantitative Marketing Research/EDA I/EDA I Data/df_users_thru_jan23.csv")

2 Covariate Data Processing in R

# Join keys: every column name that appears in both input tables.
common_columns <- intersect(names(df_brand_tags), names(df_projects))

# Full outer join on the shared columns, so rows present in only one of the
# two tables are kept (with NA in the other table's columns).
JA_Cov <- full_join(df_brand_tags, df_projects, by = common_columns)

# Drop the two columns that are not carried forward.
JA_Cov <- select(JA_Cov, -c(project_id.1, name))

# Rename the first 16 columns BY POSITION.
# NOTE(review): this relies on the joined table's column order matching the
# list below -- confirm against the input CSVs if their layout changes.
names(JA_Cov)[1:16] <- c(
  "proj_id", "proj_loc_id", "proj_loc", "tag", "tag_category",
  "category_desc", "is_featured", "address", "longitude", "latitude",
  "city", "state", "proj_created", "proj", "zip", "timezone"
)

# Keep exactly these 16 columns, reordered: identifiers first, then tag
# fields, then address parts, then coordinates and timezone.
JA_Cov <- JA_Cov[
  ,
  c(
    "proj_loc", "proj_loc_id", "proj", "proj_id", "proj_created",
    "is_featured", "tag", "tag_category", "category_desc",
    "address", "city", "state", "zip",
    "longitude", "latitude", "timezone"
  )
]

# Coerce each column to its analysis type in a single pass. The conversions
# are independent of one another, so grouping them does not change the result.
JA_Cov <- JA_Cov |>
  mutate(
    # Parse the creation value as a Date using a year-month-day pattern.
    proj_created = as.Date(proj_created, format = "%Y-%m-%d"),
    # Names, identifiers and the tag are handled as plain text.
    across(c(proj, proj_id, proj_loc_id, proj_loc, tag), as.character),
    # Integer for now; it is turned into a factor later, once NAs are filled.
    is_featured = as.integer(is_featured),
    # Categorical fields.
    across(c(tag_category, category_desc, timezone), as.factor)
  )

# Keep only rows where BOTH coordinates are present.
# Each column is tested on its own: the previous form,
# is.na(longitude & latitude), coerced the coordinates to logical, and because
# `FALSE & NA` evaluates to FALSE (not NA), a row with one coordinate exactly
# 0 and the other missing slipped through the filter.
JA_Cov <- filter(JA_Cov, !is.na(longitude), !is.na(latitude))

# Build a single comma-separated address string from its four parts.
# NOTE(review): paste() renders a missing part as the literal text "NA";
# confirm that is acceptable for whatever consumes full_address downstream.
JA_Cov$full_address <-
  paste(JA_Cov$address, JA_Cov$city, JA_Cov$state, JA_Cov$zip, sep = ", ")

# Treat a missing is_featured flag as 0, then store the flag as a factor.
JA_Cov$is_featured[is.na(JA_Cov$is_featured)] <- 0L
JA_Cov$is_featured <- as.factor(JA_Cov$is_featured)

# Print per-column summaries of the cleaned covariate table.
summary(JA_Cov)
##    proj_loc         proj_loc_id            proj             proj_id         
##  Length:2179        Length:2179        Length:2179        Length:2179       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   proj_created        is_featured     tag            tag_category
##  Min.   :2017-05-17   0: 358      Length:2179        0   :   8   
##  1st Qu.:2021-05-12   1:1821      Class :character   1   :  17   
##  Median :2022-04-21               Mode  :character   2   :1313   
##  Mean   :2022-01-16                                  3   : 470   
##  3rd Qu.:2022-09-14                                  4   :  13   
##  Max.   :2023-02-15                                  NA's: 358   
##                                                                  
##        category_desc    address              city              state          
##  awards       :  17   Length:2179        Length:2179        Length:2179       
##  cuisine_type :1313   Class :character   Class :character   Class :character  
##  mood         : 470   Mode  :character   Mode  :character   Mode  :character  
##  ownership    :  13                                                           
##  uncategorized:   8                                                           
##  NA's         : 358                                                           
##                                                                               
##      zip              longitude          latitude    
##  Length:2179        Min.   :-157.83   Min.   :21.28  
##  Class :character   1st Qu.:-118.25   1st Qu.:34.07  
##  Mode  :character   Median : -84.34   Median :38.91  
##                     Mean   : -92.58   Mean   :37.60  
##                     3rd Qu.: -74.03   3rd Qu.:40.75  
##                     Max.   : -71.04   Max.   :61.19  
##                                                      
##                 timezone    full_address      
##  America/New_York   :1146   Length:2179       
##  America/Los_Angeles: 645   Class :character  
##  America/Chicago    : 320   Mode  :character  
##  America/Phoenix    :  14                     
##  America/Denver     :  10                     
##  (Other)            :  21                     
##  NA's               :  23
# JA_Cov <- st_as_sf(JA_Cov, coords = c("longitude", "latitude"), crs = 4326)

The remaining analyses are performed in Python. Please refer to "Statistical Modeling VII b - Data Augmentation of Project Locations with Yelp API in Python".