knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the three raw CSV extracts used by this script.
# The directory names contain spaces; inside an R string a space is written
# as-is. A backslash-space pair is not a recognised escape sequence and makes
# the parser reject the whole script, so the paths use plain spaces.
df_brand_tags <-
  read.csv("/Users/apple/Quantitative Marketing Research/EDA I/EDA I Data/df_brand_tags.csv")
df_projects <-
  read.csv("/Users/apple/Quantitative Marketing Research/EDA I/EDA I Data/df_projects.csv")
df_users_thru_jan23 <-
  read.csv("/Users/apple/Quantitative Marketing Research/EDA I/EDA I Data/df_users_thru_jan23.csv")
# Join keys: every column name that the two tables have in common.
common_columns <- intersect(names(df_brand_tags), names(df_projects))

# Full join keeps unmatched rows from both tables; the two columns that are
# not wanted downstream are then dropped.
JA_Cov <- df_brand_tags |>
  full_join(df_projects, by = common_columns) |>
  select(-project_id.1, -name)

# Rename the first 16 columns by position.
# NOTE(review): this relies on the column order produced by the join above;
# confirm it if either input file changes shape.
colnames(JA_Cov)[1:16] <- c(
  "proj_id", "proj_loc_id", "proj_loc", "tag", "tag_category",
  "category_desc", "is_featured", "address", "longitude", "latitude",
  "city", "state", "proj_created", "proj", "zip", "timezone"
)

# Reorder: project identifiers first, then tag fields, then address parts,
# then coordinates and timezone.
JA_Cov <- JA_Cov[, c(
  "proj_loc", "proj_loc_id", "proj", "proj_id", "proj_created",
  "is_featured", "tag", "tag_category", "category_desc",
  "address", "city", "state", "zip",
  "longitude", "latitude", "timezone"
)]
# Coerce each column to the type used in the rest of the script:
#   - proj_created           -> Date, parsed as year-month-day
#   - names, ids and tag     -> character
#   - is_featured            -> integer
#   - tag_category, category_desc, timezone -> factor
JA_Cov <- JA_Cov |>
  mutate(
    proj_created = as.Date(proj_created, format = "%Y-%m-%d"),
    across(c(proj, proj_id, proj_loc_id, proj_loc, tag), as.character),
    is_featured = as.integer(is_featured),
    across(c(tag_category, category_desc, timezone), as.factor)
  )
# Keep only rows where both coordinates are present.
# Each column is tested on its own: `longitude & latitude` coerces the numbers
# to logical, and FALSE & NA evaluates to FALSE rather than NA, so a row with
# one coordinate missing and the other exactly 0 would pass an
# is.na(longitude & latitude) check.
JA_Cov <- filter(JA_Cov, !is.na(longitude), !is.na(latitude))

# Single comma-separated address string built from the four address parts.
# NOTE(review): paste() renders a missing part as the text "NA"; confirm that
# is acceptable for whatever consumes full_address.
JA_Cov$full_address <-
  paste(JA_Cov$address, JA_Cov$city, JA_Cov$state, JA_Cov$zip, sep = ", ")

# Treat a missing is_featured flag as 0, then store the flag as a factor.
JA_Cov$is_featured[is.na(JA_Cov$is_featured)] <- 0
JA_Cov$is_featured <- as.factor(JA_Cov$is_featured)

summary(JA_Cov)
## proj_loc proj_loc_id proj proj_id
## Length:2179 Length:2179 Length:2179 Length:2179
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## proj_created is_featured tag tag_category
## Min. :2017-05-17 0: 358 Length:2179 0 : 8
## 1st Qu.:2021-05-12 1:1821 Class :character 1 : 17
## Median :2022-04-21 Mode :character 2 :1313
## Mean :2022-01-16 3 : 470
## 3rd Qu.:2022-09-14 4 : 13
## Max. :2023-02-15 NA's: 358
##
## category_desc address city state
## awards : 17 Length:2179 Length:2179 Length:2179
## cuisine_type :1313 Class :character Class :character Class :character
## mood : 470 Mode :character Mode :character Mode :character
## ownership : 13
## uncategorized: 8
## NA's : 358
##
## zip longitude latitude
## Length:2179 Min. :-157.83 Min. :21.28
## Class :character 1st Qu.:-118.25 1st Qu.:34.07
## Mode :character Median : -84.34 Median :38.91
## Mean : -92.58 Mean :37.60
## 3rd Qu.: -74.03 3rd Qu.:40.75
## Max. : -71.04 Max. :61.19
##
## timezone full_address
## America/New_York :1146 Length:2179
## America/Los_Angeles: 645 Class :character
## America/Chicago : 320 Mode :character
## America/Phoenix : 14
## America/Denver : 10
## (Other) : 21
## NA's : 23
# JA_Cov <- st_as_sf(JA_Cov, coords = c("longitude", "latitude"), crs = 4326)
The following analyses are done in Python. Please refer to Statistical Modeling VII b - Data Augmentation of Project Locations with Yelp API in Python.