knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
library(bpCausal)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(rlang)
##
## Attaching package: 'rlang'
## The following object is masked from 'package:data.table':
##
## :=
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Load the six input tables.
# The two source directories are assembled once with file.path(), so every
# location is spelled a single time and the paths contain plain spaces rather
# than the `\ ` escape sequence.
# NOTE(review): the base directory is machine-specific; confirm it before
# running this on another machine.
qmr_dir <- "/Users/apple/Quantitative Marketing Research"
modeling_dir <- file.path(
  qmr_dir,
  "Statistical Modeling XI",
  "Statistical Modeling XI Essential Data"
)
raw_dir <- file.path(qmr_dir, "Reconciliation Analysis I", "Converted Raw Data")

hypoY_tf <- read.csv(file.path(modeling_dir, "hypoY_tf.csv"))
Y_tf <- read.csv(file.path(modeling_dir, "Y_tf.csv"))
X_U <- read.csv(file.path(modeling_dir, "X_U.csv"))
X_P <- read.csv(file.path(modeling_dir, "X_P.csv"))
X_UP2 <- read.csv(file.path(modeling_dir, "X_UP2.csv"))
df_revenue_view <- read.csv(file.path(raw_dir, "df_revenue_view.csv"))
onehot_check
We define a function called onehot_check. It takes a data frame df and a
column-name prefix, and checks whether a baseline column has already been
left out of the one-hot encoding for the columns whose names start with
that prefix. It returns a message: a baseline is reported as left out when
at least one row is all zeros across those columns and no row is all ones;
otherwise it reports that no baseline column has been left out.
# Report whether a group of one-hot columns already omits its baseline level.
#
# Arguments:
#   df      data frame holding the indicator columns.
#   prefix  literal column-name prefix identifying the one-hot group.
#
# Returns a single string:
#   "A baseline column has been left out."  when at least one row is zero in
#     every matching column and no row is one in every matching column;
#   "No baseline column has been left out." otherwise.
onehot_check <- function(df, prefix) {
  # Literal prefix match: characters such as "." in `prefix` are not treated
  # as regular-expression wildcards.
  in_group <- startsWith(names(df), prefix)

  # drop = FALSE keeps a data frame even when a single column matches, so
  # rowSums() always receives a two-dimensional object.
  indicator_totals <- rowSums(df[, in_group, drop = FALSE])

  num_all_zero_rows <- sum(indicator_totals == 0)
  num_all_one_rows <- sum(indicator_totals == sum(in_group))

  if (num_all_zero_rows > 0 && num_all_one_rows == 0) {
    "A baseline column has been left out."
  } else {
    "No baseline column has been left out."
  }
}
onehot_check(X_U, "user_app_")
## [1] "No baseline column has been left out."
We will drop user_app_other in this case since it is
less frequent and abnormal.
X_U <- X_U[, !(names(X_U) %in% c("user_app_other"))]
onehot_check(X_U, "user_app_")
## [1] "A baseline column has been left out."
onehot_check(X_P, "yelp_tag_")
## [1] "No baseline column has been left out."
names(sort(colSums(X_P[, grep("^yelp_tag_", names(X_P), value = TRUE)])))[1]
## [1] "yelp_tag_Airport.Shuttles"
We will drop yelp_tag_Airport.Shuttles in this case
since it is one of the least frequent tags.
X_P <- X_P[, !(names(X_P) %in% c("yelp_tag_Airport.Shuttles"))]
X_UP2
onehot_check(X_UP2, "Venue.Type...Detail_")
## [1] "No baseline column has been left out."
We will drop Venue.Type...Detail_Unknown in this
case.
X_UP2 <- X_UP2[, !(names(X_UP2) %in% c("Venue.Type...Detail_Unknown"))]
onehot_check(X_UP2, "Check.Average_")
## [1] "No baseline column has been left out."
names(sort(colSums(X_UP2[, grep("^Check.Average_", names(X_UP2), value = TRUE)])))
## [1] "Check.Average_Low" "Check.Average_Very.High"
## [3] "Check.Average_Very.Low" "Check.Average_High"
## [5] "Check.Average_Mid"
We will drop Check.Average_Mid in this case since it is
the most frequent column.
X_UP2 <- X_UP2[, !(names(X_UP2) %in% c("Check.Average_Mid"))]
onehot_check(X_UP2, "Service.Type_")
## [1] "No baseline column has been left out."
We will drop Service.Type_Unknown in this case.
X_UP2 <- X_UP2[, !(names(X_UP2) %in% c("Service.Type_Unknown"))]
onehot_check(X_UP2, "Venue.Type...Detail_")
## [1] "A baseline column has been left out."
onehot_check(X_UP2, "Check.Average_")
## [1] "A baseline column has been left out."
onehot_check(X_UP2, "Service.Type_")
## [1] "A baseline column has been left out."
# Store user_id as integer and give the second column the name proj_id.
X_UP2[["user_id"]] <- as.integer(X_UP2[["user_id"]])
names(X_UP2)[2] <- "proj_id"
X_UP1
X_up1 <- df_revenue_view[, c(3, 13, 8:12, 17:18)]
# Name the first column proj_id, then convert the card-brand column and the
# four utm_ columns to factors in a single pass.
names(X_up1)[1] <- "proj_id"
X_up1 <- X_up1 |>
  mutate(
    across(
      all_of(c(
        "stripe_brand",
        "utm_campaign",
        "utm_medium",
        "utm_content",
        "utm_source"
      )),
      as.factor
    )
  )
summary(X_up1)
## proj_id user_id utm_campaign utm_medium
## Min. :-999.0 Min. : 21 :22056 :40029
## 1st Qu.:-999.0 1st Qu.: 65751 amex-special :18412 email :22390
## Median : 692.0 Median :131784 welcome : 7139 sms : 4497
## Mean : 218.1 Mean :129149 black-friday : 4952 website: 3368
## 3rd Qu.: 712.0 3rd Qu.:188627 2022-holiday-gifting: 3321 flow : 913
## Max. : 813.0 Max. :233738 (Other) :17059 (Other): 1742
## NA's :70867 NA's :70867
## utm_content utm_source is_app_purchase
## :58121 :41098 Min. :0.00000
## em2 : 3006 modal : 9471 1st Qu.:0.00000
## em1 : 2994 merch-outreach : 4081 Median :0.00000
## em3 : 1343 houseaccountholders: 3175 Mean :0.08369
## cta-button: 612 combo : 1928 3rd Qu.:0.00000
## (Other) : 5354 (Other) :13170 Max. :1.00000
## NA's :72376 NA's :70883
## is_excess stripe_brand
## Min. :1 American Express:54128
## 1st Qu.:1 Discover : 2818
## Median :1 MasterCard :20103
## Mean :1 Visa :60804
## 3rd Qu.:1 NA's : 5953
## Max. :1
## NA's :104108
# Missing is_excess values become 0.
X_up1$is_excess <- replace(X_up1$is_excess, is.na(X_up1$is_excess), 0)

# Missing card brands are folded into an explicit "other" level before the
# column is turned back into a factor.
X_up1$stripe_brand <- as.factor(
  ifelse(
    is.na(X_up1$stripe_brand),
    "other",
    as.character(X_up1$stripe_brand)
  )
)

# Renaming column 9 to end in "_" makes model.matrix() emit indicator columns
# named stripe_brand_<level>; "- 1" removes the intercept so that every level
# gets its own column.
names(X_up1)[9] <- "stripe_brand_"
X_up1_onehot <- as.data.frame(model.matrix(~ stripe_brand_ - 1, data = X_up1))

# Keep the first eight columns and append the indicator columns.
X_UP1 <- cbind(X_up1[, 1:8], X_up1_onehot)
onehot_check(X_UP1, "stripe_brand_")
## [1] "No baseline column has been left out."
We will drop stripe_brand_other in this case.
X_UP1 <- X_UP1[, !(names(X_UP1) %in% c("stripe_brand_other"))]
onehot_check(X_UP1, "stripe_brand_")
## [1] "A baseline column has been left out."
utm_
We will first identify the columns starting with utm_ and check, for each
row, whether these 4 columns are either all missing or all non-missing. Then
we will count the number of rows that do NOT meet this condition.
# Names of the columns whose name begins with "utm_".
utm_cols <- names(X_UP1)[grepl("^utm_", names(X_UP1))]

# TRUE for rows in which the utm_ columns are either all missing or all
# present (the per-row NA count is 0 or equals the number of utm_ columns).
check_rows <- is.element(
  rowSums(is.na(X_UP1[, utm_cols])),
  c(0, length(utm_cols))
)

# Number of rows with a mix of missing and non-missing utm_ values.
sum(!check_rows)
## [1] 1509
The check reveals that there are 1,509 rows in X_UP1
where the 4 utm_ prefixed columns are not all missing or
all non-missing. This means that these rows have a mix of missing and
non-missing values among these 4 columns.
utm_ Columns by is_promo
We will first create a new column is_promo in X_UP1. We will set is_promo
to 1 if ANY of the utm_ columns has a non-missing entry, and to 0
otherwise.
# Flag rows in which at least one utm_ column is non-missing.
# NOTE(review): is.na() does not treat empty strings as missing, so rows whose
# utm_ values are "" also receive is_promo = 1 — confirm this is intended.
X_UP1$is_promo <- 0
X_UP1$is_promo[rowSums(!is.na(X_UP1[, utm_cols])) > 0] <- 1
# Reset the row names to the default sequence.
rownames(X_UP1) <- NULL
# Positional reorder: columns 1-2 first, then column 13 (presumably the
# is_promo column appended above), then columns 7-12, then columns 3-6.
# NOTE(review): this depends on the exact column layout of X_UP1 at this point.
X_UP1 <- X_UP1[, c(1:2, 13, 7:12, 3:6)]
# Store user_id as integer.
X_UP1$user_id <- as.integer(X_UP1$user_id)
summary(X_UP1)
## proj_id user_id is_promo is_app_purchase
## Min. :-999.0 Min. : 21 Min. :0.0000 Min. :0.00000
## 1st Qu.:-999.0 1st Qu.: 65751 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 692.0 Median :131784 Median :1.0000 Median :0.00000
## Mean : 218.1 Mean :129149 Mean :0.5072 Mean :0.08369
## 3rd Qu.: 712.0 3rd Qu.:188627 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. : 813.0 Max. :233738 Max. :1.0000 Max. :1.00000
##
## is_excess stripe_brand_American Express stripe_brand_Discover
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.2761 Mean :0.3764 Mean :0.0196
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## stripe_brand_MasterCard stripe_brand_Visa utm_campaign
## Min. :0.0000 Min. :0.0000 :22056
## 1st Qu.:0.0000 1st Qu.:0.0000 amex-special :18412
## Median :0.0000 Median :0.0000 welcome : 7139
## Mean :0.1398 Mean :0.4228 black-friday : 4952
## 3rd Qu.:0.0000 3rd Qu.:1.0000 2022-holiday-gifting: 3321
## Max. :1.0000 Max. :1.0000 (Other) :17059
## NA's :70867
## utm_medium utm_content utm_source
## :40029 :58121 :41098
## email :22390 em2 : 3006 modal : 9471
## sms : 4497 em1 : 2994 merch-outreach : 4081
## website: 3368 em3 : 1343 houseaccountholders: 3175
## flow : 913 cta-button: 612 combo : 1928
## (Other): 1742 (Other) : 5354 (Other) :13170
## NA's :70867 NA's :72376 NA's :70883
Feature Engineering for proj_loc_id: we will
count the unique number of proj_loc_id for
each proj_id.
For is_featured: we will take the
maximum value for each proj_id.
For longitude and latitude: we will
compute the mean to find the central location point for
each proj_id.
For rating: we will take the mean
value for each proj_id.
For review_num: we will take the
sum for each proj_id.
For price_level: we will take the
mean for each proj_id.
For all yelp_tag_ columns: we will take the
maximum for each proj_id.
# Names of the yelp_tag_ indicator columns.
yelp_cols <- names(X_P)[startsWith(names(X_P), "yelp_tag_")]

# Collapse X_P to one row per proj_id: distinct location count, max of
# is_featured, mean coordinates / rating / price level, summed review count,
# and the per-project maximum of every yelp_tag_ indicator.
X_P <- X_P |>
  group_by(proj_id) |>
  summarise(
    unique_proj_loc_P = n_distinct(proj_loc_id),
    max_is_featured = max(is_featured, na.rm = TRUE),
    mean_longitude = mean(longitude, na.rm = TRUE),
    mean_latitude = mean(latitude, na.rm = TRUE),
    mean_rating = mean(rating, na.rm = TRUE),
    sum_review_num = sum(review_num, na.rm = TRUE),
    mean_price_level = mean(price_level, na.rm = TRUE),
    across(starts_with("yelp_tag_"), function(tag) max(tag, na.rm = TRUE))
  ) |>
  as.data.frame()
For is_promo: we will take the
maximum value for each group to indicate if a promotion
was involved for any transaction.
For is_app_purchase: we will also take the
maximum value to signify if any purchase was made via
the app.
For is_excess: we will again take the
maximum value to indicate if any transaction had excess
value.
For stripe_brand_American Express,
stripe_brand_Discover,
stripe_brand_MasterCard, and
stripe_brand_Visa: we will take the
maximum for each to indicate if the brand was used in
any of the transactions.
For utm_campaign, utm_medium,
utm_content, and utm_source: since these are
all categorical, we will concatenate the unique values into a
string for each group.
# Collapse X_UP1 to one row per (user_id, proj_id) pair, after removing rows
# whose proj_id is the -999 placeholder value.
#   - indicator columns: group maximum (1 if any row in the group has a 1);
#   - utm_ columns: the distinct non-NA values pasted together with ",".
# .groups = "drop" returns an ungrouped result directly, so summarise() no
# longer emits its "grouped output" message and no separate ungroup() step is
# needed before converting to a plain data frame.
X_UP1 <- X_UP1 |>
  filter(proj_id != -999) |>
  group_by(user_id, proj_id) |>
  summarise(
    is_promo = max(is_promo, na.rm = TRUE),
    max_is_app_purchase = max(is_app_purchase, na.rm = TRUE),
    max_is_excess = max(is_excess, na.rm = TRUE),
    `max_stripe_brand_American Express` =
      max(`stripe_brand_American Express`, na.rm = TRUE),
    max_stripe_brand_Discover = max(stripe_brand_Discover, na.rm = TRUE),
    max_stripe_brand_MasterCard = max(stripe_brand_MasterCard, na.rm = TRUE),
    max_stripe_brand_Visa = max(stripe_brand_Visa, na.rm = TRUE),
    concat_utm_campaign =
      paste(unique(na.omit(utm_campaign)), collapse = ","),
    concat_utm_medium = paste(unique(na.omit(utm_medium)), collapse = ","),
    concat_utm_content = paste(unique(na.omit(utm_content)), collapse = ","),
    concat_utm_source = paste(unique(na.omit(utm_source)), collapse = ","),
    .groups = "drop"
  ) |>
  as.data.frame()
For project_location_id: we will count the
unique number of project_location_id for each
group.
For Venue.Type...Detail_ columns: we will take the
maximum value for each to indicate if the venue type
was involved in any of the transactions.
For Check.Average_ columns: we will also take the
maximum value for each to indicate if a particular
check average type was involved.
For Service.Type_ columns: we will again take the
maximum value for each to indicate if a particular
service type was involved.
# Collapse X_UP2 to one row per (user_id, proj_id) pair, after removing rows
# whose proj_id is the -999 placeholder value.
#   - unique_proj_loc_UP: number of distinct project_location_id values;
#   - every listed indicator column: group maximum, written to max_<column>.
# The indicator columns are named explicitly (in output order) and summarised
# through a single across() call instead of one max() line per column.
# .groups = "drop" returns an ungrouped result directly, so summarise() no
# longer emits its "grouped output" message and no separate ungroup() step is
# needed before converting to a plain data frame.
X_UP2 <- X_UP2 |>
  filter(proj_id != -999) |>
  group_by(user_id, proj_id) |>
  summarise(
    unique_proj_loc_UP = n_distinct(project_location_id),
    across(
      all_of(c(
        "Venue.Type...Detail_Bar",
        "Venue.Type...Detail_Cafe",
        "Venue.Type...Detail_Casual.Dining",
        "Venue.Type...Detail_Casual.Fine.Dining",
        "Venue.Type...Detail_Coffee.Shop",
        "Venue.Type...Detail_Fast.Casual",
        "Venue.Type...Detail_Fast.Food",
        "Venue.Type...Detail_Fine.Dining",
        "Venue.Type...Detail_Market",
        "Venue.Type...Detail_Nightclub",
        "Check.Average_High",
        "Check.Average_Low",
        "Check.Average_Very.High",
        "Check.Average_Very.Low",
        "Service.Type_Fast.Casual",
        "Service.Type_Full.Service",
        "Service.Type_Ghost.Kitchen",
        "Service.Type_QSR"
      )),
      \(x) max(x, na.rm = TRUE),
      .names = "max_{.col}"
    ),
    .groups = "drop"
  ) |>
  as.data.frame()
X_U (\(196,119 \times 4\))
dim(X_U)
## [1] 196119 4
head(X_U)
## user_id T_acct user_app_android user_app_ios
## 1 1 -168 0 1
## 2 21 -162 0 1
## 3 22 -162 0 1
## 4 23 -162 0 1
## 5 24 -162 0 0
## 6 25 -162 0 0
X_P (\(250 \times 179\))
dim(X_P)
## [1] 250 179
head(X_P)
## proj_id unique_proj_loc_P max_is_featured mean_longitude mean_latitude
## 1 11 2 1 -77.02214 38.90474
## 2 20 1 0 -77.02490 38.89700
## 3 39 1 0 -77.01902 38.91207
## 4 42 2 0 -77.03404 38.89938
## 5 45 1 0 -77.09386 38.88451
## 6 64 1 0 -77.00014 38.90034
## mean_rating sum_review_num mean_price_level yelp_tag_Acne.Treatment
## 1 4.0 570 2.00000 0
## 2 3.5 1630 2.00000 0
## 3 4.0 82 3.00000 0
## 4 3.5 1567 2.01265 0
## 5 2.5 212 1.00000 0
## 6 4.5 50 3.00000 0
## yelp_tag_Afghan yelp_tag_Airport.Terminals yelp_tag_American..New.
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_American..Traditional. yelp_tag_Arabic yelp_tag_Argentine
## 1 0 0 0
## 2 0 0 0
## 3 1 0 0
## 4 0 0 0
## 5 1 0 0
## 6 0 0 0
## yelp_tag_Art.Museums yelp_tag_Asian.Fusion yelp_tag_Australian
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Bagels yelp_tag_Bakeries yelp_tag_Bangladeshi yelp_tag_Barbeque
## 1 0 1 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Bars yelp_tag_Basque yelp_tag_Beer yelp_tag_Beer.Bar
## 1 0 0 0 0
## 2 1 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Beer.Gardens yelp_tag_Belgian yelp_tag_Bikes yelp_tag_Brasseries
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Breakfast...Brunch yelp_tag_Breweries yelp_tag_Brewpubs
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_British yelp_tag_Bubble.Tea yelp_tag_Burgers yelp_tag_Butcher
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 1 0
## 6 0 0 0 0
## yelp_tag_CSA yelp_tag_Cafes yelp_tag_Cajun.Creole yelp_tag_Cambodian
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Cannabis.Clinics yelp_tag_Cannabis.Dispensaries yelp_tag_Cantonese
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Caribbean yelp_tag_Caterers yelp_tag_Champagne.Bars
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Cheesesteaks yelp_tag_Chicken.Shop yelp_tag_Chicken.Wings
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Chinese yelp_tag_Chocolatiers...Shops yelp_tag_Cideries
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Cocktail.Bars yelp_tag_Coffee...Tea yelp_tag_Coffee.Roasteries
## 1 1 0 0
## 2 0 0 0
## 3 0 0 0
## 4 1 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Colleges...Universities yelp_tag_Colombian yelp_tag_Comfort.Food
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Comic.Books yelp_tag_Cooking.Schools yelp_tag_Cosmetic.Dentists
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Cuban yelp_tag_Dance.Clubs yelp_tag_Delis yelp_tag_Desserts
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Dim.Sum yelp_tag_Diners yelp_tag_Donuts yelp_tag_Empanadas
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Ethiopian yelp_tag_Falafel yelp_tag_Fast.Food
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Floral.Designers yelp_tag_Florists yelp_tag_Food.Court
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Food.Delivery.Services yelp_tag_Food.Stands yelp_tag_Food.Trucks
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_French yelp_tag_Furniture.Stores yelp_tag_Gastropubs
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_General.Dentistry yelp_tag_German yelp_tag_Gift.Shops
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Gluten.Free yelp_tag_Greek yelp_tag_Grocery yelp_tag_Halal
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Health.Markets yelp_tag_Home.Decor yelp_tag_Hookah.Bars
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Hospitals yelp_tag_Hot.Dogs yelp_tag_Hotels
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 1 0
## 6 0 0 0
## yelp_tag_Ice.Cream...Frozen.Yogurt yelp_tag_Indian yelp_tag_Italian
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 1
## yelp_tag_Izakaya yelp_tag_Japanese yelp_tag_Jewelry
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Juice.Bars...Smoothies yelp_tag_Knife.Sharpening yelp_tag_Korean
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Laser.Hair.Removal yelp_tag_Latin.American yelp_tag_Lebanese
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Limos yelp_tag_Live.Raw.Food yelp_tag_Local.Flavor yelp_tag_Lounges
## 1 0 0 0 1
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Massage yelp_tag_Massage.Therapy yelp_tag_Meat.Shops
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Mediterranean yelp_tag_Mexican yelp_tag_Middle.Eastern
## 1 0 0 0
## 2 0 1 0
## 3 0 0 0
## 4 0 1 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Middle.Schools...High.Schools yelp_tag_Modern.European
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## yelp_tag_Music.Venues yelp_tag_New.Mexican.Cuisine yelp_tag_Noodles
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Nutritionists yelp_tag_Organic.Stores yelp_tag_Pan.Asian
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Party...Event.Planning yelp_tag_Pasta.Shops
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## yelp_tag_Patisserie.Cake.Shop yelp_tag_Personal.Chefs yelp_tag_Peruvian
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Pet.Boarding yelp_tag_Pet.Groomers yelp_tag_Pet.Training
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Pizza yelp_tag_Poke yelp_tag_Pop.Up.Restaurants
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Professional.Sports.Teams yelp_tag_Pubs yelp_tag_Ramen
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Restaurants yelp_tag_Salad yelp_tag_Sandwiches yelp_tag_Seafood
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 1
## yelp_tag_Skin.Care yelp_tag_Smokehouse yelp_tag_Soul.Food yelp_tag_Soup
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Southern yelp_tag_Spanish yelp_tag_Speakeasies
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Specialty.Food yelp_tag_Sports.Bars yelp_tag_Stadiums...Arenas
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Steakhouses yelp_tag_Street.Vendors yelp_tag_Sushi.Bars
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 1 0 0
## yelp_tag_Tacos yelp_tag_Taiwanese yelp_tag_Tapas.Bars
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Tapas.Small.Plates yelp_tag_Tex.Mex yelp_tag_Thai yelp_tag_Tiki.Bars
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Train.Stations yelp_tag_Turkish yelp_tag_Used yelp_tag_Vegan
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Vegetarian yelp_tag_Venues...Event.Spaces yelp_tag_Vietnamese
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Vintage...Consignment yelp_tag_Waffles yelp_tag_Whiskey.Bars
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Wholesalers yelp_tag_Wine...Spirits yelp_tag_Wine.Bars
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Wine.Tours yelp_tag_Wineries yelp_tag_Wraps
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
X_UP
X_UP1 (\(77,286 \times 13\))
dim(X_UP1)
## [1] 77286 13
head(X_UP1)
## user_id proj_id is_promo max_is_app_purchase max_is_excess
## 1 21 11 0 1 0
## 2 21 693 0 0 1
## 3 21 699 0 0 0
## 4 22 255 0 1 1
## 5 22 277 1 1 0
## 6 22 317 0 1 0
## max_stripe_brand_American Express max_stripe_brand_Discover
## 1 0 0
## 2 0 0
## 3 0 0
## 4 1 0
## 5 1 0
## 6 1 0
## max_stripe_brand_MasterCard max_stripe_brand_Visa concat_utm_campaign
## 1 0 1
## 2 0 1
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
## concat_utm_medium concat_utm_content concat_utm_source
## 1
## 2
## 3
## 4
## 5
## 6
X_UP2 (\(78,484 \times 21\))
dim(X_UP2)
## [1] 78484 21
head(X_UP2)
## user_id proj_id unique_proj_loc_UP max_Venue.Type...Detail_Bar
## 1 21 693 1 0
## 2 21 699 1 0
## 3 22 11 1 0
## 4 22 224 1 0
## 5 22 249 1 0
## 6 22 255 1 0
## max_Venue.Type...Detail_Cafe max_Venue.Type...Detail_Casual.Dining
## 1 0 1
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 1
## max_Venue.Type...Detail_Casual.Fine.Dining
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## max_Venue.Type...Detail_Coffee.Shop max_Venue.Type...Detail_Fast.Casual
## 1 0 0
## 2 1 0
## 3 1 0
## 4 0 0
## 5 0 1
## 6 0 0
## max_Venue.Type...Detail_Fast.Food max_Venue.Type...Detail_Fine.Dining
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## max_Venue.Type...Detail_Market max_Venue.Type...Detail_Nightclub
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## max_Check.Average_High max_Check.Average_Low max_Check.Average_Very.High
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 1 0
## 6 0 0 0
## max_Check.Average_Very.Low max_Service.Type_Fast.Casual
## 1 0 0
## 2 1 0
## 3 1 0
## 4 0 0
## 5 0 1
## 6 0 0
## max_Service.Type_Full.Service max_Service.Type_Ghost.Kitchen
## 1 0 1
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 1 0
## max_Service.Type_QSR
## 1 0
## 2 1
## 3 1
## 4 0
## 5 0
## 6 0
# Combine the two user-project tables, keeping every (user_id, proj_id) pair
# that occurs in either of them.
X_UP <- X_UP1 |>
  full_join(X_UP2, by = c("user_id", "proj_id"))

# Attach the user-level columns by user_id.
X_UP_U <- X_UP |>
  left_join(X_U, by = "user_id")

# Attach the project-level columns by proj_id.
X_UP_U_P <- X_UP_U |>
  left_join(X_P, by = "proj_id")
X (\(97,484 \times 213\))
X <- X_UP_U_P
dim(X)
## [1] 97484 213
head(X)
## user_id proj_id is_promo max_is_app_purchase max_is_excess
## 1 21 11 0 1 0
## 2 21 693 0 0 1
## 3 21 699 0 0 0
## 4 22 255 0 1 1
## 5 22 277 1 1 0
## 6 22 317 0 1 0
## max_stripe_brand_American Express max_stripe_brand_Discover
## 1 0 0
## 2 0 0
## 3 0 0
## 4 1 0
## 5 1 0
## 6 1 0
## max_stripe_brand_MasterCard max_stripe_brand_Visa concat_utm_campaign
## 1 0 1
## 2 0 1
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
## concat_utm_medium concat_utm_content concat_utm_source unique_proj_loc_UP
## 1 NA
## 2 1
## 3 1
## 4 1
## 5 4
## 6 1
## max_Venue.Type...Detail_Bar max_Venue.Type...Detail_Cafe
## 1 NA NA
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## max_Venue.Type...Detail_Casual.Dining
## 1 NA
## 2 1
## 3 0
## 4 1
## 5 0
## 6 0
## max_Venue.Type...Detail_Casual.Fine.Dining
## 1 NA
## 2 0
## 3 0
## 4 0
## 5 1
## 6 0
## max_Venue.Type...Detail_Coffee.Shop max_Venue.Type...Detail_Fast.Casual
## 1 NA NA
## 2 0 0
## 3 1 0
## 4 0 0
## 5 0 0
## 6 0 0
## max_Venue.Type...Detail_Fast.Food max_Venue.Type...Detail_Fine.Dining
## 1 NA NA
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## max_Venue.Type...Detail_Market max_Venue.Type...Detail_Nightclub
## 1 NA NA
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 1
## 6 0 0
## max_Check.Average_High max_Check.Average_Low max_Check.Average_Very.High
## 1 NA NA NA
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## max_Check.Average_Very.Low max_Service.Type_Fast.Casual
## 1 NA NA
## 2 0 0
## 3 1 0
## 4 0 0
## 5 0 0
## 6 0 0
## max_Service.Type_Full.Service max_Service.Type_Ghost.Kitchen
## 1 NA NA
## 2 0 1
## 3 0 0
## 4 1 0
## 5 1 0
## 6 0 0
## max_Service.Type_QSR T_acct user_app_android user_app_ios unique_proj_loc_P
## 1 NA -162 0 1 2
## 2 0 -162 0 1 3
## 3 1 -162 0 1 64
## 4 0 -162 0 1 1
## 5 0 -162 0 1 4
## 6 0 -162 0 1 1
## max_is_featured mean_longitude mean_latitude mean_rating sum_review_num
## 1 1 -77.02214 38.90474 4.000000 570
## 2 1 -77.00792 38.90823 4.166667 2265
## 3 1 -88.28376 38.49904 3.875000 12040
## 4 1 -76.99125 38.88233 4.000000 47
## 5 1 -97.74778 30.26841 4.000000 2610
## 6 0 -97.75171 30.27497 4.500000 283
## mean_price_level yelp_tag_Acne.Treatment yelp_tag_Afghan
## 1 2.000000 0 0
## 2 2.061707 0 0
## 3 2.041150 0 0
## 4 2.027138 0 0
## 5 2.068440 0 0
## 6 2.000000 0 0
## yelp_tag_Airport.Terminals yelp_tag_American..New.
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 1
## 6 0 0
## yelp_tag_American..Traditional. yelp_tag_Arabic yelp_tag_Argentine
## 1 0 0 0
## 2 1 0 0
## 3 1 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Art.Museums yelp_tag_Asian.Fusion yelp_tag_Australian
## 1 0 0 0
## 2 0 1 0
## 3 0 0 1
## 4 0 0 0
## 5 0 1 0
## 6 0 0 0
## yelp_tag_Bagels yelp_tag_Bakeries yelp_tag_Bangladeshi yelp_tag_Barbeque
## 1 0 1 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Bars yelp_tag_Basque yelp_tag_Beer yelp_tag_Beer.Bar
## 1 0 0 0 0
## 2 0 0 0 0
## 3 1 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Beer.Gardens yelp_tag_Belgian yelp_tag_Bikes yelp_tag_Brasseries
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Breakfast...Brunch yelp_tag_Breweries yelp_tag_Brewpubs
## 1 0 0 0
## 2 0 0 0
## 3 1 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_British yelp_tag_Bubble.Tea yelp_tag_Burgers yelp_tag_Butcher
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 1 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_CSA yelp_tag_Cafes yelp_tag_Cajun.Creole yelp_tag_Cambodian
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 1 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Cannabis.Clinics yelp_tag_Cannabis.Dispensaries yelp_tag_Cantonese
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Caribbean yelp_tag_Caterers yelp_tag_Champagne.Bars
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Cheesesteaks yelp_tag_Chicken.Shop yelp_tag_Chicken.Wings
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Chinese yelp_tag_Chocolatiers...Shops yelp_tag_Cideries
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 1 0 0
## 6 0 0 0
## yelp_tag_Cocktail.Bars yelp_tag_Coffee...Tea yelp_tag_Coffee.Roasteries
## 1 1 0 0
## 2 0 0 0
## 3 1 1 0
## 4 0 0 0
## 5 1 0 0
## 6 0 0 0
## yelp_tag_Colleges...Universities yelp_tag_Colombian yelp_tag_Comfort.Food
## 1 0 0 0
## 2 0 0 1
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Comic.Books yelp_tag_Cooking.Schools yelp_tag_Cosmetic.Dentists
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Cuban yelp_tag_Dance.Clubs yelp_tag_Delis yelp_tag_Desserts
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Dim.Sum yelp_tag_Diners yelp_tag_Donuts yelp_tag_Empanadas
## 1 0 0 0 0
## 2 1 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 1 0 0 0
## 6 0 0 0 0
## yelp_tag_Ethiopian yelp_tag_Falafel yelp_tag_Fast.Food
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Floral.Designers yelp_tag_Florists yelp_tag_Food.Court
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Food.Delivery.Services yelp_tag_Food.Stands yelp_tag_Food.Trucks
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_French yelp_tag_Furniture.Stores yelp_tag_Gastropubs
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_General.Dentistry yelp_tag_German yelp_tag_Gift.Shops
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Gluten.Free yelp_tag_Greek yelp_tag_Grocery yelp_tag_Halal
## 1 0 0 0 0
## 2 0 0 0 0
## 3 1 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Health.Markets yelp_tag_Home.Decor yelp_tag_Hookah.Bars
## 1 0 0 0
## 2 0 0 0
## 3 0 0 1
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Hospitals yelp_tag_Hot.Dogs yelp_tag_Hotels
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Ice.Cream...Frozen.Yogurt yelp_tag_Indian yelp_tag_Italian
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Izakaya yelp_tag_Japanese yelp_tag_Jewelry
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Juice.Bars...Smoothies yelp_tag_Knife.Sharpening yelp_tag_Korean
## 1 0 0 0
## 2 0 0 0
## 3 1 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Laser.Hair.Removal yelp_tag_Latin.American yelp_tag_Lebanese
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Limos yelp_tag_Live.Raw.Food yelp_tag_Local.Flavor yelp_tag_Lounges
## 1 0 0 0 1
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Massage yelp_tag_Massage.Therapy yelp_tag_Meat.Shops
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Mediterranean yelp_tag_Mexican yelp_tag_Middle.Eastern
## 1 0 0 0
## 2 0 0 0
## 3 1 0 0
## 4 0 0 0
## 5 1 0 0
## 6 0 0 0
## yelp_tag_Middle.Schools...High.Schools yelp_tag_Modern.European
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## yelp_tag_Music.Venues yelp_tag_New.Mexican.Cuisine yelp_tag_Noodles
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Nutritionists yelp_tag_Organic.Stores yelp_tag_Pan.Asian
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Party...Event.Planning yelp_tag_Pasta.Shops
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## yelp_tag_Patisserie.Cake.Shop yelp_tag_Personal.Chefs yelp_tag_Peruvian
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Pet.Boarding yelp_tag_Pet.Groomers yelp_tag_Pet.Training
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Pizza yelp_tag_Poke yelp_tag_Pop.Up.Restaurants
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 1 0 0
## 5 0 0 0
## 6 1 0 0
## yelp_tag_Professional.Sports.Teams yelp_tag_Pubs yelp_tag_Ramen
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Restaurants yelp_tag_Salad yelp_tag_Sandwiches yelp_tag_Seafood
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 1 1
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Skin.Care yelp_tag_Smokehouse yelp_tag_Soul.Food yelp_tag_Soup
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Southern yelp_tag_Spanish yelp_tag_Speakeasies
## 1 0 0 0
## 2 1 0 0
## 3 0 1 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Specialty.Food yelp_tag_Sports.Bars yelp_tag_Stadiums...Arenas
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Steakhouses yelp_tag_Street.Vendors yelp_tag_Sushi.Bars
## 1 0 0 0
## 2 0 0 0
## 3 1 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Tacos yelp_tag_Taiwanese yelp_tag_Tapas.Bars
## 1 0 0 0
## 2 0 0 0
## 3 1 0 1
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Tapas.Small.Plates yelp_tag_Tex.Mex yelp_tag_Thai yelp_tag_Tiki.Bars
## 1 0 0 0 0
## 2 0 0 0 0
## 3 1 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## yelp_tag_Train.Stations yelp_tag_Turkish yelp_tag_Used yelp_tag_Vegan
## 1 0 0 0 0
## 2 0 0 0 1
## 3 0 0 0 1
## 4 0 0 0 0
## 5 0 0 0 1
## 6 0 0 0 0
## yelp_tag_Vegetarian yelp_tag_Venues...Event.Spaces yelp_tag_Vietnamese
## 1 0 0 0
## 2 0 0 0
## 3 1 0 0
## 4 0 0 0
## 5 1 0 0
## 6 0 0 0
## yelp_tag_Vintage...Consignment yelp_tag_Waffles yelp_tag_Whiskey.Bars
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Wholesalers yelp_tag_Wine...Spirits yelp_tag_Wine.Bars
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## yelp_tag_Wine.Tours yelp_tag_Wineries yelp_tag_Wraps
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
# Use X as the working name for X_UP_U_P in the steps that follow.
X <- X_UP_U_P
# Compute the fraction of missing (NA) entries in each column of `df`.
#
# Args:
#   df: A data frame whose columns are inspected one by one.
#
# Returns:
#   A numeric vector with one element per column, named after the columns,
#   where each element is the column's NA count divided by nrow(df).
miss_prop <- function(df) {
  n_obs <- nrow(df)
  # Share of NA entries within a single column.
  na_share <- function(column) {
    sum(is.na(column)) / n_obs
  }
  shares <- vapply(df, na_share, numeric(1), USE.NAMES = FALSE)
  # Attach the column names explicitly so the result is named even when
  # `df` has no columns.
  setNames(shares, names(df))
}
# Print, for every column of X, the proportion of rows that are NA.
miss_prop(X)
## user_id
## 0.0000000000
## proj_id
## 0.0000000000
## is_promo
## 0.2071929753
## max_is_app_purchase
## 0.2071929753
## max_is_excess
## 0.2071929753
## max_stripe_brand_American Express
## 0.2071929753
## max_stripe_brand_Discover
## 0.2071929753
## max_stripe_brand_MasterCard
## 0.2071929753
## max_stripe_brand_Visa
## 0.2071929753
## concat_utm_campaign
## 0.2071929753
## concat_utm_medium
## 0.2071929753
## concat_utm_content
## 0.2071929753
## concat_utm_source
## 0.2071929753
## unique_proj_loc_UP
## 0.1949037791
## max_Venue.Type...Detail_Bar
## 0.1949037791
## max_Venue.Type...Detail_Cafe
## 0.1949037791
## max_Venue.Type...Detail_Casual.Dining
## 0.1949037791
## max_Venue.Type...Detail_Casual.Fine.Dining
## 0.1949037791
## max_Venue.Type...Detail_Coffee.Shop
## 0.1949037791
## max_Venue.Type...Detail_Fast.Casual
## 0.1949037791
## max_Venue.Type...Detail_Fast.Food
## 0.1949037791
## max_Venue.Type...Detail_Fine.Dining
## 0.1949037791
## max_Venue.Type...Detail_Market
## 0.1949037791
## max_Venue.Type...Detail_Nightclub
## 0.1949037791
## max_Check.Average_High
## 0.1949037791
## max_Check.Average_Low
## 0.1949037791
## max_Check.Average_Very.High
## 0.1949037791
## max_Check.Average_Very.Low
## 0.1949037791
## max_Service.Type_Fast.Casual
## 0.1949037791
## max_Service.Type_Full.Service
## 0.1949037791
## max_Service.Type_Ghost.Kitchen
## 0.1949037791
## max_Service.Type_QSR
## 0.1949037791
## T_acct
## 0.0001128390
## user_app_android
## 0.0001128390
## user_app_ios
## 0.0001128390
## unique_proj_loc_P
## 0.0004718723
## max_is_featured
## 0.0004718723
## mean_longitude
## 0.0004718723
## mean_latitude
## 0.0004718723
## mean_rating
## 0.0004718723
## sum_review_num
## 0.0004718723
## mean_price_level
## 0.0004718723
## yelp_tag_Acne.Treatment
## 0.0004718723
## yelp_tag_Afghan
## 0.0004718723
## yelp_tag_Airport.Terminals
## 0.0004718723
## yelp_tag_American..New.
## 0.0004718723
## yelp_tag_American..Traditional.
## 0.0004718723
## yelp_tag_Arabic
## 0.0004718723
## yelp_tag_Argentine
## 0.0004718723
## yelp_tag_Art.Museums
## 0.0004718723
## yelp_tag_Asian.Fusion
## 0.0004718723
## yelp_tag_Australian
## 0.0004718723
## yelp_tag_Bagels
## 0.0004718723
## yelp_tag_Bakeries
## 0.0004718723
## yelp_tag_Bangladeshi
## 0.0004718723
## yelp_tag_Barbeque
## 0.0004718723
## yelp_tag_Bars
## 0.0004718723
## yelp_tag_Basque
## 0.0004718723
## yelp_tag_Beer
## 0.0004718723
## yelp_tag_Beer.Bar
## 0.0004718723
## yelp_tag_Beer.Gardens
## 0.0004718723
## yelp_tag_Belgian
## 0.0004718723
## yelp_tag_Bikes
## 0.0004718723
## yelp_tag_Brasseries
## 0.0004718723
## yelp_tag_Breakfast...Brunch
## 0.0004718723
## yelp_tag_Breweries
## 0.0004718723
## yelp_tag_Brewpubs
## 0.0004718723
## yelp_tag_British
## 0.0004718723
## yelp_tag_Bubble.Tea
## 0.0004718723
## yelp_tag_Burgers
## 0.0004718723
## yelp_tag_Butcher
## 0.0004718723
## yelp_tag_CSA
## 0.0004718723
## yelp_tag_Cafes
## 0.0004718723
## yelp_tag_Cajun.Creole
## 0.0004718723
## yelp_tag_Cambodian
## 0.0004718723
## yelp_tag_Cannabis.Clinics
## 0.0004718723
## yelp_tag_Cannabis.Dispensaries
## 0.0004718723
## yelp_tag_Cantonese
## 0.0004718723
## yelp_tag_Caribbean
## 0.0004718723
## yelp_tag_Caterers
## 0.0004718723
## yelp_tag_Champagne.Bars
## 0.0004718723
## yelp_tag_Cheesesteaks
## 0.0004718723
## yelp_tag_Chicken.Shop
## 0.0004718723
## yelp_tag_Chicken.Wings
## 0.0004718723
## yelp_tag_Chinese
## 0.0004718723
## yelp_tag_Chocolatiers...Shops
## 0.0004718723
## yelp_tag_Cideries
## 0.0004718723
## yelp_tag_Cocktail.Bars
## 0.0004718723
## yelp_tag_Coffee...Tea
## 0.0004718723
## yelp_tag_Coffee.Roasteries
## 0.0004718723
## yelp_tag_Colleges...Universities
## 0.0004718723
## yelp_tag_Colombian
## 0.0004718723
## yelp_tag_Comfort.Food
## 0.0004718723
## yelp_tag_Comic.Books
## 0.0004718723
## yelp_tag_Cooking.Schools
## 0.0004718723
## yelp_tag_Cosmetic.Dentists
## 0.0004718723
## yelp_tag_Cuban
## 0.0004718723
## yelp_tag_Dance.Clubs
## 0.0004718723
## yelp_tag_Delis
## 0.0004718723
## yelp_tag_Desserts
## 0.0004718723
## yelp_tag_Dim.Sum
## 0.0004718723
## yelp_tag_Diners
## 0.0004718723
## yelp_tag_Donuts
## 0.0004718723
## yelp_tag_Empanadas
## 0.0004718723
## yelp_tag_Ethiopian
## 0.0004718723
## yelp_tag_Falafel
## 0.0004718723
## yelp_tag_Fast.Food
## 0.0004718723
## yelp_tag_Floral.Designers
## 0.0004718723
## yelp_tag_Florists
## 0.0004718723
## yelp_tag_Food.Court
## 0.0004718723
## yelp_tag_Food.Delivery.Services
## 0.0004718723
## yelp_tag_Food.Stands
## 0.0004718723
## yelp_tag_Food.Trucks
## 0.0004718723
## yelp_tag_French
## 0.0004718723
## yelp_tag_Furniture.Stores
## 0.0004718723
## yelp_tag_Gastropubs
## 0.0004718723
## yelp_tag_General.Dentistry
## 0.0004718723
## yelp_tag_German
## 0.0004718723
## yelp_tag_Gift.Shops
## 0.0004718723
## yelp_tag_Gluten.Free
## 0.0004718723
## yelp_tag_Greek
## 0.0004718723
## yelp_tag_Grocery
## 0.0004718723
## yelp_tag_Halal
## 0.0004718723
## yelp_tag_Health.Markets
## 0.0004718723
## yelp_tag_Home.Decor
## 0.0004718723
## yelp_tag_Hookah.Bars
## 0.0004718723
## yelp_tag_Hospitals
## 0.0004718723
## yelp_tag_Hot.Dogs
## 0.0004718723
## yelp_tag_Hotels
## 0.0004718723
## yelp_tag_Ice.Cream...Frozen.Yogurt
## 0.0004718723
## yelp_tag_Indian
## 0.0004718723
## yelp_tag_Italian
## 0.0004718723
## yelp_tag_Izakaya
## 0.0004718723
## yelp_tag_Japanese
## 0.0004718723
## yelp_tag_Jewelry
## 0.0004718723
## yelp_tag_Juice.Bars...Smoothies
## 0.0004718723
## yelp_tag_Knife.Sharpening
## 0.0004718723
## yelp_tag_Korean
## 0.0004718723
## yelp_tag_Laser.Hair.Removal
## 0.0004718723
## yelp_tag_Latin.American
## 0.0004718723
## yelp_tag_Lebanese
## 0.0004718723
## yelp_tag_Limos
## 0.0004718723
## yelp_tag_Live.Raw.Food
## 0.0004718723
## yelp_tag_Local.Flavor
## 0.0004718723
## yelp_tag_Lounges
## 0.0004718723
## yelp_tag_Massage
## 0.0004718723
## yelp_tag_Massage.Therapy
## 0.0004718723
## yelp_tag_Meat.Shops
## 0.0004718723
## yelp_tag_Mediterranean
## 0.0004718723
## yelp_tag_Mexican
## 0.0004718723
## yelp_tag_Middle.Eastern
## 0.0004718723
## yelp_tag_Middle.Schools...High.Schools
## 0.0004718723
## yelp_tag_Modern.European
## 0.0004718723
## yelp_tag_Music.Venues
## 0.0004718723
## yelp_tag_New.Mexican.Cuisine
## 0.0004718723
## yelp_tag_Noodles
## 0.0004718723
## yelp_tag_Nutritionists
## 0.0004718723
## yelp_tag_Organic.Stores
## 0.0004718723
## yelp_tag_Pan.Asian
## 0.0004718723
## yelp_tag_Party...Event.Planning
## 0.0004718723
## yelp_tag_Pasta.Shops
## 0.0004718723
## yelp_tag_Patisserie.Cake.Shop
## 0.0004718723
## yelp_tag_Personal.Chefs
## 0.0004718723
## yelp_tag_Peruvian
## 0.0004718723
## yelp_tag_Pet.Boarding
## 0.0004718723
## yelp_tag_Pet.Groomers
## 0.0004718723
## yelp_tag_Pet.Training
## 0.0004718723
## yelp_tag_Pizza
## 0.0004718723
## yelp_tag_Poke
## 0.0004718723
## yelp_tag_Pop.Up.Restaurants
## 0.0004718723
## yelp_tag_Professional.Sports.Teams
## 0.0004718723
## yelp_tag_Pubs
## 0.0004718723
## yelp_tag_Ramen
## 0.0004718723
## yelp_tag_Restaurants
## 0.0004718723
## yelp_tag_Salad
## 0.0004718723
## yelp_tag_Sandwiches
## 0.0004718723
## yelp_tag_Seafood
## 0.0004718723
## yelp_tag_Skin.Care
## 0.0004718723
## yelp_tag_Smokehouse
## 0.0004718723
## yelp_tag_Soul.Food
## 0.0004718723
## yelp_tag_Soup
## 0.0004718723
## yelp_tag_Southern
## 0.0004718723
## yelp_tag_Spanish
## 0.0004718723
## yelp_tag_Speakeasies
## 0.0004718723
## yelp_tag_Specialty.Food
## 0.0004718723
## yelp_tag_Sports.Bars
## 0.0004718723
## yelp_tag_Stadiums...Arenas
## 0.0004718723
## yelp_tag_Steakhouses
## 0.0004718723
## yelp_tag_Street.Vendors
## 0.0004718723
## yelp_tag_Sushi.Bars
## 0.0004718723
## yelp_tag_Tacos
## 0.0004718723
## yelp_tag_Taiwanese
## 0.0004718723
## yelp_tag_Tapas.Bars
## 0.0004718723
## yelp_tag_Tapas.Small.Plates
## 0.0004718723
## yelp_tag_Tex.Mex
## 0.0004718723
## yelp_tag_Thai
## 0.0004718723
## yelp_tag_Tiki.Bars
## 0.0004718723
## yelp_tag_Train.Stations
## 0.0004718723
## yelp_tag_Turkish
## 0.0004718723
## yelp_tag_Used
## 0.0004718723
## yelp_tag_Vegan
## 0.0004718723
## yelp_tag_Vegetarian
## 0.0004718723
## yelp_tag_Venues...Event.Spaces
## 0.0004718723
## yelp_tag_Vietnamese
## 0.0004718723
## yelp_tag_Vintage...Consignment
## 0.0004718723
## yelp_tag_Waffles
## 0.0004718723
## yelp_tag_Whiskey.Bars
## 0.0004718723
## yelp_tag_Wholesalers
## 0.0004718723
## yelp_tag_Wine...Spirits
## 0.0004718723
## yelp_tag_Wine.Bars
## 0.0004718723
## yelp_tag_Wine.Tours
## 0.0004718723
## yelp_tag_Wineries
## 0.0004718723
## yelp_tag_Wraps
## 0.0004718723