library(dplyr)
library(ggplot2)
library(lubridate)
library(devtools)
library(lares) # install_github("laresbernardo/lares")
library(ggbiplot) # install_github("vqv/ggbiplot")
project <- "BL Jampp Application"
import random

def weighted_random(values, weights, size):
    # Sum all weights to normalize them
    total_weight = sum(weights)
    weights = [w / total_weight for w in weights]
    # Build the cumulative distribution of the weights,
    # i.e. [0.2, 0.2, 0.2, 0.2, 0.2] becomes [0.2, 0.4, 0.6, 0.8, 1.0]
    for i in range(len(weights) - 1):
        weights[i + 1] += weights[i]
    # Instead of duplicating each value in proportion to its weight,
    # we map a uniform random number onto these cumulative ranges
    res = []
    # Draw `size` values
    for j in range(size):
        # Get a fresh random number in [0, 1) each time
        rand = random.random()
        # Find the first cumulative weight that exceeds it
        for i in range(len(weights)):
            if rand < weights[i]:
                res.append(values[i])
                break
    return res

# Much easier would have been:
import numpy as np
np.random.choice(values, size, p=weights)
# Create weighted_random values: straightforward using base R
d <- sample(c(1, 2, 3),
            size = 1000, replace = TRUE,
            prob = c(0.5, 0.3, 0.2))
# As a function with values and probabilities inputs
weighted_random <- function(values, probs, n = 1000, seed = 5952) {
  set.seed(seed)          # For reproducibility (same results every run)
  sample(values,          # Values to sample from
         size = n,        # Number of samples to draw
         replace = TRUE,  # Sampling with replacement
         prob = probs)    # Probability weights
}
numbers <- weighted_random(c(1, 2, 3), c(0.5, 0.3, 0.2))
# Results verification: counts out of 1,000, shown as percentages
table(numbers) / 10
## numbers
## 1 2 3
## 50.6 30.1 19.3
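# As an extra sanity check, a quick chi-squared goodness-of-fit test of the
# observed counts against the target probabilities (a sketch using base stats):
chisq.test(table(numbers), p = c(0.5, 0.3, 0.2))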
WITH top5 AS (
  SELECT DEVICE_ID,
         SUM(EVENT_VALUE) AS TOTAL
  FROM table
  WHERE EVENT_ID = 2
  GROUP BY DEVICE_ID
  ORDER BY TOTAL DESC
  LIMIT (
    SELECT ROUND(COUNT(DISTINCT DEVICE_ID) * 0.05)
    FROM table WHERE EVENT_ID = 2))
SELECT DEVICE_ID,
       EVENT_DATE - LAG(EVENT_DATE, 1) OVER (PARTITION BY DEVICE_ID ORDER BY EVENT_DATE) AS time_delta
FROM table
WHERE DEVICE_ID IN (SELECT DEVICE_ID FROM top5)
  AND EVENT_ID = 2
# Hypothetical log events dataframe (same columns as the SQL table above)
table <- data.frame()
# Get the top 5% performing DEVICE_IDs by total purchase value
top_perf <- table %>%
  filter(EVENT_ID == 2) %>%
  group_by(DEVICE_ID) %>%
  summarise(TOTAL = sum(EVENT_VALUE, na.rm = TRUE)) %>%
  arrange(desc(TOTAL)) %>%
  slice(1:round(n() * 0.05))
# Calculate time_delta between consecutive purchases
result <- table %>%
  filter(EVENT_ID == 2, DEVICE_ID %in% top_perf$DEVICE_ID) %>%
  mutate(EVENT_DATE = as.numeric(EVENT_DATE)) %>%
  arrange(DEVICE_ID, EVENT_DATE) %>%
  group_by(DEVICE_ID) %>%
  mutate(time_delta = (EVENT_DATE - lag(EVENT_DATE)) / 3600) %>% # Seconds to hours
  filter(!is.na(time_delta))
# Visualizations:
# Density plot to check time distribution
ggplot(result, aes(x = time_delta)) + geom_density() +
  labs(title = "Timings between Purchases for Top 5% Performing Users",
       x = "Time delta [h]", y = "Density", subtitle = project)
# We could also check a boxplot to interpret quartiles and outliers
ggplot(result, aes(x = "All clients", y = time_delta)) + geom_boxplot() +
labs(title = "Timings between Purchases for Top 5% Performing Users",
x = "", y = "Time delta [h]", subtitle = project)
# To check each of these users (if there are few)
ggplot(result, aes(x = DEVICE_ID, y = time_delta)) +
  geom_boxplot() + coord_flip() +
  labs(title = "Timings between Purchases for Top 5% Performing Users",
       x = "DEVICE_ID", y = "Time delta [h]", subtitle = project)
imp <- data.frame(
  ID = as.character(c(1:5)),
  VAR = c("a", "a", "b", "b", "c"),
  BID = c(0.6, 0.2, 0.3, 0.03, 0.3),
  IMP = c(1, 1, 1, 0, 1),
  CLICK = c(1, 0, 1, 0, 0),
  PRED = c(0.03, 0.001, 0.028, 0.002, 0.02))
costs <- data.frame(
  ID = as.character(c(1:5)),
  COST = c(0.5, 0.5, 0.1, 0.2, 0.2))
imp %>%
  left_join(costs, "ID") %>%
  filter(VAR != "c") %>%
  summarise(IMP = sum(IMP),
            CLICK = sum(CLICK),
            CTR = CLICK / IMP,
            PREDICTED_CTR = mean(PRED),
            COST = sum(COST))
## IMP CLICK CTR PREDICTED_CTR COST
## 1 3 2 0.6666667 0.01525 1.3
# File: https://www.indec.gob.ar/ftp/cuadros/menusuperior/enfr/ENFR2013_baseusuario.rar
# Dictionary: https://www.indec.gob.ar/ftp/cuadros/menusuperior/enfr/doc_base_usuario_enfr2013.pdf
base <- read.table("ENFR2013_baseusuario.txt", header = TRUE, sep = "|")
df <- base %>%
  select(starts_with("BIAF"), "NIVEL_ACTIVIDAD_FISICA") %>%
  replace(., is.na(.), 0) %>%
  mutate(BIAF01 = as.numeric(ifelse(BIAF01 == 8, 0, BIAF01)),
         BIAF03 = as.numeric(ifelse(BIAF03 == 8, 0, BIAF03)),
         BIAF05 = as.numeric(ifelse(BIAF05 == 8, 0, BIAF05))) %>%
  mutate(BIAF08 = as.character(BIAF08),
         BIAF09 = as.character(BIAF09)) %>%
  ohse() %>%
  replaceall(9999, NA) %>%
  mutate(across(everything(), as.numeric))
## One Hot Encoding applied to 2 variables: 'BIAF08', 'BIAF09'
# 1933 total NAs; 5.7% rows with missing values
missingness(df)
## variable missing missingness
## 1 BIAF07 1643 5.08
## 2 BIAF06 224 0.69
## 3 BIAF04 45 0.14
## 4 BIAF02 21 0.06
round(100*nrow(df[rowSums(is.na(df)) > 0,])/nrow(df),2)
## [1] 5.72
# We could simply drop the rows with NAs (approx. 6% of the data), as sketched below, or impute these values
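# A minimal sketch of the dropping alternative, using base R's complete.cases()
# (df_complete is just a hypothetical name):
df_complete <- df[complete.cases(df), ]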
imp <- impute(df)
##
## iter imp variable
## 1 1 BIAF02 BIAF04 BIAF06 BIAF07
## 1 2 BIAF02 BIAF04 BIAF06 BIAF07
## 1 3 BIAF02 BIAF04 BIAF06 BIAF07
## 1 4 BIAF02 BIAF04 BIAF06 BIAF07
## 1 5 BIAF02 BIAF04 BIAF06 BIAF07
## 2 1 BIAF02 BIAF04 BIAF06 BIAF07
## 2 2 BIAF02 BIAF04 BIAF06 BIAF07
## 2 3 BIAF02 BIAF04 BIAF06 BIAF07
## 2 4 BIAF02 BIAF04 BIAF06 BIAF07
## 2 5 BIAF02 BIAF04 BIAF06 BIAF07
## 3 1 BIAF02 BIAF04 BIAF06 BIAF07
## 3 2 BIAF02 BIAF04 BIAF06 BIAF07
## 3 3 BIAF02 BIAF04 BIAF06 BIAF07
## 3 4 BIAF02 BIAF04 BIAF06 BIAF07
## 3 5 BIAF02 BIAF04 BIAF06 BIAF07
## 4 1 BIAF02 BIAF04 BIAF06 BIAF07
## 4 2 BIAF02 BIAF04 BIAF06 BIAF07
## 4 3 BIAF02 BIAF04 BIAF06 BIAF07
## 4 4 BIAF02 BIAF04 BIAF06 BIAF07
## 4 5 BIAF02 BIAF04 BIAF06 BIAF07
## 5 1 BIAF02 BIAF04 BIAF06 BIAF07
## 5 2 BIAF02 BIAF04 BIAF06 BIAF07
## 5 3 BIAF02 BIAF04 BIAF06 BIAF07
## 5 4 BIAF02 BIAF04 BIAF06 BIAF07
## 5 5 BIAF02 BIAF04 BIAF06 BIAF07
dff <- imp
# Let's cluster!
aux <- clusterKmeans(dff)
aux$nclusters_plot
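# For reference, a minimal base-R sketch of the usual elbow heuristic (not
# necessarily what clusterKmeans() does internally; assumes dff is fully numeric):
# total within-cluster sum of squares for a range of k values
wss <- sapply(1:10, function(k) kmeans(scale(dff), centers = k, nstart = 5)$tot.withinss)
plot(1:10, wss, type = "b", xlab = "Number of clusters (k)", ylab = "Total within-cluster SS")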
# Optimal number of clusters: 8
clusters <- clusterKmeans(dff, 8)
clusters$nclusters_plot
freqs(clusters$df, cluster, plot = TRUE)
# Let's check each cluster's particular characteristics
corr_cross(clusters$df, contains = "cluster", top = 30)
# Cluster 2: didn't answer question BIAF10, don't even walk 10 minutes,
# report having no time, and show low or no physical activity at all
distr(clusters$df, BIAF10_02, cluster)
clusters$df %>% filter(cluster == "2") %>% freqs(BIAF05)
## # A tibble: 1 x 4
## BIAF05 n p pcum
## <dbl> <int> <dbl> <dbl>
## 1 0 8259 100 100
distr(clusters$df, BIAF08_1, cluster)
clusters$df %>% filter(cluster == "2") %>% freqs(NIVEL_ACTIVIDAD_FISICA)
## # A tibble: 1 x 4
## NIVEL_ACTIVIDAD_FISICA n p pcum
## <dbl> <int> <dbl> <dbl>
## 1 1 8259 100 100
# Clusters 3, 5: exercise as much as they need
distr(clusters$df, BIAF09_1, cluster)
# Clusters 4,7: Not enough time for more exercise
distr(clusters$df, BIAF09_2, cluster)
# Cluster 7: Lots of exercise, 5-7 days a week!
distr(filter(clusters$df, cluster == "7"), BIAF05)
# Clusters 8, 1: health issues and/or unwillingness
distr(clusters$df, BIAF09_6, cluster)
distr(clusters$df, BIAF09_10, cluster)
# Cluster 6: Other people, lots of variance; furthest cluster from 2 and 7
corr_var(clusters$df, cluster_6, top = 10)
# Reduce number of variables with PCA
df.pca <- prcomp(dff, center = TRUE, scale = TRUE)
# PC1 explains 22% and PC2 explains 7%
summary(df.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.6785 1.45408 1.22176 1.16275 1.15713 1.07205
## Proportion of Variance 0.2242 0.06607 0.04665 0.04225 0.04184 0.03592
## Cumulative Proportion 0.2242 0.29027 0.33692 0.37917 0.42101 0.45692
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 1.04123 1.03286 1.02951 1.01945 1.01603 1.00795
## Proportion of Variance 0.03388 0.03334 0.03312 0.03248 0.03226 0.03175
## Cumulative Proportion 0.49080 0.52414 0.55726 0.58974 0.62200 0.65375
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 1.00602 1.00375 1.00285 1.00223 1.00110 1.0008
## Proportion of Variance 0.03163 0.03149 0.03143 0.03139 0.03132 0.0313
## Cumulative Proportion 0.68538 0.71686 0.74829 0.77968 0.81100 0.8423
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 1.00054 0.99633 0.93618 0.70157 0.63464 0.56628
## Proportion of Variance 0.03128 0.03102 0.02739 0.01538 0.01259 0.01002
## Cumulative Proportion 0.87358 0.90460 0.93199 0.94737 0.95996 0.96998
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 0.52401 0.45569 0.44462 0.42638 0.30328 0.07433
## Proportion of Variance 0.00858 0.00649 0.00618 0.00568 0.00287 0.00017
## Cumulative Proportion 0.97856 0.98505 0.99123 0.99691 0.99978 0.99995
## PC31 PC32
## Standard deviation 0.03813 0.000000000000002543
## Proportion of Variance 0.00005 0.000000000000000000
## Cumulative Proportion 1.00000 1.000000000000000000
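# The explained-variance figures above can be recomputed directly from the
# standard deviations returned by prcomp():
pve <- df.pca$sdev^2 / sum(df.pca$sdev^2)
round(pve[1:2], 4) # PC1 ~ 0.2242, PC2 ~ 0.0661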
plot(df.pca)
ggbiplot(df.pca, alpha = 0.3, groups = clusters$df$cluster)
# NIVEL_ACTIVIDAD_FISICA and BIAF07 do NOT contribute to PC1
# BIAF01, 2, 3, 4 contribute to PC2
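# One way to check these contributions is to inspect prcomp()'s rotation
# matrix (the variable loadings) for PC1 and PC2:
sort(abs(df.pca$rotation[, "PC1"]))[1:5]                    # Smallest loadings on PC1
sort(abs(df.pca$rotation[, "PC2"]), decreasing = TRUE)[1:5] # Largest loadings on PC2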
library(rtweet)
values <- c("python", "data science")
p <- get_tweets(values)
dim(p) # Right now: 11369 tweets, 90 columns
colnames(p)
# This script could be run by a cron job, e.g. every minute (* * * * *):
# Rscript /dir/mytwitterscript.R
# Each run's results could be saved and read back with:
# saveRDS(p, paste0("twitterfeed_", Sys.time(), ".Rds"))
# readRDS("nameofthefile.Rds")
# nrow(p) # Number of tweets
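# A minimal sketch of what /dir/mytwitterscript.R could contain (hypothetical
# path and file-name pattern; uses the same get_tweets() helper as above):
# library(rtweet)
# p <- get_tweets(c("python", "data science"))
# saveRDS(p, paste0("twitterfeed_", format(Sys.time(), "%Y%m%d_%H%M%S"), ".Rds"))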
# For multiple files you could:
files <- paste0("file_",0:59,".Rds") # 60 files
for (i in 1:60) {
if (i == 1) out <- c()
readRDS(files)
out <- distinct(rbind(out, p), text, .keep_all = TRUE)
}
# Plot the per-minute density of tweets for the given terms:
out %>%
  mutate(minute = minute(created_at)) %>%
  ggplot(aes(x = minute)) +
  geom_density(fill = "purple", alpha = 0.9) +
  labs(title = "Tweets per minute density", subtitle = project)
# A REST API service can be created easily with the plumber package:
# plumber.R
#* Return tweets matching the given search terms
#* @param values Search terms to filter tweets by
#* @get /get_tweets
function(values = "python") {
  library(rtweet)
  p <- get_tweets(values) # Same helper used above
  return(p)
}
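# To serve this API locally (a minimal sketch; the port is arbitrary):
# library(plumber)
# plumb("plumber.R")$run(port = 8000)
# Example request: curl "http://localhost:8000/get_tweets?values=python"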