library(dplyr)
library(ggplot2)
library(lubridate)
library(devtools)
library(lares) # install_github("laresbernardo/lares")
library(ggbiplot) # install_github("vqv/ggbiplot")
project <- "BL Jampp Application"
import random

def weighted_random(values, weights, size):
    # Sum all weights to normalize them
    total_weight = sum(weights)
    weights = [w / total_weight for w in weights]
    # Build the cumulative distribution of the weights,
    # i.e. [0.2, 0.2, 0.2, 0.2, 0.2] becomes [0.2, 0.4, 0.6, 0.8, 1.0]
    for i in range(len(weights) - 1):
        weights[i + 1] += weights[i]
    # Instead of duplicating each value in proportion to its weight,
    # we map a uniform random number onto these cumulative ranges
    res = []
    # Draw `size` values
    for j in range(size):
        # Get a fresh random number in [0, 1) each time
        rand = random.random()
        # Find the first cumulative weight that exceeds it
        for i in range(len(weights)):
            if rand < weights[i]:
                res.append(values[i])
                break
    return res

# Much easier would have been:
import numpy as np
np.random.choice(values, size, p=weights)
# Create weighted_random values: straightforward using base R
d <- sample(c(1, 2, 3),
            size = 1000, replace = TRUE,
            prob = c(0.5, 0.3, 0.2))
# As a function with values and probabilities inputs
weighted_random <- function(values, probs, n = 1000, seed = 5952) {
  set.seed(seed)          # For reproducibility (same results every run)
  sample(values,          # Values to sample from
         size = n,        # Number of samples to draw
         replace = TRUE,  # Sampling with replacement
         prob = probs)    # Probability weights
}
numbers <- weighted_random(c(1, 2, 3), c(0.5, 0.3, 0.2))
# Results verification: counts out of 1,000, shown as percentages
table(numbers) / 10
## numbers
## 1 2 3
## 50.6 30.1 19.3
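# As an extra sanity check, a quick chi-squared goodness-of-fit test of the
# observed counts against the target probabilities (a sketch using base stats):
chisq.test(table(numbers), p = c(0.5, 0.3, 0.2))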
WITH top5 AS (
  SELECT DEVICE_ID,
         SUM(EVENT_VALUE) AS TOTAL
  FROM table
  WHERE EVENT_ID = 2
  GROUP BY DEVICE_ID
  ORDER BY TOTAL DESC
  LIMIT (
    SELECT ROUND(COUNT(DISTINCT DEVICE_ID) * 0.05)
    FROM table WHERE EVENT_ID = 2))
SELECT DEVICE_ID,
       EVENT_DATE - LAG(EVENT_DATE, 1) OVER (PARTITION BY DEVICE_ID ORDER BY EVENT_DATE) AS time_delta
FROM table
WHERE DEVICE_ID IN (SELECT DEVICE_ID FROM top5)
  AND EVENT_ID = 2
# Hypothetical log events dataframe (same columns as the SQL table above)
table <- data.frame()
# Get the top 5% performing DEVICE_IDs by total purchase value
top_perf <- table %>%
  filter(EVENT_ID == 2) %>%
  group_by(DEVICE_ID) %>%
  summarise(TOTAL = sum(EVENT_VALUE, na.rm = TRUE)) %>%
  arrange(desc(TOTAL)) %>%
  slice(1:round(n() * 0.05))
# Calculate time_delta between consecutive purchases
result <- table %>%
  filter(EVENT_ID == 2, DEVICE_ID %in% top_perf$DEVICE_ID) %>%
  mutate(EVENT_DATE = as.numeric(EVENT_DATE)) %>%
  arrange(DEVICE_ID, EVENT_DATE) %>%
  group_by(DEVICE_ID) %>%
  mutate(time_delta = (EVENT_DATE - lag(EVENT_DATE)) / 3600) %>% # Seconds to hours
  filter(!is.na(time_delta))
# Visualizations:
# Density plot to check time distribution
ggplot(result, aes(x = time_delta)) + geom_density() +
  labs(title = "Timings between Purchases for Top 5% Performing Users",
       x = "Time delta [h]", y = "Density", subtitle = project)
# We could also check a boxplot to interpret quartiles and outliers
ggplot(result, aes(x = "All clients", y = time_delta)) + geom_boxplot() +
labs(title = "Timings between Purchases for Top 5% Performing Users",
x = "", y = "Time delta [h]", subtitle = project)
# To check each of these users (if there are few)
ggplot(result, aes(x = DEVICE_ID, y = time_delta)) +
  geom_boxplot() + coord_flip() +
  labs(title = "Timings between Purchases for Top 5% Performing Users",
       x = "DEVICE_ID", y = "Time delta [h]", subtitle = project)
imp <- data.frame(
  ID = as.character(c(1:5)),
  VAR = c("a", "a", "b", "b", "c"),
  BID = c(0.6, 0.2, 0.3, 0.03, 0.3),
  IMP = c(1, 1, 1, 0, 1),
  CLICK = c(1, 0, 1, 0, 0),
  PRED = c(0.03, 0.001, 0.028, 0.002, 0.02))
costs <- data.frame(
  ID = as.character(c(1:5)),
  COST = c(0.5, 0.5, 0.1, 0.2, 0.2))
imp %>%
  left_join(costs, "ID") %>%
  filter(VAR != "c") %>%
  summarise(IMP = sum(IMP),
            CLICK = sum(CLICK),
            CTR = CLICK / IMP,
            PREDICTED_CTR = mean(PRED),
            COST = sum(COST))
## IMP CLICK CTR PREDICTED_CTR COST
## 1 3 2 0.6666667 0.01525 1.3
# File: https://www.indec.gob.ar/ftp/cuadros/menusuperior/enfr/ENFR2013_baseusuario.rar
# Dictionary: https://www.indec.gob.ar/ftp/cuadros/menusuperior/enfr/doc_base_usuario_enfr2013.pdf
base <- read.table("ENFR2013_baseusuario.txt", header = TRUE, sep = "|")
df <- base %>%
  select(starts_with("BIAF"), "NIVEL_ACTIVIDAD_FISICA") %>%
  replace(., is.na(.), 0) %>%
  mutate(BIAF01 = as.numeric(ifelse(BIAF01 == 8, 0, BIAF01)),
         BIAF03 = as.numeric(ifelse(BIAF03 == 8, 0, BIAF03)),
         BIAF05 = as.numeric(ifelse(BIAF05 == 8, 0, BIAF05))) %>%
  mutate(BIAF08 = as.character(BIAF08),
         BIAF09 = as.character(BIAF09)) %>%
  ohse() %>%
  replaceall(9999, NA) %>%
  mutate(across(everything(), as.numeric))
## One Hot Encoding applied to 2 variables: 'BIAF08', 'BIAF09'
# 1933 total NAs; 5.7% rows with missing values
missingness(df)
## variable missing missingness
## 1 BIAF07 1643 5.08
## 2 BIAF06 224 0.69
## 3 BIAF04 45 0.14
## 4 BIAF02 21 0.06
round(100*nrow(df[rowSums(is.na(df)) > 0,])/nrow(df),2)
## [1] 5.72
# We could simply drop the rows with NAs (approx. 6% of the data), as sketched below, or impute these values
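# A minimal sketch of the dropping alternative, using base R's complete.cases()
# (df_complete is just a hypothetical name):
df_complete <- df[complete.cases(df), ]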
imp <- impute(df)
##
## iter imp variable
## 1 1 BIAF02 BIAF04 BIAF06 BIAF07
## 1 2 BIAF02 BIAF04 BIAF06 BIAF07
## 1 3 BIAF02 BIAF04 BIAF06 BIAF07
## 1 4 BIAF02 BIAF04 BIAF06 BIAF07
## 1 5 BIAF02 BIAF04 BIAF06 BIAF07
## 2 1 BIAF02 BIAF04 BIAF06 BIAF07
## 2 2 BIAF02 BIAF04 BIAF06 BIAF07
## 2 3 BIAF02 BIAF04 BIAF06 BIAF07
## 2 4 BIAF02 BIAF04 BIAF06 BIAF07
## 2 5 BIAF02 BIAF04 BIAF06 BIAF07
## 3 1 BIAF02 BIAF04 BIAF06 BIAF07
## 3 2 BIAF02 BIAF04 BIAF06 BIAF07
## 3 3 BIAF02 BIAF04 BIAF06 BIAF07
## 3 4 BIAF02 BIAF04 BIAF06 BIAF07
## 3 5 BIAF02 BIAF04 BIAF06 BIAF07
## 4 1 BIAF02 BIAF04 BIAF06 BIAF07
## 4 2 BIAF02 BIAF04 BIAF06 BIAF07
## 4 3 BIAF02 BIAF04 BIAF06 BIAF07
## 4 4 BIAF02 BIAF04 BIAF06 BIAF07
## 4 5 BIAF02 BIAF04 BIAF06 BIAF07
## 5 1 BIAF02 BIAF04 BIAF06 BIAF07
## 5 2 BIAF02 BIAF04 BIAF06 BIAF07
## 5 3 BIAF02 BIAF04 BIAF06 BIAF07
## 5 4 BIAF02 BIAF04 BIAF06 BIAF07
## 5 5 BIAF02 BIAF04 BIAF06 BIAF07
dff <- imp
# Let's cluster!
aux <- clusterKmeans(dff)
aux$nclusters_plot
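# For reference, a minimal base-R sketch of the usual elbow heuristic (not
# necessarily what clusterKmeans() does internally; assumes dff is fully numeric):
# total within-cluster sum of squares for a range of k values
wss <- sapply(1:10, function(k) kmeans(scale(dff), centers = k, nstart = 5)$tot.withinss)
plot(1:10, wss, type = "b", xlab = "Number of clusters (k)", ylab = "Total within-cluster SS")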
# Optimal number of clusters: 8
clusters <- clusterKmeans(dff, 8)
clusters$nclusters_plot
freqs(clusters$df, cluster, plot = TRUE)
# Let's check each cluster's particular characteristics
corr_cross(clusters$df, contains = "cluster", top = 30)
# Cluster 2: didn't answer question BIAF10, don't even walk 10 minutes,
# report having no time, and show low or no physical activity at all
distr(clusters$df, BIAF10_02, cluster)
clusters$df %>% filter(cluster == "2") %>% freqs(BIAF05)
## # A tibble: 1 x 4
## BIAF05 n p pcum
## <dbl> <int> <dbl> <dbl>
## 1 0 8259 100 100
distr(clusters$df, BIAF08_1, cluster)
clusters$df %>% filter(cluster == "2") %>% freqs(NIVEL_ACTIVIDAD_FISICA)
## # A tibble: 1 x 4
## NIVEL_ACTIVIDAD_FISICA n p pcum
## <dbl> <int> <dbl> <dbl>
## 1 1 8259 100 100
# Clusters 3, 5: exercise as much as they need
distr(clusters$df, BIAF09_1, cluster)
# Clusters 4,7: Not enough time for more exercise
distr(clusters$df, BIAF09_2, cluster)
# Cluster 7: Lots of exercise, 5-7 days a week!
distr(filter(clusters$df, cluster == "7"), BIAF05)
# Clusters 8, 1: health issues and/or unwillingness
distr(clusters$df, BIAF09_6, cluster)
distr(clusters$df, BIAF09_10, cluster)
# Cluster 6: Other people, lots of variance; furthest cluster from 2 and 7
corr_var(clusters$df, cluster_6, top = 10)
# Reduce number of variables with PCA
df.pca <- prcomp(dff, center = TRUE, scale = TRUE)
# PC1 explains 22% and PC2 explains 7%
summary(df.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.6785 1.45408 1.22176 1.16275 1.15713 1.07205
## Proportion of Variance 0.2242 0.06607 0.04665 0.04225 0.04184 0.03592
## Cumulative Proportion 0.2242 0.29027 0.33692 0.37917 0.42101 0.45692
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 1.04123 1.03286 1.02951 1.01945 1.01603 1.00795
## Proportion of Variance 0.03388 0.03334 0.03312 0.03248 0.03226 0.03175
## Cumulative Proportion 0.49080 0.52414 0.55726 0.58974 0.62200 0.65375
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 1.00602 1.00375 1.00285 1.00223 1.00110 1.0008
## Proportion of Variance 0.03163 0.03149 0.03143 0.03139 0.03132 0.0313
## Cumulative Proportion 0.68538 0.71686 0.74829 0.77968 0.81100 0.8423
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 1.00054 0.99633 0.93618 0.70157 0.63464 0.56628
## Proportion of Variance 0.03128 0.03102 0.02739 0.01538 0.01259 0.01002
## Cumulative Proportion 0.87358 0.90460 0.93199 0.94737 0.95996 0.96998
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 0.52401 0.45569 0.44462 0.42638 0.30328 0.07433
## Proportion of Variance 0.00858 0.00649 0.00618 0.00568 0.00287 0.00017
## Cumulative Proportion 0.97856 0.98505 0.99123 0.99691 0.99978 0.99995
## PC31 PC32
## Standard deviation 0.03813 0.000000000000002543
## Proportion of Variance 0.00005 0.000000000000000000
## Cumulative Proportion 1.00000 1.000000000000000000
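# The explained-variance figures above can be recomputed directly from the
# standard deviations returned by prcomp():
pve <- df.pca$sdev^2 / sum(df.pca$sdev^2)
round(pve[1:2], 4) # PC1 ~ 0.2242, PC2 ~ 0.0661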
plot(df.pca)
ggbiplot(df.pca, alpha = 0.3, groups = clusters$df$cluster)
# NIVEL_ACTIVIDAD_FISICA and BIAF07 do NOT contribute to PC1
# BIAF01, 2, 3, 4 contribute to PC2
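# One way to check these contributions is to inspect prcomp()'s rotation
# matrix (the variable loadings) for PC1 and PC2:
sort(abs(df.pca$rotation[, "PC1"]))[1:5]                    # Smallest loadings on PC1
sort(abs(df.pca$rotation[, "PC2"]), decreasing = TRUE)[1:5] # Largest loadings on PC2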
library(rtweet)
values <- c("python", "data science")
p <- get_tweets(values)
dim(p) # Right now: 11369 tweets, 90 columns
colnames(p)
# This script could be run by a cron job, e.g. every minute (* * * * *):
# Rscript /dir/mytwitterscript.R
# Each run's results could be saved and read back with:
# saveRDS(p, paste0("twitterfeed_", Sys.time(), ".Rds"))
# readRDS("nameofthefile.Rds")
# nrow(p) # Number of tweets
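# A minimal sketch of what /dir/mytwitterscript.R could contain (hypothetical
# path and file-name pattern; uses the same get_tweets() helper as above):
# library(rtweet)
# p <- get_tweets(c("python", "data science"))
# saveRDS(p, paste0("twitterfeed_", format(Sys.time(), "%Y%m%d_%H%M%S"), ".Rds"))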
# For multiple files you could:
files <- paste0("file_",0:59,".Rds") # 60 files
for (i in 1:60) {
if (i == 1) out <- c()
readRDS(files)
out <- distinct(rbind(out, p), text, .keep_all = TRUE)
}
# Plot the per-minute density of tweets for the given terms:
out %>%
  mutate(minute = minute(created_at)) %>%
  ggplot(aes(x = minute)) +
  geom_density(fill = "purple", alpha = 0.9) +
  labs(title = "Tweets per minute density", subtitle = project)
# A REST API service can be created easily with the plumber package:
# plumber.R
#* Return tweets matching the given search terms
#* @param values Search terms to filter tweets by
#* @get /get_tweets
function(values = "python") {
  library(rtweet)
  p <- get_tweets(values) # Same helper used above
  return(p)
}
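# To serve this API locally (a minimal sketch; the port is arbitrary):
# library(plumber)
# plumb("plumber.R")$run(port = 8000)
# Example request: curl "http://localhost:8000/get_tweets?values=python"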