The dataset provided has 5 fields (activity, target, day, city and count) describing user activity on several kinds of products in diverse categories. The task at hand is to identify the top trending targets/products for each category.

#Loading necessary libraries
library(ggplot2)
library(dplyr)
library(lubridate)
# Directory that holds trendz.csv. It can be overridden with the
# TRENDS_DATA_DIR environment variable; the default is the path used by the
# original analysis. The full path is built with file.path() so that the
# session's working directory is never changed with setwd().
data_dir <- Sys.getenv(
  "TRENDS_DATA_DIR",
  unset = "C:/Users/ankit/Desktop/Pyprojects/Trends-Assignment.tar"
)
# A single space, an empty string, 'NA' and 'NaN' are all read as missing values
trend <- read.csv(file.path(data_dir, "trendz.csv"),
                  na.strings = c(' ', '', 'NA', 'NaN'),
                  stringsAsFactors = FALSE)

Since we are dealing with just 9 days, I have decided to refer to them as Days (1 to 9) for the purpose of simplicity.

# Parse the timestamp strings, keep the day of the month and subtract 15 to
# obtain the day index used in the rest of the analysis
trend$day <- day(ymd_hms(trend$day)) - 15
# Preview the first rows of the re-indexed data
head(trend, n = 6L)

We need to also remove the rows that have missing target name since they do not add much value to our analysis

# Row numbers whose target is missing
ind <- which(is.na(trend$target))
# Drop those rows only when there are any: trend[-ind, ] with an empty 'ind'
# selects zero rows and would silently discard the whole dataset
if (length(ind) > 0) {
  trend <- trend[-ind, ]
}
summary(trend)
   activity             target            day            city               count        
 Length:963548      Min.   :     0   Min.   :1.000   Length:963548      Min.   :   1.00  
 Class :character   1st Qu.:134485   1st Qu.:3.000   Class :character   1st Qu.:   1.00  
 Mode  :character   Median :232545   Median :5.000   Mode  :character   Median :   1.00  
                    Mean   :210099   Mean   :5.001                      Mean   :   3.14  
                    3rd Qu.:314840   3rd Qu.:7.000                      3rd Qu.:   2.00  
                    Max.   :715573   Max.   :9.000                      Max.   :1137.00  

Next, we need to separate the entities that can be added to cart from the others. The ‘viewed sku’, ‘viewed drug’ and ‘viewed OTC’ categories have targets in common with the ‘added to cart’ category. Hence, these are the 3 categories for which we can calculate conversion rates and define a data-driven metric for comparing trends.

For the other 2 categories - ‘viewed article’ & ‘viewed OTC categories’ - the outcome that can be used to determine the trend is the number of views itself.

First, we will work out a function that determines the top trends for the last 2 categories

#Function for determining the top trending entities within 'viewed article' and 'viewed OTC categories'
#
# Arguments:
#   act       - activity to analyse; compared against the 'activity' column
#   df        - data frame with the columns activity, target, day, city, count
#   name_city - value of the 'city' column to restrict to, or 'All' to use
#               every city
# Returns:
#   a ggplot object with one smoothed Total_Count-vs-day curve for each of the
#   (at most) 5 targets whose daily counts have the highest Kendall
#   correlation with day. Stops with an error when no target passes the
#   popularity filters.
top_trend_finder_view_cor <- function(act, df, name_city){
  # filtering dataframe on the basis of city input & type of activity
  # (!! injects the argument values, so they cannot be mistaken for columns)
  z <- df %>% filter(activity == !!act)
  if (name_city != 'All') {
    z <- z %>% filter(city == !!name_city)
  }

  # total count per target and day
  z <- z %>%
    group_by(target, day) %>%
    summarise(Total_Count = sum(count)) %>%
    ungroup() %>%
    arrange(target, day)

  # filtering out unpopular targets that might creep in to the top categories
  # without having a relevant count, also filtering out time series with less
  # than 3 entries/observations
  d <- z %>%
    group_by(target) %>%
    summarise(series_size = n(), sum.target = sum(Total_Count)) %>%
    filter(series_size > 2, sum.target > 30)

  z <- z %>% filter(target %in% d$target)
  a <- unique(z$target)
  if (length(a) == 0) {
    stop("no target of activity '", act, "' (city: '", name_city,
         "') has more than 2 days of data and more than 30 views",
         call. = FALSE)
  }

  # calculate correlation with time for each target
  cor_estimate <- vapply(a, function(tgt) {
    b <- z %>% filter(target == !!tgt)
    model <- cor.test(~Total_Count + day, data = b, method = "kendall")
    unname(model$estimate)
  }, numeric(1), USE.NAMES = FALSE)
  cor_rank <- data.frame(target = a, score = cor_estimate)

  #Extracting top 5 trends
  e <- head(arrange(cor_rank, desc(score)), n = 5)
  f <- z %>% filter(target %in% e$target)

  #Plot the top 5 trending targets for the requested activity.
  # The title is built with paste(): cat() prints to the console and returns
  # NULL, which left the plot without a title.
  ggplot(f, aes(day, Total_Count, colour = as.factor(target))) +
    scale_x_continuous(breaks = 1:9) +
    ggtitle(paste(act, "for", name_city)) +
    xlab('Day') +
    theme(
      plot.title = element_text(color = "red", size = 14,
                                face = "bold.italic", hjust = 0.5),
      axis.title.x = element_text(size = 14, face = "bold"),
      axis.title.y = element_text(size = 14, face = "bold")
    ) +
    geom_smooth(se = FALSE, size = 2)
}
# Top trending 'viewed article' targets. 'All' means no city filter; a city
# name can be passed instead to restrict the analysis to that city.
top_trend_finder_view_cor(act = 'viewed article', df = trend, name_city = 'All')
viewed article for All

# Top trending targets of the 'viewed otc categories' activity, all cities
top_trend_finder_view_cor(act = 'viewed otc categories', df = trend, name_city = 'All')
viewed otc categories for All

Turns out that only around 8 targets have added to cart value associated with it in ‘viewed drug’ and hence the more reliable metric for observation of trend is the no of views and hence we will use the view function for ‘viewed drug’ category as well.

# Top trending targets of the 'viewed drug' activity, all cities
top_trend_finder_view_cor(act = 'viewed drug', df = trend, name_city = 'All')
viewed drug for All

The next function is exclusively for the 2 categories (‘viewed sku’ & ‘viewed OTC’). These 2 categories have a large number of targets that also feature well in the ‘added to cart’ category. Hence, it would be interesting to use the conversion rate to identify the trends for these 2 categories.

library(gridExtra)
# Finds the top trending targets of a view activity by conversion rate, i.e.
# the 'added to cart' count as a percentage of the view count.
#
# Arguments:
#   act       - view activity to analyse; compared against the 'activity' column
#   df        - data frame with the columns activity, target, day, city, count
#   name_city - value of the 'city' column to restrict to, or 'All' to use
#               every city
# Returns:
#   the object returned by grid.arrange(), which also draws two stacked plots
#   for the (at most) 5 targets whose daily conversion rate has the highest
#   Kendall correlation with day: view counts on top, 'added to cart' counts
#   below. Stops with an error when no target passes the filters.
top_trend_finder_cor <- function(act, df, name_city){

  # filtering dataframe on the basis of city input. The minimum total
  # 'added to cart' count a target needs is 10 within a single city and 15
  # across all cities.
  if (name_city != 'All') {
    df <- df %>% filter(city == !!name_city)
    min_added_to_cart <- 10
  } else {
    min_added_to_cart <- 15
  }

  # daily view counts of the requested activity
  z <- df %>%
    filter(activity == !!act) %>%
    group_by(target, day) %>%
    summarise(Total_Count = sum(count)) %>%
    ungroup()

  # daily 'added to cart' counts
  a <- df %>%
    filter(activity == 'added to cart') %>%
    group_by(target, day) %>%
    summarise(Total_Added_To_Cart = sum(count)) %>%
    ungroup()

  # targets viewed on more than one day
  multi_day <- z %>%
    group_by(target) %>%
    summarise(n = n()) %>%
    filter(n > 1)

  # targets added to cart often enough overall
  cart_totals <- a %>%
    group_by(target) %>%
    summarise(Total_Added_To_Cart = sum(Total_Added_To_Cart)) %>%
    filter(Total_Added_To_Cart > min_added_to_cart)

  z <- z %>%
    filter(target %in% multi_day$target, target %in% cart_totals$target)

  #Joining the 'Added to Cart' numbers with the main view count table.
  # Days without a cart entry count as 0. Anomalies where added to cart is
  # greater than the view count (impractical) are also reset to 0.
  b <- left_join(z, a, by = c('target', 'day')) %>%
    mutate(
      Total_Added_To_Cart = replace(Total_Added_To_Cart,
                                    is.na(Total_Added_To_Cart), 0),
      Total_Added_To_Cart = replace(Total_Added_To_Cart,
                                    which(Total_Added_To_Cart > Total_Count),
                                    0),
      conversion = (Total_Added_To_Cart / Total_Count) * 100
    ) %>%
    arrange(target, day)

  e <- unique(b$target)
  if (length(e) == 0) {
    stop("no target of activity '", act, "' (city: '", name_city,
         "') has more than 1 day of views and more than ", min_added_to_cart,
         " additions to cart", call. = FALSE)
  }

  #Estimating correlation of the conversion rate for each target with time
  cor_estimate <- vapply(e, function(tgt) {
    d <- b %>% filter(target == !!tgt)
    model <- cor.test(~conversion + day, data = d, method = "kendall")
    unname(model$estimate)
  }, numeric(1), USE.NAMES = FALSE)
  cor_rank <- data.frame(target = e, score = cor_estimate)

  # keep the 5 targets with the highest correlation
  g <- head(arrange(cor_rank, desc(score)), n = 5)
  f <- b %>% filter(target %in% g$target)

  #Plot the viewed and added to cart activity in separate plots
  title_theme <- theme(
    plot.title = element_text(color = "red", size = 14,
                              face = "bold.italic", hjust = 0.5),
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold")
  )
  num_plot <- ggplot(f, aes(day, Total_Count, colour = as.factor(target))) +
    scale_x_continuous(breaks = 1:9) +
    ggtitle(act) +
    title_theme +
    geom_smooth(se = FALSE, size = 1.5)
  num_plot_2 <- ggplot(f, aes(day, Total_Added_To_Cart,
                              colour = as.factor(target))) +
    scale_x_continuous(breaks = 1:9) +
    ggtitle(act) +
    title_theme +
    geom_smooth(se = FALSE, size = 1.5)

  finplot <- grid.arrange(num_plot, num_plot_2, nrow = 2)
  finplot
}
# Conversion-rate trends of the 'viewed OTC' activity in New Delhi
top_trend_finder_cor(act = 'viewed OTC', df = trend, name_city = 'New Delhi')
TableGrob (2 x 1) "arrange": 2 grobs
  z     cells    name           grob
1 1 (1-1,1-1) arrange gtable[layout]
2 2 (2-2,1-1) arrange gtable[layout]

# Conversion-rate trends of the 'viewed sku' activity in New Delhi
top_trend_finder_cor(act = 'viewed sku', df = trend, name_city = 'New Delhi')
TableGrob (2 x 1) "arrange": 2 grobs
  z     cells    name           grob
1 1 (1-1,1-1) arrange gtable[layout]
2 2 (2-2,1-1) arrange gtable[layout]

The dataset could be really interesting with a longer time series and trends observed would be more authentic. The correlation function would still provide a good estimate for the trend strength comparison.

Also, more historical data could provide a metric, such as the historical mean of count, that can be used for weeding out unpopular products/items from the analysis.

---
title: "1MG Trends Assignment"
output: html_notebook
---

The dataset provided has 5 fields (activity, target, day, city and count) describing user activity on several kinds of products in diverse categories. The task at hand is to identify the top trending targets/products for each category.

```{r}
#Loading necessary libraries
library(ggplot2)
library(dplyr)
library(lubridate)
# Directory that holds trendz.csv. It can be overridden with the
# TRENDS_DATA_DIR environment variable; the default is the path used by the
# original analysis. The full path is built with file.path() so that the
# session's working directory is never changed with setwd().
data_dir <- Sys.getenv(
  "TRENDS_DATA_DIR",
  unset = "C:/Users/ankit/Desktop/Pyprojects/Trends-Assignment.tar"
)
# A single space, an empty string, 'NA' and 'NaN' are all read as missing values
trend <- read.csv(file.path(data_dir, "trendz.csv"),
                  na.strings = c(' ', '', 'NA', 'NaN'),
                  stringsAsFactors = FALSE)
```
Since we are dealing with just 9 days, I have decided to refer to them as Days (1 to 9) for the purpose of simplicity.

```{r}
# Parse the timestamp strings, keep the day of the month and subtract 15 to
# obtain the day index used in the rest of the analysis
trend$day <- day(ymd_hms(trend$day)) - 15
# Preview the first rows of the re-indexed data
head(trend, n = 6L)
```

We need to also remove the rows that have missing target name since they do not add much value to our analysis

```{r}
# Row numbers whose target is missing
ind <- which(is.na(trend$target))
# Drop those rows only when there are any: trend[-ind, ] with an empty 'ind'
# selects zero rows and would silently discard the whole dataset
if (length(ind) > 0) {
  trend <- trend[-ind, ]
}
summary(trend)
```

Next, we need to separate the entities that can be added to cart from the others. The 'viewed sku', 'viewed drug' and 'viewed OTC' categories have targets in common with the 'added to cart' category. Hence, these are the 3 categories for which we can calculate conversion rates and define a data-driven metric for comparing trends.

For the other 2 categories - 'viewed article' & 'viewed OTC categories' - the outcome that can be used to determine the trend is the number of views itself.

First, we will work out a function that determines the top trends for the last 2 categories


```{r}
#Function for determining the top trending entities within 'viewed article' and 'viewed OTC categories'
#
# Arguments:
#   act       - activity to analyse; compared against the 'activity' column
#   df        - data frame with the columns activity, target, day, city, count
#   name_city - value of the 'city' column to restrict to, or 'All' to use
#               every city
# Returns:
#   a ggplot object with one smoothed Total_Count-vs-day curve for each of the
#   (at most) 5 targets whose daily counts have the highest Kendall
#   correlation with day. Stops with an error when no target passes the
#   popularity filters.
top_trend_finder_view_cor <- function(act, df, name_city){
  # filtering dataframe on the basis of city input & type of activity
  # (!! injects the argument values, so they cannot be mistaken for columns)
  z <- df %>% filter(activity == !!act)
  if (name_city != 'All') {
    z <- z %>% filter(city == !!name_city)
  }

  # total count per target and day
  z <- z %>%
    group_by(target, day) %>%
    summarise(Total_Count = sum(count)) %>%
    ungroup() %>%
    arrange(target, day)

  # filtering out unpopular targets that might creep in to the top categories
  # without having a relevant count, also filtering out time series with less
  # than 3 entries/observations
  d <- z %>%
    group_by(target) %>%
    summarise(series_size = n(), sum.target = sum(Total_Count)) %>%
    filter(series_size > 2, sum.target > 30)

  z <- z %>% filter(target %in% d$target)
  a <- unique(z$target)
  if (length(a) == 0) {
    stop("no target of activity '", act, "' (city: '", name_city,
         "') has more than 2 days of data and more than 30 views",
         call. = FALSE)
  }

  # calculate correlation with time for each target
  cor_estimate <- vapply(a, function(tgt) {
    b <- z %>% filter(target == !!tgt)
    model <- cor.test(~Total_Count + day, data = b, method = "kendall")
    unname(model$estimate)
  }, numeric(1), USE.NAMES = FALSE)
  cor_rank <- data.frame(target = a, score = cor_estimate)

  #Extracting top 5 trends
  e <- head(arrange(cor_rank, desc(score)), n = 5)
  f <- z %>% filter(target %in% e$target)

  #Plot the top 5 trending targets for the requested activity.
  # The title is built with paste(): cat() prints to the console and returns
  # NULL, which left the plot without a title.
  ggplot(f, aes(day, Total_Count, colour = as.factor(target))) +
    scale_x_continuous(breaks = 1:9) +
    ggtitle(paste(act, "for", name_city)) +
    xlab('Day') +
    theme(
      plot.title = element_text(color = "red", size = 14,
                                face = "bold.italic", hjust = 0.5),
      axis.title.x = element_text(size = 14, face = "bold"),
      axis.title.y = element_text(size = 14, face = "bold")
    ) +
    geom_smooth(se = FALSE, size = 2)
}
```

```{r warning=FALSE,message=FALSE}
# Top trending 'viewed article' targets. 'All' means no city filter; a city
# name can be passed instead to restrict the analysis to that city.
top_trend_finder_view_cor(act = 'viewed article', df = trend, name_city = 'All')

```

```{r warning=FALSE,message=FALSE}
# Top trending targets of the 'viewed otc categories' activity, all cities
top_trend_finder_view_cor(act = 'viewed otc categories', df = trend, name_city = 'All')
```
Turns out that only around 8 targets have added to cart value associated with it in 'viewed drug' and hence the more reliable metric for observation of trend is the no of views and hence we will use the view function for 'viewed drug' category as well.

```{r warning=FALSE,message=FALSE}
# Top trending targets of the 'viewed drug' activity, all cities
top_trend_finder_view_cor(act = 'viewed drug', df = trend, name_city = 'All')
```

The next function is exclusively for the 2 categories ('viewed sku' & 'viewed OTC'). These 2 categories have a large number of targets that also feature well in the 'added to cart' category. Hence, it would be interesting to use the conversion rate to identify the trends for these 2 categories.

```{r}
library(gridExtra)
# Finds the top trending targets of a view activity by conversion rate, i.e.
# the 'added to cart' count as a percentage of the view count.
#
# Arguments:
#   act       - view activity to analyse; compared against the 'activity' column
#   df        - data frame with the columns activity, target, day, city, count
#   name_city - value of the 'city' column to restrict to, or 'All' to use
#               every city
# Returns:
#   the object returned by grid.arrange(), which also draws two stacked plots
#   for the (at most) 5 targets whose daily conversion rate has the highest
#   Kendall correlation with day: view counts on top, 'added to cart' counts
#   below. Stops with an error when no target passes the filters.
top_trend_finder_cor <- function(act, df, name_city){

  # filtering dataframe on the basis of city input. The minimum total
  # 'added to cart' count a target needs is 10 within a single city and 15
  # across all cities.
  if (name_city != 'All') {
    df <- df %>% filter(city == !!name_city)
    min_added_to_cart <- 10
  } else {
    min_added_to_cart <- 15
  }

  # daily view counts of the requested activity
  z <- df %>%
    filter(activity == !!act) %>%
    group_by(target, day) %>%
    summarise(Total_Count = sum(count)) %>%
    ungroup()

  # daily 'added to cart' counts
  a <- df %>%
    filter(activity == 'added to cart') %>%
    group_by(target, day) %>%
    summarise(Total_Added_To_Cart = sum(count)) %>%
    ungroup()

  # targets viewed on more than one day
  multi_day <- z %>%
    group_by(target) %>%
    summarise(n = n()) %>%
    filter(n > 1)

  # targets added to cart often enough overall
  cart_totals <- a %>%
    group_by(target) %>%
    summarise(Total_Added_To_Cart = sum(Total_Added_To_Cart)) %>%
    filter(Total_Added_To_Cart > min_added_to_cart)

  z <- z %>%
    filter(target %in% multi_day$target, target %in% cart_totals$target)

  #Joining the 'Added to Cart' numbers with the main view count table.
  # Days without a cart entry count as 0. Anomalies where added to cart is
  # greater than the view count (impractical) are also reset to 0.
  b <- left_join(z, a, by = c('target', 'day')) %>%
    mutate(
      Total_Added_To_Cart = replace(Total_Added_To_Cart,
                                    is.na(Total_Added_To_Cart), 0),
      Total_Added_To_Cart = replace(Total_Added_To_Cart,
                                    which(Total_Added_To_Cart > Total_Count),
                                    0),
      conversion = (Total_Added_To_Cart / Total_Count) * 100
    ) %>%
    arrange(target, day)

  e <- unique(b$target)
  if (length(e) == 0) {
    stop("no target of activity '", act, "' (city: '", name_city,
         "') has more than 1 day of views and more than ", min_added_to_cart,
         " additions to cart", call. = FALSE)
  }

  #Estimating correlation of the conversion rate for each target with time
  cor_estimate <- vapply(e, function(tgt) {
    d <- b %>% filter(target == !!tgt)
    model <- cor.test(~conversion + day, data = d, method = "kendall")
    unname(model$estimate)
  }, numeric(1), USE.NAMES = FALSE)
  cor_rank <- data.frame(target = e, score = cor_estimate)

  # keep the 5 targets with the highest correlation
  g <- head(arrange(cor_rank, desc(score)), n = 5)
  f <- b %>% filter(target %in% g$target)

  #Plot the viewed and added to cart activity in separate plots
  title_theme <- theme(
    plot.title = element_text(color = "red", size = 14,
                              face = "bold.italic", hjust = 0.5),
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold")
  )
  num_plot <- ggplot(f, aes(day, Total_Count, colour = as.factor(target))) +
    scale_x_continuous(breaks = 1:9) +
    ggtitle(act) +
    title_theme +
    geom_smooth(se = FALSE, size = 1.5)
  num_plot_2 <- ggplot(f, aes(day, Total_Added_To_Cart,
                              colour = as.factor(target))) +
    scale_x_continuous(breaks = 1:9) +
    ggtitle(act) +
    title_theme +
    geom_smooth(se = FALSE, size = 1.5)

  finplot <- grid.arrange(num_plot, num_plot_2, nrow = 2)
  finplot
}
```


```{r warning=FALSE,message=FALSE}
# Conversion-rate trends of the 'viewed OTC' activity in New Delhi
top_trend_finder_cor(act = 'viewed OTC', df = trend, name_city = 'New Delhi')
```

```{r warning=FALSE,message=FALSE}
# Conversion-rate trends of the 'viewed sku' activity in New Delhi
top_trend_finder_cor(act = 'viewed sku', df = trend, name_city = 'New Delhi')
```

The dataset could be really interesting with a longer time series and trends observed would be more authentic. The correlation function would still provide a good estimate for the trend strength comparison.

Also, more historical data could provide a metric, such as the historical mean of count, that can be used for weeding out unpopular products/items from the analysis.

