Problem Summary

My organization wants to know which companies are similar to each other to help in identifying potential customers of a SAAS software solution (e.g. Salesforce CRM) in various segments of the market. The Sales Department is very interested in this analysis, which will help them more easily penetrate various market segments.

Solution Summary

The Analytics Department developed two unsupervised algorithm to classify companies based on how their stocks trade using their daily stock returns (percentage movement from one day to the next). This analysis will deliver value to the stakeholders to determine which companies are related to each other (competitors and have similar attributes).

\[ return_{daily} = \frac{price_{i}-price_{i-1}}{price_{i-1}} \]

# install.packages("plotly")

library(tidyverse)
library(tidyquant)
library(broom)
library(umap)
library(plotly)

Stock prices Analysis

We have stock prices for every stock in the SP 500 Index, which is the daily stock prices for over 500 stocks. The data set is over 1.2M observations.

# STOCK PRICES
sp_500_prices_tbl <- read_rds("week_6_data/sp_500_prices_tbl.rds")
sp_500_prices_tbl
# Apply your data transformation skills!
sp_500_sy018_group <- sp_500_prices_tbl %>% select("symbol", "date", "adjusted") %>%
    mutate(month = month(date, label = TRUE, abbr = TRUE), 
           day = day(date), 
           year = year(date)) %>%
    filter(year >= 2018) %>% 
    group_by(symbol)

sp_500_daily_returns_tbl <- sp_500_sy018_group %>% 
    
    # create a lag 1 day between stock prize value and remove the NA
    mutate(adjusted_lag_1 = lag(adjusted, n=1)) %>%
    ungroup() %>%
    mutate(adjusted_lag_1 = case_when(
        is.na(adjusted_lag_1) ~ adjusted, 
        TRUE ~ adjusted_lag_1)
    ) %>%
    # Compute the diff between the adjusted and the adjusted_lag
    mutate(diff_adj_1 = adjusted - adjusted_lag_1) %>%
    mutate(pct_return = diff_adj_1/adjusted_lag_1) %>%
    select(symbol, date, pct_return) 
# Output: sp_500_daily_returns_tbl

Daily returns

sp_500_daily_returns_tbl <- read_rds("week_6_data/sp_500_daily_returns_tbl.rds")
sp_500_daily_returns_tbl
kmeans_mapper <- function(center = 3) {
    stock_date_matrix_tbl %>%
        select(-symbol) %>%
        kmeans(centers = center, nstart = 20)
}
# Use purrr to map
 k_means_mapped_tbl <- tibble(centers = 1:30) %>%
    # Add a column name kmeans output of the kmeans_mapper
    mutate(k_means = map(centers, kmeans_mapper)) %>%
    # map de glance function to kmeans to expose 
    mutate(glance = map(k_means, glance))

# Output: k_means_mapped_tbl 

Skree plot : Optimum number of segments

# Visualize Scree Plot
 k_means_mapped_tbl %>% 
        unnest(glance) %>%
        select(centers, tot.withinss) %>%
        
        # Visualize
        ggplot(aes(centers, tot.withinss)) + 
        geom_point(color = "#2c3e50", size = 3) +
        geom_line(color = "#2c3e50", size = 1) + 
        ggrepel::geom_label_repel(aes(label = centers), color = "#2c3e50") + 
        
        # Formatting 
        theme_tq() + 
        labs(
            title = "Skree Plot of the SP500 company subgroups (stock prices based)", 
            subtitle = "Author : Ralph D. Tasing", 
            caption = "Conclusion: Based on the skree plot, 10 segments is enough. 
            Behond 10, we don't have too much change") +
        
        theme(legend.position = "none",
              plot.title = element_text(face = "bold", size = 14), 
              plot.caption = element_text(face = "bold.italic", size = 9))

# Convert umap results to tibble with symbols
 umap_results_tbl <- umap_results$layout %>%
        
        # convert it from matrix to tibble 
        as_tibble() %>%
        set_names(c("V1", "V2")) %>% 
        bind_cols(stock_date_matrix_tbl %>% select(symbol))
# Output: umap_results_tbl

SP500 companies Segmentation : 2D Projection

plot_cluster(k = 10)