Load all libraries

library(RCurl)
library(XML)
library(stringr)
library(dplyr)
library(tidyr)
library(pracma)
library(ggplot2)

Section 1: Extract all stock symbols of NASDAQ and NYSE

Set up the base link to the websites containing the data of stock symbols.
Stocks in the website is grouped by the initial of the companies. For example, Apple is in the ‘A’ group and the complete URL to the ‘A’ group is “https://www.advfn.com/nasdaq/nasdaq.asp?companies=A

NASDAQ_list_url <- 'https://www.advfn.com/nasdaq/nasdaq.asp?companies='
NYSE_list_url <- 'https://www.advfn.com/nyse/newyorkstockexchange.asp?companies='

Set up an element function to help extracting the URLs in a table

hrefFun <- function(x){
  xpathSApply(x,'./a',xmlAttrs)  
}

Set up a function to extract the company name, symbol, and the base URL to the detailed information (such as historical price) of every stock within the same group/page. The base URL is linked to information hosted by the same website.

get_stock_list <- function(base_link, base_group) {
  htmlLink <- getURL(str_c(base_link, base_group))
  htmlFile <- htmlParse(htmlLink)
  temp <- readHTMLTable(htmlFile, stringsAsFactors = FALSE, which = 5)
  colnames(temp) <- c("Company", "Symbol","Link")
  temp <- temp[-1,]
  temp <- temp[str_length(temp$Company) > 0,]
  
  temp2 <- readHTMLTable(htmlFile, elFun = hrefFun, stringsAsFactors = FALSE, which = 5)
  temp2 <- temp2$V1
  temp2 <- temp2[temp2 != "list()"]
  temp2 <- str_remove(temp2, "stock-price")
  temp$Link <- temp2
  
  return(temp)
}

Loop through all groups/pages and extract a complete list of all stocks in NASDAQ and NYSE

stock_list <- data.frame()
for (group_index in c(LETTERS, "0")) {
  stock_list <- rbind(stock_list, get_stock_list(NASDAQ_list_url,group_index))
}
number_of_NASDAQ <- length(stock_list$Symbol)
for (group_index in c(LETTERS, "0")) {
  stock_list <- rbind(stock_list, get_stock_list(NYSE_list_url,group_index))
}
total_of_stocks <- length(stock_list$Symbol)

Add a new variable indicating which exchange(NASDAQ or NYSE) each stock belongs to.
Finally, store the completed list in a csv file.

stock_list$exchange <- NA
stock_list$exchange[1:number_of_NASDAQ] <- "NASDAQ"
stock_list$exchange[(number_of_NASDAQ + 1):total_of_stocks] <- "NYSE"
write.csv(stock_list, "stock_list.csv", row.names = FALSE)

The finished complete list of stocks is uploaded to gitHub for easier access.

stock_list <- read.csv("https://raw.githubusercontent.com/ezaccountz/DATA_607_Final_Project/master/stock_list.csv", stringsAsFactors = FALSE)
print(str_c("The total number of companies is ", length(stock_list$Symbol)))
## [1] "The total number of companies is 4999"
head(stock_list)
##                  Company Symbol
## 1            A. Schulman   SHLM
## 2 A.c. Moore Arts Crafts   ACMR
## 3          A.d.a.m. Inc.   ADAM
## 4              Aaipharma   AAII
## 5                   Aaon   AAON
## 6                 Abaxis   ABAX
##                                                                     Link
## 1 https://ih.advfn.com/stock-market/NASDAQ/a-schulman-inc-delisted-SHLM/
## 2            https://ih.advfn.com/stock-market/NASDAQ/acm-research-ACMR/
## 3                 https://ih.advfn.com/stock-market/NASDAQ/ny-adam-ADAM/
## 4               https://ih.advfn.com/stock-market/NASDAQ/aaipharma-AAII/
## 5                    https://ih.advfn.com/stock-market/NASDAQ/aaon-AAON/
## 6     https://ih.advfn.com/stock-market/NASDAQ/abaxis-inc-delisted-ABAX/
##   exchange
## 1   NASDAQ
## 2   NASDAQ
## 3   NASDAQ
## 4   NASDAQ
## 5   NASDAQ
## 6   NASDAQ

In the same website that the stock symbols are extracted, historical stock prices are also available.
For example, the stock prices for ‘Amazon’ are listed in “https://ih.advfn.com/stock-market/NASDAQ/amazon-com-AMZN/historical/more-historical-data?current=0&Date1=01/01/18&Date2=10/31/19”.
By using the base URLs recorded in the stock list, we can constract the complete URL for each stock and extract the historical prices for all stock (if available).
The following code is used to create a blank stock price table and then fill in the daily stock price from 01/01/2018 to 10/31/2019 by looping through the information pages of every individual stocks.
The finished data is stored in a csv file.

stock_date <- seq.Date(as.Date("2018-1-1"), as.Date("2019-10-31"), "day")
stock_price <- data.frame(matrix(ncol = length(stock_list$Symbol), nrow = length(stock_date)))
colnames(stock_price) <- stock_list$Symbol
rownames(stock_price) <- stock_date


get_stock_price <- function(base_link, start_date, end_date) {
  htmlLink <- getURL(str_c(base_link,"historical/more-historical-data?current=",0,"&Date1=",start_date,"&Date2=",end_date))
  if (str_length(htmlLink) == 0) {
    return(NA)
  }
  if (str_detect(htmlLink, "Page Not Found")) {
    return(NA)
  }
  
  htmlFile <- htmlParse(htmlLink)
  
  temp <- readHTMLTable(htmlFile, stringsAsFactors = FALSE, which = 2)
  if (length(temp) == 1) return(NA)
 
  num_of_pages <- getHTMLLinks(htmlFile, xpQuery = "//a[@class='date-control date-control-right']/span[@id='last']/parent::*/@href")
  
  if (length(num_of_pages) > 0) {
    num_of_pages <- str_extract(num_of_pages[1], "current=\\d+")
    num_of_pages <- as.integer(str_remove(num_of_pages, "current=")) 
    for(i in c(1:num_of_pages)) {
      htmlLink <- getURL(str_c(base_link,"historical/more-historical-data?current=",i,"&Date1=",start_date,"&Date2=",end_date))
      htmlFile <- htmlParse(htmlLink)
      temp2 <- readHTMLTable(htmlFile, stringsAsFactors = FALSE, which = 2)
      if (length(temp2) > 1) {
        temp <- rbind(temp, temp2)
      }
    }
  }
  
  temp$Date <- as.Date(temp$Date, "%b%d%Y")
  temp <- temp[temp$Volume > 0,]
  
  if (length(temp$Date) == 0) return(NA)
  
  return(temp)
}


for (i in c(1:length(stock_list$Symbol))) {
  start_date <- "1/1/18"
  end_date <- "10/31/19"
  temp <- get_stock_price(stock_list$Link[i], start_date, end_date)
  if (!is.na(temp)) {
    for(j in c(1:length(temp$Date))) {
      stock_price[as.character(temp$Date[j]),stock_list$Symbol[i]] <- temp$Close[j]
    }
  } 
}

write.csv(stock_price, "stock_price.csv", row.names = TRUE)

The overall run time for the data collecting process is over 10 hours and the quality of the data is not acceptable. Please try not to run the code yourself. The result is uploaded to gitHub for you to review if you are interested.

stock_price <- read.csv("https://raw.githubusercontent.com/ezaccountz/DATA_607_Final_Project/master/stock_price.csv", stringsAsFactors = FALSE, row.names=1)
stock_price[1:10,1:5]
##             SHLM  ACMR ADAM AAII  AAON
## 2018-01-01    NA    NA   NA   NA    NA
## 2018-01-02 37.35 5.620   NA   NA 36.95
## 2018-01-03 38.10 5.300   NA   NA 37.60
## 2018-01-04 38.75 5.315   NA   NA 37.45
## 2018-01-05 39.10 5.510   NA   NA 38.05
## 2018-01-06    NA    NA   NA   NA    NA
## 2018-01-07    NA    NA   NA   NA    NA
## 2018-01-08 38.90 5.240   NA   NA 37.45
## 2018-01-09 37.90 5.240   NA   NA 37.10
## 2018-01-10 38.90 5.190   NA   NA 36.10

The data has the following problems:
1) The base link to the stock information is no longer valid.
2) Some data values are missing.
3) Some data values are duplicated.
4) A company may not be active for the whole period from 01/01/2018 to 10/31/2019.Some of them have junk values added.

A more reliable data source is Yahoo Finance. The ‘BatchGetSymbols’ package is a tool ready in hand to extract all information we need and you can also define a threshold to filter out all stocks with inadequate data.

library('BatchGetSymbols')

Gather all stock prices for stocks with no missing value (thresh.bad.data = 1).
Extract the daily close values and transform the data from long to wide.
Double check and remove any stock with missing value.
The run time is around 1 hour. The result is stored in a csv file and uploaded to gitHub for easier access.

stock_date <- seq.Date(as.Date("2018-1-1"), as.Date("2019-10-31"), "day")
stock_price <- data.frame(matrix(ncol = length(stock_list$Symbol), nrow = length(stock_date)))
colnames(stock_price) <- stock_list$Symbol
rownames(stock_price) <- stock_date


first_date <- as.Date("2018-1-1")
last_date <- as.Date("2019-11-1")
freq_data <- 'daily'

stock_price2 <- BatchGetSymbols(tickers = stock_list$Symbol, 
                        first.date = first_date,
                        last.date = last_date, 
                        freq.data = freq_data,
                        thresh.bad.data = 1,
                        cache.folder = file.path(tempdir(), 
                                                 'BGS_Cache') )

stock_price3 <- data.frame(Date = stock_price2$df.tickers$ref.date, 
                   Symbol = stock_price2$df.tickers$ticker,
                   Close = stock_price2$df.tickers$price.close)


stock_price3 <- spread(temp, "Symbol", "Close")
stock_price3 <- stock_price3[,apply(stock_price3, 2, function(x) !any(is.na(x)))]

write.csv(stock_price3, "stock_price_wide.csv", row.names = FALSE)
stock_price3 <- read.csv("https://raw.githubusercontent.com/ezaccountz/DATA_607_Final_Project/master/stock_price_wide.csv", stringsAsFactors = FALSE)
stock_price3$Date <- as.Date(stock_price3$Date)
print(str_c("The total number of companies is ", length(colnames(stock_price3))-1))
## [1] "The total number of companies is 2426"
stock_price3[1:10,1:5]
##          Date     A    AA AAME   AAN
## 1  2018-01-02 67.60 55.17 3.25 39.47
## 2  2018-01-03 69.32 54.50 3.35 39.67
## 3  2018-01-04 68.80 54.70 3.25 39.83
## 4  2018-01-05 69.90 54.09 3.65 39.94
## 5  2018-01-08 70.05 55.00 3.35 40.72
## 6  2018-01-09 71.77 54.20 3.40 39.80
## 7  2018-01-10 70.79 56.17 3.35 39.15
## 8  2018-01-11 70.80 56.91 3.35 41.12
## 9  2018-01-12 71.73 56.76 3.35 41.13
## 10 2018-01-16 71.23 56.24 3.30 41.40

Section 2: Calculate the similarity between every two stocks.

The similarity of two stocks are calculated by the Correlation Coefficient using the stock price from 01/01/2018 to 12/31/2018 as inputs.

similarity_table <- data.frame(matrix(ncol = length(colnames(stock_price3)) - 1, nrow = length(colnames(stock_price3)) - 1))
colnames(similarity_table) <- colnames(stock_price3)[-1]
rownames(similarity_table) <- colnames(stock_price3)[-1]

for(i in c(1:(length(similarity_table$A)-1))) {
  for (j in c(1:i)) {
    similarity_table[i+1,j] <- cor(stock_price3[1:251,i + 2],stock_price3[1:251,j + 1], method = "pearson")
    similarity_table[j,i+1] <- similarity_table[i+1,j]
  }
}
similarity_table[1:5,1:5]
##               A         AA        AAME         AAN        AAON
## A            NA  0.1152811  0.43985067  0.08619936  0.26496687
## AA   0.11528114         NA  0.50199188 -0.34331229 -0.19841405
## AAME 0.43985067  0.5019919          NA -0.17573586  0.09045917
## AAN  0.08619936 -0.3433123 -0.17573586          NA  0.58895335
## AAON 0.26496687 -0.1984140  0.09045917  0.58895335          NA

Section 3: k-means clustering.
The algorithm is based on the discription from our text book “Data Science for Business” Chapter 6, page 170-174.

First, create a new list for all available stocks. Also, calculate the annual return for later processing.

stock_list2 <- data.frame(Symbol = colnames(similarity_table), Center = NA, Annual_Return = NA, stringsAsFactors=FALSE)
temp <- data.frame(Symbol = colnames(stock_price3)[-1], start_price = t(stock_price3[1,-1]), stringsAsFactors=FALSE)
colnames(temp)[2] <- "start_price"
temp2 <- data.frame(Symbol = colnames(stock_price3)[-1], end_price = t(stock_price3[1 + 250,-1]), stringsAsFactors=FALSE)
colnames(temp2)[2] <- "end_price"

stock_list2 <- stock_list2 %>% 
  left_join(temp, by = "Symbol") %>% 
  left_join(temp2, by = "Symbol") %>% 
  mutate(Annual_Return = (end_price - start_price)/start_price) %>% 
  select(1:3)

head(stock_list2)
##   Symbol Center Annual_Return
## 1      A     NA  -0.002070991
## 2     AA     NA  -0.518216405
## 3   AAME     NA  -0.258461538
## 4    AAN     NA   0.065366049
## 5   AAON     NA  -0.051150202
## 6    AAP     NA   0.484211640

Create a function to select the initial centers for clustering. The inputs are the similarity table, the number of centers and a selected stock as a starting point.
The function sorts all stocks based their distance/similarity to the selected starting stock. The function then pick up N-1 stocks from the sorted list sequentially to be the remaining initial centers

find_initial_centers <- function (sim_table, num_center, start_symbol) {
  temp <- data.frame(symbol = colnames(sim_table), similarity = sim_table[, start_symbol], stringsAsFactors = FALSE)
  temp <- arrange(temp,desc(similarity))
  interval <- as.integer(length(temp$symbol) / num_center)
  centers <- start_symbol
  for(i in c(1:(num_center-1))) {
    centers <- c(centers, temp$symbol[i * interval])
  }
  return(data.frame(centers = centers, stringsAsFactors = FALSE))
}

First, select 50 centers starting from the stock of ‘Amazon’

start_centers <- find_initial_centers(similarity_table, 50, "AMZN")
unlist(start_centers)
##  centers1  centers2  centers3  centers4  centers5  centers6  centers7 
##    "AMZN"    "TRNS"     "UIS"    "THRM"     "CMG"    "HSKA"    "BPOP" 
##  centers8  centers9 centers10 centers11 centers12 centers13 centers14 
##    "WAYN"     "HNI"     "GBX"    "GIII"    "RCII"    "FIZZ"     "AFL" 
## centers15 centers16 centers17 centers18 centers19 centers20 centers21 
##    "AUDC"    "KTCC"    "GPRO"     "TDW"     "AIV"    "RECN"     "WEC" 
## centers22 centers23 centers24 centers25 centers26 centers27 centers28 
##    "IMMR"     "MDU"    "CZWI"    "GPOR"     "NOK"    "WSFS"    "GYRO" 
## centers29 centers30 centers31 centers32 centers33 centers34 centers35 
##     "BXG"     "CNQ"    "PBHC"     "BBW"     "HHS"     "BCO"     "WMT" 
## centers36 centers37 centers38 centers39 centers40 centers41 centers42 
##    "SSFN"    "GIGM"     "PLL"   "WACLY"     "AMX"   "RYAAY"    "ELSE" 
## centers43 centers44 centers45 centers46 centers47 centers48 centers49 
##    "EBSB"     "DLX"       "F"     "OMC"     "IAE"     "SQM"     "LFC" 
## centers50 
##    "QUIK"

Step 1: for each of the stocks, calculate the distance/similarity from the stock to each of the centers. Assign a center to the stock where the distance/similarity is the smallest

temp <- stock_list2 %>% 
  left_join(select(cbind(Symbol = rownames(similarity_table), similarity_table, stringsAsFactors=FALSE), c("Symbol", start_centers$centers)), by = "Symbol")
for (i in c(1: length(stock_list2$Symbol))) {
 if(any(is.na(temp[i,-c(1:3)]))) {
    stock_list2$Center[i] <- stock_list2$Symbol[i]
  } else {
  stock_list2$Center[i] <- colnames(temp)[which(temp[i,] == max(temp[i,-c(1:3)], na.rm = TRUE))]
  }
}
head(stock_list2)
##   Symbol Center Annual_Return
## 1      A    WMT  -0.002070991
## 2     AA    DLX  -0.518216405
## 3   AAME   QUIK  -0.258461538
## 4    AAN    UIS   0.065366049
## 5   AAON    HNI  -0.051150202
## 6    AAP   AUDC   0.484211640

Step 2: for each group of stocks (stocks with the same center), find the adjusted center within the group.
A center should be close to all members of the same group.
That is, when calculating the maximum distance from a selected stock to all other members within a group, the result for the center should be the smallest.

temp2 <- unique(stock_list2$Center)
for (i in c(1:length(temp2))) {
  temp <- stock_list2[stock_list2$Center == temp2[i],]
  temp <- similarity_table[temp$Symbol,temp$Symbol]
  
  if (length(temp) > 1) {
    temp$min <- apply(temp, 1, min, na.rm = TRUE)
    stock_list2[stock_list2$Center == temp2[i],]$Center <- rownames(temp)[which(temp$min == max(temp$min))][1]
  } 
}
adjusted_centers <- data.frame(centers = unique(stock_list2$Center), stringsAsFactors = FALSE) 
head(stock_list2)
##   Symbol Center Annual_Return
## 1      A  CMCSA  -0.002070991
## 2     AA    ARD  -0.518216405
## 3   AAME   TREE  -0.258461538
## 4    AAN  DISCA   0.065366049
## 5   AAON    SHW  -0.051150202
## 6    AAP   HURN   0.484211640

Repeat step 1 and step 2 until there is no change in the centers

while (!identical(arrange(start_centers, centers), arrange(adjusted_centers, centers))) {
  start_centers <- adjusted_centers
  
  
  temp <- stock_list2 %>% 
    left_join(select(cbind(Symbol = rownames(similarity_table), similarity_table, stringsAsFactors=FALSE), c("Symbol", start_centers$centers)), by = "Symbol")
  for (i in c(1: length(stock_list2$Symbol))) {
    if(any(is.na(temp[i,-c(1:3)]))) {
      stock_list2$Center[i] <- stock_list2$Symbol[i]
    } else {
      stock_list2$Center[i] <- colnames(temp)[which(temp[i,] == max(temp[i,-c(1:3)], na.rm = TRUE))]
    }
  }
  
  temp2 <- unique(stock_list2$Center)
  for (i in c(1:length(temp2))) {
    temp <- stock_list2[stock_list2$Center == temp2[i],]
    temp <- similarity_table[temp$Symbol,temp$Symbol]

    if (length(temp) > 1) {
      temp$min <- apply(temp, 1, min, na.rm = TRUE)
      stock_list2[stock_list2$Center == temp2[i],]$Center <- rownames(temp)[which(temp$min == max(temp$min))][1]
    }
  }
  
  adjusted_centers <- data.frame(centers = unique(stock_list2$Center), stringsAsFactors = FALSE)
}

The final centers are

unlist(adjusted_centers)
##  centers1  centers2  centers3  centers4  centers5  centers6  centers7 
##   "CMCSA"     "ARD"    "ACAD"    "DSWL"     "AVX"    "HURN"     "GBX" 
##  centers8  centers9 centers10 centers11 centers12 centers13 centers14 
##     "NBL"    "AMKR"     "BDX"    "CEVA"     "ROG"     "GFF"     "SHO" 
## centers15 centers16 centers17 centers18 centers19 centers20 centers21 
##     "RWT"    "BPOP"     "NOA"     "VTR"    "IDSA"    "PRGX"     "SIG" 
## centers22 centers23 centers24 centers25 centers26 centers27 centers28 
##     "BCO"     "UMC"     "SAP"      "AM"     "EMR"    "CZWI"    "MOFG" 
## centers29 centers30 centers31 centers32 centers33 centers34 centers35 
##     "VMC"    "SAFT"    "SHEN"    "KTCC"     "APD"     "GTS"    "NAII" 
## centers36 centers37 centers38 centers39 centers40 centers41 centers42 
##    "TGEN"    "GIGM"     "CIB"    "PRSC"     "WAT"     "WRI"     "TDW" 
## centers43 centers44 centers45 centers46 centers47 centers48 centers49 
##    "UFCS"    "CHMP"    "LAKE"     "HMY"    "GRVY"    "CREE"     "IPG" 
## centers50 
##    "RECN"

Section 3: portfolio evaluation

Select from each group, the stock with the highest annual return

stock_result <- stock_list2 %>% 
  group_by(Center) %>% 
  arrange(desc(Annual_Return)) %>% 
  filter(row_number()==1)
head(stock_result)
## # A tibble: 6 x 3
## # Groups:   Center [6]
##   Symbol Center Annual_Return
##   <chr>  <chr>          <dbl>
## 1 TNDM   HURN           13.9 
## 2 NIHD   BPOP            9.76
## 3 ARWR   SAFT            2.34
## 4 AMRN   NOA             2.21
## 5 AMSC   SHEN            1.93
## 6 BXC    PRGX            1.53

I expect that my investment to not lose value. Therefore, stocks with negative annual return are filtered out.
Extract the daiy stock price from 01/01/2018 to 12/31/2018 for each selected stock.
Create a protfolio by assigning $1,000 equally to each of the selected stocks based on their values on 01/01/2018.
Caculate the value of the profolio from 01/01/2018 to 12/31/2018

stock_portfolio_symbol <- filter(stock_result, Annual_Return >= 0)$Symbol 
stock_portfolio_price <- stock_price3[,c("Date",stock_portfolio_symbol)]
stock_portfolio_price <- stock_portfolio_price[stock_portfolio_price$Date >= as.Date("2018-1-1") & stock_portfolio_price$Date < as.Date("2019-1-1"),]
stock_portfolio_share <- data.frame(Symbol = stock_portfolio_symbol, share = NA, stringsAsFactors = FALSE)
stock_portfolio_share$share <- (1000/length(stock_portfolio_share$Symbol))/t(stock_portfolio_price[1,-1])
stock_portfolio_price$portfolio <- dot(as.matrix(t(stock_portfolio_price[,-1])), stock_portfolio_share$share)

The daily value of our protfolio from 01/01/2018 to 12/31/2018 (excluding the dates when the markets are closed)

stock_portfolio_price$portfolio
##   [1] 1000.000 1018.218 1021.548 1029.769 1061.712 1061.150 1099.154
##   [8] 1111.641 1107.487 1105.438 1112.714 1118.357 1169.038 1168.136
##  [15] 1186.338 1196.041 1183.802 1185.607 1187.363 1179.202 1170.224
##  [22] 1182.610 1157.234 1117.408 1133.795 1138.203 1109.444 1109.260
##  [29] 1121.277 1122.176 1142.513 1146.638 1153.406 1159.475 1160.029
##  [36] 1166.045 1197.945 1220.300 1210.770 1205.076 1192.828 1213.254
##  [43] 1239.188 1241.983 1251.152 1256.328 1287.145 1331.869 1334.679
##  [50] 1328.470 1334.732 1358.408 1339.217 1348.174 1363.241 1344.254
##  [57] 1340.365 1369.116 1349.338 1332.441 1363.206 1341.229 1354.111
##  [64] 1383.941 1390.566 1403.598 1402.393 1414.604 1418.884 1425.737
##  [71] 1422.499 1430.170 1458.423 1477.241 1463.412 1464.322 1446.056
##  [78] 1432.926 1434.134 1447.850 1443.358 1430.256 1440.178 1442.851
##  [85] 1438.212 1452.516 1458.285 1497.012 1516.314 1531.947 1532.898
##  [92] 1535.557 1548.777 1537.606 1534.055 1546.898 1557.595 1547.025
##  [99] 1570.221 1576.787 1563.862 1565.002 1606.637 1596.148 1611.355
## [106] 1615.425 1666.437 1678.444 1665.213 1673.623 1706.426 1760.703
## [113] 1777.190 1822.409 1817.113 1793.116 1794.371 1844.329 1817.867
## [120] 1862.597 1844.125 1851.717 1830.907 1847.301 1846.288 1845.803
## [127] 1866.517 1901.922 1916.151 1914.061 1929.526 1925.555 1955.830
## [134] 1974.520 1960.867 1975.839 1955.800 1960.070 1947.260 1997.568
## [141] 2030.626 2027.323 2056.510 1999.255 1963.172 2049.201 2101.157
## [148] 2100.208 2102.925 2096.614 2048.981 1996.749 2035.617 2045.039
## [155] 2022.931 2037.313 2002.632 2007.672 2029.192 2045.835 2083.616
## [162] 2105.517 2116.508 2115.851 2134.787 2141.604 2162.256 2178.307
## [169] 2199.178 2204.826 2143.814 2168.507 2175.155 2183.101 2183.572
## [176] 2214.407 2252.870 2198.821 2082.013 2114.846 2054.421 2154.402
## [183] 2147.419 2228.154 2297.971 2287.745 2291.020 2297.781 2221.902
## [190] 2178.565 2208.745 2160.878 2169.825 2129.112 2116.111 2016.253
## [197] 2006.619 2040.077 2045.922 2133.594 2147.275 2088.148 2099.545
## [204] 2099.932 2099.979 2002.869 2039.588 2059.468 2016.051 2044.114
## [211] 2101.451 2208.619 2269.584 2267.126 2244.901 2321.045 2298.105
## [218] 2283.990 2191.234 2060.353 1982.943 2050.143 2045.375 1963.866
## [225] 1916.787 2007.218 1989.837 2026.479 1994.322 2073.898 2109.058
## [232] 2111.647 2130.059 2013.586 2038.972 1969.763 1990.327 2005.358
## [239] 2055.335 2019.834 2005.312 1962.642 1945.710 1933.180 1872.809
## [246] 1808.532 1790.016 1915.649 1910.626 1960.567 2010.335

Since at the point of creating the profolio, the stocks are known to be the best-performing one, our portfolio should be also performing well

stock_plot <- ggplot(stock_portfolio_price, aes(x=stock_portfolio_price$Date, y=stock_portfolio_price$portfolio)) + 
  geom_line(color = "black") +
  labs(title="Portfolio value from 01/01/2018 to 12/31/2018",
       x="Date",
       y="Portfolio value")
plot(stock_plot)

Looking at the daily percent change of the stocks and the portfolio,
the standard deviation of the portfolio is indeed noticeably smaller than most of the stocks.

temp <- (stock_portfolio_price[,-1] - apply(stock_portfolio_price[,-1], 2, lag))/apply(stock_portfolio_price[,-1], 2, lag)
apply(temp, 2, sd, na.rm = TRUE)
##       TNDM       NIHD       ARWR       AMRN       AMSC        BXC 
## 0.05689750 0.08192225 0.05651674 0.20353809 0.04838987 0.06393355 
##       SSTI       SMTX        EGY       MLAB       SIGA       TZOO 
## 0.04168241 0.03125154 0.05436820 0.02258700 0.03284977 0.04835725 
##        GOL       NVAX       RELL       VRTB       ISRG       IMMR 
## 0.04602950 0.05923009 0.01907300 0.23318716 0.02099944 0.03805117 
##        CPL        NOK       NWPX       PODD       APEI        ANF 
## 0.01866348 0.01819659 0.02347094 0.02334311 0.03357011 0.03664777 
##       BDCO         BA       NATI        ABG       WILC       FORD 
## 0.10961869 0.01985344 0.02252684 0.01900089 0.01880939 0.10248450 
##        AFL         EC       PBHC       WIRE        TFX       HARL 
## 0.01275387 0.02668673 0.01299297 0.01929331 0.01742575 0.01512471 
##        HDB        LOW       AMTY       ORBT  portfolio 
## 0.01494738 0.01827962 0.08893489 0.01781543 0.01883155

Now we evaluate the performance of the portfolio, assuming that we invested $1,000 on 01/01/2019

stock_portfolio_price <- stock_price3[,c("Date",stock_portfolio_symbol)]
stock_portfolio_price <- stock_portfolio_price[stock_portfolio_price$Date >= as.Date("2019-1-1") & stock_portfolio_price$Date < as.Date("2019-11-1"),]
stock_portfolio_share <- data.frame(Symbol = stock_portfolio_symbol, share = NA, stringsAsFactors = FALSE)
stock_portfolio_share$share <- (1000/length(stock_portfolio_share$Symbol))/t(stock_portfolio_price[1,-1])
stock_portfolio_price$portfolio <- dot(as.matrix(t(stock_portfolio_price[,-1])), stock_portfolio_share$share)

The daily value of our protfolio from 01/01/2019 to 10/31/2019 (excluding the dates when the markets are closed)

stock_portfolio_price$portfolio
##   [1] 1000.000  983.964 1016.269 1026.650 1041.273 1053.520 1056.366
##   [8] 1061.279 1053.743 1069.234 1061.151 1066.734 1077.204 1059.707
##  [15] 1057.416 1065.112 1084.563 1075.387 1061.367 1075.317 1091.293
##  [22] 1089.582 1094.414 1094.986 1101.863 1090.007 1104.057 1112.152
##  [29] 1135.824 1127.646 1134.238 1141.323 1142.515 1143.892 1127.407
##  [36] 1146.657 1141.308 1141.661 1161.821 1135.450 1143.585 1126.274
##  [43] 1133.004 1115.037 1110.083 1109.124 1126.183 1127.701 1125.058
##  [50] 1118.987 1117.020 1115.477 1117.273 1112.267 1116.434 1077.641
##  [57] 1078.656 1081.647 1071.293 1077.258 1087.113 1082.096 1093.284
##  [64] 1100.075 1101.657 1104.357 1101.334 1091.655 1101.848 1095.313
##  [71] 1095.601 1100.227 1105.649 1090.024 1090.220 1091.364 1100.521
##  [78] 1092.335 1100.238 1105.921 1104.580 1096.533 1091.911 1093.818
##  [85] 1120.780 1125.470 1111.134 1114.087 1116.584 1104.708 1074.416
##  [92] 1072.180 1089.806 1096.437 1083.767 1068.729 1085.286 1072.021
##  [99] 1071.567 1086.185 1071.145 1064.060 1063.417 1043.283 1036.894
## [106] 1059.635 1056.473 1046.359 1061.315 1056.144 1050.370 1050.899
## [113] 1059.490 1045.861 1057.878 1072.477 1076.578 1094.587 1088.587
## [120] 1084.737 1076.584 1062.394 1076.660 1080.322 1084.505 1083.373
## [127] 1088.145 1100.849 1105.719 1121.865 1129.868 1129.484 1129.378
## [134] 1125.677 1121.386 1122.376 1109.239 1098.748 1097.984 1112.646
## [141] 1118.605 1092.515 1114.448 1114.179 1129.183 1113.797 1109.493
## [148] 1091.973 1064.809 1087.200 1084.705 1104.233 1097.426 1082.929
## [155] 1089.772 1067.592 1038.756 1066.922 1064.473 1061.908 1074.992
## [162] 1067.092 1046.846 1060.299 1052.737 1058.400 1071.524 1072.104
## [169] 1062.064 1067.454 1076.042 1074.519 1066.123 1076.516 1093.708
## [176] 1097.755 1088.152 1085.954 1091.987 1087.064 1087.447 1087.935
## [183] 1088.846 1077.079 1078.888 1063.937 1054.710 1064.584 1048.097
## [190] 1035.667 1069.717 1065.941 1064.858 1051.101 1056.677 1048.685
## [197] 1063.182 1061.644 1070.807 1080.215 1086.053 1093.987 1086.512
## [204] 1093.970 1094.071 1087.691 1082.762 1088.966 1091.481 1090.240
## [211] 1088.108

The return for 10 months is 8.8%.

stock_plot <- ggplot(stock_portfolio_price, aes(x=stock_portfolio_price$Date, y=stock_portfolio_price$portfolio)) + 
  geom_line(color = "black") +
  labs(title="Portfolio value from 01/01/2019 to 10/31/2019",
       x="Date",
       y="Portfolio value")
plot(stock_plot)

From the graph, we see that the portfolio is doing well at the begining but then the value is going up and down. The volatility of the stock_portfolio is still considerable.

Now let’s look at the standard deviation of the daily percent change

temp <- (stock_portfolio_price[,-1] - apply(stock_portfolio_price[,-1], 2, lag))/apply(stock_portfolio_price[,-1], 2, lag)
apply(temp, 2, sd, na.rm = TRUE)
##        TNDM        NIHD        ARWR        AMRN        AMSC         BXC 
## 0.039055544 0.036158432 0.035656576 0.039870961 0.037535471 0.033530935 
##        SSTI        SMTX         EGY        MLAB        SIGA        TZOO 
## 0.035018065 0.039803008 0.035374481 0.027911195 0.023529846 0.042821132 
##         GOL        NVAX        RELL        VRTB        ISRG        IMMR 
## 0.036007209 0.074193059 0.018988820 0.073570269 0.018682825 0.022181683 
##         CPL         NOK        NWPX        PODD        APEI         ANF 
## 0.018300024 0.025021383 0.020950182 0.026597692 0.025065997 0.037672616 
##        BDCO          BA        NATI         ABG        WILC        FORD 
## 0.055147662 0.018679273 0.015825715 0.017755731 0.021051516 0.033410598 
##         AFL          EC        PBHC        WIRE         TFX        HARL 
## 0.009549102 0.019857401 0.019423588 0.016665734 0.015800406 0.015232224 
##         HDB         LOW        AMTY        ORBT   portfolio 
## 0.015301152 0.017508008 0.156543499 0.016364838 0.010996339

The standard deviation for the portfolio is lower in the second year. However, the performance (return) is also lower.

Conclusion: stocks that are performing well in the first year may not be performing well in the second year. The model needs to be improved to select more reliable stocks. One option is to extend the input period to a few more years. Another option is to perform deeper analysis when selecting a stock(s) from each group. This will require additional complicated work, so for now it will not be included in my project.

Section 4: Create final portfolio

At the end, let’s see what the model gives up by using the daily price from 01/01/2018 to 10/31/2019 as input.
Calculate the similiarity table.

similarity_table2 <- data.frame(matrix(ncol = length(colnames(stock_price3)) - 1, nrow = length(colnames(stock_price3)) - 1))
colnames(similarity_table2) <- colnames(stock_price3)[-1]
rownames(similarity_table2) <- colnames(stock_price3)[-1]

for(i in c(1:(length(similarity_table2$A)-1))) {
  for (j in c(1:i)) {
    similarity_table2[i+1,j] <- cor(stock_price3[,i + 2],stock_price3[,j + 1], method = "pearson")
    similarity_table2[j,i+1] <- similarity_table2[i+1,j]
  }
}

similarity_table2[1:5,1:5]
##               A         AA       AAME        AAN       AAON
## A            NA -0.4864174 -0.1423822  0.4824920  0.5652028
## AA   -0.4864174         NA  0.6702864 -0.7855711 -0.7583300
## AAME -0.1423822  0.6702864         NA -0.5299671 -0.4701893
## AAN   0.4824920 -0.7855711 -0.5299671         NA  0.8526159
## AAON  0.5652028 -0.7583300 -0.4701893  0.8526159         NA

Create a new table for the clustering calculation

stock_list3 <- data.frame(Symbol = colnames(similarity_table2), Center = NA, Total_Return = NA, stringsAsFactors=FALSE)
temp <- data.frame(Symbol = colnames(stock_price3)[-1], start_price = t(stock_price3[,-1]), stringsAsFactors=FALSE)
colnames(temp)[2] <- "start_price"
temp2 <- data.frame(Symbol = colnames(stock_price3)[-1], end_price = t(stock_price3[length(stock_price3$Date),-1]), stringsAsFactors=FALSE)
colnames(temp2)[2] <- "end_price"

stock_list3 <- stock_list3 %>% 
  left_join(temp, by = "Symbol") %>% 
  left_join(temp2, by = "Symbol") %>% 
  mutate(Total_Return = (end_price - start_price)/start_price) %>% 
  select(1:3)

k-means clustering

start_centers <- find_initial_centers(similarity_table2, 50, "AMZN")

temp <- stock_list3 %>% 
  left_join(select(cbind(Symbol = rownames(similarity_table2), similarity_table2, stringsAsFactors=FALSE), c("Symbol", start_centers$centers)), by = "Symbol")
for (i in c(1: length(stock_list3$Symbol))) {
 if(any(is.na(temp[i,-c(1:3)]))) {
    stock_list3$Center[i] <- stock_list3$Symbol[i]
  } else {
  stock_list3$Center[i] <- colnames(temp)[which(temp[i,] == max(temp[i,-c(1:3)], na.rm = TRUE))]
  }
}

temp2 <- unique(stock_list3$Center)
for (i in c(1:length(temp2))) {
  temp <- stock_list3[stock_list3$Center == temp2[i],]
  temp <- similarity_table2[temp$Symbol,temp$Symbol]
  
  if (length(temp) > 1) {
    temp$min <- apply(temp, 1, min, na.rm = TRUE)
    stock_list3[stock_list3$Center == temp2[i],]$Center <- rownames(temp)[which(temp$min == max(temp$min))][1]
  } 
}
adjusted_centers <- data.frame(centers = unique(stock_list3$Center), stringsAsFactors = FALSE) 

while (!identical(arrange(start_centers, centers), arrange(adjusted_centers, centers))) {
  start_centers <- adjusted_centers
  
  
  temp <- stock_list3 %>% 
    left_join(select(cbind(Symbol = rownames(similarity_table2), similarity_table2, stringsAsFactors=FALSE), c("Symbol", start_centers$centers)), by = "Symbol")
  for (i in c(1: length(stock_list3$Symbol))) {
    if(any(is.na(temp[i,-c(1:3)]))) {
      stock_list3$Center[i] <- stock_list3$Symbol[i]
    } else {
      stock_list3$Center[i] <- colnames(temp)[which(temp[i,] == max(temp[i,-c(1:3)], na.rm = TRUE))]
    }
  }
  
  temp2 <- unique(stock_list3$Center)
  for (i in c(1:length(temp2))) {
    temp <- stock_list3[stock_list3$Center == temp2[i],]
    temp <- similarity_table2[temp$Symbol,temp$Symbol]

    if (length(temp) > 1) {
      temp$min <- apply(temp, 1, min, na.rm = TRUE)
      stock_list3[stock_list3$Center == temp2[i],]$Center <- rownames(temp)[which(temp$min == max(temp$min))][1]
    }
  }
  
  adjusted_centers <- data.frame(centers = unique(stock_list3$Center), stringsAsFactors = FALSE)
}

Generate the protfolio

stock_result <- stock_list3 %>% 
  group_by(Center) %>% 
  arrange(desc(Total_Return)) %>% 
  filter(row_number()==1)

stock_portfolio_symbol <- filter(stock_result, Total_Return >= 0)$Symbol 
stock_portfolio_price <- stock_price3[,c("Date",stock_portfolio_symbol)]
stock_portfolio_price <- stock_portfolio_price[stock_portfolio_price$Date >= as.Date("2018-1-1") & stock_portfolio_price$Date < as.Date("2019-11-1"),]
stock_portfolio_share <- data.frame(Symbol = stock_portfolio_symbol, share = NA, stringsAsFactors = FALSE)
stock_portfolio_share$share <- (1000/length(stock_portfolio_share$Symbol))/t(stock_portfolio_price[1,-1])
stock_portfolio_price$portfolio <- dot(as.matrix(t(stock_portfolio_price[,-1])), stock_portfolio_share$share)
stock_plot <- ggplot(stock_portfolio_price, aes(x=stock_portfolio_price$Date, y=stock_portfolio_price$portfolio)) + 
  geom_line(color = "black") +
  labs(title="Portfolio value from 01/01/2018 to 10/31/2019",
       x="Date",
       y="Portfolio value")
plot(stock_plot)

standard deviation of daily percent change

temp <- (stock_portfolio_price[,-1] - apply(stock_portfolio_price[,-1], 2, lag))/apply(stock_portfolio_price[,-1], 2, lag)
apply(temp, 2, sd, na.rm = TRUE)
##       TNDM       ARWR       NIHD       ATEA        BXC       PNRG 
## 0.04992622 0.04808492 0.06552064 0.06739530 0.05223830 0.03312517 
##       MERR        EGY        SKY       ACMR         AU       STAA 
## 0.48394137 0.04673050 0.04132960 0.04656917 0.02601952 0.03663223 
##       DRRX        AVP       SPEC       GLUU        MED       BEBE 
## 0.05287614 0.03445594 0.17996309 0.03671713 0.03469619 0.03863174 
##       CRNT        EEI        TAL       LQDT        ARL        SSI 
## 0.03585686 0.03160234 0.03026246 0.03234323 0.03731502 0.04458648 
##        DKS       JOBS        TSM       FOSL       MNOV       PLXS 
## 0.02479547 0.03013347 0.01667517 0.05862203 0.04332470 0.01775949 
##       INTC       AMKR        RDN       SMIT        FIX       TROW 
## 0.02000278 0.03195234 0.01922472 0.03453190 0.02010523 0.01576708 
##        AXE        LEG        BVN       FNLC       PZZA       WGNR 
## 0.02459463 0.01703910 0.02042631 0.01380234 0.02539003 0.14325674 
##  portfolio 
## 0.01917637

The result protfolio seems to be performing extremely well. However, this is merely because we picked the best ones from the history. The future performance is to be reviewed.