Load all libraries
library(RCurl)
library(XML)
library(stringr)
library(dplyr)
library(tidyr)
library(pracma)
library(ggplot2)
Section 1: Extract all stock symbols of NASDAQ and NYSE
Set up the base link to the websites containing the data of stock symbols.
Stocks in the website is grouped by the initial of the companies. For example, Apple is in the ‘A’ group and the complete URL to the ‘A’ group is “https://www.advfn.com/nasdaq/nasdaq.asp?companies=A”
NASDAQ_list_url <- 'https://www.advfn.com/nasdaq/nasdaq.asp?companies='
NYSE_list_url <- 'https://www.advfn.com/nyse/newyorkstockexchange.asp?companies='
Set up an element function to help extracting the URLs in a table
hrefFun <- function(x){
xpathSApply(x,'./a',xmlAttrs)
}
Set up a function to extract the company name, symbol, and the base URL to the detailed information (such as historical price) of every stock within the same group/page. The base URL is linked to information hosted by the same website.
get_stock_list <- function(base_link, base_group) {
htmlLink <- getURL(str_c(base_link, base_group))
htmlFile <- htmlParse(htmlLink)
temp <- readHTMLTable(htmlFile, stringsAsFactors = FALSE, which = 5)
colnames(temp) <- c("Company", "Symbol","Link")
temp <- temp[-1,]
temp <- temp[str_length(temp$Company) > 0,]
temp2 <- readHTMLTable(htmlFile, elFun = hrefFun, stringsAsFactors = FALSE, which = 5)
temp2 <- temp2$V1
temp2 <- temp2[temp2 != "list()"]
temp2 <- str_remove(temp2, "stock-price")
temp$Link <- temp2
return(temp)
}
Loop through all groups/pages and extract a complete list of all stocks in NASDAQ and NYSE
stock_list <- data.frame()
for (group_index in c(LETTERS, "0")) {
stock_list <- rbind(stock_list, get_stock_list(NASDAQ_list_url,group_index))
}
number_of_NASDAQ <- length(stock_list$Symbol)
for (group_index in c(LETTERS, "0")) {
stock_list <- rbind(stock_list, get_stock_list(NYSE_list_url,group_index))
}
total_of_stocks <- length(stock_list$Symbol)
Add a new variable indicating which exchange(NASDAQ or NYSE) each stock belongs to.
Finally, store the completed list in a csv file.
stock_list$exchange <- NA
stock_list$exchange[1:number_of_NASDAQ] <- "NASDAQ"
stock_list$exchange[(number_of_NASDAQ + 1):total_of_stocks] <- "NYSE"
write.csv(stock_list, "stock_list.csv", row.names = FALSE)
The finished complete list of stocks is uploaded to gitHub for easier access.
stock_list <- read.csv("https://raw.githubusercontent.com/ezaccountz/DATA_607_Final_Project/master/stock_list.csv", stringsAsFactors = FALSE)
print(str_c("The total number of companies is ", length(stock_list$Symbol)))
## [1] "The total number of companies is 4999"
head(stock_list)
## Company Symbol
## 1 A. Schulman SHLM
## 2 A.c. Moore Arts Crafts ACMR
## 3 A.d.a.m. Inc. ADAM
## 4 Aaipharma AAII
## 5 Aaon AAON
## 6 Abaxis ABAX
## Link
## 1 https://ih.advfn.com/stock-market/NASDAQ/a-schulman-inc-delisted-SHLM/
## 2 https://ih.advfn.com/stock-market/NASDAQ/acm-research-ACMR/
## 3 https://ih.advfn.com/stock-market/NASDAQ/ny-adam-ADAM/
## 4 https://ih.advfn.com/stock-market/NASDAQ/aaipharma-AAII/
## 5 https://ih.advfn.com/stock-market/NASDAQ/aaon-AAON/
## 6 https://ih.advfn.com/stock-market/NASDAQ/abaxis-inc-delisted-ABAX/
## exchange
## 1 NASDAQ
## 2 NASDAQ
## 3 NASDAQ
## 4 NASDAQ
## 5 NASDAQ
## 6 NASDAQ
In the same website that the stock symbols are extracted, historical stock prices are also available.
For example, the stock prices for ‘Amazon’ are listed in “https://ih.advfn.com/stock-market/NASDAQ/amazon-com-AMZN/historical/more-historical-data?current=0&Date1=01/01/18&Date2=10/31/19”.
By using the base URLs recorded in the stock list, we can constract the complete URL for each stock and extract the historical prices for all stock (if available).
The following code is used to create a blank stock price table and then fill in the daily stock price from 01/01/2018 to 10/31/2019 by looping through the information pages of every individual stocks.
The finished data is stored in a csv file.
stock_date <- seq.Date(as.Date("2018-1-1"), as.Date("2019-10-31"), "day")
stock_price <- data.frame(matrix(ncol = length(stock_list$Symbol), nrow = length(stock_date)))
colnames(stock_price) <- stock_list$Symbol
rownames(stock_price) <- stock_date
get_stock_price <- function(base_link, start_date, end_date) {
htmlLink <- getURL(str_c(base_link,"historical/more-historical-data?current=",0,"&Date1=",start_date,"&Date2=",end_date))
if (str_length(htmlLink) == 0) {
return(NA)
}
if (str_detect(htmlLink, "Page Not Found")) {
return(NA)
}
htmlFile <- htmlParse(htmlLink)
temp <- readHTMLTable(htmlFile, stringsAsFactors = FALSE, which = 2)
if (length(temp) == 1) return(NA)
num_of_pages <- getHTMLLinks(htmlFile, xpQuery = "//a[@class='date-control date-control-right']/span[@id='last']/parent::*/@href")
if (length(num_of_pages) > 0) {
num_of_pages <- str_extract(num_of_pages[1], "current=\\d+")
num_of_pages <- as.integer(str_remove(num_of_pages, "current="))
for(i in c(1:num_of_pages)) {
htmlLink <- getURL(str_c(base_link,"historical/more-historical-data?current=",i,"&Date1=",start_date,"&Date2=",end_date))
htmlFile <- htmlParse(htmlLink)
temp2 <- readHTMLTable(htmlFile, stringsAsFactors = FALSE, which = 2)
if (length(temp2) > 1) {
temp <- rbind(temp, temp2)
}
}
}
temp$Date <- as.Date(temp$Date, "%b%d%Y")
temp <- temp[temp$Volume > 0,]
if (length(temp$Date) == 0) return(NA)
return(temp)
}
for (i in c(1:length(stock_list$Symbol))) {
start_date <- "1/1/18"
end_date <- "10/31/19"
temp <- get_stock_price(stock_list$Link[i], start_date, end_date)
if (!is.na(temp)) {
for(j in c(1:length(temp$Date))) {
stock_price[as.character(temp$Date[j]),stock_list$Symbol[i]] <- temp$Close[j]
}
}
}
write.csv(stock_price, "stock_price.csv", row.names = TRUE)
The overall run time for the data collecting process is over 10 hours and the quality of the data is not acceptable. Please try not to run the code yourself. The result is uploaded to gitHub for you to review if you are interested.
stock_price <- read.csv("https://raw.githubusercontent.com/ezaccountz/DATA_607_Final_Project/master/stock_price.csv", stringsAsFactors = FALSE, row.names=1)
stock_price[1:10,1:5]
## SHLM ACMR ADAM AAII AAON
## 2018-01-01 NA NA NA NA NA
## 2018-01-02 37.35 5.620 NA NA 36.95
## 2018-01-03 38.10 5.300 NA NA 37.60
## 2018-01-04 38.75 5.315 NA NA 37.45
## 2018-01-05 39.10 5.510 NA NA 38.05
## 2018-01-06 NA NA NA NA NA
## 2018-01-07 NA NA NA NA NA
## 2018-01-08 38.90 5.240 NA NA 37.45
## 2018-01-09 37.90 5.240 NA NA 37.10
## 2018-01-10 38.90 5.190 NA NA 36.10
The data has the following problems:
1) The base link to the stock information is no longer valid.
2) Some data values are missing.
3) Some data values are duplicated.
4) A company may not be active for the whole period from 01/01/2018 to 10/31/2019.Some of them have junk values added.
A more reliable data source is Yahoo Finance. The ‘BatchGetSymbols’ package is a tool ready in hand to extract all information we need and you can also define a threshold to filter out all stocks with inadequate data.
library('BatchGetSymbols')
Gather all stock prices for stocks with no missing value (thresh.bad.data = 1).
Extract the daily close values and transform the data from long to wide.
Double check and remove any stock with missing value.
The run time is around 1 hour. The result is stored in a csv file and uploaded to gitHub for easier access.
stock_date <- seq.Date(as.Date("2018-1-1"), as.Date("2019-10-31"), "day")
stock_price <- data.frame(matrix(ncol = length(stock_list$Symbol), nrow = length(stock_date)))
colnames(stock_price) <- stock_list$Symbol
rownames(stock_price) <- stock_date
first_date <- as.Date("2018-1-1")
last_date <- as.Date("2019-11-1")
freq_data <- 'daily'
stock_price2 <- BatchGetSymbols(tickers = stock_list$Symbol,
first.date = first_date,
last.date = last_date,
freq.data = freq_data,
thresh.bad.data = 1,
cache.folder = file.path(tempdir(),
'BGS_Cache') )
stock_price3 <- data.frame(Date = stock_price2$df.tickers$ref.date,
Symbol = stock_price2$df.tickers$ticker,
Close = stock_price2$df.tickers$price.close)
stock_price3 <- spread(temp, "Symbol", "Close")
stock_price3 <- stock_price3[,apply(stock_price3, 2, function(x) !any(is.na(x)))]
write.csv(stock_price3, "stock_price_wide.csv", row.names = FALSE)
stock_price3 <- read.csv("https://raw.githubusercontent.com/ezaccountz/DATA_607_Final_Project/master/stock_price_wide.csv", stringsAsFactors = FALSE)
stock_price3$Date <- as.Date(stock_price3$Date)
print(str_c("The total number of companies is ", length(colnames(stock_price3))-1))
## [1] "The total number of companies is 2426"
stock_price3[1:10,1:5]
## Date A AA AAME AAN
## 1 2018-01-02 67.60 55.17 3.25 39.47
## 2 2018-01-03 69.32 54.50 3.35 39.67
## 3 2018-01-04 68.80 54.70 3.25 39.83
## 4 2018-01-05 69.90 54.09 3.65 39.94
## 5 2018-01-08 70.05 55.00 3.35 40.72
## 6 2018-01-09 71.77 54.20 3.40 39.80
## 7 2018-01-10 70.79 56.17 3.35 39.15
## 8 2018-01-11 70.80 56.91 3.35 41.12
## 9 2018-01-12 71.73 56.76 3.35 41.13
## 10 2018-01-16 71.23 56.24 3.30 41.40
Section 2: Calculate the similarity between every two stocks.
The similarity of two stocks are calculated by the Correlation Coefficient using the stock price from 01/01/2018 to 12/31/2018 as inputs.
similarity_table <- data.frame(matrix(ncol = length(colnames(stock_price3)) - 1, nrow = length(colnames(stock_price3)) - 1))
colnames(similarity_table) <- colnames(stock_price3)[-1]
rownames(similarity_table) <- colnames(stock_price3)[-1]
for(i in c(1:(length(similarity_table$A)-1))) {
for (j in c(1:i)) {
similarity_table[i+1,j] <- cor(stock_price3[1:251,i + 2],stock_price3[1:251,j + 1], method = "pearson")
similarity_table[j,i+1] <- similarity_table[i+1,j]
}
}
similarity_table[1:5,1:5]
## A AA AAME AAN AAON
## A NA 0.1152811 0.43985067 0.08619936 0.26496687
## AA 0.11528114 NA 0.50199188 -0.34331229 -0.19841405
## AAME 0.43985067 0.5019919 NA -0.17573586 0.09045917
## AAN 0.08619936 -0.3433123 -0.17573586 NA 0.58895335
## AAON 0.26496687 -0.1984140 0.09045917 0.58895335 NA
Section 3: k-means clustering.
The algorithm is based on the discription from our text book “Data Science for Business” Chapter 6, page 170-174.
First, create a new list for all available stocks. Also, calculate the annual return for later processing.
stock_list2 <- data.frame(Symbol = colnames(similarity_table), Center = NA, Annual_Return = NA, stringsAsFactors=FALSE)
temp <- data.frame(Symbol = colnames(stock_price3)[-1], start_price = t(stock_price3[1,-1]), stringsAsFactors=FALSE)
colnames(temp)[2] <- "start_price"
temp2 <- data.frame(Symbol = colnames(stock_price3)[-1], end_price = t(stock_price3[1 + 250,-1]), stringsAsFactors=FALSE)
colnames(temp2)[2] <- "end_price"
stock_list2 <- stock_list2 %>%
left_join(temp, by = "Symbol") %>%
left_join(temp2, by = "Symbol") %>%
mutate(Annual_Return = (end_price - start_price)/start_price) %>%
select(1:3)
head(stock_list2)
## Symbol Center Annual_Return
## 1 A NA -0.002070991
## 2 AA NA -0.518216405
## 3 AAME NA -0.258461538
## 4 AAN NA 0.065366049
## 5 AAON NA -0.051150202
## 6 AAP NA 0.484211640
Create a function to select the initial centers for clustering. The inputs are the similarity table, the number of centers and a selected stock as a starting point.
The function sorts all stocks based their distance/similarity to the selected starting stock. The function then pick up N-1 stocks from the sorted list sequentially to be the remaining initial centers
find_initial_centers <- function (sim_table, num_center, start_symbol) {
temp <- data.frame(symbol = colnames(sim_table), similarity = sim_table[, start_symbol], stringsAsFactors = FALSE)
temp <- arrange(temp,desc(similarity))
interval <- as.integer(length(temp$symbol) / num_center)
centers <- start_symbol
for(i in c(1:(num_center-1))) {
centers <- c(centers, temp$symbol[i * interval])
}
return(data.frame(centers = centers, stringsAsFactors = FALSE))
}
First, select 50 centers starting from the stock of ‘Amazon’
start_centers <- find_initial_centers(similarity_table, 50, "AMZN")
unlist(start_centers)
## centers1 centers2 centers3 centers4 centers5 centers6 centers7
## "AMZN" "TRNS" "UIS" "THRM" "CMG" "HSKA" "BPOP"
## centers8 centers9 centers10 centers11 centers12 centers13 centers14
## "WAYN" "HNI" "GBX" "GIII" "RCII" "FIZZ" "AFL"
## centers15 centers16 centers17 centers18 centers19 centers20 centers21
## "AUDC" "KTCC" "GPRO" "TDW" "AIV" "RECN" "WEC"
## centers22 centers23 centers24 centers25 centers26 centers27 centers28
## "IMMR" "MDU" "CZWI" "GPOR" "NOK" "WSFS" "GYRO"
## centers29 centers30 centers31 centers32 centers33 centers34 centers35
## "BXG" "CNQ" "PBHC" "BBW" "HHS" "BCO" "WMT"
## centers36 centers37 centers38 centers39 centers40 centers41 centers42
## "SSFN" "GIGM" "PLL" "WACLY" "AMX" "RYAAY" "ELSE"
## centers43 centers44 centers45 centers46 centers47 centers48 centers49
## "EBSB" "DLX" "F" "OMC" "IAE" "SQM" "LFC"
## centers50
## "QUIK"
Step 1: for each of the stocks, calculate the distance/similarity from the stock to each of the centers. Assign a center to the stock where the distance/similarity is the smallest
temp <- stock_list2 %>%
left_join(select(cbind(Symbol = rownames(similarity_table), similarity_table, stringsAsFactors=FALSE), c("Symbol", start_centers$centers)), by = "Symbol")
for (i in c(1: length(stock_list2$Symbol))) {
if(any(is.na(temp[i,-c(1:3)]))) {
stock_list2$Center[i] <- stock_list2$Symbol[i]
} else {
stock_list2$Center[i] <- colnames(temp)[which(temp[i,] == max(temp[i,-c(1:3)], na.rm = TRUE))]
}
}
head(stock_list2)
## Symbol Center Annual_Return
## 1 A WMT -0.002070991
## 2 AA DLX -0.518216405
## 3 AAME QUIK -0.258461538
## 4 AAN UIS 0.065366049
## 5 AAON HNI -0.051150202
## 6 AAP AUDC 0.484211640
Step 2: for each group of stocks (stocks with the same center), find the adjusted center within the group.
A center should be close to all members of the same group.
That is, when calculating the maximum distance from a selected stock to all other members within a group, the result for the center should be the smallest.
temp2 <- unique(stock_list2$Center)
for (i in c(1:length(temp2))) {
temp <- stock_list2[stock_list2$Center == temp2[i],]
temp <- similarity_table[temp$Symbol,temp$Symbol]
if (length(temp) > 1) {
temp$min <- apply(temp, 1, min, na.rm = TRUE)
stock_list2[stock_list2$Center == temp2[i],]$Center <- rownames(temp)[which(temp$min == max(temp$min))][1]
}
}
adjusted_centers <- data.frame(centers = unique(stock_list2$Center), stringsAsFactors = FALSE)
head(stock_list2)
## Symbol Center Annual_Return
## 1 A CMCSA -0.002070991
## 2 AA ARD -0.518216405
## 3 AAME TREE -0.258461538
## 4 AAN DISCA 0.065366049
## 5 AAON SHW -0.051150202
## 6 AAP HURN 0.484211640
Repeat step 1 and step 2 until there is no change in the centers
while (!identical(arrange(start_centers, centers), arrange(adjusted_centers, centers))) {
start_centers <- adjusted_centers
temp <- stock_list2 %>%
left_join(select(cbind(Symbol = rownames(similarity_table), similarity_table, stringsAsFactors=FALSE), c("Symbol", start_centers$centers)), by = "Symbol")
for (i in c(1: length(stock_list2$Symbol))) {
if(any(is.na(temp[i,-c(1:3)]))) {
stock_list2$Center[i] <- stock_list2$Symbol[i]
} else {
stock_list2$Center[i] <- colnames(temp)[which(temp[i,] == max(temp[i,-c(1:3)], na.rm = TRUE))]
}
}
temp2 <- unique(stock_list2$Center)
for (i in c(1:length(temp2))) {
temp <- stock_list2[stock_list2$Center == temp2[i],]
temp <- similarity_table[temp$Symbol,temp$Symbol]
if (length(temp) > 1) {
temp$min <- apply(temp, 1, min, na.rm = TRUE)
stock_list2[stock_list2$Center == temp2[i],]$Center <- rownames(temp)[which(temp$min == max(temp$min))][1]
}
}
adjusted_centers <- data.frame(centers = unique(stock_list2$Center), stringsAsFactors = FALSE)
}
The final centers are
unlist(adjusted_centers)
## centers1 centers2 centers3 centers4 centers5 centers6 centers7
## "CMCSA" "ARD" "ACAD" "DSWL" "AVX" "HURN" "GBX"
## centers8 centers9 centers10 centers11 centers12 centers13 centers14
## "NBL" "AMKR" "BDX" "CEVA" "ROG" "GFF" "SHO"
## centers15 centers16 centers17 centers18 centers19 centers20 centers21
## "RWT" "BPOP" "NOA" "VTR" "IDSA" "PRGX" "SIG"
## centers22 centers23 centers24 centers25 centers26 centers27 centers28
## "BCO" "UMC" "SAP" "AM" "EMR" "CZWI" "MOFG"
## centers29 centers30 centers31 centers32 centers33 centers34 centers35
## "VMC" "SAFT" "SHEN" "KTCC" "APD" "GTS" "NAII"
## centers36 centers37 centers38 centers39 centers40 centers41 centers42
## "TGEN" "GIGM" "CIB" "PRSC" "WAT" "WRI" "TDW"
## centers43 centers44 centers45 centers46 centers47 centers48 centers49
## "UFCS" "CHMP" "LAKE" "HMY" "GRVY" "CREE" "IPG"
## centers50
## "RECN"
Section 3: portfolio evaluation
Select from each group, the stock with the highest annual return
stock_result <- stock_list2 %>%
group_by(Center) %>%
arrange(desc(Annual_Return)) %>%
filter(row_number()==1)
head(stock_result)
## # A tibble: 6 x 3
## # Groups: Center [6]
## Symbol Center Annual_Return
## <chr> <chr> <dbl>
## 1 TNDM HURN 13.9
## 2 NIHD BPOP 9.76
## 3 ARWR SAFT 2.34
## 4 AMRN NOA 2.21
## 5 AMSC SHEN 1.93
## 6 BXC PRGX 1.53
I expect that my investment to not lose value. Therefore, stocks with negative annual return are filtered out.
Extract the daiy stock price from 01/01/2018 to 12/31/2018 for each selected stock.
Create a protfolio by assigning $1,000 equally to each of the selected stocks based on their values on 01/01/2018.
Caculate the value of the profolio from 01/01/2018 to 12/31/2018
stock_portfolio_symbol <- filter(stock_result, Annual_Return >= 0)$Symbol
stock_portfolio_price <- stock_price3[,c("Date",stock_portfolio_symbol)]
stock_portfolio_price <- stock_portfolio_price[stock_portfolio_price$Date >= as.Date("2018-1-1") & stock_portfolio_price$Date < as.Date("2019-1-1"),]
stock_portfolio_share <- data.frame(Symbol = stock_portfolio_symbol, share = NA, stringsAsFactors = FALSE)
stock_portfolio_share$share <- (1000/length(stock_portfolio_share$Symbol))/t(stock_portfolio_price[1,-1])
stock_portfolio_price$portfolio <- dot(as.matrix(t(stock_portfolio_price[,-1])), stock_portfolio_share$share)
The daily value of our protfolio from 01/01/2018 to 12/31/2018 (excluding the dates when the markets are closed)
stock_portfolio_price$portfolio
## [1] 1000.000 1018.218 1021.548 1029.769 1061.712 1061.150 1099.154
## [8] 1111.641 1107.487 1105.438 1112.714 1118.357 1169.038 1168.136
## [15] 1186.338 1196.041 1183.802 1185.607 1187.363 1179.202 1170.224
## [22] 1182.610 1157.234 1117.408 1133.795 1138.203 1109.444 1109.260
## [29] 1121.277 1122.176 1142.513 1146.638 1153.406 1159.475 1160.029
## [36] 1166.045 1197.945 1220.300 1210.770 1205.076 1192.828 1213.254
## [43] 1239.188 1241.983 1251.152 1256.328 1287.145 1331.869 1334.679
## [50] 1328.470 1334.732 1358.408 1339.217 1348.174 1363.241 1344.254
## [57] 1340.365 1369.116 1349.338 1332.441 1363.206 1341.229 1354.111
## [64] 1383.941 1390.566 1403.598 1402.393 1414.604 1418.884 1425.737
## [71] 1422.499 1430.170 1458.423 1477.241 1463.412 1464.322 1446.056
## [78] 1432.926 1434.134 1447.850 1443.358 1430.256 1440.178 1442.851
## [85] 1438.212 1452.516 1458.285 1497.012 1516.314 1531.947 1532.898
## [92] 1535.557 1548.777 1537.606 1534.055 1546.898 1557.595 1547.025
## [99] 1570.221 1576.787 1563.862 1565.002 1606.637 1596.148 1611.355
## [106] 1615.425 1666.437 1678.444 1665.213 1673.623 1706.426 1760.703
## [113] 1777.190 1822.409 1817.113 1793.116 1794.371 1844.329 1817.867
## [120] 1862.597 1844.125 1851.717 1830.907 1847.301 1846.288 1845.803
## [127] 1866.517 1901.922 1916.151 1914.061 1929.526 1925.555 1955.830
## [134] 1974.520 1960.867 1975.839 1955.800 1960.070 1947.260 1997.568
## [141] 2030.626 2027.323 2056.510 1999.255 1963.172 2049.201 2101.157
## [148] 2100.208 2102.925 2096.614 2048.981 1996.749 2035.617 2045.039
## [155] 2022.931 2037.313 2002.632 2007.672 2029.192 2045.835 2083.616
## [162] 2105.517 2116.508 2115.851 2134.787 2141.604 2162.256 2178.307
## [169] 2199.178 2204.826 2143.814 2168.507 2175.155 2183.101 2183.572
## [176] 2214.407 2252.870 2198.821 2082.013 2114.846 2054.421 2154.402
## [183] 2147.419 2228.154 2297.971 2287.745 2291.020 2297.781 2221.902
## [190] 2178.565 2208.745 2160.878 2169.825 2129.112 2116.111 2016.253
## [197] 2006.619 2040.077 2045.922 2133.594 2147.275 2088.148 2099.545
## [204] 2099.932 2099.979 2002.869 2039.588 2059.468 2016.051 2044.114
## [211] 2101.451 2208.619 2269.584 2267.126 2244.901 2321.045 2298.105
## [218] 2283.990 2191.234 2060.353 1982.943 2050.143 2045.375 1963.866
## [225] 1916.787 2007.218 1989.837 2026.479 1994.322 2073.898 2109.058
## [232] 2111.647 2130.059 2013.586 2038.972 1969.763 1990.327 2005.358
## [239] 2055.335 2019.834 2005.312 1962.642 1945.710 1933.180 1872.809
## [246] 1808.532 1790.016 1915.649 1910.626 1960.567 2010.335
Since at the point of creating the profolio, the stocks are known to be the best-performing one, our portfolio should be also performing well
stock_plot <- ggplot(stock_portfolio_price, aes(x=stock_portfolio_price$Date, y=stock_portfolio_price$portfolio)) +
geom_line(color = "black") +
labs(title="Portfolio value from 01/01/2018 to 12/31/2018",
x="Date",
y="Portfolio value")
plot(stock_plot)
Looking at the daily percent change of the stocks and the portfolio,
the standard deviation of the portfolio is indeed noticeably smaller than most of the stocks.
temp <- (stock_portfolio_price[,-1] - apply(stock_portfolio_price[,-1], 2, lag))/apply(stock_portfolio_price[,-1], 2, lag)
apply(temp, 2, sd, na.rm = TRUE)
## TNDM NIHD ARWR AMRN AMSC BXC
## 0.05689750 0.08192225 0.05651674 0.20353809 0.04838987 0.06393355
## SSTI SMTX EGY MLAB SIGA TZOO
## 0.04168241 0.03125154 0.05436820 0.02258700 0.03284977 0.04835725
## GOL NVAX RELL VRTB ISRG IMMR
## 0.04602950 0.05923009 0.01907300 0.23318716 0.02099944 0.03805117
## CPL NOK NWPX PODD APEI ANF
## 0.01866348 0.01819659 0.02347094 0.02334311 0.03357011 0.03664777
## BDCO BA NATI ABG WILC FORD
## 0.10961869 0.01985344 0.02252684 0.01900089 0.01880939 0.10248450
## AFL EC PBHC WIRE TFX HARL
## 0.01275387 0.02668673 0.01299297 0.01929331 0.01742575 0.01512471
## HDB LOW AMTY ORBT portfolio
## 0.01494738 0.01827962 0.08893489 0.01781543 0.01883155
Now we evaluate the performance of the portfolio, assuming that we invested $1,000 on 01/01/2019
stock_portfolio_price <- stock_price3[,c("Date",stock_portfolio_symbol)]
stock_portfolio_price <- stock_portfolio_price[stock_portfolio_price$Date >= as.Date("2019-1-1") & stock_portfolio_price$Date < as.Date("2019-11-1"),]
stock_portfolio_share <- data.frame(Symbol = stock_portfolio_symbol, share = NA, stringsAsFactors = FALSE)
stock_portfolio_share$share <- (1000/length(stock_portfolio_share$Symbol))/t(stock_portfolio_price[1,-1])
stock_portfolio_price$portfolio <- dot(as.matrix(t(stock_portfolio_price[,-1])), stock_portfolio_share$share)
The daily value of our protfolio from 01/01/2019 to 10/31/2019 (excluding the dates when the markets are closed)
stock_portfolio_price$portfolio
## [1] 1000.000 983.964 1016.269 1026.650 1041.273 1053.520 1056.366
## [8] 1061.279 1053.743 1069.234 1061.151 1066.734 1077.204 1059.707
## [15] 1057.416 1065.112 1084.563 1075.387 1061.367 1075.317 1091.293
## [22] 1089.582 1094.414 1094.986 1101.863 1090.007 1104.057 1112.152
## [29] 1135.824 1127.646 1134.238 1141.323 1142.515 1143.892 1127.407
## [36] 1146.657 1141.308 1141.661 1161.821 1135.450 1143.585 1126.274
## [43] 1133.004 1115.037 1110.083 1109.124 1126.183 1127.701 1125.058
## [50] 1118.987 1117.020 1115.477 1117.273 1112.267 1116.434 1077.641
## [57] 1078.656 1081.647 1071.293 1077.258 1087.113 1082.096 1093.284
## [64] 1100.075 1101.657 1104.357 1101.334 1091.655 1101.848 1095.313
## [71] 1095.601 1100.227 1105.649 1090.024 1090.220 1091.364 1100.521
## [78] 1092.335 1100.238 1105.921 1104.580 1096.533 1091.911 1093.818
## [85] 1120.780 1125.470 1111.134 1114.087 1116.584 1104.708 1074.416
## [92] 1072.180 1089.806 1096.437 1083.767 1068.729 1085.286 1072.021
## [99] 1071.567 1086.185 1071.145 1064.060 1063.417 1043.283 1036.894
## [106] 1059.635 1056.473 1046.359 1061.315 1056.144 1050.370 1050.899
## [113] 1059.490 1045.861 1057.878 1072.477 1076.578 1094.587 1088.587
## [120] 1084.737 1076.584 1062.394 1076.660 1080.322 1084.505 1083.373
## [127] 1088.145 1100.849 1105.719 1121.865 1129.868 1129.484 1129.378
## [134] 1125.677 1121.386 1122.376 1109.239 1098.748 1097.984 1112.646
## [141] 1118.605 1092.515 1114.448 1114.179 1129.183 1113.797 1109.493
## [148] 1091.973 1064.809 1087.200 1084.705 1104.233 1097.426 1082.929
## [155] 1089.772 1067.592 1038.756 1066.922 1064.473 1061.908 1074.992
## [162] 1067.092 1046.846 1060.299 1052.737 1058.400 1071.524 1072.104
## [169] 1062.064 1067.454 1076.042 1074.519 1066.123 1076.516 1093.708
## [176] 1097.755 1088.152 1085.954 1091.987 1087.064 1087.447 1087.935
## [183] 1088.846 1077.079 1078.888 1063.937 1054.710 1064.584 1048.097
## [190] 1035.667 1069.717 1065.941 1064.858 1051.101 1056.677 1048.685
## [197] 1063.182 1061.644 1070.807 1080.215 1086.053 1093.987 1086.512
## [204] 1093.970 1094.071 1087.691 1082.762 1088.966 1091.481 1090.240
## [211] 1088.108
The return for 10 months is 8.8%.
stock_plot <- ggplot(stock_portfolio_price, aes(x=stock_portfolio_price$Date, y=stock_portfolio_price$portfolio)) +
geom_line(color = "black") +
labs(title="Portfolio value from 01/01/2019 to 10/31/2019",
x="Date",
y="Portfolio value")
plot(stock_plot)
From the graph, we see that the portfolio is doing well at the begining but then the value is going up and down. The volatility of the stock_portfolio is still considerable.
Now let’s look at the standard deviation of the daily percent change
temp <- (stock_portfolio_price[,-1] - apply(stock_portfolio_price[,-1], 2, lag))/apply(stock_portfolio_price[,-1], 2, lag)
apply(temp, 2, sd, na.rm = TRUE)
## TNDM NIHD ARWR AMRN AMSC BXC
## 0.039055544 0.036158432 0.035656576 0.039870961 0.037535471 0.033530935
## SSTI SMTX EGY MLAB SIGA TZOO
## 0.035018065 0.039803008 0.035374481 0.027911195 0.023529846 0.042821132
## GOL NVAX RELL VRTB ISRG IMMR
## 0.036007209 0.074193059 0.018988820 0.073570269 0.018682825 0.022181683
## CPL NOK NWPX PODD APEI ANF
## 0.018300024 0.025021383 0.020950182 0.026597692 0.025065997 0.037672616
## BDCO BA NATI ABG WILC FORD
## 0.055147662 0.018679273 0.015825715 0.017755731 0.021051516 0.033410598
## AFL EC PBHC WIRE TFX HARL
## 0.009549102 0.019857401 0.019423588 0.016665734 0.015800406 0.015232224
## HDB LOW AMTY ORBT portfolio
## 0.015301152 0.017508008 0.156543499 0.016364838 0.010996339
The standard deviation for the portfolio is lower in the second year. However, the performance (return) is also lower.
Conclusion: stocks that are performing well in the first year may not be performing well in the second year. The model needs to be improved to select more reliable stocks. One option is to extend the input period to a few more years. Another option is to perform deeper analysis when selecting a stock(s) from each group. This will require additional complicated work, so for now it will not be included in my project.
Section 4: Create final portfolio
At the end, let’s see what the model gives up by using the daily price from 01/01/2018 to 10/31/2019 as input.
Calculate the similiarity table.
similarity_table2 <- data.frame(matrix(ncol = length(colnames(stock_price3)) - 1, nrow = length(colnames(stock_price3)) - 1))
colnames(similarity_table2) <- colnames(stock_price3)[-1]
rownames(similarity_table2) <- colnames(stock_price3)[-1]
for(i in c(1:(length(similarity_table2$A)-1))) {
for (j in c(1:i)) {
similarity_table2[i+1,j] <- cor(stock_price3[,i + 2],stock_price3[,j + 1], method = "pearson")
similarity_table2[j,i+1] <- similarity_table2[i+1,j]
}
}
similarity_table2[1:5,1:5]
## A AA AAME AAN AAON
## A NA -0.4864174 -0.1423822 0.4824920 0.5652028
## AA -0.4864174 NA 0.6702864 -0.7855711 -0.7583300
## AAME -0.1423822 0.6702864 NA -0.5299671 -0.4701893
## AAN 0.4824920 -0.7855711 -0.5299671 NA 0.8526159
## AAON 0.5652028 -0.7583300 -0.4701893 0.8526159 NA
Create a new table for the clustering calculation
stock_list3 <- data.frame(Symbol = colnames(similarity_table2), Center = NA, Total_Return = NA, stringsAsFactors=FALSE)
temp <- data.frame(Symbol = colnames(stock_price3)[-1], start_price = t(stock_price3[,-1]), stringsAsFactors=FALSE)
colnames(temp)[2] <- "start_price"
temp2 <- data.frame(Symbol = colnames(stock_price3)[-1], end_price = t(stock_price3[length(stock_price3$Date),-1]), stringsAsFactors=FALSE)
colnames(temp2)[2] <- "end_price"
stock_list3 <- stock_list3 %>%
left_join(temp, by = "Symbol") %>%
left_join(temp2, by = "Symbol") %>%
mutate(Total_Return = (end_price - start_price)/start_price) %>%
select(1:3)
k-means clustering
start_centers <- find_initial_centers(similarity_table2, 50, "AMZN")
temp <- stock_list3 %>%
left_join(select(cbind(Symbol = rownames(similarity_table2), similarity_table2, stringsAsFactors=FALSE), c("Symbol", start_centers$centers)), by = "Symbol")
for (i in c(1: length(stock_list3$Symbol))) {
if(any(is.na(temp[i,-c(1:3)]))) {
stock_list3$Center[i] <- stock_list3$Symbol[i]
} else {
stock_list3$Center[i] <- colnames(temp)[which(temp[i,] == max(temp[i,-c(1:3)], na.rm = TRUE))]
}
}
temp2 <- unique(stock_list3$Center)
for (i in c(1:length(temp2))) {
temp <- stock_list3[stock_list3$Center == temp2[i],]
temp <- similarity_table2[temp$Symbol,temp$Symbol]
if (length(temp) > 1) {
temp$min <- apply(temp, 1, min, na.rm = TRUE)
stock_list3[stock_list3$Center == temp2[i],]$Center <- rownames(temp)[which(temp$min == max(temp$min))][1]
}
}
adjusted_centers <- data.frame(centers = unique(stock_list3$Center), stringsAsFactors = FALSE)
while (!identical(arrange(start_centers, centers), arrange(adjusted_centers, centers))) {
start_centers <- adjusted_centers
temp <- stock_list3 %>%
left_join(select(cbind(Symbol = rownames(similarity_table2), similarity_table2, stringsAsFactors=FALSE), c("Symbol", start_centers$centers)), by = "Symbol")
for (i in c(1: length(stock_list3$Symbol))) {
if(any(is.na(temp[i,-c(1:3)]))) {
stock_list3$Center[i] <- stock_list3$Symbol[i]
} else {
stock_list3$Center[i] <- colnames(temp)[which(temp[i,] == max(temp[i,-c(1:3)], na.rm = TRUE))]
}
}
temp2 <- unique(stock_list3$Center)
for (i in c(1:length(temp2))) {
temp <- stock_list3[stock_list3$Center == temp2[i],]
temp <- similarity_table2[temp$Symbol,temp$Symbol]
if (length(temp) > 1) {
temp$min <- apply(temp, 1, min, na.rm = TRUE)
stock_list3[stock_list3$Center == temp2[i],]$Center <- rownames(temp)[which(temp$min == max(temp$min))][1]
}
}
adjusted_centers <- data.frame(centers = unique(stock_list3$Center), stringsAsFactors = FALSE)
}
Generate the protfolio
stock_result <- stock_list3 %>%
group_by(Center) %>%
arrange(desc(Total_Return)) %>%
filter(row_number()==1)
stock_portfolio_symbol <- filter(stock_result, Total_Return >= 0)$Symbol
stock_portfolio_price <- stock_price3[,c("Date",stock_portfolio_symbol)]
stock_portfolio_price <- stock_portfolio_price[stock_portfolio_price$Date >= as.Date("2018-1-1") & stock_portfolio_price$Date < as.Date("2019-11-1"),]
stock_portfolio_share <- data.frame(Symbol = stock_portfolio_symbol, share = NA, stringsAsFactors = FALSE)
stock_portfolio_share$share <- (1000/length(stock_portfolio_share$Symbol))/t(stock_portfolio_price[1,-1])
stock_portfolio_price$portfolio <- dot(as.matrix(t(stock_portfolio_price[,-1])), stock_portfolio_share$share)
stock_plot <- ggplot(stock_portfolio_price, aes(x=stock_portfolio_price$Date, y=stock_portfolio_price$portfolio)) +
geom_line(color = "black") +
labs(title="Portfolio value from 01/01/2018 to 10/31/2019",
x="Date",
y="Portfolio value")
plot(stock_plot)
standard deviation of daily percent change
temp <- (stock_portfolio_price[,-1] - apply(stock_portfolio_price[,-1], 2, lag))/apply(stock_portfolio_price[,-1], 2, lag)
apply(temp, 2, sd, na.rm = TRUE)
## TNDM ARWR NIHD ATEA BXC PNRG
## 0.04992622 0.04808492 0.06552064 0.06739530 0.05223830 0.03312517
## MERR EGY SKY ACMR AU STAA
## 0.48394137 0.04673050 0.04132960 0.04656917 0.02601952 0.03663223
## DRRX AVP SPEC GLUU MED BEBE
## 0.05287614 0.03445594 0.17996309 0.03671713 0.03469619 0.03863174
## CRNT EEI TAL LQDT ARL SSI
## 0.03585686 0.03160234 0.03026246 0.03234323 0.03731502 0.04458648
## DKS JOBS TSM FOSL MNOV PLXS
## 0.02479547 0.03013347 0.01667517 0.05862203 0.04332470 0.01775949
## INTC AMKR RDN SMIT FIX TROW
## 0.02000278 0.03195234 0.01922472 0.03453190 0.02010523 0.01576708
## AXE LEG BVN FNLC PZZA WGNR
## 0.02459463 0.01703910 0.02042631 0.01380234 0.02539003 0.14325674
## portfolio
## 0.01917637
The result protfolio seems to be performing extremely well. However, this is merely because we picked the best ones from the history. The future performance is to be reviewed.