download MonthlyCloses.csv.This file contains 240 rows or two decades worth of MonthlyClosing prices for some top performing stocks in the Indian stock markets taken from CMIE (Center for Monitoring Indian Economy).When reading in data, it is best to consider converting it into CSV file(s) as it is one of the most flexible data formats, which can be read by most statistical platforms.
demo(graphics)in RStudio gives us a glimpse into the wide variety of plots that R can create.
# Date variables can be hard, so we convert them
# into something more useful
# Install a package called lubridate via the
# Packages tab - should take very little time
library(lubridate)
# Next, import the data from a file.
# If you encounter an error message,
# then you must set the Working Directory.
# You could use the Files tab to navigate
# to the file MonthlyCloses.csv
# and select More > Set working directory
# Read the closing prices into a variable
MonthlyCloses <- read.csv("MonthlyCloses.csv",
header=TRUE)
str(MonthlyCloses)
## 'data.frame': 240 obs. of 14 variables:
## $ Month : Factor w/ 240 levels "Apr-00","Apr-01",..: 96 76 156 16 176 136 116 36 236 216 ...
## $ Pharma1 : num 17.6 16 16.7 13.7 13.1 ...
## $ Pharma2 : num 91.2 87.5 85 78.8 76.2 ...
## $ Aluminum : num 58.1 54.4 50.7 44.4 47.8 ...
## $ FMCG : num 60.5 57.5 56 55.5 58.4 58.6 59.8 59 63 63.6 ...
## $ Housing : num 22.8 21 20 20 21.6 ...
## $ Software1 : num 3.36 3.52 3.71 3.52 3.91 3.7 3.48 3.73 3.62 3.48 ...
## $ Infrastructure: num 76.2 76.7 86.7 83.3 89.3 ...
## $ Automobile : num 52.5 48.8 42.8 45 48 ...
## $ Refineries : num 68.8 66.9 65.3 62.2 67.9 ...
## $ Bank : num 17.3 16.6 16.6 16 17.9 ...
## $ Power : num 13.6 12.4 11.9 11 12.1 ...
## $ Steel : num 138 133 126 117 133 ...
## $ Software2 : num 2.58 2.5 2.49 2.47 2.7 2.67 3.12 2.9 3.07 2.88 ...
head(MonthlyCloses)
## Month Pharma1 Pharma2 Aluminum FMCG Housing Software1 Infrastructure
## 1 Jan-95 17.65 91.25 58.08 60.5 22.75 3.36 76.25
## 2 Feb-95 16.00 87.50 54.38 57.5 21.00 3.52 76.67
## 3 Mar-95 16.67 85.00 50.67 56.0 20.00 3.71 86.67
## 4 Apr-95 13.73 78.75 44.41 55.5 20.00 3.52 83.33
## 5 May-95 13.07 76.25 47.83 58.4 21.60 3.91 89.33
## 6 Jun-95 11.07 65.00 43.84 58.6 23.85 3.70 87.50
## Automobile Refineries Bank Power Steel Software2
## 1 52.50 68.75 17.34 13.65 138.46 2.58
## 2 48.75 66.88 16.63 12.45 133.31 2.50
## 3 42.75 65.31 16.61 11.92 125.94 2.49
## 4 45.00 62.19 16.04 11.00 117.10 2.47
## 5 48.00 67.88 17.88 12.06 132.87 2.70
## 6 48.75 66.00 17.74 11.39 132.28 2.67
# Dimensions of the dataset
numDays <- nrow(MonthlyCloses)
numStocks <- ncol(MonthlyCloses) - 1
# Because the data is in mixed format - the
# first column is a Date and the rest are
# Prices, we make sure that R knows about
# their intended formats
# Explicitly declare the formats of the columns
# Month field is alpha
MonthlyCloses[ , 1] <- as.character(MonthlyCloses[ , 1])
# Prices are numeric
for(i in 2:numStocks + 1) {
MonthlyCloses[ , i] <- as.numeric(MonthlyCloses[ , i])
}
# Prepare the Month field, so it can be converted
# to a Date field. You don't have to understand
# everything here - all that the command is
# achieving is that the Date column is set to
# a format recognisable by R
MonthlyCloses$Month <- sapply(MonthlyCloses$Month,
function(month)
paste("01-",month, sep=""))
MonthlyCloses$Month <- as.Date(MonthlyCloses$Month,
"%d-%b-%y")
head(MonthlyCloses)
## Month Pharma1 Pharma2 Aluminum FMCG Housing Software1
## 1 1995-01-01 17.65 91.25 58.08 60.5 22.75 3.36
## 2 1995-02-01 16.00 87.50 54.38 57.5 21.00 3.52
## 3 1995-03-01 16.67 85.00 50.67 56.0 20.00 3.71
## 4 1995-04-01 13.73 78.75 44.41 55.5 20.00 3.52
## 5 1995-05-01 13.07 76.25 47.83 58.4 21.60 3.91
## 6 1995-06-01 11.07 65.00 43.84 58.6 23.85 3.70
## Infrastructure Automobile Refineries Bank Power Steel Software2
## 1 76.25 52.50 68.75 17.34 13.65 138.46 2.58
## 2 76.67 48.75 66.88 16.63 12.45 133.31 2.50
## 3 86.67 42.75 65.31 16.61 11.92 125.94 2.49
## 4 83.33 45.00 62.19 16.04 11.00 117.10 2.47
## 5 89.33 48.00 67.88 17.88 12.06 132.87 2.70
## 6 87.50 48.75 66.00 17.74 11.39 132.28 2.67
# Let's attach a TradingMonth column to our dataset
MonthlyCloses$TradingMonth <- months(MonthlyCloses$Month)
# We're ready to explore some plotting routines
with(MonthlyCloses,
plot(Software1, Software2,
main="Monthly Closes for two stocks in the Software Sector",
xlab="Software1", ylab="Software2",
pch=20, cex=0.7, col="darkgreen")
)
# Obtain month-wise averages by grouping closed
# across the years
monthlyAverage <- aggregate(FMCG ~ TradingMonth,
MonthlyCloses,
mean)
# Place two plots side by side
par(mfrow=c(1,2))
# This gives us a raw barplot
barplot(monthlyAverage$FMCG,
names.arg=monthlyAverage$TradingMonth,
main="Month-wise Average for FMCG",
col=colors()[30:41])
# Here's a pie chart for the same
pie(monthlyAverage$FMCG,
labels=monthlyAverage$TradingMonth,
main="Month-wise Average for FMCG",
col=colors()[30:41])
# A task you could undertake is to sort these
# columns by the Month name
#lets check the class of monthlyAverage$TradingMonth
class(monthlyAverage$TradingMonth)
## [1] "character"
#for sorting we need to convert them to factor so that we get ordered factor levels
monthlyAverage$TradingMonth<-as.factor(monthlyAverage$TradingMonth)
#check again the class of monthlyAverage$TradingMonth
class(monthlyAverage$TradingMonth)
## [1] "factor"
levels(monthlyAverage$TradingMonth)
## [1] "April" "August" "December" "February" "January"
## [6] "July" "June" "March" "May" "November"
## [11] "October" "September"
monthlyAverage$TradingMonth<-relevel(monthlyAverage$TradingMonth,"January")
#This makes "January" as the first level
#Instead of doing one month each time, we can reorder at the time of creating the factor
monthlyAverage$TradingMonth<-factor(monthlyAverage$TradingMonth,
levels=c("January","February","March","April",
"May","June","July","August","September",
"October","November","December"),ordered=TRUE)
#check again the class of monthlyAverage$TradingMonth
levels(monthlyAverage$TradingMonth)
## [1] "January" "February" "March" "April" "May"
## [6] "June" "July" "August" "September" "October"
## [11] "November" "December"
#now lets replot with correctly ordered months
# Place two plots side by side
par(mfrow=c(1,2))
# This gives us a raw barplot
barplot(monthlyAverage$FMCG,
names.arg=sort(monthlyAverage$TradingMonth),
main="Month-wise Average for FMCG",
col=colors()[30:41])
# Here's a pie chart for the same
pie(monthlyAverage$FMCG,
labels=sort(monthlyAverage$TradingMonth,decreasing = TRUE),
main="Month-wise Average for FMCG",
col=colors()[30:41])
# # Plot the time series for each stock
for(symbol in 2:numStocks + 1) {
# The ts function of R helps us to
# construct a time series
plot(ts(MonthlyCloses[ , symbol],
start=c(1995, 1), end=c(2014, 12),
frequency=12),
main=paste("Monthly Movements for:",
colnames(MonthlyCloses)[symbol]),
xlab="Year", ylab="Stock Price in INR",
col="navy")
}
Lets visualise two related numerical variables(softeare 1 & software 2) using scatterplots and line charts placed side by side:
par(mfrow=c(1,2))
with(MonthlyCloses,
plot(Software1, Software2,
main="Monthly Closes for two stocks \n in the Software Sector",
xlab="Software1", ylab="Software2",
pch=20, cex=0.7, col="darkgreen")
)
#Now get aline chart for both
#Determine the range for Y axis
maxVal<-max(max(MonthlyCloses$Software1),max(MonthlyCloses$Software2))
minVal<-min(min(MonthlyCloses$Software1),min(MonthlyCloses$Software2))
plot(MonthlyCloses$Software1~MonthlyCloses$Month,
type="l",
lwd=2,
col="navy",
xlab="Time", ylab="Monthly Closing price",
main="Two software stocks",
ylim=c(0,maxVal))
par(new=TRUE)
plot(MonthlyCloses$Software2~MonthlyCloses$Month,
type="l",
lwd=2,
ann=FALSE,
col="brown",
ylim=c(0,maxVal))
Stocks<-read.csv("FourStocks.csv",header = TRUE)
#View(Stocks) # to see the dataset
str(Stocks)
## 'data.frame': 4945 obs. of 5 variables:
## $ Date : Factor w/ 4945 levels "1-Apr-02","1-Apr-03",..: 1847 3605 4021 4186 4353 4844 217 382 547 714 ...
## $ Pharma: num 98.8 101.2 101.2 98.8 95 ...
## $ FMCG : num 59 58.5 58 57 56.5 57 55.5 56 56.5 57 ...
## $ Power : num 13.6 13.3 13.3 13 12.8 ...
## $ Steel : num 151 150 149 145 141 ...
head(Stocks)
## Date Pharma FMCG Power Steel
## 1 2-Jan-95 98.75 59.0 13.61 150.98
## 2 3-Jan-95 101.25 58.5 13.27 150.25
## 3 4-Jan-95 101.25 58.0 13.27 148.77
## 4 5-Jan-95 98.75 57.0 13.03 145.09
## 5 6-Jan-95 95.00 56.5 12.79 141.41
## 6 9-Jan-95 97.50 57.0 12.55 138.46
colMeans(Stocks[ , 2:5]) #getting mean column wise
## Pharma FMCG Power Steel
## 774.45014 248.38014 51.05487 287.19741
#or
sapply(Stocks[ , 2:5],mean)
## Pharma FMCG Power Steel
## 774.45014 248.38014 51.05487 287.19741
#Q:Which stock has the lowest mean value across the two decades?
which.min(colMeans(Stocks[ , 2:5]))
## Power
## 3
#Ans=power
#We can calculate the standard deviations of the 4 columns representing the stocks using the apply family functions
apply(Stocks[ , 2:5], MARGIN=2, FUN=sd)
## Pharma FMCG Power Steel
## 749.60569 150.69119 43.64287 212.03147
#or
sapply(Stocks[ , 2:5],sd)
## Pharma FMCG Power Steel
## 749.60569 150.69119 43.64287 212.03147
#Q:Which stock has the highest volatility, as measured by the standard deviation?
which.max(sapply(Stocks[ , 2:5],sd))
## Pharma
## 1
###################################
SKEWNESS
library(moments)
skewness(Stocks[ , 2:5]) #skewness of all 4 stocks
## Pharma FMCG Power Steel
## 1.4052686 1.5324310 0.5690035 0.6753831
#Q:The prices of which stock have the maximum skew?
which.max(skewness(Stocks[ , 2:5]))
## FMCG
## 2
This question tests your visual interpretation of skews.
Q:Assume that the histograms correspond to Steel and FMCG. What is the diagram on the left? Ans:Histogram of Steel
Manipulating Dates in R using lubridate package
# Install the lubridate package from the
# Packages pane if you have not already done so
library("lubridate")
# Format the Date field based on what is
# contained in the Date column
Stocks$Date <- as.Date(Stocks$Date, "%d-%b-%y")
# Capture the month and year data in different columns.
# The "accessor" functions such as 'month()' & 'year()'are courtesy the lubridate library!
Stocks$Month <- month(Stocks$Date)
Stocks$Year <- year(Stocks$Date)
# View the newly formatted data
#View(Stocks)
How does this change things? Source your script and have a look at the partial screen captures below:
Before Formatting
After Formatting
write.csv(Stocks, "NewStocks.csv")
cat("Created NewStocks.csv in",
getwd(), "\n",
"You may open it as a spreadsheet!\n")
## Created NewStocks.csv in J:/Elearning/courseera videos/edX_Statistics for Business - II_IIMBx QM101.2x/rstudio files
## You may open it as a spreadsheet!
Here’s how we define our random variable. The sample space is the set of 4,945 observations in the dataset, which stand for the prices of stock over two decade. Our RV maps each outcome (stock price) in the sample space to the average stock price for the corresponding month. Therefore our RV takes only 240 values for the 20 years under observation, rather than the potentially 4,945 individual values.
In the new xlsx spreadsheet that you just created, build a Pivot table to calculate the average Pharma stock prices by month and year. Your table should look like this - in truncated form.
The same table, albeit in a different form, can be constructed using R. Here’s code that you will add to your script.
# Here's an easy way to construct a table
# of average values by month
monthYearAvePharma <- aggregate(Pharma ~ Month + Year,
data=Stocks,
FUN=mean)
#The advantage with R is that the FUN argument can be whatever you want it to be.
head(monthYearAvePharma)
## Month Year Pharma
## 1 1 1995 95.35714
## 2 2 1995 87.89474
## 3 3 1995 86.64526
## 4 4 1995 84.02375
## 5 5 1995 76.99429
## 6 6 1995 72.42045
APPROXIMATION
In our next step, we construct the average tables by year and month for the remaining stocks, namely FMCG, Power and Steel. This can be done via an easy cut-and-paste in your script - make sure to change the variable names accordingly!
Using the View command, you can examine the contents of these aggregate variables. Now proceed to answer these questions.
monthYearAveFMCG <- aggregate(FMCG ~ Month + Year,
data=Stocks,
FUN=mean)
monthYearAvePower <- aggregate(Power ~ Month + Year,
data=Stocks,
FUN=mean)
monthYearAveSteel <- aggregate(Steel ~ Month + Year,
data=Stocks,
FUN=mean)
head(monthYearAveFMCG)
## Month Year FMCG
## 1 1 1995 58.21429
## 2 2 1995 58.02632
## 3 3 1995 56.94737
## 4 4 1995 56.68750
## 5 5 1995 57.15714
## 6 6 1995 59.05455
head(monthYearAvePower)
## Month Year Power
## 1 1 1995 13.05524
## 2 2 1995 12.81895
## 3 3 1995 12.37947
## 4 4 1995 11.81000
## 5 5 1995 11.13048
## 6 6 1995 12.44409
head(monthYearAveSteel)
## Month Year Steel
## 1 1 1995 139.4443
## 2 2 1995 131.9126
## 3 3 1995 134.0053
## 4 4 1995 131.3275
## 5 5 1995 120.0924
## 6 6 1995 136.4273
#monthYearAve<-aggregate(Stocks[,2:5]~ Month + Year, data=Stocks,FUN=mean)
Q:What was the average price of Steel stock for December 1995?
monthYearAveSteel$Steel[monthYearAveSteel$Month==12 & monthYearAveSteel$Year==1995]
## [1] 118.1025
Ans:118.1025
Q:What was the minimum average price of Steel stock across the months?
min(monthYearAveSteel$Steel)
## [1] 44.24286
Ans:44.24286
Q:What was the actual average price of Steel stock over the 4,945 trading days?
mean(Stocks$Steel)
## [1] 287.1974
Ans:287.1974
Q:Assume that there are equal number of trading days in a month. What is the expectation of the random variable that maps a day’s closing price of Steel stock to the average price for the month?
mean(monthYearAveSteel$Steel)
## [1] 285.7848
Ans:285.7848
One important way to regard data in financial markets is as time series. R has tremendous support for time series data. Here is code to declare the actual vs. aggregate time series in your R script and plot it.
# Let's plot and compare the actual vs aggregated prices
# the Pharma stock
par(mfrow=c(1,2))
ts.Pharma <- ts(Stocks$Pharma)
plot(ts.Pharma,
main="Actual Prices of Pharma Stock",
col="navy",
ylab="Pharma (actual)")
ts.PharmaAvg <- ts(monthYearAvePharma$Pharma,
start=c(1995, 1), end=c(2014, 12),
frequency=12)
plot(ts.PharmaAvg,
main="Aggregate Prices of Pharma Stock",
col="darkred",
ylab="Pharma (aggregate)")
The averaging by the random variable has had a “smoothing” effect on the plot.
create time series for the remaining stocks:
# Let's plot and compare the actual vs aggregated prices
par(mfrow=c(1,2))
ts.FMCG <- ts(Stocks$FMCG)
plot(ts.FMCG,
main="Actual Prices of FMCG Stock",
col="navy",
ylab="FMCG (actual)")
ts.FMCGAvg <- ts(monthYearAveFMCG$FMCG,
start=c(1995, 1), end=c(2014, 12),
frequency=12)
plot(ts.FMCGAvg,
main="Aggregate Prices of FMCG Stock",
col="darkred",
ylab="FMCG (aggregate)")
# Let's plot and compare the actual vs aggregated prices
par(mfrow=c(1,2))
ts.Power<- ts(Stocks$Power)
plot(ts.Power,
main="Actual Prices of Power Stock",
col="navy",
ylab="Power(actual)")
ts.PowerAvg <- ts(monthYearAvePower$Power,
start=c(1995, 1), end=c(2014, 12),
frequency=12)
plot(ts.PowerAvg,
main="Aggregate Prices of Power Stock",
col="darkred",
ylab="Power (aggregate)")
# Let's plot and compare the actual vs aggregated prices
par(mfrow=c(1,2))
ts.Steel<- ts(Stocks$Steel)
plot(ts.Steel,
main="Actual Prices of Steel Stock",
col="navy",
ylab="Steel(actual)")
ts.SteelAvg <- ts(monthYearAveSteel$Steel,
start=c(1995, 1), end=c(2014, 12),
frequency=12)
plot(ts.SteelAvg,
main="Aggregate Prices of Steel Stock",
col="darkred",
ylab="Steel(aggregate)")
create a script based on all of our earlier instructions and save it as FourStocks.R
# This is a library that comes shipped with RStudio,
# and facilitates interactive plotting
#
# NOTE: Please don't attempt to install this!
library(manipulate)
# Source what you have created earlier
# Make sure all of your variables are declared
source("FourStocks.R")
## 'data.frame': 4945 obs. of 5 variables:
## $ Date : Factor w/ 4945 levels "1-Apr-02","1-Apr-03",..: 1847 3605 4021 4186 4353 4844 217 382 547 714 ...
## $ Pharma: num 98.8 101.2 101.2 98.8 95 ...
## $ FMCG : num 59 58.5 58 57 56.5 57 55.5 56 56.5 57 ...
## $ Power : num 13.6 13.3 13.3 13 12.8 ...
## $ Steel : num 151 150 149 145 141 ...
# If you encounter any errors, this is due to
# some of the steps not being executed correctly!
# Reset the plotting arrangement
par(mfrow=c(1,1))
# Function to plot the time series of all sectors
# from the indicated start year to the end year
windowAll <- function(startYear, endYear) {
ts.plot(window(ts.PharmaAvg,
start=c(startYear, 1),
end=c(endYear, 12)),
window(ts.FMCGAvg,
start=c(startYear, 1),
end=c(endYear, 12)),
window(ts.PowerAvg,
start=c(startYear, 1),
end=c(endYear, 12)),
window(ts.SteelAvg,
start=c(startYear, 1),
end=c(endYear, 12)),
main=paste("Stocks from", startYear,
"through", endYear),
gpars=list(xlab="Year",
ylab="Stock Price",
col=c("darkred", "navy",
"darkgreen", "darkorange"),
lwd=2))
legend("topleft",
bty="n",
c("Pharma", "FMCG", "Power", "Steel"),
lty=c(1, 1, 1, 1),
lwd=c(2, 2, 2, 2),
col=c("darkred", "navy",
"darkgreen", "darkorange"))
}
windowAll(2010,2014) #if we want for a period between 2010 to 2014
windowAll(1998,2010)
# Plot a time series of adjusted close prices
# between lowYr and upYr. Interact with it.
# What do you notice?
#Uncomment the below commands to run the plots interactively using manipulate
# lowYr <- 1995; upYr <- 2014
# manipulate(
# windowAll(lowYr, upYr),
# lowYr = slider(1995, 2014,
# step=1,
# initial=1995),
# upYr = slider(lowYr, 2014,
# step=1,
# initial=2014))
devtools::session_info()
## Session info --------------------------------------------------------------
## setting value
## version R version 3.2.2 (2015-08-14)
## system x86_64, mingw32
## ui RTerm
## language (EN)
## collate English_India.1252
## tz Asia/Calcutta
## date 2015-10-30
## Packages ------------------------------------------------------------------
## package * version date source
## devtools 1.9.1 2015-09-11 CRAN (R 3.2.2)
## digest 0.6.8 2014-12-31 CRAN (R 3.2.0)
## evaluate 0.8 2015-09-18 CRAN (R 3.2.2)
## formatR 1.2.1 2015-09-18 CRAN (R 3.2.2)
## htmltools 0.2.6 2014-09-08 CRAN (R 3.2.1)
## knitr 1.11 2015-08-14 CRAN (R 3.2.2)
## lubridate * 1.3.3 2013-12-31 CRAN (R 3.2.1)
## magrittr 1.5 2014-11-22 CRAN (R 3.2.1)
## manipulate * 1.0.1 2014-12-24 CRAN (R 3.2.0)
## memoise 0.2.1 2014-04-22 CRAN (R 3.2.0)
## moments * 0.14 2015-01-05 CRAN (R 3.2.2)
## plyr 1.8.3 2015-06-12 CRAN (R 3.2.1)
## Rcpp 0.12.1 2015-09-10 CRAN (R 3.2.2)
## rmarkdown 0.8.1 2015-10-10 CRAN (R 3.2.2)
## stringi 1.0-1 2015-10-22 CRAN (R 3.2.2)
## stringr 1.0.0 2015-04-30 CRAN (R 3.2.1)
## yaml 2.1.13 2014-06-12 CRAN (R 3.2.1)