# Set your working directory
### REPLACE WITH YOUR FILE PATH
setwd('/home/nicole/Documents/Teaching/Class_Documents/Lecture2')
# WINDOWS: setwd("c:/Documents/my/working/directory")
# MAC: setwd("/Users/yourname/working/directory")
First, make sure you have installed and loaded the required libraries
library("readxl")
library("data.table")
library("tidyverse")
library("zoo")
library("reshape2")
# recall the install function: example -
# install.packages("readxl")
We want the file “excel_CAC40.xlsx”, and the sheet “Webstat_Export_20200124”. We know there is a header in the first 4 lines, so we skip these lines on the import.
df <- read_excel("excel_CAC40.xlsx", sheet = "Webstat_Export_20200124", skip = 4)
## New names:
## * `` -> ...1
## * `` -> ...6
# df <- read_excel("excel_CAC40.xlsx", sheet = 1, skip = 4) # we can also import by sheet order
# read in as a tibble object
df <- as.data.frame(df) # convert to data frame object
dim(df) # get dimensions of data
## [1] 96 16
head(df) # view first 5 rows of data
We see that “…1” and “…6” are the same item, so we remove one. We don’t like some these variable names, so we change them using better naming conventions.
df <- df[,!(names(df) %in% "...6")]
head(df)
names(df) # view names of columns
## [1] "...1" "flow"
## [3] "ocv_norev" "stock"
## [5] "ocv_rev" "g.r. CAC 40"
## [7] "g.r. CAC 40 (macrotrends)" "CAC 40"
## [9] "CAC 40 2014 =100" "CAC40 2015 =100 (macrotrends)"
## [11] "verif 1" "verif 2"
## [13] "check" "inflation rate"
## [15] "fin profitability (real)"
# we don't like some these variable names, so we change them using better naming conventions
setnames(df,old=c("...1",
"g.r. CAC 40",
"g.r. CAC 40 (macrotrends)",
"CAC 40",
"CAC 40 2014 =100",
"CAC40 2015 =100 (macrotrends)",
"verif 1",
"verif 2",
"inflation rate",
"fin profitability (real)"),
new=c("qtr",
"gr.CAC40",
"gr.CAC40.mt",
"CAC40",
"CAC40.2014.100",
"CAC40.2015.100.mt",
"verif.1",
"verif.2",
"inflation.rate",
"fin.profitability.real"))
names(df) # much better!
## [1] "qtr" "flow"
## [3] "ocv_norev" "stock"
## [5] "ocv_rev" "gr.CAC40"
## [7] "gr.CAC40.mt" "CAC40"
## [9] "CAC40.2014.100" "CAC40.2015.100.mt"
## [11] "verif.1" "verif.2"
## [13] "check" "inflation.rate"
## [15] "fin.profitability.real"
The variable qtr is coded as a character. Let’s convert it to a year-quarter data-time type.
str(df) # view the data types
## 'data.frame': 96 obs. of 15 variables:
## $ qtr : chr "1995Q4" "1996Q1" "1996Q2" "1996Q3" ...
## $ flow : num 0 1523 -2581 2136 -1740 ...
## $ ocv_norev : num 0 -1414 2183 -383 -2399 ...
## $ stock : num 225961 263342 269122 262088 278622 ...
## $ ocv_rev : num NA 37272 6178 -8787 20673 ...
## $ gr.CAC40 : num NA 16.49 2.35 -3.27 7.89 ...
## $ gr.CAC40.mt : num NA 9.231 3.859 0.428 8.576 ...
## $ CAC40 : num 1 1.16 1.19 1.15 1.24 ...
## $ CAC40.2014.100 : num 0.327 0.381 0.39 0.378 0.407 ...
## $ CAC40.2015.100.mt : num 0.432 0.472 0.49 0.492 0.534 ...
## $ verif.1 : num NA 15.27 2.32 -3.32 7.59 ...
## $ verif.2 : num NA 16.49 2.35 -3.27 7.89 ...
## $ check : num NA 0.0 2.8e-14 0.0 0.0 ...
## $ inflation.rate : num 1.216 1.699 1.476 1.218 0.951 ...
## $ fin.profitability.real: num NA 14.8 0.87 -4.48 6.94 ...
df$qtr <- as.Date(as.yearqtr(df$qtr, format = "%YQ%q"))
str(df) # now coded as a date object
## 'data.frame': 96 obs. of 15 variables:
## $ qtr : Date, format: "1995-10-01" "1996-01-01" ...
## $ flow : num 0 1523 -2581 2136 -1740 ...
## $ ocv_norev : num 0 -1414 2183 -383 -2399 ...
## $ stock : num 225961 263342 269122 262088 278622 ...
## $ ocv_rev : num NA 37272 6178 -8787 20673 ...
## $ gr.CAC40 : num NA 16.49 2.35 -3.27 7.89 ...
## $ gr.CAC40.mt : num NA 9.231 3.859 0.428 8.576 ...
## $ CAC40 : num 1 1.16 1.19 1.15 1.24 ...
## $ CAC40.2014.100 : num 0.327 0.381 0.39 0.378 0.407 ...
## $ CAC40.2015.100.mt : num 0.432 0.472 0.49 0.492 0.534 ...
## $ verif.1 : num NA 15.27 2.32 -3.32 7.59 ...
## $ verif.2 : num NA 16.49 2.35 -3.27 7.89 ...
## $ check : num NA 0.0 2.8e-14 0.0 0.0 ...
## $ inflation.rate : num 1.216 1.699 1.476 1.218 0.951 ...
## $ fin.profitability.real: num NA 14.8 0.87 -4.48 6.94 ...
summary(df) # view summary statistics of each variable
## qtr flow ocv_norev
## Min. :1995-10-01 Min. :-15754.0 Min. :-33243.0
## 1st Qu.:2001-09-08 1st Qu.: -3103.2 1st Qu.: -2459.0
## Median :2007-08-16 Median : -260.0 Median : -0.5
## Mean :2007-08-16 Mean : 623.4 Mean : 2358.0
## 3rd Qu.:2013-07-24 3rd Qu.: 4147.2 3rd Qu.: 3086.2
## Max. :2019-07-01 Max. : 23633.0 Max. : 42504.0
##
## stock ocv_rev gr.CAC40 gr.CAC40.mt
## Min. : 225961 Min. :-159357 Min. :-20.668 Min. :-28.747
## 1st Qu.: 507720 1st Qu.: -23170 1st Qu.: -3.268 1st Qu.: -2.763
## Median : 667843 Median : 15589 Median : 2.499 Median : 2.553
## Mean : 677603 Mean : 8025 Mean : 1.941 Mean : 1.734
## 3rd Qu.: 805964 3rd Qu.: 42044 3rd Qu.: 7.557 3rd Qu.: 7.528
## Max. :1274514 Max. : 143049 Max. : 26.070 Max. : 29.771
## NA's :1 NA's :1 NA's :1
## CAC40 CAC40.2014.100 CAC40.2015.100.mt verif.1
## Min. :1.000 Min. :0.3274 Min. :0.4320 Min. :-23.153
## 1st Qu.:1.929 1st Qu.:0.6314 1st Qu.:0.8143 1st Qu.: -3.322
## Median :2.472 Median :0.8093 Median :0.9729 Median : 2.469
## Mean :2.508 Mean :0.8211 Mean :0.9732 Mean : 1.488
## 3rd Qu.:3.068 3rd Qu.:1.0044 3rd Qu.:1.1665 3rd Qu.: 7.285
## Max. :4.111 Max. :1.3458 Max. :1.4875 Max. : 23.166
## NA's :1
## verif.2 check inflation.rate fin.profitability.real
## Min. :-20.668 Min. :0 Min. :-0.3356 Min. :-22.668
## 1st Qu.: -3.268 1st Qu.:0 1st Qu.: 0.7317 1st Qu.: -4.553
## Median : 2.499 Median :0 Median : 1.2169 Median : 1.423
## Mean : 1.941 Mean :0 Mean : 1.2612 Mean : 0.679
## 3rd Qu.: 7.557 3rd Qu.:0 3rd Qu.: 1.9237 3rd Qu.: 5.529
## Max. : 26.070 Max. :0 Max. : 2.7726 Max. : 24.733
## NA's :1 NA's :1 NA's :1
It appears we have some NA’s, which we know are in the first row due to calculatin growth rates. For now we leave it since we will re-calcualte the variables using R here.
# 1. ocv_rev
df$ocv_rev.r <- df$stock - lag(df$stock) - df$ocv_norev - df$flow
summary(df$ocv_rev)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -159357 -23170 15589 8025 42044 143049 1
summary(df$ocv_rev.r) # same values
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -159357 -23170 15589 8025 42044 143049 1
# 2. gr.CAC40
df$gr.CAC40.r <- df$ocv_rev.r*100/lag(df$stock)
# 3. gr.CAC40.mt
df$gr.CAC40.mt.r <- (df$CAC40.2015.100.mt-lag(df$CAC40.2015.100.mt))*100/lag(df$CAC40.2015.100.mt)
# 4. CAC40
df$CAC40.r.h <- (1+df$gr.CAC40.r/100) # first we create a 'helper' column that has the growth rate
# this makes things a bit easier to code
df$CAC40.r.h[is.na(df$CAC40.r.h)] <-1 # we set the 1st row growth rate = 1
df$CAC40.r <- 1 # we create a holder column initialized to 1
for(i in 2:nrow(df)){
# for each row in the column CAC40.r multiple the previous row by the 'helper' column
# we index each row by i
df[i,]$CAC40.r <- df[i-1,]$CAC40.r*df[i,]$CAC40.r.h
}
# 5. CAC40.2014.100
df$index <- mean(df[year(df$qtr) %in% "2014",]$CAC40.r)
df$CAC40.2014.100.r <- df$CAC40.r / df$index
# 6. fin.profitability.real
df$fin.profitability.real.r <- df$gr.CAC40.r-df$inflation.rate
Why can’t we just use lead() and lag() for CAC40.r? Calculations are done simultaneously, and not progressively. For the CAC40.r, each cell needs to be calculated before it can be used in the next cell calculation, thus we need to do each calculation one at a time.
head(df)
df <- df[-1,] # remove first row
Remember that for ease of use, we melt the data
m_df <- melt(df,id.vars="qtr")
head(m_df)
ggplot(m_df[m_df$variable %in% c('fin.profitability.real.r','gr.CAC40.r'),],
aes(x=qtr,y=value,fill=variable,color=variable))+
geom_line(alpha=.7) +
theme_bw() +
xlab("Quarter") +
ylab("") +
theme(legend.title = element_blank(),
legend.position = "bottom") +
scale_color_manual(values=c("Navy","cadetblue"),
labels=c("Nominal","Real"))
ggplot(m_df[m_df$variable %in% c('gr.CAC40.mt.r','gr.CAC40.r'),],
aes(x=qtr,y=value,fill=variable,color=variable))+
geom_line(alpha=.7) +
theme_bw() +
xlab("Quarter") +
ylab("") +
theme(legend.title = element_blank(),
legend.position = "bottom") +
scale_color_manual(values=c("Navy","cadetblue"),
labels=c("CAC 40 Macro Trends \n Growth Rate",
"CAC 40 Growth Rate"))
ggplot(m_df[m_df$variable %in% c('CAC40.2015.100.mt','CAC40.2014.100.r'),],
aes(x=qtr,y=value,fill=variable,color=variable))+
geom_line(alpha=.7) +
theme_bw() +
xlab("Quarter") +
ylab("") +
theme(legend.title = element_blank(),
legend.position = "bottom") +
scale_color_manual(values=c("Navy","cadetblue"),
labels=c("CAC 40 Macro Trends (2015=1)",
"CAC 40 (2014=1)"))