Importing Libraries

library(ISLR)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Show which datasets ship with the ISLR package.
data(package = "ISLR")
# Take a working copy of the Smarket data and inspect its structure.
st_mr = ISLR::Smarket
str(object = st_mr)
## 'data.frame':    1250 obs. of  9 variables:
##  $ Year     : num  2001 2001 2001 2001 2001 ...
##  $ Lag1     : num  0.381 0.959 1.032 -0.623 0.614 ...
##  $ Lag2     : num  -0.192 0.381 0.959 1.032 -0.623 ...
##  $ Lag3     : num  -2.624 -0.192 0.381 0.959 1.032 ...
##  $ Lag4     : num  -1.055 -2.624 -0.192 0.381 0.959 ...
##  $ Lag5     : num  5.01 -1.055 -2.624 -0.192 0.381 ...
##  $ Volume   : num  1.19 1.3 1.41 1.28 1.21 ...
##  $ Today    : num  0.959 1.032 -0.623 0.614 0.213 ...
##  $ Direction: Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...
# Keep the numeric Year column and append a factor copy named year_C.
st_mr = st_mr %>%
  mutate(year_C = as.factor(Year))

In Lag1, how many elements are positive, negative, or zero?

# Label every Lag1 value by its sign and tally the labels.
# count() accepts a named expression, so the label is built inline
# instead of in a separate mutate() step.
stock_count = st_mr %>%
  count(
    category = case_when(
      Lag1 > 0 ~ "Positive",
      Lag1 < 0 ~ "Negative",
      Lag1 == 0 ~ "Zero"
    )
  )
stock_count

What is the cutoff for the top 20% of volume traded (the 80th percentile of Volume)?

# 80th percentile of Volume: values above it fall in the top 20%.
top_20 = quantile(st_mr[["Volume"]], probs = 0.8)
top_20
##      80% 
## 1.704914

On how many days was the market direction Up, and on how many was it Down?

# Number of rows for each level of Direction.
stock_up = count(st_mr, Direction)
stock_up

What is the median of Today (the percentage return for the day)?

# Middle value of the Today column.
median_today = median(st_mr[["Today"]])
median_today
## [1] 0.0385

State the number of cases per year

# One row per Year; new_case holds the number of rows observed in that year.
case_per_year = summarise(
  group_by(st_mr, Year),
  new_case = n()
)
case_per_year

Create a dataset with only 2001 data

# Keep only the rows whose Year is 2001.
new_stock = filter(st_mr, Year == 2001)
new_stock

Distribution of Lag1, Lag3, and Lag5

Lag1

# Histogram of Lag1 using bins of width 0.5.
st_mr %>%
  ggplot(mapping = aes(x = Lag1)) +
  geom_histogram(
    binwidth = 0.5,
    fill = "brown",
    colour = "white"
  ) +
  labs(x = "Lag1", y = "Count", title = "Histogram of Lag1")

Lag3

# Histogram of Lag3 using bins of width 0.5.
st_mr %>%
  ggplot(mapping = aes(x = Lag3)) +
  geom_histogram(
    binwidth = 0.5,
    fill = "skyblue",
    colour = "white"
  ) +
  labs(x = "Lag3", y = "Count", title = "Histogram of Lag3")

Lag5

# Histogram of Lag5 using bins of width 0.5.
st_mr %>%
  ggplot(mapping = aes(x = Lag5)) +
  geom_histogram(
    binwidth = 0.5,
    fill = "darkgreen",
    colour = "white"
  ) +
  labs(x = "Lag5", y = "Count", title = "Histogram of Lag5")

Distribution of percentage return for Today based on Direction of the market

# Summarise Today's return per market direction and plot each direction's
# share of the overall return magnitude.
#
# `total` is the signed sum of Today within each Direction. Down days carry
# negative returns and Up days positive ones, so the signed totals offset
# each other and sum(total) is the net return, not a meaningful denominator:
# dividing by it yields "percentages" outside [0, 100]. The share is
# therefore computed over absolute magnitudes, which keeps every value in
# [0, 100] and makes the shares add up to 100.
per_dis = st_mr %>%
  group_by(Direction) %>%
  summarise(total = sum(Today)) %>%
  mutate(percent = abs(total) * 100 / sum(abs(total)))
per_dis
ggplot(per_dis, aes(x = Direction, y = percent, fill = Direction)) +
  geom_bar(stat = "identity") +
  labs(title = "Percentage Return by Market Direction", x = "Direction", y = "Percentage")

Compute year-wise mean volume

# Average Volume for each year from 2001 through 2005 (bounds inclusive).
# Separate filter() conditions are combined with AND.
yearly_mean = st_mr %>%
  filter(Year >= 2001, Year <= 2005) %>%
  group_by(Year) %>%
  summarise(mean_value = mean(Volume, na.rm = TRUE))
yearly_mean

Compute five year-wise percentiles (10%, 30%, 50%, 70%, 90%) of Today’s return

# Per-year 10th/30th/50th/70th/90th percentiles of Today for 2001-2005.
# Each percentile is computed by its own quantile() call so that every
# output column holds a single value per year.
yearly_percentile = st_mr %>%
  filter(Year >= 2001, Year <= 2005) %>%
  group_by(Year) %>%
  summarise(
    Q10 = quantile(Today, 0.1),
    Q30 = quantile(Today, 0.3),
    Q50 = quantile(Today, 0.5),
    Q70 = quantile(Today, 0.7),
    Q90 = quantile(Today, 0.9)
  )

yearly_percentile

Present the results of the previous question in a tabular form

# Print the percentile summary as a plain data frame.
as.data.frame(yearly_percentile)

Create a new variable for classifying Lag1 as Positive or Zero/Negative

# Flag each row by whether Lag1 is strictly positive; zero and negative
# values share the "Zero/Negative" label.
st_mr[["category"]] = ifelse(
  st_mr[["Lag1"]] > 0,
  "Positive",
  "Zero/Negative"
)
head(st_mr, n = 6)

Median, default quantiles, and custom quantiles of Lag1 for days when the market Direction is Up

# Median of Lag1 over the rows whose Direction is "Up".
summarise(
  filter(st_mr, Direction == "Up"),
  Median = median(Lag1)
)
# Quantiles of Lag1 on "Up" rows at quantile()'s default probabilities,
# wrapped in list() so the whole vector fits in a single cell.
summarise(
  filter(st_mr, Direction == "Up"),
  All_Quantiles = list(quantile(Lag1))
)
# Quantiles at explicitly supplied probabilities; the values listed here
# are the same as quantile()'s defaults (0, 0.25, 0.5, 0.75, 1).
summarise(
  filter(st_mr, Direction == "Up"),
  Custom_Quantiles = list(quantile(Lag1, probs = c(0, 0.25, 0.5, 0.75, 1)))
)