Import the necessary libraries
library(ISLR)
library(dplyr)
library(ggplot2)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Load the data and convert the necessary variables into factors
# List the data sets shipped with ISLR, then take a working copy of Smarket
# and print its structure.
data(package = "ISLR")
stock <- ISLR::Smarket
str(stock)
## 'data.frame': 1250 obs. of 9 variables:
## $ Year : num 2001 2001 2001 2001 2001 ...
## $ Lag1 : num 0.381 0.959 1.032 -0.623 0.614 ...
## $ Lag2 : num -0.192 0.381 0.959 1.032 -0.623 ...
## $ Lag3 : num -2.624 -0.192 0.381 0.959 1.032 ...
## $ Lag4 : num -1.055 -2.624 -0.192 0.381 0.959 ...
## $ Lag5 : num 5.01 -1.055 -2.624 -0.192 0.381 ...
## $ Volume : num 1.19 1.3 1.41 1.28 1.21 ...
## $ Today : num 0.959 1.032 -0.623 0.614 0.213 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...
# Keep the numeric Year column and add a factor version of it as year_C.
stock[["year_C"]] <- as.factor(stock[["Year"]])
1. Number of Lag1 values that are positive, negative, or zero
# Label every row by the sign of Lag1, then tally the rows per label.
stock_count <- stock %>%
  mutate(
    category = case_when(
      Lag1 > 0 ~ "Positive",
      Lag1 < 0 ~ "Negative",
      Lag1 == 0 ~ "Zero"
    )
  )
stock_count %>%
  count(category)
2. The 80th percentile of Volume (the cut-off for the top 20% of volume traded)
# 80th percentile of Volume; days at or above this value form the top 20%.
top_20 <- quantile(stock[["Volume"]], probs = 0.8)
top_20
## 80%
## 1.704914
3. Number of days the market direction is up / down
# Group the rows by market Direction and count the days in each group.
stock_down <- group_by(stock, Direction)
count(stock_down)
5. Number of cases per year
# Group the rows by Year and count the observations in each year.
ncase <- group_by(stock, Year)
count(ncase)
6.Creating new data set which has only 2001 data
# Keep only the rows recorded in 2001 and print the resulting subset.
df_stock <- filter(stock, Year == 2001)
df_stock
7.The distributions of lag1, lag3, lag5
7(a).Distribution of Lag1
# Histogram of Lag1 with ggplot2's default binning.
ggplot(data = stock, mapping = aes(x = Lag1)) +
  geom_histogram(fill = "pink", colour = "white") +
  ggtitle("Distribution of Lag1") +
  xlab("Lag1") +
  ylab("Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

7(b).Distribution of Lag3
# Histogram of Lag3 with ggplot2's default binning.
ggplot(data = stock, mapping = aes(x = Lag3)) +
  geom_histogram(fill = "lightgreen", colour = "white") +
  ggtitle("Distribution of Lag3") +
  xlab("Lag3") +
  ylab("Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

7(c).Distribution of Lag5
# Histogram of Lag5 with ggplot2's default binning.
ggplot(data = stock, mapping = aes(x = Lag5)) +
  geom_histogram(fill = "violet", colour = "white") +
  ggtitle("Distribution of Lag5") +
  xlab("Lag5") +
  ylab("Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

8.The distribution of percentage return for today based on Direction
of the market
# Sum Today within each Direction, then express each sum as a percentage of
# the overall total.
percentage <- stock %>%
  group_by(Direction) %>%
  summarise(total = sum(Today)) %>%
  mutate(percentage = 100 * total / sum(total))
percentage
9.Year wise mean volume
# Bar chart built from the per-Direction percentage table, filled by Direction.
# `linewidth` replaces the `size` argument, which ggplot2 3.4.0 deprecated for
# line widths.
ggplot(percentage, aes(x = percentage, fill = Direction)) +
  geom_bar(linewidth = 3)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Average Volume per Year for 2001 through 2005, ignoring missing values.
mean <- stock %>%
  filter(between(Year, 2001, 2005)) %>%
  group_by(Year) %>%
  summarise(mean_value = mean(Volume, na.rm = TRUE))
mean
10.Year wise 5 percentiles (10%, 30%, 50%, 70%, 90%) of percentage
return of today
# Per-year 10th/30th/50th/70th/90th percentiles of Today.
# The probabilities must be supplied as `probs`: an unrecognised argument name
# such as `to_find` is absorbed by `...`, and quantile() then silently returns
# its default quartiles. reframe() is used because every year yields five rows,
# which summarise() no longer supports without a deprecation warning.
percentile_probs <- c(0.1, 0.3, 0.5, 0.7, 0.9)
percentile <- stock %>%
  filter(Year >= 2001 & Year <= 2005) %>%
  group_by(Year) %>%
  reframe(
    probability = percentile_probs,
    today_percentile = quantile(Today, probs = percentile_probs, names = FALSE)
  )
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
12. Creating a new variable in the data set that classifies Lag1 as
positive, zero, or negative
# Copy of stock with a text label describing the sign of Lag1.
count <- mutate(
  stock,
  category = ifelse(
    Lag1 > 0,
    "Positive",
    ifelse(Lag1 < 0, "Negative", "Zero")
  )
)
13.Only for UP Direction,
13(b).Categorize Lag 1 price based on quartiles
# Quartiles of Lag1 for "Up" days only.
# The bare column name is used so the quantiles are computed on the filtered
# rows; `stock$Lag1` reaches back to the unfiltered data frame and makes the
# filter a no-op. reframe() is used because the result has five rows.
# NOTE(review): this reports the quartile cut points only; it does not assign
# each row to a quartile bucket - confirm whether that is what is wanted.
stock %>%
  filter(Direction == "Up") %>%
  reframe(lag1_quartile = quantile(Lag1))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
13(c).Categorize Lag 1 price based on suitable 5 percentiles
# Five percentiles (0%, 25%, 50%, 75%, 100%) of Lag1 for "Up" days only.
# reframe() replaces summarise() because the result has five rows, which
# summarise() has deprecated since dplyr 1.1.0.
stock %>%
  filter(Direction == "Up") %>%
  reframe(lag1_percentile = quantile(Lag1, probs = c(0, 0.25, 0.5, 0.75, 1)))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.