Tidying and Exploratory Analysis of Indian stock Data

About The Project

The dataset contains data from a list of Indian stocks in NSE. It includes a collection of well performing stocks. Per Jiho Kim,The dataset contains lots of #NA and #REF observations. Find the trading high price mean and low price mean per sector and the distribution of the mean prices. Dataset source: link

Read dataset from Github

stock <- read.csv(file="https://raw.githubusercontent.com/nnaemeka-git/global-datasets/main/stock%20dataset.csv", sep=",")
as_tibble(stock)

## # A tibble: 1,568 x 63
##    Share   Category Sector         RM    Up    Last.Traded.Pri~ Percentage.Chan~
##    <chr>   <chr>    <chr>          <chr> <chr> <chr>            <chr>           
##  1 20MICR~ Exit     CHEMICALS      #N/A  #N/A  43               #N/A            
##  2 3IINFO~ BWLV     IT             #N/A  #N/A  5.35             #N/A            
##  3 3MINDIA Exit     DIVERSIFIED    0     0     12219.65         1.23            
##  4 63MOONS RS       IT             #N/A  #N/A  75.6             0.26            
##  5 8KMILES HWV      IT             0     0     623.9            0.17            
##  6 A2ZINF~ Exit     INFRA-CONSTRU~ 0     0     47.85            2.93            
##  7 Aarti ~ BWLV     PHARMA         0     0     533.65           -0.31           
##  8 AARTII~ Buy      CHEMICALS      CRUD~ Down  865              0.35            
##  9 AARVEE~ Exit     TEXTILES AND ~ #N/A  #N/A  71.05            #N/A            
## 10 ABAN    Exit     ENERGY-OIL & ~ 0     0     219.3            0.55            
## # ... with 1,558 more rows, and 56 more variables: High.Price <chr>,
## #   Low.Price <chr>, X..High.Movt <chr>, X..Low.movt <chr>,
## #   Yearly.Gainner <chr>, X27th.Dec <chr>, X1st.Feb <chr>, X1st.March <chr>,
## #   X1st.April <chr>, TB <chr>, RH <dbl>, Corr <chr>, PeRatio <dbl>,
## #   New.Pe <chr>, W52_High <dbl>, Corre <chr>, latest <int>, Annual_Pat <dbl>,
## #   Cum.PAT.3.Quarter <dbl>, Pat.Jump <dbl>, Annual.Growth <chr>,
## #   Year.End <chr>, Mar_17_ReportedPAT <dbl>, Dec_16_ReportedPAT <dbl>,
## #   Sep_16_ReportedPAT <dbl>, Jun_16_ReportedPAT <dbl>,
## #   Mar_16_ReportedPAT <dbl>, Dec_15_ReportedPAT <dbl>, Y.oY.PAT <chr>,
## #   LQ.0.5.4q.s. <chr>, PAT.Growth <chr>, PeRatio.1 <dbl>, ResultDate <int>,
## #   Mar_17_Inst_no_of_shares <int>, Dec_16_Inst_no_of_shares <int>,
## #   Int..Incr..Decr <int>, Pledge <dbl>, DividendYield <dbl>, DERatio <dbl>,
## #   FB <chr>, Mar_17_Eps_Before <chr>, Dec_16_Eps_Before <chr>,
## #   Sep_16_Eps_Before <chr>, Jun_16_Eps_Before <chr>, Mar_16_Eps_Before <chr>,
## #   Dec_15_Eps_Before <chr>, Annual_Eps <chr>, Volume <chr>, MarketCap <chr>,
## #   Annual_Pat.1 <chr>, EquityCapital <chr>, FaceValue <chr>,
## #   New.Listing <chr>, PE.High.Price.High <lgl>, Future <chr>,
## #   Cap.Category <chr>

Remove #N/A and #REF! from dataset

stock[stock=="#N/A" | stock=="#REF!" ] <- "0"

Convert High and Low stock price columns to numeric values

stock$High.Price <- as.numeric(replace(stock$High.Price,stock$High.Price=="#N/A","0"))
stock$Low.Price <- as.numeric(replace(stock$Low.Price,stock$Low.Price=="#N/A","0"))
stock$Sector <- replace(stock$Sector,stock$Sector%in% c("#N/A","#REF!","0"),"No sector")
tibble(stock)

## # A tibble: 1,568 x 63
##    Share   Category Sector         RM    Up    Last.Traded.Pri~ Percentage.Chan~
##    <chr>   <chr>    <chr>          <chr> <chr> <chr>            <chr>           
##  1 20MICR~ Exit     CHEMICALS      0     0     43               0               
##  2 3IINFO~ BWLV     IT             0     0     5.35             0               
##  3 3MINDIA Exit     DIVERSIFIED    0     0     12219.65         1.23            
##  4 63MOONS RS       IT             0     0     75.6             0.26            
##  5 8KMILES HWV      IT             0     0     623.9            0.17            
##  6 A2ZINF~ Exit     INFRA-CONSTRU~ 0     0     47.85            2.93            
##  7 Aarti ~ BWLV     PHARMA         0     0     533.65           -0.31           
##  8 AARTII~ Buy      CHEMICALS      CRUD~ Down  865              0.35            
##  9 AARVEE~ Exit     TEXTILES AND ~ 0     0     71.05            0               
## 10 ABAN    Exit     ENERGY-OIL & ~ 0     0     219.3            0.55            
## # ... with 1,558 more rows, and 56 more variables: High.Price <dbl>,
## #   Low.Price <dbl>, X..High.Movt <chr>, X..Low.movt <chr>,
## #   Yearly.Gainner <chr>, X27th.Dec <chr>, X1st.Feb <chr>, X1st.March <chr>,
## #   X1st.April <chr>, TB <chr>, RH <dbl>, Corr <chr>, PeRatio <dbl>,
## #   New.Pe <chr>, W52_High <dbl>, Corre <chr>, latest <int>, Annual_Pat <dbl>,
## #   Cum.PAT.3.Quarter <dbl>, Pat.Jump <dbl>, Annual.Growth <chr>,
## #   Year.End <chr>, Mar_17_ReportedPAT <dbl>, Dec_16_ReportedPAT <dbl>,
## #   Sep_16_ReportedPAT <dbl>, Jun_16_ReportedPAT <dbl>,
## #   Mar_16_ReportedPAT <dbl>, Dec_15_ReportedPAT <dbl>, Y.oY.PAT <chr>,
## #   LQ.0.5.4q.s. <chr>, PAT.Growth <chr>, PeRatio.1 <dbl>, ResultDate <int>,
## #   Mar_17_Inst_no_of_shares <int>, Dec_16_Inst_no_of_shares <int>,
## #   Int..Incr..Decr <int>, Pledge <dbl>, DividendYield <dbl>, DERatio <dbl>,
## #   FB <chr>, Mar_17_Eps_Before <chr>, Dec_16_Eps_Before <chr>,
## #   Sep_16_Eps_Before <chr>, Jun_16_Eps_Before <chr>, Mar_16_Eps_Before <chr>,
## #   Dec_15_Eps_Before <chr>, Annual_Eps <chr>, Volume <chr>, MarketCap <chr>,
## #   Annual_Pat.1 <chr>, EquityCapital <chr>, FaceValue <chr>,
## #   New.Listing <chr>, PE.High.Price.High <lgl>, Future <chr>,
## #   Cap.Category <chr>

High Stock Price mean per Sector

stock_avg <- stock %>%
  group_by(Sector)%>%
  summarise(HighPriceAvg=round(mean(High.Price,na.rm=TRUE),2),
            LowPriceAvg=round(mean(Low.Price,na.rm=TRUE),2))%>%arrange(desc(HighPriceAvg))#%>%
  #filter(Sector == "No sector")
stock_avg

## # A tibble: 40 x 3
##    Sector                          HighPriceAvg LowPriceAvg
##    <chr>                                  <dbl>       <dbl>
##  1 AUTO-TYRES AND TUBES                  10729.      10496.
##  2 FMCG                                   1344.       1316.
##  3 CEMENTS                                1171.       1124.
##  4 AUTO-AUTOMOBILES AND AUTO PARTS        1165.       1110.
##  5 DIVERSIFIED                            1107.       1082.
##  6 AUTO-AUTO ANCL AND COMPONENTS          1059.       1033.
##  7 ENERGY-OIL & GAS                        582.        573.
##  8 PHARMA                                  571.        559.
##  9 AIRLINES                                559.        534.
## 10 FINANCIAL SERVICES-NBFC                 536.        521.
## # ... with 30 more rows

Show High stock Price Mean Distribution

stock_avg%>%ggplot(aes(HighPriceAvg))+geom_histogram(binwidth=1500)+
    labs(x="High Price (Rs.)",
       title="The distribution of low stock price of indian stock is skewed to the right,\n and the data distribution contains meaningless Rs.0 low stock price \n and extreme outliers",
       caption="Data source: https://www.kaggle.com/souravroy1/stock-market-data")+
  theme_bw()

#Remove Outlier for High Price Average
high_no_outlier <-stock_avg%>%
  filter(HighPriceAvg != 0 & HighPriceAvg<9000)

ggplot(high_no_outlier, aes(x=HighPriceAvg))+geom_histogram(aes(y=..density..),binwidth=181)+geom_density(col = "tomato")+
  labs(x="High Price (Rs.)",
       title="The distribution of High stock price of indian stock is bimodal and skewed to the \n right, centered at about Rs.268 with most of the data between Rs.100 and\n Rs.250, a range of roughly Rs.1264 and a few outliers are present on the \n higher end.",
       caption="Data source: https://www.kaggle.com/souravroy1/stock-market-data")+
  theme_bw()

#Summary statistics for high price
summarise(high_no_outlier,mean=mean(HighPriceAvg,na.rm=TRUE),
          median=median(HighPriceAvg,na.rm=TRUE),
          sd=sd(HighPriceAvg,na.rm=TRUE),
          min=min(HighPriceAvg),
          max=max(HighPriceAvg)
          )%>%mutate(Range=max-min)

## # A tibble: 1 x 6
##    mean median    sd   min   max Range
##   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  395.   268.  355.  80.2 1344. 1264.

Show Low stock Price Mean Distribution

stock_avg%>%ggplot(aes(LowPriceAvg))+geom_histogram(binwidth=1500,fill="#DAF7A6")+
   labs(x="Low Price (Rs.)",
       title="The distribution of low stock price of indian stock is skewed to the right,\n and the data distribution contains meaningless Rs.0 low stock price \n and extreme outliers",
       caption="Data source: https://www.kaggle.com/souravroy1/stock-market-data")+
  theme_bw()

#Remove Outlier from Lower Price Average
low_no_outlier <- stock_avg%>%
  filter(LowPriceAvg !=0 & LowPriceAvg<9000)


ggplot(low_no_outlier,aes(LowPriceAvg))+geom_histogram(aes(y=..density..),binwidth=181,fill="#DAF7A6")+geom_density(col = "red")+
 labs(x="Low Price (Rs.)",
       title="The distribution of low stock price of indian stock is bimodal and skewed to the\n right, centered at about Rs.262 with most of the data between Rs.100 and\n Rs.250, a range of roughly Rs.1239 and a few outliers are present on the \n higher end.",
       caption="Data source: https://www.kaggle.com/souravroy1/stock-market-data")+
  theme_bw()

#Summary statistics for Low price Average
summarise(low_no_outlier,mean=mean(LowPriceAvg,na.rm=TRUE),
          median=median(LowPriceAvg,na.rm=TRUE),
          sd=sd(LowPriceAvg,na.rm=TRUE),
          min=min(LowPriceAvg),
          max=max(LowPriceAvg)
          )%>%mutate(Range=max-min)

## # A tibble: 1 x 6
##    mean median    sd   min   max Range
##   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  384.   262.  345.  77.4 1316. 1239.

Tidying and Exploratory Analysis of Indian stock Data

Nnaemeka Okereafor

10/2/2021

About The Project

High Stock Price mean per Sector

Show High stock Price Mean Distribution

Show Low stock Price Mean Distribution