The dataset contains data from a list of Indian stocks in NSE. It includes a collection of well performing stocks. Per Jiho Kim,The dataset contains lots of #NA and #REF observations. Find the trading high price mean and low price mean per sector and the distribution of the mean prices. Dataset source: link
Read dataset from Github
stock <- read.csv(file="https://raw.githubusercontent.com/nnaemeka-git/global-datasets/main/stock%20dataset.csv", sep=",")
as_tibble(stock)
## # A tibble: 1,568 x 63
## Share Category Sector RM Up Last.Traded.Pri~ Percentage.Chan~
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 20MICR~ Exit CHEMICALS #N/A #N/A 43 #N/A
## 2 3IINFO~ BWLV IT #N/A #N/A 5.35 #N/A
## 3 3MINDIA Exit DIVERSIFIED 0 0 12219.65 1.23
## 4 63MOONS RS IT #N/A #N/A 75.6 0.26
## 5 8KMILES HWV IT 0 0 623.9 0.17
## 6 A2ZINF~ Exit INFRA-CONSTRU~ 0 0 47.85 2.93
## 7 Aarti ~ BWLV PHARMA 0 0 533.65 -0.31
## 8 AARTII~ Buy CHEMICALS CRUD~ Down 865 0.35
## 9 AARVEE~ Exit TEXTILES AND ~ #N/A #N/A 71.05 #N/A
## 10 ABAN Exit ENERGY-OIL & ~ 0 0 219.3 0.55
## # ... with 1,558 more rows, and 56 more variables: High.Price <chr>,
## # Low.Price <chr>, X..High.Movt <chr>, X..Low.movt <chr>,
## # Yearly.Gainner <chr>, X27th.Dec <chr>, X1st.Feb <chr>, X1st.March <chr>,
## # X1st.April <chr>, TB <chr>, RH <dbl>, Corr <chr>, PeRatio <dbl>,
## # New.Pe <chr>, W52_High <dbl>, Corre <chr>, latest <int>, Annual_Pat <dbl>,
## # Cum.PAT.3.Quarter <dbl>, Pat.Jump <dbl>, Annual.Growth <chr>,
## # Year.End <chr>, Mar_17_ReportedPAT <dbl>, Dec_16_ReportedPAT <dbl>,
## # Sep_16_ReportedPAT <dbl>, Jun_16_ReportedPAT <dbl>,
## # Mar_16_ReportedPAT <dbl>, Dec_15_ReportedPAT <dbl>, Y.oY.PAT <chr>,
## # LQ.0.5.4q.s. <chr>, PAT.Growth <chr>, PeRatio.1 <dbl>, ResultDate <int>,
## # Mar_17_Inst_no_of_shares <int>, Dec_16_Inst_no_of_shares <int>,
## # Int..Incr..Decr <int>, Pledge <dbl>, DividendYield <dbl>, DERatio <dbl>,
## # FB <chr>, Mar_17_Eps_Before <chr>, Dec_16_Eps_Before <chr>,
## # Sep_16_Eps_Before <chr>, Jun_16_Eps_Before <chr>, Mar_16_Eps_Before <chr>,
## # Dec_15_Eps_Before <chr>, Annual_Eps <chr>, Volume <chr>, MarketCap <chr>,
## # Annual_Pat.1 <chr>, EquityCapital <chr>, FaceValue <chr>,
## # New.Listing <chr>, PE.High.Price.High <lgl>, Future <chr>,
## # Cap.Category <chr>
Remove #N/A and #REF! from dataset
stock[stock=="#N/A" | stock=="#REF!" ] <- "0"
Convert High and Low stock price columns to numeric values
stock$High.Price <- as.numeric(replace(stock$High.Price,stock$High.Price=="#N/A","0"))
stock$Low.Price <- as.numeric(replace(stock$Low.Price,stock$Low.Price=="#N/A","0"))
stock$Sector <- replace(stock$Sector,stock$Sector%in% c("#N/A","#REF!","0"),"No sector")
tibble(stock)
## # A tibble: 1,568 x 63
## Share Category Sector RM Up Last.Traded.Pri~ Percentage.Chan~
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 20MICR~ Exit CHEMICALS 0 0 43 0
## 2 3IINFO~ BWLV IT 0 0 5.35 0
## 3 3MINDIA Exit DIVERSIFIED 0 0 12219.65 1.23
## 4 63MOONS RS IT 0 0 75.6 0.26
## 5 8KMILES HWV IT 0 0 623.9 0.17
## 6 A2ZINF~ Exit INFRA-CONSTRU~ 0 0 47.85 2.93
## 7 Aarti ~ BWLV PHARMA 0 0 533.65 -0.31
## 8 AARTII~ Buy CHEMICALS CRUD~ Down 865 0.35
## 9 AARVEE~ Exit TEXTILES AND ~ 0 0 71.05 0
## 10 ABAN Exit ENERGY-OIL & ~ 0 0 219.3 0.55
## # ... with 1,558 more rows, and 56 more variables: High.Price <dbl>,
## # Low.Price <dbl>, X..High.Movt <chr>, X..Low.movt <chr>,
## # Yearly.Gainner <chr>, X27th.Dec <chr>, X1st.Feb <chr>, X1st.March <chr>,
## # X1st.April <chr>, TB <chr>, RH <dbl>, Corr <chr>, PeRatio <dbl>,
## # New.Pe <chr>, W52_High <dbl>, Corre <chr>, latest <int>, Annual_Pat <dbl>,
## # Cum.PAT.3.Quarter <dbl>, Pat.Jump <dbl>, Annual.Growth <chr>,
## # Year.End <chr>, Mar_17_ReportedPAT <dbl>, Dec_16_ReportedPAT <dbl>,
## # Sep_16_ReportedPAT <dbl>, Jun_16_ReportedPAT <dbl>,
## # Mar_16_ReportedPAT <dbl>, Dec_15_ReportedPAT <dbl>, Y.oY.PAT <chr>,
## # LQ.0.5.4q.s. <chr>, PAT.Growth <chr>, PeRatio.1 <dbl>, ResultDate <int>,
## # Mar_17_Inst_no_of_shares <int>, Dec_16_Inst_no_of_shares <int>,
## # Int..Incr..Decr <int>, Pledge <dbl>, DividendYield <dbl>, DERatio <dbl>,
## # FB <chr>, Mar_17_Eps_Before <chr>, Dec_16_Eps_Before <chr>,
## # Sep_16_Eps_Before <chr>, Jun_16_Eps_Before <chr>, Mar_16_Eps_Before <chr>,
## # Dec_15_Eps_Before <chr>, Annual_Eps <chr>, Volume <chr>, MarketCap <chr>,
## # Annual_Pat.1 <chr>, EquityCapital <chr>, FaceValue <chr>,
## # New.Listing <chr>, PE.High.Price.High <lgl>, Future <chr>,
## # Cap.Category <chr>
stock_avg <- stock %>%
group_by(Sector)%>%
summarise(HighPriceAvg=round(mean(High.Price,na.rm=TRUE),2),
LowPriceAvg=round(mean(Low.Price,na.rm=TRUE),2))%>%arrange(desc(HighPriceAvg))#%>%
#filter(Sector == "No sector")
stock_avg
## # A tibble: 40 x 3
## Sector HighPriceAvg LowPriceAvg
## <chr> <dbl> <dbl>
## 1 AUTO-TYRES AND TUBES 10729. 10496.
## 2 FMCG 1344. 1316.
## 3 CEMENTS 1171. 1124.
## 4 AUTO-AUTOMOBILES AND AUTO PARTS 1165. 1110.
## 5 DIVERSIFIED 1107. 1082.
## 6 AUTO-AUTO ANCL AND COMPONENTS 1059. 1033.
## 7 ENERGY-OIL & GAS 582. 573.
## 8 PHARMA 571. 559.
## 9 AIRLINES 559. 534.
## 10 FINANCIAL SERVICES-NBFC 536. 521.
## # ... with 30 more rows
stock_avg%>%ggplot(aes(HighPriceAvg))+geom_histogram(binwidth=1500)+
labs(x="High Price (Rs.)",
title="The distribution of low stock price of indian stock is skewed to the right,\n and the data distribution contains meaningless Rs.0 low stock price \n and extreme outliers",
caption="Data source: https://www.kaggle.com/souravroy1/stock-market-data")+
theme_bw()
#Remove Outlier for High Price Average
high_no_outlier <-stock_avg%>%
filter(HighPriceAvg != 0 & HighPriceAvg<9000)
ggplot(high_no_outlier, aes(x=HighPriceAvg))+geom_histogram(aes(y=..density..),binwidth=181)+geom_density(col = "tomato")+
labs(x="High Price (Rs.)",
title="The distribution of High stock price of indian stock is bimodal and skewed to the \n right, centered at about Rs.268 with most of the data between Rs.100 and\n Rs.250, a range of roughly Rs.1264 and a few outliers are present on the \n higher end.",
caption="Data source: https://www.kaggle.com/souravroy1/stock-market-data")+
theme_bw()
#Summary statistics for high price
summarise(high_no_outlier,mean=mean(HighPriceAvg,na.rm=TRUE),
median=median(HighPriceAvg,na.rm=TRUE),
sd=sd(HighPriceAvg,na.rm=TRUE),
min=min(HighPriceAvg),
max=max(HighPriceAvg)
)%>%mutate(Range=max-min)
## # A tibble: 1 x 6
## mean median sd min max Range
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 395. 268. 355. 80.2 1344. 1264.
stock_avg%>%ggplot(aes(LowPriceAvg))+geom_histogram(binwidth=1500,fill="#DAF7A6")+
labs(x="Low Price (Rs.)",
title="The distribution of low stock price of indian stock is skewed to the right,\n and the data distribution contains meaningless Rs.0 low stock price \n and extreme outliers",
caption="Data source: https://www.kaggle.com/souravroy1/stock-market-data")+
theme_bw()
#Remove Outlier from Lower Price Average
low_no_outlier <- stock_avg%>%
filter(LowPriceAvg !=0 & LowPriceAvg<9000)
ggplot(low_no_outlier,aes(LowPriceAvg))+geom_histogram(aes(y=..density..),binwidth=181,fill="#DAF7A6")+geom_density(col = "red")+
labs(x="Low Price (Rs.)",
title="The distribution of low stock price of indian stock is bimodal and skewed to the\n right, centered at about Rs.262 with most of the data between Rs.100 and\n Rs.250, a range of roughly Rs.1239 and a few outliers are present on the \n higher end.",
caption="Data source: https://www.kaggle.com/souravroy1/stock-market-data")+
theme_bw()
#Summary statistics for Low price Average
summarise(low_no_outlier,mean=mean(LowPriceAvg,na.rm=TRUE),
median=median(LowPriceAvg,na.rm=TRUE),
sd=sd(LowPriceAvg,na.rm=TRUE),
min=min(LowPriceAvg),
max=max(LowPriceAvg)
)%>%mutate(Range=max-min)
## # A tibble: 1 x 6
## mean median sd min max Range
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 384. 262. 345. 77.4 1316. 1239.