---
title: "EDA for CWC(Agency) Dataset"
output:
flexdashboard::flex_dashboard:
orientation: rows
vertical_layout: scroll
theme: flatly
social: menu
source_code: embed
color: blue
---
```{r setup, include=FALSE}
library(flexdashboard)
library(flexdashboard)
library(MASS)
library(lattice)
library(dplyr)
library(ggplot2)
library(shiny)
library(DT)
```
# Dataset Description
## View of dataset
```{r}
# Load the CWC reservoir-level CSV into `daf`, the data frame the later chunks use.
daf <- read.csv("/cloud/project/Daily_data_of_reservoir_level_of_Central_Water_Commission_(CWC)_Agency_during_June_2024.csv")

# Show the data as an interactive table with export buttons.
# The Buttons extension reads its button list from the lowercase `buttons`
# option; the previous `Buttons` key was not recognised, so the requested
# copy/csv/print/pdf set was ignored.
datatable(
  daf,
  extensions = "Buttons",
  options = list(
    dom = "Bfrtip",
    buttons = c("copy", "csv", "print", "pdf")
  )
)
```
# About
## Structure of the dataset
### Structure of the dataset
```{r}
# Compact display of the structure of the loaded data frame.
utils::str(daf)
```
## Missing values and outliers detection
### Missing values and outliers detection
```{r}
# Missing-value and outlier handling for the reservoir data in `daf`.
# The earlier version of this chunk worked on the `Insurance` data set and its
# `Holders`/`Claims` columns, so nothing here described the dashboard's data.

# Count the missing values in every column. vapply() guarantees an integer
# vector even when the data frame has no columns.
missing_value <- vapply(daf, function(x) sum(is.na(x)), integer(1))
missing_value

# Replace missing numeric values with the column mean. The result is stored in
# a separate object so that `daf` itself stays untouched for other chunks.
daf_imputed <- daf %>%
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))

# Return the values of one column whose absolute z-score exceeds a threshold.
#
# data        A data frame.
# column_name Name (string) of a numeric column in `data`.
# threshold   Z-score cut-off; defaults to 3.
#
# Returns a vector of the outlying values (length 0 when there are none).
detect_outliers <- function(data, column_name, threshold = 3) {
  values <- data[[column_name]]
  z_scores <- abs(
    (values - mean(values, na.rm = TRUE)) / sd(values, na.rm = TRUE)
  )
  # which() skips NA/NaN z-scores (missing values, zero spread), so they can
  # never show up as NA entries in the result.
  values[which(z_scores > threshold)]
}

# NOTE(review): `Level` and `Storage` are assumed to be the numeric measurement
# columns of interest -- confirm against the CSV schema.
outliers_level <- detect_outliers(daf_imputed, "Level")
outliers_storage <- detect_outliers(daf_imputed, "Storage")

# Drop every row whose Level or Storage value was flagged as an outlier.
daf_clean <- daf_imputed %>%
  filter(!(Level %in% outliers_level)) %>%
  filter(!(Storage %in% outliers_storage))
```
# Summary
## summary
### summary
```{r}
# Per-column summary of the loaded data frame.
base::summary(daf)
```
# Piechart
```{r}
# Column totals that feed the pie chart; missing values are skipped.
total_Long <- sum(daf$Long, na.rm = TRUE)
total_fullreservoirlevel <- sum(daf$Full_reservoir_level, na.rm = TRUE)
total_Lat <- sum(daf$Lat, na.rm = TRUE)
# NOTE(review): the "Total Level" slice is computed from Live_capacity_FRL,
# not from the Level column -- confirm which one is intended.
total_Level <- sum(daf$Live_capacity_FRL, na.rm = TRUE)

# One row per slice: its legend label and the total it represents.
summary_data <- data.frame(
  Category = c(
    "Total Longtitude",
    "Total full revservoirlevel",
    "Total Latitude",
    "Total Level"
  ),
  Value = c(total_Long, total_fullreservoirlevel, total_Lat, total_Level)
)

# A single stacked bar bent into a circle gives the pie; each slice is
# labelled with its share of the overall total in percent.
ggplot(data = summary_data, mapping = aes(x = "", y = Value, fill = Category)) +
  geom_col(width = 1) +
  geom_text(
    aes(label = paste0(round(Value / sum(Value) * 100, 1), "%")),
    position = position_stack(vjust = 0.5)
  ) +
  coord_polar(theta = "y") +
  labs(title = "Distribution of Attributes") +
  theme_void()
```
# Uni-variate Analysis {#univariate-analysis}
## Histogram for single variable
### Histogram for Longitude
```{r}
# Histogram of the Long column with a red vertical line at its mean.
ggplot(daf, aes(x = Long)) +
  # bins = 30 is the count geom_histogram() falls back to when none is given;
  # stating it keeps the same picture without the "pick better value" message.
  geom_histogram(bins = 30, fill = "lightblue", color = "black") +
  # na.rm = TRUE keeps the mean line on the plot when the column has missing
  # values; without it the mean is NA and no line is drawn.
  geom_vline(aes(xintercept = mean(Long, na.rm = TRUE)), color = "red", lwd = 1) +
  labs(title = "Histogram of Longitude", x = "Longitude")
```
### Histogram for Latitude
```{r}
# Histogram of the Lat column with a red vertical line at its mean.
ggplot(daf, aes(x = Lat)) +
  # bins = 30 is the count geom_histogram() falls back to when none is given;
  # stating it keeps the same picture without the "pick better value" message.
  geom_histogram(bins = 30, fill = "violet", color = "black") +
  # na.rm = TRUE keeps the mean line on the plot when the column has missing
  # values; without it the mean is NA and no line is drawn.
  geom_vline(aes(xintercept = mean(Lat, na.rm = TRUE)), color = "red", lwd = 1) +
  # Title and axis label previously read "Latidude".
  labs(title = "Histogram of Latitude", x = "Latitude")
```
### Histogram for Level (ggplot2)
```{r}
# Histogram of the Level column (100 bins) with a red line at its mean.
ggplot(daf, aes(x = Level)) +
  geom_histogram(bins = 100, fill = "blue", color = "black") +
  # na.rm = TRUE keeps the mean line on the plot when the column has missing
  # values; without it the mean is NA and no line is drawn.
  geom_vline(aes(xintercept = mean(Level, na.rm = TRUE)), color = "red", lwd = 1) +
  # The x-axis label used to read "Popden", which does not match the plotted
  # Level column.
  labs(title = "Histogram of Level", x = "Level", y = "Frequency")
```
### Histogram for Long
```{r}
# Base-graphics histogram of the Long column with alternating bar colours.
# The x-axis now follows the data. Assuming Long holds longitude in degrees,
# the previous fixed xlim of c(0, 1000) was far wider than the data and
# squeezed every bar into a narrow strip at the left of the plot.
hist(
  daf$Long,
  breaks = 500,
  main = "Distribution of Longitude",
  xlab = "Longitude",
  col = c("blue", "orange")
)
```
### Histogram for Lat
```{r}
# Base-graphics histogram of the Lat column with alternating bar colours.
# The x-axis now follows the data. Assuming Lat holds latitude in degrees,
# the previous fixed xlim of c(0, 1000) was far wider than the data and
# squeezed every bar into a narrow strip at the left of the plot.
hist(
  daf$Lat,
  breaks = 500,
  main = "Distribution of Latitude",
  xlab = "Latitude",
  col = c("pink", "yellow")
)
```
### Histogram for Level
```{r}
# Base-graphics histogram of the Level column with alternating bar colours.
# The x-axis now follows the data instead of a fixed c(0, 1000) window, which
# would hide any value above 1000 and waste space if the values are smaller.
hist(
  daf$Level,
  breaks = 500,
  main = "Distribution of Level",
  xlab = "Level",
  col = c("violet", "gray")
)
```
# Bivariate Analysis
## Box plot
### Box Plot for Long
```{r}
# Box-and-whisker plot of the Long column.
boxplot(x = daf$Long)
```
### Box Plot for Full_reservoir_level
```{r}
# Box-and-whisker plot of the Full_reservoir_level column.
boxplot(x = daf$Full_reservoir_level)
```
### Box Plot for Lat
```{r}
# Box-and-whisker plot of the Lat column.
boxplot(x = daf$Lat)
```
### Box Plot for Live_capacity_FRL
```{r}
# Box-and-whisker plot of the Live_capacity_FRL column.
boxplot(x = daf$Live_capacity_FRL)
```
### Box Plot for Year
```{r}
# Box-and-whisker plot of the Year column.
boxplot(x = daf$Year)
```
### Box Plot for Month
```{r}
# Box-and-whisker plot of the Month column.
boxplot(x = daf$Month)
```
# Multi-variate Analysis
## Scatter plot
### Scatter plot for Longitude vs Latitude
```{r}
# Scatter of Long (x-axis) against Lat (y-axis), one point per row of daf.
ggplot(data = daf, mapping = aes(Long, Lat)) +
  geom_point() +
  ggtitle("Scatter plot of long vs. lat")
```
### Scatter plot for Level vs Latitude
```{r}
# Scatter of Level (x-axis) against Lat (y-axis), one point per row of daf.
ggplot(data = daf, mapping = aes(Level, Lat)) +
  geom_point() +
  ggtitle("Scatter plot of level vs. lat")
```
### Scatter plot for Longitude vs Storage
```{r}
# Scatter of Long (x-axis) against Storage (y-axis), one point per row of daf.
ggplot(data = daf, mapping = aes(Long, Storage)) +
  geom_point() +
  ggtitle("Scatter plot of long vs. storage")
```
### Scatter plot for Level vs Storage
```{r}
# Scatter of Level (x-axis) against Storage (y-axis), one point per row of daf.
ggplot(data = daf, mapping = aes(Level, Storage)) +
  geom_point() +
  ggtitle("Scatter plot of level vs. storage")
```
# Insights and Inference
### Insights and inference (PDF)
```{r}
# Emit an <iframe> element that points at the insights PDF.
# NOTE(review): the address is a GitHub "blob" page rather than a direct link
# to the file -- confirm that it actually renders inside the iframe.
pdf_path <- "https://github.com/mvjayaruthra14/INSIGHTS_AND_INFERENCES_OF_EDA/blob/main/file_show.pdf"
htmltools::tags$iframe(
  src = pdf_path,
  width = "100%",
  height = "600px"
)
```