---
title: "EDA for Air_index dataset"
output:
  # Format options only take effect when nested under the output format key.
  flexdashboard::flex_dashboard:
    orientation: rows
    vertical_layout: scroll
    theme: yeti
    social: menu
    source_code: embed
---
```{r setup, include=FALSE}
library(flexdashboard)
library(ggplot2)
library(dplyr)
library(tidyr)
```
## Dataset Description {.tabset}
```{r}
# Load the CSV into `df`, the data frame every later chunk works on.
# The path is relative, so it is resolved against the working directory
# in effect when the document is knitted.
df <- read.csv(file = "air_index.csv")
```
### Head
```{r}
# Preview the first six rows (the default for head()).
head(df, n = 6L)
```
### Structure
```{r}
# Compact overview of column names, types and leading values.
str(object = df)
```
### Summary
```{r}
# Per-column summary statistics.
summary(object = df)
```
### Null Values
```{r}
# Number of NA entries in each column.
df %>%
  is.na() %>%
  colSums()
```
```{r}
# Mean-impute the three pollutant columns: each NA is replaced by the mean of
# the non-missing values of that same column; other values are left as they are.
df <- df %>%
  mutate(
    across(
      c(pollutant_min, pollutant_max, pollutant_avg),
      function(values) {
        ifelse(is.na(values), mean(values, na.rm = TRUE), values)
      }
    )
  )
```
```{r}
# Recount NA entries per column to confirm the effect of the imputation.
df %>%
  is.na() %>%
  colSums()
```
## Univariate analysis {.tabset}
### Distribution of Pollutant_min levels
```{r}
# Histogram of `pollutant_min` using 30 bins.
df %>%
  ggplot(mapping = aes(x = pollutant_min)) +
  geom_histogram(bins = 30, colour = "black", fill = "skyblue") +
  labs(
    title = "Distribution of Pollutant Min Levels",
    x = "Pollutant Min",
    y = "Count"
  )
```
### Distribution of Pollutant_max levels
```{r}
# Histogram of `pollutant_max` using 30 bins.
df %>%
  ggplot(mapping = aes(x = pollutant_max)) +
  geom_histogram(bins = 30, colour = "black", fill = "lightgreen") +
  labs(
    title = "Distribution of Pollutant Max Levels",
    x = "Pollutant Max",
    y = "Count"
  )
```
### Distribution of Pollutant_avg levels
```{r}
# Histogram of `pollutant_avg` using 30 bins.
df %>%
  ggplot(mapping = aes(x = pollutant_avg)) +
  geom_histogram(bins = 30, colour = "black", fill = "lightcoral") +
  labs(
    title = "Distribution of Pollutant Avg Levels",
    x = "Pollutant Avg",
    y = "Count"
  )
```
### Plot the average pollutant levels by state
```{r}
# Mean of `pollutant_avg` for every state; NA values are ignored.
state_pollution_summary <- summarise(
  group_by(df, state),
  avg_pollutant_level = mean(pollutant_avg, na.rm = TRUE)
)

# The same summary ordered from the highest to the lowest state mean.
most_polluted_state <- arrange(
  state_pollution_summary,
  desc(avg_pollutant_level)
)

# Horizontal bar chart: reorder() sorts the states by their mean and
# coord_flip() puts the state names on the vertical axis for readability.
state_pollution_summary %>%
  ggplot(
    aes(
      x = reorder(state, avg_pollutant_level),
      y = avg_pollutant_level
    )
  ) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Average Pollutant Levels by State",
    x = "State",
    y = "Average Pollutant Level"
  ) +
  theme_minimal()
```
### Plot the average pollutant level by pollutant_id
```{r}
# One bar per `pollutant_id`; the bar height is the mean of `pollutant_avg`
# computed by the summary stat rather than by a separate aggregation step.
ggplot(data = df, mapping = aes(x = pollutant_id, y = pollutant_avg)) +
  stat_summary(
    geom = "bar",
    fun = "mean",
    position = "stack",
    fill = "lightcoral"
  ) +
  labs(
    title = "Average Pollutant Levels by Pollutant ID",
    x = "Pollutant ID",
    y = "Average Pollutant Level"
  ) +
  theme_minimal()
```
## Bivariate analysis {.tabset}
### Scatter plot of pollutant_avg vs pollutant_max
```{r}
# Scatter of `pollutant_avg` (y) against `pollutant_max` (x); points are
# semi-transparent so dense regions remain readable.
df %>%
  ggplot(mapping = aes(x = pollutant_max, y = pollutant_avg)) +
  geom_point(alpha = 0.6, colour = "blue") +
  labs(
    title = "Scatter Plot of Pollutant Max vs Pollutant Avg",
    x = "Pollutant Max",
    y = "Pollutant Avg"
  )
```
### Box plot of pollutant_avg by pollutant_id
```{r}
# Box plot of `pollutant_avg` for each `pollutant_id`, one fill colour per box.
df %>%
  ggplot(
    mapping = aes(x = pollutant_id, y = pollutant_avg, fill = pollutant_id)
  ) +
  geom_boxplot() +
  labs(
    title = "Box Plot of Pollutant Avg by Pollutant Type",
    x = "Pollutant Type",
    y = "Pollutant Avg"
  ) +
  # The fill legend would only repeat the x-axis labels, so it is hidden.
  theme(legend.position = "none")
```
### Box plot of pollutant_avg by state
```{r}
# Box plot of `pollutant_avg` for each state, one fill colour per box.
df %>%
  ggplot(mapping = aes(x = state, y = pollutant_avg, fill = state)) +
  geom_boxplot() +
  labs(
    title = "Box Plot of Pollutant Avg by state",
    x = "state",
    y = "Pollutant Avg"
  ) +
  # Rotate the state labels so they do not overlap, and hide the fill legend
  # because it would only repeat the x-axis labels.
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    legend.position = "none"
  )
```
### Pollutant average levels by city and pollutant ID (Tamil Nadu)
```{r}
# Grouped bar chart of the mean `pollutant_avg` per city and pollutant, limited
# to the rows whose `state` is "TamilNadu".
tamilnadu_data <- df %>%
  filter(state == "TamilNadu")

# stat = "summary" with fun = "mean" collapses repeated (city, pollutant_id)
# rows into a single bar showing their mean. With stat = "identity" such rows
# would be drawn on top of each other, so the bar would not be an average.
# When every (city, pollutant_id) pair occurs once the chart is unchanged.
ggplot(tamilnadu_data, aes(x = city, y = pollutant_avg, fill = pollutant_id)) +
  geom_bar(stat = "summary", fun = "mean", position = "dodge") +
  labs(
    title = "Pollutant Average Levels by City and Pollutant Type",
    x = "City",
    y = "Pollutant Average Level",
    fill = "Pollutant ID"
  ) +
  # Vertical city labels keep long names from overlapping.
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_brewer(palette = "Set1")
```
## Multivariate analysis
### Heatmap
```{r}
# Heatmap of the mean `pollutant_avg` for each city / pollutant pair, limited
# to the rows whose `state` is "TamilNadu".
tamilnadu_data <- df %>%
  filter(state == "TamilNadu")

# One value per tile: average any repeated (city, pollutant_id) rows so that
# overlapping tiles cannot hide one another.
tamilnadu_tiles <- tamilnadu_data %>%
  group_by(city, pollutant_id) %>%
  summarise(
    pollutant_avg = mean(pollutant_avg, na.rm = TRUE),
    .groups = "drop"
  )

ggplot(tamilnadu_tiles, aes(x = city, y = pollutant_id, fill = pollutant_avg)) +
  geom_tile(color = "white") +
  scale_fill_gradient(
    low = "lightblue",
    high = "darkblue",
    name = "Avg Pollutant"
  ) +
  labs(
    title = "Heatmap of Pollutant Averages by City in Tamil Nadu and Pollutant Type",
    x = "City",
    y = "Pollutant Type"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# NOTE: no install.packages() call here. Installing a package on every knit is
# a side effect and can fail when no CRAN mirror is configured; install
# rmarkdown once from the console instead.
```