library(rvest)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(viridis)
## Loading required package: viridisLite
library(ggrepel)
## Loading required package: ggplot2
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:rvest':
##
## guess_encoding
library(ggthemes)
library(ggplot2)
The historical Kerala rainfall data is read, and the focus is narrowed down to the period from 2000 to 2017.
# Load the historical Kerala rainfall records from disk
df_rain <- read.csv(file = "Kerala-Rainfall-Historical.csv")
# Keep only the rows whose YEAR lies in 2000-2017 (both ends inclusive);
# comma-separated conditions in filter() are combined with AND
df_filtered <- filter(df_rain, YEAR >= 2000, YEAR <= 2017)
# Peek at the first six rows to confirm the filter worked
df_filtered %>% head()
## SUBDIVISION YEAR JAN FEB MAR APR MAY JUN JUL AUG SEP OCT
## 1 Kerala 2000 11.7 57.8 21.5 96.3 124.5 633.8 343.2 566.5 195.8 214.2
## 2 Kerala 2001 16.5 28.3 7.0 238.0 238.6 715.3 598.5 361.3 216.8 319.6
## 3 Kerala 2002 4.7 8.7 35.7 117.3 330.8 503.1 318.7 438.2 99.0 511.7
## 4 Kerala 2003 0.7 50.9 82.1 134.4 91.0 566.7 532.0 350.3 93.6 407.0
## 5 Kerala 2004 2.4 8.1 37.9 113.2 610.9 673.4 385.4 417.9 192.8 320.6
## 6 Kerala 2005 19.8 7.0 25.3 205.9 134.8 619.2 832.7 291.0 414.7 240.1
## NOV DEC ANNUAL JF MAM JJAS OND
## 1 78.1 69.1 2412.6 69.5 242.3 1739.4 361.5
## 2 181.0 10.1 2931.1 44.7 483.7 1892.0 510.7
## 3 137.5 2.1 2507.4 13.3 483.7 1359.0 651.3
## 4 76.4 9.7 2394.9 51.6 307.5 1542.6 493.1
## 5 120.7 2.7 2886.1 10.5 762.0 1669.5 444.0
## 6 184.3 56.4 3031.1 26.8 366.0 2157.6 480.7
# Print the complete filtered data frame (one row per year, 2000-2017)
df_filtered
## SUBDIVISION YEAR JAN FEB MAR APR MAY JUN JUL AUG SEP OCT
## 1 Kerala 2000 11.7 57.8 21.5 96.3 124.5 633.8 343.2 566.5 195.8 214.2
## 2 Kerala 2001 16.5 28.3 7.0 238.0 238.6 715.3 598.5 361.3 216.8 319.6
## 3 Kerala 2002 4.7 8.7 35.7 117.3 330.8 503.1 318.7 438.2 99.0 511.7
## 4 Kerala 2003 0.7 50.9 82.1 134.4 91.0 566.7 532.0 350.3 93.6 407.0
## 5 Kerala 2004 2.4 8.1 37.9 113.2 610.9 673.4 385.4 417.9 192.8 320.6
## 6 Kerala 2005 19.8 7.0 25.3 205.9 134.8 619.2 832.7 291.0 414.7 240.1
## 7 Kerala 2006 8.1 0.5 90.7 65.3 521.2 482.4 804.0 432.6 474.8 376.4
## 8 Kerala 2007 0.5 5.6 7.3 138.5 192.7 705.9 966.3 489.6 526.7 357.2
## 9 Kerala 2008 0.8 30.3 217.2 108.4 81.2 469.9 505.1 349.0 347.0 343.4
## 10 Kerala 2009 3.3 1.5 62.6 69.0 191.6 438.2 924.9 269.3 326.5 205.2
## 11 Kerala 2010 18.6 1.0 31.4 138.9 190.6 667.5 629.0 356.0 275.6 441.4
## 12 Kerala 2011 20.5 45.7 24.1 165.2 124.2 788.5 536.8 492.7 391.2 227.2
## 13 Kerala 2012 7.4 11.0 21.0 171.1 95.3 430.3 362.6 501.6 241.1 187.5
## 14 Kerala 2013 3.9 40.1 49.9 49.3 119.3 1042.7 830.2 369.7 318.6 259.9
## 15 Kerala 2014 4.6 10.3 17.9 95.7 251.0 454.4 677.8 733.9 298.8 355.5
## 16 Kerala 2015 3.1 5.8 50.1 214.1 201.8 563.6 406.0 252.2 292.9 308.1
## 17 Kerala 2016 3.0 16.4 22.4 33.3 258.4 595.7 441.5 231.0 84.1 105.1
## 18 Kerala 2017 12.7 0.3 87.8 52.8 213.3 579.8 378.5 462.6 435.5 228.0
## NOV DEC ANNUAL JF MAM JJAS OND
## 1 78.1 69.1 2412.6 69.5 242.3 1739.4 361.5
## 2 181.0 10.1 2931.1 44.7 483.7 1892.0 510.7
## 3 137.5 2.1 2507.4 13.3 483.7 1359.0 651.3
## 4 76.4 9.7 2394.9 51.6 307.5 1542.6 493.1
## 5 120.7 2.7 2886.1 10.5 762.0 1669.5 444.0
## 6 184.3 56.4 3031.1 26.8 366.0 2157.6 480.7
## 7 162.8 1.8 3420.6 8.6 677.2 2193.8 541.0
## 8 87.4 11.9 3489.6 6.1 338.4 2688.5 456.5
## 9 55.4 17.0 2524.5 31.1 406.7 1670.9 415.7
## 10 274.4 44.2 2810.6 4.8 323.1 1958.9 523.8
## 11 335.1 46.8 3131.8 19.6 360.9 1928.0 823.3
## 12 169.7 49.5 3035.1 66.2 313.5 2209.1 446.3
## 13 112.9 9.4 2151.1 18.3 287.4 1535.6 309.8
## 14 154.9 17.0 3255.4 43.9 218.5 2561.2 431.8
## 15 99.5 47.2 3046.4 14.9 364.5 2164.8 502.1
## 16 223.6 79.4 2600.6 8.9 465.9 1514.7 611.1
## 17 57.9 22.0 1870.9 19.3 314.2 1352.3 185.0
## 18 152.1 61.4 2664.9 13.0 353.9 1856.5 441.5
# Per-column descriptive statistics (min, quartiles, mean, max) for the
# filtered rainfall data
df_filtered %>% summary()
## SUBDIVISION YEAR JAN FEB
## Length:18 Min. :2000 Min. : 0.500 Min. : 0.30
## Class :character 1st Qu.:2004 1st Qu.: 3.025 1st Qu.: 5.65
## Mode :character Median :2008 Median : 4.650 Median : 9.50
## Mean :2008 Mean : 7.906 Mean :18.29
## 3rd Qu.:2013 3rd Qu.:12.450 3rd Qu.:29.80
## Max. :2017 Max. :20.500 Max. :57.80
## MAR APR MAY JUN
## Min. : 7.00 Min. : 33.30 Min. : 81.2 Min. : 430.3
## 1st Qu.: 21.73 1st Qu.: 75.67 1st Qu.:124.3 1st Qu.: 487.6
## Median : 33.55 Median :115.25 Median :192.2 Median : 587.8
## Mean : 49.55 Mean :122.59 Mean :220.6 Mean : 607.2
## 3rd Qu.: 59.48 3rd Qu.:158.62 3rd Qu.:247.9 3rd Qu.: 671.9
## Max. :217.20 Max. :238.00 Max. :610.9 Max. :1042.7
## JUL AUG SEP OCT
## Min. :318.7 Min. :231.0 Min. : 84.1 Min. :105.1
## 1st Qu.:390.6 1st Qu.:349.3 1st Qu.:201.1 1st Qu.:227.4
## Median :534.4 Median :393.8 Median :295.9 Median :313.9
## Mean :581.8 Mean :409.2 Mean :290.3 Mean :300.4
## 3rd Qu.:772.5 3rd Qu.:482.9 3rd Qu.:380.1 3rd Qu.:356.8
## Max. :966.3 Max. :733.9 Max. :526.7 Max. :511.7
## NOV DEC ANNUAL JF
## Min. : 55.40 Min. : 1.80 Min. :1871 Min. : 4.80
## 1st Qu.: 90.42 1st Qu.: 9.80 1st Qu.:2512 1st Qu.:11.12
## Median :144.80 Median :19.50 Median :2848 Median :18.80
## Mean :147.98 Mean :30.98 Mean :2787 Mean :26.17
## 3rd Qu.:178.18 3rd Qu.:48.92 3rd Qu.:3044 3rd Qu.:40.70
## Max. :335.10 Max. :79.40 Max. :3490 Max. :69.50
## MAM JJAS OND
## Min. :218.5 Min. :1352 Min. :185.0
## 1st Qu.:313.7 1st Qu.:1574 1st Qu.:434.2
## Median :357.4 Median :1874 Median :468.6
## Mean :392.7 Mean :1889 Mean :479.4
## 3rd Qu.:451.1 3rd Qu.:2163 3rd Qu.:520.5
## Max. :762.0 Max. :2688 Max. :823.3
# Visualization: annual rainfall bar plot, one bar per year (2000-2017).
# - names.arg labels every bar with its year so the "Year" axis is readable;
#   las = 2 / cex.names keep all 18 labels visible without overlap.
# - No xlim is set: with the default bar width (1) and spacing (0.2) the
#   18 bars span about 21.8 x-units, so the former xlim = c(0, 20) pushed the
#   last bars outside the plot region.
# NOTE(review): annual totals of roughly 1900-3500 are taken to be
# millimetres rather than centimetres - confirm against the data source.
barplot(df_filtered$ANNUAL,
        names.arg = df_filtered$YEAR,
        las = 2,
        cex.names = 0.8,
        main = "Annual Rainfall in Kerala (2000-2017)",
        xlab = "Year",
        ylab = "Rainfall (mm)",
        ylim = c(0, 3500))
The Delhi pollution data is imported, and a summary is checked, along with a count of missing values.
# Load the Delhi air-pollution records from disk
Delhi_Pollution <- read.csv(file = "delhi pollution.csv")
# Preview the top six rows
Delhi_Pollution %>% head()
## Date Month Year Holidays_Count Days PM2.5 PM10 NO2 SO2 CO Ozone AQI
## 1 1 1 2021 0 5 408.80 442.42 160.61 12.95 2.77 43.19 462
## 2 2 1 2021 0 6 404.04 561.95 52.85 5.18 2.60 16.43 482
## 3 3 1 2021 1 7 225.07 239.04 170.95 10.93 1.40 44.29 263
## 4 4 1 2021 0 1 89.55 132.08 153.98 10.42 1.01 49.19 207
## 5 5 1 2021 0 2 54.06 55.54 122.66 9.70 0.64 48.88 149
## 6 6 1 2021 0 3 155.59 180.14 142.71 10.29 1.18 44.47 252
# Check the class/data type of each column.
# vapply() guarantees a character vector (sapply() silently returns a list
# when any column carries more than one class); multi-class columns are
# collapsed into a single "a/b" string so there is exactly one value per
# column. The result has one row per column (row names = column names) and a
# readable "class" header instead of the deparsed call.
cls <- data.frame(
  class = vapply(
    Delhi_Pollution,
    function(col) paste(class(col), collapse = "/"),
    character(1)
  )
)
cls
## sapply(Delhi_Pollution, FUN = class)
## Date integer
## Month integer
## Year integer
## Holidays_Count integer
## Days integer
## PM2.5 numeric
## PM10 numeric
## NO2 numeric
## SO2 numeric
## CO numeric
## Ozone numeric
## AQI integer
# Per-column descriptive statistics (min, quartiles, mean, max) for the
# pollution data
Delhi_Pollution %>% summary()
## Date Month Year Holidays_Count
## Min. : 1.00 Min. : 1.000 Min. :2021 Min. :0.0000
## 1st Qu.: 8.00 1st Qu.: 4.000 1st Qu.:2022 1st Qu.:0.0000
## Median :16.00 Median : 7.000 Median :2023 Median :0.0000
## Mean :15.73 Mean : 6.523 Mean :2023 Mean :0.1896
## 3rd Qu.:23.00 3rd Qu.:10.000 3rd Qu.:2024 3rd Qu.:0.0000
## Max. :31.00 Max. :12.000 Max. :2024 Max. :1.0000
## Days PM2.5 PM10 NO2
## Min. :1.000 Min. : 0.05 Min. : 9.69 Min. : 2.16
## 1st Qu.:2.000 1st Qu.: 41.28 1st Qu.: 115.11 1st Qu.: 17.28
## Median :4.000 Median : 72.06 Median : 199.80 Median : 30.49
## Mean :4.001 Mean : 90.77 Mean : 218.22 Mean : 37.18
## 3rd Qu.:6.000 3rd Qu.: 118.50 3rd Qu.: 297.75 3rd Qu.: 45.01
## Max. :7.000 Max. :1000.00 Max. :1000.00 Max. :433.98
## SO2 CO Ozone AQI
## Min. : 1.21 Min. :0.270 Min. : 2.70 Min. : 19.0
## 1st Qu.: 7.71 1st Qu.:0.610 1st Qu.: 24.10 1st Qu.:108.0
## Median : 15.43 Median :0.850 Median : 32.47 Median :189.0
## Mean : 20.10 Mean :1.026 Mean : 36.34 Mean :202.2
## 3rd Qu.: 26.62 3rd Qu.:1.240 3rd Qu.: 45.73 3rd Qu.:284.0
## Max. :113.40 Max. :4.700 Max. :115.87 Max. :500.0
# Data check: total number of NA cells across the whole data frame
Delhi_Pollution %>%
  is.na() %>%
  sum()
## [1] 0
# Data manipulation: keep only the pollution rows recorded in 2023
df_2023 <- Delhi_Pollution[Delhi_Pollution$Year == 2023, ]
# Visualization: side-by-side box plots of PM 2.5 and PM 10 for 2023.
# Fixes the axis-label typo ("Pollutents") and the stray leading space in the
# y-axis label.
# NOTE(review): ylim is capped at 550 while the full data set reaches 1000 for
# both pollutants; any 2023 outliers above 550 would not be drawn - confirm
# the cap is intentional.
boxplot(df_2023$PM2.5, df_2023$PM10,
        main = "Comparison of PM-2.5 and PM-10 in 2023",
        names = c("PM 2.5", "PM 10"),
        xlab = "Pollutants",
        ylab = "Concentration (µg/m³)",
        xlim = c(0, 3),
        ylim = c(0, 550))
The population data is imported from “populaion.csv”, and its structure is inspected.
# Import the population data
populaion <- read.csv("populaion.csv")
# Open the spreadsheet-style viewer only in an interactive session; View()
# needs a GUI/display and can fail or do nothing when the script is run
# non-interactively (Rscript, knitting).
if (interactive()) {
  View(populaion)
}
# Display the first few rows of the data
head(populaion)
## Year Population X..Increase.in.Population Population.Density
## 1 1950 357021100 0.00% 108.61
## 2 1951 36,49,22,360 2.21% 111.01
## 3 1952 37,29,97,188 2.21% 113.47
## 4 1953 38,12,27,705 2.21% 115.97
## 5 1954 38,97,31,406 2.23% 118.56
## 6 1955 39,85,77,992 2.27% 121.25
## X..Increase.in.Population.Density Urban.Population
## 1 0.00% Null
## 2 2.21% Null
## 3 2.21% Null
## 4 2.21% Null
## 5 2.23% Null
## 6 2.27% Null
## Urban.Population...of.Total.Population X..Increase.in.Urban.Population
## 1 Null Null
## 2 Null Null
## 3 Null Null
## 4 Null Null
## 5 Null Null
## 6 Null Null
## Rural.Population Rural.Population...of.Total.Population
## 1 Null Null
## 2 Null Null
## 3 Null Null
## 4 Null Null
## 5 Null Null
## 6 Null Null
## X..Increase.in.Rural.Population Life.Expectancy
## 1 Null 35.21
## 2 Null 35.80
## 3 Null 36.39
## 4 Null 36.98
## 5 Null 37.57
## 6 Null 38.16
## X..Increase.in.Life.Expectancy Birth.Rate X..Change.in.Birth.Rate Death.Rate
## 1 0.00% 44.175 0.00% 28.161
## 2 1.68% 43.970 -0.46% 27.584
## 3 1.65% 43.764 -0.47% 27.008
## 4 1.62% 43.558 -0.47% 26.432
## 5 1.60% 43.352 -0.47% 25.856
## 6 1.57% 43.146 -0.48% 25.280
## X..Change.in.Death.Rate Infant.Mortality.Rate
## 1 0.00% 189.629
## 2 -2.05% 186.737
## 3 -2.09% 183.846
## 4 -2.13% 180.954
## 5 -2.18% 178.062
## 6 -2.23% 175.171
## X..Change.in.Infant.Mortality.Rate Fertility.Rate X..Change.in.Fertility.Rate
## 1 0.00% 5.907 0.00%
## 2 -1.53% 5.906 -0.02%
## 3 -1.55% 5.904 -0.03%
## 4 -1.57% 5.903 -0.02%
## 5 -1.60% 5.902 -0.02%
## 6 -1.62% 5.900 -0.03%
## Net.Migration.Rate X..Change.in.Net.Migration.Rate
## 1 -0.043 0.00%
## 2 -0.047 9.30%
## 3 -0.050 6.38%
## 4 -0.054 8.00%
## 5 -0.058 7.41%
## 6 -0.061 5.17%
# Check the class/data type of each column.
# vapply() guarantees a character vector (sapply() silently returns a list
# when any column carries more than one class); multi-class columns are
# collapsed into a single "a/b" string so there is exactly one value per
# column. The result has one row per column (row names = column names) and a
# readable "class" header instead of the deparsed call.
cls <- data.frame(
  class = vapply(
    populaion,
    function(col) paste(class(col), collapse = "/"),
    character(1)
  )
)
cls
## sapply(populaion, FUN = class)
## Year integer
## Population character
## X..Increase.in.Population character
## Population.Density numeric
## X..Increase.in.Population.Density character
## Urban.Population character
## Urban.Population...of.Total.Population character
## X..Increase.in.Urban.Population character
## Rural.Population character
## Rural.Population...of.Total.Population character
## X..Increase.in.Rural.Population character
## Life.Expectancy numeric
## X..Increase.in.Life.Expectancy character
## Birth.Rate numeric
## X..Change.in.Birth.Rate character
## Death.Rate numeric
## X..Change.in.Death.Rate character
## Infant.Mortality.Rate numeric
## X..Change.in.Infant.Mortality.Rate character
## Fertility.Rate numeric
## X..Change.in.Fertility.Rate character
## Net.Migration.Rate numeric
## X..Change.in.Net.Migration.Rate character
# Per-column descriptive statistics; character columns only report
# length/class/mode
populaion %>% summary()
## Year Population X..Increase.in.Population Population.Density
## Min. :1950 Length:73 Length:73 Min. :108.6
## 1st Qu.:1968 Class :character Class :character 1st Qu.:162.3
## Median :1986 Mode :character Mode :character Median :242.7
## Mean :1986 Mean :254.8
## 3rd Qu.:2004 3rd Qu.:345.7
## Max. :2022 Max. :431.1
## X..Increase.in.Population.Density Urban.Population
## Length:73 Length:73
## Class :character Class :character
## Mode :character Mode :character
##
##
##
## Urban.Population...of.Total.Population X..Increase.in.Urban.Population
## Length:73 Length:73
## Class :character Class :character
## Mode :character Mode :character
##
##
##
## Rural.Population Rural.Population...of.Total.Population
## Length:73 Length:73
## Class :character Class :character
## Mode :character Mode :character
##
##
##
## X..Increase.in.Rural.Population Life.Expectancy X..Increase.in.Life.Expectancy
## Length:73 Min. :35.21 Length:73
## Class :character 1st Qu.:46.10 Class :character
## Mode :character Median :55.98 Mode :character
## Mean :54.86
## 3rd Qu.:63.91
## Max. :70.19
## Birth.Rate X..Change.in.Birth.Rate Death.Rate
## Min. :17.16 Length:73 Min. : 7.237
## 1st Qu.:24.75 Class :character 1st Qu.: 8.261
## Median :34.02 Mode :character Median :12.008
## Mean :32.18 Mean :13.869
## 3rd Qu.:39.77 3rd Qu.:18.368
## Max. :44.17 Max. :28.161
## X..Change.in.Death.Rate Infant.Mortality.Rate
## Length:73 Min. : 27.70
## Class :character 1st Qu.: 57.85
## Mode :character Median : 98.21
## Mean :101.89
## 3rd Qu.:145.81
## Max. :189.63
## X..Change.in.Infant.Mortality.Rate Fertility.Rate X..Change.in.Fertility.Rate
## Length:73 Min. :2.159 Length:73
## Class :character 1st Qu.:3.071 Class :character
## Mode :character Median :4.432 Mode :character
## Mean :4.292
## 3rd Qu.:5.723
## Max. :5.907
## Net.Migration.Rate X..Change.in.Net.Migration.Rate
## Min. :-0.44600 Length:73
## 1st Qu.:-0.34300 Class :character
## Median :-0.06800 Mode :character
## Mean :-0.06993
## 3rd Qu.: 0.03900
## Max. : 0.71500
# Keep the years 2010-2022 (inclusive) and only the Year/Population columns
# in a new data frame called 'df_filtered_pop'
df_filtered_pop <- populaion %>%
  filter(Year >= 2010 & Year <= 2022) %>%
  select(Year, Population)
# Population was read as character, and some rows of the file use comma digit
# grouping (e.g. "36,49,22,360"); strip the commas first so as.numeric() does
# not silently turn those values into NA.
df_filtered_pop$Population <- as.numeric(
  gsub(",", "", df_filtered_pop$Population, fixed = TRUE)
)
df_filtered_pop
## Year Population
## 1 2010 1240613620
## 2 2011 1257621191
## 3 2012 1274487215
## 4 2013 1291132063
## 5 2014 1307246509
## 6 2015 1322866505
## 7 2016 1338636340
## 8 2017 1354195680
## 9 2018 1369003306
## 10 2019 1383112050
## 11 2020 1396387127
## 12 2021 1407563842
## 13 2022 1417173173
# Visualization: population trend as a line chart.
# df_filtered_pop only holds the years 2010-2022, so the title states that
# range (it previously claimed 2000-2022).
ggplot(data = df_filtered_pop, aes(x = Year, y = Population)) +
  geom_line(color = "darkblue", linewidth = 1.2) +
  labs(
    title = "Population Trend (2010-2022)",
    x = "Year",
    y = "Total Population"
  )