library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.2.3
library(maps)
## Warning: package 'maps' was built under R version 4.2.3
##
## Attaching package: 'maps'
##
## The following object is masked from 'package:purrr':
##
## map
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(DT)
## Warning: package 'DT' was built under R version 4.2.3
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.2.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.2.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(mice)
## Warning: package 'mice' was built under R version 4.2.3
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(forcats)
crop <- read_csv("punjab_fertilizer.csv")
## Rows: 416 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Year, Province, Division, District
## dbl (14): _id, Usage (in 1000 nutirient tons), Area Sown Total (Wheat), Area...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(crop)
## [1] 416 18
head(crop)
## # A tibble: 6 × 18
## `_id` Year Province Division District Usage (in 1000 nutiri…¹
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 1 2002-03 Punjab Bahawalpur Divn. Bahawalpur 114
## 2 2 2002-03 Punjab Bahawalpur Divn. Bahawalnagar 93
## 3 3 2002-03 Punjab Bahawalpur Divn. Rahim Yar Khan 168
## 4 4 2002-03 Punjab D.G.Khan Divn. D.G. Khan 48
## 5 5 2002-03 Punjab D.G.Khan Divn. Layyah 48
## 6 6 2002-03 Punjab D.G.Khan Divn. Muzaffargarh 87
## # ℹ abbreviated name: ¹`Usage (in 1000 nutirient tons)`
## # ℹ 12 more variables: `Area Sown Total (Wheat)` <dbl>,
## # `Area Sown (Rice)` <dbl>, `Area Sown (Cotton)` <dbl>,
## # `Area Sown (Sugarcane)` <dbl>, `Wheat Production` <dbl>,
## # `Rice Production` <dbl>, `Cotton Production` <dbl>,
## # `Sugarcane Production` <dbl>, `Output/acre wheat` <dbl>,
## # `Output/Acre (Rice)` <dbl>, `Output/Acre (Cotton)` <dbl>, …
colnames(crop)
## [1] "_id" "Year"
## [3] "Province" "Division"
## [5] "District" "Usage (in 1000 nutirient tons)"
## [7] "Area Sown Total (Wheat)" "Area Sown (Rice)"
## [9] "Area Sown (Cotton)" "Area Sown (Sugarcane)"
## [11] "Wheat Production" "Rice Production"
## [13] "Cotton Production" "Sugarcane Production"
## [15] "Output/acre wheat" "Output/Acre (Rice)"
## [17] "Output/Acre (Cotton)" "Output/Acre (Sugarcane)"
crop <- clean_names(crop)
md.pattern(crop, rotate.names = T)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## id year province division district usage_in_1000_nutirient_tons
## 416 1 1 1 1 1 1
## 0 0 0 0 0 0
## area_sown_total_wheat area_sown_rice area_sown_cotton area_sown_sugarcane
## 416 1 1 1 1
## 0 0 0 0
## wheat_production rice_production cotton_production sugarcane_production
## 416 1 1 1 1
## 0 0 0 0
## output_acre_wheat output_acre_rice output_acre_cotton output_acre_sugarcane
## 416 1 1 1 1
## 0 0 0 0
##
## 416 0
## 0
colSums(is.na(crop))
## id year
## 0 0
## province division
## 0 0
## district usage_in_1000_nutirient_tons
## 0 0
## area_sown_total_wheat area_sown_rice
## 0 0
## area_sown_cotton area_sown_sugarcane
## 0 0
## wheat_production rice_production
## 0 0
## cotton_production sugarcane_production
## 0 0
## output_acre_wheat output_acre_rice
## 0 0
## output_acre_cotton output_acre_sugarcane
## 0 0
glimpse(crop)
## Rows: 416
## Columns: 18
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13…
## $ year <chr> "2002-03", "2002-03", "2002-03", "2002-03…
## $ province <chr> "Punjab", "Punjab", "Punjab", "Punjab", "…
## $ division <chr> "Bahawalpur Divn.", "Bahawalpur Divn.", "…
## $ district <chr> "Bahawalpur", "Bahawalnagar", "Rahim Yar …
## $ usage_in_1000_nutirient_tons <dbl> 114, 93, 168, 48, 48, 87, 54, 91, 99, 49,…
## $ area_sown_total_wheat <dbl> 269, 282, 303, 156, 178, 278, 135, 254, 3…
## $ area_sown_rice <dbl> 5, 41, 13, 27, 1, 20, 5, 22, 79, 22, 231,…
## $ area_sown_cotton <dbl> 268, 178, 307, 93, 29, 189, 148, 32, 45, …
## $ area_sown_sugarcane <dbl> 9, 27, 43, 3, 16, 25, 5, 117, 92, 46, 3, …
## $ wheat_production <dbl> 724, 695, 740, 407, 379, 657, 336, 716, 9…
## $ rice_production <dbl> 7, 65, 19, 53, 1, 32, 7, 33, 118, 43, 436…
## $ cotton_production <dbl> 968, 544, 1076, 411, 63, 599, 651, 67, 11…
## $ sugarcane_production <dbl> 501, 1155, 2620, 142, 674, 1394, 258, 541…
## $ output_acre_wheat <dbl> 2.691450, 2.464539, 2.442244, 2.608974, 2…
## $ output_acre_rice <dbl> 1.400000, 1.585366, 1.461538, 1.962963, 1…
## $ output_acre_cotton <dbl> 3.611940, 3.056180, 3.504886, 4.419355, 2…
## $ output_acre_sugarcane <dbl> 55.66667, 42.77778, 60.93023, 47.33333, 4…
table(crop$province)
##
## Punjab
## 416
colnames(crop)
## [1] "id" "year"
## [3] "province" "division"
## [5] "district" "usage_in_1000_nutirient_tons"
## [7] "area_sown_total_wheat" "area_sown_rice"
## [9] "area_sown_cotton" "area_sown_sugarcane"
## [11] "wheat_production" "rice_production"
## [13] "cotton_production" "sugarcane_production"
## [15] "output_acre_wheat" "output_acre_rice"
## [17] "output_acre_cotton" "output_acre_sugarcane"
# #crop$y <- as.Date(as.yearmon(crop$year))
# crop$t <- as.Date(crop$year, format = "%Y-%d")
# crop$yr <- as.Date(crop$yr, format = "%Y")
# crop <- crop %>%
# mutate(weekend = ifelse(wday(
# crop$t) %in% c(1,7), "weekend", "weekday"))
#
# #crop$c <-as.POSIXct(crop$year, format = "%Y-%m")
# #crop[c("yr", "month")] <- str_split_fixed(crop$year, "-", 2)
#
#
# #to find missin dates
# which(is.na(crop$t))
head(crop)
## # A tibble: 6 × 18
## id year province division district usage_in_1000_nutirie…¹
## <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 1 2002-03 Punjab Bahawalpur Divn. Bahawalpur 114
## 2 2 2002-03 Punjab Bahawalpur Divn. Bahawalnagar 93
## 3 3 2002-03 Punjab Bahawalpur Divn. Rahim Yar Khan 168
## 4 4 2002-03 Punjab D.G.Khan Divn. D.G. Khan 48
## 5 5 2002-03 Punjab D.G.Khan Divn. Layyah 48
## 6 6 2002-03 Punjab D.G.Khan Divn. Muzaffargarh 87
## # ℹ abbreviated name: ¹usage_in_1000_nutirient_tons
## # ℹ 12 more variables: area_sown_total_wheat <dbl>, area_sown_rice <dbl>,
## # area_sown_cotton <dbl>, area_sown_sugarcane <dbl>, wheat_production <dbl>,
## # rice_production <dbl>, cotton_production <dbl>, sugarcane_production <dbl>,
## # output_acre_wheat <dbl>, output_acre_rice <dbl>, output_acre_cotton <dbl>,
## # output_acre_sugarcane <dbl>
# head(crop$t, 416)
crop$year <- as.factor(crop$year)
table(crop$year)
##
## 2002-03 2003-04 2004-05 2005-06 2007-08 2008-09 2009-10 2010-11 2011-12 2012-13
## 33 33 33 33 34 35 35 36 36 36
## 2013-14 2014-15
## 36 36
No missing values found.
#EDA
introduce(crop)
## # A tibble: 1 × 9
## rows columns discrete_columns continuous_columns all_missing_columns
## <int> <int> <int> <int> <int>
## 1 416 18 4 14 0
## # ℹ 4 more variables: total_missing_values <int>, complete_rows <int>,
## # total_observations <int>, memory_usage <dbl>
plot_intro(crop)
plot_missing(crop)
head(crop)
## # A tibble: 6 × 18
## id year province division district usage_in_1000_nutirie…¹
## <dbl> <fct> <chr> <chr> <chr> <dbl>
## 1 1 2002-03 Punjab Bahawalpur Divn. Bahawalpur 114
## 2 2 2002-03 Punjab Bahawalpur Divn. Bahawalnagar 93
## 3 3 2002-03 Punjab Bahawalpur Divn. Rahim Yar Khan 168
## 4 4 2002-03 Punjab D.G.Khan Divn. D.G. Khan 48
## 5 5 2002-03 Punjab D.G.Khan Divn. Layyah 48
## 6 6 2002-03 Punjab D.G.Khan Divn. Muzaffargarh 87
## # ℹ abbreviated name: ¹usage_in_1000_nutirient_tons
## # ℹ 12 more variables: area_sown_total_wheat <dbl>, area_sown_rice <dbl>,
## # area_sown_cotton <dbl>, area_sown_sugarcane <dbl>, wheat_production <dbl>,
## # rice_production <dbl>, cotton_production <dbl>, sugarcane_production <dbl>,
## # output_acre_wheat <dbl>, output_acre_rice <dbl>, output_acre_cotton <dbl>,
## # output_acre_sugarcane <dbl>
colnames(crop)
## [1] "id" "year"
## [3] "province" "division"
## [5] "district" "usage_in_1000_nutirient_tons"
## [7] "area_sown_total_wheat" "area_sown_rice"
## [9] "area_sown_cotton" "area_sown_sugarcane"
## [11] "wheat_production" "rice_production"
## [13] "cotton_production" "sugarcane_production"
## [15] "output_acre_wheat" "output_acre_rice"
## [17] "output_acre_cotton" "output_acre_sugarcane"
We have wheat, rice, cotton, sugarcane area sown We have wheat, rice, cotton, sugarcane production We have wheat, rice, cotton, sugarcane output acrea
#Wheat
options(repr.plot.width=5, repr.plot.height=8)
wheat <- crop %>% select(area_sown_total_wheat,wheat_production,division, output_acre_wheat) %>% group_by(division) %>%
summarise(mean_area_sown = mean(area_sown_total_wheat),
mean_production = mean(wheat_production),
mean_output = mean(output_acre_wheat))
wheat
## # A tibble: 9 × 4
## division mean_area_sown mean_production mean_output
## <chr> <dbl> <dbl> <dbl>
## 1 Bahawalpur Divn. 305. 870. 2.85
## 2 D.G.Khan Divn. 204. 525. 2.56
## 3 Faisalabad Divn. 240. 700. 2.94
## 4 Gujranwala Divn. 166. 426. 2.51
## 5 Lahore Divn. 158. 463. 2.90
## 6 Multan Divn. 205. 601. 2.94
## 7 Rawalpindi Divn. 91.3 120. 1.46
## 8 Sahiwal Divn. 168. 565. 3.33
## 9 Sargodha Divn. 154. 353. 2.23
w_sown <- ggplot(wheat, mapping = aes( x = fct_rev(fct_reorder(division, mean_area_sown)), y = mean_area_sown)) +
geom_col() +
theme_minimal() +
#coord_flip() +
theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
labs(
title = "Average area sown for Wheat in each division",
x = "Divisions",
y="Mean area sown"
)
w_prod <- ggplot(wheat, mapping = aes( x = fct_rev(fct_reorder(division, mean_production)), y = mean_production)) +
geom_col() +
theme_minimal() +
#coord_flip() +
theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
labs(
title = "Average Wheat production in each division",
x = "Divisions",
y="Mean production"
)
w_output <- ggplot(wheat, mapping = aes( x = fct_rev(fct_reorder(division, mean_output)), y = mean_output)) +
geom_col() +
theme_minimal() +
#coord_flip() +
theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
labs(
title = "Average Wheat Output in each division",
x = "Divisions",
y="Mean output"
)
grid.arrange(w_sown, w_prod,w_output, ncol = 3)
#Cotton
options(repr.plot.width=5, repr.plot.height=8)
cotton <- crop %>% select(area_sown_cotton,cotton_production,division, output_acre_cotton) %>% group_by(division) %>%
summarise(mean_area_sown = mean(area_sown_cotton),
mean_production = mean(cotton_production),
mean_output = mean(output_acre_cotton))
cotton
## # A tibble: 9 × 4
## division mean_area_sown mean_production mean_output
## <chr> <dbl> <dbl> <dbl>
## 1 Bahawalpur Divn. 255. 1062. 4.17
## 2 D.G.Khan Divn. 111. 422. 3.66
## 3 Faisalabad Divn. 42.5 119. 2.74
## 4 Gujranwala Divn. 0.306 0.444 0.229
## 5 Lahore Divn. 4.12 8.76 1.00
## 6 Multan Divn. 175. 709. 3.87
## 7 Rawalpindi Divn. 0.0708 0.219 0.990
## 8 Sahiwal Divn. 53.4 212. 3.97
## 9 Sargodha Divn. 11.6 31.9 2.46
c_sown <- ggplot(cotton, mapping = aes( x = fct_rev(fct_reorder(division, mean_area_sown)), y = mean_area_sown)) +
geom_col() +
theme_minimal() +
#coord_flip() +
theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
labs(
title = "Average area sown for Cotton in each division",
x = "Divisions",
y="Mean area sown"
)
c_prod <- ggplot(cotton, mapping = aes( x = fct_rev(fct_reorder(division, mean_production)), y = mean_production)) +
geom_col() +
theme_minimal() +
#coord_flip() +
theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
labs(
title = "Average Cotton production in each division",
x = "Divisions",
y="Mean production"
)
c_output <- ggplot(cotton, mapping = aes( x = fct_rev(fct_reorder(division, mean_output)), y = mean_output)) +
geom_col() +
theme_minimal() +
#coord_flip() +
theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
labs(
title = "Average Cotton Output in each division",
x = "Divisions",
y="Mean output"
)
grid.arrange(c_sown, c_prod,c_output, ncol = 3)
#RICE
rice <- crop %>% select(area_sown_rice,rice_production,division, output_acre_rice) %>% group_by(division) %>%
summarise(mean_area_sown = mean(area_sown_rice),
mean_production = mean(rice_production),
mean_output = mean(output_acre_rice))
rice
## # A tibble: 9 × 4
## division mean_area_sown mean_production mean_output
## <chr> <dbl> <dbl> <dbl>
## 1 Bahawalpur Divn. 27.4 50.2 1.71
## 2 D.G.Khan Divn. 16.2 32.5 1.88
## 3 Faisalabad Divn. 44.4 75.5 1.71
## 4 Gujranwala Divn. 124. 232. 1.81
## 5 Lahore Divn. 108. 195. 1.83
## 6 Multan Divn. 20.8 36.5 1.71
## 7 Rawalpindi Divn. 0.312 0.521 0.438
## 8 Sahiwal Divn. 65.8 151. 2.19
## 9 Sargodha Divn. 15.8 26.1 1.68
r_sown <- ggplot(rice, mapping = aes( x = fct_rev(fct_reorder(division, mean_area_sown)), y = mean_area_sown)) +
geom_col() +
theme_minimal() +
#coord_flip() +
theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
labs(
title = "Average area sown for Rice in each division",
x = "Divisions",
y="Mean area sown"
)
r_prod <- ggplot(rice, mapping = aes( x = fct_rev(fct_reorder(division, mean_production)), y = mean_production)) +
geom_col() +
theme_minimal() +
#coord_flip() +
theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
labs(
title = "Average Rice production in each division",
x = "Divisions",
y="Mean production"
)
r_output <- ggplot(rice, mapping = aes( x = fct_rev(fct_reorder(division, mean_output)), y = mean_output)) +
geom_col() +
theme_minimal() +
#coord_flip() +
theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
labs(
title = "Average Rice Output in each division",
x = "Divisions",
y="Mean output"
)
grid.arrange(r_sown, r_prod, r_output, ncol = 3)
#Sugarcane
sugarcane <- crop %>% select(area_sown_sugarcane,sugarcane_production,division, output_acre_sugarcane) %>% group_by(division) %>%
summarise(mean_area_sown = mean(area_sown_sugarcane),
mean_production = mean(sugarcane_production),
mean_output = mean(output_acre_sugarcane))
sugarcane
## # A tibble: 9 × 4
## division mean_area_sown mean_production mean_output
## <chr> <dbl> <dbl> <dbl>
## 1 Bahawalpur Divn. 38.6 2553. 60.0
## 2 D.G.Khan Divn. 16.4 931. 58.5
## 3 Faisalabad Divn. 69.0 3582. 53.3
## 4 Gujranwala Divn. 7.76 340. 40.6
## 5 Lahore Divn. 18.2 862. 52.0
## 6 Multan Divn. 8.98 474. 51.4
## 7 Rawalpindi Divn. 0.0917 3.83 19.2
## 8 Sahiwal Divn. 10.2 477. 48.1
## 9 Sargodha Divn. 23.2 1099. 46.9
s_sown <- ggplot(sugarcane, mapping = aes( x = fct_rev(fct_reorder(division, mean_area_sown)), y = mean_area_sown)) +
geom_col() +
theme_minimal() +
#coord_flip() +
theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
labs(
title = "Average area sown for Sugarcane in each division",
x = "Divisions",
y="Mean area sown"
)
s_prod <- ggplot(sugarcane, mapping = aes( x = fct_rev(fct_reorder(division, mean_production)), y = mean_production)) +
geom_col() +
theme_minimal() +
#coord_flip() +
theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
labs(
title = "Average Sugarcane production in each division",
x = "Divisions",
y="Mean production"
)
s_output <- ggplot(sugarcane, mapping = aes( x = fct_rev(fct_reorder(division, mean_output)), y = mean_output)) +
geom_col() +
theme_minimal() +
#coord_flip() +
theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
labs(
title = "Average Sugarcane Output in each division",
x = "Divisions",
y="Mean output"
)
grid.arrange(s_sown, s_prod, s_output, ncol = 3)