library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(magrittr)
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.2.3
library(maps)
## Warning: package 'maps' was built under R version 4.2.3
## 
## Attaching package: 'maps'
## 
## The following object is masked from 'package:purrr':
## 
##     map
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(DT)
## Warning: package 'DT' was built under R version 4.2.3
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.2.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.2.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(mice)
## Warning: package 'mice' was built under R version 4.2.3
## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(forcats)

crop <- read_csv("punjab_fertilizer.csv")
## Rows: 416 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Year, Province, Division, District
## dbl (14): _id, Usage (in 1000 nutirient tons), Area Sown Total (Wheat), Area...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(crop)
## [1] 416  18
head(crop)
## # A tibble: 6 × 18
##   `_id` Year    Province Division         District       Usage (in 1000 nutiri…¹
##   <dbl> <chr>   <chr>    <chr>            <chr>                            <dbl>
## 1     1 2002-03 Punjab   Bahawalpur Divn. Bahawalpur                         114
## 2     2 2002-03 Punjab   Bahawalpur Divn. Bahawalnagar                        93
## 3     3 2002-03 Punjab   Bahawalpur Divn. Rahim Yar Khan                     168
## 4     4 2002-03 Punjab   D.G.Khan Divn.   D.G. Khan                           48
## 5     5 2002-03 Punjab   D.G.Khan Divn.   Layyah                              48
## 6     6 2002-03 Punjab   D.G.Khan Divn.   Muzaffargarh                        87
## # ℹ abbreviated name: ¹​`Usage (in 1000 nutirient tons)`
## # ℹ 12 more variables: `Area Sown Total (Wheat)` <dbl>,
## #   `Area Sown (Rice)` <dbl>, `Area Sown (Cotton)` <dbl>,
## #   `Area Sown (Sugarcane)` <dbl>, `Wheat Production` <dbl>,
## #   `Rice Production` <dbl>, `Cotton Production` <dbl>,
## #   `Sugarcane Production` <dbl>, `Output/acre wheat` <dbl>,
## #   `Output/Acre (Rice)` <dbl>, `Output/Acre (Cotton)` <dbl>, …
colnames(crop)
##  [1] "_id"                            "Year"                          
##  [3] "Province"                       "Division"                      
##  [5] "District"                       "Usage (in 1000 nutirient tons)"
##  [7] "Area Sown Total (Wheat)"        "Area Sown (Rice)"              
##  [9] "Area Sown (Cotton)"             "Area Sown (Sugarcane)"         
## [11] "Wheat Production"               "Rice Production"               
## [13] "Cotton Production"              "Sugarcane Production"          
## [15] "Output/acre wheat"              "Output/Acre (Rice)"            
## [17] "Output/Acre (Cotton)"           "Output/Acre (Sugarcane)"
crop <- clean_names(crop)
md.pattern(crop, rotate.names = T)
##  /\     /\
## {  `---'  }
## {  O   O  }
## ==>  V <==  No need for mice. This data set is completely observed.
##  \  \|/  /
##   `-----'

##     id year province division district usage_in_1000_nutirient_tons
## 416  1    1        1        1        1                            1
##      0    0        0        0        0                            0
##     area_sown_total_wheat area_sown_rice area_sown_cotton area_sown_sugarcane
## 416                     1              1                1                   1
##                         0              0                0                   0
##     wheat_production rice_production cotton_production sugarcane_production
## 416                1               1                 1                    1
##                    0               0                 0                    0
##     output_acre_wheat output_acre_rice output_acre_cotton output_acre_sugarcane
## 416                 1                1                  1                     1
##                     0                0                  0                     0
##      
## 416 0
##     0
colSums(is.na(crop))
##                           id                         year 
##                            0                            0 
##                     province                     division 
##                            0                            0 
##                     district usage_in_1000_nutirient_tons 
##                            0                            0 
##        area_sown_total_wheat               area_sown_rice 
##                            0                            0 
##             area_sown_cotton          area_sown_sugarcane 
##                            0                            0 
##             wheat_production              rice_production 
##                            0                            0 
##            cotton_production         sugarcane_production 
##                            0                            0 
##            output_acre_wheat             output_acre_rice 
##                            0                            0 
##           output_acre_cotton        output_acre_sugarcane 
##                            0                            0
glimpse(crop)
## Rows: 416
## Columns: 18
## $ id                           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13…
## $ year                         <chr> "2002-03", "2002-03", "2002-03", "2002-03…
## $ province                     <chr> "Punjab", "Punjab", "Punjab", "Punjab", "…
## $ division                     <chr> "Bahawalpur Divn.", "Bahawalpur Divn.", "…
## $ district                     <chr> "Bahawalpur", "Bahawalnagar", "Rahim Yar …
## $ usage_in_1000_nutirient_tons <dbl> 114, 93, 168, 48, 48, 87, 54, 91, 99, 49,…
## $ area_sown_total_wheat        <dbl> 269, 282, 303, 156, 178, 278, 135, 254, 3…
## $ area_sown_rice               <dbl> 5, 41, 13, 27, 1, 20, 5, 22, 79, 22, 231,…
## $ area_sown_cotton             <dbl> 268, 178, 307, 93, 29, 189, 148, 32, 45, …
## $ area_sown_sugarcane          <dbl> 9, 27, 43, 3, 16, 25, 5, 117, 92, 46, 3, …
## $ wheat_production             <dbl> 724, 695, 740, 407, 379, 657, 336, 716, 9…
## $ rice_production              <dbl> 7, 65, 19, 53, 1, 32, 7, 33, 118, 43, 436…
## $ cotton_production            <dbl> 968, 544, 1076, 411, 63, 599, 651, 67, 11…
## $ sugarcane_production         <dbl> 501, 1155, 2620, 142, 674, 1394, 258, 541…
## $ output_acre_wheat            <dbl> 2.691450, 2.464539, 2.442244, 2.608974, 2…
## $ output_acre_rice             <dbl> 1.400000, 1.585366, 1.461538, 1.962963, 1…
## $ output_acre_cotton           <dbl> 3.611940, 3.056180, 3.504886, 4.419355, 2…
## $ output_acre_sugarcane        <dbl> 55.66667, 42.77778, 60.93023, 47.33333, 4…
table(crop$province)
## 
## Punjab 
##    416
colnames(crop)
##  [1] "id"                           "year"                        
##  [3] "province"                     "division"                    
##  [5] "district"                     "usage_in_1000_nutirient_tons"
##  [7] "area_sown_total_wheat"        "area_sown_rice"              
##  [9] "area_sown_cotton"             "area_sown_sugarcane"         
## [11] "wheat_production"             "rice_production"             
## [13] "cotton_production"            "sugarcane_production"        
## [15] "output_acre_wheat"            "output_acre_rice"            
## [17] "output_acre_cotton"           "output_acre_sugarcane"
# #crop$y <- as.Date(as.yearmon(crop$year))
# crop$t <- as.Date(crop$year, format = "%Y-%d")
# crop$yr <- as.Date(crop$yr, format = "%Y")
# crop <- crop %>% 
#   mutate(weekend = ifelse(wday(
# crop$t) %in% c(1,7), "weekend", "weekday"))
# 
# #crop$c <-as.POSIXct(crop$year, format = "%Y-%m")
# #crop[c("yr", "month")] <- str_split_fixed(crop$year, "-", 2)
# 
# 
# #to find missin dates
# which(is.na(crop$t))
head(crop)
## # A tibble: 6 × 18
##      id year    province division         district       usage_in_1000_nutirie…¹
##   <dbl> <chr>   <chr>    <chr>            <chr>                            <dbl>
## 1     1 2002-03 Punjab   Bahawalpur Divn. Bahawalpur                         114
## 2     2 2002-03 Punjab   Bahawalpur Divn. Bahawalnagar                        93
## 3     3 2002-03 Punjab   Bahawalpur Divn. Rahim Yar Khan                     168
## 4     4 2002-03 Punjab   D.G.Khan Divn.   D.G. Khan                           48
## 5     5 2002-03 Punjab   D.G.Khan Divn.   Layyah                              48
## 6     6 2002-03 Punjab   D.G.Khan Divn.   Muzaffargarh                        87
## # ℹ abbreviated name: ¹​usage_in_1000_nutirient_tons
## # ℹ 12 more variables: area_sown_total_wheat <dbl>, area_sown_rice <dbl>,
## #   area_sown_cotton <dbl>, area_sown_sugarcane <dbl>, wheat_production <dbl>,
## #   rice_production <dbl>, cotton_production <dbl>, sugarcane_production <dbl>,
## #   output_acre_wheat <dbl>, output_acre_rice <dbl>, output_acre_cotton <dbl>,
## #   output_acre_sugarcane <dbl>
# head(crop$t, 416)


crop$year <- as.factor(crop$year)
table(crop$year)
## 
## 2002-03 2003-04 2004-05 2005-06 2007-08 2008-09 2009-10 2010-11 2011-12 2012-13 
##      33      33      33      33      34      35      35      36      36      36 
## 2013-14 2014-15 
##      36      36

No missing values found.

#EDA

introduce(crop)
## # A tibble: 1 × 9
##    rows columns discrete_columns continuous_columns all_missing_columns
##   <int>   <int>            <int>              <int>               <int>
## 1   416      18                4                 14                   0
## # ℹ 4 more variables: total_missing_values <int>, complete_rows <int>,
## #   total_observations <int>, memory_usage <dbl>
plot_intro(crop)

plot_missing(crop)

head(crop)
## # A tibble: 6 × 18
##      id year    province division         district       usage_in_1000_nutirie…¹
##   <dbl> <fct>   <chr>    <chr>            <chr>                            <dbl>
## 1     1 2002-03 Punjab   Bahawalpur Divn. Bahawalpur                         114
## 2     2 2002-03 Punjab   Bahawalpur Divn. Bahawalnagar                        93
## 3     3 2002-03 Punjab   Bahawalpur Divn. Rahim Yar Khan                     168
## 4     4 2002-03 Punjab   D.G.Khan Divn.   D.G. Khan                           48
## 5     5 2002-03 Punjab   D.G.Khan Divn.   Layyah                              48
## 6     6 2002-03 Punjab   D.G.Khan Divn.   Muzaffargarh                        87
## # ℹ abbreviated name: ¹​usage_in_1000_nutirient_tons
## # ℹ 12 more variables: area_sown_total_wheat <dbl>, area_sown_rice <dbl>,
## #   area_sown_cotton <dbl>, area_sown_sugarcane <dbl>, wheat_production <dbl>,
## #   rice_production <dbl>, cotton_production <dbl>, sugarcane_production <dbl>,
## #   output_acre_wheat <dbl>, output_acre_rice <dbl>, output_acre_cotton <dbl>,
## #   output_acre_sugarcane <dbl>
colnames(crop)
##  [1] "id"                           "year"                        
##  [3] "province"                     "division"                    
##  [5] "district"                     "usage_in_1000_nutirient_tons"
##  [7] "area_sown_total_wheat"        "area_sown_rice"              
##  [9] "area_sown_cotton"             "area_sown_sugarcane"         
## [11] "wheat_production"             "rice_production"             
## [13] "cotton_production"            "sugarcane_production"        
## [15] "output_acre_wheat"            "output_acre_rice"            
## [17] "output_acre_cotton"           "output_acre_sugarcane"

We have wheat, rice, cotton, sugarcane area sown We have wheat, rice, cotton, sugarcane production We have wheat, rice, cotton, sugarcane output acrea

#Wheat

options(repr.plot.width=5, repr.plot.height=8)

wheat <-  crop %>% select(area_sown_total_wheat,wheat_production,division, output_acre_wheat) %>% group_by(division) %>% 
  summarise(mean_area_sown = mean(area_sown_total_wheat),  
            mean_production = mean(wheat_production), 
            mean_output = mean(output_acre_wheat))
wheat
## # A tibble: 9 × 4
##   division         mean_area_sown mean_production mean_output
##   <chr>                     <dbl>           <dbl>       <dbl>
## 1 Bahawalpur Divn.          305.             870.        2.85
## 2 D.G.Khan Divn.            204.             525.        2.56
## 3 Faisalabad Divn.          240.             700.        2.94
## 4 Gujranwala Divn.          166.             426.        2.51
## 5 Lahore Divn.              158.             463.        2.90
## 6 Multan Divn.              205.             601.        2.94
## 7 Rawalpindi Divn.           91.3            120.        1.46
## 8 Sahiwal Divn.             168.             565.        3.33
## 9 Sargodha Divn.            154.             353.        2.23
w_sown <- ggplot(wheat, mapping = aes( x = fct_rev(fct_reorder(division, mean_area_sown)), y = mean_area_sown)) + 
  geom_col() + 
  theme_minimal() +
  #coord_flip() +
  theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
  labs(
    title = "Average area sown for Wheat in each division",
    x = "Divisions",
    y="Mean area sown"
  )

w_prod <- ggplot(wheat, mapping = aes( x = fct_rev(fct_reorder(division, mean_production)), y = mean_production)) + 
  geom_col() + 
  theme_minimal() +
  #coord_flip() +
  theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
  labs(
    title = "Average Wheat production in each division",
    x = "Divisions",
    y="Mean production"
  )
w_output <- ggplot(wheat, mapping = aes( x = fct_rev(fct_reorder(division, mean_output)), y = mean_output)) + 
  geom_col() + 
  theme_minimal() +
  #coord_flip() +
  theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
  labs(
    title = "Average Wheat Output in each division",
    x = "Divisions",
    y="Mean output"
  )

grid.arrange(w_sown, w_prod,w_output, ncol = 3)

#Cotton

options(repr.plot.width=5, repr.plot.height=8)

cotton <- crop %>% select(area_sown_cotton,cotton_production,division, output_acre_cotton) %>% group_by(division) %>% 
  summarise(mean_area_sown = mean(area_sown_cotton),  
            mean_production = mean(cotton_production), 
            mean_output = mean(output_acre_cotton))    
cotton
## # A tibble: 9 × 4
##   division         mean_area_sown mean_production mean_output
##   <chr>                     <dbl>           <dbl>       <dbl>
## 1 Bahawalpur Divn.       255.            1062.          4.17 
## 2 D.G.Khan Divn.         111.             422.          3.66 
## 3 Faisalabad Divn.        42.5            119.          2.74 
## 4 Gujranwala Divn.         0.306            0.444       0.229
## 5 Lahore Divn.             4.12             8.76        1.00 
## 6 Multan Divn.           175.             709.          3.87 
## 7 Rawalpindi Divn.         0.0708           0.219       0.990
## 8 Sahiwal Divn.           53.4            212.          3.97 
## 9 Sargodha Divn.          11.6             31.9         2.46
c_sown <- ggplot(cotton, mapping = aes( x = fct_rev(fct_reorder(division, mean_area_sown)), y = mean_area_sown)) + 
  geom_col() + 
  theme_minimal() +
  #coord_flip() +
  theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
  labs(
    title = "Average area sown for Cotton in each division",
    x = "Divisions",
    y="Mean area sown"
  )

c_prod <- ggplot(cotton, mapping = aes( x = fct_rev(fct_reorder(division, mean_production)), y = mean_production)) + 
  geom_col() + 
  theme_minimal() +
  #coord_flip() +
  theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
  labs(
    title = "Average Cotton production in each division",
    x = "Divisions",
    y="Mean production"
  )

c_output <- ggplot(cotton, mapping = aes( x = fct_rev(fct_reorder(division, mean_output)), y = mean_output)) + 
  geom_col() + 
  theme_minimal() +
  #coord_flip() +
  theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
  labs(
    title = "Average Cotton Output in each division",
    x = "Divisions",
    y="Mean output"
  )

grid.arrange(c_sown, c_prod,c_output, ncol = 3)

#RICE

rice <- crop %>% select(area_sown_rice,rice_production,division, output_acre_rice) %>% group_by(division) %>% 
  summarise(mean_area_sown = mean(area_sown_rice),  
            mean_production = mean(rice_production), 
            mean_output = mean(output_acre_rice))  
rice
## # A tibble: 9 × 4
##   division         mean_area_sown mean_production mean_output
##   <chr>                     <dbl>           <dbl>       <dbl>
## 1 Bahawalpur Divn.         27.4            50.2         1.71 
## 2 D.G.Khan Divn.           16.2            32.5         1.88 
## 3 Faisalabad Divn.         44.4            75.5         1.71 
## 4 Gujranwala Divn.        124.            232.          1.81 
## 5 Lahore Divn.            108.            195.          1.83 
## 6 Multan Divn.             20.8            36.5         1.71 
## 7 Rawalpindi Divn.          0.312           0.521       0.438
## 8 Sahiwal Divn.            65.8           151.          2.19 
## 9 Sargodha Divn.           15.8            26.1         1.68
r_sown <- ggplot(rice, mapping = aes( x = fct_rev(fct_reorder(division, mean_area_sown)), y = mean_area_sown)) + 
  geom_col() + 
  theme_minimal() +
  #coord_flip() +
  theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
  labs(
    title = "Average area sown for Rice in each division",
    x = "Divisions",
    y="Mean area sown"
  )

r_prod <- ggplot(rice, mapping = aes( x = fct_rev(fct_reorder(division, mean_production)), y = mean_production)) + 
  geom_col() + 
  theme_minimal() +
  #coord_flip() +
  theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
  labs(
    title = "Average Rice production in each division",
    x = "Divisions",
    y="Mean production"
  )
r_output <- ggplot(rice, mapping = aes( x = fct_rev(fct_reorder(division, mean_output)), y = mean_output)) + 
  geom_col() + 
  theme_minimal() +
  #coord_flip() +
  theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
  labs(
    title = "Average Rice Output in each division",
    x = "Divisions",
    y="Mean output"
  )

grid.arrange(r_sown, r_prod, r_output, ncol = 3)

#Sugarcane

sugarcane <- crop %>% select(area_sown_sugarcane,sugarcane_production,division, output_acre_sugarcane) %>% group_by(division) %>% 
  summarise(mean_area_sown = mean(area_sown_sugarcane),  
            mean_production = mean(sugarcane_production), 
            mean_output = mean(output_acre_sugarcane)) 

sugarcane
## # A tibble: 9 × 4
##   division         mean_area_sown mean_production mean_output
##   <chr>                     <dbl>           <dbl>       <dbl>
## 1 Bahawalpur Divn.        38.6            2553.          60.0
## 2 D.G.Khan Divn.          16.4             931.          58.5
## 3 Faisalabad Divn.        69.0            3582.          53.3
## 4 Gujranwala Divn.         7.76            340.          40.6
## 5 Lahore Divn.            18.2             862.          52.0
## 6 Multan Divn.             8.98            474.          51.4
## 7 Rawalpindi Divn.         0.0917            3.83        19.2
## 8 Sahiwal Divn.           10.2             477.          48.1
## 9 Sargodha Divn.          23.2            1099.          46.9
s_sown <- ggplot(sugarcane, mapping = aes( x = fct_rev(fct_reorder(division, mean_area_sown)), y = mean_area_sown)) + 
  geom_col() + 
  theme_minimal() +
  #coord_flip() +
  theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
  labs(
    title = "Average area sown for Sugarcane in each division",
    x = "Divisions",
    y="Mean area sown"
  )

s_prod <- ggplot(sugarcane, mapping = aes( x = fct_rev(fct_reorder(division, mean_production)), y = mean_production)) + 
  geom_col() + 
  theme_minimal() +
  #coord_flip() +
  theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
  labs(
    title = "Average Sugarcane production in each division",
    x = "Divisions",
    y="Mean production"
  )
s_output <- ggplot(sugarcane, mapping = aes( x = fct_rev(fct_reorder(division, mean_output)), y = mean_output)) + 
  geom_col() + 
  theme_minimal() +
  #coord_flip() +
  theme(axis.text.x = element_text(angle = 90), plot.background = element_rect(fill = "#F4F6F7"), legend.position = "bottom") +
  labs(
    title = "Average Sugarcane Output in each division",
    x = "Divisions",
    y="Mean output"
  )

grid.arrange(s_sown, s_prod, s_output, ncol = 3)