Redo_projects_21.04.2024

# Use Sindh data, 21.04.2024

library(mice)

## Warning: package 'mice' was built under R version 4.2.3

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.2.3

## corrplot 0.92 loaded

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.2.3

## Warning: package 'ggplot2' was built under R version 4.2.3

## Warning: package 'tibble' was built under R version 4.2.3

## Warning: package 'purrr' was built under R version 4.2.3

## Warning: package 'dplyr' was built under R version 4.2.3

## Warning: package 'lubridate' was built under R version 4.2.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(Amelia)

## Warning: package 'Amelia' was built under R version 4.2.3

## Loading required package: Rcpp

## Warning: package 'Rcpp' was built under R version 4.2.3

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

library(psych)

## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(sqldf)

## Warning: package 'sqldf' was built under R version 4.2.3

## Loading required package: gsubfn

## Warning: package 'gsubfn' was built under R version 4.2.3

## Loading required package: proto

## Warning: package 'proto' was built under R version 4.2.3

## Loading required package: RSQLite

## Warning: package 'RSQLite' was built under R version 4.2.3

library(reshape2)

## Warning: package 'reshape2' was built under R version 4.2.3

## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths

library(Hmisc)

## Warning: package 'Hmisc' was built under R version 4.2.3

## 
## Attaching package: 'Hmisc'
## 
## The following object is masked from 'package:psych':
## 
##     describe
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(doBy)

## Warning: package 'doBy' was built under R version 4.2.3

## 
## Attaching package: 'doBy'
## 
## The following object is masked from 'package:dplyr':
## 
##     order_by

library(gmodels)

## Warning: package 'gmodels' was built under R version 4.2.3

library(car)

## Warning: package 'car' was built under R version 4.2.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.2.3

## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:psych':
## 
##     logit
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

library(effects)

## Warning: package 'effects' was built under R version 4.2.3

## lattice theme set by effectsTheme()
## See ?effectsTheme for details.

library(polycor)

## Warning: package 'polycor' was built under R version 4.2.3

## 
## Attaching package: 'polycor'
## 
## The following object is masked from 'package:psych':
## 
##     polyserial

library(gvlma)
library(boot)

## 
## Attaching package: 'boot'
## 
## The following object is masked from 'package:car':
## 
##     logit
## 
## The following object is masked from 'package:psych':
## 
##     logit

library(MASS)

## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select

library(tidyr)
library(dplyr)
library(modelr)

## Warning: package 'modelr' was built under R version 4.2.3

library(gapminder)

## Warning: package 'gapminder' was built under R version 4.2.3

library(tree)

## Warning: package 'tree' was built under R version 4.2.3

library(gbm)

## Warning: package 'gbm' was built under R version 4.2.3

## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3

library(randomForest)

## Warning: package 'randomForest' was built under R version 4.2.3

## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:psych':
## 
##     outlier
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin

library(ggrepel)

## Warning: package 'ggrepel' was built under R version 4.2.3

library(fastDummies)

## Warning: package 'fastDummies' was built under R version 4.2.3

## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.

library(vcd)

## Warning: package 'vcd' was built under R version 4.2.3

## Loading required package: grid

# Load the Sindh school enrolment statistics from the working directory.
# NOTE(review): Population contains thousands separators (e.g. "1,804,516")
# and is therefore read as character, not numeric.
sinde <- read.csv(file = "sindh-school-enrollment-stats.csv")

# Preview the first six rows.
head(sinde, n = 6L)

##   District Location School.Type     Category Composition Number Population
## 1    Badin    Urban     Primary INSTITUTIONS       TOTAL    168  1,804,516
## 2    Badin    Urban     Primary INSTITUTIONS        MALE     30  1,804,516
## 3    Badin    Urban     Primary INSTITUTIONS      FEMALE     27  1,804,516
## 4    Badin    Urban     Primary INSTITUTIONS       MIXED    111  1,804,516
## 5    Badin    Urban     Primary    ENROLMENT       TOTAL  16446  1,804,516
## 6    Badin    Urban     Primary    ENROLMENT        MALE  10015  1,804,516
##    No.100000
## 1   9.309976
## 2   1.662496
## 3   1.496246
## 4   6.151234
## 5 911.380115
## 6 554.996464

# Number of rows and columns in the data set.
c(nrow(sinde), ncol(sinde))

## [1] 3480    8

# Column names of the data set.
names(sinde)

## [1] "District"    "Location"    "School.Type" "Category"    "Composition"
## [6] "Number"      "Population"  "No.100000"

# Tabulate and plot the missing-data pattern (mice); column labels are
# rotated. TRUE is spelled out because T is an ordinary, reassignable variable.
md.pattern(sinde, rotate.names = TRUE)

##  /\     /\
## {  `---'  }
## {  O   O  }
## ==>  V <==  No need for mice. This data set is completely observed.
##  \  \|/  /
##   `-----'

##      District Location School.Type Category Composition Number Population
## 3480        1        1           1        1           1      1          1
##             0        0           0        0           0      0          0
##      No.100000  
## 3480         1 0
##              0 0

# Frequency of each Composition level.
table(sinde[["Composition"]])

## 
## FEMALE   MALE  MIXED  TOTAL 
##   1044   1044    348   1044

# Set the repr plot size options, then print per-column descriptive
# statistics (n, mean, sd, median, skew, ...) for the whole data set.
options(repr.plot.height = 10, repr.plot.width = 12)
psych::describe(x = sinde)

##              vars    n    mean       sd median trimmed    mad min       max
## District*       1 3480   15.00     8.37  15.00   15.00  10.38   1     29.00
## Location*       2 3480    2.00     0.82   2.00    2.00   1.48   1      3.00
## School.Type*    3 3480    2.50     1.12   2.50    2.50   1.48   1      4.00
## Category*       4 3480    2.00     0.77   2.00    2.00   1.48   1      3.00
## Composition*    5 3480    2.40     1.20   2.00    2.38   1.48   1      4.00
## Number          6 3480 5083.10 16240.83 197.00 1381.53 289.11   0 212708.00
## Population*     7 3480   14.62     7.96  15.00   14.65  10.38   1     28.00
## No.100000       8 3480  335.92  1059.81  13.06   90.72  19.18   0  10404.94
##                  range  skew kurtosis     se
## District*        28.00  0.00    -1.20   0.14
## Location*         2.00  0.00    -1.50   0.01
## School.Type*      3.00  0.00    -1.36   0.02
## Category*         2.00  0.00    -1.33   0.01
## Composition*      3.00  0.24    -1.49   0.02
## Number       212708.00  6.11    46.48 275.31
## Population*      27.00 -0.04    -1.16   0.14
## No.100000     10404.94  5.50    35.14  17.97

# Draw the Amelia missingness map for every column of the data set.
missmap(sinde, margins = c(4, 2))

# Keep only the numeric columns for the correlation plot and histograms.
# vapply() guarantees a logical vector whatever the input looks like
# (sapply() can silently return a list), and drop = FALSE keeps `df` a data
# frame even if only a single numeric column is present.
num.cols <- vapply(sinde, is.numeric, logical(1))
df <- sinde[, num.cols, drop = FALSE]
head(df)

##   Number  No.100000
## 1    168   9.309976
## 2     30   1.662496
## 3     27   1.496246
## 4    111   6.151234
## 5  16446 911.380115
## 6  10015 554.996464

# Correlation matrix of the numeric columns, printed as numbers and ordered
# by hierarchical clustering.
corrplot(corr = cor(df), method = "number", order = "hclust")

# Histogram of the per-100,000 rate using five bins.
# `labels` is spelled out in full: the original `label =` only worked through
# partial argument matching. label_comma() supersedes comma_format().
ggplot(df, aes(No.100000)) +
  geom_histogram(bins = 5) +
  theme(
    strip.text.y  = element_text(angle = 0),
    strip.text    = element_text(size = 13),
    axis.text.y   = element_text(size = 11),
    plot.title    = element_text(hjust = .5, lineheight = .20, size = 15),
    plot.subtitle = element_text(hjust = .5, lineheight = .20, size = 12),
    axis.title.x  = element_text(size = 15),
    axis.title.y  = element_text(size = 15),
    axis.text.x   = element_text(size = 10)
  ) +
  scale_x_continuous(labels = scales::label_comma())

# Histogram of the raw Number column.
# bins = 30 states the default stat_bin() was already using, which silences
# the "Pick better value" message without changing the picture.
# `labels` is spelled out in full: the original `label =` only worked through
# partial argument matching. label_comma() supersedes comma_format().
ggplot(df, aes(Number)) +
  geom_histogram(bins = 30) +
  theme(
    strip.text.y  = element_text(angle = 0),
    strip.text    = element_text(size = 13),
    axis.text.y   = element_text(size = 11),
    plot.title    = element_text(hjust = .5, lineheight = .20, size = 15),
    plot.subtitle = element_text(hjust = .5, lineheight = .20, size = 12),
    axis.title.x  = element_text(size = 15),
    axis.title.y  = element_text(size = 15),
    axis.text.x   = element_text(size = 10)
  ) +
  scale_x_continuous(labels = scales::label_comma()) +
  labs(title = "Number of schools distribution") +
  ylab("School's Number")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Three-way count table (District x Location x School.Type) printed with
# row, column and layer totals.
mytable1 <- xtabs(~ District + Location + School.Type, data = sinde)
addmargins(mytable1)

## , , School.Type = Higher secondary
## 
##                      Location
## District              Rural Total Urban  Sum
##   Badin                  10    10    10   30
##   Central Karachi        10    10    10   30
##   Dadu                   10    10    10   30
##   East Karachi           10    10    10   30
##   Ghotki                 10    10    10   30
##   Hyderabad              10    10    10   30
##   Jacobad                10    10    10   30
##   Jamshoro               10    10    10   30
##   Kamber Shahdadkot      10    10    10   30
##   Kashmore               10    10    10   30
##   Khairpur Mirs          10    10    10   30
##   Korangi Karachi        10    10    10   30
##   Larkana                10    10    10   30
##   Malir Karachi          10    10    10   30
##   Matira                 10    10    10   30
##   Mirpurkhaas            10    10    10   30
##   Naushero Feroz         10    10    10   30
##   Sanghar                10    10    10   30
##   Shaheed Benazirabad    10    10    10   30
##   Shikarpur              10    10    10   30
##   South Karachi          10    10    10   30
##   Sujawal                10    10    10   30
##   Sukkhur                10    10    10   30
##   Tando Allah yar        10    10    10   30
##   Tando Muhammad Khan    10    10    10   30
##   Tharparkar             10    10    10   30
##   Thatta                 10    10    10   30
##   Umerkot                10    10    10   30
##   West Karchi            10    10    10   30
##   Sum                   290   290   290  870
## 
## , , School.Type = Middle
## 
##                      Location
## District              Rural Total Urban  Sum
##   Badin                  10    10    10   30
##   Central Karachi        10    10    10   30
##   Dadu                   10    10    10   30
##   East Karachi           10    10    10   30
##   Ghotki                 10    10    10   30
##   Hyderabad              10    10    10   30
##   Jacobad                10    10    10   30
##   Jamshoro               10    10    10   30
##   Kamber Shahdadkot      10    10    10   30
##   Kashmore               10    10    10   30
##   Khairpur Mirs          10    10    10   30
##   Korangi Karachi        10    10    10   30
##   Larkana                10    10    10   30
##   Malir Karachi          10    10    10   30
##   Matira                 10    10    10   30
##   Mirpurkhaas            10    10    10   30
##   Naushero Feroz         10    10    10   30
##   Sanghar                10    10    10   30
##   Shaheed Benazirabad    10    10    10   30
##   Shikarpur              10    10    10   30
##   South Karachi          10    10    10   30
##   Sujawal                10    10    10   30
##   Sukkhur                10    10    10   30
##   Tando Allah yar        10    10    10   30
##   Tando Muhammad Khan    10    10    10   30
##   Tharparkar             10    10    10   30
##   Thatta                 10    10    10   30
##   Umerkot                10    10    10   30
##   West Karchi            10    10    10   30
##   Sum                   290   290   290  870
## 
## , , School.Type = Primary
## 
##                      Location
## District              Rural Total Urban  Sum
##   Badin                  10    10    10   30
##   Central Karachi        10    10    10   30
##   Dadu                   10    10    10   30
##   East Karachi           10    10    10   30
##   Ghotki                 10    10    10   30
##   Hyderabad              10    10    10   30
##   Jacobad                10    10    10   30
##   Jamshoro               10    10    10   30
##   Kamber Shahdadkot      10    10    10   30
##   Kashmore               10    10    10   30
##   Khairpur Mirs          10    10    10   30
##   Korangi Karachi        10    10    10   30
##   Larkana                10    10    10   30
##   Malir Karachi          10    10    10   30
##   Matira                 10    10    10   30
##   Mirpurkhaas            10    10    10   30
##   Naushero Feroz         10    10    10   30
##   Sanghar                10    10    10   30
##   Shaheed Benazirabad    10    10    10   30
##   Shikarpur              10    10    10   30
##   South Karachi          10    10    10   30
##   Sujawal                10    10    10   30
##   Sukkhur                10    10    10   30
##   Tando Allah yar        10    10    10   30
##   Tando Muhammad Khan    10    10    10   30
##   Tharparkar             10    10    10   30
##   Thatta                 10    10    10   30
##   Umerkot                10    10    10   30
##   West Karchi            10    10    10   30
##   Sum                   290   290   290  870
## 
## , , School.Type = Secondary
## 
##                      Location
## District              Rural Total Urban  Sum
##   Badin                  10    10    10   30
##   Central Karachi        10    10    10   30
##   Dadu                   10    10    10   30
##   East Karachi           10    10    10   30
##   Ghotki                 10    10    10   30
##   Hyderabad              10    10    10   30
##   Jacobad                10    10    10   30
##   Jamshoro               10    10    10   30
##   Kamber Shahdadkot      10    10    10   30
##   Kashmore               10    10    10   30
##   Khairpur Mirs          10    10    10   30
##   Korangi Karachi        10    10    10   30
##   Larkana                10    10    10   30
##   Malir Karachi          10    10    10   30
##   Matira                 10    10    10   30
##   Mirpurkhaas            10    10    10   30
##   Naushero Feroz         10    10    10   30
##   Sanghar                10    10    10   30
##   Shaheed Benazirabad    10    10    10   30
##   Shikarpur              10    10    10   30
##   South Karachi          10    10    10   30
##   Sujawal                10    10    10   30
##   Sukkhur                10    10    10   30
##   Tando Allah yar        10    10    10   30
##   Tando Muhammad Khan    10    10    10   30
##   Tharparkar             10    10    10   30
##   Thatta                 10    10    10   30
##   Umerkot                10    10    10   30
##   West Karchi            10    10    10   30
##   Sum                   290   290   290  870
## 
## , , School.Type = Sum
## 
##                      Location
## District              Rural Total Urban  Sum
##   Badin                  40    40    40  120
##   Central Karachi        40    40    40  120
##   Dadu                   40    40    40  120
##   East Karachi           40    40    40  120
##   Ghotki                 40    40    40  120
##   Hyderabad              40    40    40  120
##   Jacobad                40    40    40  120
##   Jamshoro               40    40    40  120
##   Kamber Shahdadkot      40    40    40  120
##   Kashmore               40    40    40  120
##   Khairpur Mirs          40    40    40  120
##   Korangi Karachi        40    40    40  120
##   Larkana                40    40    40  120
##   Malir Karachi          40    40    40  120
##   Matira                 40    40    40  120
##   Mirpurkhaas            40    40    40  120
##   Naushero Feroz         40    40    40  120
##   Sanghar                40    40    40  120
##   Shaheed Benazirabad    40    40    40  120
##   Shikarpur              40    40    40  120
##   South Karachi          40    40    40  120
##   Sujawal                40    40    40  120
##   Sukkhur                40    40    40  120
##   Tando Allah yar        40    40    40  120
##   Tando Muhammad Khan    40    40    40  120
##   Tharparkar             40    40    40  120
##   Thatta                 40    40    40  120
##   Umerkot                40    40    40  120
##   West Karchi            40    40    40  120
##   Sum                  1160  1160  1160 3480

# District x Population count table; show the first six rows with margins.
mytable2 <- xtabs(~ District + Population, data = sinde)
head(addmargins(mytable2), n = 6L)

##                  Population
## District          1,006,297 1,073,146 1,231,481 1,341,042 1,487,903 1,505,876
##   Badin                   0         0         0         0         0         0
##   Central Karachi         0         0         0         0         0         0
##   Dadu                    0         0         0         0         0         0
##   East Karachi            0         0         0         0         0         0
##   Ghotki                  0         0         0         0         0         0
##   Hyderabad               0         0         0         0         0         0
##                  Population
## District          1,524,391 1,550,266 1,612,373 1,612,847 1,646,318 1,649,661
##   Badin                   0         0         0         0         0         0
##   Central Karachi         0         0         0         0         0         0
##   Dadu                    0       120         0         0         0         0
##   East Karachi            0         0         0         0         0         0
##   Ghotki                  0         0         0         0       120         0
##   Hyderabad               0         0         0         0         0         0
##                  Population
## District          1,791,751 1,804,516 2,008,901 2,057,057 2,199,463 2,404,334
##   Badin                   0       120         0         0         0         0
##   Central Karachi         0         0         0         0         0         0
##   Dadu                    0         0         0         0         0         0
##   East Karachi            0         0         0         0         0         0
##   Ghotki                  0         0         0         0         0         0
##   Hyderabad               0         0         0         0       120         0
##                  Population
## District          2,457,019 2,907,467 2,971,626 3,914,757 677,228 769,349
##   Badin                   0         0         0         0       0       0
##   Central Karachi         0         0       120         0       0       0
##   Dadu                    0         0         0         0       0       0
##   East Karachi            0       120         0         0       0       0
##   Ghotki                  0         0         0         0       0       0
##   Hyderabad               0         0         0         0       0       0
##                  Population
## District          781,967 836,887 979,817 993,142 Sum
##   Badin                 0       0       0       0 120
##   Central Karachi       0       0       0       0 120
##   Dadu                  0       0       0       0 120
##   East Karachi          0       0       0       0 120
##   Ghotki                0       0       0       0 120
##   Hyderabad             0       0       0       0 120

# Check the relationship between district and population.
# NOTE(review): the cross-table above shows a single population value per
# district, so a significant association is expected by construction.

dp <- xtabs(~ District + Population, data = sinde)
chisq.test(dp)

## Warning in chisq.test(dp): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  dp
## X-squared = 93960, df = 756, p-value < 2.2e-16

# Chi-squared test of independence between District and Location.
dl <- xtabs(~ District + Location, data = sinde)
chisq.test(dl)

## 
##  Pearson's Chi-squared test
## 
## data:  dl
## X-squared = 0, df = 56, p-value = 1

# Chi-squared test of independence between School.Type and Category.
sctg <- xtabs(~ School.Type + Category, data = sinde)
chisq.test(sctg)

## 
##  Pearson's Chi-squared test
## 
## data:  sctg
## X-squared = 0, df = 6, p-value = 1

# Chi-squared test of independence between School.Type and Composition.
scmp <- xtabs(~ School.Type + Composition, data = sinde)
chisq.test(scmp)

## 
##  Pearson's Chi-squared test
## 
## data:  scmp
## X-squared = 0, df = 9, p-value = 1

# Chi-squared test of School.Type against Number.
# Both variables are now taken from the same data frame; the original mixed
# `sinde$Number` into a formula whose `data` subset excluded that column.
# The cell counts and the test result are unchanged.
# NOTE(review): Number is a count with very many distinct values, so this
# contingency table is extremely sparse (see the approximation warning);
# consider a test suited to a numeric response instead.
scnum <- xtabs(~ School.Type + Number, data = sinde)
chisq.test(scnum)

## Warning in chisq.test(scnum): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  scnum
## X-squared = 5630.9, df = 4965, p-value = 7.194e-11

# Chi-squared test of independence between School.Type and Population.
scpop <- xtabs(~ School.Type + Population, data = sinde)
chisq.test(scpop)

## 
##  Pearson's Chi-squared test
## 
## data:  scpop
## X-squared = 0, df = 81, p-value = 1

# Chi-squared test of No.100000 against Population.
# Both variables are now taken from the same data frame; the original mixed
# `sinde$No.100000` into a formula whose `data` subset excluded that column.
# The cell counts and the test result are unchanged.
# NOTE(review): No.100000 is continuous, so nearly every value forms its own
# table row (see the approximation warning); the test is of doubtful validity.
nopop <- xtabs(~ No.100000 + Population, data = sinde)
chisq.test(nopop)

## Warning in chisq.test(nopop): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  nopop
## X-squared = 89932, df = 74061, p-value < 2.2e-16

Significant relationships were found between school type and number, between district and population, and between population and No.100000. No other variable pairs showed a relationship.

# Order every record from the largest to the smallest Number.
number_schools <- arrange(sinde, desc(Number))

# Records strictly above the overall mean of Number, reduced to the
# identifying columns. select() is namespaced because MASS masks it.
high_number <- number_schools %>%
  filter(Number > mean(Number)) %>%
  dplyr::select(Number, District, Location, School.Type, Composition)

# Ten largest records.
head(high_number, n = 10)

##    Number            District Location School.Type Composition
## 1  212708       Khairpur Mirs    Total     Primary       TOTAL
## 2  186622       Khairpur Mirs    Rural     Primary       TOTAL
## 3  173989             Sanghar    Total     Primary       TOTAL
## 4  166351      Naushero Feroz    Total     Primary       TOTAL
## 5  165309 Shaheed Benazirabad    Total     Primary       TOTAL
## 6  159869                Dadu    Total     Primary       TOTAL
## 7  158612             Larkana    Total     Primary       TOTAL
## 8  149514      Naushero Feroz    Rural     Primary       TOTAL
## 9  148694              Ghotki    Total     Primary       TOTAL
## 10 142043             Sanghar    Rural     Primary       TOTAL

# Records strictly below the overall mean of Number, reduced to the
# identifying columns. select() is namespaced because MASS masks it.
low_number <- number_schools %>%
  filter(Number < mean(Number)) %>%
  dplyr::select(Number, District, Location, School.Type, Composition)

# Ten largest records among those below the mean (input is sorted descending).
head(low_number, n = 10)

##    Number        District Location      School.Type Composition
## 1    5081   Malir Karachi    Rural        Secondary       TOTAL
## 2    5078         Umerkot    Total           Middle       TOTAL
## 3    5077   Khairpur Mirs    Rural          Primary       TOTAL
## 4    5068         Jacobad    Rural Higher secondary        MALE
## 5    4999          Ghotki    Total Higher secondary      FEMALE
## 6    4996            Dadu    Total          Primary       TOTAL
## 7    4988        Jamshoro    Rural           Middle       TOTAL
## 8    4973         Sanghar    Rural          Primary       TOTAL
## 9    4972 Tando Allah yar    Total        Secondary      FEMALE
## 10   4939         Sujawal    Total Higher secondary       TOTAL

# Mean and standard deviation of Number per Location, rounded to whole
# numbers and listed from the highest mean downwards.
number_schools %>%
  group_by(Location) %>%
  summarise(
    mean = round(mean(Number), digits = 0),
    sd = round(sd(Number), digits = 0)
  ) %>%
  arrange(desc(mean))

## # A tibble: 3 × 3
##   Location  mean    sd
##   <chr>    <dbl> <dbl>
## 1 Total     7625 21119
## 2 Rural     5209 17266
## 3 Urban     2415  5832

# Mean and standard deviation of Number per Location, rounded to whole
# numbers and listed from the highest mean downwards.
# NOTE(review): this repeats the per-Location summary verbatim.
number_schools %>%
  group_by(Location) %>%
  summarise(
    mean = round(mean(Number), digits = 0),
    sd = round(sd(Number), digits = 0)
  ) %>%
  arrange(desc(mean))

## # A tibble: 3 × 3
##   Location  mean    sd
##   <chr>    <dbl> <dbl>
## 1 Total     7625 21119
## 2 Rural     5209 17266
## 3 Urban     2415  5832

# Mean and standard deviation of Number for every District/Location pair,
# rounded to whole numbers and sorted by descending mean.
# .groups = "drop" returns an ungrouped tibble, so no residual grouping
# leaks into later steps and the summarise() grouping message is not emitted.
number_schools %>%
  group_by(District, Location) %>%
  summarise(
    mean = round(mean(Number), 0),
    sd = round(sd(Number), 0),
    .groups = "drop"
  ) %>%
  arrange(desc(mean))

## `summarise()` has grouped output by 'District'. You can override using the
## `.groups` argument.

## # A tibble: 87 × 4
## # Groups:   District [29]
##    District            Location  mean    sd
##    <chr>               <chr>    <dbl> <dbl>
##  1 Khairpur Mirs       Total    17446 41047
##  2 Khairpur Mirs       Rural    14613 35697
##  3 Sanghar             Total    12968 33455
##  4 Naushero Feroz      Total    12882 31589
##  5 Larkana             Total    12706 30601
##  6 Shaheed Benazirabad Total    12515 31629
##  7 Dadu                Total    11598 30428
##  8 Naushero Feroz      Rural    10297 28186
##  9 Badin               Total    10212 27025
## 10 Ghotki              Total    10156 28573
## # ℹ 77 more rows

# Mean and standard deviation of Number for every School.Type/Location pair,
# rounded to whole numbers and sorted by descending mean.
# .groups = "drop" returns an ungrouped tibble, so no residual grouping
# leaks into later steps and the summarise() grouping message is not emitted.
number_schools %>%
  group_by(School.Type, Location) %>%
  summarise(
    mean = round(mean(Number), 0),
    sd = round(sd(Number), 0),
    .groups = "drop"
  ) %>%
  arrange(desc(mean))

## `summarise()` has grouped output by 'School.Type'. You can override using the
## `.groups` argument.

## # A tibble: 12 × 4
## # Groups:   School.Type [4]
##    School.Type      Location  mean    sd
##    <chr>            <chr>    <dbl> <dbl>
##  1 Primary          Total    19754 37606
##  2 Primary          Rural    15358 31639
##  3 Secondary        Total     6163 11129
##  4 Primary          Urban     4396  8173
##  5 Secondary        Urban     3340  7081
##  6 Secondary        Rural     2823  5987
##  7 Higher secondary Total     2471  5032
##  8 Middle           Total     2110  4286
##  9 Middle           Rural     1407  3248
## 10 Higher secondary Rural     1249  2969
## 11 Higher secondary Urban     1222  2583
## 12 Middle           Urban      703  1898

# Mean and standard deviation of Number for every Category/Location pair,
# rounded to whole numbers and sorted by descending mean.
# .groups = "drop" returns an ungrouped tibble, so no residual grouping
# leaks into later steps and the summarise() grouping message is not emitted.
number_schools %>%
  group_by(Category, Location) %>%
  summarise(
    mean = round(mean(Number), 0),
    sd = round(sd(Number), 0),
    .groups = "drop"
  ) %>%
  arrange(desc(mean))

## `summarise()` has grouped output by 'Category'. You can override using the
## `.groups` argument.

## # A tibble: 9 × 4
## # Groups:   Category [3]
##   Category       Location  mean    sd
##   <chr>          <chr>    <dbl> <dbl>
## 1 ENROLMENT      Total    24305 33004
## 2 ENROLMENT      Rural    16627 28422
## 3 ENROLMENT      Urban     7679  8581
## 4 TEACHING STAFF Total      867  1124
## 5 TEACHING STAFF Rural      526   914
## 6 TEACHING STAFF Urban      341   453
## 7 INSTITUTIONS   Total      183   461
## 8 INSTITUTIONS   Rural      159   436
## 9 INSTITUTIONS   Urban       24    46

# Mean and standard deviation of Number for every Composition/Location pair,
# rounded to whole numbers and sorted by descending mean.
# .groups = "drop" returns an ungrouped tibble, so no residual grouping
# leaks into later steps and the summarise() grouping message is not emitted.
number_schools %>%
  group_by(Composition, Location) %>%
  summarise(
    mean = round(mean(Number), 0),
    sd = round(sd(Number), 0),
    .groups = "drop"
  ) %>%
  arrange(desc(mean))

## `summarise()` has grouped output by 'Composition'. You can override using the
## `.groups` argument.

## # A tibble: 12 × 4
## # Groups:   Composition [4]
##    Composition Location  mean    sd
##    <chr>       <chr>    <dbl> <dbl>
##  1 TOTAL       Total    12708 30604
##  2 TOTAL       Rural     8682 25105
##  3 MALE        Total     7720 19001
##  4 MALE        Rural     5578 15899
##  5 FEMALE      Total     4899 11854
##  6 TOTAL       Urban     4026  8459
##  7 FEMALE      Rural     3024  9344
##  8 MALE        Urban     2142  4420
##  9 FEMALE      Urban     1874  4199
## 10 MIXED       Total      267   532
## 11 MIXED       Rural      237   506
## 12 MIXED       Urban       29    49

# Mean and standard deviation of Number for every Population/Location pair,
# rounded to whole numbers and sorted by descending mean.
# .groups = "drop" returns an ungrouped tibble, so no residual grouping
# leaks into later steps and the summarise() grouping message is not emitted.
number_schools %>%
  group_by(Population, Location) %>%
  summarise(
    mean = round(mean(Number), 0),
    sd = round(sd(Number), 0),
    .groups = "drop"
  ) %>%
  arrange(desc(mean))

## `summarise()` has grouped output by 'Population'. You can override using the
## `.groups` argument.

## # A tibble: 84 × 4
## # Groups:   Population [28]
##    Population Location  mean    sd
##    <chr>      <chr>    <dbl> <dbl>
##  1 2,057,057  Total    12968 33455
##  2 1,612,373  Total    12882 31589
##  3 1,524,391  Total    12706 30601
##  4 1,612,847  Total    12515 31629
##  5 2,404,334  Total    11958 31522
##  6 1,550,266  Total    11598 30428
##  7 1,612,373  Rural    10297 28186
##  8 1,804,516  Total    10212 27025
##  9 1,646,318  Total    10156 28573
## 10 2,404,334  Rural    10070 27550
## # ℹ 74 more rows

# Dot plot of Number for each Location, with the Location levels ordered by
# reorder(Location, Number).
# `labels` is spelled out in full: the original `label =` only worked through
# partial argument matching. label_comma() supersedes comma_format().
ggplot(sinde, aes(Number, reorder(Location, Number))) +
  geom_point() +
  theme(
    strip.text.y  = element_text(angle = 0),
    strip.text    = element_text(size = 13),
    axis.text.y   = element_text(size = 11),
    plot.title    = element_text(hjust = .5, lineheight = .20, size = 15),
    plot.subtitle = element_text(hjust = .5, lineheight = .20, size = 12),
    axis.title.x  = element_text(size = 15),
    axis.title.y  = element_text(size = 15),
    axis.text.x   = element_text(size = 10)
  ) +
  scale_x_continuous(labels = scales::label_comma()) +
  labs(
    title = "Number of schools distribution",
    subtitle = "School distribution across province"
  ) +
  ylab("Schools Location")

# Scatter plot of the raw count (Number, x) against the per-100,000 rate
# (No.100000, y), coloured by Composition, with dashed linear fits, dashed
# reference lines at the mean of each axis variable, and text labels on the
# records with the ten largest and three smallest Number values.
ggplot(sinde, aes(Number, No.100000, color = Composition)) +
  geom_point() +
  # Line widths use `linewidth`; `size` for lines is deprecated since
  # ggplot2 3.4.0. The formula is stated explicitly to avoid the message.
  geom_smooth(
    method = "lm", formula = y ~ x, se = FALSE, color = "black",
    linewidth = 0.2, linetype = "dashed"
  ) +
  # The horizontal line sits on the y axis, so it must use the mean of
  # No.100000 (the original used the mean of Number here by mistake).
  geom_hline(
    yintercept = mean(sinde$No.100000), linetype = "dashed", linewidth = 1
  ) +
  geom_vline(
    xintercept = mean(sinde$Number), linetype = "dashed", linewidth = 1
  ) +
  # NOTE(review): the "Mean" label coordinates are hard-coded; confirm they
  # still sit next to the reference lines.
  annotate("text", x = 14900, y = 250, label = "Mean", size = 5) +
  annotate("text", x = 38000, y = 131, label = "Mean", size = 5) +
  geom_text_repel(
    data = sinde %>% top_n(10, Number),
    aes(Number, No.100000, label = Composition),
    color = "red", segment.color = "black"
  ) +
  # `labels` is spelled out in full (the original relied on partial matching).
  scale_x_continuous(labels = scales::label_comma()) +
  # top_n() keeps ties, so every record tied for the lowest Number is labelled.
  geom_text_repel(
    data = sinde %>% top_n(-3, Number),
    aes(Number, No.100000, label = Composition),
    color = "red", segment.color = "black"
  ) +
  theme(
    legend.title  = element_text(size = 8),
    legend.text   = element_text(size = 13),
    axis.title.x  = element_text(size = 15),
    axis.title.y  = element_text(size = 15),
    axis.text.x   = element_text(size = 10),
    axis.text.y   = element_text(size = 10),
    plot.title    = element_text(hjust = .5, lineheight = .20, size = 15),
    plot.subtitle = element_text(hjust = .5, lineheight = .20, size = 12)
  ) +
  labs(title = "Number of schools have a linear relationship")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `geom_smooth()` using formula = 'y ~ x'

## Warning: ggrepel: 155 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

 # Number by Location, coloured by Composition, one panel per Composition.
 # `labels` is spelled out in full: the original `label =` only worked through
 # partial argument matching. label_comma() supersedes comma_format().
 ggplot(sinde, aes(Number, Location, color = Composition)) +
   geom_point() +
   facet_wrap(~ Composition) +
   theme(
     strip.text.y  = element_text(angle = 0),
     strip.text    = element_text(size = 13),
     axis.text.y   = element_text(size = 11),
     plot.title    = element_text(hjust = .5, lineheight = .20, size = 15),
     plot.subtitle = element_text(hjust = .5, lineheight = .20, size = 12),
     axis.title.x  = element_text(size = 15),
     axis.title.y  = element_text(size = 15),
     axis.text.x   = element_text(size = 10)
   ) +
   scale_x_continuous(labels = scales::label_comma()) +
   labs(
     title = "Number of schools distribution",
     subtitle = "School distribution across Genders"
   ) +
   ylab("School's Location")

 # Number by Location, coloured by Composition, one panel per School.Type.
 # `labels` is spelled out in full: the original `label =` only worked through
 # partial argument matching. label_comma() supersedes comma_format().
 ggplot(sinde, aes(Number, Location, color = Composition)) +
   geom_point() +
   facet_wrap(~ School.Type) +
   theme(
     strip.text.y  = element_text(angle = 0),
     strip.text    = element_text(size = 13),
     axis.text.y   = element_text(size = 11),
     plot.title    = element_text(hjust = .5, lineheight = .20, size = 15),
     plot.subtitle = element_text(hjust = .5, lineheight = .20, size = 12),
     axis.title.x  = element_text(size = 15),
     axis.title.y  = element_text(size = 15),
     axis.text.x   = element_text(size = 10)
   ) +
   scale_x_continuous(labels = scales::label_comma()) +
   labs(
     title = "Number of schools distribution",
     subtitle = "School distribution across province"
   ) +
   ylab("Schools Location")

Redo_projects_21.04.2024

2024-04-21