#use sind data 21.04.2024
library(mice)
## Warning: package 'mice' was built under R version 4.2.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.2.3
## corrplot 0.92 loaded
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(Amelia)
## Warning: package 'Amelia' was built under R version 4.2.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.2.3
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(sqldf)
## Warning: package 'sqldf' was built under R version 4.2.3
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 4.2.3
## Loading required package: proto
## Warning: package 'proto' was built under R version 4.2.3
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 4.2.3
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.2.3
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.2.3
##
## Attaching package: 'Hmisc'
##
## The following object is masked from 'package:psych':
##
## describe
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
library(doBy)
## Warning: package 'doBy' was built under R version 4.2.3
##
## Attaching package: 'doBy'
##
## The following object is masked from 'package:dplyr':
##
## order_by
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.2.3
library(car)
## Warning: package 'car' was built under R version 4.2.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.2.3
##
## Attaching package: 'car'
##
## The following object is masked from 'package:psych':
##
## logit
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(effects)
## Warning: package 'effects' was built under R version 4.2.3
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(polycor)
## Warning: package 'polycor' was built under R version 4.2.3
##
## Attaching package: 'polycor'
##
## The following object is masked from 'package:psych':
##
## polyserial
library(gvlma)
library(boot)
##
## Attaching package: 'boot'
##
## The following object is masked from 'package:car':
##
## logit
##
## The following object is masked from 'package:psych':
##
## logit
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(tidyr)
library(dplyr)
library(modelr)
## Warning: package 'modelr' was built under R version 4.2.3
library(gapminder)
## Warning: package 'gapminder' was built under R version 4.2.3
library(tree)
## Warning: package 'tree' was built under R version 4.2.3
library(gbm)
## Warning: package 'gbm' was built under R version 4.2.3
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:psych':
##
## outlier
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.2.3
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.2.3
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
library(vcd)
## Warning: package 'vcd' was built under R version 4.2.3
## Loading required package: grid
# Load the Sindh school enrolment statistics and preview the first rows.
# Population arrives as text (values such as 1,804,516 carry thousands
# separators), so it is not among the numeric columns selected further down.
sinde <- read.csv(file = "sindh-school-enrollment-stats.csv")
head(x = sinde, n = 6L)
## District Location School.Type Category Composition Number Population
## 1 Badin Urban Primary INSTITUTIONS TOTAL 168 1,804,516
## 2 Badin Urban Primary INSTITUTIONS MALE 30 1,804,516
## 3 Badin Urban Primary INSTITUTIONS FEMALE 27 1,804,516
## 4 Badin Urban Primary INSTITUTIONS MIXED 111 1,804,516
## 5 Badin Urban Primary ENROLMENT TOTAL 16446 1,804,516
## 6 Badin Urban Primary ENROLMENT MALE 10015 1,804,516
## No.100000
## 1 9.309976
## 2 1.662496
## 3 1.496246
## 4 6.151234
## 5 911.380115
## 6 554.996464
# Report the number of rows and columns of the data frame.
c(nrow(sinde), ncol(sinde))
## [1] 3480 8
# List the column names of the data frame.
names(sinde)
## [1] "District" "Location" "School.Type" "Category" "Composition"
## [6] "Number" "Population" "No.100000"
# Tabulate the missing-data pattern with mice, rotating the column labels.
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned,
# whereas TRUE is a reserved word.
md.pattern(sinde, rotate.names = TRUE)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## District Location School.Type Category Composition Number Population
## 3480 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0
## No.100000
## 3480 1 0
## 0 0
# Count the rows for each Composition level.
table(sinde[["Composition"]])
##
## FEMALE MALE MIXED TOTAL
## 1044 1044 348 1044
# Descriptive statistics for every column. The psych:: prefix is needed
# because Hmisc, attached afterwards, masks describe().
# NOTE(review): the repr.plot.* options are presumably read by the notebook
# front end to size the figures -- confirm.
options(repr.plot.height = 10, repr.plot.width = 12)
psych::describe(x = sinde)
## vars n mean sd median trimmed mad min max
## District* 1 3480 15.00 8.37 15.00 15.00 10.38 1 29.00
## Location* 2 3480 2.00 0.82 2.00 2.00 1.48 1 3.00
## School.Type* 3 3480 2.50 1.12 2.50 2.50 1.48 1 4.00
## Category* 4 3480 2.00 0.77 2.00 2.00 1.48 1 3.00
## Composition* 5 3480 2.40 1.20 2.00 2.38 1.48 1 4.00
## Number 6 3480 5083.10 16240.83 197.00 1381.53 289.11 0 212708.00
## Population* 7 3480 14.62 7.96 15.00 14.65 10.38 1 28.00
## No.100000 8 3480 335.92 1059.81 13.06 90.72 19.18 0 10404.94
## range skew kurtosis se
## District* 28.00 0.00 -1.20 0.14
## Location* 2.00 0.00 -1.50 0.01
## School.Type* 3.00 0.00 -1.36 0.02
## Category* 2.00 0.00 -1.33 0.01
## Composition* 3.00 0.24 -1.49 0.02
## Number 212708.00 6.11 46.48 275.31
## Population* 27.00 -0.04 -1.16 0.14
## No.100000 10404.94 5.50 35.14 17.97
# Visualise missingness across all columns.
missmap(sinde, margins = c(4, 2))

# Keep only the numeric columns for the correlation plot.
# vapply() always returns a logical vector (sapply() can change its return
# type with the input), and drop = FALSE keeps a data frame even when only
# one numeric column is present.
num.cols <- vapply(sinde, is.numeric, logical(1))
df <- sinde[, num.cols, drop = FALSE]
head(df)
## Number No.100000
## 1 168 9.309976
## 2 30 1.662496
## 3 27 1.496246
## 4 111 6.151234
## 5 16446 911.380115
## 6 10015 554.996464
# Correlation matrix of the numeric columns, drawn as numbers and ordered
# by hierarchical clustering.
corrplot(corr = cor(df), method = "number", order = "hclust")
# Histogram of No.100000 (Number expressed per 100,000 population) in five
# bins.
ggplot(df, aes(No.100000)) +
  geom_histogram(bins = 5) +
  theme(
    strip.text.y = element_text(angle = 0),
    strip.text = element_text(size = 13),
    axis.text.y = element_text(size = 11),
    plot.title = element_text(hjust = .5, lineheight = .20, size = 15),
    plot.subtitle = element_text(hjust = .5, lineheight = .20, size = 12),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    axis.text.x = element_text(size = 10)
  ) +
  # The argument is `labels`; `label` only worked through partial matching.
  scale_x_continuous(labels = scales::comma_format())
# Histogram of Number using the default binning.
ggplot(df, aes(Number)) +
  geom_histogram() +
  theme(
    strip.text.y = element_text(angle = 0),
    strip.text = element_text(size = 13),
    axis.text.y = element_text(size = 11),
    plot.title = element_text(hjust = .5, lineheight = .20, size = 15),
    plot.subtitle = element_text(hjust = .5, lineheight = .20, size = 12),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    axis.text.x = element_text(size = 10)
  ) +
  # The argument is `labels`; `label` only worked through partial matching.
  scale_x_continuous(labels = scales::comma_format()) +
  # The y axis of a histogram is the number of rows per bin, so it is
  # labelled as a count rather than as the plotted variable.
  labs(title = "Number of schools distribution") +
  ylab("Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Three-way count table of District x Location x School.Type built from the
# non-numeric columns, printed with marginal sums.
mytable1 <- xtabs(
  formula = ~ District + Location + School.Type,
  data = sinde[, !num.cols]
)
addmargins(A = mytable1)
## , , School.Type = Higher secondary
##
## Location
## District Rural Total Urban Sum
## Badin 10 10 10 30
## Central Karachi 10 10 10 30
## Dadu 10 10 10 30
## East Karachi 10 10 10 30
## Ghotki 10 10 10 30
## Hyderabad 10 10 10 30
## Jacobad 10 10 10 30
## Jamshoro 10 10 10 30
## Kamber Shahdadkot 10 10 10 30
## Kashmore 10 10 10 30
## Khairpur Mirs 10 10 10 30
## Korangi Karachi 10 10 10 30
## Larkana 10 10 10 30
## Malir Karachi 10 10 10 30
## Matira 10 10 10 30
## Mirpurkhaas 10 10 10 30
## Naushero Feroz 10 10 10 30
## Sanghar 10 10 10 30
## Shaheed Benazirabad 10 10 10 30
## Shikarpur 10 10 10 30
## South Karachi 10 10 10 30
## Sujawal 10 10 10 30
## Sukkhur 10 10 10 30
## Tando Allah yar 10 10 10 30
## Tando Muhammad Khan 10 10 10 30
## Tharparkar 10 10 10 30
## Thatta 10 10 10 30
## Umerkot 10 10 10 30
## West Karchi 10 10 10 30
## Sum 290 290 290 870
##
## , , School.Type = Middle
##
## Location
## District Rural Total Urban Sum
## Badin 10 10 10 30
## Central Karachi 10 10 10 30
## Dadu 10 10 10 30
## East Karachi 10 10 10 30
## Ghotki 10 10 10 30
## Hyderabad 10 10 10 30
## Jacobad 10 10 10 30
## Jamshoro 10 10 10 30
## Kamber Shahdadkot 10 10 10 30
## Kashmore 10 10 10 30
## Khairpur Mirs 10 10 10 30
## Korangi Karachi 10 10 10 30
## Larkana 10 10 10 30
## Malir Karachi 10 10 10 30
## Matira 10 10 10 30
## Mirpurkhaas 10 10 10 30
## Naushero Feroz 10 10 10 30
## Sanghar 10 10 10 30
## Shaheed Benazirabad 10 10 10 30
## Shikarpur 10 10 10 30
## South Karachi 10 10 10 30
## Sujawal 10 10 10 30
## Sukkhur 10 10 10 30
## Tando Allah yar 10 10 10 30
## Tando Muhammad Khan 10 10 10 30
## Tharparkar 10 10 10 30
## Thatta 10 10 10 30
## Umerkot 10 10 10 30
## West Karchi 10 10 10 30
## Sum 290 290 290 870
##
## , , School.Type = Primary
##
## Location
## District Rural Total Urban Sum
## Badin 10 10 10 30
## Central Karachi 10 10 10 30
## Dadu 10 10 10 30
## East Karachi 10 10 10 30
## Ghotki 10 10 10 30
## Hyderabad 10 10 10 30
## Jacobad 10 10 10 30
## Jamshoro 10 10 10 30
## Kamber Shahdadkot 10 10 10 30
## Kashmore 10 10 10 30
## Khairpur Mirs 10 10 10 30
## Korangi Karachi 10 10 10 30
## Larkana 10 10 10 30
## Malir Karachi 10 10 10 30
## Matira 10 10 10 30
## Mirpurkhaas 10 10 10 30
## Naushero Feroz 10 10 10 30
## Sanghar 10 10 10 30
## Shaheed Benazirabad 10 10 10 30
## Shikarpur 10 10 10 30
## South Karachi 10 10 10 30
## Sujawal 10 10 10 30
## Sukkhur 10 10 10 30
## Tando Allah yar 10 10 10 30
## Tando Muhammad Khan 10 10 10 30
## Tharparkar 10 10 10 30
## Thatta 10 10 10 30
## Umerkot 10 10 10 30
## West Karchi 10 10 10 30
## Sum 290 290 290 870
##
## , , School.Type = Secondary
##
## Location
## District Rural Total Urban Sum
## Badin 10 10 10 30
## Central Karachi 10 10 10 30
## Dadu 10 10 10 30
## East Karachi 10 10 10 30
## Ghotki 10 10 10 30
## Hyderabad 10 10 10 30
## Jacobad 10 10 10 30
## Jamshoro 10 10 10 30
## Kamber Shahdadkot 10 10 10 30
## Kashmore 10 10 10 30
## Khairpur Mirs 10 10 10 30
## Korangi Karachi 10 10 10 30
## Larkana 10 10 10 30
## Malir Karachi 10 10 10 30
## Matira 10 10 10 30
## Mirpurkhaas 10 10 10 30
## Naushero Feroz 10 10 10 30
## Sanghar 10 10 10 30
## Shaheed Benazirabad 10 10 10 30
## Shikarpur 10 10 10 30
## South Karachi 10 10 10 30
## Sujawal 10 10 10 30
## Sukkhur 10 10 10 30
## Tando Allah yar 10 10 10 30
## Tando Muhammad Khan 10 10 10 30
## Tharparkar 10 10 10 30
## Thatta 10 10 10 30
## Umerkot 10 10 10 30
## West Karchi 10 10 10 30
## Sum 290 290 290 870
##
## , , School.Type = Sum
##
## Location
## District Rural Total Urban Sum
## Badin 40 40 40 120
## Central Karachi 40 40 40 120
## Dadu 40 40 40 120
## East Karachi 40 40 40 120
## Ghotki 40 40 40 120
## Hyderabad 40 40 40 120
## Jacobad 40 40 40 120
## Jamshoro 40 40 40 120
## Kamber Shahdadkot 40 40 40 120
## Kashmore 40 40 40 120
## Khairpur Mirs 40 40 40 120
## Korangi Karachi 40 40 40 120
## Larkana 40 40 40 120
## Malir Karachi 40 40 40 120
## Matira 40 40 40 120
## Mirpurkhaas 40 40 40 120
## Naushero Feroz 40 40 40 120
## Sanghar 40 40 40 120
## Shaheed Benazirabad 40 40 40 120
## Shikarpur 40 40 40 120
## South Karachi 40 40 40 120
## Sujawal 40 40 40 120
## Sukkhur 40 40 40 120
## Tando Allah yar 40 40 40 120
## Tando Muhammad Khan 40 40 40 120
## Tharparkar 40 40 40 120
## Thatta 40 40 40 120
## Umerkot 40 40 40 120
## West Karchi 40 40 40 120
## Sum 1160 1160 1160 3480
# Count table of District x Population; show the first rows with margins.
mytable2 <- xtabs(
  formula = ~ District + Population,
  data = sinde[, !num.cols]
)
head(x = addmargins(A = mytable2))
## Population
## District 1,006,297 1,073,146 1,231,481 1,341,042 1,487,903 1,505,876
## Badin 0 0 0 0 0 0
## Central Karachi 0 0 0 0 0 0
## Dadu 0 0 0 0 0 0
## East Karachi 0 0 0 0 0 0
## Ghotki 0 0 0 0 0 0
## Hyderabad 0 0 0 0 0 0
## Population
## District 1,524,391 1,550,266 1,612,373 1,612,847 1,646,318 1,649,661
## Badin 0 0 0 0 0 0
## Central Karachi 0 0 0 0 0 0
## Dadu 0 120 0 0 0 0
## East Karachi 0 0 0 0 0 0
## Ghotki 0 0 0 0 120 0
## Hyderabad 0 0 0 0 0 0
## Population
## District 1,791,751 1,804,516 2,008,901 2,057,057 2,199,463 2,404,334
## Badin 0 120 0 0 0 0
## Central Karachi 0 0 0 0 0 0
## Dadu 0 0 0 0 0 0
## East Karachi 0 0 0 0 0 0
## Ghotki 0 0 0 0 0 0
## Hyderabad 0 0 0 0 120 0
## Population
## District 2,457,019 2,907,467 2,971,626 3,914,757 677,228 769,349
## Badin 0 0 0 0 0 0
## Central Karachi 0 0 120 0 0 0
## Dadu 0 0 0 0 0 0
## East Karachi 0 120 0 0 0 0
## Ghotki 0 0 0 0 0 0
## Hyderabad 0 0 0 0 0 0
## Population
## District 781,967 836,887 979,817 993,142 Sum
## Badin 0 0 0 0 120
## Central Karachi 0 0 0 0 120
## Dadu 0 0 0 0 120
## East Karachi 0 0 0 0 120
## Ghotki 0 0 0 0 120
## Hyderabad 0 0 0 0 120
# Check the relationship between district and population with a chi-squared
# test on their count table.
# NOTE(review): in the table preview each district falls under a single
# population value, so the association may be structural -- confirm before
# interpreting the p-value.
dp <- xtabs(
  formula = ~ District + Population,
  data = sinde[, !num.cols]
)
chisq.test(x = dp)
## Warning in chisq.test(dp): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: dp
## X-squared = 93960, df = 756, p-value < 2.2e-16
# Chi-squared test of District against Location.
dl <- xtabs(
  formula = ~ District + Location,
  data = sinde[, !num.cols]
)
chisq.test(x = dl)
##
## Pearson's Chi-squared test
##
## data: dl
## X-squared = 0, df = 56, p-value = 1
# Chi-squared test of School.Type against Category.
sctg <- xtabs(
  formula = ~ School.Type + Category,
  data = sinde[, !num.cols]
)
chisq.test(x = sctg)
##
## Pearson's Chi-squared test
##
## data: sctg
## X-squared = 0, df = 6, p-value = 1
# Chi-squared test of School.Type against Composition.
scmp <- xtabs(
  formula = ~ School.Type + Composition,
  data = sinde[, !num.cols]
)
chisq.test(x = scmp)
##
## Pearson's Chi-squared test
##
## data: scmp
## X-squared = 0, df = 9, p-value = 1
# Chi-squared test of School.Type against Number.
# Both variables now come from the same data frame: the formula previously
# reached outside `data` via sinde$Number while `data` excluded that column,
# which only worked because the row order happened to match.
# NOTE(review): every distinct Number value becomes its own table column
# (df = 4965 in the recorded run), which is why chisq.test() warns that the
# approximation may be incorrect -- consider a test suited to a numeric
# response.
scnum <- xtabs(~ School.Type + Number, data = sinde)
chisq.test(scnum)
## Warning in chisq.test(scnum): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: scnum
## X-squared = 5630.9, df = 4965, p-value = 7.194e-11
# Chi-squared test of School.Type against Population.
scpop <- xtabs(
  formula = ~ School.Type + Population,
  data = sinde[, !num.cols]
)
chisq.test(x = scpop)
##
## Pearson's Chi-squared test
##
## data: scpop
## X-squared = 0, df = 81, p-value = 1
# Chi-squared test of No.100000 against Population.
# Both variables now come from the same data frame: the formula previously
# reached outside `data` via sinde$No.100000 while `data` excluded that
# column, which only worked because the row order happened to match.
# NOTE(review): No.100000 is numeric, so nearly every value forms its own
# table row (df = 74061 in the recorded run) and chisq.test() warns that the
# approximation may be incorrect.
nopop <- xtabs(~ No.100000 + Population, data = sinde)
chisq.test(nopop)
## Warning in chisq.test(nopop): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: nopop
## X-squared = 89932, df = 74061, p-value < 2.2e-16
A significant relationship was found between school type and number, between district and population, and between population and No.100000. No other pair of variables showed a relationship.
# Order the rows from the largest Number to the smallest.
number_schools <- arrange(sinde, desc(Number))

# Rows whose Number is above the overall mean, reduced to the key columns.
high_number <- dplyr::select(
  filter(number_schools, Number > mean(Number)),
  Number, District, Location, School.Type, Composition
)
head(high_number, n = 10)
## Number District Location School.Type Composition
## 1 212708 Khairpur Mirs Total Primary TOTAL
## 2 186622 Khairpur Mirs Rural Primary TOTAL
## 3 173989 Sanghar Total Primary TOTAL
## 4 166351 Naushero Feroz Total Primary TOTAL
## 5 165309 Shaheed Benazirabad Total Primary TOTAL
## 6 159869 Dadu Total Primary TOTAL
## 7 158612 Larkana Total Primary TOTAL
## 8 149514 Naushero Feroz Rural Primary TOTAL
## 9 148694 Ghotki Total Primary TOTAL
## 10 142043 Sanghar Rural Primary TOTAL
# Rows whose Number is below the overall mean, reduced to the key columns.
low_number <- dplyr::select(
  filter(number_schools, Number < mean(Number)),
  Number, District, Location, School.Type, Composition
)
head(low_number, n = 10)
## Number District Location School.Type Composition
## 1 5081 Malir Karachi Rural Secondary TOTAL
## 2 5078 Umerkot Total Middle TOTAL
## 3 5077 Khairpur Mirs Rural Primary TOTAL
## 4 5068 Jacobad Rural Higher secondary MALE
## 5 4999 Ghotki Total Higher secondary FEMALE
## 6 4996 Dadu Total Primary TOTAL
## 7 4988 Jamshoro Rural Middle TOTAL
## 8 4973 Sanghar Rural Primary TOTAL
## 9 4972 Tando Allah yar Total Secondary FEMALE
## 10 4939 Sujawal Total Higher secondary TOTAL
# Mean and standard deviation of Number per Location, largest mean first.
arrange(
  summarise(
    group_by(number_schools, Location),
    mean = round(mean(Number), 0),
    sd = round(sd(Number), 0)
  ),
  desc(mean)
)
## # A tibble: 3 × 3
## Location mean sd
## <chr> <dbl> <dbl>
## 1 Total 7625 21119
## 2 Rural 5209 17266
## 3 Urban 2415 5832
# Mean and standard deviation of Number per Location, largest mean first.
# NOTE(review): this repeats the per-Location summary computed just above.
number_schools %>%
  group_by(Location) %>%
  summarise(
    mean = round(mean(Number), digits = 0),
    sd = round(sd(Number), digits = 0)
  ) %>%
  arrange(desc(mean))
## # A tibble: 3 × 3
## Location mean sd
## <chr> <dbl> <dbl>
## 1 Total 7625 21119
## 2 Rural 5209 17266
## 3 Urban 2415 5832
# Mean and standard deviation of Number for each District/Location pair,
# largest mean first. `.groups = "drop"` returns an ungrouped tibble, so no
# grouping is left behind and summarise() no longer prints its `.groups`
# message.
number_schools %>%
  group_by(District, Location) %>%
  summarise(
    mean = round(mean(Number), 0),
    sd = round(sd(Number), 0),
    .groups = "drop"
  ) %>%
  arrange(desc(mean))
## `summarise()` has grouped output by 'District'. You can override using the
## `.groups` argument.
## # A tibble: 87 × 4
## # Groups: District [29]
## District Location mean sd
## <chr> <chr> <dbl> <dbl>
## 1 Khairpur Mirs Total 17446 41047
## 2 Khairpur Mirs Rural 14613 35697
## 3 Sanghar Total 12968 33455
## 4 Naushero Feroz Total 12882 31589
## 5 Larkana Total 12706 30601
## 6 Shaheed Benazirabad Total 12515 31629
## 7 Dadu Total 11598 30428
## 8 Naushero Feroz Rural 10297 28186
## 9 Badin Total 10212 27025
## 10 Ghotki Total 10156 28573
## # ℹ 77 more rows
# Mean and standard deviation of Number for each School.Type/Location pair,
# largest mean first. `.groups = "drop"` returns an ungrouped tibble, so no
# grouping is left behind and summarise() no longer prints its `.groups`
# message.
number_schools %>%
  group_by(School.Type, Location) %>%
  summarise(
    mean = round(mean(Number), 0),
    sd = round(sd(Number), 0),
    .groups = "drop"
  ) %>%
  arrange(desc(mean))
## `summarise()` has grouped output by 'School.Type'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 4
## # Groups: School.Type [4]
## School.Type Location mean sd
## <chr> <chr> <dbl> <dbl>
## 1 Primary Total 19754 37606
## 2 Primary Rural 15358 31639
## 3 Secondary Total 6163 11129
## 4 Primary Urban 4396 8173
## 5 Secondary Urban 3340 7081
## 6 Secondary Rural 2823 5987
## 7 Higher secondary Total 2471 5032
## 8 Middle Total 2110 4286
## 9 Middle Rural 1407 3248
## 10 Higher secondary Rural 1249 2969
## 11 Higher secondary Urban 1222 2583
## 12 Middle Urban 703 1898
# Mean and standard deviation of Number for each Category/Location pair,
# largest mean first. `.groups = "drop"` returns an ungrouped tibble, so no
# grouping is left behind and summarise() no longer prints its `.groups`
# message.
number_schools %>%
  group_by(Category, Location) %>%
  summarise(
    mean = round(mean(Number), 0),
    sd = round(sd(Number), 0),
    .groups = "drop"
  ) %>%
  arrange(desc(mean))
## `summarise()` has grouped output by 'Category'. You can override using the
## `.groups` argument.
## # A tibble: 9 × 4
## # Groups: Category [3]
## Category Location mean sd
## <chr> <chr> <dbl> <dbl>
## 1 ENROLMENT Total 24305 33004
## 2 ENROLMENT Rural 16627 28422
## 3 ENROLMENT Urban 7679 8581
## 4 TEACHING STAFF Total 867 1124
## 5 TEACHING STAFF Rural 526 914
## 6 TEACHING STAFF Urban 341 453
## 7 INSTITUTIONS Total 183 461
## 8 INSTITUTIONS Rural 159 436
## 9 INSTITUTIONS Urban 24 46
# Mean and standard deviation of Number for each Composition/Location pair,
# largest mean first. `.groups = "drop"` returns an ungrouped tibble, so no
# grouping is left behind and summarise() no longer prints its `.groups`
# message.
number_schools %>%
  group_by(Composition, Location) %>%
  summarise(
    mean = round(mean(Number), 0),
    sd = round(sd(Number), 0),
    .groups = "drop"
  ) %>%
  arrange(desc(mean))
## `summarise()` has grouped output by 'Composition'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 4
## # Groups: Composition [4]
## Composition Location mean sd
## <chr> <chr> <dbl> <dbl>
## 1 TOTAL Total 12708 30604
## 2 TOTAL Rural 8682 25105
## 3 MALE Total 7720 19001
## 4 MALE Rural 5578 15899
## 5 FEMALE Total 4899 11854
## 6 TOTAL Urban 4026 8459
## 7 FEMALE Rural 3024 9344
## 8 MALE Urban 2142 4420
## 9 FEMALE Urban 1874 4199
## 10 MIXED Total 267 532
## 11 MIXED Rural 237 506
## 12 MIXED Urban 29 49
# Mean and standard deviation of Number for each Population/Location pair,
# largest mean first. Population is a text column here, so each distinct
# population string forms its own group. `.groups = "drop"` returns an
# ungrouped tibble, so no grouping is left behind and summarise() no longer
# prints its `.groups` message.
number_schools %>%
  group_by(Population, Location) %>%
  summarise(
    mean = round(mean(Number), 0),
    sd = round(sd(Number), 0),
    .groups = "drop"
  ) %>%
  arrange(desc(mean))
## `summarise()` has grouped output by 'Population'. You can override using the
## `.groups` argument.
## # A tibble: 84 × 4
## # Groups: Population [28]
## Population Location mean sd
## <chr> <chr> <dbl> <dbl>
## 1 2,057,057 Total 12968 33455
## 2 1,612,373 Total 12882 31589
## 3 1,524,391 Total 12706 30601
## 4 1,612,847 Total 12515 31629
## 5 2,404,334 Total 11958 31522
## 6 1,550,266 Total 11598 30428
## 7 1,612,373 Rural 10297 28186
## 8 1,804,516 Total 10212 27025
## 9 1,646,318 Total 10156 28573
## 10 2,404,334 Rural 10070 27550
## # ℹ 74 more rows
# Dot plot of Number for each Location, with the Location levels ordered by
# reorder() on Number.
ggplot(sinde, aes(Number, reorder(Location, Number))) +
  geom_point() +
  theme(
    strip.text.y = element_text(angle = 0),
    strip.text = element_text(size = 13),
    axis.text.y = element_text(size = 11),
    plot.title = element_text(hjust = .5, lineheight = .20, size = 15),
    plot.subtitle = element_text(hjust = .5, lineheight = .20, size = 12),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    axis.text.x = element_text(size = 10)
  ) +
  # The argument is `labels`; `label` only worked through partial matching.
  scale_x_continuous(labels = scales::comma_format()) +
  labs(
    title = "Number of schools distribution",
    subtitle = "School distribution across province"
  ) +
  ylab("Schools Location")
# Scatter of Number against No.100000 coloured by Composition, with a linear
# fit, dashed reference lines at the mean of each axis, and text labels on
# the rows with the largest and the smallest Number.
ggplot(sinde, aes(Number, No.100000, color = Composition)) +
  geom_point() +
  # `linewidth` replaces `size` for line geoms (deprecated in ggplot2 3.4.0).
  geom_smooth(
    method = "lm", se = FALSE, color = "black",
    linewidth = 0.2, linetype = "dashed"
  ) +
  # The horizontal line sits on the y axis, so it uses the mean of
  # No.100000; it previously reused the mean of Number, the x variable.
  geom_hline(
    yintercept = mean(sinde$No.100000), linetype = "dashed", linewidth = 1
  ) +
  geom_vline(
    xintercept = mean(sinde$Number), linetype = "dashed", linewidth = 1
  ) +
  annotate("text", x = 14900, y = 250, label = "Mean", size = 5) +
  annotate("text", x = 38000, y = 131, label = "Mean", size = 5) +
  # Label the ten rows with the largest Number.
  geom_text_repel(
    data = sinde %>% top_n(10, Number),
    aes(Number, No.100000, label = Composition),
    color = "red", segment.color = "black"
  ) +
  # The argument is `labels`; `label` only worked through partial matching.
  scale_x_continuous(labels = scales::comma_format()) +
  # Label the rows with the smallest Number (ties are all kept by top_n()).
  geom_text_repel(
    data = sinde %>% top_n(-3, Number),
    aes(Number, No.100000, label = Composition),
    color = "red", segment.color = "black"
  ) +
  theme(
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 13),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    plot.title = element_text(hjust = .5, lineheight = .20, size = 15),
    plot.subtitle = element_text(hjust = .5, lineheight = .20, size = 12)
  ) +
  labs(title = "Number of schools have a linear relationship")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: ggrepel: 155 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Number by Location, coloured by Composition, one panel per Composition
# level.
ggplot(sinde, aes(Number, Location, color = Composition)) +
  geom_point() +
  facet_wrap(~Composition) +
  theme(
    strip.text.y = element_text(angle = 0),
    strip.text = element_text(size = 13),
    axis.text.y = element_text(size = 11),
    plot.title = element_text(hjust = .5, lineheight = .20, size = 15),
    plot.subtitle = element_text(hjust = .5, lineheight = .20, size = 12),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    axis.text.x = element_text(size = 10)
  ) +
  # The argument is `labels`; `label` only worked through partial matching.
  scale_x_continuous(labels = scales::comma_format()) +
  labs(
    title = "Number of schools distribution",
    subtitle = "School distribution across Genders"
  ) +
  ylab("School's Location")
# Number by Location, coloured by Composition, one panel per School.Type.
ggplot(sinde, aes(Number, Location, color = Composition)) +
  geom_point() +
  facet_wrap(~School.Type) +
  theme(
    strip.text.y = element_text(angle = 0),
    strip.text = element_text(size = 13),
    axis.text.y = element_text(size = 11),
    plot.title = element_text(hjust = .5, lineheight = .20, size = 15),
    plot.subtitle = element_text(hjust = .5, lineheight = .20, size = 12),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    axis.text.x = element_text(size = 10)
  ) +
  # The argument is `labels`; `label` only worked through partial matching.
  scale_x_continuous(labels = scales::comma_format()) +
  labs(
    title = "Number of schools distribution",
    subtitle = "School distribution across province"
  ) +
  ylab("Schools Location")