library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
# pacman::p_load(janitor, tidyverse, readr)

#-----------Load database
library(readr)
# Read the measles line list into a tibble.
# The file is not UTF-8: with the default locale, accented text is read as raw
# bytes (e.g. "R\xe9f\xe9rence" in LaboratoryName), so the encoding is declared
# explicitly. NOTE(review): Latin-1 is inferred from the 0xE9 byte for "é";
# confirm with readr::guess_encoding() on the file.
# NOTE(review): the recorded run also reported parsing issues; inspect them
# with problems(db_1) before relying on the columns guessed as logical.
db_1 <- read_csv(
  "../Data/Measles_db_on_30_June_year_2025.csv",
  locale = locale(encoding = "Latin1")
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 23394 Columns: 45
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (30): Date fBirth naissance, DateIspecimenCollecteds, DateSpecimenSentto...
## dbl (10): SpecimenSource, MeaslesIgm, FinalClassification, RubellaIgm, AgeIn...
## lgl  (5): PatientsResidence, EpiLink, VirusIsolation, DateSpecimenRecInLab, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the imported table in the data viewer for a visual check.
tibble::view(x = db_1)

# Number of rows
number_of_rows <- nrow(db_1)

# Number of columns
my_nb_col <- ncol(db_1)

# Number of rows and columns, printed together
dim(db_1)
## [1] 23394    45
# =========================================================
# Work of 19-02-2026
# =========================================================

# List the column names of the imported table.
colnames(db_1)
##  [1] "Date fBirth naissance"       "SpecimenSource"             
##  [3] "DateIspecimenCollecteds"     "DateSpecimenSenttolab"      
##  [5] "MeaslesIgm"                  "FinalClassification"        
##  [7] "RubellaIgm"                  "Sex"                        
##  [9] "DistrictOfresidence"         "DateLabSentResulttodistrict"
## [11] "LabId"                       "DateOfLastvaccination"      
## [13] "AgeInYears"                  "AgeInMonths"                
## [15] "Date OfOn   set"             "SpecimenCondition"          
## [17] "DateLabReceivedSpecimen"     "LaboratoryName"             
## [19] "IdNumber"                    "Age s"                      
## [21] "ReportingDistrict"           "Towncity"                   
## [23] "Urbanrural"                  "DateSeenHealthFacility"     
## [25] "NumberOfVaccinedoses"        "DateHealthfacilitynotified" 
## [27] "ReportingHealthfacility"     "ProvinceOfResidence"        
## [29] "DateSentFormtodistrict"      "DateDistrictRecLabResults"  
## [31] "Inoutpatient"                "Outcome"                    
## [33] "DateRecformdistrict"         "DateReceivedNatlevel"       
## [35] "OtherLabResults"             "PatientsResidence"          
## [37] "DataType"                    "Ward"                       
## [39] "EpiLink"                     "VirusIsolation"             
## [41] "DateSpecimenRecInLab"        "WayofLife"                  
## [43] "OtherCountry"                "CountryCode"                
## [45] "DiseaseCondition"
# Per-column summary: quartiles and NA counts for numeric columns,
# length/class/mode for character columns.
summary(object = db_1)
##  Date fBirth naissance SpecimenSource  DateIspecimenCollecteds
##  Length:23394          Min.   :1.00    Length:23394           
##  Class :character      1st Qu.:2.00    Class :character       
##  Mode  :character      Median :2.00    Mode  :character       
##                        Mean   :1.96                           
##                        3rd Qu.:2.00                           
##                        Max.   :2.00                           
##                        NA's   :11379                          
##  DateSpecimenSenttolab   MeaslesIgm    FinalClassification   RubellaIgm   
##  Length:23394          Min.   :1.000   Min.   :1.000       Min.   :1.000  
##  Class :character      1st Qu.:2.000   1st Qu.:2.000       1st Qu.:2.000  
##  Mode  :character      Median :2.000   Median :4.000       Median :2.000  
##                        Mean   :2.489   Mean   :3.147       Mean   :2.708  
##                        3rd Qu.:3.000   3rd Qu.:4.000       3rd Qu.:4.000  
##                        Max.   :5.000   Max.   :5.000       Max.   :9.000  
##                                                            NA's   :11087  
##      Sex            DistrictOfresidence DateLabSentResulttodistrict
##  Length:23394       Length:23394        Length:23394               
##  Class :character   Class :character    Class :character           
##  Mode  :character   Mode  :character    Mode  :character           
##                                                                    
##                                                                    
##                                                                    
##                                                                    
##     LabId           DateOfLastvaccination   AgeInYears       AgeInMonths     
##  Length:23394       Length:23394          Min.   :  0.000   Min.   :  0.000  
##  Class :character   Class :character      1st Qu.:  2.000   1st Qu.:  0.000  
##  Mode  :character   Mode  :character      Median :  4.000   Median :  0.000  
##                                           Mean   :  6.833   Mean   :  6.877  
##                                           3rd Qu.:  8.000   3rd Qu.:  7.000  
##                                           Max.   :109.000   Max.   :600.000  
##                                           NA's   :1743      NA's   :13141    
##  Date OfOn   set    SpecimenCondition DateLabReceivedSpecimen
##  Length:23394       Min.   :1.000     Length:23394           
##  Class :character   1st Qu.:1.000     Class :character       
##  Mode  :character   Median :1.000     Mode  :character       
##                     Mean   :1.058                            
##                     3rd Qu.:1.000                            
##                     Max.   :2.000                            
##                     NA's   :10700                            
##  LaboratoryName       IdNumber             Age s        ReportingDistrict 
##  Length:23394       Length:23394       Min.   : 0.000   Length:23394      
##  Class :character   Class :character   1st Qu.: 2.000   Class :character  
##  Mode  :character   Mode  :character   Median : 5.000   Mode  :character  
##                                        Mean   : 8.072                     
##                                        3rd Qu.: 9.000                     
##                                        Max.   :90.000                     
##                                        NA's   :17303                      
##    Towncity          Urbanrural        DateSeenHealthFacility
##  Length:23394       Length:23394       Length:23394          
##  Class :character   Class :character   Class :character      
##  Mode  :character   Mode  :character   Mode  :character      
##                                                              
##                                                              
##                                                              
##                                                              
##  NumberOfVaccinedoses DateHealthfacilitynotified ReportingHealthfacility
##  Min.   : 0.00        Length:23394               Length:23394           
##  1st Qu.: 0.00        Class :character           Class :character       
##  Median : 0.00        Mode  :character           Mode  :character       
##  Mean   :13.71                                                          
##  3rd Qu.: 1.00                                                          
##  Max.   :99.00                                                          
##  NA's   :5805                                                           
##  ProvinceOfResidence DateSentFormtodistrict DateDistrictRecLabResults
##  Length:23394        Length:23394           Length:23394             
##  Class :character    Class :character       Class :character         
##  Mode  :character    Mode  :character       Mode  :character         
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##   Inoutpatient     Outcome          DateRecformdistrict DateReceivedNatlevel
##  Min.   :1.000   Length:23394       Length:23394        Length:23394        
##  1st Qu.:2.000   Class :character   Class :character    Class :character    
##  Median :2.000   Mode  :character   Mode  :character    Mode  :character    
##  Mean   :1.934                                                              
##  3rd Qu.:2.000                                                              
##  Max.   :2.000                                                              
##  NA's   :7992                                                               
##  OtherLabResults    PatientsResidence   DataType             Ward          
##  Length:23394       Mode:logical      Length:23394       Length:23394      
##  Class :character   NA's:23394        Class :character   Class :character  
##  Mode  :character                     Mode  :character   Mode  :character  
##                                                                            
##                                                                            
##                                                                            
##                                                                            
##  EpiLink        VirusIsolation DateSpecimenRecInLab  WayofLife        
##  Mode:logical   Mode:logical   Mode:logical         Length:23394      
##  NA's:23394     NA's:23394     NA's:23394           Class :character  
##                                                     Mode  :character  
##                                                                       
##                                                                       
##                                                                       
##                                                                       
##  OtherCountry   CountryCode        DiseaseCondition  
##  Mode:logical   Length:23394       Length:23394      
##  NA's:23394     Class :character   Class :character  
##                 Mode  :character   Mode  :character  
##                                                      
##                                                      
##                                                      
## 
# Compact structure: type and first values of every column, plus the
# column specification stored by readr.
str(object = db_1)
## spc_tbl_ [23,394 × 45] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Date fBirth naissance      : chr [1:23394] "1/19/1999" "8/15/1994" "12/25/2016" "5/7/2024" ...
##  $ SpecimenSource             : num [1:23394] 1 1 1 1 1 1 1 1 1 1 ...
##  $ DateIspecimenCollecteds    : chr [1:23394] NA "1/23/2025" "1/21/2025" "1/26/2025" ...
##  $ DateSpecimenSenttolab      : chr [1:23394] "1/30/2025" "1/30/2025" "1/30/2025" "2/2/2025" ...
##  $ MeaslesIgm                 : num [1:23394] 2 2 2 1 1 1 1 1 1 1 ...
##  $ FinalClassification        : num [1:23394] 4 4 4 1 1 1 1 1 1 1 ...
##  $ RubellaIgm                 : num [1:23394] 2 2 2 4 4 4 4 4 4 4 ...
##  $ Sex                        : chr [1:23394] "2" "2" "1" "1" ...
##  $ DistrictOfresidence        : chr [1:23394] "Annaba" "El Hadjar" "El Hadjar" "ADRAR" ...
##  $ DateLabSentResulttodistrict: chr [1:23394] "1/30/2025" "1/30/2025" "1/30/2025" "2/2/2025" ...
##  $ LabId                      : chr [1:23394] "041-2025" "042-2025" "043-2025" "044-2025" ...
##  $ DateOfLastvaccination      : chr [1:23394] NA NA "1/1/2023" NA ...
##  $ AgeInYears                 : num [1:23394] 26 30 8 NA 1 NA 1 4 3 4 ...
##  $ AgeInMonths                : num [1:23394] NA NA NA 8 NA 9 NA NA NA NA ...
##  $ Date OfOn   set            : chr [1:23394] "1/12/2025" "1/18/2025" "1/19/2025" "1/16/2025" ...
##  $ SpecimenCondition          : num [1:23394] 1 1 1 1 1 1 1 1 1 1 ...
##  $ DateLabReceivedSpecimen    : chr [1:23394] "1/30/2025" "1/30/2025" "1/30/2025" "2/2/2025" ...
##  $ LaboratoryName             : chr [1:23394] "Laboratoire National de R\xe9f\xe9rence pour la Rougeole" "Laboratoire National de R\xe9f\xe9rence pour la Rougeole" "Laboratoire National de R\xe9f\xe9rence pour la Rougeole" "Laboratoire National de R\xe9f\xe9rence pour la Rougeole" ...
##  $ IdNumber                   : chr [1:23394] "ALG-ANB-ANB-25-039" "ALG-ANB-EHD-25-040" "ALG-ANB-EHD-25-041" "ALG-ADR-ADR-25-042" ...
##  $ Age s                      : num [1:23394] NA NA NA NA NA NA NA NA NA NA ...
##  $ ReportingDistrict          : chr [1:23394] NA NA NA NA ...
##  $ Towncity                   : chr [1:23394] NA NA NA NA ...
##  $ Urbanrural                 : chr [1:23394] NA NA NA NA ...
##  $ DateSeenHealthFacility     : chr [1:23394] NA NA NA NA ...
##  $ NumberOfVaccinedoses       : num [1:23394] NA NA 1 NA NA NA NA NA NA NA ...
##  $ DateHealthfacilitynotified : chr [1:23394] NA NA NA NA ...
##  $ ReportingHealthfacility    : chr [1:23394] NA NA NA NA ...
##  $ ProvinceOfResidence        : chr [1:23394] NA NA NA NA ...
##  $ DateSentFormtodistrict     : chr [1:23394] NA NA NA NA ...
##  $ DateDistrictRecLabResults  : chr [1:23394] NA NA NA NA ...
##  $ Inoutpatient               : num [1:23394] NA NA NA NA NA NA NA NA NA NA ...
##  $ Outcome                    : chr [1:23394] NA NA NA NA ...
##  $ DateRecformdistrict        : chr [1:23394] NA NA NA NA ...
##  $ DateReceivedNatlevel       : chr [1:23394] NA NA NA NA ...
##  $ OtherLabResults            : chr [1:23394] NA NA NA NA ...
##  $ PatientsResidence          : logi [1:23394] NA NA NA NA NA NA ...
##  $ DataType                   : chr [1:23394] NA NA NA NA ...
##  $ Ward                       : chr [1:23394] NA NA NA NA ...
##  $ EpiLink                    : logi [1:23394] NA NA NA NA NA NA ...
##  $ VirusIsolation             : logi [1:23394] NA NA NA NA NA NA ...
##  $ DateSpecimenRecInLab       : logi [1:23394] NA NA NA NA NA NA ...
##  $ WayofLife                  : chr [1:23394] NA NA NA NA ...
##  $ OtherCountry               : logi [1:23394] NA NA NA NA NA NA ...
##  $ CountryCode                : chr [1:23394] "ALG" "ALG" "ALG" "ALG" ...
##  $ DiseaseCondition           : chr [1:23394] "Measles" "Measles" "Measles" "Measles" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Date fBirth naissance` = col_character(),
##   ..   SpecimenSource = col_double(),
##   ..   DateIspecimenCollecteds = col_character(),
##   ..   DateSpecimenSenttolab = col_character(),
##   ..   MeaslesIgm = col_double(),
##   ..   FinalClassification = col_double(),
##   ..   RubellaIgm = col_double(),
##   ..   Sex = col_character(),
##   ..   DistrictOfresidence = col_character(),
##   ..   DateLabSentResulttodistrict = col_character(),
##   ..   LabId = col_character(),
##   ..   DateOfLastvaccination = col_character(),
##   ..   AgeInYears = col_double(),
##   ..   AgeInMonths = col_double(),
##   ..   `Date OfOn   set` = col_character(),
##   ..   SpecimenCondition = col_double(),
##   ..   DateLabReceivedSpecimen = col_character(),
##   ..   LaboratoryName = col_character(),
##   ..   IdNumber = col_character(),
##   ..   `Age s` = col_double(),
##   ..   ReportingDistrict = col_character(),
##   ..   Towncity = col_character(),
##   ..   Urbanrural = col_character(),
##   ..   DateSeenHealthFacility = col_character(),
##   ..   NumberOfVaccinedoses = col_double(),
##   ..   DateHealthfacilitynotified = col_character(),
##   ..   ReportingHealthfacility = col_character(),
##   ..   ProvinceOfResidence = col_character(),
##   ..   DateSentFormtodistrict = col_character(),
##   ..   DateDistrictRecLabResults = col_character(),
##   ..   Inoutpatient = col_double(),
##   ..   Outcome = col_character(),
##   ..   DateRecformdistrict = col_character(),
##   ..   DateReceivedNatlevel = col_character(),
##   ..   OtherLabResults = col_character(),
##   ..   PatientsResidence = col_logical(),
##   ..   DataType = col_character(),
##   ..   Ward = col_character(),
##   ..   EpiLink = col_logical(),
##   ..   VirusIsolation = col_logical(),
##   ..   DateSpecimenRecInLab = col_logical(),
##   ..   WayofLife = col_character(),
##   ..   OtherCountry = col_logical(),
##   ..   CountryCode = col_character(),
##   ..   DiseaseCondition = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# First six records.
head(x = db_1, n = 6L)
## # A tibble: 6 × 45
##   `Date fBirth naissance` SpecimenSource DateIspecimenCollecteds
##   <chr>                            <dbl> <chr>                  
## 1 1/19/1999                            1 <NA>                   
## 2 8/15/1994                            1 1/23/2025              
## 3 12/25/2016                           1 1/21/2025              
## 4 5/7/2024                             1 1/26/2025              
## 5 1/10/2024                            1 1/26/2025              
## 6 <NA>                                 1 1/26/2025              
## # ℹ 42 more variables: DateSpecimenSenttolab <chr>, MeaslesIgm <dbl>,
## #   FinalClassification <dbl>, RubellaIgm <dbl>, Sex <chr>,
## #   DistrictOfresidence <chr>, DateLabSentResulttodistrict <chr>, LabId <chr>,
## #   DateOfLastvaccination <chr>, AgeInYears <dbl>, AgeInMonths <dbl>,
## #   `Date OfOn   set` <chr>, SpecimenCondition <dbl>,
## #   DateLabReceivedSpecimen <chr>, LaboratoryName <chr>, IdNumber <chr>,
## #   `Age s` <dbl>, ReportingDistrict <chr>, Towncity <chr>, Urbanrural <chr>, …
# Last six records.
tail(x = db_1, n = 6L)
## # A tibble: 6 × 45
##   `Date fBirth naissance` SpecimenSource DateIspecimenCollecteds
##   <chr>                            <dbl> <chr>                  
## 1 2/22/2022                           NA 5/23/2025              
## 2 5/17/2017                           NA 1/21/2025              
## 3 10/30/2022                          NA 2/18/2025              
## 4 10/11/2022                          NA 3/26/2025              
## 5 5/13/2024                           NA 5/19/2025              
## 6 7/1/2016                            NA 3/17/2025              
## # ℹ 42 more variables: DateSpecimenSenttolab <chr>, MeaslesIgm <dbl>,
## #   FinalClassification <dbl>, RubellaIgm <dbl>, Sex <chr>,
## #   DistrictOfresidence <chr>, DateLabSentResulttodistrict <chr>, LabId <chr>,
## #   DateOfLastvaccination <chr>, AgeInYears <dbl>, AgeInMonths <dbl>,
## #   `Date OfOn   set` <chr>, SpecimenCondition <dbl>,
## #   DateLabReceivedSpecimen <chr>, LaboratoryName <chr>, IdNumber <chr>,
## #   `Age s` <dbl>, ReportingDistrict <chr>, Towncity <chr>, Urbanrural <chr>, …
# Pipe operators: %>% (magrittr) and |> (native R pipe)

# Records whose IdNumber occurs more than once, as reported by
# janitor::get_dupes().
milse <- get_dupes(db_1, IdNumber)

# Share of each raw Sex code among the non-missing values.
db_1$Sex |>
  table() |>
  prop.table()
## 
##            1            2            f            F       Female            m 
## 0.0105608000 0.0098711151 0.0007758955 0.4623475150 0.0002586318 0.0005172637 
##            M         Male 
## 0.5152377258 0.0004310531
# Harmonise the Sex coding: the raw column mixes numeric codes, single
# letters and full words for the same category.
# Male variants are collapsed to "Male".
db_1$Sex[db_1$Sex %in% c("1", "m", "Male", "M", "Masculin")] <- "Male"
# Female variants are collapsed to "Female"; without this step "2", "f", "F"
# and "Female" would remain four separate categories.
# NOTE(review): assumes the numeric coding is 1 = male / 2 = female, mirroring
# the "1" -> "Male" mapping above; confirm against the data dictionary.
db_1$Sex[db_1$Sex %in% c("2", "f", "F", "Female")] <- "Female"

# Rename the EpiLink column to EpiLink_Milse (no change if it is absent).
names(db_1) <- replace(
  names(db_1),
  names(db_1) == "EpiLink",
  "EpiLink_Milse"
)

# 1) Count the records per country code; missing codes are excluded.
my_tab <- table(db_1$CountryCode, useNA = "no")

# 2) Build the data frame: one row per country code with its count.
df <- data.frame(
  CountryCode = names(my_tab),
  n = as.integer(my_tab),
  stringsAsFactors = FALSE
)

# 3) Sort by ascending count (base-R counterpart of arrange(n));
#    pass decreasing = TRUE to order() for the reverse order.
df <- df[order(df$n), ]

# 4) Use the sorted order as the factor levels (base-R counterpart of
#    fct_reorder(CountryCode, n)) so the sort order is kept when plotting.
df$CountryCode <- factor(x = df$CountryCode, levels = df$CountryCode)



# df <- db_1 %>% 
#   count(CountryCode) %>% 
#   ungroup() %>% 
#   # mutate(prop = n/sum(n), percentage = 100*n/sum(n), label_perc = scales::percent(x = round(prop, digits = 2))) %>% 
#   arrange(n) %>% 
#   mutate(CountryCode = fct_reorder(CountryCode, n))



# Horizontal bar chart of the record count per country code, with the count
# printed as a label at the end of each bar.
ggplot2::ggplot(df, aes(x = n, y = CountryCode)) +
  geom_col(fill = "tomato") + # , show.legend = FALSE
  geom_label(
    mapping = aes(label = n),
    fill = "tomato",
    colour = "white",
    fontface = "bold",
    size = 4,
    label.size = 0.3,
    hjust = -0.01
  ) +
  # scale_x_continuous(limits = c(0, max(df$n, na.rm = TRUE) + 2)) +
  theme_bw() +
  theme(
    plot.background = element_blank(),
    title = element_text(face = "bold", size = 14),
    plot.caption = element_text(face = "italic", size = 12),
    # axis.title = element_text(size = 10),
    # axis.title.y = element_text(size = 10),
    axis.title.x = element_text(size = 12),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10)
  ) +
  labs(
    x = "Cases",
    y = "Countries",
    title = "Distribution of suspected cases of Measles in WA countries, 2025",
    subtitle = "These data do not reflect reality, they are for educational purposes only.",
    caption = "Data source: WHO AFRO (IST WA)"
  )