Importing Data and Necessary Packages

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.4.1     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.8
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(ggplot2)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(readr)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Recoding Variables and Cleaning Data

Renaming the Influenza Transmission Zones (ITZ)

There are 18 Influenza Transmission Zones, as defined by the WHO. You can find a map of the zones and lists of the countries/areas/territories included in each zone here: https://cdn.who.int/media/docs/default-source/influenza/influenza-transmission-zones20180914.pdf

# Load the FluNet extract; read_csv() guesses the column types from the file.
data <- read_csv(file = "flunet_2017_2022.csv")

## Rows: 53650 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): WHOREGION, ITZ, COUNTRY/AREA/TERRITORY, COUNTRY_CODE, ORIGIN_SOURCE
## dbl (18): ISO_YEAR, ISO_WEEK, ISO_SDATE, SPEC_RECEIVED_NB, SPEC_PROCESSED_NB...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Lookup from the ITZ codes stored in the ITZ column to readable zone names.
itz_names <- c(
  FLU_NRT_AMR = "North America",
  FLU_CNT_AMC = "Central America and Caribbean",
  FLU_TEMP_SAMR = "Temperate South America",
  FLU_TRP_SAMR = "Tropical South America",
  FLU_NTH_EUR = "Northern Europe",
  FLU_SW_EUR = "South West Europe",
  FLU_EST_EUR = "Eastern Europe",
  FLU_NRT_AFR = "Northern Africa",
  FLU_EST_AFR = "Eastern Africa",
  FLU_MID_AFR = "Middle Africa",
  FLU_WST_AFR = "Western Africa",
  FLU_STH_AFR = "Southern Africa",
  FLU_WST_ASIA = "Western Asia",
  FLU_CNT_ASIA = "Central Asia",
  FLU_EST_ASIA = "Eastern Asia",
  FLU_STH_ASIA = "Southern Asia",
  FLU_SE_ASIA = "South-East Asia",
  FLU_OCE_MEL_POL = "Oceania Melanesia and Polynesia"
)

# Replace each code with its zone name; values without an entry in the
# lookup are left as they are.
data <- data %>%
  mutate(ITZ = recode(ITZ, !!!itz_names))

# Treat missing numeric values as zero. Only numeric columns are touched:
# writing a numeric 0 into a character column of a tibble is a type error,
# so the identifier columns are deliberately left alone.
data <- data %>%
  mutate(across(where(is.numeric), ~ replace_na(.x, 0)))

Temporal Trends in FluNet Data

# Specimens processed per transmission zone, ISO week and ISO year.
# `.groups = "drop"` returns an ungrouped tibble, so no leftover grouping
# leaks into later steps and the summarise() grouping message goes away.
by_week_and_region <- data %>%
  group_by(ITZ, ISO_WEEK, ISO_YEAR) %>%
  summarize(
    SPEC_PROCESSED_WEEKLY = sum(SPEC_PROCESSED_NB),
    .groups = "drop"
  )

## `summarise()` has grouped output by 'ITZ', 'ISO_WEEK'. You can override using
## the `.groups` argument.

# Specimens processed per transmission zone and ISO_SDATE value.
# `.groups = "drop"` returns an ungrouped tibble, so no leftover grouping
# leaks into later steps and the summarise() grouping message goes away.
by_date_and_region <- data %>%
  group_by(ITZ, ISO_SDATE) %>%
  summarize(
    SPEC_PROCESSED_WEEKLY = sum(SPEC_PROCESSED_NB),
    .groups = "drop"
  )

## `summarise()` has grouped output by 'ITZ'. You can override using the `.groups`
## argument.

# Total specimens processed across all rows for each ISO_SDATE / ISO week /
# ISO year combination. The column order (ISO_SDATE, ISO_WEEK, ISO_YEAR,
# SPEC_PROCESSED) is unchanged; only the residual grouping is dropped.
by_week <- data %>%
  group_by(ISO_SDATE, ISO_WEEK, ISO_YEAR) %>%
  summarize(
    SPEC_PROCESSED = sum(SPEC_PROCESSED_NB),
    .groups = "drop"
  )

## `summarise()` has grouped output by 'ISO_SDATE', 'ISO_WEEK'. You can override
## using the `.groups` argument.

# Copy of the weekly totals with the year stored as text, so ggplot treats
# it as a discrete variable (one line and one palette colour per year).
# The column is addressed by name rather than by position, so the step does
# not depend on the column order of by_week.
by_week2 <- by_week %>%
  ungroup() %>%
  mutate(ISO_YEAR = as.character(ISO_YEAR))

# One line per year: specimens processed against ISO week.
f1 <- ggplot(
  by_week2,
  aes(
    x = ISO_WEEK,
    y = SPEC_PROCESSED,
    group = ISO_YEAR,
    color = ISO_YEAR
  )
) +
  geom_line() +
  scale_color_brewer("Year", palette = "Dark2") +
  labs(
    x = "Week",
    y = "Specimens Processed",
    title = "Specimens Processed from 2017-2022"
  ) +
  theme_classic()
ggplotly(f1)

# Annual specimens processed per country/area/territory (with its zone).
# `.groups = "drop"` returns an ungrouped tibble, so no leftover grouping
# leaks into later steps and the summarise() grouping message goes away.
by_year_and_country <- data %>%
  group_by(`COUNTRY/AREA/TERRITORY`, ISO_YEAR, ITZ) %>%
  summarize(
    SPEC_PROCESSED_ANNUAL = sum(SPEC_PROCESSED_NB),
    .groups = "drop"
  )

## `summarise()` has grouped output by 'COUNTRY/AREA/TERRITORY', 'ISO_YEAR'. You
## can override using the `.groups` argument.

# Annual specimens processed per transmission zone.
# `.groups = "drop"` returns an ungrouped tibble, so no leftover grouping
# leaks into later steps and the summarise() grouping message goes away.
by_year_and_region <- data %>%
  group_by(ISO_YEAR, ITZ) %>%
  summarize(
    SPEC_PROCESSED_ANNUAL = sum(SPEC_PROCESSED_NB),
    .groups = "drop"
  )

## `summarise()` has grouped output by 'ISO_YEAR'. You can override using the
## `.groups` argument.

Specimens Processed by Each Region

# Zero-fill any missing entries in the zone-by-year totals.
by_year_and_region[is.na(by_year_and_region)] <- 0

# NOTE(review): the hard-coded total below presumably removes one extreme
# zone-year so the remaining panels stay readable - confirm which row it
# is meant to target; the filter silently does nothing if the data change.
by_year_and_region2 <- filter(
  by_year_and_region,
  SPEC_PROCESSED_ANNUAL != 6434371
)

# Annual specimens processed, one panel per transmission zone.
f2 <- by_year_and_region2 %>%
  ggplot(aes(x = ISO_YEAR, y = SPEC_PROCESSED_ANNUAL)) +
  geom_line() +
  facet_wrap(vars(ITZ)) +
  labs(x = "Year", y = "Specimens Processed") +
  theme_bw()
ggplotly(f2)

# Zero-fill any missing entries in the country-by-year totals.
by_year_and_country[is.na(by_year_and_country)] <- 0

# Keep only country-years with at most 20000 specimens processed.
by_year_and_country2 <- by_year_and_country %>%
  filter(SPEC_PROCESSED_ANNUAL <= 20000)

# Annual specimens processed per country, one panel per transmission zone.
# Stored as f3 (the original reused the name f2 and overwrote the zone-level
# plot object), which keeps the f1..f5 numbering consistent.
f3 <- ggplot(
  by_year_and_country2,
  aes(
    x = ISO_YEAR,
    y = SPEC_PROCESSED_ANNUAL,
    color = `COUNTRY/AREA/TERRITORY`
  )
) +
  geom_line() +
  facet_wrap(vars(ITZ)) +
  labs(x = "Year", y = "Specimens Processed") +
  theme_bw() +
  # One colour per country makes the legend too large to be useful.
  theme(legend.position = "none")
ggplotly(f3)

# Stacked bars of annual specimens processed, one segment per zone.
f4 <- by_year_and_region %>%
  ggplot(aes(x = ISO_YEAR, y = SPEC_PROCESSED_ANNUAL, fill = ITZ)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(
    x = "Year",
    y = "Specimens Processed",
    fill = "Influenza Transmission Zone (ITZ)"
  ) +
  theme_bw()
ggplotly(f4)

# Country-years with at most 20000 specimens processed.
by_year_and_country2 <- filter(
  by_year_and_country,
  SPEC_PROCESSED_ANNUAL <= 20000
)

# Distribution of annual country totals within each transmission zone.
# The x-axis labels are blanked; the colour legend identifies the zones.
f5 <- by_year_and_country2 %>%
  ggplot(aes(x = ITZ, y = SPEC_PROCESSED_ANNUAL, color = ITZ)) +
  geom_boxplot() +
  labs(x = "", y = "Specimens Processed") +
  theme_bw() +
  theme(axis.text.x = element_blank())
ggplotly(f5)

## Warning: The following aesthetics were dropped during statistical transformation:
## y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Preparing for Analysis of Contextual Factors

# Load the country-level contextual indicators (long format: one row per
# series and country, with the value stored as text).
cf <- read_csv(file = "Country_Indicators_Thesis.csv")

## Rows: 3779 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Series Name, Series Code, Country Name, Country Code, Values
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Fix country names to match FluNet.
# Still need to figure out what to do with the UK.
# Lookup: name as it appears in the indicator file -> name to use instead.
cf_name_fixes <- c(
  "Antigua & Barbuda" = "Antigua and Barbuda",
  "Bahamas, The" = "Bahamas",
  "Bolivia" = "Bolivia (Plurinational State of)",
  "Bosnia and Hercegovina" = "Bosnia and Herzegovina",
  "Czech Republic" = "Czechia",
  "North Korea" = "Democratic People's Republic of Korea",
  "Korea, Dem. People's Rep." = "Democratic People's Republic of Korea",
  "Korea (Democratic People's Rep. of)" = "Democratic People's Republic of Korea",
  "Congo (Democratic Republic)" = "Democratic Republic of the Congo",
  "Congo, Dem. Rep." = "Democratic Republic of the Congo",
  "Egypt, Arab Rep." = "Egypt",
  "Iran" = "Iran (Islamic Republic of)",
  "Iran, Islamic Rep." = "Iran (Islamic Republic of)",
  "Kosovo" = "Kosovo (in accordance with UN Security Council resolution 1244 (1999))",
  "Kyrgyz Republic" = "Kyrgyzstan",
  "Laos" = "Lao People's Democratic Republic",
  "Lao PDR" = "Lao People's Democratic Republic",
  "Moldova" = "Republic of Moldova",
  "Cote d'Ivoire" = "Côte d'Ivoire",
  "Netherlands" = "Netherlands (Kingdom of the)",
  "South Korea" = "Republic of Korea",
  "Korea, Rep." = "Republic of Korea",
  "Korea (Republic of)" = "Republic of Korea",
  "Russia" = "Russian Federation",
  "St Kitts & Nevis" = "Saint Kitts and Nevis",
  "St. Kitts and Nevis" = "Saint Kitts and Nevis",
  "St Lucia" = "Saint Lucia",
  "St. Lucia" = "Saint Lucia",
  "St Vincent & The Grenadines" = "Saint Vincent and the Grenadines",
  "St. Vincent and the Grenadines" = "Saint Vincent and the Grenadines",
  "Slovak Republic" = "Slovakia",
  "Syria" = "Syrian Arab Republic",
  "Tanzania" = "United Republic of Tanzania",
  "Turkey" = "Türkiye",
  "Turkiye" = "Türkiye",
  "United Kingdom of Great Britain and Northern Ireland" = "United Kingdom",
  "Venezuela" = "Venezuela (Bolivarian Republic of)",
  "Venezuela, RB" = "Venezuela (Bolivarian Republic of)",
  "Vietnam" = "Viet Nam"
)

# Names without an entry in the lookup are kept unchanged.
cf <- cf %>%
  mutate(`Country Name` = recode(`Country Name`, !!!cf_name_fixes))

# Reshape the indicators so each row is one country and each series is its
# own column.
cf1 <- cf %>%
  select(-`Series Code`, -`Country Code`) %>%
  pivot_wider(names_from = `Series Name`, values_from = Values)

# Keep only countries that have an entry for this indicator. The original
# note says this removes countries that did not submit to FluNet -
# presumably the series was only exported for FluNet countries; verify.
# This runs before ".." is converted, so ".." entries do not drop a row.
cf1 <- cf1 %>%
  drop_na(`Domestic general government health expenditure (% of general government expenditure)`)

# Replace the ".." placeholder with NA and convert every indicator column
# to numeric. Context values can have NA but not 0; specimen values can
# have 0 but not NA.
# Columns are selected by name (everything except the country name) instead
# of the fixed positions 2:22, so the step keeps working if the number of
# indicator series changes.
cf1 <- cf1 %>%
  mutate(across(-`Country Name`, ~ as.numeric(na_if(.x, ".."))))

# Annual specimens processed per country (with its ITZ); pivoted to one
# column per year in the next step.
# `.groups = "drop"` returns an ungrouped tibble, so the reshape that follows
# does not operate on a grouped data frame.
data2 <- data %>%
  group_by(`COUNTRY/AREA/TERRITORY`, ISO_YEAR, ITZ) %>%
  summarize(
    SPEC_PROCESSED_ANNUAL = sum(SPEC_PROCESSED_NB),
    .groups = "drop"
  )

## `summarise()` has grouped output by 'COUNTRY/AREA/TERRITORY', 'ISO_YEAR'. You
## can override using the `.groups` argument.

# Spread the annual totals into one "Specimens Processed in <year>" column
# per year, one row per country.
data2 <- pivot_wider(
  data2,
  names_from = ISO_YEAR,
  values_from = SPEC_PROCESSED_ANNUAL,
  names_prefix = "Specimens Processed in "
)
# Country-years absent from the data come out of the reshape as NA; count
# them as zero specimens.
data2[is.na(data2)] <- 0

# Attach the contextual indicators to the yearly specimen totals, matching
# on country name. merge() keeps only countries present in both tables.
data3 <- merge(
  data2,
  cf1,
  by.x = "COUNTRY/AREA/TERRITORY",
  by.y = "Country Name"
)

Creating a Data Table Based on Rankings of Values

# Replace every column of data3 with the rank of its values; tied values
# share the lowest rank of their group. Character columns (country name,
# ITZ) are ranked as well, by their sort order.
# na.last = "keep" leaves missing indicator values as NA. With the default
# (na.last = TRUE) they would be given the highest ranks, as if they were
# the largest observed values.
data4 <- as_tibble(
  lapply(data3, rank, na.last = "keep", ties.method = "min")
)

Regression Analysis of Contextual Factors

Domestic general government health expenditure (% of general government expenditure)

# Linear model of government health expenditure share on the number of
# specimens processed in 2022; prints the fit summary.
data3 %>%
  lm(
    formula = `Domestic general government health expenditure (% of general government expenditure)` ~
      `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Domestic general government health expenditure (% of general government expenditure)` ~ 
##     `Specimens Processed in 2022`, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.8206 -3.4613 -0.3288  2.8199 13.7624 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.045e+01  4.121e-01  25.344  < 2e-16 ***
## `Specimens Processed in 2022` 1.090e-05  3.203e-06   3.404 0.000864 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.691 on 141 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.07595,    Adjusted R-squared:  0.0694 
## F-statistic: 11.59 on 1 and 141 DF,  p-value: 0.0008639

# Scatter of 2022 specimens (log10 y-axis) against government health
# expenditure share, with a fitted straight line.
data3 %>%
  ggplot(mapping = aes(
    x = `Domestic general government health expenditure (% of general government expenditure)`,
    y = `Specimens Processed in 2022`
  )) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 20 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 9 rows containing missing values (`geom_point()`).

Current health expenditure (% of GDP)

# Linear model of current health expenditure (% of GDP) on the number of
# specimens processed in 2022; prints the fit summary.
data3 %>%
  lm(
    formula = `Current health expenditure (% of GDP)` ~
      `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Current health expenditure (% of GDP)` ~ `Specimens Processed in 2022`, 
##     data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1457 -1.8637 -0.0041  1.5331  7.0219 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   6.198e+00  1.987e-01  31.188  < 2e-16 ***
## `Specimens Processed in 2022` 6.728e-06  1.544e-06   4.356 2.53e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.262 on 141 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.1186, Adjusted R-squared:  0.1124 
## F-statistic: 18.98 on 1 and 141 DF,  p-value: 2.53e-05

# Scatter of 2022 specimens (log10 y-axis) against current health
# expenditure (% of GDP), with a fitted straight line.
data3 %>%
  ggplot(mapping = aes(
    x = `Current health expenditure (% of GDP)`,
    y = `Specimens Processed in 2022`
  )) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 20 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 9 rows containing missing values (`geom_point()`).

Births attended by skilled health staff (% of total)

# Linear model of births attended by skilled health staff on the number of
# specimens processed in 2022; prints the fit summary.
data3 %>%
  lm(
    formula = `Births attended by skilled health staff (% of total)` ~
      `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Births attended by skilled health staff (% of total)` ~ 
##     `Specimens Processed in 2022`, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -54.407   1.488   5.231   6.434   6.797 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   9.320e+01  1.714e+00  54.375   <2e-16 ***
## `Specimens Processed in 2022` 1.066e-05  1.051e-05   1.014    0.314    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.34 on 67 degrees of freedom
##   (83 observations deleted due to missingness)
## Multiple R-squared:  0.01511,    Adjusted R-squared:  0.0004106 
## F-statistic: 1.028 on 1 and 67 DF,  p-value: 0.3143

# Scatter of 2022 specimens (log10 y-axis) against births attended by
# skilled health staff, with a fitted straight line.
data3 %>%
  ggplot(mapping = aes(
    x = `Births attended by skilled health staff (% of total)`,
    y = `Specimens Processed in 2022`
  )) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 90 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 83 rows containing missing values (`geom_point()`).

Prevalence of HIV, total (% of population ages 15-49)

# Linear model of HIV prevalence on the number of specimens processed in
# 2022; prints the fit summary.
data3 %>%
  lm(
    formula = `Prevalence of HIV, total (% of population ages 15-49)` ~
      `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Prevalence of HIV, total (% of population ages 15-49)` ~ 
##     `Specimens Processed in 2022`, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1410 -1.0386 -0.8360 -0.1414 17.3894 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    1.241e+00  2.645e-01   4.693 7.73e-06 ***
## `Specimens Processed in 2022` -2.908e-06  2.745e-06  -1.060    0.292    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.663 on 111 degrees of freedom
##   (39 observations deleted due to missingness)
## Multiple R-squared:  0.01001,    Adjusted R-squared:  0.001096 
## F-statistic: 1.123 on 1 and 111 DF,  p-value: 0.2916

# Scatter of 2022 specimens (log10 y-axis) against HIV prevalence, with a
# fitted straight line.
data3 %>%
  ggplot(mapping = aes(
    x = `Prevalence of HIV, total (% of population ages 15-49)`,
    y = `Specimens Processed in 2022`
  )) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 43 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 39 rows containing missing values (`geom_point()`).

Antiretroviral therapy coverage (% of people living with HIV)

# Linear model of antiretroviral therapy coverage on the number of
# specimens processed in 2022; prints the fit summary.
data3 %>%
  lm(
    formula = `Antiretroviral therapy coverage (% of people living with HIV)` ~
      `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Antiretroviral therapy coverage (% of people living with HIV)` ~ 
##     `Specimens Processed in 2022`, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -46.758 -10.658   2.358  12.255  30.237 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   5.757e+01  1.795e+00  32.068  < 2e-16 ***
## `Specimens Processed in 2022` 5.834e-05  1.853e-05   3.148  0.00213 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.79 on 107 degrees of freedom
##   (43 observations deleted due to missingness)
## Multiple R-squared:  0.08478,    Adjusted R-squared:  0.07623 
## F-statistic: 9.912 on 1 and 107 DF,  p-value: 0.002128

# Scatter of 2022 specimens (log10 y-axis) against antiretroviral therapy
# coverage, with a fitted straight line.
data3 %>%
  ggplot(mapping = aes(
    x = `Antiretroviral therapy coverage (% of people living with HIV)`,
    y = `Specimens Processed in 2022`
  )) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 43 rows containing missing values (`geom_point()`).

Immunization, measles (% of children ages 12-23 months)

# Linear model of measles immunization coverage on the number of specimens
# processed in 2022; prints the fit summary.
data3 %>%
  lm(
    formula = `Immunization, measles (% of children ages 12-23 months)` ~
      `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Immunization, measles (% of children ages 12-23 months)` ~ 
##     `Specimens Processed in 2022`, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -54.812  -4.572   5.154   8.764  11.200 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   8.780e+01  1.172e+00  74.922   <2e-16 ***
## `Specimens Processed in 2022` 1.482e-05  9.200e-06   1.611    0.109    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.48 on 144 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.01771,    Adjusted R-squared:  0.01089 
## F-statistic: 2.596 on 1 and 144 DF,  p-value: 0.1093

# Scatter of 2022 specimens (log10 y-axis) against measles immunization
# coverage, with a fitted straight line.
data3 %>%
  ggplot(mapping = aes(
    x = `Immunization, measles (% of children ages 12-23 months)`,
    y = `Specimens Processed in 2022`
  )) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 6 rows containing missing values (`geom_point()`).

Immunization, DPT (% of children ages 12-23 months)

# Linear model of DPT immunization coverage on the number of specimens
# processed in 2022; prints the fit summary.
data3 %>%
  lm(
    formula = `Immunization, DPT (% of children ages 12-23 months)` ~
      `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Immunization, DPT (% of children ages 12-23 months)` ~ 
##     `Specimens Processed in 2022`, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -46.570  -3.588   3.601   8.390  10.441 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   8.856e+01  1.038e+00  85.302   <2e-16 ***
## `Specimens Processed in 2022` 1.363e-05  8.151e-06   1.672   0.0967 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.95 on 144 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.01905,    Adjusted R-squared:  0.01224 
## F-statistic: 2.796 on 1 and 144 DF,  p-value: 0.09665

# Scatter of 2022 specimens (log10 y-axis) against DPT immunization
# coverage, with a fitted straight line.
data3 %>%
  ggplot(mapping = aes(
    x = `Immunization, DPT (% of children ages 12-23 months)`,
    y = `Specimens Processed in 2022`
  )) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 6 rows containing missing values (`geom_point()`).

Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)

# Linear model of the share of deaths from communicable diseases and
# maternal, prenatal and nutrition conditions on the number of specimens
# processed in 2022; prints the fit summary.
data3 %>%
  lm(
    formula = `Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)` ~
      `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)` ~ 
##     `Specimens Processed in 2022`, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.802 -12.318  -6.969   4.497  46.006 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    1.924e+01  1.549e+00  12.421   <2e-16 ***
## `Specimens Processed in 2022` -2.779e-05  1.208e-05  -2.302   0.0228 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.68 on 142 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.03596,    Adjusted R-squared:  0.02917 
## F-statistic: 5.297 on 1 and 142 DF,  p-value: 0.02281

# Scatter of 2022 specimens (log10 y-axis) against the share of deaths from
# communicable diseases and maternal, prenatal and nutrition conditions,
# with a fitted straight line.
data3 %>%
  ggplot(mapping = aes(
    x = `Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)`,
    y = `Specimens Processed in 2022`
  )) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 8 rows containing missing values (`geom_point()`).

Life expectancy at birth, total (years)

# Linear model of life expectancy at birth on the number of specimens
# processed in 2022; prints the fit summary.
data3 %>%
  lm(
    formula = `Life expectancy at birth, total (years)` ~
      `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Life expectancy at birth, total (years)` ~ `Specimens Processed in 2022`, 
##     data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.014  -3.485   1.185   4.765  11.455 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   7.290e+01  5.940e-01 122.724  < 2e-16 ***
## `Specimens Processed in 2022` 1.763e-05  4.711e-06   3.743 0.000261 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.912 on 147 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.08699,    Adjusted R-squared:  0.08078 
## F-statistic: 14.01 on 1 and 147 DF,  p-value: 0.0002606

# Scatter of 2022 specimens (log10 y-axis) against life expectancy at
# birth, with a fitted straight line.
data3 %>%
  ggplot(mapping = aes(
    x = `Life expectancy at birth, total (years)`,
    y = `Specimens Processed in 2022`
  )) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 16 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 3 rows containing missing values (`geom_point()`).

GDP per capita (current US$)

# Linear model of GDP per capita on the number of specimens processed in
# 2022; prints the fit summary.
data3 %>%
  lm(
    formula = `GDP per capita (current US$)` ~
      `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `GDP per capita (current US$)` ~ `Specimens Processed in 2022`, 
##     data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29505 -13177  -8874   2738 101187 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.497e+04  1.902e+03   7.870 7.74e-13 ***
## `Specimens Processed in 2022` 5.036e-02  1.493e-02   3.373 0.000954 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21880 on 144 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.07323,    Adjusted R-squared:  0.0668 
## F-statistic: 11.38 on 1 and 144 DF,  p-value: 0.0009544

# Scatter of 2022 specimens (log10 y-axis) against GDP per capita, with a
# fitted straight line.
data3 %>%
  ggplot(mapping = aes(
    x = `GDP per capita (current US$)`,
    y = `Specimens Processed in 2022`
  )) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 21 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 6 rows containing missing values (`geom_point()`).

Access to electricity (% of population)

# Linear model of access to electricity on the number of specimens
# processed in 2022; prints the fit summary.
data3 %>%
  lm(
    formula = `Access to electricity (% of population)` ~
      `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Access to electricity (% of population)` ~ `Specimens Processed in 2022`, 
##     data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -80.527   2.236  12.018  12.640  12.783 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   8.722e+01  1.996e+00  43.689   <2e-16 ***
## `Specimens Processed in 2022` 2.830e-05  1.594e-05   1.776   0.0778 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.4 on 149 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.02072,    Adjusted R-squared:  0.01415 
## F-statistic: 3.153 on 1 and 149 DF,  p-value: 0.07784

# Scatter of 2022 specimens (log10 y-axis) against access to electricity,
# with a fitted straight line.
data3 %>%
  ggplot(mapping = aes(
    x = `Access to electricity (% of population)`,
    y = `Specimens Processed in 2022`
  )) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 1 rows containing missing values (`geom_point()`).

Literacy rate, adult total (% of people ages 15 and above)

# Linear model of adult literacy rate on the number of specimens processed
# in 2022; prints the fit summary.
data3 %>%
  lm(
    formula = `Literacy rate, adult total (% of people ages 15 and above)` ~
      `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Literacy rate, adult total (% of people ages 15 and above)` ~ 
##     `Specimens Processed in 2022`, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.843  -2.844   3.509   6.466   9.210 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   93.1982757  2.6191591  35.583   <2e-16 ***
## `Specimens Processed in 2022` -0.0001152  0.0001507  -0.764    0.453    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.66 on 23 degrees of freedom
##   (127 observations deleted due to missingness)
## Multiple R-squared:  0.02475,    Adjusted R-squared:  -0.01765 
## F-statistic: 0.5837 on 1 and 23 DF,  p-value: 0.4526

# Scatter of 2022 specimens (log10 y-axis) against adult literacy rate,
# with a fitted straight line.
data3 %>%
  ggplot(mapping = aes(
    x = `Literacy rate, adult total (% of people ages 15 and above)`,
    y = `Specimens Processed in 2022`
  )) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 127 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 127 rows containing missing values (`geom_point()`).

Primary completion rate, total (% of relevant age group)

# Linear model of primary completion rate on the number of specimens
# processed in 2022; prints the fit summary.
data3 %>%
  lm(
    formula = `Primary completion rate, total (% of relevant age group)` ~
      `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Primary completion rate, total (% of relevant age group)` ~ 
##     `Specimens Processed in 2022`, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.364  -4.757   2.790   7.759  27.252 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   9.297e+01  1.321e+00   70.38   <2e-16 ***
## `Specimens Processed in 2022` 2.860e-05  1.498e-05    1.91   0.0591 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.35 on 98 degrees of freedom
##   (52 observations deleted due to missingness)
## Multiple R-squared:  0.03588,    Adjusted R-squared:  0.02604 
## F-statistic: 3.647 on 1 and 98 DF,  p-value: 0.05909

# Scatter of 2022 specimens (log10 y-axis) against primary completion rate,
# with a fitted straight line.
data3 %>%
  ggplot(mapping = aes(
    x = `Primary completion rate, total (% of relevant age group)`,
    y = `Specimens Processed in 2022`
  )) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 57 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 52 rows containing missing values (`geom_point()`).

Unemployment, total (% of total labor force) (modeled ILO estimate)

# Linear model of unemployment rate on the number of specimens processed in
# 2022; prints the fit summary.
data3 %>%
  lm(
    formula = `Unemployment, total (% of total labor force) (modeled ILO estimate)` ~
      `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Unemployment, total (% of total labor force) (modeled ILO estimate)` ~ 
##     `Specimens Processed in 2022`, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.472 -3.028 -1.485  1.954 21.896 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    6.577e+00  4.216e-01  15.600   <2e-16 ***
## `Specimens Processed in 2022` -2.151e-07  3.253e-06  -0.066    0.947    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.758 on 139 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  3.147e-05,  Adjusted R-squared:  -0.007163 
## F-statistic: 0.004374 on 1 and 139 DF,  p-value: 0.9474

# Scatter of 2022 specimens (log10 y-axis) against unemployment rate, with
# a fitted straight line.
data3 %>%
  ggplot(mapping = aes(
    x = `Unemployment, total (% of total labor force) (modeled ILO estimate)`,
    y = `Specimens Processed in 2022`
  )) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 18 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 11 rows containing missing values (`geom_point()`).

Urban population (% of total population)

# Linear model of urban population share on the number of specimens
# processed in 2022; prints the fit summary.
data3 %>%
  lm(
    formula = `Urban population (% of total population)` ~
      `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Urban population (% of total population)` ~ `Specimens Processed in 2022`, 
##     data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.197 -16.942  -0.957  15.612  40.421 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   5.958e+01  1.828e+00  32.598  < 2e-16 ***
## `Specimens Processed in 2022` 4.314e-05  1.454e-05   2.966  0.00352 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.35 on 148 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.05611,    Adjusted R-squared:  0.04974 
## F-statistic: 8.799 on 1 and 148 DF,  p-value: 0.003516

# Scatter plot of specimens processed (y, log10 axis) against the
# urban-population share (x), overlaid with a linear-model smoother.
# ggplot2 applies the log10 scale before the smoother is computed, so rows
# whose y value is not finite after the transform are dropped by the stat.
ggplot(
  data3,
  mapping = aes(
    x = `Urban population (% of total population)`,
    y = `Specimens Processed in 2022`
  )
) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 18 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 2 rows containing missing values (`geom_point()`).

Agricultural land (% of land area)

# Simple linear regression with the agricultural-land share as the response
# and `Specimens Processed in 2022` as the single predictor; the fitted
# model is piped straight into summary() so the coefficient table prints.
# NOTE(review): the indicator is the response here, while the scatter plot
# for this indicator puts it on the x-axis -- confirm this is intended.
data3 %>%
  lm(
    formula =
      `Agricultural land (% of land area)` ~
        `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Agricultural land (% of land area)` ~ `Specimens Processed in 2022`, 
##     data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.955 -17.137   1.341  14.856  42.398 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    3.849e+01  1.778e+00  21.645   <2e-16 ***
## `Specimens Processed in 2022` -1.405e-05  1.420e-05  -0.989    0.324    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.84 on 149 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.006525,   Adjusted R-squared:  -0.0001423 
## F-statistic: 0.9787 on 1 and 149 DF,  p-value: 0.3241

# Scatter plot of specimens processed (y, log10 axis) against the
# agricultural-land share (x), overlaid with a linear-model smoother.
# ggplot2 applies the log10 scale before the smoother is computed, so rows
# whose y value is not finite after the transform are dropped by the stat.
ggplot(
  data3,
  mapping = aes(
    x = `Agricultural land (% of land area)`,
    y = `Specimens Processed in 2022`
  )
) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 1 rows containing missing values (`geom_point()`).

Population, total

# Simple linear regression with total population as the response and
# `Specimens Processed in 2022` as the single predictor; the fitted model
# is piped straight into summary() so the coefficient table prints.
# NOTE(review): the indicator is the response here, while the scatter plot
# for this indicator puts it on the x-axis -- confirm this is intended.
data3 %>%
  lm(
    formula = `Population, total` ~ `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Population, total` ~ `Specimens Processed in 2022`, 
##     data = .)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -296313231  -34249923  -26839542   -7418448 1334095040 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                   3.450e+07  1.349e+07   2.558   0.0115 * 
## `Specimens Processed in 2022` 3.206e+02  1.081e+02   2.967   0.0035 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 158700000 on 150 degrees of freedom
## Multiple R-squared:  0.05543,    Adjusted R-squared:  0.04913 
## F-statistic: 8.802 on 1 and 150 DF,  p-value: 0.003502

# Scatter plot of specimens processed (y, log10 axis) against total
# population (x), overlaid with a linear-model smoother.
# ggplot2 applies the log10 scale before the smoother is computed, so rows
# whose y value is not finite after the transform are dropped by the stat.
ggplot(
  data3,
  mapping = aes(
    x = `Population, total`,
    y = `Specimens Processed in 2022`
  )
) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 16 rows containing non-finite values (`stat_smooth()`).

Mobile cellular subscriptions (per 100 people)

# Simple linear regression with mobile cellular subscriptions as the
# response and `Specimens Processed in 2022` as the single predictor; the
# fitted model is piped straight into summary() so the table prints.
# NOTE(review): the indicator is the response here, while the scatter plot
# for this indicator puts it on the x-axis -- confirm this is intended.
data3 %>%
  lm(
    formula =
      `Mobile cellular subscriptions (per 100 people)` ~
        `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Mobile cellular subscriptions (per 100 people)` ~ 
##     `Specimens Processed in 2022`, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -95.907 -18.502   3.442  20.405 100.348 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.124e+02  2.852e+00  39.412   <2e-16 ***
## `Specimens Processed in 2022` 5.594e-06  2.270e-05   0.246    0.806    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33.31 on 148 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.0004102,  Adjusted R-squared:  -0.006344 
## F-statistic: 0.06074 on 1 and 148 DF,  p-value: 0.8057

# Scatter plot of specimens processed (y, log10 axis) against mobile
# cellular subscriptions (x), overlaid with a linear-model smoother.
# ggplot2 applies the log10 scale before the smoother is computed, so rows
# whose y value is not finite after the transform are dropped by the stat.
ggplot(
  data3,
  mapping = aes(
    x = `Mobile cellular subscriptions (per 100 people)`,
    y = `Specimens Processed in 2022`
  )
) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 2 rows containing missing values (`geom_point()`).

Scientific and technical journal articles

# Simple linear regression with the journal-article count as the response
# and `Specimens Processed in 2022` as the single predictor; the fitted
# model is piped straight into summary() so the coefficient table prints.
# NOTE(review): the indicator is the response here, while the scatter plot
# for this indicator puts it on the x-axis -- confirm this is intended.
data3 %>%
  lm(
    formula =
      `Scientific and technical journal articles` ~
        `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Scientific and technical journal articles` ~ `Specimens Processed in 2022`, 
##     data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -109302   -7259   -6761   -1868  436192 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   6819.9230  3744.5104   1.821   0.0706 .  
## `Specimens Processed in 2022`    0.1739     0.0295   5.897  2.5e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43250 on 145 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.1934, Adjusted R-squared:  0.1878 
## F-statistic: 34.77 on 1 and 145 DF,  p-value: 2.501e-08

# Scatter plot of specimens processed (y, log10 axis) against the
# journal-article count (x), overlaid with a linear-model smoother.
# ggplot2 applies the log10 scale before the smoother is computed, so rows
# whose y value is not finite after the transform are dropped by the stat.
ggplot(
  data3,
  mapping = aes(
    x = `Scientific and technical journal articles`,
    y = `Specimens Processed in 2022`
  )
) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 16 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 5 rows containing missing values (`geom_point()`).

Gender Inequality Index

# Simple linear regression with the Gender Inequality Index as the response
# and `Specimens Processed in 2022` as the single predictor; the fitted
# model is piped straight into summary() so the coefficient table prints.
# NOTE(review): the indicator is the response here, while the scatter plot
# for this indicator puts it on the x-axis -- confirm this is intended.
data3 %>%
  lm(
    formula = `Gender Inequality Index` ~ `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Gender Inequality Index` ~ `Specimens Processed in 2022`, 
##     data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32223 -0.14120  0.00793  0.13994  0.33871 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    3.419e-01  1.620e-02  21.111  < 2e-16 ***
## `Specimens Processed in 2022` -5.082e-07  1.236e-07  -4.111 6.79e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1807 on 136 degrees of freedom
##   (14 observations deleted due to missingness)
## Multiple R-squared:  0.1105, Adjusted R-squared:  0.104 
## F-statistic:  16.9 on 1 and 136 DF,  p-value: 6.785e-05

# Scatter plot of specimens processed (y, log10 axis) against the Gender
# Inequality Index (x), overlaid with a linear-model smoother.
# ggplot2 applies the log10 scale before the smoother is computed, so rows
# whose y value is not finite after the transform are dropped by the stat.
ggplot(
  data3,
  mapping = aes(
    x = `Gender Inequality Index`,
    y = `Specimens Processed in 2022`
  )
) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 21 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 14 rows containing missing values (`geom_point()`).

Human Development Index

# Simple linear regression with the Human Development Index as the response
# and `Specimens Processed in 2022` as the single predictor; the fitted
# model is piped straight into summary() so the coefficient table prints.
# NOTE(review): the indicator is the response here, while the scatter plot
# for this indicator puts it on the x-axis -- confirm this is intended.
data3 %>%
  lm(
    formula = `Human Development Index` ~ `Specimens Processed in 2022`,
    data = .
  ) %>%
  summary()

## 
## Call:
## lm(formula = `Human Development Index` ~ `Specimens Processed in 2022`, 
##     data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34217 -0.09132  0.01876  0.08463  0.23383 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   7.270e-01  1.227e-02  59.268  < 2e-16 ***
## `Specimens Processed in 2022` 3.634e-07  9.497e-08   3.827 0.000195 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.139 on 140 degrees of freedom
##   (10 observations deleted due to missingness)
## Multiple R-squared:  0.0947, Adjusted R-squared:  0.08823 
## F-statistic: 14.64 on 1 and 140 DF,  p-value: 0.000195

# Scatter plot of specimens processed (y, log10 axis) against the Human
# Development Index (x), overlaid with a linear-model smoother.
# ggplot2 applies the log10 scale before the smoother is computed, so rows
# whose y value is not finite after the transform are dropped by the stat.
ggplot(
  data3,
  mapping = aes(
    x = `Human Development Index`,
    y = `Specimens Processed in 2022`
  )
) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 21 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 10 rows containing missing values (`geom_point()`).

Thesis Data Analysis Documentation

2023-04-02