#The name of the assignment is Data management and Descriptive statistics, name of student and the date when you submit the assignment.

#Loading the required packages
library(readxl)
library(tidyverse)
library(stats)

#Loading the two data sets to be used for analysis
#Set the working directory
setwd("C:/Users/Baha/Downloads")
#Upload the Poverty headcount ratio at $2.15 a day (20 17 PPP) (% of population) data
Poverty <-read_excel("C:/Users/Baha/Downloads/SI.POV.DDAY.xlsx")
#Showing the first entries of the original dataset
head(Poverty)

## # A tibble: 6 × 67
##   `Country Name`  `Country Code` `Indicator Name` `Indicator Code` `1960` `1961`
##   <chr>           <chr>          <chr>            <chr>            <lgl>  <lgl> 
## 1 Aruba           ABW            Poverty headcou… SI.POV.DDAY      NA     NA    
## 2 Africa Eastern… AFE            Poverty headcou… SI.POV.DDAY      NA     NA    
## 3 Afghanistan     AFG            Poverty headcou… SI.POV.DDAY      NA     NA    
## 4 Africa Western… AFW            Poverty headcou… SI.POV.DDAY      NA     NA    
## 5 Angola          AGO            Poverty headcou… SI.POV.DDAY      NA     NA    
## 6 Albania         ALB            Poverty headcou… SI.POV.DDAY      NA     NA    
## # ℹ 61 more variables: `1962` <lgl>, `1963` <dbl>, `1964` <dbl>, `1965` <dbl>,
## #   `1966` <dbl>, `1967` <dbl>, `1968` <dbl>, `1969` <dbl>, `1970` <dbl>,
## #   `1971` <dbl>, `1972` <dbl>, `1973` <dbl>, `1974` <dbl>, `1975` <dbl>,
## #   `1976` <dbl>, `1977` <dbl>, `1978` <dbl>, `1979` <dbl>, `1980` <dbl>,
## #   `1981` <dbl>, `1982` <dbl>, `1983` <dbl>, `1984` <dbl>, `1985` <dbl>,
## #   `1986` <dbl>, `1987` <dbl>, `1988` <dbl>, `1989` <dbl>, `1990` <dbl>,
## #   `1991` <dbl>, `1992` <dbl>, `1993` <dbl>, `1994` <dbl>, `1995` <dbl>, …

#Upload the Incidence of tuberculosis (per 100,000 people) data
TB <-read_excel("C:/Users/Baha/Downloads/SH.TBS.INCD.xlsx")
head(TB)

## # A tibble: 6 × 67
##   `Country Name`  `Country Code` `Indicator Name` `Indicator Code` `1960` `1961`
##   <chr>           <chr>          <chr>            <chr>            <lgl>  <lgl> 
## 1 Aruba           ABW            Incidence of tu… SH.TBS.INCD      NA     NA    
## 2 Africa Eastern… AFE            Incidence of tu… SH.TBS.INCD      NA     NA    
## 3 Afghanistan     AFG            Incidence of tu… SH.TBS.INCD      NA     NA    
## 4 Africa Western… AFW            Incidence of tu… SH.TBS.INCD      NA     NA    
## 5 Angola          AGO            Incidence of tu… SH.TBS.INCD      NA     NA    
## 6 Albania         ALB            Incidence of tu… SH.TBS.INCD      NA     NA    
## # ℹ 61 more variables: `1962` <lgl>, `1963` <lgl>, `1964` <lgl>, `1965` <lgl>,
## #   `1966` <lgl>, `1967` <lgl>, `1968` <lgl>, `1969` <lgl>, `1970` <lgl>,
## #   `1971` <lgl>, `1972` <lgl>, `1973` <lgl>, `1974` <lgl>, `1975` <lgl>,
## #   `1976` <lgl>, `1977` <lgl>, `1978` <lgl>, `1979` <lgl>, `1980` <lgl>,
## #   `1981` <lgl>, `1982` <lgl>, `1983` <lgl>, `1984` <lgl>, `1985` <lgl>,
## #   `1986` <lgl>, `1987` <lgl>, `1988` <lgl>, `1989` <lgl>, `1990` <lgl>,
## #   `1991` <lgl>, `1992` <lgl>, `1993` <lgl>, `1994` <lgl>, `1995` <lgl>, …

dim(TB)

## [1] 266  67

dim(Poverty)

## [1] 266  67

#Provide description of each original data set in terms of the number of observations and variables.

#Incidence of Tuberculosis Data:

The number of observations—directly correlating to the available data for each country—determines: the more countries with accessible data, the higher this count will be. This data set has 266 observations and 67 variables.

Number of variables: There are several variables, including:

Country Name: Name of the country.

Country Code: Code assigned to each country.

Indicator Name: Name of the indicator (presumably related to the incidence of tuberculosis).

Indicator Code: Code assigned to the indicator.

From 1960 to 2022, we have data for each year that indicates either the incidence of tuberculosis or related measures; these serve as our variables.

#Poverty Data set:

The number of observations corresponds to the availability of poverty data for each country: it mirrors—similarly to the previous data set—the number of countries involved. This dataset has 266 observations and 67 variables.

Number of variables: The variables include:

Country Name: Name of the country.

Country Code: Code assigned to each country.

Indicator Name: Name of the indicator (related to poverty).

Indicator Code: Code assigned to the poverty indicator.

From 1960 to 2022, we have data for each year; this data indicates poverty measures or related indicators.

#4. Construct the working data set for the assigned years and indicators by removing unnecessary variables and merging the necessary parts of data.

The working data set will have the variables Country name, Indicator name and the assigned year. For the poverty rates data, the assigned year is 2004 and the incidence of TB data is 2005.

#Working data for the Poverty rates data
library(dplyr)
attach(Poverty)
WD_POV <-Poverty %>% 
  select(`Country Name`, `Indicator Name`, "2004")
detach(Poverty)
tail(WD_POV)

## # A tibble: 6 × 3
##   `Country Name` `Indicator Name`                                         `2004`
##   <chr>          <chr>                                                     <dbl>
## 1 Samoa          Poverty headcount ratio at $2.15 a day (2017 PPP) (% of…   NA  
## 2 Kosovo         Poverty headcount ratio at $2.15 a day (2017 PPP) (% of…   NA  
## 3 Yemen, Rep.    Poverty headcount ratio at $2.15 a day (2017 PPP) (% of…   NA  
## 4 South Africa   Poverty headcount ratio at $2.15 a day (2017 PPP) (% of…   NA  
## 5 Zambia         Poverty headcount ratio at $2.15 a day (2017 PPP) (% of…   61.1
## 6 Zimbabwe       Poverty headcount ratio at $2.15 a day (2017 PPP) (% of…   NA

#Working data for the Incidence of TB data
attach(TB)
WD_TB <-TB %>% 
  select(`Country Name`, `Indicator Name`, "2005")
detach(TB)
head(WD_TB)

## # A tibble: 6 × 3
##   `Country Name`              `Indicator Name`                            `2005`
##   <chr>                       <chr>                                        <dbl>
## 1 Aruba                       Incidence of tuberculosis (per 100,000 peo…     10
## 2 Africa Eastern and Southern Incidence of tuberculosis (per 100,000 peo…     NA
## 3 Afghanistan                 Incidence of tuberculosis (per 100,000 peo…    189
## 4 Africa Western and Central  Incidence of tuberculosis (per 100,000 peo…     NA
## 5 Angola                      Incidence of tuberculosis (per 100,000 peo…    368
## 6 Albania                     Incidence of tuberculosis (per 100,000 peo…     19

Clean the working dataset

#Working data set for the poverty rates
clean_data <-WD_POV %>% 
  na.omit(WD_POV)
dim(clean_data)

## [1] 84  3

head(clean_data)

## # A tibble: 6 × 3
##   `Country Name` `Indicator Name`                                         `2004`
##   <chr>          <chr>                                                     <dbl>
## 1 Argentina      Poverty headcount ratio at $2.15 a day (2017 PPP) (% of…    3.8
## 2 Armenia        Poverty headcount ratio at $2.15 a day (2017 PPP) (% of…    5.3
## 3 Australia      Poverty headcount ratio at $2.15 a day (2017 PPP) (% of…    0.7
## 4 Austria        Poverty headcount ratio at $2.15 a day (2017 PPP) (% of…    0.1
## 5 Azerbaijan     Poverty headcount ratio at $2.15 a day (2017 PPP) (% of…    0  
## 6 Belgium        Poverty headcount ratio at $2.15 a day (2017 PPP) (% of…    0.1

#Working data for the incidence of TB 
TB_clean <-WD_TB %>% 
  na.omit(WD_TB)
dim(TB_clean)

## [1] 225   3

head(TB_clean)

## # A tibble: 6 × 3
##   `Country Name`       `Indicator Name`                               `2005`
##   <chr>                <chr>                                           <dbl>
## 1 Aruba                Incidence of tuberculosis (per 100,000 people)   10  
## 2 Afghanistan          Incidence of tuberculosis (per 100,000 people)  189  
## 3 Angola               Incidence of tuberculosis (per 100,000 people)  368  
## 4 Albania              Incidence of tuberculosis (per 100,000 people)   19  
## 5 Andorra              Incidence of tuberculosis (per 100,000 people)   14  
## 6 United Arab Emirates Incidence of tuberculosis (per 100,000 people)    2.8

#Merge the two datasets

# Perform the merge
TB_POV <- left_join(clean_data, TB_clean, by = "Country Name")
dim(TB_POV)

## [1] 84  5

head(TB_POV)

## # A tibble: 6 × 5
##   `Country Name` `Indicator Name.x`             `2004` `Indicator Name.y` `2005`
##   <chr>          <chr>                           <dbl> <chr>               <dbl>
## 1 Argentina      Poverty headcount ratio at $2…    3.8 Incidence of tube…     31
## 2 Armenia        Poverty headcount ratio at $2…    5.3 Incidence of tube…     92
## 3 Australia      Poverty headcount ratio at $2…    0.7 Incidence of tube…      6
## 4 Austria        Poverty headcount ratio at $2…    0.1 Incidence of tube…     13
## 5 Azerbaijan     Poverty headcount ratio at $2…    0   Incidence of tube…     88
## 6 Belgium        Poverty headcount ratio at $2…    0.1 Incidence of tube…     12

#Construct the indicator variables which identify if an observation is an outl ier (= 1) or not (=O) for each of two variables

# Checking for outliers in the "2004" variable
TB_POV$outlier_2004 <- ifelse(abs(TB_POV$`2004` - mean(TB_POV$`2004`)) > 2 * sd(TB_POV$`2004`), 1, 0)

# Checking for outliers in the "2005" variable
TB_POV$outlier_2005 <- ifelse(abs(TB_POV$`2005` - mean(TB_POV$`2005`)) > 2 * sd(TB_POV$`2005`), 1, 0)

# Display the merged data
head(TB_POV)

## # A tibble: 6 × 7
##   `Country Name` `Indicator Name.x`             `2004` `Indicator Name.y` `2005`
##   <chr>          <chr>                           <dbl> <chr>               <dbl>
## 1 Argentina      Poverty headcount ratio at $2…    3.8 Incidence of tube…     31
## 2 Armenia        Poverty headcount ratio at $2…    5.3 Incidence of tube…     92
## 3 Australia      Poverty headcount ratio at $2…    0.7 Incidence of tube…      6
## 4 Austria        Poverty headcount ratio at $2…    0.1 Incidence of tube…     13
## 5 Azerbaijan     Poverty headcount ratio at $2…    0   Incidence of tube…     88
## 6 Belgium        Poverty headcount ratio at $2…    0.1 Incidence of tube…     12
## # ℹ 2 more variables: outlier_2004 <dbl>, outlier_2005 <lgl>

# Display standard descriptive statistics for each variable with outliers
cat("Descriptive statistics for '2004' with outliers:\n")

## Descriptive statistics for '2004' with outliers:

summary(TB_POV$`2004`)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.475   3.200  12.151  16.975  91.500

cat("\n")

cat("Descriptive statistics for '2005' with outliers:\n")

## Descriptive statistics for '2005' with outliers:

summary(TB_POV$`2005`)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.99   12.25   40.50   98.43  156.25  602.00       2

cat("\n")

# Identify outliers using box plots
boxplot(TB_POV$`2004`, main = "Boxplot of 2004 with Outliers")

boxplot(TB_POV$`2005`, main = "Boxplot of 2005 with Outliers")

# Display standard descriptive statistics for variable without outliers
cat("Descriptive statistics for '2004' without outliers:\n")

## Descriptive statistics for '2004' without outliers:

summary(TB_POV$`2004`)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.475   3.200  12.151  16.975  91.500

cat("\n")

cat("Descriptive statistics for '2005' without outliers:\n")

## Descriptive statistics for '2005' without outliers:

summary(TB_POV$`2005`)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.99   12.25   40.50   98.43  156.25  602.00       2

cat("\n")

Run a linear regression of poverty on TB rate and interpret the results.

library(flextable)
library(modelsummary)
linear_model <-lm(TB_POV$`2004`~TB_POV$`2005`)
modelsummary(linear_model)

	(1)
(Intercept)	−0.459
	(1.394)
TB_POV$2005	0.121
	(0.009)
Num.Obs.	82
R2	0.693
R2 Adj.	0.689
AIC	610.2
BIC	617.4
Log.Lik.	−302.081
F	180.197
RMSE	9.63

summary(linear_model)

## 
## Call:
## lm(formula = TB_POV$`2004` ~ TB_POV$`2005`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.383  -3.364  -0.371   1.772  52.482 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.458555   1.393926  -0.329    0.743    
## TB_POV$`2005`  0.120723   0.008993  13.424   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.75 on 80 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.6925, Adjusted R-squared:  0.6887 
## F-statistic: 180.2 on 1 and 80 DF,  p-value: < 2.2e-16

plot(linear_model)

# Create a scatter plot to visualize the relationship between TB and poverty
plot(TB_POV$`2004`, TB_POV$`2005`, 
     xlab = "TB", ylab = "Poverty", main = "Scatter Plot of TB vs. Poverty")

Data Management and Descriptive statistics

2024-03-04

Clean the working dataset

Run a linear regression of poverty on TB rate and interpret the results.