#The name of the assignment is Data management and Descriptive statistics, name of student and the date when you submit the assignment.
#Loading the required packages
library(readxl)
library(tidyverse)
library(stats)
#Loading the two data sets to be used for analysis
#Set the working directory
setwd("C:/Users/Baha/Downloads")
#Upload the Poverty headcount ratio at $2.15 a day (20 17 PPP) (% of population) data
Poverty <-read_excel("C:/Users/Baha/Downloads/SI.POV.DDAY.xlsx")
#Showing the first entries of the original dataset
head(Poverty)
## # A tibble: 6 × 67
## `Country Name` `Country Code` `Indicator Name` `Indicator Code` `1960` `1961`
## <chr> <chr> <chr> <chr> <lgl> <lgl>
## 1 Aruba ABW Poverty headcou… SI.POV.DDAY NA NA
## 2 Africa Eastern… AFE Poverty headcou… SI.POV.DDAY NA NA
## 3 Afghanistan AFG Poverty headcou… SI.POV.DDAY NA NA
## 4 Africa Western… AFW Poverty headcou… SI.POV.DDAY NA NA
## 5 Angola AGO Poverty headcou… SI.POV.DDAY NA NA
## 6 Albania ALB Poverty headcou… SI.POV.DDAY NA NA
## # ℹ 61 more variables: `1962` <lgl>, `1963` <dbl>, `1964` <dbl>, `1965` <dbl>,
## # `1966` <dbl>, `1967` <dbl>, `1968` <dbl>, `1969` <dbl>, `1970` <dbl>,
## # `1971` <dbl>, `1972` <dbl>, `1973` <dbl>, `1974` <dbl>, `1975` <dbl>,
## # `1976` <dbl>, `1977` <dbl>, `1978` <dbl>, `1979` <dbl>, `1980` <dbl>,
## # `1981` <dbl>, `1982` <dbl>, `1983` <dbl>, `1984` <dbl>, `1985` <dbl>,
## # `1986` <dbl>, `1987` <dbl>, `1988` <dbl>, `1989` <dbl>, `1990` <dbl>,
## # `1991` <dbl>, `1992` <dbl>, `1993` <dbl>, `1994` <dbl>, `1995` <dbl>, …
#Upload the Incidence of tuberculosis (per 100,000 people) data
TB <-read_excel("C:/Users/Baha/Downloads/SH.TBS.INCD.xlsx")
head(TB)
## # A tibble: 6 × 67
## `Country Name` `Country Code` `Indicator Name` `Indicator Code` `1960` `1961`
## <chr> <chr> <chr> <chr> <lgl> <lgl>
## 1 Aruba ABW Incidence of tu… SH.TBS.INCD NA NA
## 2 Africa Eastern… AFE Incidence of tu… SH.TBS.INCD NA NA
## 3 Afghanistan AFG Incidence of tu… SH.TBS.INCD NA NA
## 4 Africa Western… AFW Incidence of tu… SH.TBS.INCD NA NA
## 5 Angola AGO Incidence of tu… SH.TBS.INCD NA NA
## 6 Albania ALB Incidence of tu… SH.TBS.INCD NA NA
## # ℹ 61 more variables: `1962` <lgl>, `1963` <lgl>, `1964` <lgl>, `1965` <lgl>,
## # `1966` <lgl>, `1967` <lgl>, `1968` <lgl>, `1969` <lgl>, `1970` <lgl>,
## # `1971` <lgl>, `1972` <lgl>, `1973` <lgl>, `1974` <lgl>, `1975` <lgl>,
## # `1976` <lgl>, `1977` <lgl>, `1978` <lgl>, `1979` <lgl>, `1980` <lgl>,
## # `1981` <lgl>, `1982` <lgl>, `1983` <lgl>, `1984` <lgl>, `1985` <lgl>,
## # `1986` <lgl>, `1987` <lgl>, `1988` <lgl>, `1989` <lgl>, `1990` <lgl>,
## # `1991` <lgl>, `1992` <lgl>, `1993` <lgl>, `1994` <lgl>, `1995` <lgl>, …
dim(TB)
## [1] 266 67
dim(Poverty)
## [1] 266 67
#Provide description of each original data set in terms of the number of observations and variables.
#Incidence of Tuberculosis Data:
The number of observations—directly correlating to the available data for each country—determines: the more countries with accessible data, the higher this count will be. This data set has 266 observations and 67 variables.
Number of variables: There are several variables, including:
Country Name: Name of the country.
Country Code: Code assigned to each country.
Indicator Name: Name of the indicator (presumably related to the incidence of tuberculosis).
Indicator Code: Code assigned to the indicator.
From 1960 to 2022, we have data for each year that indicates either the incidence of tuberculosis or related measures; these serve as our variables.
#Poverty Data set:
The number of observations corresponds to the availability of poverty data for each country: it mirrors—similarly to the previous data set—the number of countries involved. This dataset has 266 observations and 67 variables.
Number of variables: The variables include:
Country Name: Name of the country.
Country Code: Code assigned to each country.
Indicator Name: Name of the indicator (related to poverty).
Indicator Code: Code assigned to the poverty indicator.
From 1960 to 2022, we have data for each year; this data indicates poverty measures or related indicators.
#4. Construct the working data set for the assigned years and indicators by removing unnecessary variables and merging the necessary parts of data.
The working data set will have the variables Country name, Indicator name and the assigned year. For the poverty rates data, the assigned year is 2004 and the incidence of TB data is 2005.
#Working data for the Poverty rates data
library(dplyr)
attach(Poverty)
WD_POV <-Poverty %>%
select(`Country Name`, `Indicator Name`, "2004")
detach(Poverty)
tail(WD_POV)
## # A tibble: 6 × 3
## `Country Name` `Indicator Name` `2004`
## <chr> <chr> <dbl>
## 1 Samoa Poverty headcount ratio at $2.15 a day (2017 PPP) (% of… NA
## 2 Kosovo Poverty headcount ratio at $2.15 a day (2017 PPP) (% of… NA
## 3 Yemen, Rep. Poverty headcount ratio at $2.15 a day (2017 PPP) (% of… NA
## 4 South Africa Poverty headcount ratio at $2.15 a day (2017 PPP) (% of… NA
## 5 Zambia Poverty headcount ratio at $2.15 a day (2017 PPP) (% of… 61.1
## 6 Zimbabwe Poverty headcount ratio at $2.15 a day (2017 PPP) (% of… NA
#Working data for the Incidence of TB data
attach(TB)
WD_TB <-TB %>%
select(`Country Name`, `Indicator Name`, "2005")
detach(TB)
head(WD_TB)
## # A tibble: 6 × 3
## `Country Name` `Indicator Name` `2005`
## <chr> <chr> <dbl>
## 1 Aruba Incidence of tuberculosis (per 100,000 peo… 10
## 2 Africa Eastern and Southern Incidence of tuberculosis (per 100,000 peo… NA
## 3 Afghanistan Incidence of tuberculosis (per 100,000 peo… 189
## 4 Africa Western and Central Incidence of tuberculosis (per 100,000 peo… NA
## 5 Angola Incidence of tuberculosis (per 100,000 peo… 368
## 6 Albania Incidence of tuberculosis (per 100,000 peo… 19
#Working data set for the poverty rates
clean_data <-WD_POV %>%
na.omit(WD_POV)
dim(clean_data)
## [1] 84 3
head(clean_data)
## # A tibble: 6 × 3
## `Country Name` `Indicator Name` `2004`
## <chr> <chr> <dbl>
## 1 Argentina Poverty headcount ratio at $2.15 a day (2017 PPP) (% of… 3.8
## 2 Armenia Poverty headcount ratio at $2.15 a day (2017 PPP) (% of… 5.3
## 3 Australia Poverty headcount ratio at $2.15 a day (2017 PPP) (% of… 0.7
## 4 Austria Poverty headcount ratio at $2.15 a day (2017 PPP) (% of… 0.1
## 5 Azerbaijan Poverty headcount ratio at $2.15 a day (2017 PPP) (% of… 0
## 6 Belgium Poverty headcount ratio at $2.15 a day (2017 PPP) (% of… 0.1
#Working data for the incidence of TB
TB_clean <-WD_TB %>%
na.omit(WD_TB)
dim(TB_clean)
## [1] 225 3
head(TB_clean)
## # A tibble: 6 × 3
## `Country Name` `Indicator Name` `2005`
## <chr> <chr> <dbl>
## 1 Aruba Incidence of tuberculosis (per 100,000 people) 10
## 2 Afghanistan Incidence of tuberculosis (per 100,000 people) 189
## 3 Angola Incidence of tuberculosis (per 100,000 people) 368
## 4 Albania Incidence of tuberculosis (per 100,000 people) 19
## 5 Andorra Incidence of tuberculosis (per 100,000 people) 14
## 6 United Arab Emirates Incidence of tuberculosis (per 100,000 people) 2.8
#Merge the two datasets
# Perform the merge
TB_POV <- left_join(clean_data, TB_clean, by = "Country Name")
dim(TB_POV)
## [1] 84 5
head(TB_POV)
## # A tibble: 6 × 5
## `Country Name` `Indicator Name.x` `2004` `Indicator Name.y` `2005`
## <chr> <chr> <dbl> <chr> <dbl>
## 1 Argentina Poverty headcount ratio at $2… 3.8 Incidence of tube… 31
## 2 Armenia Poverty headcount ratio at $2… 5.3 Incidence of tube… 92
## 3 Australia Poverty headcount ratio at $2… 0.7 Incidence of tube… 6
## 4 Austria Poverty headcount ratio at $2… 0.1 Incidence of tube… 13
## 5 Azerbaijan Poverty headcount ratio at $2… 0 Incidence of tube… 88
## 6 Belgium Poverty headcount ratio at $2… 0.1 Incidence of tube… 12
#Construct the indicator variables which identify if an observation is an outl ier (= 1) or not (=O) for each of two variables
# Checking for outliers in the "2004" variable
TB_POV$outlier_2004 <- ifelse(abs(TB_POV$`2004` - mean(TB_POV$`2004`)) > 2 * sd(TB_POV$`2004`), 1, 0)
# Checking for outliers in the "2005" variable
TB_POV$outlier_2005 <- ifelse(abs(TB_POV$`2005` - mean(TB_POV$`2005`)) > 2 * sd(TB_POV$`2005`), 1, 0)
# Display the merged data
head(TB_POV)
## # A tibble: 6 × 7
## `Country Name` `Indicator Name.x` `2004` `Indicator Name.y` `2005`
## <chr> <chr> <dbl> <chr> <dbl>
## 1 Argentina Poverty headcount ratio at $2… 3.8 Incidence of tube… 31
## 2 Armenia Poverty headcount ratio at $2… 5.3 Incidence of tube… 92
## 3 Australia Poverty headcount ratio at $2… 0.7 Incidence of tube… 6
## 4 Austria Poverty headcount ratio at $2… 0.1 Incidence of tube… 13
## 5 Azerbaijan Poverty headcount ratio at $2… 0 Incidence of tube… 88
## 6 Belgium Poverty headcount ratio at $2… 0.1 Incidence of tube… 12
## # ℹ 2 more variables: outlier_2004 <dbl>, outlier_2005 <lgl>
# Display standard descriptive statistics for each variable with outliers
cat("Descriptive statistics for '2004' with outliers:\n")
## Descriptive statistics for '2004' with outliers:
summary(TB_POV$`2004`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.475 3.200 12.151 16.975 91.500
cat("\n")
cat("Descriptive statistics for '2005' with outliers:\n")
## Descriptive statistics for '2005' with outliers:
summary(TB_POV$`2005`)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.99 12.25 40.50 98.43 156.25 602.00 2
cat("\n")
# Identify outliers using box plots
boxplot(TB_POV$`2004`, main = "Boxplot of 2004 with Outliers")
boxplot(TB_POV$`2005`, main = "Boxplot of 2005 with Outliers")
# Display standard descriptive statistics for variable without outliers
cat("Descriptive statistics for '2004' without outliers:\n")
## Descriptive statistics for '2004' without outliers:
summary(TB_POV$`2004`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.475 3.200 12.151 16.975 91.500
cat("\n")
cat("Descriptive statistics for '2005' without outliers:\n")
## Descriptive statistics for '2005' without outliers:
summary(TB_POV$`2005`)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.99 12.25 40.50 98.43 156.25 602.00 2
cat("\n")
library(flextable)
library(modelsummary)
linear_model <-lm(TB_POV$`2004`~TB_POV$`2005`)
modelsummary(linear_model)
| Â (1) | |
|---|---|
| (Intercept) | −0.459 |
| (1.394) | |
| TB_POV$2005 | 0.121 |
| (0.009) | |
| Num.Obs. | 82 |
| R2 | 0.693 |
| R2 Adj. | 0.689 |
| AIC | 610.2 |
| BIC | 617.4 |
| Log.Lik. | −302.081 |
| F | 180.197 |
| RMSE | 9.63 |
summary(linear_model)
##
## Call:
## lm(formula = TB_POV$`2004` ~ TB_POV$`2005`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.383 -3.364 -0.371 1.772 52.482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.458555 1.393926 -0.329 0.743
## TB_POV$`2005` 0.120723 0.008993 13.424 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.75 on 80 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.6925, Adjusted R-squared: 0.6887
## F-statistic: 180.2 on 1 and 80 DF, p-value: < 2.2e-16
plot(linear_model)
# Create a scatter plot to visualize the relationship between TB and poverty
plot(TB_POV$`2004`, TB_POV$`2005`,
xlab = "TB", ylab = "Poverty", main = "Scatter Plot of TB vs. Poverty")