Team-32

Author

Data visionaries

Build a parallel coordinates plot to compare health indicators across multiple countries.

Introduction:

Health indicators are important measures that reflect the overall well-being and development of a country. Comparing multiple indicators helps identify patterns and differences between countries. In this report, a parallel coordinates plot is used to visualize and compare health indicators across multiple countries.

Objective:

To analyze and compare key health indicators such as life expectancy, infant mortality, GDP per capita, and health expenditure using R programming.

Step 1: Load required packages

library(GGally)
Warning: package 'GGally' was built under R version 4.5.3
Loading required package: ggplot2
Warning: package 'ggplot2' was built under R version 4.5.3
library(dplyr)
Warning: package 'dplyr' was built under R version 4.5.3

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)

Step 2: Load dataset

# Prompt for the CSV file interactively and read it, keeping text columns
# as character vectors instead of converting them to factors.
data <- utils::read.csv(file = file.choose(), stringsAsFactors = FALSE)

# Preview the first rows to confirm the file was parsed as expected.
utils::head(data)
  InvoiceNo StockCode                         Description Quantity
1    536365    85123A  WHITE HANGING HEART T-LIGHT HOLDER        6
2    536365     71053                 WHITE METAL LANTERN        6
3    536365    84406B      CREAM CUPID HEARTS COAT HANGER        8
4    536365    84029G KNITTED UNION FLAG HOT WATER BOTTLE        6
5    536365    84029E      RED WOOLLY HOTTIE WHITE HEART.        6
6    536365     22752        SET 7 BABUSHKA NESTING BOXES        2
     InvoiceDate UnitPrice CustomerID        Country
1 12/1/2010 8:26      2.55      17850 United Kingdom
2 12/1/2010 8:26      3.39      17850 United Kingdom
3 12/1/2010 8:26      2.75      17850 United Kingdom
4 12/1/2010 8:26      3.39      17850 United Kingdom
5 12/1/2010 8:26      3.39      17850 United Kingdom
6 12/1/2010 8:26      7.65      17850 United Kingdom
# Show each column's type and the overall dimensions of the loaded data.
str(data)
'data.frame':   541909 obs. of  8 variables:
 $ InvoiceNo  : chr  "536365" "536365" "536365" "536365" ...
 $ StockCode  : chr  "85123A" "71053" "84406B" "84029G" ...
 $ Description: chr  "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
 $ Quantity   : int  6 6 8 6 6 2 6 6 6 32 ...
 $ InvoiceDate: chr  "12/1/2010 8:26" "12/1/2010 8:26" "12/1/2010 8:26" "12/1/2010 8:26" ...
 $ UnitPrice  : num  2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
 $ CustomerID : int  17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
 $ Country    : chr  "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
# List the column names exactly as they were read from the file.
colnames(data)
[1] "InvoiceNo"   "StockCode"   "Description" "Quantity"    "InvoiceDate"
[6] "UnitPrice"   "CustomerID"  "Country"    

Step 3: Data Preprocessing

# Strip leading and trailing whitespace from the column headers.
names(data) <- trimws(names(data), which = "both")

# Re-inspect the column types after the header cleanup.
utils::str(data)
'data.frame':   541909 obs. of  8 variables:
 $ InvoiceNo  : chr  "536365" "536365" "536365" "536365" ...
 $ StockCode  : chr  "85123A" "71053" "84406B" "84029G" ...
 $ Description: chr  "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
 $ Quantity   : int  6 6 8 6 6 2 6 6 6 32 ...
 $ InvoiceDate: chr  "12/1/2010 8:26" "12/1/2010 8:26" "12/1/2010 8:26" "12/1/2010 8:26" ...
 $ UnitPrice  : num  2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
 $ CustomerID : int  17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
 $ Country    : chr  "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...

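Step 4: Create country-level indicators

Note: the dataset loaded in Step 2 contains retail transactions (invoices, quantities, unit prices) rather than health indicators, so the indicators computed below are sales aggregates per country.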

# Normalise the headers (trim, then lower-case) so the aggregation below
# can refer to every column by its lowercase name.
names(data) <- tolower(trimws(names(data)))

# Collapse the rows to one row per country with three aggregates:
#   total_quantity - sum of quantity
#   avg_unitprice  - mean of unitprice
#   total_sales    - sum of quantity * unitprice
# Missing values are dropped inside each aggregate.
country_data <- summarise(
  group_by(data, country),
  total_quantity = sum(quantity, na.rm = TRUE),
  avg_unitprice = mean(unitprice, na.rm = TRUE),
  total_sales = sum(quantity * unitprice, na.rm = TRUE)
)

# Preview the first few countries.
head(country_data)
# A tibble: 6 × 4
  country   total_quantity avg_unitprice total_sales
  <chr>              <int>         <dbl>       <dbl>
1 Australia          83653          3.22     137077.
2 Austria             4827          4.24      10154.
3 Bahrain              260          4.56        548.
4 Belgium            23152          3.64      40911.
5 Brazil               356          4.46       1144.
6 Canada              2763          6.03       3666.

Step 5: Parallel Coordinates Plot

library(GGally)
library(ggplot2)

# Draw one line per country across the numeric indicators in country_data
# (the per-country summary created in Step 4).
plot_data <- country_data

# Locate the grouping column by name and the indicator axes by type, so the
# plot does not depend on column order. A positional 2:ncol() range would
# also count backwards (2, 1) if the table had a single column.
group_col <- match("country", names(plot_data))
axis_cols <- base::setdiff(
  unname(which(vapply(plot_data, is.numeric, logical(1)))),
  group_col
)

# A parallel coordinates plot needs a grouping column and at least two axes.
if (is.na(group_col) || length(axis_cols) < 2) {
  stop(
    "plot_data needs a 'country' column and at least two numeric indicators",
    call. = FALSE
  )
}

# Create plot
ggparcoord(
  data = plot_data,
  columns = axis_cols,           # numeric indicator columns
  groupColumn = group_col,       # country column (colours the lines)
  scale = "uniminmax",           # rescale each axis to the 0-1 range
  showPoints = TRUE,
  alphaLines = 0.6
) +
  labs(
    title = "Parallel Coordinates Plot of Country-Level Indicators",
    x = "Indicators",
    y = "Scaled Values"
  ) +
  theme_minimal()