Data Exploration

Excercises ~ Week 2

Logo

1 Exercise 1

The following table shows sample information for three students. Each observation represents a single student and includes details such as their unique student ID, name, age, total credits completed, major field of study, and year level.

This dataset demonstrates a mixture of variable types:

Nominal: StudentID, Name, Major
Numeric: Age (continuous), CreditsCompleted (discrete)
Ordinal: YearLevel (Freshman → Senior)

StudentID	Name	Age	CreditsCompleted	Major	YearLevel
S001	Alice	20	45	Data Sains	Sophomore
S002	Budi	21	60	Mathematics	Junior
S003	Citra	19	30	Statistics	Freshman

# 1. Create vectors for each variable
StudentID <- c("S001", "S002", "S003")       # Nominal / ID
Name <- c("Alice", "Budi", "Citra")          # Nominal / Name
Age <- c(20, 21, 19)                         # Numeric / Continuous
CreditsCompleted <- c(45, 60, 30)            # Numeric / Discrete

# Nominal
Major <- c("Data Sains", "Mathematics", "Statistics")  

# Ordinal
YearLevel <- factor(c("Sophomore", "Junior", "Freshman"),
                    levels = c("Freshman","Sophomore","Junior","Senior"),
                    ordered = TRUE)          

# 2. Combine all vectors into a data frame
students <- data.frame(
  StudentID, Name, Age, CreditsCompleted, Major, YearLevel,
  stringsAsFactors = FALSE
)

# 3. Display the data frame
print(students)

##   StudentID  Name Age CreditsCompleted       Major YearLevel
## 1      S001 Alice  20               45  Data Sains Sophomore
## 2      S002  Budi  21               60 Mathematics    Junior
## 3      S003 Citra  19               30  Statistics  Freshman

2 Exercise 2

Identify Data Types: Determine the type of data for each of the following variables:

library(knitr)

# Create a data frame for Data Types
variables_info <- data.frame(
  No = 1:5,
  Variable = c(
    "Number of vehicles passing through the toll road each day",
    "Student height in cm",
    "Employee gender (Male / Female)",
    "Customer satisfaction level: Low, Medium, High",
    "Respondent's favorite color: Red, Blue, Green"
  ),
  DataType = c(
    "Numeric",
    "Numeric",
    "Categorical",
    "Categorical",
    "Categorical"
  ),
  Subtype = c(
    "Discrete",
    "Continuous",
    "Nominal",
    "Ordinal",
    "Nominal"
  ),
  stringsAsFactors = FALSE
)

# Display the data frame as a neat table
kable(variables_info, 
      caption = "Table of Variables and Data Types")

Table of Variables and Data Types
No	Variable	DataType	Subtype
1	Number of vehicles passing through the toll road each day	Numeric	Discrete
2	Student height in cm	Numeric	Continuous
3	Employee gender (Male / Female)	Categorical	Nominal
4	Customer satisfaction level: Low, Medium, High	Categorical	Ordinal
5	Respondent’s favorite color: Red, Blue, Green	Categorical	Nominal

3 Exercise 3

Classify Data Sources: Determine whether the following data comes from internal or external sources, and whether it is structured or unstructured:

# Install DT package if not already installed
# install.packages("DT")
library(DT)

# Create a data frame for data sources 
data_sources <- data.frame(
  No = 1:4,
  DataSource = c(
    "Daily sales transaction data of the company",
    "Weather reports from BMKG",
    "Product reviews on social media",
    "Warehouse inventory reports"
  ),
  Internal_External = c(
    "Internal Source",
    "External Source",
    "External Source",
    "Internal Source"
  ),
  Structured_Unstructured = c(
    "Structured",
    "Structured",
    "Unstructured",
    "Structured"
  ),
  stringsAsFactors = FALSE
)

# Display the data frame as a neat table
datatable(data_sources, 
          caption = "Table of Data Sources",
          rownames = FALSE) # hides the index column

4 Exercise 4

Dataset Structure: Consider the following transaction table:

Date	Qty	Price	Product	CustomerTier
2025-10-01	2	1000	Laptop	High
2025-10-01	5	20	Mouse	Medium
2025-10-02	1	1000	Laptop	Low
2025-10-02	3	30	Keyboard	Medium
2025-10-03	4	50	Mouse	Medium
2025-10-03	2	1000	Laptop	High
2025-10-04	6	25	Keyboard	Low
2025-10-04	1	1000	Laptop	High
2025-10-05	3	40	Mouse	Low
2025-10-05	5	10	Keyboard	Medium

Your Assignment Instructions: Creating a Transactions Table above in R

Create a data frame in R called transactions containing the data above.
Identify which variables are numeric and which are categorical
Calculate total revenue for each transaction by multiplying Qty × Price and add it as a new column Total.
Compute summary statistics:
- Total quantity sold for each product
- Total revenue per product
- Average price per product
Visualize the data:
- Create a barplot showing total quantity sold per product.
- Create a pie chart showing the proportion of total revenue per customer tier.
Optional Challenge:
- Find which date had the highest total revenue.
- Create a stacked bar chart showing quantity sold per product by customer tier.

Hints: Use data.frame(), aggregate(), barplot(), pie(), and basic arithmetic operations in R.

# Load Package
library(kableExtra)
library(DT)
library(htmltools)

# Create Data Frame Transactions

transactions <- data.frame(
  No = 1:10, 
  Date = as.Date(c(
    "2025-10-01", "2025-10-01", "2025-10-02", "2025-10-02",
    "2025-10-03", "2025-10-03", "2025-10-04", "2025-10-04",
    "2025-10-05", "2025-10-05"
  )),
  
  Quantity = c(2, 5, 1, 3, 4, 2, 6, 1, 3, 5),
  
  Price = c(1000, 20, 1000, 30, 50, 1000, 25, 1000, 40, 10),
  
  Product = factor(c(
          "Laptop", "Mouse", "Laptop", "Keyboard", "Mouse",
          "Laptop", "Keyboard", "Laptop", "Mouse", "Keyboard")),
  
  CustomerTier = factor(c(
               "High", "Medium", "Low", "Medium", "Medium",
               "High", "Low", "High", "Low", "Medium"),
        levels = c("Low","Medium","High"), ordered = TRUE),
  stringsAsFactors = FALSE
)


transactions$Total <- transactions$Quantity * transactions$Price

# Identify data types (numeric & categorical)

str(transactions)

## 'data.frame':    10 obs. of  7 variables:
##  $ No          : int  1 2 3 4 5 6 7 8 9 10
##  $ Date        : Date, format: "2025-10-01" "2025-10-01" ...
##  $ Quantity    : num  2 5 1 3 4 2 6 1 3 5
##  $ Price       : num  1000 20 1000 30 50 1000 25 1000 40 10
##  $ Product     : Factor w/ 3 levels "Keyboard","Laptop",..: 2 3 2 1 3 2 1 2 3 1
##  $ CustomerTier: Ord.factor w/ 3 levels "Low"<"Medium"<..: 3 2 1 2 2 3 1 3 1 2
##  $ Total       : num  2000 100 1000 90 200 2000 150 1000 120 50

# Penjelasan: 
# - Numeric: Quantity (discrete), Price (continuous), Total (continuous)
# - Categorical: Date (date), Product (nominal factor), CustomerTier (ordinal factor)


# Display transactions table

datatable(
  transactions,
  caption = tags$caption(
    style = "caption-side:  top;
             text-align: center;
             font-weight: bold;",
            "Table Transactions"),
  options = list(pageLength = 5, autoWidth = TRUE)
)

# Menghitung Summary statistik

# Total quantity sold per product

total_qty <- aggregate(Quantity ~ Product, data = transactions, sum)
total_qty <- total_qty[order(as.numeric(total_qty$Product)), ]  # Reorder berdasarkan levels faktor

datatable(
  total_qty,
  caption = tags$caption(
    style = "caption-side: top;
             text-align: center;
             font-weight: bold;",
            "Total Quantity Sold per Product"
    ),
  options = list(pageLength = 5, autoWidth = TRUE)
)

# Total revenue per product

total_revenue <- aggregate(Total ~ Product, data = transactions, sum)
total_revenue <- total_revenue[order(as.numeric(total_revenue$Product)), ]

datatable(
  total_revenue,
  caption = tags$caption(
    style = "caption-side: top;
             text-align: center;
             font-weight: bold;",
            "Total Revenue"
    ),
  options = list(pageLength = 5, autoWidth = TRUE)
)

# Average price per product

avg_price <- aggregate(Price ~ Product, data = transactions, mean)
avg_price <- avg_price[order(as.numeric(avg_price$Product)), ]
avg_price$Price <- round(avg_price$Price, 2)                   # Round untuk tampilan rapi

datatable(
  avg_price,
  caption = tags$caption(
    style = "caption-side: top;
             text-align: center;
             font-weight: bold;",
            "Average Price"
    ),
  options = list(pageLength = 5, autoWidth = TRUE)
)

# Optional Challange : Find the date with the highest total revenue

revenue_date <- aggregate(Total ~ Date, data = transactions, sum)
highest_date <- revenue_date$Date[which.max(revenue_date$Total)]
highest_revenue <- max(revenue_date$Total)

cat("Highest revenue date:", as.character(highest_date), "(Total: $", max(revenue_date$Total), ")\n")

## Highest revenue date: 2025-10-03 (Total: $ 2200 )

datatable(
  revenue_date,
  caption = tags$caption(
    style = "caption-side: top;
             text-align: center;
             font-weight: bold;",
            "Highest revenue date:"
    ),
  options = list(pageLength = 5, autoWidth = TRUE)
)

# Memperlihatkan / Visualize Data

# Barplot total quantity sold per product

barplot(
  total_qty$Quantity,
  names.arg = levels(total_qty$Product),
  col = c("lightblue", "lightgreen", "lightcoral"),
  main = "Total Quantity Sold per Product",
  xlab = "Product",
  ylab = "Total Quantity"
)

# Pie chart total revenue per customer tier

revenue_tier <- aggregate(Total ~ CustomerTier, data = transactions, sum)
revenue_tier <- revenue_tier[order(as.numeric(revenue_tier$CustomerTier)), ]

# Calculate percentages

total_rev <- sum(revenue_tier$Total)
pct <- round(revenue_tier$Total / total_rev * 100, 1)           # Round to 1 decimal place

# Labels dengan persentase

labels <- paste0(revenue_tier$CustomerTier, " (", pct, "%)")

# Pie Chart

pie(
  revenue_tier$Total,
  labels = labels,
  clockwise = TRUE,                                             # Urutan searah jarum jam untuk standar
  main = "Proportion of Total Revenue per Customer Tier",
  col = c("lightblue", "lightgreen", "lightcoral")
)

# Optional Challenge : Stacked bar chart (quantity per product by tier)

qty_tier <- aggregate(Quantity ~ Product + CustomerTier, data = transactions, FUN = sum)

# Re-factor untuk pastikan urutan

qty_tier$Product <- factor(qty_tier$Product, levels = c(
                                           "Laptop",
                                           "Mouse",
                                           "Keyboard")
                           )
qty_tier$CustomerTier <- factor(qty_tier$CustomerTier, levels = c(
                                           "Low",
                                           "Medium",
                                           "High"),
                                ordered = TRUE
                           )

# Debug qty tier

print(qty_tier)

##    Product CustomerTier Quantity
## 1 Keyboard          Low        6
## 2   Laptop          Low        1
## 3    Mouse          Low        3
## 4 Keyboard       Medium        8
## 5    Mouse       Medium        9
## 6   Laptop         High        5

# Pivot table dengan xtabs (columns: tiers in order Low, Med, High)

qty_table <- xtabs(Quantity ~ Product + CustomerTier, data = qty_tier)

# Verifikasi matriks

print(qty_table)

##           CustomerTier
## Product    Low Medium High
##   Laptop     1      0    5
##   Mouse      3      9    0
##   Keyboard   6      8    0

# Display Stacked barplot  

barplot(
  qty_table,  # Matrix: rows=products (x-axis), columns=tiers (stacks)
  beside = FALSE,  # Stacked: tumpuk tiers dalam 1 bar per product
  names.arg = rownames(qty_table),  # X-labels: "Laptop", "Mouse", "Keyboard"
  col = c("lightcoral", "lightgreen", "lightblue"),  # Stacks: Low (bawah, coral), Medium (green), High (atas, blue)
  border = "black",  # Border antar stack untuk clarity
  main = "Quantity Sold per Product by Customer Tier (Stacked)",
  xlab = "Product",
  ylab = "Total Quantity Sold",
  las = 1,  # Labels horizontal
  ylim = c(0, max(rowSums(qty_table)) + 2),  
  legend.text = colnames(qty_table),  # Legend: Low, Medium, High
  args.legend = list(
    x = "topleft", 
    bty = "n",  # No box
    cex = 0.9,  # Ukuran legend
    title = "Customer Tier",  # Judul legend
    inset = c(0.05,0)
    )
)

5 Exercise 5

Create Your Own Data Frame:

Objective: Create a data frame in R with 30 rows containing a mix of data types: continuous, discrete, nominal, and ordinal.

5.1 Instructions

Open RStudio or the R console.
Create a vector for each column in your data frame:
- Date: 30 dates (can be sequential or random within a month/year)
- Continuous: numeric values that can take decimal values (e.g., height, weight, temperature)
- Discrete: numeric values that can only take whole numbers (e.g., number of items, number of vehicles)
- Nominal: categorical values with no order (e.g., color, gender, city)
- Ordinal: categorical values with a defined order (e.g., Low, Medium, High; Beginner, Intermediate, Expert)
Combine all vectors into a data frame called my_data.
Check your data frame using head() or View() to ensure it has 30 rows and the columns are correct.
Optional tasks:
- Summarize each column using summary()
- Count the frequency of each category for Nominal and Ordinal columns using table()

5.2 Hints

Use seq.Date() or as.Date() to generate the Date column.
Use runif() or rnorm() for continuous numeric data.
Use sample() for discrete, nominal, and ordinal data.
Ensure the ordinal vector is created with factor(..., levels = c("Low","Medium","High"), ordered = TRUE) (or similar).

# Exercise 5: Create Your Own Data Frame

# Load package
library(reactable)

# Date: 30 sequential dates in October 2025
Date <- seq.Date(from = as.Date("2025-10-01"), by = "day", length.out = 30)

# Continuous variable
Temperature <- runif(30, min = 25.0, max = 35.0)

# Discrete variable
Items_Sold <- sample(1:20, size = 30, replace = TRUE)

# Nominal variable 
City <- sample(c("Jakarta", "Bandung", "Surabaya", "Medan", "Cikarang"), size = 30, replace = TRUE)

# Ordinal variable
Satisfaction <- factor(
  sample(c("Low", "Medium", "High"), size = 30, replace = TRUE),
  levels = c("Low", "Medium", "High"),
  ordered = TRUE
)

# Combine all columns into one data frame 

my_data <- data.frame(Date, Temperature, Items_Sold, City, Satisfaction)
my_data$Satisfaction_order <- as.numeric(my_data$Satisfaction)

# Display interactive table using reactable 
reactable( my_data,  
  sortable = FALSE,
  filterable = TRUE,
  searchable = FALSE,
  bordered = TRUE,
  striped = TRUE,
  highlight = TRUE,
  defaultPageSize = 10,
  columns = list(
    Date = colDef(format = colFormat(date = TRUE)),  
    Temperature = colDef(format = colFormat(digits = 2)),
    Satisfaction_order = colDef(format = colFormat(digits = 0)),  
    Satisfaction = colDef(
      style = function(value) {
        color <- switch(as.character(value),
                        "Low" = "red",
                        "Medium" = "orange",
                        "High" = "green")
        list(background = color, color = "black", fontWeight = "bold")
      }
    )
  ),
  defaultColDef = colDef(align = "center")
)

# Optional: Summary

summary(my_data)

##       Date             Temperature      Items_Sold       City          
##  Min.   :2025-10-01   Min.   :25.39   Min.   : 1.0   Length:30         
##  1st Qu.:2025-10-08   1st Qu.:27.53   1st Qu.: 7.0   Class :character  
##  Median :2025-10-15   Median :29.74   Median :12.0   Mode  :character  
##  Mean   :2025-10-15   Mean   :29.96   Mean   :11.2                     
##  3rd Qu.:2025-10-22   3rd Qu.:32.26   3rd Qu.:17.0                     
##  Max.   :2025-10-30   Max.   :34.31   Max.   :19.0                     
##  Satisfaction Satisfaction_order
##  Low   :11    Min.   :1.000     
##  Medium: 9    1st Qu.:1.000     
##  High  :10    Median :2.000     
##               Mean   :1.967     
##               3rd Qu.:3.000     
##               Max.   :3.000

# Frequency tables

cat("Frequency of City (Nominal):\n")

## Frequency of City (Nominal):

print(table(my_data$City))

## 
##  Bandung Cikarang  Jakarta    Medan Surabaya 
##        6        3       11        4        6

cat("\nFrequency of Satisfaction (Ordinal):\n")

## 
## Frequency of Satisfaction (Ordinal):

print(table(my_data$Satisfaction))

## 
##    Low Medium   High 
##     11      9     10