Data Exploration

Excercises ~ Week 2

Logo


1 Exercise 1

The following table shows sample information for three students. Each observation represents a single student and includes details such as their unique student ID, name, age, total credits completed, major field of study, and year level.

This dataset demonstrates a mixture of variable types:

  • Nominal: StudentID, Name, Major
  • Numeric: Age (continuous), CreditsCompleted (discrete)
  • Ordinal: YearLevel (Freshman → Senior)
StudentID Name Age CreditsCompleted Major YearLevel
S001 Alice 20 45 Data Sains Sophomore
S002 Budi 21 60 Mathematics Junior
S003 Citra 19 30 Statistics Freshman
# 1. Create vectors for each variable
StudentID <- c("S001", "S002", "S003")       # Nominal / ID
Name <- c("Alice", "Budi", "Citra")          # Nominal / Name
Age <- c(20, 21, 19)                         # Numeric / Continuous
CreditsCompleted <- c(45, 60, 30)            # Numeric / Discrete

# Nominal
Major <- c("Data Sains", "Mathematics", "Statistics")  

# Ordinal
YearLevel <- factor(c("Sophomore", "Junior", "Freshman"),
                    levels = c("Freshman","Sophomore","Junior","Senior"),
                    ordered = TRUE)          

# 2. Combine all vectors into a data frame
students <- data.frame(
  StudentID, Name, Age, CreditsCompleted, Major, YearLevel,
  stringsAsFactors = FALSE
)

# 3. Display the data frame
print(students)
##   StudentID  Name Age CreditsCompleted       Major YearLevel
## 1      S001 Alice  20               45  Data Sains Sophomore
## 2      S002  Budi  21               60 Mathematics    Junior
## 3      S003 Citra  19               30  Statistics  Freshman

2 Exercise 2

Identify Data Types: Determine the type of data for each of the following variables:

library(knitr)

# Create a data frame for Data Types
variables_info <- data.frame(
  No = 1:5,
  Variable = c(
    "Number of vehicles passing through the toll road each day",
    "Student height in cm",
    "Employee gender (Male / Female)",
    "Customer satisfaction level: Low, Medium, High",
    "Respondent's favorite color: Red, Blue, Green"
  ),
  DataType = c(
    "Numeric",
    "Numeric",
    "Categorical",
    "Categorical",
    "Categorical"
  ),
  Subtype = c(
    "Discrete",
    "Continuous",
    "Nominal",
    "Ordinal",
    "Nominal"
  ),
  stringsAsFactors = FALSE
)

# Display the data frame as a neat table
kable(variables_info, 
      caption = "Table of Variables and Data Types")
Table of Variables and Data Types
No Variable DataType Subtype
1 Number of vehicles passing through the toll road each day Numeric Discrete
2 Student height in cm Numeric Continuous
3 Employee gender (Male / Female) Categorical Nominal
4 Customer satisfaction level: Low, Medium, High Categorical Ordinal
5 Respondent’s favorite color: Red, Blue, Green Categorical Nominal

3 Exercise 3

Classify Data Sources: Determine whether the following data comes from internal or external sources, and whether it is structured or unstructured:

# Install DT package if not already installed
# install.packages("DT")
library(DT)

# Create a data frame for data sources 
data_sources <- data.frame(
  No = 1:4,
  DataSource = c(
    "Daily sales transaction data of the company",
    "Weather reports from BMKG",
    "Product reviews on social media",
    "Warehouse inventory reports"
  ),
  Internal_External = c(
    "Internal Source",
    "External Source",
    "External Source",
    "Internal Source"
  ),
  Structured_Unstructured = c(
    "Structured",
    "Structured",
    "Unstructured",
    "Structured"
  ),
  stringsAsFactors = FALSE
)

# Display the data frame as a neat table
datatable(data_sources, 
          caption = "Table of Data Sources",
          rownames = FALSE) # hides the index column

4 Exercise 4

Dataset Structure: Consider the following transaction table:

Date Qty Price Product CustomerTier
2025-10-01 2 1000 Laptop High
2025-10-01 5 20 Mouse Medium
2025-10-02 1 1000 Laptop Low
2025-10-02 3 30 Keyboard Medium
2025-10-03 4 50 Mouse Medium
2025-10-03 2 1000 Laptop High
2025-10-04 6 25 Keyboard Low
2025-10-04 1 1000 Laptop High
2025-10-05 3 40 Mouse Low
2025-10-05 5 10 Keyboard Medium

Your Assignment Instructions: Creating a Transactions Table above in R

  1. Create a data frame in R called transactions containing the data above.

  2. Identify which variables are numeric and which are categorical

  3. Calculate total revenue for each transaction by multiplying Qty × Price and add it as a new column Total.

  4. Compute summary statistics:

    • Total quantity sold for each product
    • Total revenue per product
    • Average price per product
  5. Visualize the data:

    • Create a barplot showing total quantity sold per product.
    • Create a pie chart showing the proportion of total revenue per customer tier.
  6. Optional Challenge:

    • Find which date had the highest total revenue.
    • Create a stacked bar chart showing quantity sold per product by customer tier.

Hints: Use data.frame(), aggregate(), barplot(), pie(), and basic arithmetic operations in R.

# Load Package
library(kableExtra)
library(DT)
library(htmltools)

# Create Data Frame Transactions

transactions <- data.frame(
  No = 1:10, 
  Date = as.Date(c(
    "2025-10-01", "2025-10-01", "2025-10-02", "2025-10-02",
    "2025-10-03", "2025-10-03", "2025-10-04", "2025-10-04",
    "2025-10-05", "2025-10-05"
  )),
  
  Quantity = c(2, 5, 1, 3, 4, 2, 6, 1, 3, 5),
  
  Price = c(1000, 20, 1000, 30, 50, 1000, 25, 1000, 40, 10),
  
  Product = factor(c(
          "Laptop", "Mouse", "Laptop", "Keyboard", "Mouse",
          "Laptop", "Keyboard", "Laptop", "Mouse", "Keyboard")),
  
  CustomerTier = factor(c(
               "High", "Medium", "Low", "Medium", "Medium",
               "High", "Low", "High", "Low", "Medium"),
        levels = c("Low","Medium","High"), ordered = TRUE),
  stringsAsFactors = FALSE
)


transactions$Total <- transactions$Quantity * transactions$Price

# Identify data types (numeric & categorical)

str(transactions)
## 'data.frame':    10 obs. of  7 variables:
##  $ No          : int  1 2 3 4 5 6 7 8 9 10
##  $ Date        : Date, format: "2025-10-01" "2025-10-01" ...
##  $ Quantity    : num  2 5 1 3 4 2 6 1 3 5
##  $ Price       : num  1000 20 1000 30 50 1000 25 1000 40 10
##  $ Product     : Factor w/ 3 levels "Keyboard","Laptop",..: 2 3 2 1 3 2 1 2 3 1
##  $ CustomerTier: Ord.factor w/ 3 levels "Low"<"Medium"<..: 3 2 1 2 2 3 1 3 1 2
##  $ Total       : num  2000 100 1000 90 200 2000 150 1000 120 50
# Penjelasan: 
# - Numeric: Quantity (discrete), Price (continuous), Total (continuous)
# - Categorical: Date (date), Product (nominal factor), CustomerTier (ordinal factor)


# Display transactions table

datatable(
  transactions,
  caption = tags$caption(
    style = "caption-side:  top;
             text-align: center;
             font-weight: bold;",
            "Table Transactions"),
  options = list(pageLength = 5, autoWidth = TRUE)
)
# Menghitung Summary statistik

# Total quantity sold per product

total_qty <- aggregate(Quantity ~ Product, data = transactions, sum)
total_qty <- total_qty[order(as.numeric(total_qty$Product)), ]  # Reorder berdasarkan levels faktor

datatable(
  total_qty,
  caption = tags$caption(
    style = "caption-side: top;
             text-align: center;
             font-weight: bold;",
            "Total Quantity Sold per Product"
    ),
  options = list(pageLength = 5, autoWidth = TRUE)
)
# Total revenue per product

total_revenue <- aggregate(Total ~ Product, data = transactions, sum)
total_revenue <- total_revenue[order(as.numeric(total_revenue$Product)), ]

datatable(
  total_revenue,
  caption = tags$caption(
    style = "caption-side: top;
             text-align: center;
             font-weight: bold;",
            "Total Revenue"
    ),
  options = list(pageLength = 5, autoWidth = TRUE)
)
# Average price per product

avg_price <- aggregate(Price ~ Product, data = transactions, mean)
avg_price <- avg_price[order(as.numeric(avg_price$Product)), ]
avg_price$Price <- round(avg_price$Price, 2)                   # Round untuk tampilan rapi

datatable(
  avg_price,
  caption = tags$caption(
    style = "caption-side: top;
             text-align: center;
             font-weight: bold;",
            "Average Price"
    ),
  options = list(pageLength = 5, autoWidth = TRUE)
)
# Optional Challange : Find the date with the highest total revenue

revenue_date <- aggregate(Total ~ Date, data = transactions, sum)
highest_date <- revenue_date$Date[which.max(revenue_date$Total)]
highest_revenue <- max(revenue_date$Total)

cat("Highest revenue date:", as.character(highest_date), "(Total: $", max(revenue_date$Total), ")\n")
## Highest revenue date: 2025-10-03 (Total: $ 2200 )
datatable(
  revenue_date,
  caption = tags$caption(
    style = "caption-side: top;
             text-align: center;
             font-weight: bold;",
            "Highest revenue date:"
    ),
  options = list(pageLength = 5, autoWidth = TRUE)
)
# Memperlihatkan / Visualize Data

# Barplot total quantity sold per product

barplot(
  total_qty$Quantity,
  names.arg = levels(total_qty$Product),
  col = c("lightblue", "lightgreen", "lightcoral"),
  main = "Total Quantity Sold per Product",
  xlab = "Product",
  ylab = "Total Quantity"
)

# Pie chart total revenue per customer tier

revenue_tier <- aggregate(Total ~ CustomerTier, data = transactions, sum)
revenue_tier <- revenue_tier[order(as.numeric(revenue_tier$CustomerTier)), ]

# Calculate percentages

total_rev <- sum(revenue_tier$Total)
pct <- round(revenue_tier$Total / total_rev * 100, 1)           # Round to 1 decimal place

# Labels dengan persentase

labels <- paste0(revenue_tier$CustomerTier, " (", pct, "%)")

# Pie Chart

pie(
  revenue_tier$Total,
  labels = labels,
  clockwise = TRUE,                                             # Urutan searah jarum jam untuk standar
  main = "Proportion of Total Revenue per Customer Tier",
  col = c("lightblue", "lightgreen", "lightcoral")
)

# Optional Challenge : Stacked bar chart (quantity per product by tier)

qty_tier <- aggregate(Quantity ~ Product + CustomerTier, data = transactions, FUN = sum)

# Re-factor untuk pastikan urutan

qty_tier$Product <- factor(qty_tier$Product, levels = c(
                                           "Laptop",
                                           "Mouse",
                                           "Keyboard")
                           )
qty_tier$CustomerTier <- factor(qty_tier$CustomerTier, levels = c(
                                           "Low",
                                           "Medium",
                                           "High"),
                                ordered = TRUE
                           )

# Debug qty tier

print(qty_tier)
##    Product CustomerTier Quantity
## 1 Keyboard          Low        6
## 2   Laptop          Low        1
## 3    Mouse          Low        3
## 4 Keyboard       Medium        8
## 5    Mouse       Medium        9
## 6   Laptop         High        5
# Pivot table dengan xtabs (columns: tiers in order Low, Med, High)

qty_table <- xtabs(Quantity ~ Product + CustomerTier, data = qty_tier)

# Verifikasi matriks

print(qty_table)
##           CustomerTier
## Product    Low Medium High
##   Laptop     1      0    5
##   Mouse      3      9    0
##   Keyboard   6      8    0
# Display Stacked barplot  

barplot(
  qty_table,  # Matrix: rows=products (x-axis), columns=tiers (stacks)
  beside = FALSE,  # Stacked: tumpuk tiers dalam 1 bar per product
  names.arg = rownames(qty_table),  # X-labels: "Laptop", "Mouse", "Keyboard"
  col = c("lightcoral", "lightgreen", "lightblue"),  # Stacks: Low (bawah, coral), Medium (green), High (atas, blue)
  border = "black",  # Border antar stack untuk clarity
  main = "Quantity Sold per Product by Customer Tier (Stacked)",
  xlab = "Product",
  ylab = "Total Quantity Sold",
  las = 1,  # Labels horizontal
  ylim = c(0, max(rowSums(qty_table)) + 2),  
  legend.text = colnames(qty_table),  # Legend: Low, Medium, High
  args.legend = list(
    x = "topleft", 
    bty = "n",  # No box
    cex = 0.9,  # Ukuran legend
    title = "Customer Tier",  # Judul legend
    inset = c(0.05,0)
    )
)


5 Exercise 5

Create Your Own Data Frame:

Objective: Create a data frame in R with 30 rows containing a mix of data types: continuous, discrete, nominal, and ordinal.

5.1 Instructions

  1. Open RStudio or the R console.

  2. Create a vector for each column in your data frame:

    • Date: 30 dates (can be sequential or random within a month/year)
    • Continuous: numeric values that can take decimal values (e.g., height, weight, temperature)
    • Discrete: numeric values that can only take whole numbers (e.g., number of items, number of vehicles)
    • Nominal: categorical values with no order (e.g., color, gender, city)
    • Ordinal: categorical values with a defined order (e.g., Low, Medium, High; Beginner, Intermediate, Expert)
  3. Combine all vectors into a data frame called my_data.

  4. Check your data frame using head() or View() to ensure it has 30 rows and the columns are correct.

  5. Optional tasks:

    • Summarize each column using summary()
    • Count the frequency of each category for Nominal and Ordinal columns using table()

5.2 Hints

  • Use seq.Date() or as.Date() to generate the Date column.
  • Use runif() or rnorm() for continuous numeric data.
  • Use sample() for discrete, nominal, and ordinal data.
  • Ensure the ordinal vector is created with factor(..., levels = c("Low","Medium","High"), ordered = TRUE) (or similar).
# Exercise 5: Create Your Own Data Frame

# Load package
library(reactable)

# Date: 30 sequential dates in October 2025
Date <- seq.Date(from = as.Date("2025-10-01"), by = "day", length.out = 30)

# Continuous variable
Temperature <- runif(30, min = 25.0, max = 35.0)

# Discrete variable
Items_Sold <- sample(1:20, size = 30, replace = TRUE)

# Nominal variable 
City <- sample(c("Jakarta", "Bandung", "Surabaya", "Medan", "Cikarang"), size = 30, replace = TRUE)

# Ordinal variable
Satisfaction <- factor(
  sample(c("Low", "Medium", "High"), size = 30, replace = TRUE),
  levels = c("Low", "Medium", "High"),
  ordered = TRUE
)

# Combine all columns into one data frame 

my_data <- data.frame(Date, Temperature, Items_Sold, City, Satisfaction)
my_data$Satisfaction_order <- as.numeric(my_data$Satisfaction)

# Display interactive table using reactable 
reactable( my_data,  
  sortable = FALSE,
  filterable = TRUE,
  searchable = FALSE,
  bordered = TRUE,
  striped = TRUE,
  highlight = TRUE,
  defaultPageSize = 10,
  columns = list(
    Date = colDef(format = colFormat(date = TRUE)),  
    Temperature = colDef(format = colFormat(digits = 2)),
    Satisfaction_order = colDef(format = colFormat(digits = 0)),  
    Satisfaction = colDef(
      style = function(value) {
        color <- switch(as.character(value),
                        "Low" = "red",
                        "Medium" = "orange",
                        "High" = "green")
        list(background = color, color = "black", fontWeight = "bold")
      }
    )
  ),
  defaultColDef = colDef(align = "center")
)
# Optional: Summary

summary(my_data)
##       Date             Temperature      Items_Sold       City          
##  Min.   :2025-10-01   Min.   :25.39   Min.   : 1.0   Length:30         
##  1st Qu.:2025-10-08   1st Qu.:27.53   1st Qu.: 7.0   Class :character  
##  Median :2025-10-15   Median :29.74   Median :12.0   Mode  :character  
##  Mean   :2025-10-15   Mean   :29.96   Mean   :11.2                     
##  3rd Qu.:2025-10-22   3rd Qu.:32.26   3rd Qu.:17.0                     
##  Max.   :2025-10-30   Max.   :34.31   Max.   :19.0                     
##  Satisfaction Satisfaction_order
##  Low   :11    Min.   :1.000     
##  Medium: 9    1st Qu.:1.000     
##  High  :10    Median :2.000     
##               Mean   :1.967     
##               3rd Qu.:3.000     
##               Max.   :3.000
# Frequency tables

cat("Frequency of City (Nominal):\n")
## Frequency of City (Nominal):
print(table(my_data$City))
## 
##  Bandung Cikarang  Jakarta    Medan Surabaya 
##        6        3       11        4        6
cat("\nFrequency of Satisfaction (Ordinal):\n")
## 
## Frequency of Satisfaction (Ordinal):
print(table(my_data$Satisfaction))
## 
##    Low Medium   High 
##     11      9     10