Task 3 - Data Exploration/Discovery/Profiling

Introduction

This R Markdown was written to explain how I did the tasks I did, and to prove that I did all the tasks myself.

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.2.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Load data csv files

For all my tasks, I first need to import all the csv files in the corresponding dataset.

products <- read.csv("C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/products.csv", header = TRUE)

orders <- read.csv("C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/orders.csv", header = TRUE)

customers <- read.csv("C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/customers.csv", header = TRUE)

sales <- read.csv("C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/sales.csv", header = TRUE)

A - Metadata Analysis

# Get the data types of each column
data_types <- sapply(products, class)

# Estimate the data format of each column
data_formats <- ifelse(data_types == "numeric", "Decimal", "Text")

# Determine the size of each column (maximum character length or maximum numeric value)
field_sizes <- sapply(products, function(col) {
  if (is.numeric(col)) {
    max(col, na.rm = TRUE)
  } else {
    max(nchar(as.character(col)), na.rm = TRUE)
  }
})

# Get the first value of each column (example)
examples <- sapply(products, function(col) head(col, 1))

# Convert the results into a data frame
metadata_df <- data.frame(
  Field_Name = names(products),
  Data_Type = data_types,
  Data_Format = data_formats,
  Field_Size = field_sizes,
  Example = examples
)

# Print the results
print(metadata_df)

##                Field_Name Data_Type Data_Format Field_Size
## product_ID     product_ID   integer        Text       1259
## product_type product_type character        Text          8
## product_name product_name character        Text         17
## size                 size character        Text          2
## colour             colour character        Text          6
## price               price   integer        Text        119
## quantity         quantity   integer        Text         80
## description   description character        Text         55
##                                                   Example
## product_ID                                              0
## product_type                                        Shirt
## product_name                                 Oxford Cloth
## size                                                   XS
## colour                                                red
## price                                                 114
## quantity                                               66
## description  A red coloured, XS sized, Oxford Cloth Shirt

# Save this data frame to a CSV file:
write.csv(metadata_df, "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/metadata_analysis_products.csv", row.names = FALSE)

# Get the data types of each column
data_types <- sapply(orders, class)

# Estimate the data format of each column
data_formats <- ifelse(data_types == "numeric", "Decimal", "Date/Text")

# Determine the size of each column (maximum character length or maximum numeric value)
field_sizes <- sapply(orders, function(col) {
  if (is.numeric(col)) {
    max(col, na.rm = TRUE)
  } else {
    max(nchar(as.character(col)), na.rm = TRUE)
  }
})

# Get the first value of each column (example)
examples <- sapply(orders, function(col) head(col, 1))

# Convert the results into a data frame
metadata_df <- data.frame(
  Field_Name = names(orders),
  Data_Type = data_types,
  Data_Format = data_formats,
  Field_Size = field_sizes,
  Example = examples
)

# Print the results
print(metadata_df)

##                  Field_Name Data_Type Data_Format Field_Size    Example
## order_id           order_id   integer   Date/Text       1000          1
## customer_id     customer_id   integer   Date/Text       1000         64
## payment             payment   integer   Date/Text      59910      30811
## order_date       order_date character   Date/Text         10  2021-8-30
## delivery_date delivery_date character   Date/Text         10 2021-09-24

# Save this data frame to a CSV file:
write.csv(metadata_df, "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/metadata_analysis_orders.csv", row.names = FALSE)

# Get the data types of each column
data_types <- sapply(customers, class)

# Estimate the data format of each column
data_formats <- ifelse(data_types == "numeric", "Decimal", "Text")

# Determine the size of each column (maximum character length or maximum numeric value)
field_sizes <- sapply(customers, function(col) {
  if (is.numeric(col)) {
    max(col, na.rm = TRUE)
  } else {
    max(nchar(as.character(col)), na.rm = TRUE)
  }
})

# Get the first value of each column (example)
examples <- sapply(customers, function(col) head(col, 1))

# Convert the results into a data frame
metadata_df <- data.frame(
  Field_Name = names(customers),
  Data_Type = data_types,
  Data_Format = data_formats,
  Field_Size = field_sizes,
  Example = examples
)

# Print the results
print(metadata_df)

##                  Field_Name Data_Type Data_Format Field_Size
## customer_id     customer_id   integer        Text       1000
## customer_name customer_name character        Text         23
## gender               gender character        Text         11
## age                     age   integer        Text         80
## home_address   home_address character        Text         33
## zip_code           zip_code   integer        Text       9998
## city                   city character        Text         22
## state                 state character        Text         28
## country             country character        Text          9
##                                      Example
## customer_id                                1
## customer_name                  Leanna Busson
## gender                                Female
## age                                       30
## home_address  8606 Victoria TerraceSuite 560
## zip_code                                5464
## city                           Johnstonhaven
## state                     Northern Territory
## country                            Australia

# Save this data frame to a CSV file:
write.csv(metadata_df, "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/metadata_analysis_customers.csv", row.names = FALSE)

# Get the data types of each column
data_types <- sapply(sales, class)

# Estimate the data format of each column
data_formats <- ifelse(data_types == "numeric", "Decimal", "Text")

# Determine the size of each column (maximum character length or maximum numeric value)
field_sizes <- sapply(sales, function(col) {
  if (is.numeric(col)) {
    max(col, na.rm = TRUE)
  } else {
    max(nchar(as.character(col)), na.rm = TRUE)
  }
})

# Get the first value of each column (example)
examples <- sapply(sales, function(col) head(col, 1))

# Convert the results into a data frame
metadata_df <- data.frame(
  Field_Name = names(sales),
  Data_Type = data_types,
  Data_Format = data_formats,
  Field_Size = field_sizes,
  Example = examples
)

# Print the results
print(metadata_df)

##                    Field_Name Data_Type Data_Format Field_Size Example
## sales_id             sales_id   integer        Text       4999       0
## order_id             order_id   integer        Text        999       1
## product_id         product_id   integer        Text       1259     218
## price_per_unit price_per_unit   integer        Text        119     106
## quantity             quantity   integer        Text          3       2
## total_price       total_price   integer        Text        357     212

# Save this data frame to a CSV file:
write.csv(metadata_df, "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/metadata_analysis_sales.csv", row.names = FALSE)

B - Profiling Data

data <- read.csv("products.csv")

# Descriptive statistics summary
summary(data)

##    product_ID     product_type       product_name           size          
##  Min.   :   0.0   Length:1260        Length:1260        Length:1260       
##  1st Qu.: 314.8   Class :character   Class :character   Class :character  
##  Median : 629.5   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 629.5                                                           
##  3rd Qu.: 944.2                                                           
##  Max.   :1259.0                                                           
##     colour              price           quantity     description       
##  Length:1260        Min.   : 90.00   Min.   :40.00   Length:1260       
##  Class :character   1st Qu.: 95.75   1st Qu.:50.00   Class :character  
##  Mode  :character   Median :108.50   Median :60.00   Mode  :character  
##                     Mean   :105.81   Mean   :60.15                     
##                     3rd Qu.:114.00   3rd Qu.:70.00                     
##                     Max.   :119.00   Max.   :80.00

# Count of distinct values
distinct_counts <- sapply(data, function(x) length(unique(x)))
print(distinct_counts)

##   product_ID product_type product_name         size       colour        price 
##         1260            3           35            5            7           26 
##     quantity  description 
##           41         1260

# Percentage of 0/blank/null values
percent_zero <- sapply(data, function(x) {
  if (is.numeric(x)) {
    return(mean(x == 0, na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(percent_zero)

##   product_ID product_type product_name         size       colour        price 
## 0.0007936508           NA           NA           NA           NA 0.0000000000 
##     quantity  description 
## 0.0000000000           NA

percent_blank <- sapply(data, function(x) {
  if (is.character(x)) {
    return(mean(x == "", na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(percent_blank)

##   product_ID product_type product_name         size       colour        price 
##           NA            0            0            0            0           NA 
##     quantity  description 
##           NA            0

percent_null <- sapply(data, function(x) mean(is.na(x)))
print(percent_null)

##   product_ID product_type product_name         size       colour        price 
##            0            0            0            0            0            0 
##     quantity  description 
##            0            0

# Summation of values
sums <- sapply(data, function(x) {
  if (is.numeric(x)) {
    return(sum(x, na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(sums)

##   product_ID product_type product_name         size       colour        price 
##       793170           NA           NA           NA           NA       133315 
##     quantity  description 
##        75789           NA

# Value distribution (visualized with histograms)
lapply(data, function(x) {
  if(is.numeric(x)) {
    hist(x, main=paste("Histogram of", deparse(substitute(x))))
  }
})

## $product_ID
## $breaks
##  [1]    0  100  200  300  400  500  600  700  800  900 1000 1100 1200 1300
## 
## $counts
##  [1] 101 100 100 100 100 100 100 100 100 100 100 100  59
## 
## $density
##  [1] 0.0008015873 0.0007936508 0.0007936508 0.0007936508 0.0007936508
##  [6] 0.0007936508 0.0007936508 0.0007936508 0.0007936508 0.0007936508
## [11] 0.0007936508 0.0007936508 0.0004682540
## 
## $mids
##  [1]   50  150  250  350  450  550  650  750  850  950 1050 1150 1250
## 
## $xname
## [1] "x"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $product_type
## NULL
## 
## $product_name
## NULL
## 
## $size
## NULL
## 
## $colour
## NULL
## 
## $price
## $breaks
##  [1]  90  92  94  96  98 100 102 104 106 108 110 112 114 116 118 120
## 
## $counts
##  [1] 175  70 105  35  70  35  35  70  35  70 105 175 105  70 105
## 
## $density
##  [1] 0.06944444 0.02777778 0.04166667 0.01388889 0.02777778 0.01388889
##  [7] 0.01388889 0.02777778 0.01388889 0.02777778 0.04166667 0.06944444
## [13] 0.04166667 0.02777778 0.04166667
## 
## $mids
##  [1]  91  93  95  97  99 101 103 105 107 109 111 113 115 117 119
## 
## $xname
## [1] "x"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $quantity
## $breaks
## [1] 40 45 50 55 60 65 70 75 80
## 
## $counts
## [1] 175 150 153 162 165 149 155 151
## 
## $density
## [1] 0.02777778 0.02380952 0.02428571 0.02571429 0.02619048 0.02365079 0.02460317
## [8] 0.02396825
## 
## $mids
## [1] 42.5 47.5 52.5 57.5 62.5 67.5 72.5 77.5
## 
## $xname
## [1] "x"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $description
## NULL

# Mean and standard deviation
means <- sapply(data, function(x) {
  if (is.numeric(x)) {
    return(mean(x, na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(means)

##   product_ID product_type product_name         size       colour        price 
##     629.5000           NA           NA           NA           NA     105.8056 
##     quantity  description 
##      60.1500           NA

sds <- sapply(data, function(x) {
  if (is.numeric(x)) {
    return(sd(x, na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(sds)

##   product_ID product_type product_name         size       colour        price 
##   363.874979           NA           NA           NA           NA     9.704423 
##     quantity  description 
##    11.670573           NA

# Check for outliers using IQR
is_outlier <- function(x) {
  Q1 <- quantile(x, 0.25)
  Q3 <- quantile(x, 0.75)
  IQR <- Q3 - Q1
  return(x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR))
}

data_outliers <- lapply(data, function(x) {
  if(is.numeric(x)) {
    return(is_outlier(x))
  } else {
    return(rep(FALSE, length(x)))
  }
})

# Check for the number of outliers
lapply(data_outliers, function(x) sum(x, na.rm=TRUE))

## $product_ID
## [1] 0
## 
## $product_type
## [1] 0
## 
## $product_name
## [1] 0
## 
## $size
## [1] 0
## 
## $colour
## [1] 0
## 
## $price
## [1] 0
## 
## $quantity
## [1] 0
## 
## $description
## [1] 0

# Save the results into a data frame
results <- data.frame(
  Fields = names(data),
  Distinct_Counts = distinct_counts,
  Percent_Zero = percent_zero,
  Percent_Blank = percent_blank,
  Percent_Null = percent_null,
  Sums = sums,
  Means = means,
  SDs = sds,
  Number_of_Outliers = unlist(lapply(data_outliers, function(x) sum(x, na.rm=TRUE)))
)

# Save the results data frame into a CSV file
write.csv(results, "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/profiling_data_products.csv", row.names=FALSE)

data <- read.csv("orders.csv")

# Descriptive statistics summary
summary(data)

##     order_id       customer_id        payment       order_date       
##  Min.   :   1.0   Min.   :   1.0   Min.   :10043   Length:1000       
##  1st Qu.: 250.8   1st Qu.: 275.2   1st Qu.:21329   Class :character  
##  Median : 500.5   Median : 515.0   Median :33698   Mode  :character  
##  Mean   : 500.5   Mean   : 506.6   Mean   :33973                     
##  3rd Qu.: 750.2   3rd Qu.: 737.2   3rd Qu.:46249                     
##  Max.   :1000.0   Max.   :1000.0   Max.   :59910                     
##  delivery_date     
##  Length:1000       
##  Class :character  
##  Mode  :character  
##                    
##                    
##

# Count of distinct values
distinct_counts <- sapply(data, function(x) length(unique(x)))
print(distinct_counts)

##      order_id   customer_id       payment    order_date delivery_date 
##          1000           617           995           291           305

# Percentage of 0/blank/null values
percent_zero <- sapply(data, function(x) {
  if (is.numeric(x)) {
    return(mean(x == 0, na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(percent_zero)

##      order_id   customer_id       payment    order_date delivery_date 
##             0             0             0            NA            NA

percent_blank <- sapply(data, function(x) {
  if (is.character(x)) {
    return(mean(x == "", na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(percent_blank)

##      order_id   customer_id       payment    order_date delivery_date 
##            NA            NA            NA             0             0

percent_null <- sapply(data, function(x) mean(is.na(x)))
print(percent_null)

##      order_id   customer_id       payment    order_date delivery_date 
##             0             0             0             0             0

# Summation of values
sums <- sapply(data, function(x) {
  if (is.numeric(x)) {
    return(sum(x, na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(sums)

##      order_id   customer_id       payment    order_date delivery_date 
##        500500        506640      33972936            NA            NA

# Value distribution (visualized with histograms)
lapply(data, function(x) {
  if(is.numeric(x)) {
    hist(x, main=paste("Histogram of", deparse(substitute(x))))
  }
})

## $order_id
## $breaks
##  [1]    0  100  200  300  400  500  600  700  800  900 1000
## 
## $counts
##  [1] 100 100 100 100 100 100 100 100 100 100
## 
## $density
##  [1] 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001
## 
## $mids
##  [1]  50 150 250 350 450 550 650 750 850 950
## 
## $xname
## [1] "x"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $customer_id
## $breaks
##  [1]    0  100  200  300  400  500  600  700  800  900 1000
## 
## $counts
##  [1]  96  79  94  93 124 109 116 113  92  84
## 
## $density
##  [1] 0.00096 0.00079 0.00094 0.00093 0.00124 0.00109 0.00116 0.00113 0.00092
## [10] 0.00084
## 
## $mids
##  [1]  50 150 250 350 450 550 650 750 850 950
## 
## $xname
## [1] "x"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $payment
## $breaks
##  [1] 10000 15000 20000 25000 30000 35000 40000 45000 50000 55000 60000
## 
## $counts
##  [1] 116 111  99 101  92 106 104  93  89  89
## 
## $density
##  [1] 2.32e-05 2.22e-05 1.98e-05 2.02e-05 1.84e-05 2.12e-05 2.08e-05 1.86e-05
##  [9] 1.78e-05 1.78e-05
## 
## $mids
##  [1] 12500 17500 22500 27500 32500 37500 42500 47500 52500 57500
## 
## $xname
## [1] "x"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $order_date
## NULL
## 
## $delivery_date
## NULL

# Mean and standard deviation
means <- sapply(data, function(x) {
  if (is.numeric(x)) {
    return(mean(x, na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(means)

##      order_id   customer_id       payment    order_date delivery_date 
##        500.50        506.64      33972.94            NA            NA

sds <- sapply(data, function(x) {
  if (is.numeric(x)) {
    return(sd(x, na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(sds)

##      order_id   customer_id       payment    order_date delivery_date 
##      288.8194      277.1155    14451.6090            NA            NA

# Check for outliers using IQR
is_outlier <- function(x) {
  Q1 <- quantile(x, 0.25)
  Q3 <- quantile(x, 0.75)
  IQR <- Q3 - Q1
  return(x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR))
}

data_outliers <- lapply(data, function(x) {
  if(is.numeric(x)) {
    return(is_outlier(x))
  } else {
    return(rep(FALSE, length(x)))
  }
})

# Check for the number of outliers
lapply(data_outliers, function(x) sum(x, na.rm=TRUE))

## $order_id
## [1] 0
## 
## $customer_id
## [1] 0
## 
## $payment
## [1] 0
## 
## $order_date
## [1] 0
## 
## $delivery_date
## [1] 0

# Save the results into a data frame
results <- data.frame(
  Fields = names(data),
  Distinct_Counts = distinct_counts,
  Percent_Zero = percent_zero,
  Percent_Blank = percent_blank,
  Percent_Null = percent_null,
  Sums = sums,
  Means = means,
  SDs = sds,
  Number_of_Outliers = unlist(lapply(data_outliers, function(x) sum(x, na.rm=TRUE)))
)

# Save the results data frame into a CSV file
write.csv(results, "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/profiling_data_oders.csv", row.names=FALSE)

data <- read.csv("customers.csv")

# Descriptive statistics summary
summary(data)

##   customer_id     customer_name         gender               age       
##  Min.   :   1.0   Length:1000        Length:1000        Min.   :20.00  
##  1st Qu.: 250.8   Class :character   Class :character   1st Qu.:34.00  
##  Median : 500.5   Mode  :character   Mode  :character   Median :50.00  
##  Mean   : 500.5                                         Mean   :49.86  
##  3rd Qu.: 750.2                                         3rd Qu.:65.00  
##  Max.   :1000.0                                         Max.   :80.00  
##  home_address          zip_code        city              state          
##  Length:1000        Min.   :   2   Length:1000        Length:1000       
##  Class :character   1st Qu.:2402   Class :character   Class :character  
##  Mode  :character   Median :5083   Mode  :character   Mode  :character  
##                     Mean   :5005                                        
##                     3rd Qu.:7460                                        
##                     Max.   :9998                                        
##    country         
##  Length:1000       
##  Class :character  
##  Mode  :character  
##                    
##                    
##

# Count of distinct values
distinct_counts <- sapply(data, function(x) length(unique(x)))
print(distinct_counts)

##   customer_id customer_name        gender           age  home_address 
##          1000          1000             8            61          1000 
##      zip_code          city         state       country 
##           955           961             8             1

# Percentage of 0/blank/null values
percent_zero <- sapply(data, function(x) {
  if (is.numeric(x)) {
    return(mean(x == 0, na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(percent_zero)

##   customer_id customer_name        gender           age  home_address 
##             0            NA            NA             0            NA 
##      zip_code          city         state       country 
##             0            NA            NA            NA

percent_blank <- sapply(data, function(x) {
  if (is.character(x)) {
    return(mean(x == "", na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(percent_blank)

##   customer_id customer_name        gender           age  home_address 
##            NA             0             0            NA             0 
##      zip_code          city         state       country 
##            NA             0             0             0

percent_null <- sapply(data, function(x) mean(is.na(x)))
print(percent_null)

##   customer_id customer_name        gender           age  home_address 
##             0             0             0             0             0 
##      zip_code          city         state       country 
##             0             0             0             0

# Summation of values
sums <- sapply(data, function(x) {
  if (is.numeric(x)) {
    return(sum(x, na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(sums)

##   customer_id customer_name        gender           age  home_address 
##        500500            NA            NA         49860            NA 
##      zip_code          city         state       country 
##       5004872            NA            NA            NA

# Value distribution (visualized with histograms)
lapply(data, function(x) {
  if(is.numeric(x)) {
    hist(x, main=paste("Histogram of", deparse(substitute(x))))
  }
})

## $customer_id
## $breaks
##  [1]    0  100  200  300  400  500  600  700  800  900 1000
## 
## $counts
##  [1] 100 100 100 100 100 100 100 100 100 100
## 
## $density
##  [1] 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001
## 
## $mids
##  [1]  50 150 250 350 450 550 650 750 850 950
## 
## $xname
## [1] "x"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $customer_name
## NULL
## 
## $gender
## NULL
## 
## $age
## $breaks
##  [1] 20 25 30 35 40 45 50 55 60 65 70 75 80
## 
## $counts
##  [1] 105  81  86  74  84  78  75  90  90  77  84  76
## 
## $density
##  [1] 0.0210 0.0162 0.0172 0.0148 0.0168 0.0156 0.0150 0.0180 0.0180 0.0154
## [11] 0.0168 0.0152
## 
## $mids
##  [1] 22.5 27.5 32.5 37.5 42.5 47.5 52.5 57.5 62.5 67.5 72.5 77.5
## 
## $xname
## [1] "x"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $home_address
## NULL
## 
## $zip_code
## $breaks
##  [1]     0  1000  2000  3000  4000  5000  6000  7000  8000  9000 10000
## 
## $counts
##  [1]  98 109  94  97  96  94 112 108 101  91
## 
## $density
##  [1] 0.000098 0.000109 0.000094 0.000097 0.000096 0.000094 0.000112 0.000108
##  [9] 0.000101 0.000091
## 
## $mids
##  [1]  500 1500 2500 3500 4500 5500 6500 7500 8500 9500
## 
## $xname
## [1] "x"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $city
## NULL
## 
## $state
## NULL
## 
## $country
## NULL

# Mean and standard deviation
means <- sapply(data, function(x) {
  if (is.numeric(x)) {
    return(mean(x, na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(means)

##   customer_id customer_name        gender           age  home_address 
##       500.500            NA            NA        49.860            NA 
##      zip_code          city         state       country 
##      5004.872            NA            NA            NA

sds <- sapply(data, function(x) {
  if (is.numeric(x)) {
    return(sd(x, na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(sds)

##   customer_id customer_name        gender           age  home_address 
##     288.81944            NA            NA      17.64783            NA 
##      zip_code          city         state       country 
##    2884.49733            NA            NA            NA

# Check for outliers using IQR
is_outlier <- function(x) {
  Q1 <- quantile(x, 0.25)
  Q3 <- quantile(x, 0.75)
  IQR <- Q3 - Q1
  return(x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR))
}

data_outliers <- lapply(data, function(x) {
  if(is.numeric(x)) {
    return(is_outlier(x))
  } else {
    return(rep(FALSE, length(x)))
  }
})

# Check for the number of outliers
lapply(data_outliers, function(x) sum(x, na.rm=TRUE))

## $customer_id
## [1] 0
## 
## $customer_name
## [1] 0
## 
## $gender
## [1] 0
## 
## $age
## [1] 0
## 
## $home_address
## [1] 0
## 
## $zip_code
## [1] 0
## 
## $city
## [1] 0
## 
## $state
## [1] 0
## 
## $country
## [1] 0

# Save the results into a data frame
results <- data.frame(
  Fields = names(data),
  Distinct_Counts = distinct_counts,
  Percent_Zero = percent_zero,
  Percent_Blank = percent_blank,
  Percent_Null = percent_null,
  Sums = sums,
  Means = means,
  SDs = sds,
  Number_of_Outliers = unlist(lapply(data_outliers, function(x) sum(x, na.rm=TRUE)))
)

# Save the results data frame into a CSV file
write.csv(results, "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/profiling_data_customers.csv", row.names=FALSE)

data <- read.csv("sales.csv")

# Descriptive statistics summary
summary(data)

##     sales_id       order_id       product_id     price_per_unit 
##  Min.   :   0   Min.   :  1.0   Min.   :   1.0   Min.   : 90.0  
##  1st Qu.:1250   1st Qu.:258.0   1st Qu.: 323.0   1st Qu.: 95.0  
##  Median :2500   Median :504.5   Median : 635.0   Median :102.0  
##  Mean   :2500   Mean   :503.0   Mean   : 634.1   Mean   :103.5  
##  3rd Qu.:3749   3rd Qu.:749.0   3rd Qu.: 951.0   3rd Qu.:112.0  
##  Max.   :4999   Max.   :999.0   Max.   :1259.0   Max.   :119.0  
##     quantity      total_price   
##  Min.   :1.000   Min.   : 90.0  
##  1st Qu.:1.000   1st Qu.:112.0  
##  Median :2.000   Median :204.0  
##  Mean   :1.992   Mean   :206.4  
##  3rd Qu.:3.000   3rd Qu.:285.0  
##  Max.   :3.000   Max.   :357.0

# Count of distinct values
distinct_counts <- sapply(data, function(x) length(unique(x)))
print(distinct_counts)

##       sales_id       order_id     product_id price_per_unit       quantity 
##           5000            993           1233             23              3 
##    total_price 
##             69

# Percentage of 0/blank/null values
percent_zero <- sapply(data, function(x) {
  if (is.numeric(x)) {
    return(mean(x == 0, na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(percent_zero)

##       sales_id       order_id     product_id price_per_unit       quantity 
##          2e-04          0e+00          0e+00          0e+00          0e+00 
##    total_price 
##          0e+00

percent_blank <- sapply(data, function(x) {
  if (is.character(x)) {
    return(mean(x == "", na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(percent_blank)

##       sales_id       order_id     product_id price_per_unit       quantity 
##             NA             NA             NA             NA             NA 
##    total_price 
##             NA

percent_null <- sapply(data, function(x) mean(is.na(x)))
print(percent_null)

##       sales_id       order_id     product_id price_per_unit       quantity 
##              0              0              0              0              0 
##    total_price 
##              0

# Summation of values
sums <- sapply(data, function(x) {
  if (is.numeric(x)) {
    return(sum(x, na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(sums)

##       sales_id       order_id     product_id price_per_unit       quantity 
##       12497500        2515191        3170266         517508           9962 
##    total_price 
##        1031800

# Value distribution (visualized with histograms)
lapply(data, function(x) {
  if(is.numeric(x)) {
    hist(x, main=paste("Histogram of", deparse(substitute(x))))
  }
})

## $sales_id
## $breaks
##  [1]    0  500 1000 1500 2000 2500 3000 3500 4000 4500 5000
## 
## $counts
##  [1] 501 500 500 500 500 500 500 500 500 499
## 
## $density
##  [1] 0.0002004 0.0002000 0.0002000 0.0002000 0.0002000 0.0002000 0.0002000
##  [8] 0.0002000 0.0002000 0.0001996
## 
## $mids
##  [1]  250  750 1250 1750 2250 2750 3250 3750 4250 4750
## 
## $xname
## [1] "x"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $order_id
## $breaks
##  [1]    0  100  200  300  400  500  600  700  800  900 1000
## 
## $counts
##  [1] 456 523 496 520 477 530 479 539 498 482
## 
## $density
##  [1] 0.000912 0.001046 0.000992 0.001040 0.000954 0.001060 0.000958 0.001078
##  [9] 0.000996 0.000964
## 
## $mids
##  [1]  50 150 250 350 450 550 650 750 850 950
## 
## $xname
## [1] "x"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $product_id
## $breaks
##  [1]    0  100  200  300  400  500  600  700  800  900 1000 1100 1200 1300
## 
## $counts
##  [1] 398 386 372 405 391 415 386 401 392 403 429 381 241
## 
## $density
##  [1] 0.000796 0.000772 0.000744 0.000810 0.000782 0.000830 0.000772 0.000802
##  [9] 0.000784 0.000806 0.000858 0.000762 0.000482
## 
## $mids
##  [1]   50  150  250  350  450  550  650  750  850  950 1050 1150 1250
## 
## $xname
## [1] "x"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $price_per_unit
## $breaks
##  [1]  90  92  94  96  98 100 102 104 106 108 110 112 114 116 118 120
## 
## $counts
##  [1] 559 570 424 429 290 277 127 535 134 256 264 269 135 431 300
## 
## $density
##  [1] 0.0559 0.0570 0.0424 0.0429 0.0290 0.0277 0.0127 0.0535 0.0134 0.0256
## [11] 0.0264 0.0269 0.0135 0.0431 0.0300
## 
## $mids
##  [1]  91  93  95  97  99 101 103 105 107 109 111 113 115 117 119
## 
## $xname
## [1] "x"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $quantity
## $breaks
##  [1] 1.0 1.2 1.4 1.6 1.8 2.0 2.2 2.4 2.6 2.8 3.0
## 
## $counts
##  [1] 1649    0    0    0 1740    0    0    0    0 1611
## 
## $density
##  [1] 1.649 0.000 0.000 0.000 1.740 0.000 0.000 0.000 0.000 1.611
## 
## $mids
##  [1] 1.1 1.3 1.5 1.7 1.9 2.1 2.3 2.5 2.7 2.9
## 
## $xname
## [1] "x"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $total_price
## $breaks
##  [1]  80 100 120 140 160 180 200 220 240 260 280 300 320 340 360
## 
## $counts
##  [1] 768 881   0   0  57 738 484 461   0 270 439 312 278 312
## 
## $density
##  [1] 0.00768 0.00881 0.00000 0.00000 0.00057 0.00738 0.00484 0.00461 0.00000
## [10] 0.00270 0.00439 0.00312 0.00278 0.00312
## 
## $mids
##  [1]  90 110 130 150 170 190 210 230 250 270 290 310 330 350
## 
## $xname
## [1] "x"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

# Mean and standard deviation
means <- sapply(data, function(x) {
  if (is.numeric(x)) {
    return(mean(x, na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(means)

##       sales_id       order_id     product_id price_per_unit       quantity 
##      2499.5000       503.0382       634.0532       103.5016         1.9924 
##    total_price 
##       206.3600

sds <- sapply(data, function(x) {
  if (is.numeric(x)) {
    return(sd(x, na.rm=TRUE))
  } else {
    return(NA)
  }
})
print(sds)

##       sales_id       order_id     product_id price_per_unit       quantity 
##   1443.5200033    285.9644178    363.2557938      9.1950045      0.8075102 
##    total_price 
##     86.3574567

# Check for outliers using IQR
is_outlier <- function(x) {
  Q1 <- quantile(x, 0.25)
  Q3 <- quantile(x, 0.75)
  IQR <- Q3 - Q1
  return(x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR))
}

data_outliers <- lapply(data, function(x) {
  if(is.numeric(x)) {
    return(is_outlier(x))
  } else {
    return(rep(FALSE, length(x)))
  }
})

# Check for the number of outliers
lapply(data_outliers, function(x) sum(x, na.rm=TRUE))

## $sales_id
## [1] 0
## 
## $order_id
## [1] 0
## 
## $product_id
## [1] 0
## 
## $price_per_unit
## [1] 0
## 
## $quantity
## [1] 0
## 
## $total_price
## [1] 0

# Save the results into a data frame
results <- data.frame(
  Fields = names(data),
  Distinct_Counts = distinct_counts,
  Percent_Zero = percent_zero,
  Percent_Blank = percent_blank,
  Percent_Null = percent_null,
  Sums = sums,
  Means = means,
  SDs = sds,
  Number_of_Outliers = unlist(lapply(data_outliers, function(x) sum(x, na.rm=TRUE)))
)

# Save the results data frame into a CSV file
write.csv(results, "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/profiling_data_sales.csv", row.names=FALSE)

Task 3 - Data Exploration/Discovery/Profiling

Minsu Kim 12436628

Introduction

Load data csv files

A - Metadata Analysis

B - Profiling Data