This R Markdown was written to explain how I did the tasks I did, and to prove that I did all the tasks myself.
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
For all my tasks, I first need to import all the csv files in the corresponding dataset.
products <- read.csv("C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/products.csv", header = TRUE)
orders <- read.csv("C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/orders.csv", header = TRUE)
customers <- read.csv("C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/customers.csv", header = TRUE)
sales <- read.csv("C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/sales.csv", header = TRUE)
# Get the data types of each column
data_types <- sapply(products, class)
# Estimate the data format of each column
data_formats <- ifelse(data_types == "numeric", "Decimal", "Text")
# Determine the size of each column (maximum character length or maximum numeric value)
field_sizes <- sapply(products, function(col) {
if (is.numeric(col)) {
max(col, na.rm = TRUE)
} else {
max(nchar(as.character(col)), na.rm = TRUE)
}
})
# Get the first value of each column (example)
examples <- sapply(products, function(col) head(col, 1))
# Convert the results into a data frame
metadata_df <- data.frame(
Field_Name = names(products),
Data_Type = data_types,
Data_Format = data_formats,
Field_Size = field_sizes,
Example = examples
)
# Print the results
print(metadata_df)
## Field_Name Data_Type Data_Format Field_Size
## product_ID product_ID integer Text 1259
## product_type product_type character Text 8
## product_name product_name character Text 17
## size size character Text 2
## colour colour character Text 6
## price price integer Text 119
## quantity quantity integer Text 80
## description description character Text 55
## Example
## product_ID 0
## product_type Shirt
## product_name Oxford Cloth
## size XS
## colour red
## price 114
## quantity 66
## description A red coloured, XS sized, Oxford Cloth Shirt
# Save this data frame to a CSV file:
write.csv(metadata_df, "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/metadata_analysis_products.csv", row.names = FALSE)
# Get the data types of each column
data_types <- sapply(orders, class)
# Estimate the data format of each column
data_formats <- ifelse(data_types == "numeric", "Decimal", "Date/Text")
# Determine the size of each column (maximum character length or maximum numeric value)
field_sizes <- sapply(orders, function(col) {
if (is.numeric(col)) {
max(col, na.rm = TRUE)
} else {
max(nchar(as.character(col)), na.rm = TRUE)
}
})
# Get the first value of each column (example)
examples <- sapply(orders, function(col) head(col, 1))
# Convert the results into a data frame
metadata_df <- data.frame(
Field_Name = names(orders),
Data_Type = data_types,
Data_Format = data_formats,
Field_Size = field_sizes,
Example = examples
)
# Print the results
print(metadata_df)
## Field_Name Data_Type Data_Format Field_Size Example
## order_id order_id integer Date/Text 1000 1
## customer_id customer_id integer Date/Text 1000 64
## payment payment integer Date/Text 59910 30811
## order_date order_date character Date/Text 10 2021-8-30
## delivery_date delivery_date character Date/Text 10 2021-09-24
# Save this data frame to a CSV file:
write.csv(metadata_df, "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/metadata_analysis_orders.csv", row.names = FALSE)
# Get the data types of each column
data_types <- sapply(customers, class)
# Estimate the data format of each column
data_formats <- ifelse(data_types == "numeric", "Decimal", "Text")
# Determine the size of each column (maximum character length or maximum numeric value)
field_sizes <- sapply(customers, function(col) {
if (is.numeric(col)) {
max(col, na.rm = TRUE)
} else {
max(nchar(as.character(col)), na.rm = TRUE)
}
})
# Get the first value of each column (example)
examples <- sapply(customers, function(col) head(col, 1))
# Convert the results into a data frame
metadata_df <- data.frame(
Field_Name = names(customers),
Data_Type = data_types,
Data_Format = data_formats,
Field_Size = field_sizes,
Example = examples
)
# Print the results
print(metadata_df)
## Field_Name Data_Type Data_Format Field_Size
## customer_id customer_id integer Text 1000
## customer_name customer_name character Text 23
## gender gender character Text 11
## age age integer Text 80
## home_address home_address character Text 33
## zip_code zip_code integer Text 9998
## city city character Text 22
## state state character Text 28
## country country character Text 9
## Example
## customer_id 1
## customer_name Leanna Busson
## gender Female
## age 30
## home_address 8606 Victoria TerraceSuite 560
## zip_code 5464
## city Johnstonhaven
## state Northern Territory
## country Australia
# Save this data frame to a CSV file:
write.csv(metadata_df, "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/metadata_analysis_customers.csv", row.names = FALSE)
# Get the data types of each column
data_types <- sapply(sales, class)
# Estimate the data format of each column
data_formats <- ifelse(data_types == "numeric", "Decimal", "Text")
# Determine the size of each column (maximum character length or maximum numeric value)
field_sizes <- sapply(sales, function(col) {
if (is.numeric(col)) {
max(col, na.rm = TRUE)
} else {
max(nchar(as.character(col)), na.rm = TRUE)
}
})
# Get the first value of each column (example)
examples <- sapply(sales, function(col) head(col, 1))
# Convert the results into a data frame
metadata_df <- data.frame(
Field_Name = names(sales),
Data_Type = data_types,
Data_Format = data_formats,
Field_Size = field_sizes,
Example = examples
)
# Print the results
print(metadata_df)
## Field_Name Data_Type Data_Format Field_Size Example
## sales_id sales_id integer Text 4999 0
## order_id order_id integer Text 999 1
## product_id product_id integer Text 1259 218
## price_per_unit price_per_unit integer Text 119 106
## quantity quantity integer Text 3 2
## total_price total_price integer Text 357 212
# Save this data frame to a CSV file:
write.csv(metadata_df, "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/metadata_analysis_sales.csv", row.names = FALSE)
data <- read.csv("products.csv")
# Descriptive statistics summary
summary(data)
## product_ID product_type product_name size
## Min. : 0.0 Length:1260 Length:1260 Length:1260
## 1st Qu.: 314.8 Class :character Class :character Class :character
## Median : 629.5 Mode :character Mode :character Mode :character
## Mean : 629.5
## 3rd Qu.: 944.2
## Max. :1259.0
## colour price quantity description
## Length:1260 Min. : 90.00 Min. :40.00 Length:1260
## Class :character 1st Qu.: 95.75 1st Qu.:50.00 Class :character
## Mode :character Median :108.50 Median :60.00 Mode :character
## Mean :105.81 Mean :60.15
## 3rd Qu.:114.00 3rd Qu.:70.00
## Max. :119.00 Max. :80.00
# Count of distinct values
distinct_counts <- sapply(data, function(x) length(unique(x)))
print(distinct_counts)
## product_ID product_type product_name size colour price
## 1260 3 35 5 7 26
## quantity description
## 41 1260
# Percentage of 0/blank/null values
percent_zero <- sapply(data, function(x) {
if (is.numeric(x)) {
return(mean(x == 0, na.rm=TRUE))
} else {
return(NA)
}
})
print(percent_zero)
## product_ID product_type product_name size colour price
## 0.0007936508 NA NA NA NA 0.0000000000
## quantity description
## 0.0000000000 NA
percent_blank <- sapply(data, function(x) {
if (is.character(x)) {
return(mean(x == "", na.rm=TRUE))
} else {
return(NA)
}
})
print(percent_blank)
## product_ID product_type product_name size colour price
## NA 0 0 0 0 NA
## quantity description
## NA 0
percent_null <- sapply(data, function(x) mean(is.na(x)))
print(percent_null)
## product_ID product_type product_name size colour price
## 0 0 0 0 0 0
## quantity description
## 0 0
# Summation of values
sums <- sapply(data, function(x) {
if (is.numeric(x)) {
return(sum(x, na.rm=TRUE))
} else {
return(NA)
}
})
print(sums)
## product_ID product_type product_name size colour price
## 793170 NA NA NA NA 133315
## quantity description
## 75789 NA
# Value distribution (visualized with histograms)
lapply(data, function(x) {
if(is.numeric(x)) {
hist(x, main=paste("Histogram of", deparse(substitute(x))))
}
})
## $product_ID
## $breaks
## [1] 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300
##
## $counts
## [1] 101 100 100 100 100 100 100 100 100 100 100 100 59
##
## $density
## [1] 0.0008015873 0.0007936508 0.0007936508 0.0007936508 0.0007936508
## [6] 0.0007936508 0.0007936508 0.0007936508 0.0007936508 0.0007936508
## [11] 0.0007936508 0.0007936508 0.0004682540
##
## $mids
## [1] 50 150 250 350 450 550 650 750 850 950 1050 1150 1250
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $product_type
## NULL
##
## $product_name
## NULL
##
## $size
## NULL
##
## $colour
## NULL
##
## $price
## $breaks
## [1] 90 92 94 96 98 100 102 104 106 108 110 112 114 116 118 120
##
## $counts
## [1] 175 70 105 35 70 35 35 70 35 70 105 175 105 70 105
##
## $density
## [1] 0.06944444 0.02777778 0.04166667 0.01388889 0.02777778 0.01388889
## [7] 0.01388889 0.02777778 0.01388889 0.02777778 0.04166667 0.06944444
## [13] 0.04166667 0.02777778 0.04166667
##
## $mids
## [1] 91 93 95 97 99 101 103 105 107 109 111 113 115 117 119
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $quantity
## $breaks
## [1] 40 45 50 55 60 65 70 75 80
##
## $counts
## [1] 175 150 153 162 165 149 155 151
##
## $density
## [1] 0.02777778 0.02380952 0.02428571 0.02571429 0.02619048 0.02365079 0.02460317
## [8] 0.02396825
##
## $mids
## [1] 42.5 47.5 52.5 57.5 62.5 67.5 72.5 77.5
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $description
## NULL
# Mean and standard deviation
means <- sapply(data, function(x) {
if (is.numeric(x)) {
return(mean(x, na.rm=TRUE))
} else {
return(NA)
}
})
print(means)
## product_ID product_type product_name size colour price
## 629.5000 NA NA NA NA 105.8056
## quantity description
## 60.1500 NA
sds <- sapply(data, function(x) {
if (is.numeric(x)) {
return(sd(x, na.rm=TRUE))
} else {
return(NA)
}
})
print(sds)
## product_ID product_type product_name size colour price
## 363.874979 NA NA NA NA 9.704423
## quantity description
## 11.670573 NA
# Check for outliers using IQR
is_outlier <- function(x) {
Q1 <- quantile(x, 0.25)
Q3 <- quantile(x, 0.75)
IQR <- Q3 - Q1
return(x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR))
}
data_outliers <- lapply(data, function(x) {
if(is.numeric(x)) {
return(is_outlier(x))
} else {
return(rep(FALSE, length(x)))
}
})
# Check for the number of outliers
lapply(data_outliers, function(x) sum(x, na.rm=TRUE))
## $product_ID
## [1] 0
##
## $product_type
## [1] 0
##
## $product_name
## [1] 0
##
## $size
## [1] 0
##
## $colour
## [1] 0
##
## $price
## [1] 0
##
## $quantity
## [1] 0
##
## $description
## [1] 0
# Save the results into a data frame
results <- data.frame(
Fields = names(data),
Distinct_Counts = distinct_counts,
Percent_Zero = percent_zero,
Percent_Blank = percent_blank,
Percent_Null = percent_null,
Sums = sums,
Means = means,
SDs = sds,
Number_of_Outliers = unlist(lapply(data_outliers, function(x) sum(x, na.rm=TRUE)))
)
# Save the results data frame into a CSV file
write.csv(results, "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/profiling_data_products.csv", row.names=FALSE)
data <- read.csv("orders.csv")
# Descriptive statistics summary
summary(data)
## order_id customer_id payment order_date
## Min. : 1.0 Min. : 1.0 Min. :10043 Length:1000
## 1st Qu.: 250.8 1st Qu.: 275.2 1st Qu.:21329 Class :character
## Median : 500.5 Median : 515.0 Median :33698 Mode :character
## Mean : 500.5 Mean : 506.6 Mean :33973
## 3rd Qu.: 750.2 3rd Qu.: 737.2 3rd Qu.:46249
## Max. :1000.0 Max. :1000.0 Max. :59910
## delivery_date
## Length:1000
## Class :character
## Mode :character
##
##
##
# Count of distinct values
distinct_counts <- sapply(data, function(x) length(unique(x)))
print(distinct_counts)
## order_id customer_id payment order_date delivery_date
## 1000 617 995 291 305
# Percentage of 0/blank/null values
percent_zero <- sapply(data, function(x) {
if (is.numeric(x)) {
return(mean(x == 0, na.rm=TRUE))
} else {
return(NA)
}
})
print(percent_zero)
## order_id customer_id payment order_date delivery_date
## 0 0 0 NA NA
percent_blank <- sapply(data, function(x) {
if (is.character(x)) {
return(mean(x == "", na.rm=TRUE))
} else {
return(NA)
}
})
print(percent_blank)
## order_id customer_id payment order_date delivery_date
## NA NA NA 0 0
percent_null <- sapply(data, function(x) mean(is.na(x)))
print(percent_null)
## order_id customer_id payment order_date delivery_date
## 0 0 0 0 0
# Summation of values
sums <- sapply(data, function(x) {
if (is.numeric(x)) {
return(sum(x, na.rm=TRUE))
} else {
return(NA)
}
})
print(sums)
## order_id customer_id payment order_date delivery_date
## 500500 506640 33972936 NA NA
# Value distribution (visualized with histograms)
lapply(data, function(x) {
if(is.numeric(x)) {
hist(x, main=paste("Histogram of", deparse(substitute(x))))
}
})
## $order_id
## $breaks
## [1] 0 100 200 300 400 500 600 700 800 900 1000
##
## $counts
## [1] 100 100 100 100 100 100 100 100 100 100
##
## $density
## [1] 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001
##
## $mids
## [1] 50 150 250 350 450 550 650 750 850 950
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $customer_id
## $breaks
## [1] 0 100 200 300 400 500 600 700 800 900 1000
##
## $counts
## [1] 96 79 94 93 124 109 116 113 92 84
##
## $density
## [1] 0.00096 0.00079 0.00094 0.00093 0.00124 0.00109 0.00116 0.00113 0.00092
## [10] 0.00084
##
## $mids
## [1] 50 150 250 350 450 550 650 750 850 950
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $payment
## $breaks
## [1] 10000 15000 20000 25000 30000 35000 40000 45000 50000 55000 60000
##
## $counts
## [1] 116 111 99 101 92 106 104 93 89 89
##
## $density
## [1] 2.32e-05 2.22e-05 1.98e-05 2.02e-05 1.84e-05 2.12e-05 2.08e-05 1.86e-05
## [9] 1.78e-05 1.78e-05
##
## $mids
## [1] 12500 17500 22500 27500 32500 37500 42500 47500 52500 57500
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $order_date
## NULL
##
## $delivery_date
## NULL
# Mean and standard deviation
means <- sapply(data, function(x) {
if (is.numeric(x)) {
return(mean(x, na.rm=TRUE))
} else {
return(NA)
}
})
print(means)
## order_id customer_id payment order_date delivery_date
## 500.50 506.64 33972.94 NA NA
sds <- sapply(data, function(x) {
if (is.numeric(x)) {
return(sd(x, na.rm=TRUE))
} else {
return(NA)
}
})
print(sds)
## order_id customer_id payment order_date delivery_date
## 288.8194 277.1155 14451.6090 NA NA
# Check for outliers using IQR
is_outlier <- function(x) {
Q1 <- quantile(x, 0.25)
Q3 <- quantile(x, 0.75)
IQR <- Q3 - Q1
return(x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR))
}
data_outliers <- lapply(data, function(x) {
if(is.numeric(x)) {
return(is_outlier(x))
} else {
return(rep(FALSE, length(x)))
}
})
# Check for the number of outliers
lapply(data_outliers, function(x) sum(x, na.rm=TRUE))
## $order_id
## [1] 0
##
## $customer_id
## [1] 0
##
## $payment
## [1] 0
##
## $order_date
## [1] 0
##
## $delivery_date
## [1] 0
# Save the results into a data frame
results <- data.frame(
Fields = names(data),
Distinct_Counts = distinct_counts,
Percent_Zero = percent_zero,
Percent_Blank = percent_blank,
Percent_Null = percent_null,
Sums = sums,
Means = means,
SDs = sds,
Number_of_Outliers = unlist(lapply(data_outliers, function(x) sum(x, na.rm=TRUE)))
)
# Save the results data frame into a CSV file
write.csv(results, "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/profiling_data_oders.csv", row.names=FALSE)
data <- read.csv("customers.csv")
# Descriptive statistics summary
summary(data)
## customer_id customer_name gender age
## Min. : 1.0 Length:1000 Length:1000 Min. :20.00
## 1st Qu.: 250.8 Class :character Class :character 1st Qu.:34.00
## Median : 500.5 Mode :character Mode :character Median :50.00
## Mean : 500.5 Mean :49.86
## 3rd Qu.: 750.2 3rd Qu.:65.00
## Max. :1000.0 Max. :80.00
## home_address zip_code city state
## Length:1000 Min. : 2 Length:1000 Length:1000
## Class :character 1st Qu.:2402 Class :character Class :character
## Mode :character Median :5083 Mode :character Mode :character
## Mean :5005
## 3rd Qu.:7460
## Max. :9998
## country
## Length:1000
## Class :character
## Mode :character
##
##
##
# Count of distinct values
distinct_counts <- sapply(data, function(x) length(unique(x)))
print(distinct_counts)
## customer_id customer_name gender age home_address
## 1000 1000 8 61 1000
## zip_code city state country
## 955 961 8 1
# Percentage of 0/blank/null values
percent_zero <- sapply(data, function(x) {
if (is.numeric(x)) {
return(mean(x == 0, na.rm=TRUE))
} else {
return(NA)
}
})
print(percent_zero)
## customer_id customer_name gender age home_address
## 0 NA NA 0 NA
## zip_code city state country
## 0 NA NA NA
percent_blank <- sapply(data, function(x) {
if (is.character(x)) {
return(mean(x == "", na.rm=TRUE))
} else {
return(NA)
}
})
print(percent_blank)
## customer_id customer_name gender age home_address
## NA 0 0 NA 0
## zip_code city state country
## NA 0 0 0
percent_null <- sapply(data, function(x) mean(is.na(x)))
print(percent_null)
## customer_id customer_name gender age home_address
## 0 0 0 0 0
## zip_code city state country
## 0 0 0 0
# Summation of values
sums <- sapply(data, function(x) {
if (is.numeric(x)) {
return(sum(x, na.rm=TRUE))
} else {
return(NA)
}
})
print(sums)
## customer_id customer_name gender age home_address
## 500500 NA NA 49860 NA
## zip_code city state country
## 5004872 NA NA NA
# Value distribution (visualized with histograms)
lapply(data, function(x) {
if(is.numeric(x)) {
hist(x, main=paste("Histogram of", deparse(substitute(x))))
}
})
## $customer_id
## $breaks
## [1] 0 100 200 300 400 500 600 700 800 900 1000
##
## $counts
## [1] 100 100 100 100 100 100 100 100 100 100
##
## $density
## [1] 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001
##
## $mids
## [1] 50 150 250 350 450 550 650 750 850 950
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $customer_name
## NULL
##
## $gender
## NULL
##
## $age
## $breaks
## [1] 20 25 30 35 40 45 50 55 60 65 70 75 80
##
## $counts
## [1] 105 81 86 74 84 78 75 90 90 77 84 76
##
## $density
## [1] 0.0210 0.0162 0.0172 0.0148 0.0168 0.0156 0.0150 0.0180 0.0180 0.0154
## [11] 0.0168 0.0152
##
## $mids
## [1] 22.5 27.5 32.5 37.5 42.5 47.5 52.5 57.5 62.5 67.5 72.5 77.5
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $home_address
## NULL
##
## $zip_code
## $breaks
## [1] 0 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000
##
## $counts
## [1] 98 109 94 97 96 94 112 108 101 91
##
## $density
## [1] 0.000098 0.000109 0.000094 0.000097 0.000096 0.000094 0.000112 0.000108
## [9] 0.000101 0.000091
##
## $mids
## [1] 500 1500 2500 3500 4500 5500 6500 7500 8500 9500
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $city
## NULL
##
## $state
## NULL
##
## $country
## NULL
# Mean and standard deviation
means <- sapply(data, function(x) {
if (is.numeric(x)) {
return(mean(x, na.rm=TRUE))
} else {
return(NA)
}
})
print(means)
## customer_id customer_name gender age home_address
## 500.500 NA NA 49.860 NA
## zip_code city state country
## 5004.872 NA NA NA
sds <- sapply(data, function(x) {
if (is.numeric(x)) {
return(sd(x, na.rm=TRUE))
} else {
return(NA)
}
})
print(sds)
## customer_id customer_name gender age home_address
## 288.81944 NA NA 17.64783 NA
## zip_code city state country
## 2884.49733 NA NA NA
# Check for outliers using IQR
is_outlier <- function(x) {
Q1 <- quantile(x, 0.25)
Q3 <- quantile(x, 0.75)
IQR <- Q3 - Q1
return(x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR))
}
data_outliers <- lapply(data, function(x) {
if(is.numeric(x)) {
return(is_outlier(x))
} else {
return(rep(FALSE, length(x)))
}
})
# Check for the number of outliers
lapply(data_outliers, function(x) sum(x, na.rm=TRUE))
## $customer_id
## [1] 0
##
## $customer_name
## [1] 0
##
## $gender
## [1] 0
##
## $age
## [1] 0
##
## $home_address
## [1] 0
##
## $zip_code
## [1] 0
##
## $city
## [1] 0
##
## $state
## [1] 0
##
## $country
## [1] 0
# Save the results into a data frame
results <- data.frame(
Fields = names(data),
Distinct_Counts = distinct_counts,
Percent_Zero = percent_zero,
Percent_Blank = percent_blank,
Percent_Null = percent_null,
Sums = sums,
Means = means,
SDs = sds,
Number_of_Outliers = unlist(lapply(data_outliers, function(x) sum(x, na.rm=TRUE)))
)
# Save the results data frame into a CSV file
write.csv(results, "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/profiling_data_customers.csv", row.names=FALSE)
data <- read.csv("sales.csv")
# Descriptive statistics summary
summary(data)
## sales_id order_id product_id price_per_unit
## Min. : 0 Min. : 1.0 Min. : 1.0 Min. : 90.0
## 1st Qu.:1250 1st Qu.:258.0 1st Qu.: 323.0 1st Qu.: 95.0
## Median :2500 Median :504.5 Median : 635.0 Median :102.0
## Mean :2500 Mean :503.0 Mean : 634.1 Mean :103.5
## 3rd Qu.:3749 3rd Qu.:749.0 3rd Qu.: 951.0 3rd Qu.:112.0
## Max. :4999 Max. :999.0 Max. :1259.0 Max. :119.0
## quantity total_price
## Min. :1.000 Min. : 90.0
## 1st Qu.:1.000 1st Qu.:112.0
## Median :2.000 Median :204.0
## Mean :1.992 Mean :206.4
## 3rd Qu.:3.000 3rd Qu.:285.0
## Max. :3.000 Max. :357.0
# Count of distinct values
distinct_counts <- sapply(data, function(x) length(unique(x)))
print(distinct_counts)
## sales_id order_id product_id price_per_unit quantity
## 5000 993 1233 23 3
## total_price
## 69
# Percentage of 0/blank/null values
percent_zero <- sapply(data, function(x) {
if (is.numeric(x)) {
return(mean(x == 0, na.rm=TRUE))
} else {
return(NA)
}
})
print(percent_zero)
## sales_id order_id product_id price_per_unit quantity
## 2e-04 0e+00 0e+00 0e+00 0e+00
## total_price
## 0e+00
percent_blank <- sapply(data, function(x) {
if (is.character(x)) {
return(mean(x == "", na.rm=TRUE))
} else {
return(NA)
}
})
print(percent_blank)
## sales_id order_id product_id price_per_unit quantity
## NA NA NA NA NA
## total_price
## NA
percent_null <- sapply(data, function(x) mean(is.na(x)))
print(percent_null)
## sales_id order_id product_id price_per_unit quantity
## 0 0 0 0 0
## total_price
## 0
# Summation of values
sums <- sapply(data, function(x) {
if (is.numeric(x)) {
return(sum(x, na.rm=TRUE))
} else {
return(NA)
}
})
print(sums)
## sales_id order_id product_id price_per_unit quantity
## 12497500 2515191 3170266 517508 9962
## total_price
## 1031800
# Value distribution (visualized with histograms)
lapply(data, function(x) {
if(is.numeric(x)) {
hist(x, main=paste("Histogram of", deparse(substitute(x))))
}
})
## $sales_id
## $breaks
## [1] 0 500 1000 1500 2000 2500 3000 3500 4000 4500 5000
##
## $counts
## [1] 501 500 500 500 500 500 500 500 500 499
##
## $density
## [1] 0.0002004 0.0002000 0.0002000 0.0002000 0.0002000 0.0002000 0.0002000
## [8] 0.0002000 0.0002000 0.0001996
##
## $mids
## [1] 250 750 1250 1750 2250 2750 3250 3750 4250 4750
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $order_id
## $breaks
## [1] 0 100 200 300 400 500 600 700 800 900 1000
##
## $counts
## [1] 456 523 496 520 477 530 479 539 498 482
##
## $density
## [1] 0.000912 0.001046 0.000992 0.001040 0.000954 0.001060 0.000958 0.001078
## [9] 0.000996 0.000964
##
## $mids
## [1] 50 150 250 350 450 550 650 750 850 950
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $product_id
## $breaks
## [1] 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300
##
## $counts
## [1] 398 386 372 405 391 415 386 401 392 403 429 381 241
##
## $density
## [1] 0.000796 0.000772 0.000744 0.000810 0.000782 0.000830 0.000772 0.000802
## [9] 0.000784 0.000806 0.000858 0.000762 0.000482
##
## $mids
## [1] 50 150 250 350 450 550 650 750 850 950 1050 1150 1250
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $price_per_unit
## $breaks
## [1] 90 92 94 96 98 100 102 104 106 108 110 112 114 116 118 120
##
## $counts
## [1] 559 570 424 429 290 277 127 535 134 256 264 269 135 431 300
##
## $density
## [1] 0.0559 0.0570 0.0424 0.0429 0.0290 0.0277 0.0127 0.0535 0.0134 0.0256
## [11] 0.0264 0.0269 0.0135 0.0431 0.0300
##
## $mids
## [1] 91 93 95 97 99 101 103 105 107 109 111 113 115 117 119
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $quantity
## $breaks
## [1] 1.0 1.2 1.4 1.6 1.8 2.0 2.2 2.4 2.6 2.8 3.0
##
## $counts
## [1] 1649 0 0 0 1740 0 0 0 0 1611
##
## $density
## [1] 1.649 0.000 0.000 0.000 1.740 0.000 0.000 0.000 0.000 1.611
##
## $mids
## [1] 1.1 1.3 1.5 1.7 1.9 2.1 2.3 2.5 2.7 2.9
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $total_price
## $breaks
## [1] 80 100 120 140 160 180 200 220 240 260 280 300 320 340 360
##
## $counts
## [1] 768 881 0 0 57 738 484 461 0 270 439 312 278 312
##
## $density
## [1] 0.00768 0.00881 0.00000 0.00000 0.00057 0.00738 0.00484 0.00461 0.00000
## [10] 0.00270 0.00439 0.00312 0.00278 0.00312
##
## $mids
## [1] 90 110 130 150 170 190 210 230 250 270 290 310 330 350
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
# Mean and standard deviation
means <- sapply(data, function(x) {
if (is.numeric(x)) {
return(mean(x, na.rm=TRUE))
} else {
return(NA)
}
})
print(means)
## sales_id order_id product_id price_per_unit quantity
## 2499.5000 503.0382 634.0532 103.5016 1.9924
## total_price
## 206.3600
sds <- sapply(data, function(x) {
if (is.numeric(x)) {
return(sd(x, na.rm=TRUE))
} else {
return(NA)
}
})
print(sds)
## sales_id order_id product_id price_per_unit quantity
## 1443.5200033 285.9644178 363.2557938 9.1950045 0.8075102
## total_price
## 86.3574567
# Check for outliers using IQR
is_outlier <- function(x) {
Q1 <- quantile(x, 0.25)
Q3 <- quantile(x, 0.75)
IQR <- Q3 - Q1
return(x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR))
}
data_outliers <- lapply(data, function(x) {
if(is.numeric(x)) {
return(is_outlier(x))
} else {
return(rep(FALSE, length(x)))
}
})
# Check for the number of outliers
lapply(data_outliers, function(x) sum(x, na.rm=TRUE))
## $sales_id
## [1] 0
##
## $order_id
## [1] 0
##
## $product_id
## [1] 0
##
## $price_per_unit
## [1] 0
##
## $quantity
## [1] 0
##
## $total_price
## [1] 0
# Save the results into a data frame
results <- data.frame(
Fields = names(data),
Distinct_Counts = distinct_counts,
Percent_Zero = percent_zero,
Percent_Blank = percent_blank,
Percent_Null = percent_null,
Sums = sums,
Means = means,
SDs = sds,
Number_of_Outliers = unlist(lapply(data_outliers, function(x) sum(x, na.rm=TRUE)))
)
# Save the results data frame into a CSV file
write.csv(results, "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass4/profiling_data_sales.csv", row.names=FALSE)