R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Build a per-column metadata table (name, class, widest value, one example
# value) for the leading columns of the cleaned data set and save it as CSV.

# Folder holding the input file and receiving the report. Paths are built with
# file.path() rather than setwd(), so the session's working directory is not
# changed as a side effect of producing the report.
project_dir <- "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass2"

# Load data
data <- read.csv(file.path(project_dir, "cleaneddata.csv"))

# Select data range: the first 20 columns, or every column when the file has
# fewer than 20. drop = FALSE keeps a data frame even for a single column.
selected_data <- data[, seq_len(min(20L, ncol(data))), drop = FALSE]

# Meta analysis data
metadata <- data.frame(
  Field_Name = names(selected_data),
  # A column may carry several classes; collapse them into one string so every
  # column yields exactly one value.
  Data_Type = vapply(
    selected_data,
    function(x) paste(class(x), collapse = "/"),
    character(1)
  ),
  # Width of the longest value once rendered as text. Missing values are
  # skipped: nchar() returns NA for them, which would otherwise turn the
  # maximum of the whole column into NA.
  Field_Size = vapply(
    selected_data,
    function(x) {
      widths <- nchar(as.character(x))
      widths <- widths[!is.na(widths)]
      if (length(widths) == 0L) NA_integer_ else max(widths)
    },
    integer(1)
  ),
  # One randomly chosen value per column, as text. Picking by index avoids
  # sample()'s special case where a single number n is treated as 1:n.
  Sample_Data = vapply(
    selected_data,
    function(x) {
      if (length(x) == 0L) {
        return(NA_character_)
      }
      as.character(x[[sample.int(length(x), 1L)]])
    },
    character(1)
  )
)

# Write to CSV
write.csv(
  metadata,
  file.path(project_dir, "cleanddata_metadataanalysis.csv"),
  row.names = FALSE
)
# Profiling data: one row per column of selected_data with its distinct-value
# count, share of zero/blank/missing entries, and basic numeric statistics.

# Apply `stat` to the non-missing values of every numeric column.
# Non-numeric columns, and numeric columns with no non-missing values, yield
# NA, which avoids the Inf/-Inf results and warnings that min()/max() produce
# on empty input.
numeric_stat <- function(stat) {
  vapply(
    selected_data,
    function(x) {
      if (!is.numeric(x)) {
        return(NA_real_)
      }
      present <- x[!is.na(x)]
      if (length(present) == 0L) NA_real_ else stat(present)
    },
    numeric(1)
  )
}

profiling <- data.frame(
  Field_Name = names(selected_data),
  Distinct_Count = vapply(
    selected_data,
    function(x) length(unique(x)),
    integer(1)
  ),
  # Stored as a fraction between 0 and 1, not multiplied by 100.
  # is.na(x) is TRUE for missing entries, so the NA produced by the two
  # comparisons never leaks into the mean.
  Zero_Blank_Null_Percent = vapply(
    selected_data,
    function(x) mean(x == 0 | x == "" | is.na(x)),
    numeric(1)
  ),
  Sum = numeric_stat(sum),
  Min = numeric_stat(min),
  Max = numeric_stat(max),
  Mean = numeric_stat(mean),
  Std_Dev = numeric_stat(sd)
)

# Destination folder for the report. The path is built with file.path()
# rather than setwd(), so the session's working directory is not changed.
output_dir <- "C:/Users/kms57/Desktop/UTS/Sem 2/32558 Business Intelligence/ass2"

# Write to CSV
write.csv(
  profiling,
  file.path(output_dir, "cleanddata_dataprofiling.csv"),
  row.names = FALSE
)