WHOLE-ANALYSIS

Hi, if you are reading this: I have decided to make a good template for the rest of my analyses. Most of my analyses are short and not very thorough, because they lack many things and consist mostly of descriptive statistics.

I only have 1 hour for learning R, so my analyses will inevitably lack many things; having a template shows what a whole analysis is expected to contain.

By doing this, I can accommodate both a whole analysis (following this template) and short analyses (most of my analyses).

WHOLE ANALYSIS

State the objective for the hypothesis testing and Analysis
Load and install libraries
Load the dataset
Summarize the dataset
Clean the dataset (including renaming columns)
Sampling
Central Tendency
Variance and STD
Percentile, Quartile, IQR and boxplots
Plots, Distribution, Kurtosis, and skewness
Confidence interval, effect size, and power
One tail or two tail?
Hypothesis Testing (minimum of 2 and maximum of 3 hypotheses)

Define Null and Alternative
State Alpha
Show Degrees of freedom
State decision Rule
Do the test
Conclusion

FUNCTIONS IN THE ANALYSIS

I might update more in the future

Load Library

#library(tidyverse)
#library(dplyr)
#library(ggplot2)

Read Data (CSV, XLSX, etc.). Don't forget to load the library needed for reading the specific file type.

#data <- read.csv("data.csv")
#readr::read_delim("path/tomyfile/myfilename.txt", delim = "\t")
#my_data <- read_excel("my_file.xls")
#my_data <- read_excel("my_file.xlsx")

Summary of the Dataset

#summary_stats <- data %>%
  #select(all_of(numerical_vars)) %>%
  #summary()

#frequency_distributions <- data %>%
  #group_by(across(all_of(categorical_vars))) %>%
  #count() %>%
  #ungroup()

#data %>%
#str()

#data %>%
#head(7)

#data %>%
#colnames()

#colnames(data)

Cleaning

#missing_values <- data %>%
  #summarise(across(everything(), ~ sum(is.na(.x))))

#data <- data %>%
  #select(-Column.Name, -Column.Name)

#iqr <- IQR(data$column)
#upper_limit <- quantile(data$column, 0.75) + 1.5 * iqr
#lower_limit <- quantile(data$column, 0.25) - 1.5 * iqr

#clean_data <- data %>%
  #filter(column >= lower_limit & column <= upper_limit)

#data %>%
#colnames()

#colnames(data)

#mental_health_care <- mental_health_care %>%
  #mutate(
    #Indicator_Split = strsplit(Indicator, ", Last"),
    #Before_Comma = vapply(Indicator_Split, function(parts) parts[1], character(1)),
    #After_Comma = vapply(Indicator_Split, function(parts) parts[2], character(1)),
    #Recent_Prescription = After_Comma
  #) %>%
  #select(-Indicator_Split)

Sampling

#data <- data %>%
  #slice_sample(prop = 0.10, replace = FALSE) # draws 10 percent of the whole population

#head(data)

#data %>%
#head(7)

Central Tendency, Variance and STD, Quartile and Percentile

#data_stats <- data %>%
  #summarise(
    #min_data = min(column),
    #max_data = max(column),
    #range_data = max_data - min_data,
    #mean_data = mean(column),
    #median_data = median(column),
    #mode_data = names(which.max(table(column))), # most frequent value; base mode() only returns the storage type
    #sd_data = sd(column),
    #variance_year = var(column)
  #)

#gender_stats <- data %>%
  #count(categoricalcolumn) %>%
  #mutate(proportion = n / sum(n))

#most_frequent_names <- head(name_stats, 10)

#print(paste("The variance of data is:", data_stats$variance_year))

#calculate_quantiles <- function(x) {
  #data.frame(
    #percentile_10 = quantile(x, 0.1),
    #percentile_25 = quantile(x, 0.25),
    #percentile_50 = quantile(x, 0.5),
    #percentile_75 = quantile(x, 0.75),
    #percentile_90 = quantile(x, 0.9),
    #quartile_1 = quantile(x, 0.25),
    #quartile_3 = quantile(x, 0.75),
    #iqr = diff(quantile(x, c(0.25, 0.75)))
  #)
#}
#data %>%
  #summarise(
  #year_quantiles = list(calculate_quantiles(Year.of.Birth)),
    #count_quantiles = list(calculate_quantiles(Count)),
  #rank_quantiles = list(calculate_quantiles(Rank))
  #) %>%
  #unnest(cols = c(year_quantiles, count_quantiles, rank_quantiles), names_sep = "_") %>%
  #print()

Plots

#ggplot(data, aes(x = x_column, y = y_column)) +
  #geom_line() +
  #labs(title = "Line Plot", x = "X-axis", y = "Y-axis")

#ggplot(data, aes(x = x_column, y = y_column)) +
  #geom_point() +
  #labs(title = "Scatter Plot", x = "X-axis", y = "Y-axis")

#ggplot(data, aes(x = x_column)) +
  #geom_histogram(binwidth = 1) + # Adjust binwidth as needed
  #labs(title = "Histogram", x = "X-axis", y = "Frequency")


#ggplot(data, aes(x = "", fill = category)) +
  #geom_bar(stat = "count") +
  #coord_polar(theta = "y") +
  #labs(title = "Pie Chart")

#ggplot(data, aes(x = category, y = value)) + 
  #geom_bar(stat = "summary", fun = "mean") +
  #labs(title = "Bar Chart", x = "Category", y = "Value")

#ggplot(data, aes(x = category, y = value)) +
  #geom_boxplot() +
  #labs(title = "Box Plot", x = "Category", y = "Value")

Kurtosis, plots and distribution

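#library(moments)
#library(e1071) # also defines skewness() and kurtosis(); the package loaded last masks the other

#skewness_value <- e1071::skewness(data$x)
#kurtosis_value <- e1071::kurtosis(data$x) # excess kurtosis by default (normal = 0); moments::kurtosis() reports Pearson kurtosis (normal = 3)

#cat("Skewness:", skewness_value, "\n")
#cat("Kurtosis:", kurtosis_value, "\n")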

Confidence Interval, Effect Size and power.

#confidence_level <- 0.95

#confidence_intervals <- data %>%
  #group_by(column) %>%
  #summarize(
    #mean_count = mean(column),
    #sd_count = sd(column),
    #n = n(),
    #lower_ci = mean_count - qt(1 - (1 - confidence_level) / 2, df = n - 1) * sd_count / sqrt(n),
    #upper_ci = mean_count + qt(1 - (1 - confidence_level) / 2, df = n - 1) * sd_count / sqrt(n)
  #)

#print(confidence_intervals)


#cohen_d <- data %>%
  #group_by(column) %>%
  #summarize(
    #mean_count = mean(Column),
    #sd_count = sd(Column),
    #n = n()
  #) %>%
  #mutate(
    #pooled_sd = sqrt(((n[1] - 1) * sd_count[1]^2 + (n[2] - 1) * sd_count[2]^2) / (n[1] + n[2] - 2)),
    #cohen_d = (mean_count[1] - mean_count[2]) / pooled_sd
  #)

#print(cohen_d$cohen_d)

#library(pwr)
#power_analysis <- pwr.t.test(n = NULL, d = 0.5, sig.level = 0.05, power = 0.8)

#print(power_analysis)

Hypothesis Testing

#Let x and y be the column for testing

# One Sample T-Test
#t.test(x, mu = 5)

# Two Sample T-Test
#t.test(x, y)

# Directional hypothesis testing
#t.test(x, mu = 2, alternative = 'greater')

# One-sample Wilcoxon signed-rank test
#wilcox.test(x, exact = FALSE)

# Two-sample Wilcoxon rank-sum (Mann-Whitney) test
#wilcox.test(x, y)

#Correlation
#cor.test(dataset$column1, dataset$column2)

WHOLE-ANALYSIS_TEMPLATE

AJ

2024-10-11

WHOLE ANALYSIS

FUNCTIONS IN THE ANALYSIS