Hi! If you are reading this: I have decided to build a good template for the rest of my analyses. Most of my analyses so far are short and not very thorough, because they consist mostly of descriptive statistics.

I only have 1 hour for learning R, so my work will inevitably lack many things; having a template shows what a whole analysis is expected to contain.

With this in place, I can do both whole analyses (following the template) and short analyses (most of my analyses).

WHOLE ANALYSIS

  1. State the objective for the hypothesis testing and Analysis

  2. Load and install libraries

  3. Load the dataset

  4. Summarize the dataset

  5. Clean the dataset (including renaming columns)

  6. Sampling

  7. Central Tendency

  8. Variance and STD

  9. Percentile, Quartile, IQR and boxplots

  10. Plots, Distribution, Kurtosis, and skewness

  11. Confidence interval, effect size, and power

  12. One tail or two tail?

  13. Hypothesis Testing (minimum of 2 and maximum of 3 hypotheses)

  1. Define Null and Alternative

  2. State Alpha

  3. Show Degrees of freedom

  4. State decision Rule

  5. Do the test

  6. Conclusion

FUNCTIONS IN THE ANALYSIS

I might update more in the future

  1. Load Library
#library(tidyverse)
#library(dplyr)
#library(ggplot2)
  1. Read Data (CSV, XLSX, etc.). Don't forget to load the library needed for the specific file type (e.g. readr for delimited text, readxl for Excel files)
#data <- read.csv("data.csv")
#readr::read_delim("path/tomyfile/myfilename.txt", delim = "\t")
#my_data <- read_excel("my_file.xls")
#my_data <- read_excel("my_file.xlsx")
  1. Summary of the Dataset
#summary_stats <- data %>%
  #select(all_of(numerical_vars)) %>%
  #summary()

#frequency_distributions <- data %>%
  #group_by(across(all_of(categorical_vars))) %>%
  #count() %>%
  #ungroup()

#data %>%
#str()

#data %>%
#head(7)

#data %>%
#colnames()

#colnames(data)
  1. Cleaning
#missing_values <- data %>%
  #summarise(across(everything(), ~ sum(is.na(.x))))

#data <- data %>%
  #select(-Column.Name, -Column.Name)

#iqr <- IQR(data$column)
#upper_limit <- quantile(data$column, 0.75) + 1.5 * iqr
#lower_limit <- quantile(data$column, 0.25) - 1.5 * iqr

#clean_data <- data %>%
  #filter(column >= lower_limit & column <= upper_limit)

#data %>%
#colnames()

#colnames(data)

#mental_health_care <- mental_health_care %>%
  #mutate(
    #Indicator_Split = strsplit(Indicator, ", Last"),
    #Before_Comma = Indicator_Split[[1]][1],
    #After_Comma = Indicator_Split[[1]][2],
    #Recent_Prescription = After_Comma
  #) %>%
  #select(-Indicator_Split)
  1. Sampling
#data <- data %>%
  #sample_n({numerical value must be 10 percent of whole population }, replace = FALSE)

#head(data)

#data %>%
#head(7)
  1. Central Tendency, Variance and STD, Quartile and Percentile
#data_stats <- data %>%
  #summarise(
    #min_data = min(data),
    #max_data = max(data),
    #range_data = max_data - min_data,
    #mean_data = mean(data),
    #median_data = median(data),
    #mode_data = names(which.max(table(data))), # base R mode() returns the storage type, not the most frequent value
    #sd_data = sd(data),
    #variance_year = var(data)
  #)

#gender_stats <- data %>%
  #count(categoricalcolumn) %>%
  #mutate(proportion = n / sum(n))

#most_frequent_names <- head(name_stats, 10)

#print(paste("The variance of data is:", data_stats$variance_year))

#calculate_quantiles <- function(x) {
  #data.frame(
    #percentile_10 = quantile(x, 0.1),
    #percentile_25 = quantile(x, 0.25),
    #percentile_50 = quantile(x, 0.5),
    #percentile_75 = quantile(x, 0.75),
    #percentile_90 = quantile(x, 0.9),
    #quartile_1 = quantile(x, 0.25),
    #quartile_3 = quantile(x, 0.75),
    #iqr = diff(quantile(x, c(0.25, 0.75)))
  #)
#}
#data %>%
  #summarise(
  #year_quantiles = list(calculate_quantiles(Year.of.Birth)),
    #count_quantiles = list(calculate_quantiles(Count)),
  #rank_quantiles = list(calculate_quantiles(Rank))
  #) %>%
  #unnest(cols = c(year_quantiles, count_quantiles, rank_quantiles), names_sep = "_") %>%
  #print()
  1. Plots
#ggplot(data, aes(x = x_column, y = y_column)) +
  #geom_line() +
  #labs(title = "Line Plot", x = "X-axis", y = "Y-axis")

#ggplot(data, aes(x = x_column, y = y_column)) +
  #geom_point() +
  #labs(title = "Scatter Plot", x = "X-axis", y = "Y-axis")

#ggplot(data, aes(x = x_column)) +
  #geom_histogram(binwidth = 1) + # Adjust binwidth as needed
  #labs(title = "Histogram", x = "X-axis", y = "Frequency")


#ggplot(data, aes(x = "", fill = category)) +
  #geom_bar(stat = "count") +
  #coord_polar(theta = "y") +
  #labs(title = "Pie Chart")

#ggplot(data, aes(x = category, y = value)) + 
  #geom_bar(stat = "summary", fun = "mean") +
  #labs(title = "Bar Chart", x = "Category", y = "Value")

#ggplot(data, aes(x = category, y = value)) +
  #geom_boxplot() +
  #labs(title = "Box Plot", x = "Category", y = "Value")

8) Kurtosis, skewness, plots and distribution

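# Load only ONE of the two packages below: both define skewness() and kurtosis(),
# so the one loaded last masks the other. moments::kurtosis() reports Pearson
# kurtosis (normal = 3); e1071::kurtosis() reports excess kurtosis (normal = 0).
#library(moments)
#library(e1071)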

#skewness <- skewness(data$x)
#kurtosis <- kurtosis(data$x)

#cat("Skewness:", skewness, "\n")
#cat("Kurtosis:", kurtosis, "\n")
  1. Confidence Interval, Effect Size and power.
#confidence_level <- 0.95

#confidence_intervals <- data %>%
  #group_by(column) %>%
  #summarize(
    #mean_count = mean(column),
    #sd_count = sd(column),
    #n = n(),
    #lower_ci = mean_count - qt(1 - (1 - confidence_level)/2, df = n - 1) * sd_count / sqrt(n),
    #upper_ci = mean_count + qt(1 - (1 - confidence_level)/2, df = n - 1) * sd_count / sqrt(n)
  #)

#print(confidence_intervals)


#cohen_d <- data %>%
  #group_by(column) %>%
  #summarize(
    #mean_count = mean(Column),
    #sd_count = sd(Column),
    #n = n()
  #) %>%
  #mutate(
    #pooled_sd = sqrt(((n[1] - 1) * sd_count[1]^2 + (n[2] - 1) * sd_count[2]^2) / (n[1] + n[2] - 2)),
    #cohen_d = (mean_count[1] - mean_count[2]) / pooled_sd
  #)

#print(cohen_d$cohen_d)

#library(pwr) # pwr.t.test() comes from the pwr package
#power_analysis <- pwr.t.test(n = NULL, d = 0.5, sig.level = 0.05, power = 0.8)

#print(power_analysis)
  1. Hypothesis Testing
#Let x and y be the column for testing

# One Sample T-Test
#t.test(x, mu = 5)

# Two Sample T-Test
#t.test(x, y)

# Directional hypothesis testing
#t.test(x, mu = 2, alternative = 'greater')

# One-sample Wilcoxon signed-rank test (non-parametric alternative to the one-sample t-test)
#wilcox.test(x, exact = FALSE)

# Two-sample Wilcoxon rank-sum (Mann-Whitney) test (non-parametric alternative to the two-sample t-test)
#wilcox.test(x, y)

#Correlation
#cor.test(dataset$column1, dataset$column2)