Hi, if you are reading this: I have decided to make a good template for the rest of my analyses. Most of my analyses are short and not very good, because they lack a lot of things; they are mostly descriptive statistics.
I only have 1 hour for learning R, so each analysis would be missing a lot; a template lays out what a whole analysis is expected to contain.
With this I can produce both whole analyses (following the template) and short analyses (most of my analyses).
State the objective for the hypothesis testing and Analysis
Load and install libraries
Load the dataset
Summarize the dataset
Clean the dataset (including renaming columns)
Sampling
Central Tendency
Variance and STD
Percentile, Quartile, IQR and boxplots
Plots, Distribution, Kurtosis, and skewness
Confidence interval, effect size, and power
One tail or two tail?
Hypothesis testing (minimum of 2 and maximum of 3 hypotheses)
Define Null and Alternative
State Alpha
Show Degrees of freedom
State decision Rule
Do the test
Conclusion
I might update more in the future
#library(tidyverse)
#library(dplyr)
#library(ggplot2)
#data <- read.csv("data.csv")
#readr::read_delim("path/tomyfile/myfilename.txt", delim = "\t")
#my_data <- readxl::read_excel("my_file.xls")
#my_data <- readxl::read_excel("my_file.xlsx")
#summary_stats <- data %>%
#select(all_of(numerical_vars)) %>%
#summary()
#frequency_distributions <- data %>%
#group_by(across(all_of(categorical_vars))) %>%
#count() %>%
#ungroup()
#data %>%
#str()
#data %>%
#head(7)
#data %>%
#colnames()
#colnames(data)
#missing_values <- data %>%
#summarise(across(everything(), ~ sum(is.na(.x))))
#data <- data %>%
#select(-Column.Name, -Column.Name)
#iqr <- IQR(data$column)
#upper_limit <- quantile(data$column, 0.75) + 1.5 * iqr
#lower_limit <- quantile(data$column, 0.25) - 1.5 * iqr
#clean_data <- data %>%
#filter(column >= lower_limit & column <= upper_limit)
#data %>%
#colnames()
#colnames(data)
#mental_health_care <- mental_health_care %>%
#mutate(
#Indicator_Split = strsplit(Indicator, ", Last"),
#Before_Comma = vapply(Indicator_Split, function(p) p[1], character(1)),
#After_Comma = vapply(Indicator_Split, function(p) p[2], character(1)),
#Recent_Prescription = After_Comma
#) %>%
#select(-Indicator_Split)
#data <- data %>%
#slice_sample(prop = 0.1, replace = FALSE) # sample 10 percent of the whole population
#head(data)
#data %>%
#head(7)
#data_stats <- data %>%
#summarise(
#min_data = min(column),
#max_data = max(column),
#range_data = max_data - min_data,
#mean_data = mean(column),
#median_data = median(column),
#mode_data = names(which.max(table(column))), # base mode() returns the storage type, not the most frequent value
#sd_data = sd(column),
#variance_year = var(column)
#)
#gender_stats <- data %>%
#count(categoricalcolumn) %>%
#mutate(proportion = n / sum(n))
#most_frequent_names <- head(name_stats, 10)
#print(paste("The variance of data is:", data_stats$variance_year))
#calculate_quantiles <- function(x) {
#data.frame(
#percentile_10 = quantile(x, 0.1),
#percentile_25 = quantile(x, 0.25),
#percentile_50 = quantile(x, 0.5),
#percentile_75 = quantile(x, 0.75),
#percentile_90 = quantile(x, 0.9),
#quartile_1 = quantile(x, 0.25),
#quartile_3 = quantile(x, 0.75),
#iqr = diff(quantile(x, c(0.25, 0.75)))
#)
#}
#data %>%
#summarise(
#year_quantiles = list(calculate_quantiles(Year.of.Birth)),
#count_quantiles = list(calculate_quantiles(Count)),
#rank_quantiles = list(calculate_quantiles(Rank))
#) %>%
#unnest(cols = c(year_quantiles, count_quantiles, rank_quantiles), names_sep = "_") %>%
#print()
#ggplot(data, aes(x = x_column, y = y_column)) +
#geom_line() +
#labs(title = "Line Plot", x = "X-axis", y = "Y-axis")
#ggplot(data, aes(x = x_column, y = y_column)) +
#geom_point() +
#labs(title = "Scatter Plot", x = "X-axis", y = "Y-axis")
#ggplot(data, aes(x = x_column)) +
#geom_histogram(binwidth = 1) + # Adjust binwidth as needed
#labs(title = "Histogram", x = "X-axis", y = "Frequency")
#ggplot(data, aes(x = "", fill = category)) +
#geom_bar(stat = "count") +
#coord_polar(theta = "y") +
#labs(title = "Pie Chart")
#ggplot(data, aes(x = category, y = value)) +
#geom_bar(stat = "summary", fun = "mean") +
#labs(title = "Bar Chart", x = "Category", y = "Value")
#ggplot(data, aes(x = category, y = value)) +
#geom_boxplot() +
#labs(title = "Box Plot", x = "Category", y = "Value")
8) Kurtosis, plots, and distribution
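#library(moments)
#library(e1071) # also defines skewness()/kurtosis(); loaded second, it masks the moments versions
#skewness_value <- moments::skewness(data$x)
#kurtosis_value <- moments::kurtosis(data$x) # raw kurtosis (normal = 3); e1071::kurtosis() reports excess kurtosis
#cat("Skewness:", skewness_value, "\n")
#cat("Kurtosis:", kurtosis_value, "\n")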
#confidence_level <- 0.95
#confidence_intervals <- data %>%
#group_by(column) %>%
#summarize(
#mean_count = mean(column),
#sd_count = sd(column),
#n = n(),
#lower_ci = mean_count - qt(1 - (1 - confidence_level) / 2, df = n - 1) * sd_count / sqrt(n),
#upper_ci = mean_count + qt(1 - (1 - confidence_level) / 2, df = n - 1) * sd_count / sqrt(n)
#)
#print(confidence_intervals)
#cohen_d <- data %>%
#group_by(column) %>%
#summarize(
#mean_count = mean(Column),
#sd_count = sd(Column),
#n = n()
#) %>%
#mutate(
#pooled_sd = sqrt(((n[1] - 1) * sd_count[1]^2 + (n[2] - 1) * sd_count[2]^2) / (n[1] + n[2] - 2)),
#cohen_d = (mean_count[1] - mean_count[2]) / pooled_sd
#)
#print(cohen_d$cohen_d)
#power_analysis <- pwr::pwr.t.test(n = NULL, d = 0.5, sig.level = 0.05, power = 0.8)
#print(power_analysis)
#Let x and y be the column for testing
# One Sample T-Test
#t.test(x, mu = 5)
# Two Sample T-Test
#t.test(x, y)
# Directional hypothesis testing
#t.test(x, mu = 2, alternative = 'greater')
# one sample test
#wilcox.test(x, exact = FALSE)
# Two sample test
#wilcox.test(x, y)
#Correlation
#cor.test(dataset$column1, dataset$column2)