pacman::p_load(readxl, openxlsx, kableExtra, tidyverse, tidymodels)

# Read the dataset from the .csv file (all columns, including the saved row numbers in X)
mydata_csv <- read.csv(file = "mydata.csv")
mydata_csv

# Read the same file again, this time dropping the first column
mydata_csv <- read.csv(file = "mydata.csv")[, -1]
mydata_csv

# Read the dataset from the space-separated .txt file (first line holds the column names)
mydata_txt <- read.table(file = "mydata.txt", sep = " ", header = TRUE)
mydata_txt

# Read the dataset from the .xlsx file with openxlsx::read.xlsx()
mydata_xlsx <- read.xlsx("mydata.xlsx")
mydata_xlsx

# Read a chosen sheet and cell range with readxl::read_xlsx()
# NOTE(review): A1:E20 is the header row plus 19 data rows, so the 20th
# observation returned by read.xlsx() above is not imported -- confirm that
# this is intended (use "A1:E21" to import all 20).
mydata_xlsx <- read_xlsx(
  "mydata.xlsx",
  sheet = "Sheet 1",
  range = "A1:E20"
)
mydata_xlsx

# Special symbols used in R: ``, '', "", [ row , column ], $ and ~.
# ``, '' and "" are used for quoting.
# [ row , column ] is used for indexing.
mydata_xlsx[1:2, 2] # Rows 1 and 2 of the second column

# $ is used to access a column of a dataset by name.
mydata_xlsx$age # The age column

mydata_xlsx$family[3:5] # Rows 3 to 5 of the family column

mydata_xlsx[-c(2, 6), -2] # Negative indices exclude: everything except rows 2 and 6 and column 2

# ~ is used in formulas.
# For example, y ~ x means y is modeled as a function of x.
boxplot(age ~ family, data = mydata_xlsx) # Boxplot of age by family

pacman::p_load(car) # qqPlot() comes from the car package; this load could be moved to the top of the script.
qqPlot(age ~ family, data = mydata_xlsx) # Q-Q plot of age by family

# Keep the age, family and land columns, then keep only the rows
# where age > 40 and family is 1 or 2
mydata_xlsx %>% 
  select(age, family, land) %>% 
  filter(age >40, family %in% c(1, 2)) # conditions separated by a comma must all be TRUE

# Stack the age and family columns: the column names go into `merged`
# and the corresponding values into `values`
long <- pivot_longer(
  mydata_xlsx,
  cols = c(age, family),
  names_to = "merged",
  values_to = "values"
)
long

# Unstack: spread `merged` back out into separate age and family columns
short <- pivot_wider(long, names_from = merged, values_from = values)
short

# Derive a new variable (income_per_member) with mutate()
short <- mutate(short, income_per_member = income / family)
head(short)

# Two-level status from a vectorised ifelse():
# "poor" when income_per_member < 1000, otherwise "rich"
short <- mutate(
  short,
  status = ifelse(income_per_member < 1000, "poor", "rich")
)
head(short)

# The same two-level status (status1) written with case_when()
short <- mutate(
  short,
  status1 = case_when(
    income_per_member < 1000 ~ "poor",
    income_per_member >= 1000 ~ "rich"
  )
)
head(short)

# Three-level status (status2): poor < 1000, middle 1000-2000, rich > 2000,
# built with nested ifelse() and the dollar sign.
# This is the base-R way; tidy thinking would use %>% and mutate().
short$status2 <- ifelse(
  short$income_per_member < 1000,
  "poor",
  ifelse(short$income_per_member <= 2000, "middle", "rich")
)
head(short)

# Cut income_per_member into intervals with cut() and save the result as status3.
# right = FALSE makes every interval left-closed, [a, b), so the bins are
#   poor   [-Inf, 1000)
#   middle [1000, 2000)
#   rich   [2000, Inf)
# The break is exactly 2000: a stand-in such as 1999.9999 would wrongly put
# values between 1999.9999 and 2000 into "rich".
# (A value of exactly 2000 is "rich" here, but "middle" in status2, which uses <= 2000.)
short$status3 <- cut(
  short$income_per_member,
  breaks = c(-Inf, 1000, 2000, Inf),
  labels = c("poor", "middle", "rich"),
  right = FALSE
)
head(short)

# Create a frequency table of status3
short %>% select(status3) %>% table() # tidy way
table(short$status3) # traditional way

#> status3
#>   poor middle   rich 
#>     11      4      4
#>
#>   poor middle   rich 
#>     11      4      4

# Create a cross-tabulation of status3 (rows) and land (columns)
table(short$status3, short$land) # Base R way: pass the two vectors to table()
# Tidy way: select both columns and pipe the result into table()
short %>% select(status3, land) %>% table()

        
#>          rural urban
#>   poor       7     4
#>   middle     1     3
#>   rich       1     3
#>
#>         land
#> status3  rural urban
#>   poor       7     4
#>   middle     1     3
#>   rich       1     3

# Mean, SD, Median, Min, Max, Range, IQR, Summary functions
# Select a variable using $ sign and apply the function to it
mean(mydata_xlsx$income) # Mean of income
sd(mydata_xlsx$income) # Standard deviation of income
median(mydata_xlsx$income) # Median of income
min(mydata_xlsx$income) # Minimum of income
max(mydata_xlsx$income) # Maximum of income
range(mydata_xlsx$income) # Range of income, returned as c(minimum, maximum)
IQR(mydata_xlsx$income) # Interquartile range of income
summary(mydata_xlsx$income) # Min, quartiles, median, mean and max of income

#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>    1002    2191    2759    2941    3756    4940

# Tidy way: several statistics of income in one summarise() call
# se = standard error = sd / sqrt(n)
# cv = coefficient of variation (%) = sd / mean * 100
mydata_xlsx %>%
  summarise(
    mean = mean(income),
    sd = sd(income),
    iqr = IQR(income),
    cv = sd(income) * 100 / mean(income),
    se = sd(income) / sqrt(n()),
    frequency = n()
  )

# Tidy way: the same statistics computed separately for each level of land
mydata_xlsx %>%
  group_by(land) %>%
  summarise(
    mean = mean(income),
    sd = sd(income),
    iqr = IQR(income),
    cv = sd(income) * 100 / mean(income),
    se = sd(income) / sqrt(n()),
    frequency = n()
  )

# Multiple variables, multiple functions
# 1. summarise(across()) gives one column per variable/statistic pair,
#    named <variable>_<statistic> (e.g. age_mean); it replaces the
#    superseded summarise_all()
# 2. t() transposes the one-row result so that each pair becomes a row
# 3. separate() splits the row names at "_" into variable and statistics
# 4. arrange() sorts by variable, pivot_wider() puts each statistic in its own column
# Other helpers: rename() the columns, unite() several columns into one,
# and kable() from the kableExtra package to format the final table
mydata_xlsx %>%
  select(age, income, family) %>%
  summarise(across(everything(), list(mean = mean, max = max))) %>%
  t() %>%
  as.data.frame() %>%
  round(2) %>%
  rownames_to_column("variable") %>%
  separate(variable, c("variable", "statistics"), sep = "_") %>%
  arrange(variable) %>%
  pivot_wider(names_from = statistics, values_from = V1)

# Custom function: standard error of the mean, sd(x) / sqrt(n)
# A function is written as: name <- function(arguments) { body }
#
# Arguments:
#   x      numeric vector
#   na.rm  if TRUE, missing values are removed first, so n counts only the
#          observed values; the default FALSE returns NA when x contains NA
# Returns: a single number (the value of the last expression in the body)
se <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  sd(x) / sqrt(length(x))
}

se(mydata_xlsx$income) %>% round(2) # Standard error of income, rounded to 2 decimals

# IQR = Q3 − Q1, Q2 = Median
# Boxplot: Q1, Q2, Q3, Whiskers, Outliers
# The lower whisker extends from Q1 to the minimum value within 1.5 × IQR below Q1.
# The upper whisker extends from Q3 to the maximum value within 1.5 × IQR above Q3.
# Outliers are the values beyond the whiskers.
boxplot(mydata_xlsx$income) # Boxplot of income

head(mydata_xlsx)

# Change the variable types and save (overwrite) the dataset under its original name.
# mutate(across(<columns>, <function>)) replaces the superseded
# mutate_if() / mutate_at() helpers.

# Every character column becomes a factor
mydata_xlsx <- mydata_xlsx %>%
  mutate(across(where(is.character), as.factor))
head(mydata_xlsx)

# serial and family become integers
mydata_xlsx <- mydata_xlsx %>%
  mutate(across(c(serial, family), as.integer))
head(mydata_xlsx)

# family goes back to numeric (double); age and income are rounded to 2 decimals
mydata_xlsx <- mydata_xlsx %>%
  mutate(across(family, as.numeric)) %>%
  mutate(across(c(age, income), ~ round(.x, 2)))
head(mydata_xlsx)

set.seed(123)
n <- 30 # Size of each sample
B <- 1000 # Number of samples
# Draw B samples of size n from income (with replacement) and keep the mean of each
sample_means <- replicate(
  B,
  mean(sample(mydata_xlsx$income, size = n, replace = TRUE))
)

# Centre and spread of the B sample means
mean_sample_means <- mean(sample_means) # Mean of the sample means
mean_sample_means
sd_sample_means <- sd(sample_means) # Standard deviation of the sample means
sd_sample_means
se_sample_means <- sd_sample_means # The SD of the sample means is used as the standard error
se_sample_means

# Plot the sampling distribution of the sample mean
hist(
  sample_means,
  breaks = 30,
  col = "skyblue",
  main = "Sampling Distribution of the Sample Mean",
  xlab = "Sample Mean"
)

# Curly braces { } group several statements so that they run as one unit
{
  # Limits at mean ± 1.96 SE, computed once so that the vertical lines and
  # the printed interval label always agree
  ci_limits <- mean_sample_means + c(-1, 1)*se_sample_means*1.96

  plot(density(sample_means), col = "blue",
       main = "Sampling Distribution of the Sample Mean",
       xlab = "x-axis => Sample Mean or z-values \n Probability = Area under the curve.",
       ylab = "Density or relative frequency", lty = 1, lwd = 2)
  abline(v = mean_sample_means, col = "red", lty = 2, lwd = 2) # Vertical line at the mean of the sample means
  abline(v = ci_limits, col = "black", lty = 1, lwd = 2) # Vertical lines at mean ± 1.96 SE

  # The x/y positions of the annotations below are hand-tuned for this dataset
  text(3000, 0.0005, paste('mean =', round(mean_sample_means)), pos = 4, col = "red", srt = 90) # Label for the mean
  text(mean_sample_means, 0, '0', pos = 3, col = "black") # z = 0 at the mean
  text(3400, 0.0005, 'mean+1.96se', pos = 4, col = "black", srt = 90)
  text(3200, 0.0000, '+1.96', pos = 4, col = "black", srt = 0)
  text(2400, 0.0005, 'mean-1.96se', pos = 4, col = "black", srt = 90)
  text(2400, 0.0000, '-1.96', pos = 4, col = "black", srt = 0)
  # Interval label built from the computed limits instead of typed-in numbers
  text(2900, 0.0020,
       paste0('95% CI [', round(ci_limits[1]), ', ', round(ci_limits[2]), ']'),
       pos = 1, col = "black", srt = 0)
}

# Sort income in ascending order, three ways.
# 1. Tidy, income column only. arrange() must be told which column to sort by;
#    called with no column it returns the rows in their original order.
income_ascending <- mydata_xlsx %>% select(income) %>% arrange(income)
income_ascending

# 2. Base R: sort() on the income vector
income_ascending <- sort(mydata_xlsx$income, decreasing = FALSE)
# 3. Tidy thinking, keeping all columns. arrange() sorts ascending by default
#    and has no `decreasing` argument; use arrange(desc(income)) for descending.
income_ascending <- mydata_xlsx %>% arrange(income)
head(income_ascending)

# Find the 2.5th and 97.5th percentiles of income.
# The result has two rows, so reframe() is used: returning more (or less) than
# one row per summarise() group was deprecated in dplyr 1.1.0.
income_ascending %>%
  reframe(income = quantile(income, c(0.025, 0.975))) %>%
  round(2)
# The central 95% of the observed incomes lie in [1186.531, 4774.397]

# 95% confidence interval for the mean income: mean ± 1.96 × SE
mean_income <- mean(mydata_xlsx$income)
sd_income <- sd(mydata_xlsx$income)
se_income <- se(mydata_xlsx$income)

round(mean_income + c(-1, 1) * 1.96 * se_income, 2)
# 95% CI [2441.083, 3440.422]

# Conditional statements: if, else if, else
# General forms:
#   if (condition) {statement} else {statement}
#   if (condition) {statement} else if (condition) {statement} else {statement}

if (mean(mydata_xlsx$income) <= 3000) {
  print("The mean income is less than or equal to 3000.")
} else {
  print("The mean income is greater than 3000.")
}

#> [1] "The mean income is less than or equal to 3000."

# Three-way decision: the conditions are checked from top to bottom and
# only the first branch whose condition is TRUE runs
if (mean(mydata_xlsx$income) <= 3000) {
  print("The mean income is less than or equal to 3000.")
} else if (mean(mydata_xlsx$income) <= 4000) {
  print("The mean income is greater than 3000 but less than or equal to 4000.")
} else {
  print("The mean income is greater than 4000.")
}

#> [1] "The mean income is less than or equal to 3000."

# Loops: for loop, while loop
# General form: for (variable in sequence) {statement}
# Here i takes the values 1, 2, 3, 4, 5 in turn and each one is printed
for (i in seq_len(5)) {
  print(i)
}

#> [1] 1
#> [1] 2
#> [1] 3
#> [1] 4
#> [1] 5

# Iterate the mean function over three columns of the dataset and save the means in the means object
means <- c() # empty vector for storing the calculated means
for (column in c("age", "family", "income")) {
  means <- c(means, mean(mydata_xlsx[[column]])) # append the mean of this column
}
means %>% round(2)

# Same values using the apply family: vapply() calls mean() on each column and
# checks that every result is a single number (the result is named by column).
# apply() is avoided here because it first converts the data frame to a matrix.
vapply(mydata_xlsx[c("age", "family", "income")], mean, numeric(1)) %>% round(2)

# Same values in tidy thinking (across() replaces the superseded summarise_all())
mydata_xlsx %>%
  select(age, family, income) %>%
  summarise(across(everything(), mean)) %>%
  round(2)

# Look at the format of the column indexing.
# mydata_xlsx[[column]] extracts one column of the dataset as a vector.
# In the loop above, column = "age" in the first iteration (mean of age),
# column = "family" in the second iteration (mean of family), and so on.
# [[ ]] also accepts a column position instead of a column name.
# Just for your checking
mydata_xlsx[[2]] # Second column of the dataset, i.e. age (values only)
mydata_xlsx['age'] # One-column dataset: variable name and values
mydata_xlsx[['age']] # Values only

# ggplot2 is a powerful package for creating graphics in R.
# It is based on the grammar of graphics, which is a structured way (layer by layer) of creating graphics.
# The dataset, the aesthetics (aes) and the geometric shapes (geom) can be supplied
# in different places; each variant below draws a violin plot of mpg for each cyl group.

# Variant 1: data in ggplot(), aesthetics inside the geom
ggplot(mtcars) + 
    geom_violin(aes(cyl, mpg, group = cyl))

# Variant 2: data and aesthetics both in ggplot()
ggplot(mtcars, aes(cyl, mpg, group = cyl)) + 
    geom_violin()

# Variant 3: aesthetics added as a separate layer
ggplot(mtcars) + 
    aes(cyl, mpg, group = cyl) + 
    geom_violin()

# Variant 4: data piped into ggplot(), aesthetics as a separate layer
mtcars %>% ggplot() + 
    aes(cyl, mpg, group = cyl) + 
    geom_violin()

# Variant 5: data piped into ggplot(), aesthetics inside ggplot()
mtcars %>% ggplot(aes(cyl, mpg, group = cyl) ) + 
    geom_violin()

# Variant 6: build the plot step by step in an object, then print it
p = mtcars %>% ggplot(aes(cyl, mpg, group = cyl))
p = p + geom_violin()
p = p + theme_bw()
p

# Scatter plot of x = age and y = income
pacman::p_load(jtools) # provides theme_apa()
ggplot(data = mydata_xlsx, mapping = aes(x = age, y = income)) +
  geom_point() + # one point per observation
  theme_bw() +
  labs(
    title = "Scatter plot of Age and Income", # title and axis labels
    x = "Age",
    y = "Income"
  )

# Boxplot of income by land
ggplot(data = mydata_xlsx, mapping = aes(x = land, y = income)) +
  geom_boxplot() + # one box per level of land
  theme_apa() +
  labs(
    title = "Boxplot of Income by Land",
    x = "Land",
    y = "Income"
  )

# Histogram of income
ggplot(data = mydata_xlsx, mapping = aes(x = income)) +
  geom_histogram(binwidth = 500, fill = "skyblue", color = "white") + # bins 500 wide
  theme_test() +
  labs(
    title = "Histogram of Income",
    x = "Income",
    y = "Frequency"
  )

# Density plot of income, one semi-transparent curve per level of land
ggplot(data = mydata_xlsx, mapping = aes(x = income, fill = land)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Fig. 1.1 Density Plot of Income by Land",
    x = "Income",
    y = "Density"
  ) +
  theme_apa() +
  # theme() comes after theme_apa() so that these settings are not overwritten.
  # NOTE(review): the large negative vjust together with the 50-unit bottom
  # margin appears intended to show the title below the plot, like a figure caption.
  theme(
    plot.title = element_text(hjust = 0.5, vjust = -150),
    plot.margin = margin(10, 10, 50, 10),
    legend.position = "top"
  )

# Bar plot of family by land: counts of each family value, bars side by side per land
ggplot(data = mydata_xlsx, mapping = aes(x = family, fill = land)) +
  geom_bar(position = "dodge") +
  theme_apa() +
  labs(
    title = "Bar Plot of Family by Land",
    x = "Family",
    y = "Count"
  )

# Scatter plot of income by age with a fitted straight line (linear model)
# and its confidence band (se = TRUE).
# The title now says "Scatter Plot": the points are not joined by a line.
ggplot(mydata_xlsx, aes(x = age, y = income, group = 1)) +
  geom_point(color = "tomato") +
  geom_smooth(method = "lm", se = TRUE, formula = y ~ x) +
  labs(title = "Scatter Plot of Income by Age", x = "Age", y = "Income") +
  theme_apa()

# Pie chart of family, labelled by percentage
# Step 1: number of rows (count) and share (percentage) for each family value
pie_data <- mydata_xlsx %>%
  count(family, name = "count") %>%
  mutate(percentage = 100 * count / sum(count))

# Step 2: a single stacked bar drawn in polar coordinates becomes a pie
ggplot(pie_data, aes(x = "", y = count, fill = factor(family))) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0) +
  geom_text(
    aes(label = paste0(round(percentage, 1), "%")),
    position = position_stack(vjust = 0.5), # centre each label in its slice
    size = 6
  ) +
  labs(title = "Pie Chart of Family", fill = "Family") +
  scale_fill_discrete("Family size") +
  theme_void() +
  theme(
    legend.position = "right",
    plot.title = element_text(hjust = 0.5, size = rel(2))
  )

# Using base pie()
x <- pie_data$count
percent <- paste0(round(pie_data$percentage, 2), '%')
categories <- c('One member', 'Two member', 'Three member', 'Four member', 'Five member')
# Pick each label by the family value of its row, so that a family size missing
# from the data cannot shift the remaining labels.
# Assumes family holds whole numbers from 1 to 5.
labels <- paste0(categories[pie_data$family], '\n [', percent, ']')

# One colour per slice, however many slices there are
pie(x, labels, col = rainbow(length(x)))
title(main = "Fig. Pie Chart of Family", adj = 0.5, line = -27)

# Read a specific range (B7:N507) from one sheet (wrangling) of an Excel file (DataSets.xlsx)
df <- read_excel(
  "DataSets.xlsx",
  sheet = "wrangling",
  range = "B7:N507"
)
head(df)

# Preview the data without the sr. column
head(select(df, -sr.))

# We will calculate means, se, upper and lower limits from this dataset and plot them in ggplot.
# The month columns are put in calendar order by turning Months into a factor with
# levels = c('jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec') and
# labels = c('Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec').

# Helper functions. x is a numeric vector; na.rm = TRUE removes missing values
# first (the default FALSE returns NA when x contains NA).

# Standard error of the mean: sd / sqrt(n)
se <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  sd(x) / sqrt(length(x))
}

# Lower limit of the 95% confidence interval: mean - 1.96 * se
lower95 <- function(x, na.rm = FALSE) {
  mean(x, na.rm = na.rm) - 1.96 * se(x, na.rm = na.rm)
}

# Upper limit of the 95% confidence interval: mean + 1.96 * se
upper95 <- function(x, na.rm = FALSE) {
  mean(x, na.rm = na.rm) + 1.96 * se(x, na.rm = na.rm)
}

# Build the plotting table: one row per month with its mean and 95% limits.
# summarise(across()) replaces the superseded summarise_all() and names the
# results <column>_<statistic> (e.g. jan_mean); separate() then splits those
# names into Months and Statistics.
plot_df <- df %>%
  select(-sr.) %>%
  summarise(across(everything(), list(mean = mean, lower = lower95, upper = upper95))) %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column("variable") %>%
  separate(variable, c("Months", "Statistics")) %>%
  pivot_wider(names_from = Statistics, values_from = V1) %>%
  # Factor levels put the months in calendar order instead of alphabetical order
  mutate(Months = factor(
    Months,
    levels = c("jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"),
    labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
  ))
plot_df

# Bar chart of the monthly means with 95% error bars;
# each mean is printed (2 decimals) just above the upper limit
ggplot(data = plot_df, mapping = aes(x = Months, y = mean, group = 1, fill = Months)) +
  geom_col() +
  geom_errorbar(mapping = aes(ymin = lower, ymax = upper), width = 0.25) +
  geom_text(
    mapping = aes(
      y = upper + 0.01,
      label = formatC(mean, format = "f", digits = 2)
    )
  ) +
  labs(x = "", y = "Average proportion of water scarcity") +
  theme_bw() +
  theme(legend.position = "none") # the x axis already names the months

# More complex, try to learn on your own
# Spider plot or radar chart
# ggradar is installed from GitHub: devtools::install_github("ricardo-bion/ggradar")
pacman::p_load(ggradar, scales, patchwork)

# Create a dataset for the spider plot:
# 96 random integers between 1 and 10, filled row by row into 8 columns
set.seed(1)
spider_data <- as.data.frame(
  matrix(
    sample(1:10, size = 96, replace = TRUE),
    ncol = 8,
    byrow = TRUE
  )
)
spider_data

# Change the column names
names(spider_data) <- c(
  "Dhaka", "Khulna", "Barishal", "Chittagong",
  "Rangpur", "Rajshahi", "Sylhet", "Cumilla"
)

# Scale each column to the range 0-1 and keep two decimal places
scaled_data <- round(apply(spider_data, 2, scales::rescale), 2)

# ggradar() treats the FIRST column of its input as the group identifier and
# plots only the remaining columns. Without an explicit group column, Dhaka
# would be consumed as the group label and never drawn as an axis.
# The group label is the row number, stored as text.
plot_data <- data.frame(
  group = as.character(seq_len(nrow(scaled_data))),
  scaled_data,
  check.names = FALSE
)
plot_data

# Radar plot for the first row of the dataset
ggradar(
  plot_data[1, ],
  grid.min = 0, grid.mid = 0.5, grid.max = 1,
  values.radar = c("0%", "50%", "100%"), # labels of the three grid circles
  legend.position = "top"
)

# Radar plot for rows 2 and 3 of the dataset, drawn on the same chart
ggradar(
  plot_data[2:3, ],
  grid.min = 0, grid.mid = 0.5, grid.max = 1,
  values.radar = c("0%", "50%", "100%"),
  legend.position = "top"
)

# Create one radar plot per row of the dataset and combine them in a grid
options(repr.plot.height = 18) # taller plot output (presumably for the Jupyter display)

# lapply() returns the list of plots directly, so no list has to be grown in a
# loop; seq_len() is safe even if plot_data has no rows (1:nrow() would give 1, 0).
radar_plots <- lapply(seq_len(nrow(plot_data)), function(i) {
  ggradar(plot_data[i, ],
          values.radar = c('0%', '50%', '100%'),
          grid.min = 0, grid.mid = 0.5, grid.max = 1,
          plot.title = row.names(plot_data)[i], # title = row name of this row
          legend.position = "top",
          group.point.size = 3,
          group.line.width = 1,
          axis.label.size = 4,
          grid.label.size = 4,
          base.size = 15) +
    theme(plot.title = element_text(size = 15, hjust = 0.5))
})

# Combine the radar plots into a single plot with 3 columns and display it
spider_plot <- wrap_plots(radar_plots, ncol = 3)
spider_plot

# Save the combined plot as a .png file.
# The plot is passed explicitly: without `plot =`, ggsave() saves whatever
# last_plot() returns, which is not guaranteed to be the combined plot.
ggsave("spider_plot.png", plot = spider_plot, dpi = 300, width = 8, height = 9, units = 'in')

X	serial	age	family	income	land
<int>	<int>	<dbl>	<int>	<dbl>	<chr>
1	1	43.31857	2	2759.327	urban
2	2	44.30947	5	4017.901	rural
3	3	49.67612	1	3516.885	rural
4	4	45.21153	1	3840.730	urban
5	5	45.38786	2	1002.499	rural
6	6	50.14519	3	2901.266	rural
7	7	46.38275	4	1880.476	urban
8	8	41.20482	5	2519.266	urban
9	9	42.93944	5	3451.084	rural
10	10	43.66301	3	2407.192	urban
11	11	48.67225	1	1444.542	urban
12	12	46.07944	2	1974.478	urban
13	13	46.20231	5	3672.222	rural
14	14	45.33205	5	2670.587	rural
15	15	43.33248	4	4152.783	urban
16	16	50.36074	5	1411.459	rural
17	17	46.49355	2	2739.571	rural
18	18	39.10015	1	4939.828	urban
19	19	47.10407	1	4572.204	urban
20	20	43.58163	3	4545.876	urban

serial	age	family	income	land
<int>	<dbl>	<int>	<dbl>	<chr>
1	43.31857	2	2759.327	urban
2	44.30947	5	4017.901	rural
3	49.67612	1	3516.885	rural
4	45.21153	1	3840.730	urban
5	45.38786	2	1002.499	rural
6	50.14519	3	2901.266	rural
7	46.38275	4	1880.476	urban
8	41.20482	5	2519.266	urban
9	42.93944	5	3451.084	rural
10	43.66301	3	2407.192	urban
11	48.67225	1	1444.542	urban
12	46.07944	2	1974.478	urban
13	46.20231	5	3672.222	rural
14	45.33205	5	2670.587	rural
15	43.33248	4	4152.783	urban
16	50.36074	5	1411.459	rural
17	46.49355	2	2739.571	rural
18	39.10015	1	4939.828	urban
19	47.10407	1	4572.204	urban
20	43.58163	3	4545.876	urban

	serial	age	family	income	land
	<int>	<dbl>	<int>	<dbl>	<chr>
1	1	43.31857	2	2759.327	urban
2	2	44.30947	5	4017.901	rural
3	3	49.67612	1	3516.885	rural
4	4	45.21153	1	3840.730	urban
5	5	45.38786	2	1002.499	rural
6	6	50.14519	3	2901.266	rural
7	7	46.38275	4	1880.476	urban
8	8	41.20482	5	2519.266	urban
9	9	42.93944	5	3451.084	rural
10	10	43.66301	3	2407.192	urban
11	11	48.67225	1	1444.542	urban
12	12	46.07944	2	1974.478	urban
13	13	46.20231	5	3672.222	rural
14	14	45.33205	5	2670.587	rural
15	15	43.33248	4	4152.783	urban
16	16	50.36074	5	1411.459	rural
17	17	46.49355	2	2739.571	rural
18	18	39.10015	1	4939.828	urban
19	19	47.10407	1	4572.204	urban
20	20	43.58163	3	4545.876	urban

	serial	age	family	income	land
	<dbl>	<dbl>	<dbl>	<dbl>	<chr>
1	1	43.31857	2	2759.327	urban
2	2	44.30947	5	4017.901	rural
3	3	49.67612	1	3516.885	rural
4	4	45.21153	1	3840.730	urban
5	5	45.38786	2	1002.499	rural
6	6	50.14519	3	2901.266	rural
7	7	46.38275	4	1880.476	urban
8	8	41.20482	5	2519.266	urban
9	9	42.93944	5	3451.084	rural
10	10	43.66301	3	2407.192	urban
11	11	48.67225	1	1444.542	urban
12	12	46.07944	2	1974.478	urban
13	13	46.20231	5	3672.222	rural
14	14	45.33205	5	2670.587	rural
15	15	43.33248	4	4152.783	urban
16	16	50.36074	5	1411.459	rural
17	17	46.49355	2	2739.571	rural
18	18	39.10015	1	4939.828	urban
19	19	47.10407	1	4572.204	urban
20	20	43.58163	3	4545.876	urban

serial	age	family	income	land
<dbl>	<dbl>	<dbl>	<dbl>	<chr>
1	43.31857	2	2759.327	urban
2	44.30947	5	4017.901	rural
3	49.67612	1	3516.885	rural
4	45.21153	1	3840.730	urban
5	45.38786	2	1002.499	rural
6	50.14519	3	2901.266	rural
7	46.38275	4	1880.476	urban
8	41.20482	5	2519.266	urban
9	42.93944	5	3451.084	rural
10	43.66301	3	2407.192	urban
11	48.67225	1	1444.542	urban
12	46.07944	2	1974.478	urban
13	46.20231	5	3672.222	rural
14	45.33205	5	2670.587	rural
15	43.33248	4	4152.783	urban
16	50.36074	5	1411.459	rural
17	46.49355	2	2739.571	rural
18	39.10015	1	4939.828	urban
19	47.10407	1	4572.204	urban

Tidy Thinking in R¶

Professor Dr. Md. Kamrul Hasan¶

Piping operator %>% (Ctrl+Shift+M)¶

Most common error source¶

Data processing with tidy thinking¶

Select columns and filter rows¶

Long table (stacking) and Short table (unstacking)¶

Use mydata_xlsx dataset for descriptive statistics¶

Changing variable types¶

Demonstrate CLT (Central Limit Theorem) using this dataset¶

Data visualization: ggplot2 package¶

Bar chart with calculated mean and 95% confidence interval¶

Click here for additional bivariate and multivariate plots commands.¶

Months	mean	lower	upper
<fct>	<dbl>	<dbl>	<dbl>
Jan	0.286	0.24635048	0.32564952
Feb	0.354	0.31204114	0.39595886
Mar	0.392	0.34916482	0.43483518
Apr	0.358	0.31593559	0.40006441
May	0.224	0.18741857	0.26058143
Jun	0.218	0.18177258	0.25422742
Jul	0.180	0.14629076	0.21370924
Aug	0.090	0.06488994	0.11511006
Sep	0.082	0.05792679	0.10607321
Oct	0.080	0.05619629	0.10380371
Nov	0.076	0.05274863	0.09925137
Dec	0.042	0.02439998	0.05960002

A tibble: 19 × 1
income
<dbl>
2759.33
4017.90
3516.88
3840.73
1002.50
2901.27
1880.48
2519.27
3451.08
2407.19
1444.54
1974.48
3672.22
2670.59
4152.78
1411.46
2739.57
4939.83
4572.20

A tibble: 2 × 1
income
<dbl>
1186.53
4774.40

A tibble: 19 × 1
age
<dbl>
43.32
44.31
49.68
45.21
45.39
50.15
46.38
41.20
42.94
43.66
48.67
46.08
46.20
45.33
43.33
50.36
46.49
39.10
47.10

A data.frame: 12 × 8
Dhaka	Khulna	Barishal	Chittagong	Rangpur	Rajshahi	Sylhet	Cumilla
<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
1.00	0.33	0.62	0.00	0.00	0.57	0.11	0.22
0.00	0.44	0.38	1.00	0.50	1.00	0.67	0.89
0.50	0.44	0.88	0.89	0.38	0.29	0.11	1.00
1.00	0.00	0.25	0.22	0.50	1.00	1.00	0.56
0.38	0.33	1.00	0.89	0.62	0.43	0.89	0.78
1.00	0.67	0.75	0.56	1.00	0.57	0.22	1.00
0.62	0.78	0.00	0.11	0.50	0.43	0.00	0.22
0.25	0.78	0.50	0.67	0.50	0.71	0.67	0.00
0.38	0.78	0.88	0.89	0.62	0.14	0.67	0.56
0.00	0.44	0.50	0.00	0.88	0.57	0.67	0.22
0.62	0.11	1.00	1.00	0.62	0.00	0.11	1.00
0.00	1.00	1.00	0.78	1.00	0.29	0.67	0.78