# 1. Lesson Overview:
# This lesson plan introduces students to the basics of Exploratory Data Analysis (EDA) and Statistical Inference using R. The goal is to equip students with the skills to summarise, visualise, and draw inferences from data.

# 2. Learning Objectives:
# - Understand the principles of EDA and its importance in data analysis.
# - Use R to import, clean, and explore datasets.
# - Create basic data visualisations (e.g., histograms, boxplots, scatter plots).
# - Grasp the basics of statistical inference, including confidence intervals and hypothesis testing.
#
# 3. Lesson Structure:
# Introduction (15 minutes): Overview of EDA and its role in understanding datasets. Introduction to statistical inference concepts such as confidence intervals and hypothesis testing.

# Activity 1: Data Import and Cleaning ----
# Code Snippets and Detailed Steps:

# dplyr supplies the pipe (%>%), filter() and mutate() used below.
library(dplyr)

# Importing the 'mtcars' dataset
data <- mtcars

# Checking the structure and summary
str(data)
summary(data)

# Cleaning data: Removing NA values (if any)
clean_data <- data %>%
  na.omit()

# Filtering data (example: filter rows where mpg > 20)
filtered_data <- clean_data %>%
  filter(mpg > 20)

# Adding a new column (example: mutate to create a calculated column)
modified_data <- filtered_data %>%
  mutate(hp_per_cyl = hp / cyl)

# Discussion:
# What does the structure and summary output tell us about this dataset?

#Why is it important to remove NA values before analysis?

#What other types of data cleaning might be needed in real-world datasets?

#Why might we want to create a new variable like hp_per_cyl? How could this help in analysis?

# Activity 2: Exploratory Data Analysis ----
# Code Snippets and Detailed Steps:

library(ggplot2)

# Basic exploratory commands
head(clean_data)
summary(clean_data)

# Visualisations using ggplot2

# Histogram of 'mpg'
ggplot(clean_data, aes(x = mpg)) +
  geom_histogram(binwidth = 2, fill = "blue", color = "black") +
  theme_minimal()

# Boxplot of 'hp'
ggplot(clean_data, aes(y = hp)) +
  geom_boxplot(fill = "lightgreen", color = "darkgreen") +
  theme_minimal()

# Scatter plot of 'wt' vs 'mpg'
ggplot(clean_data, aes(x = wt, y = mpg)) +
  geom_point(color = "red") +
  theme_minimal() +
  labs(title = "Scatter Plot of Weight vs. MPG")

# Activity 3: Descriptive Analysis ----
# Goal: Calculate summary statistics to understand distributions, central
# tendency, and variability.

# Descriptive Statistics
library(psych)

# Descriptive statistics for all numeric variables
describe(clean_data)

# Grouped descriptive statistics: Mean mpg by number of cylinders
clean_data %>%
  group_by(cyl) %>%
  summarise(
    count = n(),
    mean_mpg = mean(mpg),
    sd_mpg = sd(mpg),
    median_mpg = median(mpg),
    min_mpg = min(mpg),
    max_mpg = max(mpg)
  )

# Visual for descriptive stats:

# Bar chart of mean mpg by cylinder
# cyl is wrapped in factor() so the x-axis is treated as discrete categories.
clean_data %>%
  group_by(cyl) %>%
  summarise(mean_mpg = mean(mpg)) %>%
  ggplot(aes(x = factor(cyl), y = mean_mpg)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(
    x = "Number of Cylinders",
    y = "Mean MPG",
    title = "Mean MPG by Cylinders"
  ) +
  theme_minimal()

# Activity 4: Diagnostic Analysis ----
# Goal: Identify relationships and potential causes behind trends or anomalies.

library(corrplot)

# Correlation matrix (rounded to 2 decimal places for readability)
cor_matrix <- round(cor(clean_data), 2)
print(cor_matrix)

# Visualising correlations
corrplot(cor_matrix, method = "circle", type = "upper", tl.cex = 0.8)

# Checking relationship between 'hp' and 'mpg'
ggplot(clean_data, aes(x = hp, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  theme_minimal() +
  labs(title = "HP vs MPG with Linear Fit")

# Diagnostic: Compare MPG by number of gears
ggplot(clean_data, aes(x = factor(gear), y = mpg)) +
  geom_boxplot(fill = "orange") +
  labs(x = "Number of Gears", y = "MPG", title = "MPG by Gear Count") +
  theme_minimal()

# Introduce Inference Concepts (if time allows) ----

# Hypothesis Test: Is the mean mpg > 20?
# One-sided, one-sample t-test against mu = 20.
t.test(clean_data$mpg, mu = 20, alternative = "greater")

# Confidence interval for mean mpg
# Extracts only the conf.int element from the t-test result.
t.test(clean_data$mpg)$conf.int

# Activity 3: Introduction to Statistical Inference ----
# Code Snippets and Detailed Steps:

# Concept: Sample vs. Population explained

# Example of confidence interval calculation for 'mpg'
# The object stores the full t-test result; the interval is part of the
# printed output.
mean_conf_interval <- t.test(clean_data$mpg, conf.level = 0.95)
print(mean_conf_interval)

# Hypothesis testing: t-test to compare means of two groups
# (e.g., 'am' as automatic vs. manual)
t_test_result <- t.test(mpg ~ am, data = clean_data)
print(t_test_result)

# Lesson Overview:
# This lesson plan introduces students to the basics of Exploratory Data Analysis (EDA) and Statistical Inference using R. The goal is to equip students with the skills to summarise, visualise, and draw inferences from data.
#
# 2. Learning Objectives:
# - Understand the principles of EDA and its importance in data analysis.
# - Use R to import, clean, and explore datasets.
# - Create basic data visualisations (e.g., histograms, boxplots, scatter plots).
# - Perform correlation analysis and linear regression to identify relationships.
# - Grasp the basics of statistical inference, including confidence intervals and hypothesis testing.

# Activity 4: Data Import and Cleaning ----
# Code Snippets and Detailed Steps:

# dplyr supplies the pipe (%>%), filter() and mutate() used below.
library(dplyr)

# Importing the 'mtcars' dataset
data <- mtcars

# Checking the structure and summary
str(data)
summary(data)

# Cleaning data: Removing NA values (if any)
clean_data <- data %>%
  na.omit()

# Filtering data (example: filter rows where mpg > 20)
filtered_data <- clean_data %>%
  filter(mpg > 20)

# Adding a new column (example: mutate to create a calculated column)
modified_data <- filtered_data %>%
  mutate(hp_per_cyl = hp / cyl)

# Activity 5: Exploratory Data Analysis ----
# Code Snippets and Detailed Steps:

library(ggplot2)

# Basic exploratory commands
head(clean_data)
summary(clean_data)

# Visualisations using ggplot2

# Histogram of 'mpg'
ggplot(clean_data, aes(x = mpg)) +
  geom_histogram(binwidth = 2, fill = "blue", color = "black") +
  theme_minimal()

# Boxplot of 'hp'
ggplot(clean_data, aes(y = hp)) +
  geom_boxplot(fill = "lightgreen", color = "darkgreen") +
  theme_minimal()

# Scatter plot of 'wt' vs 'mpg'
ggplot(clean_data, aes(x = wt, y = mpg)) +
  geom_point(color = "red") +
  theme_minimal() +
  labs(title = "Scatter Plot of Weight vs. MPG")

# Activity 6: Correlation Analysis and Linear Regression ----
# Code Snippets and Detailed Steps:

library(corrplot)

# Correlation analysis (restricted to four variables of interest)
cor_matrix <- cor(clean_data[, c("mpg", "wt", "hp", "cyl")])
print(cor_matrix)

# Visualising correlation
corrplot(cor_matrix, method = "circle")

# Categorical Correlations ----

library(DescTools)

# Using the 'mtcars' dataset with a factor variable
# NOTE: assigning to mtcars$am creates a modified copy of mtcars in the
# current environment; the built-in dataset itself is not changed.
mtcars$am <- factor(mtcars$am, labels = c("Automatic", "Manual"))

# Chi-Square test to check for association between 'am' (transmission) and
# 'cyl' (number of cylinders)
table_am_cyl <- table(mtcars$am, mtcars$cyl)
chi_square_result <- chisq.test(table_am_cyl)
# Print the result so the p-value discussed below is visible. chisq.test()
# may warn that the approximation is unreliable when expected cell counts
# are small, as they are in this 32-row dataset.
print(chi_square_result)

# Interpretation: A significant p-value indicates that the variables 'am' and
# 'cyl' are not independent.

# Calculating Cramér's V for association strength
cramers_v <- CramerV(table_am_cyl)
print(cramers_v)

# Linear regression model
linear_model <- lm(mpg ~ wt + hp, data = clean_data)
summary(linear_model)

# Visualising regression line
# NOTE: geom_smooth(method = "lm") fits mpg on wt only (the plotted x and y);
# it does not draw the two-predictor model fitted above.
ggplot(clean_data, aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Linear Regression of MPG on Weight")

# Activity 7: Introduction to Statistical Inference ----
# Code Snippets and Detailed Steps:

# Concept: Sample vs. Population explained

# Example of confidence interval calculation for 'mpg'
# One-sample t-test against mu = 20; the printed result includes the
# confidence interval for the mean.
t_test_result_one_sample <- t.test(clean_data$mpg, mu = 20)
print(t_test_result_one_sample)

# Hypothesis testing: t-test to compare means of two groups
# (e.g., 'am' as automatic vs. manual)
t_test_result_two_sample <- t.test(mpg ~ am, data = clean_data)
print(t_test_result_two_sample)