# 1. Lesson Overview:
# This lesson plan introduces students to the basics of Exploratory Data
# Analysis (EDA) and Statistical Inference using R. The goal is to equip
# students with the skills to summarise, visualise, and draw inferences from
# data.
# 2. Learning Objectives:
# - Understand the principles of EDA and its importance in data analysis.
# - Use R to import, clean, and explore datasets.
# - Create basic data visualisations (e.g., histograms, boxplots, scatter
#   plots).
# - Grasp the basics of statistical inference, including confidence intervals
#   and hypothesis testing.
#
# 3. Lesson Structure:
# Introduction (15 minutes): Overview of EDA and its role in understanding
# datasets. Introduction to statistical inference concepts such as confidence
# intervals and hypothesis testing.
# Activity 1: Data Import and Cleaning ----
# Code snippets and detailed steps.

library(dplyr)

# Import the built-in `mtcars` dataset into a working copy.
# (This assignment used to sit on the heading's comment line, so it never ran.)
data <- mtcars

# Inspect the structure (column types) and the per-column summary statistics.
str(data)
summary(data)

# Drop every row that contains at least one missing value.
clean_data <- data %>%
  na.omit()

# Keep only the cars whose fuel economy exceeds 20 miles per gallon.
filtered_data <- clean_data %>%
  filter(mpg > 20)

# Derive a new variable: horsepower per cylinder.
modified_data <- filtered_data %>%
  mutate(hp_per_cyl = hp / cyl)
# Discussion:
# - What do the structure and summary outputs tell us about this dataset?
# - Why is it important to remove NA values before analysis?
# - What other types of data cleaning might be needed in real-world datasets?
# - Why might we want to create a new variable like hp_per_cyl? How could this
#   help in analysis?
# Activity 2: Exploratory Data Analysis ----
# Code snippets and detailed steps.

library(ggplot2)

# Preview the first rows and recap the per-column summary statistics.
head(clean_data)
summary(clean_data)

# Histogram of fuel economy, using bins that are 2 mpg wide.
ggplot(clean_data, aes(x = mpg)) +
  geom_histogram(binwidth = 2, fill = "blue", color = "black") +
  theme_minimal()

# Boxplot of horsepower.
ggplot(clean_data, aes(y = hp)) +
  geom_boxplot(fill = "lightgreen", color = "darkgreen") +
  theme_minimal()

# Scatter plot of weight against fuel economy.
ggplot(clean_data, aes(x = wt, y = mpg)) +
  geom_point(color = "red") +
  theme_minimal() +
  labs(title = "Scatter Plot of Weight vs. MPG")
# Activity 3: Descriptive Analysis ----
# Goal: Calculate summary statistics to understand distributions, central
# tendency, and variability.

# Descriptive statistics for every column.
# (library(psych) used to sit on the heading's comment line, so describe()
# was never made available.)
library(psych)
describe(clean_data)

# Fuel-economy summary statistics for each cylinder count.
clean_data %>%
  group_by(cyl) %>%
  summarise(
    count = n(),
    mean_mpg = mean(mpg),
    sd_mpg = sd(mpg),
    median_mpg = median(mpg),
    min_mpg = min(mpg),
    max_mpg = max(mpg)
  )

# Visual for the descriptive statistics: mean mpg per cylinder count.
# geom_col() draws bars whose heights are the supplied y values; it is the
# direct equivalent of geom_bar(stat = "identity").
clean_data %>%
  group_by(cyl) %>%
  summarise(mean_mpg = mean(mpg)) %>%
  ggplot(aes(x = factor(cyl), y = mean_mpg)) +
  geom_col(fill = "skyblue") +
  labs(
    x = "Number of Cylinders",
    y = "Mean MPG",
    title = "Mean MPG by Cylinders"
  ) +
  theme_minimal()
# Activity 4: Diagnostic Analysis ----
# Goal: Identify relationships and potential causes behind trends or
# anomalies.

# Correlation matrix of all columns, rounded to two decimals.
# (This computation used to sit on the heading's comment line, so cor_matrix
# was never created before corrplot() tried to use it.)
cor_matrix <- round(cor(clean_data), 2)
print(cor_matrix)

# Visualise the upper triangle of the correlation matrix as circles.
library(corrplot)
corrplot(cor_matrix, method = "circle", type = "upper", tl.cex = 0.8)

# Horsepower vs. fuel economy with a fitted straight line and its
# confidence band.
ggplot(clean_data, aes(x = hp, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  theme_minimal() +
  labs(title = "HP vs MPG with Linear Fit")

# Fuel economy distribution for each gear count.
ggplot(clean_data, aes(x = factor(gear), y = mpg)) +
  geom_boxplot(fill = "orange") +
  labs(x = "Number of Gears", y = "MPG", title = "MPG by Gear Count") +
  theme_minimal()
# Introduce Inference Concepts (if time allows) ----

# Hypothesis test: is the mean mpg greater than 20?
# One-sample, one-sided t-test against mu = 20.
t.test(clean_data$mpg, mu = 20, alternative = "greater")

# Confidence interval for the mean mpg, extracted from the default
# two-sided one-sample t-test.
t.test(clean_data$mpg)$conf.int
# Activity 3: Introduction to Statistical Inference ----
# Code snippets and detailed steps.
# Concept: sample vs. population explained.

# Example of a confidence interval calculation for `mpg`: the one-sample
# t-test output includes the 95% confidence interval for the mean.
mean_conf_interval <- t.test(clean_data$mpg, conf.level = 0.95)
print(mean_conf_interval)

# Two-sample t-test: compare mean mpg between the two levels of `am`.
t_test_result <- t.test(mpg ~ am, data = clean_data)
print(t_test_result)
# 1. Lesson Overview:
# This lesson plan introduces students to the basics of Exploratory Data
# Analysis (EDA) and Statistical Inference using R. The goal is to equip
# students with the skills to summarise, visualise, and draw inferences from
# data.
#
# 2. Learning Objectives:
# - Understand the principles of EDA and its importance in data analysis.
# - Use R to import, clean, and explore datasets.
# - Create basic data visualisations (e.g., histograms, boxplots, scatter
#   plots).
# - Perform correlation analysis and linear regression to identify
#   relationships.
# - Grasp the basics of statistical inference, including confidence intervals
#   and hypothesis testing.
# Activity 4: Data Import and Cleaning ----
# Code snippets and detailed steps.

library(dplyr)

# Work on a copy of the built-in `mtcars` dataset.
data <- mtcars

# Show column types, then per-column summary statistics.
str(data)
summary(data)

# Remove rows that contain any missing value.
clean_data <- data %>%
  na.omit()

# Restrict to cars doing better than 20 miles per gallon.
filtered_data <- clean_data %>%
  filter(mpg > 20)

# Add horsepower per cylinder as a derived column.
modified_data <- filtered_data %>%
  mutate(hp_per_cyl = hp / cyl)
# Activity 5: Exploratory Data Analysis ----
# Code snippets and detailed steps.

library(ggplot2)

# First rows of the cleaned data, followed by its summary statistics.
head(clean_data)
summary(clean_data)

# Distribution of fuel economy (2-mpg bins).
ggplot(clean_data, aes(x = mpg)) +
  geom_histogram(binwidth = 2, fill = "blue", color = "black") +
  theme_minimal()

# Spread of horsepower.
ggplot(clean_data, aes(y = hp)) +
  geom_boxplot(fill = "lightgreen", color = "darkgreen") +
  theme_minimal()

# Relationship between weight and fuel economy.
ggplot(clean_data, aes(x = wt, y = mpg)) +
  geom_point(color = "red") +
  theme_minimal() +
  labs(title = "Scatter Plot of Weight vs. MPG")
# Activity 6: Correlation Analysis and Linear Regression ----
# Code snippets and detailed steps.

# Correlation analysis on a subset of columns.
# (This computation used to sit on the heading's comment line, so cor_matrix
# was never created before corrplot() tried to use it.)
cor_matrix <- cor(clean_data[, c("mpg", "wt", "hp", "cyl")])
print(cor_matrix)

# Visualise the correlation matrix as circles.
library(corrplot)
corrplot(cor_matrix, method = "circle")
# Categorical Correlations ----

# Using the `mtcars` dataset with a factor variable: recode transmission
# (`am`) from 0/1 into labelled levels. Assigning into `mtcars` creates a
# modified copy in the workspace; `clean_data` is not affected.
mtcars$am <- factor(mtcars$am, labels = c("Automatic", "Manual"))

# Contingency table of transmission type by cylinder count, followed by a
# chi-squared test of independence. R may warn that the chi-squared
# approximation is inaccurate when expected cell counts are small.
table_am_cyl <- table(mtcars$am, mtcars$cyl)
chi_square_result <- chisq.test(table_am_cyl)
print(chi_square_result)

# Cramer's V: strength of association between the two categorical variables.
library(DescTools)
cramers_v <- CramerV(table_am_cyl)
print(cramers_v)
# Linear regression: model fuel economy as a function of weight and
# horsepower, then inspect coefficients and fit statistics.
linear_model <- lm(mpg ~ wt + hp, data = clean_data)
summary(linear_model)

# Scatter plot of weight vs. mpg with a fitted straight line. Note that this
# line is the simple regression of mpg on weight alone, not the two-predictor
# model fitted above.
ggplot(clean_data, aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Linear Regression of MPG on Weight")
# Activity 7: Introduction to Statistical Inference ----
# Code snippets and detailed steps.
# Concept: sample vs. population explained.

# Example of a confidence interval calculation for `mpg`: a one-sample t-test
# of the mean against mu = 20; its output includes the confidence interval.
# (This test used to sit on the heading's comment line, so it never ran.)
t_test_result_one_sample <- t.test(clean_data$mpg, mu = 20)
print(t_test_result_one_sample)

# Two-sample t-test: compare mean mpg between the two levels of `am`.
t_test_result_two_sample <- t.test(mpg ~ am, data = clean_data)
print(t_test_result_two_sample)