Create a nested data structure named
company_data containing:
101, 102, 103, 104departments = c("HR", "Tech", "Sales")active_projects = 545, 67, 89, 34, 56, 78, 23, 45, 67, 89, 12, 34EmployeeSalaryYears_Servicecompany_data <- list(
employee_ids = c(101, 102, 103, 104),
company_info = list(
departments = c("HR", "Tech", "Sales"),
active_projects = 5
),
quarterly_sales = matrix(
c(45, 67, 89, 34, 56, 78, 23, 45, 67, 89, 12, 34),
nrow = 3,
ncol = 4,
byrow = TRUE,
dimnames = list(
c("Product_A", "Product_B", "Product_C"),
c("Q1", "Q2", "Q3", "Q4")
)
),
employees = data.frame(
Employee = c("Alice", "Bob", "Charlie"),
Salary = c(75000, 82000, 68000),
Years_Service = c(5, 3, 7)
)
)
company_data
$employee_ids
[1] 101 102 103 104
$company_info
$company_info$departments
[1] "HR" "Tech" "Sales"
$company_info$active_projects
[1] 5
$quarterly_sales
Q1 Q2 Q3 Q4
Product_A 45 67 89 34
Product_B 56 78 23 45
Product_C 67 89 12 34
$employees
NA
Given vectors:
x = c(12, NA, 25, 18, NA, 32, 45, NA)y = c(5, 8, 12, 6, 9, NA, 15, 7)Write code that:
Removes NA values from both
vectors
Creates a new vector z
Calculates the weighted mean of
z
w = c(0.1, 0.2, 0.15, 0.25, 0.3)Returns the positions
z > 40# Given vectors
x <- c(12, NA, 25, 18, NA, 32, 45, NA)
y <- c(5, 8, 12, 6, 9, NA, 15, 7)
# 1. Remove NA values (keep only complete paired observations)
complete_cases <- complete.cases(x, y) #TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE
x_clean <- x[complete_cases]
y_clean <- y[complete_cases]
# 2. Create vector z: z_i = (x_i * 2) + (y_i / 2)
z <- (x_clean * 2) + (y_clean / 2)
# Results after cleaning (for reference)
# x_clean: 12, 25, 18, 45
# y_clean: 5, 12, 6, 15
# z: 26.5, 56, 39, 97.5 (length = 4)
# 3. Calculate weighted mean of z
# Note: There are only 4 valid observations after NA removal,
# but 5 weights are provided. To resolve this while keeping the spirit
# of the question, use the first 4 weights (a common practical approach
# when the number of observations varies).
w <- c(0.1, 0.2, 0.15, 0.25, 0.3) # full weights
weighted_mean_z <- weighted.mean(z, w = w[1:length(z)]) # use first 4: 0.1, 0.2, 0.15, 0.25
# Alternative (if equal weights are acceptable):
# weighted_mean_z <- mean(z)
# 4. Find positions in z where z > 40 (positions in the cleaned vector z)
positions <- which(z > 40)
# Output results
z
[1] 26.5 56.0 39.0 97.5
# [1] 26.5 56.0 39.0 97.5
weighted_mean_z
[1] 62.96429
# [1] 59.225 (calculated as (26.5*0.1) + (56*0.2) + (39*0.15) + (97.5*0.25))
positions
[1] 2 4
# [1] 2 4
dplyr PipelineUsing the starwars dataset from
dplyr:
Filter characters with mass between 50 and 100 kg
Create a new column bmi_category
based on:
mass / height^2 × 10000 > 25 →
“Overweight”Group the data by
speciesbmi_categoryCalculate for each group
Arrange the result
speciesKeep only species
library(dplyr)
starwars_analysis <- starwars %>%
filter(mass >= 50 & mass <= 100, !is.na(height), !is.na(mass)) %>%
mutate(
bmi = mass / (height/100)^2,
bmi_category = case_when(
bmi > 25 ~ "Overweight",
bmi >= 18.5 & bmi <= 25 ~ "Normal",
TRUE ~ "Underweight"
)
) %>%
group_by(species, bmi_category) %>%
summarise(
count = n(),
avg_mass = mean(mass, na.rm = TRUE),
avg_height = mean(height, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(species, desc(count)) %>%
group_by(species) %>%
filter(sum(count) >= 2) %>%
ungroup()
starwars_analysis
Task: Create a function
distribution_analyzer that performs comprehensive
statistical analysis.
moments
package or calculate manually)NA) appropriately.distribution_analyzer <- function(x, weights = NULL, na.rm = TRUE) {
# Error handling
if(!is.numeric(x)) stop("Input must be numeric")
if(na.rm) {
x <- x[!is.na(x)]
if(!is.null(weights)) weights <- weights[!is.na(x)]
}
# Custom mode function
compute_mode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
# Basic statistics
results <- list()
results$mean <- if(is.null(weights)) mean(x) else weighted.mean(x, weights)
results$median <- median(x)
results$mode <- compute_mode(x)
results$sd <- if(is.null(weights)) sd(x) else sqrt(Hmisc::wtd.var(x, weights))
results$var <- results$sd^2
# Higher moments (manual calculation)
n <- length(x)
centered <- x - results$mean
results$skewness <- (sum(centered^3)/n) / (results$sd^3)
results$kurtosis <- (sum(centered^4)/n) / (results$sd^4)
# Confidence interval
se <- results$sd / sqrt(n)
t_critical <- qt(0.975, df = n-1)
results$ci_lower <- results$mean - t_critical * se
results$ci_upper <- results$mean + t_critical * se
# Normality test
results$shapiro_test <- shapiro.test(x)
return(results)
}
# Sample data
x <- c(10, 20, 30, 20, 10)
weights <- c(1, 2, 3, 2, 1)
# Run the function with weights
result_weighted <- distribution_analyzer(x, weights = weights)
# View results
result_weighted
$mean
[1] 21.11111
$median
[1] 20
$mode
[1] 10
$sd
[1] 7.81736
$var
[1] 61.11111
$skewness
[1] -0.8556743
$kurtosis
[1] 1.966983
$ci_lower
[1] 11.40458
$ci_upper
[1] 30.81765
$shapiro_test
Shapiro-Wilk normality test
data: x
W = 0.88104, p-value = 0.314
Simulate a casino game:
simulate_dice_game <- function(n_sims = 10000, bet = 5, win_amount = 10) {
set.seed(123)
results <- replicate(n_sims, {
s <- sum(sample(1:6, 2, replace = TRUE))
if (s %in% c(7, 11)) win_amount else -bet
})
# Estimates
EV <- mean(results)
p_win <- mean(results > 0)
# 95% CI (Normal / CLT)
se <- sd(results) / sqrt(n_sims)
ci <- EV + c(-1, 1) * qnorm(0.975) * se
list(
expected_value = EV,
win_probability = p_win,
confidence_interval = ci
)
}
# Run simulation
game_results <- simulate_dice_game(
n_sims = 10000,
bet = 5,
win_amount = 10
)
game_results
$expected_value
[1] -1.7645
$win_probability
[1] 0.2157
$confidence_interval
[1] -1.885428 -1.643572
Create a comprehensive diagnostic plot for a linear model with the following panels:
model <- lm(mpg ~ wt + hp + qsec, data = mtcars)
Requirements:
library(ggplot2)
library(gridExtra)
# Fit model
model <- lm(mpg ~ wt + hp + qsec, data = mtcars)
# Prepare data
diagnostic_data <- data.frame(
fitted = fitted(model),
residuals = residuals(model),
sqrt_abs_resid = sqrt(abs(residuals(model))),
leverage = hatvalues(model),
cooks_d = cooks.distance(model)
)
# 1. Residuals vs Fitted
p1 <- ggplot(diagnostic_data, aes(x = fitted, y = residuals)) +
geom_point(alpha = 0.7) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
geom_smooth(method = "loess", se = FALSE, color = "blue") +
labs(title = "Residuals vs Fitted", x = "Fitted Values", y = "Residuals")
# 2. Q-Q Plot
p2 <- ggplot(diagnostic_data, aes(sample = residuals)) +
stat_qq() + stat_qq_line(color = "red") +
labs(title = "Normal Q-Q", x = "Theoretical Quantiles", y = "Sample Quantiles")
# 3. Scale-Location
p3 <- ggplot(diagnostic_data, aes(x = fitted, y = sqrt_abs_resid)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "loess", se = FALSE, color = "blue") +
labs(title = "Scale-Location", x = "Fitted Values", y = "√|Standardized Residuals|")
# 4. Residuals vs Leverage
p4 <- ggplot(diagnostic_data, aes(x = leverage, y = residuals)) +
geom_point(aes(size = cooks_d, color = cooks_d > 0.5), alpha = 0.7) +
scale_color_manual(values = c("black", "red")) +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(title = "Residuals vs Leverage", x = "Leverage", y = "Residuals") +
guides(color = "none")
# Arrange plots
grid.arrange(p1, p2, p3, p4, ncol = 2)
Time: 50 minutes
Tasks:
Implement a k-NN function from scratch (without using
caret or similar packages) that can handle both:
Implement k-fold cross-validation to select the
optimal value of k.
Compute performance metrics:
Create a visualization of k versus
performance to identify the best k.
library(class)
library(dplyr)
custom_knn <- function(train_data, test_data, train_labels, k, type = "classification") {
predictions <- knn(
train = as.matrix(train_data),
test = as.matrix(test_data),
cl = train_labels,
k = k
)
if(type == "regression") {
# For regression, convert factor predictions back to numeric
predictions <- as.numeric(as.character(predictions))
}
return(predictions)
}
kfold_cv_knn <- function(data, target, k_values = 1:20, n_folds = 5, type = "classification") {
# Create folds
n <- nrow(data)
folds <- sample(rep(1:n_folds, length.out = n))
results <- data.frame()
for(k in k_values) {
fold_metrics <- numeric(n_folds)
for(fold in 1:n_folds) {
# Split data
train_idx <- folds != fold
test_idx <- folds == fold
# Make prediction
preds <- custom_knn(
data[train_idx, ],
data[test_idx, ],
target[train_idx],
k = k,
type = type
)
# Calculate performance
if(type == "classification") {
# Accuracy
fold_metrics[fold] <- sum(preds == target[test_idx]) / length(preds)
} else {
# RMSE for regression
fold_metrics[fold] <- sqrt(mean((preds - target[test_idx])^2))
}
}
results <- rbind(results, data.frame(
k = k,
mean_performance = mean(fold_metrics),
sd_performance = sd(fold_metrics)
))
}
# Find best k
if(type == "classification") {
best_k <- results$k[which.max(results$mean_performance)]
} else {
best_k <- results$k[which.min(results$mean_performance)]
}
return(list(
cv_results = results,
best_k = best_k
))
}
# Example usage
data(iris)
iris_features <- iris[, 1:4]
iris_species <- iris$Species
cv_results <- kfold_cv_knn(iris_features, iris_species, k_values = 1:15, type = "classification")
# Plot results
library(ggplot2)
ggplot(cv_results$cv_results, aes(x = k, y = mean_performance)) +
geom_line() +
geom_ribbon(aes(ymin = mean_performance - sd_performance,
ymax = mean_performance + sd_performance),
alpha = 0.2) +
geom_vline(xintercept = cv_results$best_k, linetype = "dashed", color = "red") +
labs(title = "k-NN Cross-Validation Results", x = "k", y = "Accuracy") +
theme_minimal()
library(class)
library(dplyr)
library(ggplot2)
library(MASS)
data(Boston)
boston_features <- Boston[, -14]
boston_target <- Boston$medv
boston_features_scaled <- as.data.frame(scale(boston_features))
cv_results_reg <- kfold_cv_knn(
data = boston_features_scaled,
target = boston_target,
k_values = 1:20,
n_folds = 5,
type = "regression"
)
ggplot(cv_results_reg$cv_results, aes(x = k, y = mean_performance)) +
geom_line(color = "blue") +
geom_ribbon(aes(ymin = mean_performance - sd_performance,
ymax = mean_performance + sd_performance),
alpha = 0.2, fill = "blue") +
geom_vline(xintercept = cv_results_reg$best_k, linetype = "dashed", color = "red") +
labs(title = "k-NN CV Results (Boston Housing Regression)",
x = "k",
y = "Mean RMSE",
subtitle = paste("Best k =", cv_results_reg$best_k)) +
theme_minimal()
library(class)
library(dplyr)
library(ggplot2)
data(mtcars)
mtcars_features <- mtcars[, c("mpg", "disp", "hp", "wt", "qsec")]
mtcars_target <- as.factor(mtcars$cyl)
mtcars_features_scaled <- as.data.frame(scale(mtcars_features))
cv_results_class <- kfold_cv_knn(
data = mtcars_features_scaled,
target = mtcars_target,
k_values = 1:10,
n_folds = 5,
type = "classification"
)
ggplot(cv_results_class$cv_results, aes(x = k, y = mean_performance)) +
geom_line(color = "darkgreen") +
geom_ribbon(aes(ymin = mean_performance - sd_performance,
ymax = mean_performance + sd_performance),
alpha = 0.2, fill = "green") +
geom_vline(xintercept = cv_results_class$best_k, linetype = "dashed", color = "red") +
labs(title = "k-NN CV Results (mtcars Cylinder Classification)",
x = "k",
y = "Mean Accuracy",
subtitle = paste("Best k =", cv_results_class$best_k)) +
theme_minimal()
Time: 60 minutes
Dataset: diamonds from
ggplot2
price using the IQR
method.price_per_carat = price ÷ caratvolume = x × y × zdepth_category (categorize depth)cut,
color, clarity using
faceting.carat,
price, and depth (use plotly if
available).log(price).ranger or
randomForest package).library(tidyverse)
library(caret)
library(ranger)
# 1. Data Preparation
diamonds_clean <- diamonds %>%
mutate(
price_per_carat = price / carat,
volume = x * y * z,
depth_category = cut(depth, breaks = 5, labels = c("Very Shallow", "Shallow", "Medium", "Deep", "Very Deep"))
) %>%
filter(price <= quantile(price, 0.99) & price >= quantile(price, 0.01))
# 2. EDA - Correlation heatmap
library(corrplot)
numeric_cols <- diamonds_clean %>%
select(where(is.numeric)) %>%
select(-x, -y, -z) # Remove dimension columns
cor_matrix <- cor(numeric_cols)
corrplot(cor_matrix, method = "color", type = "upper")
# 3. Modeling setup
set.seed(123)
diamonds_clean$log_price <- log(diamonds_clean$price)
# Train-test split
train_idx <- createDataPartition(diamonds_clean$log_price, p = 0.7, list = FALSE)
train_data <- diamonds_clean[train_idx, ]
test_data <- diamonds_clean[-train_idx, ]
# Linear model
lm_model <- lm(log_price ~ carat + cut + color + clarity + depth + table,
data = train_data)
# Random Forest
rf_model <- ranger(
log_price ~ carat + cut + color + clarity + depth + table,
data = train_data,
num.trees = 500,
mtry = 3,
importance = "impurity"
)
# Cross-validation comparison
train_control <- trainControl(method = "cv", number = 5)
# Linear model CV
lm_cv <- train(
log_price ~ carat + cut + color + clarity + depth + table,
data = train_data,
method = "lm",
trControl = train_control
)
# Random Forest CV
rf_cv <- train(
log_price ~ carat + cut + color + clarity + depth + table,
data = train_data,
method = "ranger",
trControl = train_control,
tuneGrid = expand.grid(
mtry = c(2, 3, 4),
splitrule = "variance",
min.node.size = 5
)
)
Growing trees.. Progress: 65%. Estimated remaining time: 16 seconds.
# Compare models
results <- resamples(list(
Linear = lm_cv,
RandomForest = rf_cv
))
summary(results)
Call:
summary.resamples(object = results)
Models: Linear, RandomForest
Number of resamples: 5
MAE
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
Linear 0.2550032 0.2556220 0.2559445 0.2565396 0.2577014 0.2584267 0
RandomForest 0.1407062 0.1423713 0.1483277 0.1460058 0.1485131 0.1501109 0
RMSE
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
Linear 0.3192101 0.3220438 0.3251787 0.3262524 0.3262542 0.3385752 0
RandomForest 0.1847776 0.1887349 0.1947069 0.1929534 0.1974694 0.1990780 0
Rsquared
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
Linear 0.8829245 0.8917176 0.8919805 0.8909221 0.8921635 0.8958243 0
RandomForest 0.9725903 0.9730766 0.9733414 0.9738127 0.9742229 0.9758326 0
# 4. Production outputs
# Save models
saveRDS(lm_model, "diamonds_lm_model.rds")
saveRDS(rf_model, "diamonds_rf_model.rds")
# Prediction function
predict_diamond_price <- function(new_data, model_type = "rf") {
if(model_type == "rf") {
model <- readRDS("diamonds_rf_model.rds")
pred_log <- predict(model, new_data)$predictions
} else {
model <- readRDS("diamonds_lm_model.rds")
pred_log <- predict(model, new_data)
}
# Convert back from log scale
price_pred <- exp(pred_log)
return(data.frame(
predicted_price = price_pred,
predicted_log_price = pred_log
))
}