# AI Student Usage Analysis - Regression Modeling


library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(broom)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
# Read the cleaned data CSV. The same tibble is bound to two names:
# `ai_assistant_usage_student_life` and the shorter alias `ai_data`.
# NOTE(review): the path is absolute and machine-specific; it has to be
# adjusted before the script can run on another computer.
ai_assistant_usage_student_life <- read_csv(
  "C:/Users/User1/Desktop/Medium Article/ai_assistant_usage_student_life.csv"
)
ai_data <- ai_assistant_usage_student_life
## Rows: 10000 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): SessionID, StudentLevel, Discipline, TaskType, FinalOutcome
## dbl  (4): SessionLengthMin, TotalPrompts, AI_AssistanceLevel, SatisfactionRa...
## lgl  (1): UsedAgain
## date (1): SessionDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 1. Focus on a Specific Subgroup


# Restrict the analysis to one discipline / task-type combination:
# sessions in which Psychology students worked on Writing tasks.
writing_psych <- filter(
  ai_data,
  Discipline == "Psychology" & TaskType == "Writing"
)

# Report how many sessions remain in the subgroup
cat(
  "Sessions for Psychology Writing tasks:",
  nrow(writing_psych),
  "\n"
)
## Sessions for Psychology Writing tasks: 438
# 2. Visualize the Relationship


# Jittered scatter of satisfaction against AI assistance level for the
# Psychology/Writing subgroup, overlaid with a straight-line (lm) fit and
# its standard-error band
writing_psych %>%
  ggplot(mapping = aes(AI_AssistanceLevel, SatisfactionRating)) +
  geom_jitter(color = "green", width = 0.2, alpha = 0.3) +
  geom_smooth(color = "red", method = "lm", se = TRUE) +
  theme_minimal() +
  labs(
    x = "AI Assistance Level (1 = Low, 5 = High)",
    y = "Satisfaction Rating (1 = Low, 5 = High)",
    title = "In Psychology Writing Tasks: Does More AI Help Lead to Higher Satisfaction?",
    subtitle = "Each point represents one student session"
  )
## `geom_smooth()` using formula = 'y ~ x'

# 3. Build Linear Regression Model


# Simple linear regression: SatisfactionRating is the response and
# AI_AssistanceLevel the single predictor, fitted with lm() on the
# Psychology/Writing subset (`writing_psych`).
# `model` is reused further down for the metric extraction, the prediction
# plot, the diagnostic plots and the saved .rds file.
model <- lm(SatisfactionRating ~ AI_AssistanceLevel, data = writing_psych)

# Model summary (auto-printed: residuals, coefficient table, R-squared, F-test)
summary(model)
## 
## Call:
## lm(formula = SatisfactionRating ~ AI_AssistanceLevel, data = writing_psych)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3658 -0.4843  0.1157  0.5111  1.9157 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.33989    0.12704   2.675  0.00774 ** 
## AI_AssistanceLevel  0.88147    0.03571  24.683  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7256 on 436 degrees of freedom
## Multiple R-squared:  0.5829, Adjusted R-squared:  0.5819 
## F-statistic: 609.3 on 1 and 436 DF,  p-value: < 2.2e-16
# Pull the headline numbers out of the fitted model.
# The summary is computed once, and the coefficient table is addressed by
# row/column name instead of by position.
fit_summary <- summary(model)
r_squared <- fit_summary$r.squared
p_value <- fit_summary$coefficients["AI_AssistanceLevel", "Pr(>|t|)"]
intercept <- coef(model)["(Intercept)"]
slope <- coef(model)["AI_AssistanceLevel"]

# Print a short, human-readable report of the fit
cat(
  "========================================\n",
  "Regression Model Results\n",
  "========================================\n",
  sep = ""
)
## ========================================
## Regression Model Results
## ========================================
cat("R-squared:", round(r_squared, 3), "\n")
## R-squared: 0.583
cat("P-value:", format(p_value, scientific = TRUE), "\n")
## P-value: 8.241971e-85
cat(
  "Equation: Satisfaction =", round(intercept, 3), "+",
  round(slope, 3), "× AI Assistance\n"
)
## Equation: Satisfaction = 0.34 + 0.881 × AI Assistance
cat("========================================\n")
## ========================================
## ========================================
# 4. Visualize Model Predictions


# Overlay the fitted regression line on the observed sessions and annotate
# the plot with the model's R-squared and p-value.
# Relies on `model`, `r_squared` and `p_value` computed earlier in this script.
writing_psych %>%
  mutate(Predicted = predict(model, newdata = .)) %>%
  ggplot(aes(x = AI_AssistanceLevel)) +
  geom_jitter(aes(y = SatisfactionRating), alpha = 0.3, width = 0.2) +
  # `linewidth` replaces `size` for line geoms (`size` is deprecated for
  # lines since ggplot2 3.4.0 and triggered a warning here)
  geom_line(aes(y = Predicted), color = "red", linewidth = 1.2) +
  annotate("text", x = 4.2, y = 2.5,
           label = paste("R² =", round(r_squared, 3)),
           size = 5, color = "red") +
  # Derive the p-value label from the model instead of hard-coding it, so the
  # annotation cannot go stale if the data or subgroup changes
  annotate("text", x = 4.2, y = 2.2,
           label = if (isTRUE(p_value < 0.001)) {
             "p < 0.001"
           } else {
             paste("p =", signif(p_value, 2))
           },
           size = 4, color = "red") +
  labs(
    title = "Predicting Student Satisfaction from AI Assistance",
    subtitle = "For Psychology Students Working on Writing Tasks",
    x = "AI Assistance Level (1-5)",
    y = "Satisfaction Rating (1-5)"
  ) +
  theme_minimal()

# 5. Compare Different Subgroups


# Compare across disciplines
# Fit SatisfactionRating ~ AI_AssistanceLevel for one discipline/task subgroup.
#
# Arguments:
#   data       - data frame with Discipline, TaskType, SatisfactionRating and
#                AI_AssistanceLevel columns
#   discipline - value matched against the Discipline column
#   task       - value matched against the TaskType column
#
# Returns NULL when the subgroup has 10 rows or fewer. Otherwise returns
# c(discipline, task, <row count>, <R-squared>); because the pieces are
# combined with c(), the numeric entries are coerced to character when
# discipline/task are strings.
compare_models <- function(data, discipline, task) {
  group_rows <- filter(data, Discipline == discipline, TaskType == task)
  n_rows <- nrow(group_rows)

  # Too few observations: report "no result" to the caller
  if (n_rows <= 10) {
    return(NULL)
  }

  fit <- lm(SatisfactionRating ~ AI_AssistanceLevel, data = group_rows)
  c(discipline, task, n_rows, summary(fit)$r.squared)
}

# Compare Psychology Writing vs Computer Science Coding
psych_writing <- writing_psych
cs_coding <- ai_data %>%
  filter(Discipline == "Computer Science", TaskType == "Coding")

# Fit the same one-predictor regression in each subgroup so the slopes are
# directly comparable
model_psych <- lm(SatisfactionRating ~ AI_AssistanceLevel, data = psych_writing)
model_cs <- lm(SatisfactionRating ~ AI_AssistanceLevel, data = cs_coding)

# Choose the plot subtitle from the fitted slopes instead of hard-coding the
# conclusion: the original wording is kept when the data support it, and a
# neutral statement of the slopes is used when neither slope is clearly the
# larger positive one (isTRUE() also covers an NA slope).
slope_psych <- coef(model_psych)[["AI_AssistanceLevel"]]
slope_cs <- coef(model_cs)[["AI_AssistanceLevel"]]
comparison_subtitle <- if (isTRUE(slope_psych > slope_cs && slope_psych > 0)) {
  "Psychology students show stronger positive relationship than CS students"
} else if (isTRUE(slope_cs > slope_psych && slope_cs > 0)) {
  "CS students show stronger positive relationship than Psychology students"
} else {
  sprintf(
    "Estimated slopes: Psychology = %.2f, CS = %.2f",
    slope_psych, slope_cs
  )
}

# Visualize comparison: stack both subgroups with a label column so ggplot
# can colour the points and fit one line per group
comparison_data <- bind_rows(
  psych_writing %>% mutate(Group = "Psychology - Writing"),
  cs_coding %>% mutate(Group = "Computer Science - Coding")
)

ggplot(comparison_data, aes(x = AI_AssistanceLevel, y = SatisfactionRating, color = Group)) +
  geom_jitter(alpha = 0.2, width = 0.2) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "AI Impact Differs Across Disciplines",
    subtitle = comparison_subtitle,
    x = "AI Assistance Level",
    y = "Satisfaction Rating",
    color = "Group"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# 6. Model Diagnostics


# Check model assumptions: draw the diagnostic plots that plot() produces for
# the lm fit, arranged in a 2 x 2 grid.
# par() returns the previous settings, so they are restored exactly afterwards
# instead of assuming the layout was 1 x 1 beforehand.
old_par <- par(mfrow = c(2, 2))
plot(model)

par(old_par)

# Save model results (written to the current working directory)
saveRDS(model, "regression_model.rds")