# AI Student Usage Analysis - Data Exploration

# Load required libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(lubridate)
library(scales)
## Warning: package 'scales' was built under R version 4.4.3
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
# Load data: read the session-level CSV into a tibble, letting readr guess
# the column types.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file location exists.
ai_data <- read_csv(
  file = "C:/Users/User1/Desktop/Medium Article/ai_assistant_usage_student_life.csv"
)
## Rows: 10000 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): SessionID, StudentLevel, Discipline, TaskType, FinalOutcome
## dbl  (4): SessionLengthMin, TotalPrompts, AI_AssistanceLevel, SatisfactionRa...
## lgl  (1): UsedAgain
## date (1): SessionDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 1. Data Overview


# Check structure: list every column with its type and first few values
ai_data %>% glimpse()
## Rows: 10,000
## Columns: 11
## $ SessionID          <chr> "SESSION00001", "SESSION00002", "SESSION00003", "SE…
## $ StudentLevel       <chr> "Undergraduate", "Undergraduate", "Undergraduate", …
## $ Discipline         <chr> "Computer Science", "Psychology", "Business", "Comp…
## $ SessionDate        <date> 2024-11-03, 2024-08-25, 2025-01-12, 2025-05-06, 20…
## $ SessionLengthMin   <dbl> 31.20, 13.09, 19.22, 3.70, 28.12, 7.54, 14.60, 9.20…
## $ TotalPrompts       <dbl> 11, 6, 5, 1, 9, 1, 3, 3, 1, 1, 6, 5, 6, 5, 8, 6, 2,…
## $ TaskType           <chr> "Studying", "Studying", "Coding", "Coding", "Writin…
## $ AI_AssistanceLevel <dbl> 2, 3, 3, 3, 3, 5, 3, 5, 3, 3, 2, 3, 5, 4, 3, 2, 5, …
## $ FinalOutcome       <chr> "Assignment Completed", "Assignment Completed", "As…
## $ UsedAgain          <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TR…
## $ SatisfactionRating <dbl> 1.0, 2.0, 3.3, 3.5, 2.9, 5.0, 1.9, 3.9, 3.3, 1.4, 1…
# Check missing values: count the NA entries in each column
ai_data %>%
  is.na() %>%
  colSums()
##          SessionID       StudentLevel         Discipline        SessionDate 
##                  0                  0                  0                  0 
##   SessionLengthMin       TotalPrompts           TaskType AI_AssistanceLevel 
##                  0                  0                  0                  0 
##       FinalOutcome          UsedAgain SatisfactionRating 
##                  0                  0                  0
# Summary statistics: print the per-column summary of the raw data
ai_data %>% summary()
##   SessionID         StudentLevel        Discipline         SessionDate        
##  Length:10000       Length:10000       Length:10000       Min.   :2024-06-24  
##  Class :character   Class :character   Class :character   1st Qu.:2024-09-20  
##  Mode  :character   Mode  :character   Mode  :character   Median :2024-12-25  
##                                                           Mean   :2024-12-23  
##                                                           3rd Qu.:2025-03-27  
##                                                           Max.   :2025-06-24  
##  SessionLengthMin  TotalPrompts      TaskType         AI_AssistanceLevel
##  Min.   :  0.03   Min.   : 1.000   Length:10000       Min.   :1.000     
##  1st Qu.:  9.63   1st Qu.: 2.000   Class :character   1st Qu.:3.000     
##  Median : 16.65   Median : 4.000   Mode  :character   Median :4.000     
##  Mean   : 19.85   Mean   : 5.607                      Mean   :3.479     
##  3rd Qu.: 26.67   3rd Qu.: 8.000                      3rd Qu.:4.000     
##  Max.   :110.81   Max.   :39.000                      Max.   :5.000     
##  FinalOutcome       UsedAgain       SatisfactionRating
##  Length:10000       Mode :logical   Min.   :1.000     
##  Class :character   FALSE:2936      1st Qu.:2.600     
##  Mode  :character   TRUE :7064      Median :3.500     
##                                     Mean   :3.418     
##                                     3rd Qu.:4.400     
##                                     Max.   :5.000
# 2. Data Cleaning & Preparation


# Remove rows with missing values: keep only rows that are complete in
# every column
ai_data <- drop_na(ai_data)

# Create categorical AI assistance level as a factor for plotting.
# Scores 1-2 map to "Low", 3 to "Medium", 4-5 to "High"; any other value
# (including NA) has no match in the lookup and becomes NA. The explicit
# level order keeps plots sorted Low -> Medium -> High.
ai_data <- mutate(
  ai_data,
  AI_Assistance_Label = factor(
    c("Low", "Low", "Medium", "High", "High")[match(AI_AssistanceLevel, 1:5)],
    levels = c("Low", "Medium", "High")
  )
)

# Extract date features: re-parse the session date with lubridate first,
# then derive the month label and the calendar year from the parsed date
ai_data <- ai_data %>%
  mutate(SessionDate = ymd(SessionDate)) %>%
  mutate(
    Month = month(SessionDate, label = TRUE),
    Year = year(SessionDate)
  )
# 3. Exploratory Analysis


# Task Type Distribution: number of sessions per task type, largest first
task_counts <- count(ai_data, TaskType, sort = TRUE)
# Plot 1: What are students using AI for?
# Horizontal bar chart of session counts per task type, bars ordered by
# count, with the count printed just past the end of each bar.
ggplot(task_counts, aes(x = reorder(TaskType, n), y = n, fill = TaskType)) +
  geom_col() +
  geom_text(aes(label = n), hjust = -0.2, size = 3) +
  # The count labels are drawn outside the bars (hjust < 0), so give the
  # count axis extra room at the far end; otherwise the label of the longest
  # bar can be cut off at the panel edge.
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  coord_flip() +
  labs(
    title = "What Are Students Using AI For?",
    subtitle = paste("Total sessions:", nrow(ai_data)),
    x = "Task Type",
    y = "Number of Sessions"
  ) +
  theme_minimal() +
  # Fill only distinguishes the bars; the axis already names each task type
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "Set2")

# Plot 2: AI Assistance Level vs Final Outcome
# One bar per assistance level, filled to 100% and split by final outcome,
# so the outcome mix can be compared across levels. Sessions without an
# assistance label are left out.
ggplot(
  data = filter(ai_data, !is.na(AI_Assistance_Label)),
  mapping = aes(x = AI_Assistance_Label, fill = FinalOutcome)
) +
  geom_bar(position = position_fill()) +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal() +
  labs(
    title = "Does Higher AI Assistance Lead to Better Outcomes?",
    subtitle = "Higher assistance correlates with more completions, but also more confusion",
    x = "AI Assistance Level",
    y = "Proportion of Sessions",
    fill = "Final Outcome"
  )

# Plot 3: AI Assistance by Discipline
# Stacked bars showing, within each discipline, the share of sessions at
# each assistance level (shares sum to 100% per discipline).
ai_data %>%
  # Leave out sessions without an assistance label so no NA segment is
  # stacked into the bars
  filter(!is.na(AI_Assistance_Label)) %>%
  count(Discipline, AI_Assistance_Label) %>%
  group_by(Discipline) %>%
  mutate(proportion = n / sum(n)) %>%
  # Drop the grouping once the per-discipline shares are computed so a
  # grouped tibble is not handed on to the plot
  ungroup() %>%
  ggplot(aes(x = Discipline, y = proportion, fill = AI_Assistance_Label)) +
  geom_col(position = "stack") +
  labs(
    title = "AI Usage Patterns Across Disciplines",
    subtitle = "Which fields rely most heavily on AI?",
    x = "Discipline",
    y = "Proportion",
    fill = "AI Assistance Level"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_brewer(palette = "Blues") +
  # Show the shares as percentages rather than raw 0-1 fractions
  scale_y_continuous(labels = percent_format())

# Plot 4: Satisfaction Rating Distribution
# Histogram of satisfaction ratings in half-point bins.
ggplot(ai_data, aes(x = SatisfactionRating)) +
  # boundary = 1 aligns the bin edges with the rating scale (1, 1.5, ..., 5).
  # With the default alignment the bins are centred on multiples of the bin
  # width, so the first and last bins straddle the ends of the 1-5 scale and
  # make the extreme ratings look rarer than they are.
  geom_histogram(
    binwidth = 0.5,
    boundary = 1,
    fill = "red",
    color = "yellow"
  ) +
  labs(
    title = "Distribution of Student Satisfaction Ratings",
    subtitle = "Most students report moderate to high satisfaction",
    x = "Satisfaction Rating (1-5)",
    y = "Count"
  ) +
  theme_minimal()

# Save cleaned data for later use: serialise the prepared tibble (with the
# added label and date columns) to an RDS file in the working directory
saveRDS(object = ai_data, file = "ai_data_cleaned.rds")