# AI Student Usage Analysis - Data Exploration
# Load required libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(lubridate)
library(scales)
## Warning: package 'scales' was built under R version 4.4.3
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Read the raw session-level CSV into a tibble; readr guesses the column types.
# NOTE(review): this is an absolute, machine-specific path - the script only
# runs where the file exists at exactly this location.
ai_data <- read_csv(
  file = "C:/Users/User1/Desktop/Medium Article/ai_assistant_usage_student_life.csv"
)
## Rows: 10000 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): SessionID, StudentLevel, Discipline, TaskType, FinalOutcome
## dbl (4): SessionLengthMin, TotalPrompts, AI_AssistanceLevel, SatisfactionRa...
## lgl (1): UsedAgain
## date (1): SessionDate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 1. Data Overview
# Print a compact column-by-column preview: name, type and first few values
dplyr::glimpse(ai_data)
## Rows: 10,000
## Columns: 11
## $ SessionID <chr> "SESSION00001", "SESSION00002", "SESSION00003", "SE…
## $ StudentLevel <chr> "Undergraduate", "Undergraduate", "Undergraduate", …
## $ Discipline <chr> "Computer Science", "Psychology", "Business", "Comp…
## $ SessionDate <date> 2024-11-03, 2024-08-25, 2025-01-12, 2025-05-06, 20…
## $ SessionLengthMin <dbl> 31.20, 13.09, 19.22, 3.70, 28.12, 7.54, 14.60, 9.20…
## $ TotalPrompts <dbl> 11, 6, 5, 1, 9, 1, 3, 3, 1, 1, 6, 5, 6, 5, 8, 6, 2,…
## $ TaskType <chr> "Studying", "Studying", "Coding", "Coding", "Writin…
## $ AI_AssistanceLevel <dbl> 2, 3, 3, 3, 3, 5, 3, 5, 3, 3, 2, 3, 5, 4, 3, 2, 5, …
## $ FinalOutcome <chr> "Assignment Completed", "Assignment Completed", "As…
## $ UsedAgain <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TR…
## $ SatisfactionRating <dbl> 1.0, 2.0, 3.3, 3.5, 2.9, 5.0, 1.9, 3.9, 3.3, 1.4, 1…
# Count the NA entries in every column
ai_data %>%
  is.na() %>%
  colSums()
## SessionID StudentLevel Discipline SessionDate
## 0 0 0 0
## SessionLengthMin TotalPrompts TaskType AI_AssistanceLevel
## 0 0 0 0
## FinalOutcome UsedAgain SatisfactionRating
## 0 0 0
# Per-column summaries: quantiles for numeric/date columns, counts for logicals
ai_data %>%
  summary()
## SessionID StudentLevel Discipline SessionDate
## Length:10000 Length:10000 Length:10000 Min. :2024-06-24
## Class :character Class :character Class :character 1st Qu.:2024-09-20
## Mode :character Mode :character Mode :character Median :2024-12-25
## Mean :2024-12-23
## 3rd Qu.:2025-03-27
## Max. :2025-06-24
## SessionLengthMin TotalPrompts TaskType AI_AssistanceLevel
## Min. : 0.03 Min. : 1.000 Length:10000 Min. :1.000
## 1st Qu.: 9.63 1st Qu.: 2.000 Class :character 1st Qu.:3.000
## Median : 16.65 Median : 4.000 Mode :character Median :4.000
## Mean : 19.85 Mean : 5.607 Mean :3.479
## 3rd Qu.: 26.67 3rd Qu.: 8.000 3rd Qu.:4.000
## Max. :110.81 Max. :39.000 Max. :5.000
## FinalOutcome UsedAgain SatisfactionRating
## Length:10000 Mode :logical Min. :1.000
## Class :character FALSE:2936 1st Qu.:2.600
## Mode :character TRUE :7064 Median :3.500
## Mean :3.418
## 3rd Qu.:4.400
## Max. :5.000
# 2. Data Cleaning & Preparation
# Keep only complete rows (any row with an NA in any column is dropped)
ai_data <- drop_na(ai_data)
# Bucket the numeric assistance score into a Low / Medium / High factor.
# Scores that match none of the branches (anything other than 1-5) become NA.
# The factor levels are fixed in Low < Medium < High order so plots and
# legends list them in that order rather than alphabetically.
ai_data <- ai_data %>%
  mutate(
    AI_Assistance_Label = factor(
      case_when(
        AI_AssistanceLevel %in% 1:2 ~ "Low",
        AI_AssistanceLevel == 3 ~ "Medium",
        AI_AssistanceLevel %in% 4:5 ~ "High"
      ),
      levels = c("Low", "Medium", "High")
    )
  )
# Derive calendar fields from the session date.
# SessionDate is passed through ymd() first; Month and Year are then computed
# from that parsed value (mutate evaluates its arguments in order).
ai_data <- mutate(
  ai_data,
  SessionDate = ymd(SessionDate),
  Month = month(SessionDate, label = TRUE),
  Year = year(SessionDate)
)
# 3. Exploratory Analysis
# Number of sessions per task type, most frequent first
task_counts <- count(ai_data, TaskType, sort = TRUE)
# Plot 1: What are students using AI for?
# Horizontal bar chart of session counts per task type, longest bar on top,
# with each bar's count printed just past its end.
ggplot(task_counts, aes(x = reorder(TaskType, n), y = n, fill = TaskType)) +
  geom_col() +
  geom_text(aes(label = n), hjust = -0.2, size = 3) +
  # The count labels sit outside the bars (hjust < 0). Widen the upper end of
  # the count axis so the label on the longest bar is not cut off at the
  # panel edge; the lower end keeps ggplot2's default 5% padding.
  scale_y_continuous(expand = expansion(mult = c(0.05, 0.15))) +
  coord_flip() +
  labs(
    title = "What Are Students Using AI For?",
    subtitle = paste("Total sessions:", nrow(ai_data)),
    x = "Task Type",
    y = "Number of Sessions"
  ) +
  theme_minimal() +
  # Fill only colours bars that are already named on the axis, so the legend
  # would be redundant
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "Set2")

# Plot 2: AI Assistance Level vs Final Outcome
# 100%-stacked bars: within each assistance level, the share of sessions that
# ended in each final outcome. Sessions without an assistance label are
# excluded before plotting.
ggplot(
  data = filter(ai_data, !is.na(AI_Assistance_Label)),
  mapping = aes(x = AI_Assistance_Label, fill = FinalOutcome)
) +
  # position = "fill" rescales every bar to height 1, so the axis is a share
  geom_bar(position = "fill") +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Does Higher AI Assistance Lead to Better Outcomes?",
    subtitle = "Higher assistance correlates with more completions, but also more confusion",
    x = "AI Assistance Level",
    y = "Proportion of Sessions",
    fill = "Final Outcome"
  ) +
  theme_minimal()

# Plot 3: AI Assistance by Discipline
# Stacked bars showing, for every discipline, the share of its sessions at
# each assistance level (Low / Medium / High).
ai_data %>%
  # Drop sessions without an assistance label, as Plot 2 does, so both charts
  # are built from the same rows
  filter(!is.na(AI_Assistance_Label)) %>%
  count(Discipline, AI_Assistance_Label) %>%
  # Per-operation grouping: shares sum to 1 within each discipline and the
  # result carries no leftover grouping
  mutate(proportion = n / sum(n), .by = Discipline) %>%
  ggplot(aes(x = Discipline, y = proportion, fill = AI_Assistance_Label)) +
  geom_col(position = "stack") +
  # Label the shares as percentages, matching the percent axis used in Plot 2
  scale_y_continuous(labels = percent_format()) +
  labs(
    title = "AI Usage Patterns Across Disciplines",
    subtitle = "Which fields rely most heavily on AI?",
    x = "Discipline",
    y = "Proportion",
    fill = "AI Assistance Level"
  ) +
  theme_minimal() +
  # Tilt the discipline names so long labels do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_brewer(palette = "Blues")

# Plot 4: Satisfaction Rating Distribution
# Histogram of satisfaction ratings in 0.5-wide bins.
ggplot(ai_data, aes(x = SatisfactionRating)) +
  # boundary = 1 aligns the bin edges with the rating scale (1-1.5, 1.5-2,
  # ..., 4.5-5). Without it ggplot2 centres bins on multiples of the bin
  # width, so the first and last bins straddle 1 and 5 and only half of each
  # covers valid ratings, which makes the edge bars look artificially short.
  geom_histogram(
    binwidth = 0.5,
    boundary = 1,
    fill = "red",
    color = "yellow"
  ) +
  labs(
    title = "Distribution of Student Satisfaction Ratings",
    subtitle = "Most students report moderate to high satisfaction",
    x = "Satisfaction Rating (1-5)",
    y = "Count"
  ) +
  theme_minimal()

# Persist the cleaned tibble (including the derived label and date columns)
# to the working directory so later scripts can reload it with readRDS()
saveRDS(object = ai_data, file = "ai_data_cleaned.rds")