# Sample dataset
df <- data.frame(
ID = 1:5,
Treatment = c("A", "B", "A", "B", "A"),
InsectCount = c(45, 50, 60, 70, 55),
PredatorCount = c(5, 6, 4, 7, 5)
)

# Basic Data Exploration in R

## 🧮 1. Basic Structure of Data

str(df)
'data.frame': 5 obs. of 4 variables:
$ ID : int 1 2 3 4 5
$ Treatment : chr "A" "B" "A" "B" ...
$ InsectCount : num 45 50 60 70 55
$ PredatorCount: num 5 6 4 7 5
head(df)
ID Treatment InsectCount PredatorCount
1 1 A 45 5
2 2 B 50 6
3 3 A 60 4
4 4 B 70 7
5 5 A 55 5
tail(df)
ID Treatment InsectCount PredatorCount
1 1 A 45 5
2 2 B 50 6
3 3 A 60 4
4 4 B 70 7
5 5 A 55 5
names(df)
[1] "ID" "Treatment" "InsectCount" "PredatorCount"
colnames(df)
[1] "ID" "Treatment" "InsectCount" "PredatorCount"
rownames(df)
[1] "1" "2" "3" "4" "5"
dim(df)
[1] 5 4
nrow(df)
[1] 5
ncol(df)
[1] 4
summary(df)
ID Treatment InsectCount PredatorCount
Min. :1 Length:5 Min. :45 Min. :4.0
1st Qu.:2 Class :character 1st Qu.:50 1st Qu.:5.0
Median :3 Mode :character Median :55 Median :5.0
Mean :3 Mean :56 Mean :5.4
3rd Qu.:4 3rd Qu.:60 3rd Qu.:6.0
Max. :5 Max. :70 Max. :7.0

## 🔢 2. Data Types and Classes

class(df$InsectCount)
[1] "numeric"
typeof(df$InsectCount)
[1] "double"
mode(df$InsectCount)
[1] "numeric"
sapply(df, class)
ID Treatment InsectCount PredatorCount
"integer" "character" "numeric" "numeric"
lapply(df, class)
$ID
[1] "integer"
$Treatment
[1] "character"
$InsectCount
[1] "numeric"
$PredatorCount
[1] "numeric"
x <- c(1, 2, 3)
typeof(x)
[1] "double"
class(x)
[1] "numeric"
mode(x)
[1] "numeric"

## 🔣 3. Check for Factor (Categorical Variable)

is.factor(df$Treatment)
[1] FALSE
df$Treatment <- as.factor(df$Treatment)
levels(df$Treatment)
[1] "A" "B"
nlevels(df$Treatment)
[1] 2

## 4. Check for Logical, Numeric, Character

is.numeric(df$InsectCount)
[1] TRUE
is.character(df$Treatment)
[1] FALSE
is.integer(df$ID)
[1] TRUE
is.logical(df$ID)
[1] FALSE

## 5. Unique Values and Frequency Tables

unique(df$Treatment)
[1] A B
Levels: A B
table(df$Treatment)
A B
3 2
table(df$Treatment, df$PredatorCount)
4 5 6 7
A 1 2 0 0
B 0 0 1 1

## 6. Missing Values

any(is.na(df))
[1] FALSE
sum(is.na(df))
[1] 0
colSums(is.na(df))
ID Treatment InsectCount PredatorCount
0 0 0 0

## 7. Inspect Specific Rows and Columns

df[5, ]
ID Treatment InsectCount PredatorCount
5 5 A 55 5
df[1:5, ]
ID Treatment InsectCount PredatorCount
1 1 A 45 5
2 2 B 50 6
3 3 A 60 4
4 4 B 70 7
5 5 A 55 5
df[, "InsectCount"]
[1] 45 50 60 70 55
df$InsectCount
[1] 45 50 60 70 55

## 8. Basic Descriptive Statistics

mean(df$InsectCount)
[1] 56
median(df$InsectCount)
[1] 55
sd(df$InsectCount)
[1] 9.617692
var(df$InsectCount)
[1] 92.5
range(df$InsectCount)
[1] 45 70
quantile(df$InsectCount, probs = c(0.25, 0.5, 0.75))
25% 50% 75%
50 55 60

## 9. Visual Exploration

hist(df$InsectCount)
boxplot(df$InsectCount)
barplot(table(df$Treatment))
plot(df$InsectCount, df$PredatorCount)

## 10. Checking Variable Names and Types at Once

data.frame(Variable = names(df),
Class = sapply(df, class),
IsFactor = sapply(df, is.factor),
Missing = colSums(is.na(df)))
Variable Class IsFactor Missing
ID ID integer FALSE 0
Treatment Treatment factor TRUE 0
InsectCount InsectCount numeric FALSE 0
PredatorCount PredatorCount numeric FALSE 0

## 🧹 11. Useful dplyr Summary (Optional)

library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
glimpse(df)
Rows: 5
Columns: 4
$ ID <int> 1, 2, 3, 4, 5
$ Treatment <fct> A, B, A, B, A
$ InsectCount <dbl> 45, 50, 60, 70, 55
$ PredatorCount <dbl> 5, 6, 4, 7, 5
df %>%
summarise(across(everything(), list(class = ~class(.), na = ~sum(is.na(.)))))
ID_class ID_na Treatment_class Treatment_na InsectCount_class InsectCount_na
1 integer 0 factor 0 numeric 0
PredatorCount_class PredatorCount_na
1 numeric 0

## Tip

You can wrap all your checks into one script or Quarto/Markdown report when exploring new data. This improves reproducibility and reporting.