Basic Data Exploration in R

# Sample dataset: five rows holding an integer ID, a treatment label
# ("A" or "B"), and two numeric count columns.
df <- data.frame(
  ID            = seq_len(5),
  Treatment     = c("A", "B", "A", "B", "A"),
  InsectCount   = c(45, 50, 60, 70, 55),
  PredatorCount = c(5, 6, 4, 7, 5)
)

🧮 1. Basic Structure of Data

str(df)

'data.frame':   5 obs. of  4 variables:
 $ ID           : int  1 2 3 4 5
 $ Treatment    : chr  "A" "B" "A" "B" ...
 $ InsectCount  : num  45 50 60 70 55
 $ PredatorCount: num  5 6 4 7 5

head(df)

  ID Treatment InsectCount PredatorCount
1  1         A          45             5
2  2         B          50             6
3  3         A          60             4
4  4         B          70             7
5  5         A          55             5

tail(df)

  ID Treatment InsectCount PredatorCount
1  1         A          45             5
2  2         B          50             6
3  3         A          60             4
4  4         B          70             7
5  5         A          55             5

names(df)

[1] "ID"            "Treatment"     "InsectCount"   "PredatorCount"

colnames(df)

[1] "ID"            "Treatment"     "InsectCount"   "PredatorCount"

rownames(df)

[1] "1" "2" "3" "4" "5"

dim(df)

[1] 5 4

nrow(df)

[1] 5

ncol(df)

[1] 4

summary(df)

       ID     Treatment          InsectCount PredatorCount
 Min.   :1   Length:5           Min.   :45   Min.   :4.0  
 1st Qu.:2   Class :character   1st Qu.:50   1st Qu.:5.0  
 Median :3   Mode  :character   Median :55   Median :5.0  
 Mean   :3                      Mean   :56   Mean   :5.4  
 3rd Qu.:4                      3rd Qu.:60   3rd Qu.:6.0  
 Max.   :5                      Max.   :70   Max.   :7.0

🔢 2. Data Types and Classes

class(df$InsectCount)

[1] "numeric"

typeof(df$InsectCount)

[1] "double"

mode(df$InsectCount)

[1] "numeric"

sapply(df, class)

           ID     Treatment   InsectCount PredatorCount 
    "integer"   "character"     "numeric"     "numeric"

lapply(df, class)

$ID
[1] "integer"

$Treatment
[1] "character"

$InsectCount
[1] "numeric"

$PredatorCount
[1] "numeric"

x <- c(1, 2, 3)
typeof(x)

[1] "double"

class(x)

[1] "numeric"

mode(x)

[1] "numeric"

🔣 3. Check for Factor (Categorical Variable)

is.factor(df$Treatment)

[1] FALSE

df$Treatment <- as.factor(df$Treatment)
levels(df$Treatment)

[1] "A" "B"

nlevels(df$Treatment)

[1] 2

✅ 4. Check for Numeric, Character, Integer, Logical

is.numeric(df$InsectCount)

[1] TRUE

is.character(df$Treatment)

[1] FALSE

is.integer(df$ID)

[1] TRUE

is.logical(df$ID)

[1] FALSE

🆎 5. Unique Values and Frequency Tables

unique(df$Treatment)

[1] A B
Levels: A B

table(df$Treatment)


A B 
3 2

table(df$Treatment, df$PredatorCount)

   
    4 5 6 7
  A 1 2 0 0
  B 0 0 1 1

📏 6. Missing Values

any(is.na(df))

[1] FALSE

sum(is.na(df))

[1] 0

colSums(is.na(df))

           ID     Treatment   InsectCount PredatorCount 
            0             0             0             0

🔍 7. Inspect Specific Rows and Columns

df[5, ]

  ID Treatment InsectCount PredatorCount
5  5         A          55             5

df[1:5, ]

  ID Treatment InsectCount PredatorCount
1  1         A          45             5
2  2         B          50             6
3  3         A          60             4
4  4         B          70             7
5  5         A          55             5

df[, "InsectCount"]

[1] 45 50 60 70 55

df$InsectCount

[1] 45 50 60 70 55

🧠 8. Basic Descriptive Statistics

mean(df$InsectCount)

[1] 56

median(df$InsectCount)

[1] 55

sd(df$InsectCount)

[1] 9.617692

var(df$InsectCount)

[1] 92.5

range(df$InsectCount)

[1] 45 70

quantile(df$InsectCount, probs = c(0.25, 0.5, 0.75))

25% 50% 75% 
 50  55  60

📊 9. Visual Exploration

hist(df$InsectCount)

boxplot(df$InsectCount)

barplot(table(df$Treatment))

plot(df$InsectCount, df$PredatorCount)

📄 10. Checking Variable Names and Types at Once

# Per-column overview of `df`: column name, class, whether the column is a
# factor, and its number of missing values.
# vapply() pins the result type of each helper, and the class vector is
# collapsed into a single string so that a column carrying several classes
# (e.g. an ordered factor) still contributes exactly one value; with
# sapply(df, class) such a column turns the result into a list and the
# data.frame() call fails.
data.frame(
  Variable = names(df),
  Class = vapply(
    df,
    function(col) paste(class(col), collapse = "/"),
    character(1)
  ),
  IsFactor = vapply(df, is.factor, logical(1)),
  Missing = colSums(is.na(df))
)

                   Variable   Class IsFactor Missing
ID                       ID integer    FALSE       0
Treatment         Treatment  factor     TRUE       0
InsectCount     InsectCount numeric    FALSE       0
PredatorCount PredatorCount numeric    FALSE       0

🧹 11. Useful dplyr Summary (Optional)

library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

glimpse(df)

Rows: 5
Columns: 4
$ ID            <int> 1, 2, 3, 4, 5
$ Treatment     <fct> A, B, A, B, A
$ InsectCount   <dbl> 45, 50, 60, 70, 55
$ PredatorCount <dbl> 5, 6, 4, 7, 5

# One-row overview: for every column report its class and its NA count
# (result columns are named <column>_class and <column>_na).
# The class vector is collapsed into one string so that a column carrying
# several classes (e.g. an ordered factor) still yields a single value,
# which is what a one-row summarise() result requires.
df %>%
  summarise(
    across(
      everything(),
      list(
        class = function(col) paste(class(col), collapse = "/"),
        na = function(col) sum(is.na(col))
      )
    )
  )

  ID_class ID_na Treatment_class Treatment_na InsectCount_class InsectCount_na
1  integer     0          factor            0           numeric              0
  PredatorCount_class PredatorCount_na
1             numeric                0

✅ Tip

You can wrap all of these checks into a single script or a Quarto/R Markdown report and rerun it whenever you explore a new dataset. This keeps your exploration reproducible and makes the results easy to share.