Basic Data Exploration in R

# Sample dataset: five observations under two alternating treatments (A/B),
# each with an insect count and a predator count.
df <- data.frame(
  ID = seq_len(5),
  Treatment = rep(c("A", "B"), length.out = 5),
  InsectCount = c(45, 50, 60, 70, 55),
  PredatorCount = c(5, 6, 4, 7, 5)
)

🧮 1. Basic Structure of Data

str(df)
'data.frame':   5 obs. of  4 variables:
 $ ID           : int  1 2 3 4 5
 $ Treatment    : chr  "A" "B" "A" "B" ...
 $ InsectCount  : num  45 50 60 70 55
 $ PredatorCount: num  5 6 4 7 5
head(df)
  ID Treatment InsectCount PredatorCount
1  1         A          45             5
2  2         B          50             6
3  3         A          60             4
4  4         B          70             7
5  5         A          55             5
tail(df)
  ID Treatment InsectCount PredatorCount
1  1         A          45             5
2  2         B          50             6
3  3         A          60             4
4  4         B          70             7
5  5         A          55             5
names(df)
[1] "ID"            "Treatment"     "InsectCount"   "PredatorCount"
colnames(df)
[1] "ID"            "Treatment"     "InsectCount"   "PredatorCount"
rownames(df)
[1] "1" "2" "3" "4" "5"
dim(df)
[1] 5 4
nrow(df)
[1] 5
ncol(df)
[1] 4
summary(df)
       ID     Treatment          InsectCount PredatorCount
 Min.   :1   Length:5           Min.   :45   Min.   :4.0  
 1st Qu.:2   Class :character   1st Qu.:50   1st Qu.:5.0  
 Median :3   Mode  :character   Median :55   Median :5.0  
 Mean   :3                      Mean   :56   Mean   :5.4  
 3rd Qu.:4                      3rd Qu.:60   3rd Qu.:6.0  
 Max.   :5                      Max.   :70   Max.   :7.0  

🔢 2. Data Types and Classes

class(df$InsectCount)
[1] "numeric"
typeof(df$InsectCount)
[1] "double"
mode(df$InsectCount)
[1] "numeric"
sapply(df, class)
           ID     Treatment   InsectCount PredatorCount 
    "integer"   "character"     "numeric"     "numeric" 
lapply(df, class)
$ID
[1] "integer"

$Treatment
[1] "character"

$InsectCount
[1] "numeric"

$PredatorCount
[1] "numeric"
x <- c(1, 2, 3)
typeof(x)
[1] "double"
class(x)
[1] "numeric"
mode(x)
[1] "numeric"

🔣 3. Check for Factor (Categorical Variable)

is.factor(df$Treatment)
[1] FALSE
df$Treatment <- as.factor(df$Treatment)
levels(df$Treatment)
[1] "A" "B"
nlevels(df$Treatment)
[1] 2

✅ 4. Check for Logical, Numeric, Character

is.numeric(df$InsectCount)
[1] TRUE
is.character(df$Treatment)
[1] FALSE
is.integer(df$ID)
[1] TRUE
is.logical(df$ID)
[1] FALSE

🆎 5. Unique Values and Frequency Tables

unique(df$Treatment)
[1] A B
Levels: A B
table(df$Treatment)

A B 
3 2 
table(df$Treatment, df$PredatorCount)
   
    4 5 6 7
  A 1 2 0 0
  B 0 0 1 1

๐Ÿ“ 6. Missing Values

any(is.na(df))
[1] FALSE
sum(is.na(df))
[1] 0
colSums(is.na(df))
           ID     Treatment   InsectCount PredatorCount 
            0             0             0             0 

๐Ÿ” 7. Inspect Specific Rows and Columns

df[5, ]
  ID Treatment InsectCount PredatorCount
5  5         A          55             5
df[1:5, ]
  ID Treatment InsectCount PredatorCount
1  1         A          45             5
2  2         B          50             6
3  3         A          60             4
4  4         B          70             7
5  5         A          55             5
df[, "InsectCount"]
[1] 45 50 60 70 55
df$InsectCount
[1] 45 50 60 70 55

🧠 8. Basic Descriptive Statistics

mean(df$InsectCount)
[1] 56
median(df$InsectCount)
[1] 55
sd(df$InsectCount)
[1] 9.617692
var(df$InsectCount)
[1] 92.5
range(df$InsectCount)
[1] 45 70
quantile(df$InsectCount, probs = c(0.25, 0.5, 0.75))
25% 50% 75% 
 50  55  60 

📊 9. Visual Exploration

# Histogram of the insect counts (distribution of a single numeric variable).
hist(df$InsectCount)

# Box plot of the insect counts (median, quartiles and any outliers).
boxplot(df$InsectCount)

# Bar chart of the number of observations in each Treatment level.
barplot(table(df$Treatment))

# Scatter plot: InsectCount on the x-axis, PredatorCount on the y-axis.
plot(df$InsectCount, df$PredatorCount)


📄 10. Checking Variable Names and Types at Once

# One-row-per-column overview: name, class, factor flag and NA count.
# vapply() declares the return type, so each column is always a plain vector.
# Multi-class columns (e.g. ordered factors, POSIXct) are collapsed to a single
# "a/b" label; with sapply(df, class) they would turn Class into a list.
data.frame(Variable = names(df),
           Class = vapply(df, function(col) paste(class(col), collapse = "/"),
                          character(1)),
           IsFactor = vapply(df, is.factor, logical(1)),
           Missing = colSums(is.na(df)))
                   Variable   Class IsFactor Missing
ID                       ID integer    FALSE       0
Treatment         Treatment  factor     TRUE       0
InsectCount     InsectCount numeric    FALSE       0
PredatorCount PredatorCount numeric    FALSE       0

🧹 11. Useful dplyr Summary (Optional)

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
glimpse(df)
Rows: 5
Columns: 4
$ ID            <int> 1, 2, 3, 4, 5
$ Treatment     <fct> A, B, A, B, A
$ InsectCount   <dbl> 45, 50, 60, 70, 55
$ PredatorCount <dbl> 5, 6, 4, 7, 5
# Per-column class and NA count, returned as a single row.
# Classes are collapsed with paste() so that multi-class columns (e.g. ordered
# factors, POSIXct) still yield exactly one value per column, as summarise()
# requires.
df %>%
  summarise(across(
    everything(),
    list(
      class = ~ paste(class(.), collapse = "/"),
      na = ~ sum(is.na(.))
    )
  ))
  ID_class ID_na Treatment_class Treatment_na InsectCount_class InsectCount_na
1  integer     0          factor            0           numeric              0
  PredatorCount_class PredatorCount_na
1             numeric                0

✅ Tip

You can wrap all of these checks into a single script or a Quarto or R Markdown report when exploring new data. This improves reproducibility and makes the results easier to report.