# Data Analysis using R

# Vectors and Data Frames in R

# ------------------------------------------------------------

############################################################
# Clean R Environment
############################################################
# Remove all user-created objects from the workspace.
# NOTE: this only deletes objects; it does not unload packages or reset
# options(). Restarting the R session is the reliable way to start clean.
rm(list = ls())

############################################################
### Set the working directory where your files are located
# The path below is specific to the author's machine; replace it with your
# own data folder. The guard keeps the script running in the current
# directory instead of stopping with "cannot change working directory".
data_dir <- "D:/D Drive/Ph.D. Course Work/PhD_2025/DataFile"
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  message("Directory not found, keeping current working directory: ", data_dir)
}
### Verify the current working directory
getwd()
## [1] "D:/D Drive/Ph.D. Course Work/PhD_2025/DataFile"
# Build two parallel vectors with c(): student IDs and their marks.
st <- c("s1", "s2", "s3", "s4")  # Character vector
marks <- c(20.5, 25, 27, 21)     # Numeric vector
marks
## [1] 20.5 25.0 27.0 21.0
### R usually displays 7 significant digits for numeric values.
### Sequence of integers increment by 1
1:10
##  [1]  1  2  3  4  5  6  7  8  9 10
### Sequence with arbitrary increment
seq(from = 1, to = 20, by = 0.2)
##  [1]  1.0  1.2  1.4  1.6  1.8  2.0  2.2  2.4  2.6  2.8  3.0  3.2  3.4  3.6  3.8
## [16]  4.0  4.2  4.4  4.6  4.8  5.0  5.2  5.4  5.6  5.8  6.0  6.2  6.4  6.6  6.8
## [31]  7.0  7.2  7.4  7.6  7.8  8.0  8.2  8.4  8.6  8.8  9.0  9.2  9.4  9.6  9.8
## [46] 10.0 10.2 10.4 10.6 10.8 11.0 11.2 11.4 11.6 11.8 12.0 12.2 12.4 12.6 12.8
## [61] 13.0 13.2 13.4 13.6 13.8 14.0 14.2 14.4 14.6 14.8 15.0 15.2 15.4 15.6 15.8
## [76] 16.0 16.2 16.4 16.6 16.8 17.0 17.2 17.4 17.6 17.8 18.0 18.2 18.4 18.6 18.8
## [91] 19.0 19.2 19.4 19.6 19.8 20.0
### `:` creates integer sequences with step 1; seq() allows custom increments.
rep(c("A", "B"), 5) # Repeats the whole vector the given number of times.
##  [1] "A" "B" "A" "B" "A" "B" "A" "B" "A" "B"
st[3]   # Access the 3rd element
## [1] "s3"
st[-3]  # Exclude the 3rd element (a negative index drops that position)
## [1] "s1" "s2" "s4"
###Types of Vectors
####### Numeric
####### Integer
####### Complex
####### Character
####### Logical


### Factor
### Factors are used for categorical data in R.
v <- c(1, 1, 0, 0, 1, 1)
# `levels` lists the raw values in order and `labels` renames them by
# position (0 -> "good", 1 -> "bad"); `ordered = TRUE` makes good < bad.
f1 <- factor(v,
             levels = c(0, 1),
             labels = c("good", "bad"),
             ordered = TRUE)
f1
## [1] bad  bad  good good bad  bad 
## Levels: good < bad
levels(f1)  # Get all factor levels
## [1] "good" "bad"
nlevels(f1) # Number of levels
## [1] 2
### Factors store categorical data and can be ordered. Useful for statistical modeling.


# as.numeric() on a factor returns the internal level codes
# (good = 1, bad = 2), NOT the original values stored in `v` (1 1 0 0 1 1).
as.numeric(f1)       # Factor -> integer level codes
## [1] 2 2 1 1 2 2
# Type conversion and vector length, using `marks` and `st` created earlier.
as.character(marks)  # Convert numeric to character (each value becomes a string)
## [1] "20.5" "25"   "27"   "21"
length(st)        # Number of elements
## [1] 4
st[length(st)]    # Last element: index the vector with its own length
## [1] "s4"
# Numerical Operations on Vectors
# Arithmetic with a single number is applied to every element of `marks`,
# as the printed results below show.

marks + 5   # Addition
## [1] 25.5 30.0 32.0 26.0
marks - 2   # Subtraction
## [1] 18.5 23.0 25.0 19.0
marks * 2   # Multiplication
## [1] 41 50 54 42
marks / 2   # Division
## [1] 10.25 12.50 13.50 10.50
marks ^ 2   # Exponentiation
## [1] 420.25 625.00 729.00 441.00
### Vector Summary Functions
# min(), max(), sum() and prod() each reduce the vector to a single value;
# cumsum() returns a running total with one entry per element.

min(marks)
## [1] 20.5
max(marks)
## [1] 27
sum(marks)
## [1] 93.5
prod(marks)
## [1] 290587.5
cumsum(marks)  # Cumulative sum
## [1] 20.5 45.5 72.5 93.5
### Logical Operators
# !, & and | operate element-wise on logical vectors and return a logical
# vector of the same length.
x <- c(TRUE, FALSE, TRUE)
!x                     # NOT
## [1] FALSE  TRUE FALSE
x & c(TRUE, TRUE, FALSE)  # AND
## [1]  TRUE FALSE FALSE
x | c(FALSE, TRUE, TRUE)  # OR
## [1] TRUE TRUE TRUE
# Comparisons are element-wise and return one TRUE/FALSE per element.
marks > 22             # Comparison
## [1] FALSE  TRUE  TRUE FALSE
# NOTE: `==` tests exact equality; for doubles that come out of a
# calculation, prefer isTRUE(all.equal(a, b)) to tolerate rounding error.
marks == 25            # Equality
## [1] FALSE  TRUE FALSE FALSE
### paste() combines strings with spaces or separators.
# The call is vectorised: one string is built per element of `st`/`marks`,
# with the pieces joined by a single space (the default separator).
paste("Student", st, "has marks", marks)
## [1] "Student s1 has marks 20.5" "Student s2 has marks 25"  
## [3] "Student s3 has marks 27"   "Student s4 has marks 21"
sort(marks)   # Sort ascending
## [1] 20.5 21.0 25.0 27.0
rank(marks)   # Rank of each element (1 = smallest)
## [1] 1 3 4 2
order(marks)  # Indices that would sort the vector: marks[order(marks)] equals sort(marks)
## [1] 1 4 2 3
### Data Frame
# Combine the vectors into a table: one row per student, one column per
# vector. `Grade` is supplied inline and has the same length (4) as the others.
df <- data.frame(Name = st, Marks = marks, Grade = c("A", "B", "A", "C"))
df
##   Name Marks Grade
## 1   s1  20.5     A
## 2   s2  25.0     B
## 3   s3  27.0     A
## 4   s4  21.0     C
### Data frames store different types in columns but all columns must have the same length.

# A second, all-numeric data frame built directly from inline vectors.
cars_df <- data.frame(speed = c(4, 7, 12), dist = c(2, 10, 24))
cars_df
##   speed dist
## 1     4    2
## 2     7   10
## 3    12   24
### Data Frame Dimensions and Column Names
# Inspect the shape and labels of `df` (4 rows, 3 columns).

nrow(df)     # Number of rows
## [1] 4
ncol(df)     # Number of columns
## [1] 3
dim(df)      # Rows x Columns
## [1] 4 3
names(df)    # Column names
## [1] "Name"  "Marks" "Grade"
dimnames(df) # Row and column names, as a list: [[1]] row names, [[2]] column names
## [[1]]
## [1] "1" "2" "3" "4"
## 
## [[2]]
## [1] "Name"  "Marks" "Grade"
### Subset of Data Frame
# Index with df[rows, columns]; leaving one side empty selects everything
# along that dimension. `df` has only 4 rows, so the examples use indices
# within 1..4 -- a positive row index beyond nrow(df) would return a row
# filled with NA instead of raising an error.
df[3, ]           # All variables for 3rd observation
##   Name Marks Grade
## 3   s3    27     A
df[, 2]           # 2nd variable for all observations (returned as a vector)
## [1] 20.5 25.0 27.0 21.0
df[c(1:2, 4), ]   # Observations 1 to 2 and 4
##   Name Marks Grade
## 1   s1  20.5     A
## 2   s2  25.0     B
## 4   s4  21.0     C
df[-(2:3), ]      # Remove observations 2 to 3
##   Name Marks Grade
## 1   s1  20.5     A
## 4   s4  21.0     C
df[nrow(df), ]    # Last observation
##   Name Marks Grade
## 4   s4    21     C