# Simulate a small demographic data set with data.frame() and sample().
# The seed is fixed so every run draws the same 30 rows. The three sample()
# calls are evaluated in argument order, so their order must stay as written
# to keep the random stream (and therefore the data) unchanged.
set.seed(123)
data <- data.frame(
  ID = seq_len(30),
  Gender = sample(x = c("Male", "Female"), size = 30, replace = TRUE),
  AgeGroup = sample(x = c("18-25", "26-35", "36-45"), size = 30, replace = TRUE),
  EducationLevel = sample(
    x = c("High School", "Bachelor's", "Master's", "PhD"),
    size = 30,
    replace = TRUE
  ),
  # Keep the text columns as character vectors at this stage.
  stringsAsFactors = FALSE
)
Question 2 - The current structure contains character-string columns: Gender,
AgeGroup, and EducationLevel.
# Print a heading, then the compact structure of `data` so each column's
# storage type can be inspected.
cat("\nStructure before conversion:\n", sep = "")
##
## Structure before conversion:
str(object = data)
## 'data.frame': 30 obs. of 4 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ AgeGroup : chr "18-25" "26-35" "36-45" "26-35" ...
## $ EducationLevel: chr "High School" "Master's" "High School" "Bachelor's" ...
Question 3 - Converting the character-string columns to factors using within(),
replacing each column with its factor version.
# Recode the three text columns as factors. within() evaluates the braces
# with the columns in scope and writes the reassigned ones back into `data`.
# Gender is an unordered factor; AgeGroup and EducationLevel are ordered
# factors whose level order is given explicitly, lowest first.
data <- within(data, {
  EducationLevel <- ordered(
    EducationLevel,
    levels = c("High School", "Bachelor's", "Master's", "PhD")
  )
  AgeGroup <- ordered(AgeGroup, levels = c("18-25", "26-35", "36-45"))
  Gender <- factor(Gender, levels = c("Female", "Male"))
})
Question 4 - Verify the conversion from character strings to factors.
# Print a heading, then the structure of `data` again so the column types
# can be checked.
cat("\nStructure after conversion:\n", sep = "")
##
## Structure after conversion:
str(object = data)
## 'data.frame': 30 obs. of 4 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 2 1 1 1 2 2 ...
## $ AgeGroup : Ord.factor w/ 3 levels "18-25"<"26-35"<..: 1 2 3 2 1 3 3 1 3 2 ...
## $ EducationLevel: Ord.factor w/ 4 levels "High School"<..: 1 3 1 2 1 3 1 3 2 4 ...
Question 5 - Use summary() for a quick breakdown of the data.
# Print a heading, then summarise only the three categorical columns
# (the ID column is left out of the selection).
cat("\nSummary of factor columns:\n", sep = "")
##
## Summary of factor columns:
print(summary(data[, c("Gender", "AgeGroup", "EducationLevel"), drop = FALSE]))
## Gender AgeGroup EducationLevel
## Female:13 18-25:13 High School:8
## Male :17 26-35: 7 Bachelor's :9
## 36-45:10 Master's :7
## PhD :6
Question 6 - Creating graphs.
# colorspace supplies the HCL palette helpers used for the bar colours.
library(colorspace)
# Show the hex codes of the 4-colour rainbow palette.
rainbow_hcl(n = 4)
## [1] "#E495A5" "#ABB065" "#39BEB1" "#ACA4E2"
# Show the hex codes of the 3-colour diverging palette.
diverge_hcl(n = 3)
## [1] "#023FA5" "#E2E2E2" "#8E063B"
# Draw the three category counts side by side: one row, three panels.
# par() hands back the previous values of the settings it changes; they are
# kept in `op` and restored once all three panels are drawn.
op <- par(mfrow = c(1, 3), mar = c(5, 4, 2, 1))

# Panel 1: age groups, coloured with the 3-colour diverging palette.
barplot(
  height = table(data[["AgeGroup"]]),
  col = diverge_hcl(3),
  main = "Age Groups",
  xlab = "Age Group",
  ylab = "Number of Individuals"
)

# Panel 2: gender, with two fixed named colours.
barplot(
  height = table(data[["Gender"]]),
  col = c("magenta", "steelblue1"),
  main = "Gender",
  xlab = "Gender",
  ylab = "Number of Individuals"
)

# Panel 3: education level. las = 2 turns the axis labels perpendicular to
# the axis and cex.names shrinks them; the x-axis title is left empty.
barplot(
  height = table(data[["EducationLevel"]]),
  col = rainbow_hcl(4),
  main = "Education Level",
  xlab = "",
  ylab = "Number of Individuals",
  las = 2,
  cex.names = 0.8
)

# Put the graphics settings back the way they were.
par(op)
Question 7 - Summary paragraph.
# Build and print a short paragraph naming the most common age group.
counts_age <- table(data$AgeGroup)
# props_age, top_ed and top_gen are computed but not used by the paragraph
# printed below; they are kept so any later code that reads them still works.
props_age <- prop.table(counts_age)
# which.max() returns the first maximum, so a tie resolves to the earliest level.
top_age <- names(which.max(counts_age))
top_ed <- names(which.max(table(data$EducationLevel)))
top_gen <- names(which.max(table(data$Gender)))

# sep = "" because each fragment carries its own spacing. cat()'s default
# single-space separator doubled the spaces around the inserted values and
# put a stray leading space on the last line. The text now also ends with
# a newline so following output starts on a fresh line.
cat(
  "\nParagraph:",
  "\nThe most prevalent age group is ", top_age,
  " with ", counts_age[[top_age]], " individuals.",
  "\nThis distribution suggests that analyses may be more strongly influenced by the\n",
  top_age, " group compared to others.\n",
  sep = ""
)
##
## Paragraph:
## The most prevalent age group is 18-25 with 13 individuals.
## This distribution suggests that analyses may be more strongly influenced by the
## 18-25 group compared to others.