# Assignment 4 - Cassidy Longoria

## Build a reproducible 30-row example data set.
## The seed is fixed first, then the three categorical columns are drawn with
## sample() in the order Gender, AgeGroup, EducationLevel.
set.seed(123)
data <- local({
  n_obs <- 30

  gender_draw <- sample(c("Male", "Female"), n_obs, replace = TRUE)
  age_draw <- sample(c("18-25", "26-35", "36-45"), n_obs, replace = TRUE)
  education_draw <- sample(
    c("High School", "Bachelor's", "Master's", "PhD"),
    n_obs,
    replace = TRUE
  )

  # Keep the drawn values as character vectors; they are not factors yet.
  data.frame(
    ID = seq_len(n_obs),
    Gender = gender_draw,
    AgeGroup = age_draw,
    EducationLevel = education_draw,
    stringsAsFactors = FALSE
  )
})

# Question 2: the current structure stores Gender, AgeGroup, and EducationLevel as character strings.

# Print a heading, then show how each column of `data` is stored before the
# factor conversion (the three categorical columns are still `chr`).
cat("\nStructure before conversion:\n", sep = "")

## 
## Structure before conversion:

utils::str(object = data)

## 'data.frame':    30 obs. of  4 variables:
##  $ ID            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender        : chr  "Male" "Male" "Male" "Female" ...
##  $ AgeGroup      : chr  "18-25" "26-35" "36-45" "26-35" ...
##  $ EducationLevel: chr  "High School" "Master's" "High School" "Bachelor's" ...

# Question 3: convert the character columns to factors using within(), overwriting the existing columns in place.

# Replace each character column with a factor that has explicit levels.
# Gender is an unordered factor; AgeGroup and EducationLevel are ordered, with
# the level order given below defining the ranking.
data$Gender <- factor(data$Gender, levels = c("Female", "Male"))

data$AgeGroup <- factor(
  data$AgeGroup,
  levels = c("18-25", "26-35", "36-45"),
  ordered = TRUE
)

data$EducationLevel <- factor(
  data$EducationLevel,
  levels = c("High School", "Bachelor's", "Master's", "PhD"),
  ordered = TRUE
)

# Question 4: verify the conversion from character strings to factors.

# Print a heading, then re-inspect `data` to confirm the three categorical
# columns are now reported as Factor / Ord.factor instead of chr.
cat("\nStructure after conversion:\n", sep = "")

## 
## Structure after conversion:

utils::str(object = data)

## 'data.frame':    30 obs. of  4 variables:
##  $ ID            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender        : Factor w/ 2 levels "Female","Male": 2 2 2 1 2 1 1 1 2 2 ...
##  $ AgeGroup      : Ord.factor w/ 3 levels "18-25"<"26-35"<..: 1 2 3 2 1 3 3 1 3 2 ...
##  $ EducationLevel: Ord.factor w/ 4 levels "High School"<..: 1 3 1 2 1 3 1 3 2 4 ...

# Question 5: use summary() for a quick breakdown of the factor columns.

# Print a heading, then the per-level counts of the three categorical columns
# (ID is left out of the selection).
cat("\nSummary of factor columns:\n", sep = "")

## 
## Summary of factor columns:

print(
  summary(
    data[, c("Gender", "AgeGroup", "EducationLevel")]
  )
)

##     Gender    AgeGroup      EducationLevel
##  Female:13   18-25:13   High School:8     
##  Male  :17   26-35: 7   Bachelor's :9     
##              36-45:10   Master's   :7     
##                         PhD        :6

# Question 6: create a bar plot for each factor.

# colorspace supplies the rainbow_hcl() and diverge_hcl() palette generators.
library(colorspace)

# Show the hex codes of the two palettes used as bar fills below.
rainbow_hcl(4)

## [1] "#E495A5" "#ABB065" "#39BEB1" "#ACA4E2"

diverge_hcl(3)

## [1] "#023FA5" "#E2E2E2" "#8E063B"

# Put the three charts in one row. par() returns the previous settings, which
# are saved in `op` so they can be restored once plotting is done.
op <- par(mfrow = c(1, 3), mar = c(5, 4, 2, 1))

# Counts per age group.
barplot(
  table(data[["AgeGroup"]]),
  col = diverge_hcl(3),
  main = "Age Groups",
  xlab = "Age Group",
  ylab = "Number of Individuals"
)

# Counts per gender.
barplot(
  table(data[["Gender"]]),
  col = c("magenta", "steelblue1"),
  main = "Gender",
  xlab = "Gender",
  ylab = "Number of Individuals"
)

# Counts per education level; las = 2 and a smaller cex.names are used for the
# longer category labels, and the x-axis title is left blank.
barplot(
  table(data[["EducationLevel"]]),
  col = rainbow_hcl(4),
  main = "Education Level",
  xlab = "",
  ylab = "Number of Individuals",
  las = 2,
  cex.names = 0.8
)

# Restore the graphics settings saved above.
par(op)

# Question 7: summary paragraph.

# Tabulate the age groups and find the most frequent level of each factor.
# which.max() returns the first maximum, so a tie resolves to the earliest
# level. props_age, top_ed and top_gen are computed here but are not used in
# the paragraph printed below.
counts_age <- table(data$AgeGroup)
props_age  <- prop.table(counts_age)
top_age    <- names(which.max(counts_age))
top_ed     <- names(which.max(table(data$EducationLevel)))
top_gen    <- names(which.max(table(data$Gender)))

# Print the paragraph. cat() separates its arguments with a space by default,
# which doubled the spaces already written into the text fragments and put a
# stray space at the start of the last line. sep = "" leaves all spacing to
# the fragments themselves, and the final "\n" terminates the output line.
cat(
  "\nParagraph:",
  "\nThe most prevalent age group is ", top_age,
  " with ", counts_age[top_age], " individuals.",
  "\nThis distribution suggests that analyses may be more strongly ",
  "influenced by the\n",
  top_age, " group compared to others.\n",
  sep = ""
)

## 
## Paragraph:
## The most prevalent age group is 18-25 with 13 individuals.
## This distribution suggests that analyses may be more strongly influenced by the
## 18-25 group compared to others.