We use a small data set from a satisfaction survey to answer a series of follow-up questions about it:

Find the mean and median satisfaction ratings

# Creating the dataset
RespondentID <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)
Age <- c(25, 34, 28, 45, 30, 22, 40, 35, 29, 50, 31, 38, 27, 42, 26, 33, 39, 28, 46, 31)
Gender <- c("Female", "Male", "Non-binary", "Male", "Female", "Female", "Male", "Non-binary", "Female", "Male", 
            "Female", "Male", "Female", "Non-binary", "Male", "Female", "Male", "Female", "Non-binary", "Male")
Satisfaction <- c(8, 6, 7, 5, 9, 7, 4, 6, 8, 7, 9, 6, 7, 8, 5, 8, 7, 6, 7, 5)
Department <- c("HR", "IT", "Finance", "Marketing", "HR", "IT", "Finance", "Marketing", "HR", "Finance",
                "Marketing", "IT", "HR", "Finance", "Marketing", "IT", "HR", "Finance", "Marketing", "IT")

# Combining into a data frame
data <- data.frame(RespondentID, Age, Gender, Satisfaction, Department)

# Calculating mean and median satisfaction
mean_satisfaction <- mean(data$Satisfaction)
median_satisfaction <- median(data$Satisfaction)

# Printing the results
cat("Mean Satisfaction:", mean_satisfaction, "\n")
Mean Satisfaction: 6.75 
cat("Median Satisfaction:", median_satisfaction, "\n")
Median Satisfaction: 7 

We want to know the average age of respondents in the HR department.

# Create a vector of ages for respondents in the 'HR' department
hr_ages <- data$Age[data$Department == 'HR']

# Compute the average age of respondents in the 'HR' department
average_age_hr <- mean(hr_ages)

# Print the average age
average_age_hr
[1] 30
# Convert the Age and Satisfaction columns into a matrix
age_satisfaction_matrix <- as.matrix(data[, c("Age", "Satisfaction")])

# Bin the ages into 10-year groups and compute the mean satisfaction per group
age_groups <- cut(age_satisfaction_matrix[, 1], breaks = seq(20, 60, by = 10), right = FALSE)
mean_satisfaction_by_age_group <- tapply(age_satisfaction_matrix[, 2], age_groups, mean)

# Print the mean satisfaction rating for each age group
mean_satisfaction_by_age_group
 [20,30)  [30,40)  [40,50)  [50,60) 
6.857143 7.000000 6.000000 7.000000 

After learning the mean satisfaction rating for each age group, we want to know the mean satisfaction rating by department.

# Assuming the 'data' data frame is already created

# Calculate mean satisfaction ratings by department
mean_satisfaction <- aggregate(Satisfaction ~ Department, data = data, FUN = mean)

# Convert the result to a matrix
satisfaction_matrix <- as.matrix(mean_satisfaction[, -1])
rownames(satisfaction_matrix) <- mean_satisfaction$Department

# Display the matrix
print(satisfaction_matrix)
          [,1]
Finance    6.4
HR         7.8
IT         6.4
Marketing  6.4

We want to know how many rows and columns are in the data frame, along with a summary of the Gender and Department columns.

# Get the number of rows and columns
dimensions <- dim(data)
rows <- dimensions[1]
cols <- dimensions[2]

# Summary of Gender and Department columns
gender_summary <- summary(data$Gender)
department_summary <- summary(data$Department)

# Print the results
print(paste("Number of rows:", rows))
[1] "Number of rows: 20"
print(paste("Number of columns:", cols))
[1] "Number of columns: 5"
print("Summary of Gender:")
[1] "Summary of Gender:"
print(gender_summary)
   Length     Class      Mode 
       20 character character 
print("Summary of Department:")
[1] "Summary of Department:"
print(department_summary)
   Length     Class      Mode 
       20 character character 

We want to find the average satisfaction rating of respondents aged 30 to 40 (inclusive).

# Assuming the 'data' data frame is already created

# Filter the data frame for respondents aged between 30 and 40
filtered_data <- subset(data, Age >= 30 & Age <= 40)

# Calculate the average satisfaction rating for this age group
average_satisfaction <- mean(filtered_data$Satisfaction)

# Print the average satisfaction rating
print(paste("Average satisfaction rating for respondents aged 30 to 40:", average_satisfaction))
[1] "Average satisfaction rating for respondents aged 30 to 40: 6.66666666666667"

We want a histogram of the age column.

# Load necessary library for visualization
library(ggplot2)

# Create a histogram of the Age column
ggplot(data, aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of Ages", x = "Age", y = "Frequency") +
  theme_minimal()

We want a histogram of the Satisfaction ratings.

# Load necessary library for visualization
library(ggplot2)

# Create a histogram of the Satisfaction ratings
ggplot(data, aes(x = Satisfaction)) +
  geom_histogram(binwidth = 1, fill = "green", color = "black", alpha = 0.7) +
  labs(title = "Histogram of Satisfaction Ratings", x = "Satisfaction Rating", y = "Frequency") +
  theme_minimal()

library(ggplot2)

# Create the boxplot
ggplot(data, aes(x = Department, y = Satisfaction)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot of Satisfaction Ratings by Department",
       x = "Department",
       y = "Satisfaction Rating") +
  theme_minimal()

The box plot above shows satisfaction by department. Next we want a second box plot showing age by gender.

# Load necessary library
library(ggplot2)

# Create the boxplot for Age by Gender
ggplot(data, aes(x = Gender, y = Age)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Boxplot of Age by Gender",
       x = "Gender",
       y = "Age") +
  theme_minimal()

We now need to identify outliers using the IQR method.

# Create the data frame
data <- data.frame(
  Satisfaction = c(8, 6, 7, 5, 9, 7, 4, 6, 8, 7, 9, 6, 7, 8, 5, 8, 7, 6, 7, 5)
)

# Calculate Q1 and Q3
Q1 <- quantile(data$Satisfaction, 0.25)
Q3 <- quantile(data$Satisfaction, 0.75)

# Calculate IQR
IQR_value <- IQR(data$Satisfaction)

# Determine outlier thresholds
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Identify outliers
outliers <- data$Satisfaction[data$Satisfaction < lower_bound | data$Satisfaction > upper_bound]

# Print outliers
print(outliers)
numeric(0)

Using a boxplot, we need to identify any outliers in Satisfaction ratings.

# Load necessary library
library(ggplot2)

# Create boxplot
ggplot(data, aes(x = Department, y = Satisfaction)) +
  geom_boxplot() +
  labs(title = "Boxplot of Satisfaction Ratings by Department",
       x = "Department",
       y = "Satisfaction Rating") +
  theme_minimal()

We want to fit a linear regression model that predicts Satisfaction from Age.

# Fit the linear regression model
model <- lm(Satisfaction ~ Age, data = data)

# Summary of the model
summary(model)

Call:
lm(formula = Satisfaction ~ Age, data = data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.54574 -0.79895  0.03224  0.98160  2.15040 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  7.89620    1.42944   5.524 3.03e-05 ***
Age         -0.03376    0.04111  -0.821    0.422    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.384 on 18 degrees of freedom
Multiple R-squared:  0.03612,   Adjusted R-squared:  -0.01743 
F-statistic: 0.6746 on 1 and 18 DF,  p-value: 0.4222

We want to create a scatter plot with a regression line. We want to find the relationship between Satisfaction and Age.

# Load necessary library
library(ggplot2)

# Create scatter plot with regression line
ggplot(data, aes(x = Age, y = Satisfaction)) +
  geom_point() +  # Add points for the scatter plot
  geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Add the regression line
  labs(title = "Scatter Plot of Age vs. Satisfaction",
       x = "Age",
       y = "Satisfaction Rating") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

---
title: "Presentation 3 Jose Salazar"
output: html_notebook
---

We use a small data set from a satisfaction survey to answer a series of
follow-up questions about it:

Find the mean and median satisfaction ratings 

```{r}
# Survey responses ----
# One vector per field; position i in every vector belongs to respondent i.
RespondentID <- as.numeric(seq_len(20))
Age <- c(
  25, 34, 28, 45, 30, 22, 40, 35, 29, 50,
  31, 38, 27, 42, 26, 33, 39, 28, 46, 31
)
Gender <- c(
  "Female", "Male", "Non-binary", "Male", "Female",
  "Female", "Male", "Non-binary", "Female", "Male",
  "Female", "Male", "Female", "Non-binary", "Male",
  "Female", "Male", "Female", "Non-binary", "Male"
)
Satisfaction <- c(
  8, 6, 7, 5, 9, 7, 4, 6, 8, 7,
  9, 6, 7, 8, 5, 8, 7, 6, 7, 5
)
Department <- c(
  "HR", "IT", "Finance", "Marketing", "HR",
  "IT", "Finance", "Marketing", "HR", "Finance",
  "Marketing", "IT", "HR", "Finance", "Marketing",
  "IT", "HR", "Finance", "Marketing", "IT"
)

# Assemble the five fields into one data frame (20 rows, 5 columns)
data <- data.frame(
  RespondentID = RespondentID,
  Age = Age,
  Gender = Gender,
  Satisfaction = Satisfaction,
  Department = Department
)

# Central tendency of the satisfaction ratings
mean_satisfaction <- mean(data[["Satisfaction"]])
median_satisfaction <- median(data[["Satisfaction"]])

# Report both statistics on the console
cat("Mean Satisfaction:", mean_satisfaction, "\n")
cat("Median Satisfaction:", median_satisfaction, "\n")

```
We want to know the average age of respondents in the HR department.
```{r}
# Keep only the HR rows and pull out their Age values
hr_ages <- data[data$Department == "HR", "Age"]

# Mean age across the HR respondents
average_age_hr <- mean(hr_ages)

# Show the result
average_age_hr

```
```{r}
# Two-column numeric matrix (Age, Satisfaction), one row per respondent
age_satisfaction_matrix <- as.matrix(data[, c("Age", "Satisfaction")])

# Assign each age to a left-closed 10-year bin: [20,30), [30,40), [40,50),
# [50,60). The result is a factor with one level per bin.
age_groups <- cut(
  age_satisfaction_matrix[, "Age"],
  breaks = c(20, 30, 40, 50, 60),
  right = FALSE
)

# Average the satisfaction ratings within each age bin
mean_satisfaction_by_age_group <- tapply(
  age_satisfaction_matrix[, "Satisfaction"],
  age_groups,
  mean
)

# Show the per-bin means
mean_satisfaction_by_age_group

```
After learning the mean satisfaction rating for each age group, we want to know the mean satisfaction rating by department.
```{r}
# Requires the `data` data frame built in the first chunk.

# Mean satisfaction rating per department.
# NOTE: this reassigns `mean_satisfaction`, which held the overall mean from
# the first chunk; from here on it is a data frame with one row per department.
mean_satisfaction <- aggregate(Satisfaction ~ Department, data = data, FUN = mean)

# Convert the means to a one-column matrix. drop = FALSE stops the single
# remaining column from collapsing to a bare vector, so the matrix keeps its
# "Satisfaction" column name instead of printing an anonymous [,1] header.
satisfaction_matrix <- as.matrix(mean_satisfaction[, -1, drop = FALSE])
rownames(satisfaction_matrix) <- mean_satisfaction$Department

# Display the matrix (rows = departments, column = mean satisfaction)
print(satisfaction_matrix)
```
We want to know how many rows and columns are in the data frame, along with a summary of the Gender and Department columns.
```{r}
# Number of rows and columns in the survey data frame
dimensions <- dim(data)
rows <- dimensions[1]
cols <- dimensions[2]

# Gender and Department are character columns, so summary() would only report
# Length/Class/Mode for them. table() counts the respondents in each category,
# which is the informative summary for categorical text data.
gender_summary <- table(data$Gender)
department_summary <- table(data$Department)

# Print the results
print(paste("Number of rows:", rows))
print(paste("Number of columns:", cols))
print("Summary of Gender:")
print(gender_summary)
print("Summary of Department:")
print(department_summary)

```
We want to find the average satisfaction rating of respondents aged 30 to 40 (inclusive).
```{r}
# Requires the `data` data frame built in the first chunk.

# Keep the rows whose Age lies in the closed range [30, 40]; which() skips any
# row where the comparison is NA, as subset() does.
filtered_data <- data[
  which(data$Age >= 30 & data$Age <= 40), ,
  drop = FALSE
]

# Mean satisfaction rating within that age range
average_satisfaction <- mean(filtered_data$Satisfaction)

# Report the result as a single labelled string
print(
  paste(
    "Average satisfaction rating for respondents aged 30 to 40:",
    average_satisfaction
  )
)

```
We want a histogram of the age column.
```{r}
# ggplot2 supplies the plotting functions used below
library(ggplot2)

# Distribution of respondent ages in 5-year-wide bins
ggplot(data = data, mapping = aes(x = Age)) +
  geom_histogram(
    binwidth = 5,
    fill = "blue",
    color = "black",
    alpha = 0.7
  ) +
  labs(
    title = "Histogram of Ages",
    x = "Age",
    y = "Frequency"
  ) +
  theme_minimal()
```
We want a histogram of the Satisfaction ratings.
```{r}
# ggplot2 supplies the plotting functions used below
library(ggplot2)

# Distribution of satisfaction ratings, one bin per rating point
ggplot(data = data, mapping = aes(x = Satisfaction)) +
  geom_histogram(
    binwidth = 1,
    fill = "green",
    color = "black",
    alpha = 0.7
  ) +
  labs(
    title = "Histogram of Satisfaction Ratings",
    x = "Satisfaction Rating",
    y = "Frequency"
  ) +
  theme_minimal()

```
```{r}
# ggplot2 supplies the plotting functions used below
library(ggplot2)

# One box per department showing the spread of satisfaction ratings
ggplot(data = data, mapping = aes(x = Department, y = Satisfaction)) +
  geom_boxplot(fill = "lightblue") +
  labs(
    title = "Boxplot of Satisfaction Ratings by Department",
    x = "Department",
    y = "Satisfaction Rating"
  ) +
  theme_minimal()

```
The box plot above shows satisfaction by department. Next we want a second box plot showing age by gender.
```{r}
# ggplot2 supplies the plotting functions used below
library(ggplot2)

# One box per gender showing the spread of respondent ages
ggplot(data = data, mapping = aes(x = Gender, y = Age)) +
  geom_boxplot(fill = "lightgreen") +
  labs(
    title = "Boxplot of Age by Gender",
    x = "Gender",
    y = "Age"
  ) +
  theme_minimal()

```
We now need to identify outliers using the IQR method.
```{r}
# Work on the Satisfaction column of the existing survey data frame.
# `data` is deliberately NOT rebuilt here: replacing it with a Satisfaction-only
# data frame would discard Age, Gender and Department, which the later plots
# and the regression still request via `data`.
satisfaction_ratings <- data$Satisfaction

# First and third quartiles of the ratings
Q1 <- quantile(satisfaction_ratings, 0.25)
Q3 <- quantile(satisfaction_ratings, 0.75)

# Interquartile range (Q3 - Q1)
IQR_value <- IQR(satisfaction_ratings)

# IQR rule: anything more than 1.5 * IQR outside the quartiles is an outlier
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Ratings that fall below the lower bound or above the upper bound
is_outlier <- satisfaction_ratings < lower_bound |
  satisfaction_ratings > upper_bound
outliers <- satisfaction_ratings[is_outlier]

# Print outliers (an empty numeric vector means none were found)
print(outliers)

```
Using a boxplot, we need to identify any outliers in Satisfaction ratings.
```{r}
# ggplot2 supplies the plotting functions used below
library(ggplot2)

# Box plot of satisfaction per department; geom_boxplot() draws any points
# beyond the whiskers individually, which is how outliers would show up
ggplot(data = data, mapping = aes(x = Department, y = Satisfaction)) +
  geom_boxplot() +
  labs(
    title = "Boxplot of Satisfaction Ratings by Department",
    x = "Department",
    y = "Satisfaction Rating"
  ) +
  theme_minimal()

```
We want to fit a linear regression model that predicts Satisfaction from Age.
```{r}
# Ordinary least squares fit: Satisfaction as the response, Age as the single
# predictor
model <- lm(formula = Satisfaction ~ Age, data = data)

# Coefficients, residual summary and fit statistics for the model
summary(model)

```
We want to create a scatter plot with a regression line. We want to find the relationship between Satisfaction and Age.
```{r}
# ggplot2 supplies the plotting functions used below
library(ggplot2)

# Age on the x-axis against satisfaction on the y-axis, with a fitted line
ggplot(data = data, mapping = aes(x = Age, y = Satisfaction)) +
  # One point per respondent
  geom_point() +
  # Straight-line (lm) fit drawn in blue, without the confidence band
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(
    title = "Scatter Plot of Age vs. Satisfaction",
    x = "Age",
    y = "Satisfaction Rating"
  ) +
  theme_minimal()

```
