We use data from a satisfaction survey to answer a series of
follow-up questions about it.
First, we find the mean and median satisfaction ratings.
# Creating the dataset
RespondentID <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)
Age <- c(25, 34, 28, 45, 30, 22, 40, 35, 29, 50, 31, 38, 27, 42, 26, 33, 39, 28, 46, 31)
Gender <- c("Female", "Male", "Non-binary", "Male", "Female", "Female", "Male", "Non-binary", "Female", "Male",
"Female", "Male", "Female", "Non-binary", "Male", "Female", "Male", "Female", "Non-binary", "Male")
Satisfaction <- c(8, 6, 7, 5, 9, 7, 4, 6, 8, 7, 9, 6, 7, 8, 5, 8, 7, 6, 7, 5)
Department <- c("HR", "IT", "Finance", "Marketing", "HR", "IT", "Finance", "Marketing", "HR", "Finance",
"Marketing", "IT", "HR", "Finance", "Marketing", "IT", "HR", "Finance", "Marketing", "IT")
# Combining into a data frame
data <- data.frame(RespondentID, Age, Gender, Satisfaction, Department)
# Calculating mean and median satisfaction
mean_satisfaction <- mean(data$Satisfaction)
median_satisfaction <- median(data$Satisfaction)
# Printing the results
cat("Mean Satisfaction:", mean_satisfaction, "\n")
Mean Satisfaction: 6.75
cat("Median Satisfaction:", median_satisfaction, "\n")
Median Satisfaction: 7
We want to know the average age of respondents in the HR
department.
# Create a vector of ages for respondents in the 'HR' department
hr_ages <- data$Age[data$Department == 'HR']
# Compute the average age of respondents in the 'HR' department
average_age_hr <- mean(hr_ages)
# Print the average age
average_age_hr
[1] 30
# Convert the Age and Satisfaction columns into a matrix
age_satisfaction_matrix <- as.matrix(data[, c("Age", "Satisfaction")])
# Create a data frame from the matrix to compute mean satisfaction by age group
age_groups <- cut(age_satisfaction_matrix[, 1], breaks = seq(20, 60, by = 10), right = FALSE)
mean_satisfaction_by_age_group <- tapply(age_satisfaction_matrix[, 2], age_groups, mean)
# Print the mean satisfaction rating for each age group
mean_satisfaction_by_age_group
[20,30) [30,40) [40,50) [50,60)
6.857143 7.000000 6.000000 7.000000
After learning the mean satisfaction rating for each age group, we
want to know the mean satisfaction rating by department.
# Assuming the 'data' data frame is already created
# Calculate mean satisfaction ratings by department
mean_satisfaction <- aggregate(Satisfaction ~ Department, data = data, FUN = mean)
# Convert the result to a matrix
satisfaction_matrix <- as.matrix(mean_satisfaction[, -1])
rownames(satisfaction_matrix) <- mean_satisfaction$Department
# Display the matrix
print(satisfaction_matrix)
[,1]
Finance 6.4
HR 7.8
IT 6.4
Marketing 6.4
We want to know how many rows and columns are in the data frame.
# Get the number of rows and columns
dimensions <- dim(data)
rows <- dimensions[1]
cols <- dimensions[2]
# Summary of Gender and Department columns
gender_summary <- summary(data$Gender)
department_summary <- summary(data$Department)
# Print the results
print(paste("Number of rows:", rows))
[1] "Number of rows: 20"
print(paste("Number of columns:", cols))
[1] "Number of columns: 5"
print("Summary of Gender:")
[1] "Summary of Gender:"
print(gender_summary)
Length Class Mode
20 character character
print("Summary of Department:")
[1] "Summary of Department:"
print(department_summary)
Length Class Mode
20 character character
We want to find the average satisfaction rating of respondents aged
30 to 40 (inclusive).
# Assuming the 'data' data frame is already created
# Filter the data frame for respondents aged between 30 and 40
filtered_data <- subset(data, Age >= 30 & Age <= 40)
# Calculate the average satisfaction rating for this age group
average_satisfaction <- mean(filtered_data$Satisfaction)
# Print the average satisfaction rating
print(paste("Average satisfaction rating for respondents aged 30 to 40:", average_satisfaction))
[1] "Average satisfaction rating for respondents aged 30 to 40: 6.66666666666667"
We want a histogram of the age column.
# Load necessary library for visualization
library(ggplot2)
# Create a histogram of the Age column
ggplot(data, aes(x = Age)) +
geom_histogram(binwidth = 5, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Histogram of Ages", x = "Age", y = "Frequency") +
theme_minimal()

We want a histogram of the satisfaction ratings.
# Load necessary library for visualization
library(ggplot2)
# Create a histogram of the Satisfaction ratings
ggplot(data, aes(x = Satisfaction)) +
geom_histogram(binwidth = 1, fill = "green", color = "black", alpha = 0.7) +
labs(title = "Histogram of Satisfaction Ratings", x = "Satisfaction Rating", y = "Frequency") +
theme_minimal()

library(ggplot2)
# Create the boxplot
ggplot(data, aes(x = Department, y = Satisfaction)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Boxplot of Satisfaction Ratings by Department",
x = "Department",
y = "Satisfaction Rating") +
theme_minimal()

We created a box plot of satisfaction by department above. Now we
want a second box plot showing age by gender.
# Load necessary library
library(ggplot2)
# Create the boxplot for Age by Gender
ggplot(data, aes(x = Gender, y = Age)) +
geom_boxplot(fill = "lightgreen") +
labs(title = "Boxplot of Age by Gender",
x = "Gender",
y = "Age") +
theme_minimal()

We now need to identify outliers using the IQR method.
# Create the data frame
data <- data.frame(
Satisfaction = c(8, 6, 7, 5, 9, 7, 4, 6, 8, 7, 9, 6, 7, 8, 5, 8, 7, 6, 7, 5)
)
# Calculate Q1 and Q3
Q1 <- quantile(data$Satisfaction, 0.25)
Q3 <- quantile(data$Satisfaction, 0.75)
# Calculate IQR
IQR_value <- IQR(data$Satisfaction)
# Determine outlier thresholds
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
# Identify outliers
outliers <- data$Satisfaction[data$Satisfaction < lower_bound | data$Satisfaction > upper_bound]
# Print outliers
print(outliers)
numeric(0)
Using a boxplot, we need to identify any outliers in Satisfaction
ratings.
# Load necessary library
library(ggplot2)
# Create boxplot
ggplot(data, aes(x = Department, y = Satisfaction)) +
geom_boxplot() +
labs(title = "Boxplot of Satisfaction Ratings by Department",
x = "Department",
y = "Satisfaction Rating") +
theme_minimal()

We want to show linear regression to predict Satisfaction based on
Age.
# Fit the linear regression model
model <- lm(Satisfaction ~ Age, data = data)
# Summary of the model
summary(model)
Call:
lm(formula = Satisfaction ~ Age, data = data)
Residuals:
Min 1Q Median 3Q Max
-2.54574 -0.79895 0.03224 0.98160 2.15040
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.89620 1.42944 5.524 3.03e-05 ***
Age -0.03376 0.04111 -0.821 0.422
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.384 on 18 degrees of freedom
Multiple R-squared: 0.03612, Adjusted R-squared: -0.01743
F-statistic: 0.6746 on 1 and 18 DF, p-value: 0.4222
We want to create a scatter plot with a regression line. We want to
find the relationship between Satisfaction and Age.
# Load necessary library
library(ggplot2)
# Create scatter plot with regression line
ggplot(data, aes(x = Age, y = Satisfaction)) +
geom_point() + # Add points for the scatter plot
geom_smooth(method = "lm", se = FALSE, color = "blue") + # Add the regression line
labs(title = "Scatter Plot of Age vs. Satisfaction",
x = "Age",
y = "Satisfaction Rating") +
theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

---
title: "Presentation 3 Jose Salazar"
output: html_notebook
---

We use data from a satisfaction survey to answer a series of follow-up questions about it.

First, we find the mean and median satisfaction ratings.

```{r}
# Survey responses: one element per respondent, aligned by position.
# These vectors are kept at top level because other chunks refer to them.
RespondentID <- c(
  1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
  11, 12, 13, 14, 15, 16, 17, 18, 19, 20
)
Age <- c(
  25, 34, 28, 45, 30, 22, 40, 35, 29, 50,
  31, 38, 27, 42, 26, 33, 39, 28, 46, 31
)
Gender <- c(
  "Female", "Male", "Non-binary", "Male", "Female",
  "Female", "Male", "Non-binary", "Female", "Male",
  "Female", "Male", "Female", "Non-binary", "Male",
  "Female", "Male", "Female", "Non-binary", "Male"
)
Satisfaction <- c(
  8, 6, 7, 5, 9, 7, 4, 6, 8, 7,
  9, 6, 7, 8, 5, 8, 7, 6, 7, 5
)
Department <- c(
  "HR", "IT", "Finance", "Marketing", "HR",
  "IT", "Finance", "Marketing", "HR", "Finance",
  "Marketing", "IT", "HR", "Finance", "Marketing",
  "IT", "HR", "Finance", "Marketing", "IT"
)

# Assemble the vectors into one data frame; columns take the vector names.
data <- data.frame(RespondentID, Age, Gender, Satisfaction, Department)

# Central tendency of the satisfaction ratings.
mean_satisfaction <- mean(data[["Satisfaction"]])
median_satisfaction <- median(data[["Satisfaction"]])

# Report both statistics, one per line.
cat("Mean Satisfaction:", mean_satisfaction, "\n")
cat("Median Satisfaction:", median_satisfaction, "\n")

```
We want to know the average age of respondents in the HR department.
```{r}
# Ages of the respondents whose Department value is "HR"
hr_ages <- data[["Age"]][data[["Department"]] == "HR"]

# Mean of those ages
average_age_hr <- mean(hr_ages)

# Show the result
print(average_age_hr)

```
```{r}
# Two-column numeric matrix holding each respondent's Age and Satisfaction
age_satisfaction_matrix <- as.matrix(data[, c("Age", "Satisfaction")])

# Bin ages into 10-year groups that are closed on the left:
# [20,30), [30,40), [40,50), [50,60)
age_groups <- cut(
  age_satisfaction_matrix[, "Age"],
  breaks = seq(20, 60, by = 10),
  right = FALSE
)

# Average the Satisfaction column within each age group
mean_satisfaction_by_age_group <- tapply(
  X = age_satisfaction_matrix[, "Satisfaction"],
  INDEX = age_groups,
  FUN = mean
)

# Show the mean satisfaction rating per age group
print(mean_satisfaction_by_age_group)

```
After learning the mean satisfaction rating for each age group, we want to know the mean satisfaction rating by department.
```{r}
# Mean satisfaction rating per department, displayed as a one-column matrix.
# Requires the `data` data frame built in the first chunk.
#
# The aggregate is stored under its own name so that the overall (scalar)
# `mean_satisfaction` computed in the first chunk is not replaced by a
# data frame.
dept_mean_satisfaction <- aggregate(
  Satisfaction ~ Department,
  data = data,
  FUN = mean
)

# drop = FALSE keeps the "Satisfaction" column label on the matrix instead of
# collapsing to an unnamed vector (which would print as "[,1]").
satisfaction_matrix <- as.matrix(
  dept_mean_satisfaction[, "Satisfaction", drop = FALSE]
)
rownames(satisfaction_matrix) <- dept_mean_satisfaction$Department

# Display the matrix
print(satisfaction_matrix)
```
We want to know how many rows and columns are in the data frame, and to summarize the Gender and Department columns.
```{r}
# Report the size of `data` and the category counts of its two
# categorical columns.

# dim() returns c(number of rows, number of columns)
dimensions <- dim(data)
rows <- dimensions[1]
cols <- dimensions[2]

# Gender and Department are character columns, so summary() would only report
# Length/Class/Mode. table() counts the respondents in each category instead.
gender_summary <- table(data$Gender)
department_summary <- table(data$Department)

# Print the results
print(paste("Number of rows:", rows))
print(paste("Number of columns:", cols))
print("Summary of Gender:")
print(gender_summary)
print("Summary of Department:")
print(department_summary)

```
We want to find the average satisfaction rating of respondents aged 30 to 40 (inclusive).
```{r}
# Requires the `data` data frame built in the first chunk.

# Flag respondents whose age lies in the closed interval [30, 40]
in_range <- data$Age >= 30 & data$Age <= 40

# Keep only the flagged rows; which() ignores NA flags, as subset() does
filtered_data <- data[which(in_range), ]

# Mean satisfaction rating within that age range
average_satisfaction <- mean(filtered_data[["Satisfaction"]])

# Report the result
print(paste("Average satisfaction rating for respondents aged 30 to 40:", average_satisfaction))

```
We want a histogram of the age column.
```{r}
# ggplot2 provides the plotting functions used below
library(ggplot2)

# Distribution of respondent ages, counted in bins 5 years wide
ggplot(data = data, mapping = aes(x = Age)) +
  geom_histogram(
    binwidth = 5,
    fill = "blue",
    color = "black",
    alpha = 0.7
  ) +
  ggtitle("Histogram of Ages") +
  xlab("Age") +
  ylab("Frequency") +
  theme_minimal()
```
We want a histogram of the satisfaction ratings.
```{r}
# ggplot2 provides the plotting functions used below
library(ggplot2)

# Distribution of satisfaction ratings, one bin per rating unit
ggplot(data = data, mapping = aes(x = Satisfaction)) +
  geom_histogram(
    binwidth = 1,
    fill = "green",
    color = "black",
    alpha = 0.7
  ) +
  ggtitle("Histogram of Satisfaction Ratings") +
  xlab("Satisfaction Rating") +
  ylab("Frequency") +
  theme_minimal()

```
```{r}
# ggplot2 provides the plotting functions used below
library(ggplot2)

# One box per department showing the spread of satisfaction ratings
ggplot(data = data, mapping = aes(x = Department, y = Satisfaction)) +
  geom_boxplot(fill = "lightblue") +
  ggtitle("Boxplot of Satisfaction Ratings by Department") +
  xlab("Department") +
  ylab("Satisfaction Rating") +
  theme_minimal()

```
We created a box plot of satisfaction by department above. Now we want a second box plot showing age by gender.
```{r}
# ggplot2 provides the plotting functions used below
library(ggplot2)

# One box per gender showing the spread of respondent ages
ggplot(data = data, mapping = aes(x = Gender, y = Age)) +
  geom_boxplot(fill = "lightgreen") +
  ggtitle("Boxplot of Age by Gender") +
  xlab("Gender") +
  ylab("Age") +
  theme_minimal()

```
We now need to identify outliers using the IQR method.
```{r}
# Identify outliers in the satisfaction ratings with the 1.5 * IQR rule.
#
# The ratings are read from the existing `data` data frame. Rebuilding `data`
# here with only a Satisfaction column would discard Age, Gender and
# Department, which other chunks read from `data`.
ratings <- data$Satisfaction

# First and third quartiles
Q1 <- quantile(ratings, 0.25)
Q3 <- quantile(ratings, 0.75)

# Interquartile range
IQR_value <- IQR(ratings)

# Values beyond 1.5 * IQR from the quartiles count as outliers
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Ratings that fall outside the two bounds
outliers <- ratings[ratings < lower_bound | ratings > upper_bound]

# Print outliers (an empty numeric vector means none were found)
print(outliers)

```
Using a boxplot, we need to identify any outliers in Satisfaction ratings.
```{r}
# ggplot2 provides the plotting functions used below
library(ggplot2)

# Satisfaction ratings per department; geom_boxplot() draws any values beyond
# the whiskers as separate points, which is how outliers show up here
ggplot(data = data, mapping = aes(x = Department, y = Satisfaction)) +
  geom_boxplot() +
  ggtitle("Boxplot of Satisfaction Ratings by Department") +
  xlab("Department") +
  ylab("Satisfaction Rating") +
  theme_minimal()

```
We want to show linear regression to predict Satisfaction based on Age.
```{r}
# Simple linear regression with Age as the only predictor of Satisfaction
model <- lm(formula = Satisfaction ~ Age, data = data)

# Coefficient table, residual quantiles and fit statistics
print(summary(model))

```
We want to create a scatter plot with a regression line. We want to find the relationship between Satisfaction and Age.
```{r}
# ggplot2 provides the plotting functions used below
library(ggplot2)

# Age against Satisfaction: raw points plus a fitted straight line
ggplot(data = data, mapping = aes(x = Age, y = Satisfaction)) +
  # One point per respondent
  geom_point() +
  # Least-squares line (method = "lm") drawn without a confidence band
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  ggtitle("Scatter Plot of Age vs. Satisfaction") +
  xlab("Age") +
  ylab("Satisfaction Rating") +
  theme_minimal()

```
