Analysis of Survey Data

In this document, we will analyze the survey data to extract meaningful insights.

Vectors

To start, we import the dataset and extract the vector of satisfaction ratings and calculate the mean and median.

names(survey_data)
[1] "RespondentID" "Age"          "Gender"       "Satisfaction" "Department"  
###We would like to extract the vector of satisfaction ratings.
satisfaction_ratings <- survey_data$Satisfaction
head(satisfaction_ratings)
[1] 8 6 7 5 7 7
# Calculate the mean
mean_satisfaction <- mean(satisfaction_ratings, na.rm = TRUE)
# Display the mean
mean_satisfaction
[1] 6.65
# Calculate the median
median_satisfaction <- median(satisfaction_ratings, na.rm = TRUE)

# Display the median
median_satisfaction
[1] 7
mean_satisfaction
[1] 6.65
median_satisfaction
[1] 7
### The mean of the satisfaction ratings is 6.65 and the median is 7.

If we need to look specifically at the HR department, we create a vector of the ages of respondents in the HR department, and then calculate the mean age of just those respondents.

names(survey_data)
[1] "RespondentID" "Age"          "Gender"       "Satisfaction" "Department"  
# Create a vector of ages for respondents in the 'HR' department
ages_in_HR <- survey_data$Age[survey_data$Department == "HR"]
ages_in_HR
[1] 25 29 27 39
# Calculate the average age of HR respondents
average_age_HR <- mean(ages_in_HR, na.rm = TRUE)

# Display the average age
average_age_HR
[1] 30
# Extract ages of respondents in the 'HR' department
ages_in_HR <- survey_data$Age[survey_data$Department == "HR"]

# Calculate the average age
average_age_HR <- mean(ages_in_HR, na.rm = TRUE)

# Display the average age
average_age_HR
[1] 30
# Print a formatted message
print(paste("The average age of HR respondents is", round(average_age_HR, 2), "years."))
[1] "The average age of HR respondents is 30 years."

Matrices
Next we will create a matrix of satisfaction ratings and then transpose it.

# Extract Age and Satisfaction columns
age <- survey_data$Age
satisfaction <- survey_data$Satisfaction
# Combine Age and Satisfaction into a matrix
age_satisfaction_matrix <- cbind(age, satisfaction)
# View the first few rows of the matrix
head(age_satisfaction_matrix)
     age satisfaction
[1,]  25            8
[2,]  34            6
[3,]  28            7
[4,]  45            5
[5,]  30            7
[6,]  22            7
# Extract Age and Satisfaction from the matrix
ages <- age_satisfaction_matrix[, "age"]
satisfactions <- age_satisfaction_matrix[, "satisfaction"]
# Compute the mean satisfaction rating for each age group
mean_satisfaction_by_age <- tapply(satisfactions, ages, mean, na.rm = TRUE)

# Display the results
mean_satisfaction_by_age
 22  25  26  27  28  29  30  31  33  34  35  38  39  40  42  45  46  50 
7.0 8.0 5.0 7.0 6.5 8.0 7.0 7.0 8.0 6.0 6.0 6.0 7.0 4.0 8.0 5.0 7.0 7.0 
# Set column name
# NOTE(review): mean_satisfaction_matrix is not created by any code visible
# above this point; it is built from as.matrix(mean_satisfaction_by_dept)
# further down in this document, so that code has to be run first.
colnames(mean_satisfaction_matrix) <- "Mean_Satisfaction"
mean_satisfaction_matrix
          Mean_Satisfaction
Finance                 6.4
HR                      7.5
IT                      6.5
Marketing               6.4
# Transpose the matrix
transposed_matrix <- t(mean_satisfaction_matrix)

# Display the transposed matrix
transposed_matrix
                  Finance  HR  IT Marketing
Mean_Satisfaction     6.4 7.5 6.5       6.4
# Create a matrix with multiple metrics
dept_metrics_matrix <- cbind(
  Mean_Satisfaction = mean_satisfaction_by_dept,
  Mean_Age = tapply(survey_data$Age, department, mean, na.rm = TRUE),
  Employee_Count = tapply(rep(1, length(department)), department, sum)
)

# Convert to matrix
dept_metrics_matrix <- as.matrix(dept_metrics_matrix)

# Display the matrix
dept_metrics_matrix
          Mean_Satisfaction Mean_Age Employee_Count
Finance                 6.4 37.60000              5
HR                      7.5 30.00000              4
IT                      6.5 31.33333              6
Marketing               6.4 36.60000              5
# Transpose the matrix
transposed_matrix <- t(dept_metrics_matrix)

# Display the transposed matrix
transposed_matrix
                  Finance   HR       IT Marketing
Mean_Satisfaction     6.4  7.5  6.50000       6.4
Mean_Age             37.6 30.0 31.33333      36.6
Employee_Count        5.0  4.0  6.00000       5.0
# Calculate the mean of each column (department)
# NOTE(review): every column of transposed_matrix stacks three different
# metrics (Mean_Satisfaction, Mean_Age, Employee_Count), so each result is an
# average of a rating, an age and a head count. Confirm this is intended;
# rowMeans(transposed_matrix) would instead average each metric across
# departments.
mean_per_department <- colMeans(transposed_matrix, na.rm = TRUE)

# Display the results
mean_per_department
  Finance        HR        IT Marketing 
 16.33333  13.83333  14.61111  16.00000 

Dataframes

# Step 1: per-department summary statistics (missing values are ignored
# when averaging)
mean_satisfaction_by_dept <- with(
  survey_data,
  tapply(Satisfaction, Department, mean, na.rm = TRUE)
)
mean_age_by_dept <- with(
  survey_data,
  tapply(Age, Department, mean, na.rm = TRUE)
)
# Head count per department: sum a vector of ones within each group
employee_count_by_dept <- with(
  survey_data,
  tapply(rep(1, length(Department)), Department, sum)
)

# Turn the per-age means into a two-column data frame; the ages are stored
# as the names of the tapply() result, so they are parsed back to numbers
mean_satisfaction_df <- data.frame(
  Age = as.numeric(names(mean_satisfaction_by_age)),
  Mean_Satisfaction = as.numeric(mean_satisfaction_by_age)
)

# Show the data frame
mean_satisfaction_df

# Pull the Department and Satisfaction columns out as plain vectors
department <- survey_data[["Department"]]
satisfaction <- survey_data[["Satisfaction"]]

# Mean satisfaction rating within each department
mean_satisfaction_by_dept <- tapply(
  X = satisfaction,
  INDEX = department,
  FUN = mean,
  na.rm = TRUE
)

# One-column matrix of the department means, with departments as row names
mean_satisfaction_matrix <- as.matrix(mean_satisfaction_by_dept)
rownames(mean_satisfaction_matrix) <- names(mean_satisfaction_by_dept)


# Get the dimensions of the data frame
dimensions <- dim(survey_data)

# Number of rows
num_rows <- nrow(survey_data)

# Number of columns
num_columns <- ncol(survey_data)

# Display the results
cat("The data frame has", num_rows, "rows and", num_columns, "columns.\n")
The data frame has 20 rows and 5 columns.
# Summary of the Gender column
gender_summary <- summary(survey_data$Gender)
cat("\nSummary of Gender:\n")

Summary of Gender:
print(gender_summary)
    Female       Male Non-Binary 
         8          8          4 
# Summary of the Department column
department_summary <- summary(survey_data$Department)
cat("\nSummary of Department:\n")

Summary of Department:
print(department_summary)
  Finance        HR        IT Marketing 
        5         4         6         5 
str(survey_data$Gender)
 Factor w/ 3 levels "Female","Male",..: 1 2 3 2 1 1 2 3 1 2 ...
str(survey_data$Department)
 Factor w/ 4 levels "Finance","HR",..: 2 3 1 4 3 3 1 4 2 1 ...
# Convert Gender and Department to factors
survey_data$Gender <- as.factor(survey_data$Gender)
survey_data$Department <- as.factor(survey_data$Department)
# Check the structure of the Gender column
str(survey_data$Gender)
 Factor w/ 3 levels "Female","Male",..: 1 2 3 2 1 1 2 3 1 2 ...
# Check the structure of the Department column
str(survey_data$Department)
 Factor w/ 4 levels "Finance","HR",..: 2 3 1 4 3 3 1 4 2 1 ...
# Summary of the Gender column
cat("\nSummary of Gender:\n")

Summary of Gender:
print(summary(survey_data$Gender))
    Female       Male Non-Binary 
         8          8          4 
# Filter the data frame to include respondents aged between 30 and 40
filtered_data <- subset(survey_data, Age >= 30 & Age <= 40)
# Calculate the average satisfaction rating for the filtered data
average_satisfaction <- mean(filtered_data$Satisfaction, na.rm = TRUE)

# Display the result
cat("The average satisfaction rating for respondents aged between 30 and 40 is:", round(average_satisfaction, 2), "\n")
The average satisfaction rating for respondents aged between 30 and 40 is: 6.44 
num_respondents <- nrow(filtered_data)
cat("Number of respondents aged between 30 and 40:", num_respondents, "\n")
Number of respondents aged between 30 and 40: 9 
cat("The average satisfaction rating for respondents aged between 30 and 40 is:", round(average_satisfaction, 2), "\n")
The average satisfaction rating for respondents aged between 30 and 40 is: 6.44 

Histograms

# install.packages("ggplot2")

# Load ggplot2
library(ggplot2)

# Create a histogram of the Age column using ggplot2
ggplot(survey_data, aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "lightblue", color = "black") +
  labs(
    title = "Distribution of Ages",
    x = "Age",
    y = "Frequency"
  ) +
  theme_minimal()

# Create a histogram of the Satisfaction ratings using ggplot2
ggplot(survey_data, aes(x = Satisfaction)) +
  geom_histogram(binwidth = 1, fill = "lightgreen", color = "black", boundary = 0.5) +
  scale_x_continuous(breaks = seq(min(survey_data$Satisfaction, na.rm = TRUE), max(survey_data$Satisfaction, na.rm = TRUE), by = 1)) +
  labs(
    title = "Distribution of Satisfaction Ratings",
    x = "Satisfaction Rating",
    y = "Frequency"
  ) +
  theme_minimal()

The histogram shows us that there are more 30 year-old employees than any other age bracket, with most employees falling between 25 and 45 years old. There are very few employees under 25 or older than 45. The histogram of satisfaction ratings in green indicates that most people are somewhat satisfied with their job and few people are extremely satisfied or dissatisfied.

Boxplots
We will create a boxplot of satisfaction ratings by department and a second one by gender.

# Create a boxplot of Satisfaction ratings by Department
ggplot(survey_data, aes(x = Department, y = Satisfaction)) +
  geom_boxplot(fill = "lightblue", color = "darkblue") +
  labs(
    title = "Satisfaction Ratings by Department",
    x = "Department",
    y = "Satisfaction Rating"
  ) +
  theme_minimal()

When looking at the boxplot by department, it is clear that HR employees are happiest in their positions. Marketing has a large IQR (interquartile range): its light blue box spans a wider range of ratings, showing that the middle 50% of its satisfaction ratings is more spread out. Finance, HR, and IT have tighter spreads than Marketing, meaning the middle 50% of their ratings sits closer to the median. The median is the dark horizontal line inside each box, and it is relatively similar in all departments, between 6.5 and 7.5. The boxplot indicates there is a single outlier in Finance who is extremely dissatisfied in their position. The boxplot for age by gender indicates that female-identifying employees are younger than male or non-binary employees; note that a boxplot does not show group sizes, and the Gender summary above counts 8 female, 8 male, and 4 non-binary respondents.

# Create a boxplot of Age by Gender
ggplot(survey_data, aes(x = Gender, y = Age)) +
  geom_boxplot(fill = "pink", color = "darkred") +
  labs(
    title = "Age Distribution by Gender",
    x = "Gender",
    y = "Age"
  ) +
  theme_minimal()

# Extract the Satisfaction ratings
satisfaction_ratings <- survey_data$Satisfaction

Outliers
We will identify and list any outliers.
There was a single outlier in Finance in the satisfaction ratings, which is consistent with the findings of the IQR method. I do not think that a single employee would affect the overall satisfaction of employees.

# Calculate Q1 (25th percentile) and Q3 (75th percentile)
Q1 <- quantile(satisfaction_ratings, 0.25, na.rm = TRUE)
Q3 <- quantile(satisfaction_ratings, 0.75, na.rm = TRUE)

# Calculate IQR
IQR_value <- IQR(satisfaction_ratings, na.rm = TRUE)

# Display the results
cat("Q1 (25th percentile):", Q1, "\n")
Q1 (25th percentile): 6 
cat("Q3 (75th percentile):", Q3, "\n")
Q3 (75th percentile): 7.25 
cat("Interquartile Range (IQR):", IQR_value, "\n")
Interquartile Range (IQR): 1.25 
# Calculate the lower and upper bounds
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Display the bounds
cat("Lower Bound for Outliers:", lower_bound, "\n")
Lower Bound for Outliers: 4.125 
cat("Upper Bound for Outliers:", upper_bound, "\n")
Upper Bound for Outliers: 9.125 
# Identify outliers
outliers <- satisfaction_ratings[satisfaction_ratings < lower_bound | satisfaction_ratings > upper_bound]

# Display the outliers
cat("Outliers in Satisfaction Ratings:\n")
Outliers in Satisfaction Ratings:
print(outliers)
[1] 4
# Get indices of outliers
outlier_indices <- which(satisfaction_ratings < lower_bound | satisfaction_ratings > upper_bound)

# Create a data frame of outliers
outliers_df <- data.frame(
  Index = outlier_indices,
  Satisfaction_Rating = satisfaction_ratings[outlier_indices]
)

# Display the data frame
print(outliers_df)
# Create a boxplot of Satisfaction ratings by Department
ggplot(survey_data, aes(x = Department, y = Satisfaction)) +
  geom_boxplot(fill = "lightblue", color = "darkblue", outlier.color = "red", outlier.shape = 16, outlier.size = 2) +
  labs(
    title = "Satisfaction Ratings by Department",
    x = "Department",
    y = "Satisfaction Rating"
  ) +
  theme_minimal()

# Load necessary library
library(dplyr)

# Function to identify outliers using IQR method
#
# Returns the rows of `data` whose value in `column` falls outside the fences
# [Q1 - k * IQR, Q3 + k * IQR], where Q1 and Q3 are the 25th and 75th
# percentiles of that column (missing values are ignored when computing them).
#
# Arguments:
#   data   - data frame containing the column to screen.
#   column - name of the numeric column to screen, as a string
#            (default "Satisfaction", the column this report analyses).
#   k      - fence multiplier applied to the IQR (default 1.5).
#
# Value: a data frame with the same columns as `data`, restricted to the
# outlying rows. Rows where `column` is NA are dropped by filter().
identify_outliers <- function(data, column = "Satisfaction", k = 1.5) {
  quartiles <- quantile(
    data[[column]],
    probs = c(0.25, 0.75),
    na.rm = TRUE,
    names = FALSE
  )
  fence_width <- k * (quartiles[2] - quartiles[1])
  lower_bound <- quartiles[1] - fence_width
  upper_bound <- quartiles[2] + fence_width
  # .data[[column]] looks the column up by its string name inside filter()
  data %>%
    filter(.data[[column]] < lower_bound | .data[[column]] > upper_bound)
}

# Apply the function to each department.
# group_modify() is used instead of the superseded do(): it calls the function
# once per department on that department's rows and prepends the Department
# key to whatever rows are returned. ungroup() then drops the grouping so that
# later operations on the result are not silently performed per department.
outliers_by_dept <- survey_data %>%
  group_by(Department) %>%
  group_modify(function(dept_rows, dept_key) identify_outliers(dept_rows)) %>%
  ungroup()

# Display the outliers
print(outliers_by_dept)

There is a single outlier, male age 40, in Finance.

Linear Regression

Y' corresponds to the predicted value (satisfaction rating), X corresponds to the independent variable (age), b is the slope of the regression line (-0.0268), and A (or β0) is the intercept, representing the predicted value of the dependent variable when the independent variable is 0 (A = 7.559). The fitted equation is therefore Y' = A + bX = 7.559 - 0.0268 X.


# Perform the linear regression
lm_model <- lm(Satisfaction ~ Age, data = survey_data)

# View the summary of the regression model
summary(lm_model)

Call:
lm(formula = Satisfaction ~ Age, data = survey_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.4879 -0.6888  0.1772  0.8625  2.2710 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  7.55947    1.32790   5.693 2.13e-05 ***
Age         -0.02679    0.03819  -0.702    0.492    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.285 on 18 degrees of freedom
Multiple R-squared:  0.02661,   Adjusted R-squared:  -0.02746 
F-statistic: 0.4921 on 1 and 18 DF,  p-value: 0.4919
# Extract the coefficients
coefficients <- coef(lm_model)

# Display the regression equation
cat("Regression Equation: Satisfaction =", round(coefficients[1], 4), "+", round(coefficients[2], 4), "* Age\n")
Regression Equation: Satisfaction = 7.5595 + -0.0268 * Age
# View the summary of the regression model
summary(lm_model)

Call:
lm(formula = Satisfaction ~ Age, data = survey_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.4879 -0.6888  0.1772  0.8625  2.2710 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  7.55947    1.32790   5.693 2.13e-05 ***
Age         -0.02679    0.03819  -0.702    0.492    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.285 on 18 degrees of freedom
Multiple R-squared:  0.02661,   Adjusted R-squared:  -0.02746 
F-statistic: 0.4921 on 1 and 18 DF,  p-value: 0.4919
# Scatter plot with regression line
ggplot(survey_data, aes(x = Age, y = Satisfaction)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(
    title = "Scatter Plot of Satisfaction vs. Age",
    x = "Age",
    y = "Satisfaction Rating"
  ) +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

The regression line indicates that satisfaction decreases very slightly with age because it has a negative slope. The scatter plot shows a visual representation of the regression line. Because the points are scattered far from the regression line (R-squared is only about 0.027, and the Age coefficient is not statistically significant, p = 0.49), age does not appear to be a good predictor of satisfaction. We may need to use a more complex model, or different predictors, because this does not seem to be a good fit.

